Index: head/sys/amd64/amd64/machdep.c
===================================================================
--- head/sys/amd64/amd64/machdep.c (revision 225616)
+++ head/sys/amd64/amd64/machdep.c (revision 225617)
@@ -1,2423 +1,2423 @@
/*-
* Copyright (c) 2003 Peter Wemm.
* Copyright (c) 1992 Terrence R. Lambert.
* Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* William Jolitz.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_atalk.h"
#include "opt_atpic.h"
#include "opt_compat.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_ipx.h"
#include "opt_isa.h"
#include "opt_kstack_pages.h"
#include "opt_maxmem.h"
#include "opt_mp_watchdog.h"
#include "opt_perfmon.h"
#include "opt_sched.h"
#include "opt_kdtrace.h"
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/callout.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/reboot.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#ifdef SMP
#include <sys/smp.h>
#endif
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_param.h>
#ifdef DDB
#ifndef KDB
#error KDB must be enabled in order for DDB to work!
#endif
#include <ddb/ddb.h>
#include <ddb/db_sym.h>
#endif
#include <net/netisr.h>
#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/intr_machdep.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/mp_watchdog.h>
#include <machine/pc/bios.h>
#include <machine/pcb.h>
#include <machine/proc.h>
#include <machine/reg.h>
#include <machine/sigframe.h>
#include <machine/specialreg.h>
#ifdef PERFMON
#include <machine/perfmon.h>
#endif
#include <machine/tss.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#ifdef DEV_ATPIC
#include <x86/isa/icu.h>
#else
#include <machine/apicvar.h>
#endif
#include <isa/isareg.h>
#include <isa/rtc.h>
/* Sanity check for __curthread() */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
extern u_int64_t hammer_time(u_int64_t, u_int64_t);
extern void printcpuinfo(void); /* XXX header file */
extern void identify_cpu(void);
extern void panicifcpuunsupported(void);
#define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
#define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
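/*
 * Illustrative reading of the two checks above: CS_SECURE() accepts a
 * %cs selector only if its requested privilege level is SEL_UPL (user),
 * and EFL_SECURE() accepts a new rflags value only if every bit that
 * differs from the old value lies within PSL_USERCHANGE.  A user
 * toggling an arithmetic flag such as PSL_C therefore passes, while an
 * attempt to raise the IOPL field does not.
 */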
static void cpu_startup(void *);
static void get_fpcontext(struct thread *td, mcontext_t *mcp);
static int set_fpcontext(struct thread *td, const mcontext_t *mcp);
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
/*
* The file "conf/ldscript.amd64" defines the symbol "kernphys". Its value is
* the physical address at which the kernel is loaded.
*/
extern char kernphys[];
#ifdef DDB
extern vm_offset_t ksym_start, ksym_end;
#endif
struct msgbuf *msgbufp;
/* Intel ICH registers */
#define ICH_PMBASE 0x400
#define ICH_SMI_EN (ICH_PMBASE + 0x30)
int _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
int cold = 1;
long Maxmem = 0;
long realmem = 0;
/*
* The number of PHYSMAP entries must be one less than the number of
* PHYSSEG entries because the PHYSMAP entry that spans the largest
* physical address that is accessible by ISA DMA is split into two
* PHYSSEG entries.
*/
#define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1))
vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];
/* must be 2 less so 0 0 can signal end of chunks */
#define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2)
#define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2)
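/*
 * For illustration, phys_avail[] and dump_avail[] hold base/bound pairs
 * terminated by a 0, 0 pair, e.g. { start0, end0, start1, end1, 0, 0 };
 * consumers walk them until phys_avail[i + 1] == 0, which is why the
 * *_ARRAY_END macros above stop two entries short of the array size.
 */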
struct kva_md_info kmi;
static struct trapframe proc0_tf;
struct region_descriptor r_gdt, r_idt;
struct pcpu __pcpu[MAXCPU];
struct mtx icu_lock;
struct mtx dt_lock; /* lock for GDT and LDT */
static void
cpu_startup(dummy)
void *dummy;
{
uintmax_t memsize;
char *sysenv;
/*
* On MacBooks, we need to prevent the legacy USB circuit from
* generating an SMI# because this can cause several problems,
* namely: incorrect CPU frequency detection and failure to
* start the APs.
* We do this by disabling a bit in the SMI_EN (SMI Control and
* Enable register) of the Intel ICH LPC Interface Bridge.
*/
sysenv = getenv("smbios.system.product");
if (sysenv != NULL) {
if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
strncmp(sysenv, "MacBook3,1", 10) == 0 ||
strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
strncmp(sysenv, "Macmini1,1", 10) == 0) {
if (bootverbose)
printf("Disabling LEGACY_USB_EN bit on "
"Intel ICH.\n");
outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
}
freeenv(sysenv);
}
/*
* Good {morning,afternoon,evening,night}.
*/
startrtclock();
printcpuinfo();
panicifcpuunsupported();
#ifdef PERFMON
perfmon_init();
#endif
realmem = Maxmem;
/*
* Display physical memory if SMBIOS reports a reasonable amount.
*/
memsize = 0;
sysenv = getenv("smbios.memory.enabled");
if (sysenv != NULL) {
memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
freeenv(sysenv);
}
if (memsize < ptoa((uintmax_t)cnt.v_free_count))
memsize = ptoa((uintmax_t)Maxmem);
printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20);
/*
* Display any holes after the first chunk of extended memory.
*/
if (bootverbose) {
int indx;
printf("Physical memory chunk(s):\n");
for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
vm_paddr_t size;
size = phys_avail[indx + 1] - phys_avail[indx];
printf(
"0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
(uintmax_t)phys_avail[indx],
(uintmax_t)phys_avail[indx + 1] - 1,
(uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
}
}
vm_ksubmap_init(&kmi);
printf("avail memory = %ju (%ju MB)\n",
ptoa((uintmax_t)cnt.v_free_count),
ptoa((uintmax_t)cnt.v_free_count) / 1048576);
/*
* Set up buffers, so they can be used to read disk labels.
*/
bufinit();
vm_pager_bufferinit();
cpu_setregs();
}
/*
* Send an interrupt to process.
*
* Stack is set up to allow sigcode stored
* at top to call routine, followed by call
* to sigreturn routine below. After sigreturn
* resets the signal mask, the stack, and the
* frame pointer, it returns to the user
* specified pc, psl.
*/
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
struct sigframe sf, *sfp;
struct pcb *pcb;
struct proc *p;
struct thread *td;
struct sigacts *psp;
char *sp;
struct trapframe *regs;
int sig;
int oonstack;
td = curthread;
pcb = td->td_pcb;
p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
sig = ksi->ksi_signo;
psp = p->p_sigacts;
mtx_assert(&psp->ps_mtx, MA_OWNED);
regs = td->td_frame;
oonstack = sigonstack(regs->tf_rsp);
/* Save user context. */
bzero(&sf, sizeof(sf));
sf.sf_uc.uc_sigmask = *mask;
sf.sf_uc.uc_stack = td->td_sigstk;
sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
get_fpcontext(td, &sf.sf_uc.uc_mcontext);
fpstate_drop(td);
sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
bzero(sf.sf_uc.uc_mcontext.mc_spare,
sizeof(sf.sf_uc.uc_mcontext.mc_spare));
bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
/* Allocate space for the signal handler context. */
if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
SIGISMEMBER(psp->ps_sigonstack, sig)) {
sp = td->td_sigstk.ss_sp +
td->td_sigstk.ss_size - sizeof(struct sigframe);
#if defined(COMPAT_43)
td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
} else
sp = (char *)regs->tf_rsp - sizeof(struct sigframe) - 128;
/* Align to 16 bytes. */
sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);
/* Translate the signal if appropriate. */
if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
/* Build the argument list for the signal handler. */
regs->tf_rdi = sig; /* arg 1 in %rdi */
regs->tf_rdx = (register_t)&sfp->sf_uc; /* arg 3 in %rdx */
bzero(&sf.sf_si, sizeof(sf.sf_si));
if (SIGISMEMBER(psp->ps_siginfo, sig)) {
/* Signal handler installed with SA_SIGINFO. */
regs->tf_rsi = (register_t)&sfp->sf_si; /* arg 2 in %rsi */
sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
/* Fill in POSIX parts */
sf.sf_si = ksi->ksi_info;
sf.sf_si.si_signo = sig; /* maybe a translated signal */
regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
} else {
/* Old FreeBSD-style arguments. */
regs->tf_rsi = ksi->ksi_code; /* arg 2 in %rsi */
regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
sf.sf_ahu.sf_handler = catcher;
}
mtx_unlock(&psp->ps_mtx);
PROC_UNLOCK(p);
/*
* Copy the sigframe out to the user's stack.
*/
if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
#ifdef DEBUG
printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
PROC_LOCK(p);
sigexit(td, SIGILL);
}
regs->tf_rsp = (long)sfp;
regs->tf_rip = p->p_sysent->sv_sigcode_base;
regs->tf_rflags &= ~(PSL_T | PSL_D);
regs->tf_cs = _ucodesel;
regs->tf_ds = _udatasel;
regs->tf_es = _udatasel;
regs->tf_fs = _ufssel;
regs->tf_gs = _ugssel;
regs->tf_flags = TF_HASSEGS;
set_pcb_flags(pcb, PCB_FULL_IRET);
PROC_LOCK(p);
mtx_lock(&psp->ps_mtx);
}
/*
* System call to cleanup state after a signal
* has been taken. Reset signal mask and
* stack state from context left by sendsig (above).
* Return to previous pc and psl as specified by
* context left by sendsig. Check carefully to
* make sure that the user has not modified the
* state to gain improper privileges.
*
* MPSAFE
*/
int
-sigreturn(td, uap)
+sys_sigreturn(td, uap)
struct thread *td;
struct sigreturn_args /* {
const struct __ucontext *sigcntxp;
} */ *uap;
{
ucontext_t uc;
struct pcb *pcb;
struct proc *p;
struct trapframe *regs;
ucontext_t *ucp;
long rflags;
int cs, error, ret;
ksiginfo_t ksi;
pcb = td->td_pcb;
p = td->td_proc;
error = copyin(uap->sigcntxp, &uc, sizeof(uc));
if (error != 0) {
uprintf("pid %d (%s): sigreturn copyin failed\n",
p->p_pid, td->td_name);
return (error);
}
ucp = &uc;
if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
td->td_name, ucp->uc_mcontext.mc_flags);
return (EINVAL);
}
regs = td->td_frame;
rflags = ucp->uc_mcontext.mc_rflags;
/*
* Don't allow users to change privileged or reserved flags.
*/
/*
* XXX do allow users to change the privileged flag PSL_RF.
* The cpu sets PSL_RF in tf_rflags for faults. Debuggers
* should sometimes set it there too. tf_rflags is kept in
* the signal context during signal handling and there is no
* other place to remember it, so the PSL_RF bit may be
* corrupted by the signal handler without us knowing.
* Corruption of the PSL_RF bit at worst causes one more or
* one less debugger trap, so allowing it is fairly harmless.
*/
if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) {
uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
td->td_name, rflags);
return (EINVAL);
}
/*
* Don't allow users to load a valid privileged %cs. Let the
* hardware check for invalid selectors, excess privilege in
* other selectors, invalid %eip's and invalid %esp's.
*/
cs = ucp->uc_mcontext.mc_cs;
if (!CS_SECURE(cs)) {
uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
td->td_name, cs);
ksiginfo_init_trap(&ksi);
ksi.ksi_signo = SIGBUS;
ksi.ksi_code = BUS_OBJERR;
ksi.ksi_trapno = T_PROTFLT;
ksi.ksi_addr = (void *)regs->tf_rip;
trapsignal(td, &ksi);
return (EINVAL);
}
ret = set_fpcontext(td, &ucp->uc_mcontext);
if (ret != 0) {
uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
p->p_pid, td->td_name, ret);
return (ret);
}
bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;
#if defined(COMPAT_43)
if (ucp->uc_mcontext.mc_onstack & 1)
td->td_sigstk.ss_flags |= SS_ONSTACK;
else
td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif
kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
set_pcb_flags(pcb, PCB_FULL_IRET);
return (EJUSTRETURN);
}
#ifdef COMPAT_FREEBSD4
int
freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
{
- return sigreturn(td, (struct sigreturn_args *)uap);
+ return sys_sigreturn(td, (struct sigreturn_args *)uap);
}
#endif
/*
* Machine dependent boot() routine
*
* I haven't seen anything to put here yet
* Possibly some stuff might be grafted back here from boot()
*/
void
cpu_boot(int howto)
{
}
/*
* Flush the D-cache for non-DMA I/O so that the I-cache can
* be made coherent later.
*/
void
cpu_flush_dcache(void *ptr, size_t len)
{
/* Not applicable */
}
/* Get current clock frequency for the given cpu id. */
int
cpu_est_clockrate(int cpu_id, uint64_t *rate)
{
uint64_t tsc1, tsc2;
uint64_t acnt, mcnt, perf;
register_t reg;
if (pcpu_find(cpu_id) == NULL || rate == NULL)
return (EINVAL);
/*
* If TSC is P-state invariant and APERF/MPERF MSRs do not exist,
* DELAY(9) based logic fails.
*/
if (tsc_is_invariant && !tsc_perf_stat)
return (EOPNOTSUPP);
#ifdef SMP
if (smp_cpus > 1) {
/* Schedule ourselves on the indicated cpu. */
thread_lock(curthread);
sched_bind(curthread, cpu_id);
thread_unlock(curthread);
}
#endif
/* Calibrate by measuring a short delay. */
reg = intr_disable();
if (tsc_is_invariant) {
wrmsr(MSR_MPERF, 0);
wrmsr(MSR_APERF, 0);
tsc1 = rdtsc();
DELAY(1000);
mcnt = rdmsr(MSR_MPERF);
acnt = rdmsr(MSR_APERF);
tsc2 = rdtsc();
intr_restore(reg);
perf = 1000 * acnt / mcnt;
*rate = (tsc2 - tsc1) * perf;
} else {
tsc1 = rdtsc();
DELAY(1000);
tsc2 = rdtsc();
intr_restore(reg);
*rate = (tsc2 - tsc1) * 1000;
}
#ifdef SMP
if (smp_cpus > 1) {
thread_lock(curthread);
sched_unbind(curthread);
thread_unlock(curthread);
}
#endif
return (0);
}
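/*
 * Rough worked example of the estimate above, with assumed numbers: a
 * TSC delta of 2,400,000 over the 1000 us DELAY() window with
 * acnt == mcnt gives perf = 1000 and *rate = 2,400,000 * 1000 = 2.4e9,
 * i.e. an estimated clock of roughly 2.4 GHz.
 */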
/*
* Shutdown the CPU as much as possible
*/
void
cpu_halt(void)
{
for (;;)
__asm__ ("hlt");
}
void (*cpu_idle_hook)(void) = NULL; /* ACPI idle hook. */
static int cpu_ident_amdc1e = 0; /* AMD C1E supported. */
static int idle_mwait = 1; /* Use MONITOR/MWAIT for short idle. */
TUNABLE_INT("machdep.idle_mwait", &idle_mwait);
SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RW, &idle_mwait,
0, "Use MONITOR/MWAIT for short idle");
#define STATE_RUNNING 0x0
#define STATE_MWAIT 0x1
#define STATE_SLEEPING 0x2
static void
cpu_idle_acpi(int busy)
{
int *state;
state = (int *)PCPU_PTR(monitorbuf);
*state = STATE_SLEEPING;
disable_intr();
if (sched_runnable())
enable_intr();
else if (cpu_idle_hook)
cpu_idle_hook();
else
__asm __volatile("sti; hlt");
*state = STATE_RUNNING;
}
static void
cpu_idle_hlt(int busy)
{
int *state;
state = (int *)PCPU_PTR(monitorbuf);
*state = STATE_SLEEPING;
/*
* We must absolutely guarantee that hlt is the next instruction
* after sti or we introduce a timing window.
*/
disable_intr();
if (sched_runnable())
enable_intr();
else
__asm __volatile("sti; hlt");
*state = STATE_RUNNING;
}
/*
* MWAIT cpu power states. Lower 4 bits are sub-states.
*/
#define MWAIT_C0 0xf0
#define MWAIT_C1 0x00
#define MWAIT_C2 0x10
#define MWAIT_C3 0x20
#define MWAIT_C4 0x30
static void
cpu_idle_mwait(int busy)
{
int *state;
state = (int *)PCPU_PTR(monitorbuf);
*state = STATE_MWAIT;
if (!sched_runnable()) {
cpu_monitor(state, 0, 0);
if (*state == STATE_MWAIT)
cpu_mwait(0, MWAIT_C1);
}
*state = STATE_RUNNING;
}
static void
cpu_idle_spin(int busy)
{
int *state;
int i;
state = (int *)PCPU_PTR(monitorbuf);
*state = STATE_RUNNING;
for (i = 0; i < 1000; i++) {
if (sched_runnable())
return;
cpu_spinwait();
}
}
/*
* C1E renders the local APIC timer dead, so we disable it by
* reading the Interrupt Pending Message register and clearing
* both C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27).
*
* Reference:
* "BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh Processors"
* #32559 revision 3.00+
*/
#define MSR_AMDK8_IPM 0xc0010055
#define AMDK8_SMIONCMPHALT (1ULL << 27)
#define AMDK8_C1EONCMPHALT (1ULL << 28)
#define AMDK8_CMPHALT (AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT)
static void
cpu_probe_amdc1e(void)
{
/*
* Detect the presence of C1E capability mostly on latest
* dual-cores (or future) k8 family.
*/
if (cpu_vendor_id == CPU_VENDOR_AMD &&
(cpu_id & 0x00000f00) == 0x00000f00 &&
(cpu_id & 0x0fff0000) >= 0x00040000) {
cpu_ident_amdc1e = 1;
}
}
void (*cpu_idle_fn)(int) = cpu_idle_acpi;
void
cpu_idle(int busy)
{
uint64_t msr;
CTR2(KTR_SPARE2, "cpu_idle(%d) at %d",
busy, curcpu);
#ifdef MP_WATCHDOG
ap_watchdog(PCPU_GET(cpuid));
#endif
/* If we are busy - try to use fast methods. */
if (busy) {
if ((cpu_feature2 & CPUID2_MON) && idle_mwait) {
cpu_idle_mwait(busy);
goto out;
}
}
/* If we have time - switch timers into idle mode. */
if (!busy) {
critical_enter();
cpu_idleclock();
}
/* Apply AMD APIC timer C1E workaround. */
if (cpu_ident_amdc1e && cpu_disable_deep_sleep) {
msr = rdmsr(MSR_AMDK8_IPM);
if (msr & AMDK8_CMPHALT)
wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT);
}
/* Call main idle method. */
cpu_idle_fn(busy);
/* Switch timers back into active mode. */
if (!busy) {
cpu_activeclock();
critical_exit();
}
out:
CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done",
busy, curcpu);
}
int
cpu_idle_wakeup(int cpu)
{
struct pcpu *pcpu;
int *state;
pcpu = pcpu_find(cpu);
state = (int *)pcpu->pc_monitorbuf;
/*
* This doesn't need to be atomic since missing the race will
* simply result in unnecessary IPIs.
*/
if (*state == STATE_SLEEPING)
return (0);
if (*state == STATE_MWAIT)
*state = STATE_RUNNING;
return (1);
}
/*
* Ordered by speed/power consumption.
*/
struct {
void *id_fn;
char *id_name;
} idle_tbl[] = {
{ cpu_idle_spin, "spin" },
{ cpu_idle_mwait, "mwait" },
{ cpu_idle_hlt, "hlt" },
{ cpu_idle_acpi, "acpi" },
{ NULL, NULL }
};
static int
idle_sysctl_available(SYSCTL_HANDLER_ARGS)
{
char *avail, *p;
int error;
int i;
avail = malloc(256, M_TEMP, M_WAITOK);
p = avail;
for (i = 0; idle_tbl[i].id_name != NULL; i++) {
if (strstr(idle_tbl[i].id_name, "mwait") &&
(cpu_feature2 & CPUID2_MON) == 0)
continue;
if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
cpu_idle_hook == NULL)
continue;
p += sprintf(p, "%s%s", p != avail ? ", " : "",
idle_tbl[i].id_name);
}
error = sysctl_handle_string(oidp, avail, 0, req);
free(avail, M_TEMP);
return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD,
0, 0, idle_sysctl_available, "A", "list of available idle functions");
static int
idle_sysctl(SYSCTL_HANDLER_ARGS)
{
char buf[16];
int error;
char *p;
int i;
p = "unknown";
for (i = 0; idle_tbl[i].id_name != NULL; i++) {
if (idle_tbl[i].id_fn == cpu_idle_fn) {
p = idle_tbl[i].id_name;
break;
}
}
strncpy(buf, p, sizeof(buf));
error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
if (error != 0 || req->newptr == NULL)
return (error);
for (i = 0; idle_tbl[i].id_name != NULL; i++) {
if (strstr(idle_tbl[i].id_name, "mwait") &&
(cpu_feature2 & CPUID2_MON) == 0)
continue;
if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
cpu_idle_hook == NULL)
continue;
if (strcmp(idle_tbl[i].id_name, buf))
continue;
cpu_idle_fn = idle_tbl[i].id_fn;
return (0);
}
return (EINVAL);
}
SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0,
idle_sysctl, "A", "currently selected idle function");
/*
* Reset registers to default values on exec.
*/
void
exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
struct trapframe *regs = td->td_frame;
struct pcb *pcb = td->td_pcb;
mtx_lock(&dt_lock);
if (td->td_proc->p_md.md_ldt != NULL)
user_ldt_free(td);
else
mtx_unlock(&dt_lock);
pcb->pcb_fsbase = 0;
pcb->pcb_gsbase = 0;
clear_pcb_flags(pcb, PCB_32BIT | PCB_GS32BIT);
pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;
set_pcb_flags(pcb, PCB_FULL_IRET);
bzero((char *)regs, sizeof(struct trapframe));
regs->tf_rip = imgp->entry_addr;
regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
regs->tf_rdi = stack; /* argv */
regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
regs->tf_ss = _udatasel;
regs->tf_cs = _ucodesel;
regs->tf_ds = _udatasel;
regs->tf_es = _udatasel;
regs->tf_fs = _ufssel;
regs->tf_gs = _ugssel;
regs->tf_flags = TF_HASSEGS;
td->td_retval[1] = 0;
/*
* Reset the hardware debug registers if they were in use.
* They won't have any meaning for the newly exec'd process.
*/
if (pcb->pcb_flags & PCB_DBREGS) {
pcb->pcb_dr0 = 0;
pcb->pcb_dr1 = 0;
pcb->pcb_dr2 = 0;
pcb->pcb_dr3 = 0;
pcb->pcb_dr6 = 0;
pcb->pcb_dr7 = 0;
if (pcb == PCPU_GET(curpcb)) {
/*
* Clear the debug registers on the running
* CPU, otherwise they will end up affecting
* the next process we switch to.
*/
reset_dbregs();
}
clear_pcb_flags(pcb, PCB_DBREGS);
}
/*
* Drop the FP state if we hold it, so that the process gets a
* clean FP state if it uses the FPU again.
*/
fpstate_drop(td);
}
void
cpu_setregs(void)
{
register_t cr0;
cr0 = rcr0();
/*
* CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
* BSP. See the comments there about why we set them.
*/
cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
load_cr0(cr0);
}
/*
* Initialize amd64 and configure to run kernel
*/
/*
* Initialize segments & interrupt table
*/
struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor tables */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */
static char dblfault_stack[PAGE_SIZE] __aligned(16);
static char nmi0_stack[PAGE_SIZE] __aligned(16);
CTASSERT(sizeof(struct nmi_pcpu) == 16);
struct amd64tss common_tss[MAXCPU];
/*
* Software prototypes -- in more palatable form.
*
* Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
* slots as corresponding segments for i386 kernel.
*/
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL 0 Null Descriptor */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_long = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GNULL2_SEL 1 Null Descriptor */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_long = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GUFS32_SEL 2 32 bit %fs Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_long = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GUGS32_SEL 3 32 bit %gs Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_long = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GCODE_SEL 4 Code Descriptor for kernel */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = SEL_KPL,
.ssd_p = 1,
.ssd_long = 1,
.ssd_def32 = 0,
.ssd_gran = 1 },
/* GDATA_SEL 5 Data Descriptor for kernel */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_KPL,
.ssd_p = 1,
.ssd_long = 1,
.ssd_def32 = 0,
.ssd_gran = 1 },
/* GUCODE32_SEL 6 32 bit Code Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_long = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GUDATA_SEL 7 32/64 bit Data Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_long = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GUCODE_SEL 8 64 bit Code Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_long = 1,
.ssd_def32 = 0,
.ssd_gran = 1 },
/* GPROC0_SEL 9 Proc 0 Tss Descriptor */
{ .ssd_base = 0x0,
.ssd_limit = sizeof(struct amd64tss) + IOPAGES * PAGE_SIZE - 1,
.ssd_type = SDT_SYSTSS,
.ssd_dpl = SEL_KPL,
.ssd_p = 1,
.ssd_long = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* Actually, the TSS is a system descriptor which is double size */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_long = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GUSERLDT_SEL 11 LDT Descriptor */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_long = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GUSERLDT_SEL 12 LDT Descriptor, double size */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_long = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
};
void
setidt(idx, func, typ, dpl, ist)
int idx;
inthand_t *func;
int typ;
int dpl;
int ist;
{
struct gate_descriptor *ip;
ip = idt + idx;
ip->gd_looffset = (uintptr_t)func;
ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
ip->gd_ist = ist;
ip->gd_xx = 0;
ip->gd_type = typ;
ip->gd_dpl = dpl;
ip->gd_p = 1;
ip->gd_hioffset = ((uintptr_t)func)>>16 ;
}
extern inthand_t
IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
IDTVEC(xmm), IDTVEC(dblfault),
#ifdef KDTRACE_HOOKS
IDTVEC(dtrace_ret),
#endif
IDTVEC(fast_syscall), IDTVEC(fast_syscall32);
#ifdef DDB
/*
* Display the index and function name of any IDT entries that don't use
* the default 'rsvd' entry point.
*/
DB_SHOW_COMMAND(idt, db_show_idt)
{
struct gate_descriptor *ip;
int idx;
uintptr_t func;
ip = idt;
for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
if (func != (uintptr_t)&IDTVEC(rsvd)) {
db_printf("%3d\t", idx);
db_printsym(func, DB_STGY_PROC);
db_printf("\n");
}
ip++;
}
}
#endif
void
sdtossd(sd, ssd)
struct user_segment_descriptor *sd;
struct soft_segment_descriptor *ssd;
{
ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
ssd->ssd_type = sd->sd_type;
ssd->ssd_dpl = sd->sd_dpl;
ssd->ssd_p = sd->sd_p;
ssd->ssd_long = sd->sd_long;
ssd->ssd_def32 = sd->sd_def32;
ssd->ssd_gran = sd->sd_gran;
}
void
ssdtosd(ssd, sd)
struct soft_segment_descriptor *ssd;
struct user_segment_descriptor *sd;
{
sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
sd->sd_type = ssd->ssd_type;
sd->sd_dpl = ssd->ssd_dpl;
sd->sd_p = ssd->ssd_p;
sd->sd_long = ssd->ssd_long;
sd->sd_def32 = ssd->ssd_def32;
sd->sd_gran = ssd->ssd_gran;
}
void
ssdtosyssd(ssd, sd)
struct soft_segment_descriptor *ssd;
struct system_segment_descriptor *sd;
{
sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
sd->sd_type = ssd->ssd_type;
sd->sd_dpl = ssd->ssd_dpl;
sd->sd_p = ssd->ssd_p;
sd->sd_gran = ssd->ssd_gran;
}
#if !defined(DEV_ATPIC) && defined(DEV_ISA)
#include <isa/isavar.h>
#include <isa/isareg.h>
/*
* Return a bitmap of the current interrupt requests. This is 8259-specific
* and is only suitable for use at probe time.
* This is only here to pacify sio. It is NOT FATAL if this doesn't work.
* It shouldn't be here. There should probably be an APIC centric
* implementation in the apic driver code, if at all.
*/
intrmask_t
isa_irq_pending(void)
{
u_char irr1;
u_char irr2;
irr1 = inb(IO_ICU1);
irr2 = inb(IO_ICU2);
return ((irr2 << 8) | irr1);
}
#endif
u_int basemem;
static int
add_smap_entry(struct bios_smap *smap, vm_paddr_t *physmap, int *physmap_idxp)
{
int i, insert_idx, physmap_idx;
physmap_idx = *physmap_idxp;
if (boothowto & RB_VERBOSE)
printf("SMAP type=%02x base=%016lx len=%016lx\n",
smap->type, smap->base, smap->length);
if (smap->type != SMAP_TYPE_MEMORY)
return (1);
if (smap->length == 0)
return (0);
/*
* Find insertion point while checking for overlap. Start off by
* assuming the new entry will be added to the end.
*/
insert_idx = physmap_idx + 2;
for (i = 0; i <= physmap_idx; i += 2) {
if (smap->base < physmap[i + 1]) {
if (smap->base + smap->length <= physmap[i]) {
insert_idx = i;
break;
}
if (boothowto & RB_VERBOSE)
printf(
"Overlapping memory regions, ignoring second region\n");
return (1);
}
}
/* See if we can prepend to the next entry. */
if (insert_idx <= physmap_idx &&
smap->base + smap->length == physmap[insert_idx]) {
physmap[insert_idx] = smap->base;
return (1);
}
/* See if we can append to the previous entry. */
if (insert_idx > 0 && smap->base == physmap[insert_idx - 1]) {
physmap[insert_idx - 1] += smap->length;
return (1);
}
physmap_idx += 2;
*physmap_idxp = physmap_idx;
if (physmap_idx == PHYSMAP_SIZE) {
printf(
"Too many segments in the physical address map, giving up\n");
return (0);
}
/*
* Move the last 'N' entries down to make room for the new
* entry if needed.
*/
for (i = physmap_idx; i > insert_idx; i -= 2) {
physmap[i] = physmap[i - 2];
physmap[i + 1] = physmap[i - 1];
}
/* Insert the new entry. */
physmap[insert_idx] = smap->base;
physmap[insert_idx + 1] = smap->base + smap->length;
return (1);
}
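/*
 * To illustrate the cases above with hypothetical addresses: if physmap
 * already holds the pair { 0x100000, 0x200000 }, an SMAP entry covering
 * 0x200000-0x240000 extends that pair's bound (append to the previous
 * entry), one covering 0xc0000-0x100000 lowers its base (prepend to the
 * next entry), and a disjoint entry such as 0x400000-0x500000 is
 * inserted as a new base/bound pair of its own.
 */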
/*
* Populate the (physmap) array with base/bound pairs describing the
* available physical memory in the system, then test this memory and
* build the phys_avail array describing the actually-available memory.
*
* Total memory size may be set by the kernel environment variable
* hw.physmem or the compile-time define MAXMEM.
*
* XXX first should be vm_paddr_t.
*/
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
int i, physmap_idx, pa_indx, da_indx;
vm_paddr_t pa, physmap[PHYSMAP_SIZE];
u_long physmem_tunable, memtest;
pt_entry_t *pte;
struct bios_smap *smapbase, *smap, *smapend;
u_int32_t smapsize;
quad_t dcons_addr, dcons_size;
bzero(physmap, sizeof(physmap));
basemem = 0;
physmap_idx = 0;
/*
* get memory map from INT 15:E820, kindly supplied by the loader.
*
* subr_module.c says:
* "Consumer may safely assume that size value precedes data."
* ie: an int32_t immediately precedes smap.
*/
smapbase = (struct bios_smap *)preload_search_info(kmdp,
MODINFO_METADATA | MODINFOMD_SMAP);
if (smapbase == NULL)
panic("No BIOS smap info from loader!");
smapsize = *((u_int32_t *)smapbase - 1);
smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
for (smap = smapbase; smap < smapend; smap++)
if (!add_smap_entry(smap, physmap, &physmap_idx))
break;
/*
* Find the 'base memory' segment for SMP
*/
basemem = 0;
for (i = 0; i <= physmap_idx; i += 2) {
if (physmap[i] == 0x00000000) {
basemem = physmap[i + 1] / 1024;
break;
}
}
if (basemem == 0)
panic("BIOS smap did not include a basemem segment!");
#ifdef SMP
/* make hole for AP bootstrap code */
physmap[1] = mp_bootaddress(physmap[1] / 1024);
#endif
/*
* Maxmem isn't the "maximum memory", it's one larger than the
* highest page of the physical address space. It should be
* called something like "Maxphyspage". We may adjust this
* based on ``hw.physmem'' and the results of the memory test.
*/
Maxmem = atop(physmap[physmap_idx + 1]);
#ifdef MAXMEM
Maxmem = MAXMEM / 4;
#endif
if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
Maxmem = atop(physmem_tunable);
/*
* By default keep the memtest enabled. Use a general name so that
* one could eventually do more with the code than just disable it.
*/
memtest = 1;
TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
/*
* Don't allow MAXMEM or hw.physmem to extend the amount of memory
* in the system.
*/
if (Maxmem > atop(physmap[physmap_idx + 1]))
Maxmem = atop(physmap[physmap_idx + 1]);
if (atop(physmap[physmap_idx + 1]) != Maxmem &&
(boothowto & RB_VERBOSE))
printf("Physical memory use set to %ldK\n", Maxmem * 4);
/* call pmap initialization to make new kernel address space */
pmap_bootstrap(&first);
/*
* Size up each available chunk of physical memory.
*/
physmap[0] = PAGE_SIZE; /* mask off page 0 */
pa_indx = 0;
da_indx = 1;
phys_avail[pa_indx++] = physmap[0];
phys_avail[pa_indx] = physmap[0];
dump_avail[da_indx] = physmap[0];
pte = CMAP1;
/*
* Get dcons buffer address
*/
if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
getenv_quad("dcons.size", &dcons_size) == 0)
dcons_addr = 0;
/*
* physmap is in bytes, so when converting to page boundaries,
* round up the start address and round down the end address.
*/
for (i = 0; i <= physmap_idx; i += 2) {
vm_paddr_t end;
end = ptoa((vm_paddr_t)Maxmem);
if (physmap[i + 1] < end)
end = trunc_page(physmap[i + 1]);
for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
int tmp, page_bad, full;
int *ptr = (int *)CADDR1;
full = FALSE;
/*
* block out kernel memory as not available.
*/
if (pa >= (vm_paddr_t)kernphys && pa < first)
goto do_dump_avail;
/*
* block out dcons buffer
*/
if (dcons_addr > 0
&& pa >= trunc_page(dcons_addr)
&& pa < dcons_addr + dcons_size)
goto do_dump_avail;
page_bad = FALSE;
if (memtest == 0)
goto skip_memtest;
/*
* map page into kernel: valid, read/write, non-cacheable
*/
*pte = pa | PG_V | PG_RW | PG_N;
invltlb();
tmp = *(int *)ptr;
/*
* Test for alternating 1's and 0's
*/
*(volatile int *)ptr = 0xaaaaaaaa;
if (*(volatile int *)ptr != 0xaaaaaaaa)
page_bad = TRUE;
/*
* Test for alternating 0's and 1's
*/
*(volatile int *)ptr = 0x55555555;
if (*(volatile int *)ptr != 0x55555555)
page_bad = TRUE;
/*
* Test for all 1's
*/
*(volatile int *)ptr = 0xffffffff;
if (*(volatile int *)ptr != 0xffffffff)
page_bad = TRUE;
/*
* Test for all 0's
*/
*(volatile int *)ptr = 0x0;
if (*(volatile int *)ptr != 0x0)
page_bad = TRUE;
/*
* Restore original value.
*/
*(int *)ptr = tmp;
skip_memtest:
/*
* Adjust array of valid/good pages.
*/
if (page_bad == TRUE)
continue;
/*
* If this good page is a continuation of the
* previous set of good pages, then just increase
* the end pointer. Otherwise start a new chunk.
* Note that "end" points one higher than end,
* making the range >= start and < end.
* If we're also doing a speculative memory
* test and we at or past the end, bump up Maxmem
* so that we keep going. The first bad page
* will terminate the loop.
*/
if (phys_avail[pa_indx] == pa) {
phys_avail[pa_indx] += PAGE_SIZE;
} else {
pa_indx++;
if (pa_indx == PHYS_AVAIL_ARRAY_END) {
printf(
"Too many holes in the physical address space, giving up\n");
pa_indx--;
full = TRUE;
goto do_dump_avail;
}
phys_avail[pa_indx++] = pa; /* start */
phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
}
physmem++;
do_dump_avail:
if (dump_avail[da_indx] == pa) {
dump_avail[da_indx] += PAGE_SIZE;
} else {
da_indx++;
if (da_indx == DUMP_AVAIL_ARRAY_END) {
da_indx--;
goto do_next;
}
dump_avail[da_indx++] = pa; /* start */
dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
}
do_next:
if (full)
break;
}
}
*pte = 0;
invltlb();
/*
* XXX
* The last chunk must contain at least one page plus the message
* buffer to avoid complicating other code (message buffer address
* calculation, etc.).
*/
while (phys_avail[pa_indx - 1] + PAGE_SIZE +
round_page(msgbufsize) >= phys_avail[pa_indx]) {
physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
phys_avail[pa_indx--] = 0;
phys_avail[pa_indx--] = 0;
}
Maxmem = atop(phys_avail[pa_indx]);
/* Trim off space for the message buffer. */
phys_avail[pa_indx] -= round_page(msgbufsize);
/* Map the message buffer. */
msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
}
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
caddr_t kmdp;
int gsel_tss, x;
struct pcpu *pc;
struct nmi_pcpu *np;
u_int64_t msr;
char *env;
size_t kstack0_sz;
thread0.td_kstack = physfree + KERNBASE;
thread0.td_kstack_pages = KSTACK_PAGES;
kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
bzero((void *)thread0.td_kstack, kstack0_sz);
physfree += kstack0_sz;
thread0.td_pcb = (struct pcb *)(thread0.td_kstack + kstack0_sz) - 1;
/*
* This may be done better later if it gets more high level
* components in it. If so just link td->td_proc here.
*/
proc_linkup0(&proc0, &thread0);
preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
preload_bootstrap_relocate(KERNBASE);
kmdp = preload_search_by_type("elf kernel");
if (kmdp == NULL)
kmdp = preload_search_by_type("elf64 kernel");
boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + KERNBASE;
#ifdef DDB
ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
#endif
/* Init basic tunables, hz etc */
init_param1();
/*
* make gdt memory segments
*/
for (x = 0; x < NGDT; x++) {
if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
ssdtosd(&gdt_segs[x], &gdt[x]);
}
gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0];
ssdtosyssd(&gdt_segs[GPROC0_SEL],
(struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
r_gdt.rd_base = (long) gdt;
lgdt(&r_gdt);
pc = &__pcpu[0];
wrmsr(MSR_FSBASE, 0); /* User value */
wrmsr(MSR_GSBASE, (u_int64_t)pc);
wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */
pcpu_init(pc, 0, sizeof(struct pcpu));
dpcpu_init((void *)(physfree + KERNBASE), 0);
physfree += DPCPU_SIZE;
PCPU_SET(prvspace, pc);
PCPU_SET(curthread, &thread0);
PCPU_SET(curpcb, thread0.td_pcb);
PCPU_SET(tssp, &common_tss[0]);
PCPU_SET(commontssp, &common_tss[0]);
PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
/*
* Initialize mutexes.
*
* icu_lock: in order to allow an interrupt to occur in a critical
* section, to set pcpu->ipending (etc...) properly, we
* must be able to get the icu lock, so it can't be
* under witness.
*/
mutex_init();
mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
/* exceptions */
for (x = 0; x < NIDT; x++)
setidt(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
setidt(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0);
setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 0);
setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
setidt(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0);
setidt(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0);
setidt(IDT_BR, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0);
setidt(IDT_UD, &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0);
setidt(IDT_NM, &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0);
setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0);
setidt(IDT_TS, &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0);
setidt(IDT_NP, &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0);
setidt(IDT_SS, &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0);
setidt(IDT_GP, &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0);
setidt(IDT_PF, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0);
setidt(IDT_MF, &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0);
setidt(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0);
setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 0);
setidt(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
r_idt.rd_limit = sizeof(idt0) - 1;
r_idt.rd_base = (long) idt;
lidt(&r_idt);
/*
* Initialize the i8254 before the console so that console
* initialization can use DELAY().
*/
i8254_init();
/*
* Initialize the console before we print anything out.
*/
cninit();
#ifdef DEV_ISA
#ifdef DEV_ATPIC
elcr_probe();
atpic_startup();
#else
/* Reset and mask the atpics and leave them shut down. */
atpic_reset();
/*
* Point the ICU spurious interrupt vectors at the APIC spurious
* interrupt handler.
*/
setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
#endif
#else
#error "have you forgotten the isa device?";
#endif
kdb_init();
#ifdef KDB
if (boothowto & RB_KDB)
kdb_enter(KDB_WHY_BOOTFLAGS,
"Boot flags requested debugger");
#endif
identify_cpu(); /* Final stage of CPU initialization */
initializecpu(); /* Initialize CPU registers */
initializecpucache();
/* make an initial tss so cpu can get interrupt stack on syscall! */
common_tss[0].tss_rsp0 = thread0.td_kstack +
kstack0_sz - sizeof(struct pcb);
/* Ensure the stack is aligned to 16 bytes */
common_tss[0].tss_rsp0 &= ~0xFul;
PCPU_SET(rsp0, common_tss[0].tss_rsp0);
/* doublefault stack space, runs on ist1 */
common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];
/*
* NMI stack, runs on ist2. The pcpu pointer is stored just
* above the start of the ist2 stack.
*/
np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1;
np->np_pcpu = (register_t) pc;
common_tss[0].tss_ist2 = (long) np;
/* Set the IO permission bitmap (empty due to tss seg limit) */
common_tss[0].tss_iobase = sizeof(struct amd64tss) +
IOPAGES * PAGE_SIZE;
gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
ltr(gsel_tss);
/* Set up the fast syscall stuff */
msr = rdmsr(MSR_EFER) | EFER_SCE;
wrmsr(MSR_EFER, msr);
wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
wrmsr(MSR_STAR, msr);
wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D);
getmemsize(kmdp, physfree);
init_param2(physmem);
/* now running on new page tables, configured, and u/iom is accessible */
msgbufinit(msgbufp, msgbufsize);
fpuinit();
/* transfer to user mode */
_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
_ugssel = GSEL(GUGS32_SEL, SEL_UPL);
load_ds(_udatasel);
load_es(_udatasel);
load_fs(_ufssel);
/* setup proc 0's pcb */
thread0.td_pcb->pcb_flags = 0;
thread0.td_pcb->pcb_cr3 = KPML4phys;
thread0.td_frame = &proc0_tf;
env = getenv("kernelname");
if (env != NULL)
strlcpy(kernelname, env, sizeof(kernelname));
#ifdef XENHVM
if (inw(0x10) == 0x49d2) {
if (bootverbose)
printf("Xen detected: disabling emulated block and network devices\n");
outw(0x10, 3);
}
#endif
cpu_probe_amdc1e();
/* Location of kernel stack for locore */
return ((u_int64_t)thread0.td_pcb);
}
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{
pcpu->pc_acpi_id = 0xffffffff;
}
void
spinlock_enter(void)
{
struct thread *td;
register_t flags;
td = curthread;
if (td->td_md.md_spinlock_count == 0) {
flags = intr_disable();
td->td_md.md_spinlock_count = 1;
td->td_md.md_saved_flags = flags;
} else
td->td_md.md_spinlock_count++;
critical_enter();
}
void
spinlock_exit(void)
{
struct thread *td;
register_t flags;
td = curthread;
critical_exit();
flags = td->td_md.md_saved_flags;
td->td_md.md_spinlock_count--;
if (td->td_md.md_spinlock_count == 0)
intr_restore(flags);
}
/*
* Construct a PCB from a trapframe. This is called from kdb_trap() where
* we want to start a backtrace from the function that caused us to enter
* the debugger. We have the context in the trapframe, but base the trace
* on the PCB. The PCB doesn't have to be perfect, as long as it contains
* enough for a backtrace.
*/
void
makectx(struct trapframe *tf, struct pcb *pcb)
{
pcb->pcb_r12 = tf->tf_r12;
pcb->pcb_r13 = tf->tf_r13;
pcb->pcb_r14 = tf->tf_r14;
pcb->pcb_r15 = tf->tf_r15;
pcb->pcb_rbp = tf->tf_rbp;
pcb->pcb_rbx = tf->tf_rbx;
pcb->pcb_rip = tf->tf_rip;
pcb->pcb_rsp = tf->tf_rsp;
}
int
ptrace_set_pc(struct thread *td, unsigned long addr)
{
td->td_frame->tf_rip = addr;
return (0);
}
int
ptrace_single_step(struct thread *td)
{
td->td_frame->tf_rflags |= PSL_T;
return (0);
}
int
ptrace_clear_single_step(struct thread *td)
{
td->td_frame->tf_rflags &= ~PSL_T;
return (0);
}
int
fill_regs(struct thread *td, struct reg *regs)
{
struct trapframe *tp;
tp = td->td_frame;
return (fill_frame_regs(tp, regs));
}
int
fill_frame_regs(struct trapframe *tp, struct reg *regs)
{
regs->r_r15 = tp->tf_r15;
regs->r_r14 = tp->tf_r14;
regs->r_r13 = tp->tf_r13;
regs->r_r12 = tp->tf_r12;
regs->r_r11 = tp->tf_r11;
regs->r_r10 = tp->tf_r10;
regs->r_r9 = tp->tf_r9;
regs->r_r8 = tp->tf_r8;
regs->r_rdi = tp->tf_rdi;
regs->r_rsi = tp->tf_rsi;
regs->r_rbp = tp->tf_rbp;
regs->r_rbx = tp->tf_rbx;
regs->r_rdx = tp->tf_rdx;
regs->r_rcx = tp->tf_rcx;
regs->r_rax = tp->tf_rax;
regs->r_rip = tp->tf_rip;
regs->r_cs = tp->tf_cs;
regs->r_rflags = tp->tf_rflags;
regs->r_rsp = tp->tf_rsp;
regs->r_ss = tp->tf_ss;
if (tp->tf_flags & TF_HASSEGS) {
regs->r_ds = tp->tf_ds;
regs->r_es = tp->tf_es;
regs->r_fs = tp->tf_fs;
regs->r_gs = tp->tf_gs;
} else {
regs->r_ds = 0;
regs->r_es = 0;
regs->r_fs = 0;
regs->r_gs = 0;
}
return (0);
}
int
set_regs(struct thread *td, struct reg *regs)
{
struct trapframe *tp;
register_t rflags;
tp = td->td_frame;
rflags = regs->r_rflags & 0xffffffff;
if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
return (EINVAL);
tp->tf_r15 = regs->r_r15;
tp->tf_r14 = regs->r_r14;
tp->tf_r13 = regs->r_r13;
tp->tf_r12 = regs->r_r12;
tp->tf_r11 = regs->r_r11;
tp->tf_r10 = regs->r_r10;
tp->tf_r9 = regs->r_r9;
tp->tf_r8 = regs->r_r8;
tp->tf_rdi = regs->r_rdi;
tp->tf_rsi = regs->r_rsi;
tp->tf_rbp = regs->r_rbp;
tp->tf_rbx = regs->r_rbx;
tp->tf_rdx = regs->r_rdx;
tp->tf_rcx = regs->r_rcx;
tp->tf_rax = regs->r_rax;
tp->tf_rip = regs->r_rip;
tp->tf_cs = regs->r_cs;
tp->tf_rflags = rflags;
tp->tf_rsp = regs->r_rsp;
tp->tf_ss = regs->r_ss;
if (0) { /* XXXKIB */
tp->tf_ds = regs->r_ds;
tp->tf_es = regs->r_es;
tp->tf_fs = regs->r_fs;
tp->tf_gs = regs->r_gs;
tp->tf_flags = TF_HASSEGS;
set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
}
return (0);
}
/* XXX check all this stuff! */
/* externalize from sv_xmm */
static void
fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
{
struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
struct envxmm *penv_xmm = &sv_xmm->sv_env;
int i;
/* pcb -> fpregs */
bzero(fpregs, sizeof(*fpregs));
/* FPU control/status */
penv_fpreg->en_cw = penv_xmm->en_cw;
penv_fpreg->en_sw = penv_xmm->en_sw;
penv_fpreg->en_tw = penv_xmm->en_tw;
penv_fpreg->en_opcode = penv_xmm->en_opcode;
penv_fpreg->en_rip = penv_xmm->en_rip;
penv_fpreg->en_rdp = penv_xmm->en_rdp;
penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;
/* FPU registers */
for (i = 0; i < 8; ++i)
bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);
/* SSE registers */
for (i = 0; i < 16; ++i)
bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
}
/* internalize from fpregs into sv_xmm */
static void
set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
{
struct envxmm *penv_xmm = &sv_xmm->sv_env;
struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
int i;
/* fpregs -> pcb */
/* FPU control/status */
penv_xmm->en_cw = penv_fpreg->en_cw;
penv_xmm->en_sw = penv_fpreg->en_sw;
penv_xmm->en_tw = penv_fpreg->en_tw;
penv_xmm->en_opcode = penv_fpreg->en_opcode;
penv_xmm->en_rip = penv_fpreg->en_rip;
penv_xmm->en_rdp = penv_fpreg->en_rdp;
penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;
/* FPU registers */
for (i = 0; i < 8; ++i)
bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);
/* SSE registers */
for (i = 0; i < 16; ++i)
bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
}
/* externalize from td->pcb */
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{
KASSERT(td == curthread || TD_IS_SUSPENDED(td),
("not suspended thread %p", td));
fpugetregs(td);
fill_fpregs_xmm(&td->td_pcb->pcb_user_save, fpregs);
return (0);
}
/* internalize to td->pcb */
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{
set_fpregs_xmm(fpregs, &td->td_pcb->pcb_user_save);
fpuuserinited(td);
return (0);
}
/*
* Get machine context.
*/
int
get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
{
struct pcb *pcb;
struct trapframe *tp;
pcb = td->td_pcb;
tp = td->td_frame;
PROC_LOCK(curthread->td_proc);
mcp->mc_onstack = sigonstack(tp->tf_rsp);
PROC_UNLOCK(curthread->td_proc);
mcp->mc_r15 = tp->tf_r15;
mcp->mc_r14 = tp->tf_r14;
mcp->mc_r13 = tp->tf_r13;
mcp->mc_r12 = tp->tf_r12;
mcp->mc_r11 = tp->tf_r11;
mcp->mc_r10 = tp->tf_r10;
mcp->mc_r9 = tp->tf_r9;
mcp->mc_r8 = tp->tf_r8;
mcp->mc_rdi = tp->tf_rdi;
mcp->mc_rsi = tp->tf_rsi;
mcp->mc_rbp = tp->tf_rbp;
mcp->mc_rbx = tp->tf_rbx;
mcp->mc_rcx = tp->tf_rcx;
mcp->mc_rflags = tp->tf_rflags;
if (flags & GET_MC_CLEAR_RET) {
mcp->mc_rax = 0;
mcp->mc_rdx = 0;
mcp->mc_rflags &= ~PSL_C;
} else {
mcp->mc_rax = tp->tf_rax;
mcp->mc_rdx = tp->tf_rdx;
}
mcp->mc_rip = tp->tf_rip;
mcp->mc_cs = tp->tf_cs;
mcp->mc_rsp = tp->tf_rsp;
mcp->mc_ss = tp->tf_ss;
mcp->mc_ds = tp->tf_ds;
mcp->mc_es = tp->tf_es;
mcp->mc_fs = tp->tf_fs;
mcp->mc_gs = tp->tf_gs;
mcp->mc_flags = tp->tf_flags;
mcp->mc_len = sizeof(*mcp);
get_fpcontext(td, mcp);
mcp->mc_fsbase = pcb->pcb_fsbase;
mcp->mc_gsbase = pcb->pcb_gsbase;
bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
return (0);
}
/*
* Set machine context.
*
* However, we don't set any but the user modifiable flags, and we won't
* touch the cs selector.
*/
int
set_mcontext(struct thread *td, const mcontext_t *mcp)
{
struct pcb *pcb;
struct trapframe *tp;
long rflags;
int ret;
pcb = td->td_pcb;
tp = td->td_frame;
if (mcp->mc_len != sizeof(*mcp) ||
(mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
return (EINVAL);
rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
(tp->tf_rflags & ~PSL_USERCHANGE);
ret = set_fpcontext(td, mcp);
if (ret != 0)
return (ret);
tp->tf_r15 = mcp->mc_r15;
tp->tf_r14 = mcp->mc_r14;
tp->tf_r13 = mcp->mc_r13;
tp->tf_r12 = mcp->mc_r12;
tp->tf_r11 = mcp->mc_r11;
tp->tf_r10 = mcp->mc_r10;
tp->tf_r9 = mcp->mc_r9;
tp->tf_r8 = mcp->mc_r8;
tp->tf_rdi = mcp->mc_rdi;
tp->tf_rsi = mcp->mc_rsi;
tp->tf_rbp = mcp->mc_rbp;
tp->tf_rbx = mcp->mc_rbx;
tp->tf_rdx = mcp->mc_rdx;
tp->tf_rcx = mcp->mc_rcx;
tp->tf_rax = mcp->mc_rax;
tp->tf_rip = mcp->mc_rip;
tp->tf_rflags = rflags;
tp->tf_rsp = mcp->mc_rsp;
tp->tf_ss = mcp->mc_ss;
tp->tf_flags = mcp->mc_flags;
if (tp->tf_flags & TF_HASSEGS) {
tp->tf_ds = mcp->mc_ds;
tp->tf_es = mcp->mc_es;
tp->tf_fs = mcp->mc_fs;
tp->tf_gs = mcp->mc_gs;
}
if (mcp->mc_flags & _MC_HASBASES) {
pcb->pcb_fsbase = mcp->mc_fsbase;
pcb->pcb_gsbase = mcp->mc_gsbase;
}
set_pcb_flags(pcb, PCB_FULL_IRET);
return (0);
}
static void
get_fpcontext(struct thread *td, mcontext_t *mcp)
{
mcp->mc_ownedfp = fpugetregs(td);
bcopy(&td->td_pcb->pcb_user_save, &mcp->mc_fpstate,
sizeof(mcp->mc_fpstate));
mcp->mc_fpformat = fpuformat();
}
static int
set_fpcontext(struct thread *td, const mcontext_t *mcp)
{
struct savefpu *fpstate;
if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
return (0);
else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
return (EINVAL);
else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE)
/* We don't care what state is left in the FPU or PCB. */
fpstate_drop(td);
else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
fpstate = (struct savefpu *)&mcp->mc_fpstate;
fpstate->sv_env.en_mxcsr &= cpu_mxcsr_mask;
fpusetregs(td, fpstate);
} else
return (EINVAL);
return (0);
}
void
fpstate_drop(struct thread *td)
{
KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
critical_enter();
if (PCPU_GET(fpcurthread) == td)
fpudrop();
/*
* XXX force a full drop of the fpu. The above only drops it if we
* owned it.
*
* XXX I don't much like fpugetuserregs()'s semantics of doing a full
* drop. Dropping only to the pcb matches fnsave's behaviour.
* We only need to drop to !PCB_INITDONE in sendsig(). But
* sendsig() is the only caller of fpugetuserregs()... perhaps we just
* have too many layers.
*/
clear_pcb_flags(curthread->td_pcb,
PCB_FPUINITDONE | PCB_USERFPUINITDONE);
critical_exit();
}
int
fill_dbregs(struct thread *td, struct dbreg *dbregs)
{
struct pcb *pcb;
if (td == NULL) {
dbregs->dr[0] = rdr0();
dbregs->dr[1] = rdr1();
dbregs->dr[2] = rdr2();
dbregs->dr[3] = rdr3();
dbregs->dr[6] = rdr6();
dbregs->dr[7] = rdr7();
} else {
pcb = td->td_pcb;
dbregs->dr[0] = pcb->pcb_dr0;
dbregs->dr[1] = pcb->pcb_dr1;
dbregs->dr[2] = pcb->pcb_dr2;
dbregs->dr[3] = pcb->pcb_dr3;
dbregs->dr[6] = pcb->pcb_dr6;
dbregs->dr[7] = pcb->pcb_dr7;
}
dbregs->dr[4] = 0;
dbregs->dr[5] = 0;
dbregs->dr[8] = 0;
dbregs->dr[9] = 0;
dbregs->dr[10] = 0;
dbregs->dr[11] = 0;
dbregs->dr[12] = 0;
dbregs->dr[13] = 0;
dbregs->dr[14] = 0;
dbregs->dr[15] = 0;
return (0);
}
int
set_dbregs(struct thread *td, struct dbreg *dbregs)
{
struct pcb *pcb;
int i;
if (td == NULL) {
load_dr0(dbregs->dr[0]);
load_dr1(dbregs->dr[1]);
load_dr2(dbregs->dr[2]);
load_dr3(dbregs->dr[3]);
load_dr6(dbregs->dr[6]);
load_dr7(dbregs->dr[7]);
} else {
/*
* Don't let an illegal value for dr7 get set. Specifically,
* check for undefined settings. Setting these bit patterns
* results in undefined behaviour and can lead to an unexpected
* TRCTRAP or a general protection fault right here.
* Upper bits of dr6 and dr7 must not be set.
*/
for (i = 0; i < 4; i++) {
if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
return (EINVAL);
if (td->td_frame->tf_cs == _ucode32sel &&
DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
return (EINVAL);
}
if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
(dbregs->dr[7] & 0xffffffff00000000ul) != 0)
return (EINVAL);
pcb = td->td_pcb;
/*
* Don't let a process set a breakpoint that is not within the
* process's address space. If a process could do this, it
* could halt the system by setting a breakpoint in the kernel
* (if ddb was enabled). Thus, we need to check to make sure
* that no breakpoints are being enabled for addresses outside
* process's address space.
*
* XXX - what about when the watched area of the user's
* address space is written into from within the kernel
* ... wouldn't that still cause a breakpoint to be generated
* from within kernel mode?
*/
if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
/* dr0 is enabled */
if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
return (EINVAL);
}
if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
/* dr1 is enabled */
if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
return (EINVAL);
}
if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
/* dr2 is enabled */
if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
return (EINVAL);
}
if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
/* dr3 is enabled */
if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
return (EINVAL);
}
pcb->pcb_dr0 = dbregs->dr[0];
pcb->pcb_dr1 = dbregs->dr[1];
pcb->pcb_dr2 = dbregs->dr[2];
pcb->pcb_dr3 = dbregs->dr[3];
pcb->pcb_dr6 = dbregs->dr[6];
pcb->pcb_dr7 = dbregs->dr[7];
set_pcb_flags(pcb, PCB_DBREGS);
}
return (0);
}
void
reset_dbregs(void)
{
load_dr7(0); /* Turn off the control bits first */
load_dr0(0);
load_dr1(0);
load_dr2(0);
load_dr3(0);
load_dr6(0);
}
/*
* Return > 0 if a hardware breakpoint has been hit, and the
* breakpoint was in user space. Return 0, otherwise.
*/
int
user_dbreg_trap(void)
{
u_int64_t dr7, dr6; /* debug registers dr6 and dr7 */
u_int64_t bp; /* breakpoint bits extracted from dr6 */
int nbp; /* number of breakpoints that triggered */
caddr_t addr[4]; /* breakpoint addresses */
int i;
dr7 = rdr7();
if ((dr7 & 0x000000ff) == 0) {
/*
* all L and G enable bits in the dr7 register are zero,
* thus the trap couldn't have been caused by the
* hardware debug registers.
*/
return 0;
}
nbp = 0;
dr6 = rdr6();
bp = dr6 & 0x0000000f;
if (!bp) {
/*
* None of the breakpoint bits are set, meaning this
* trap was not caused by any of the debug registers.
*/
return 0;
}
/*
* At least one of the breakpoints was hit; check to see
* which ones, and whether any of them are user space addresses.
*/
if (bp & 0x01) {
addr[nbp++] = (caddr_t)rdr0();
}
if (bp & 0x02) {
addr[nbp++] = (caddr_t)rdr1();
}
if (bp & 0x04) {
addr[nbp++] = (caddr_t)rdr2();
}
if (bp & 0x08) {
addr[nbp++] = (caddr_t)rdr3();
}
for (i = 0; i < nbp; i++) {
if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
/*
* addr[i] is in user space
*/
return nbp;
}
}
/*
* None of the breakpoints are in user space.
*/
return 0;
}
#ifdef KDB
/*
* Provide inb() and outb() as functions. They are normally only available as
* inline functions and thus cannot be called from the debugger.
*/
/* silence compiler warnings */
u_char inb_(u_short);
void outb_(u_short, u_char);
u_char
inb_(u_short port)
{
return inb(port);
}
void
outb_(u_short port, u_char data)
{
outb(port, data);
}
#endif /* KDB */
Index: head/sys/amd64/linux32/linux32_machdep.c
===================================================================
--- head/sys/amd64/linux32/linux32_machdep.c (revision 225616)
+++ head/sys/amd64/linux32/linux32_machdep.c (revision 225617)
@@ -1,1086 +1,1086 @@
/*-
* Copyright (c) 2004 Tim J. Robbins
* Copyright (c) 2002 Doug Rabson
* Copyright (c) 2000 Marcel Moolenaar
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/capability.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/clock.h>
#include <sys/imgact.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>
#include <sys/unistd.h>
#include <sys/wait.h>
#include <machine/frame.h>
#include <machine/pcb.h>
#include <machine/psl.h>
#include <machine/segments.h>
#include <machine/specialreg.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <compat/freebsd32/freebsd32_util.h>
#include <amd64/linux32/linux.h>
#include <amd64/linux32/linux32_proto.h>
#include <compat/linux/linux_ipc.h>
#include <compat/linux/linux_misc.h>
#include <compat/linux/linux_signal.h>
#include <compat/linux/linux_util.h>
#include <compat/linux/linux_emul.h>
struct l_old_select_argv {
l_int nfds;
l_uintptr_t readfds;
l_uintptr_t writefds;
l_uintptr_t exceptfds;
l_uintptr_t timeout;
} __packed;
int
linux_to_bsd_sigaltstack(int lsa)
{
int bsa = 0;
if (lsa & LINUX_SS_DISABLE)
bsa |= SS_DISABLE;
if (lsa & LINUX_SS_ONSTACK)
bsa |= SS_ONSTACK;
return (bsa);
}
static int linux_mmap_common(struct thread *td, l_uintptr_t addr,
l_size_t len, l_int prot, l_int flags, l_int fd,
l_loff_t pos);
int
bsd_to_linux_sigaltstack(int bsa)
{
int lsa = 0;
if (bsa & SS_DISABLE)
lsa |= LINUX_SS_DISABLE;
if (bsa & SS_ONSTACK)
lsa |= LINUX_SS_ONSTACK;
return (lsa);
}
static void
bsd_to_linux_rusage(struct rusage *ru, struct l_rusage *lru)
{
lru->ru_utime.tv_sec = ru->ru_utime.tv_sec;
lru->ru_utime.tv_usec = ru->ru_utime.tv_usec;
lru->ru_stime.tv_sec = ru->ru_stime.tv_sec;
lru->ru_stime.tv_usec = ru->ru_stime.tv_usec;
lru->ru_maxrss = ru->ru_maxrss;
lru->ru_ixrss = ru->ru_ixrss;
lru->ru_idrss = ru->ru_idrss;
lru->ru_isrss = ru->ru_isrss;
lru->ru_minflt = ru->ru_minflt;
lru->ru_majflt = ru->ru_majflt;
lru->ru_nswap = ru->ru_nswap;
lru->ru_inblock = ru->ru_inblock;
lru->ru_oublock = ru->ru_oublock;
lru->ru_msgsnd = ru->ru_msgsnd;
lru->ru_msgrcv = ru->ru_msgrcv;
lru->ru_nsignals = ru->ru_nsignals;
lru->ru_nvcsw = ru->ru_nvcsw;
lru->ru_nivcsw = ru->ru_nivcsw;
}
int
linux_execve(struct thread *td, struct linux_execve_args *args)
{
struct image_args eargs;
char *path;
int error;
LCONVPATHEXIST(td, args->path, &path);
#ifdef DEBUG
if (ldebug(execve))
printf(ARGS(execve, "%s"), path);
#endif
error = freebsd32_exec_copyin_args(&eargs, path, UIO_SYSSPACE,
args->argp, args->envp);
free(path, M_TEMP);
if (error == 0)
error = kern_execve(td, &eargs, NULL);
if (error == 0)
/* A Linux process can execute a FreeBSD one; do not attempt
* to create emuldata for such a process using
* linux_proc_init(), as this leads to a KASSERT panic
* because such a process has p->p_emuldata == NULL.
*/
if (SV_PROC_ABI(td->td_proc) == SV_ABI_LINUX)
error = linux_proc_init(td, 0, 0);
return (error);
}
CTASSERT(sizeof(struct l_iovec32) == 8);
static int
linux32_copyinuio(struct l_iovec32 *iovp, l_ulong iovcnt, struct uio **uiop)
{
struct l_iovec32 iov32;
struct iovec *iov;
struct uio *uio;
uint32_t iovlen;
int error, i;
*uiop = NULL;
if (iovcnt > UIO_MAXIOV)
return (EINVAL);
iovlen = iovcnt * sizeof(struct iovec);
uio = malloc(iovlen + sizeof(*uio), M_IOV, M_WAITOK);
iov = (struct iovec *)(uio + 1);
for (i = 0; i < iovcnt; i++) {
error = copyin(&iovp[i], &iov32, sizeof(struct l_iovec32));
if (error) {
free(uio, M_IOV);
return (error);
}
iov[i].iov_base = PTRIN(iov32.iov_base);
iov[i].iov_len = iov32.iov_len;
}
uio->uio_iov = iov;
uio->uio_iovcnt = iovcnt;
uio->uio_segflg = UIO_USERSPACE;
uio->uio_offset = -1;
uio->uio_resid = 0;
for (i = 0; i < iovcnt; i++) {
if (iov->iov_len > INT_MAX - uio->uio_resid) {
free(uio, M_IOV);
return (EINVAL);
}
uio->uio_resid += iov->iov_len;
iov++;
}
*uiop = uio;
return (0);
}
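/*
* Editor's note (worked example, not part of this change): the INT_MAX
* check above bounds the running total, not just each element. For
* example, two iovecs of 0x7fffffff bytes each individually fit in an
* int, but the second one fails
*     iov->iov_len > INT_MAX - uio->uio_resid
* because INT_MAX - 0x7fffffff == 0, so the request is rejected with
* EINVAL rather than letting the total transfer size exceed INT_MAX.
*/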
int
linux32_copyiniov(struct l_iovec32 *iovp32, l_ulong iovcnt, struct iovec **iovp,
int error)
{
struct l_iovec32 iov32;
struct iovec *iov;
uint32_t iovlen;
int i;
*iovp = NULL;
if (iovcnt > UIO_MAXIOV)
return (error);
iovlen = iovcnt * sizeof(struct iovec);
iov = malloc(iovlen, M_IOV, M_WAITOK);
for (i = 0; i < iovcnt; i++) {
error = copyin(&iovp32[i], &iov32, sizeof(struct l_iovec32));
if (error) {
free(iov, M_IOV);
return (error);
}
iov[i].iov_base = PTRIN(iov32.iov_base);
iov[i].iov_len = iov32.iov_len;
}
*iovp = iov;
return(0);
}
int
linux_readv(struct thread *td, struct linux_readv_args *uap)
{
struct uio *auio;
int error;
error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio);
if (error)
return (error);
error = kern_readv(td, uap->fd, auio);
free(auio, M_IOV);
return (error);
}
int
linux_writev(struct thread *td, struct linux_writev_args *uap)
{
struct uio *auio;
int error;
error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio);
if (error)
return (error);
error = kern_writev(td, uap->fd, auio);
free(auio, M_IOV);
return (error);
}
struct l_ipc_kludge {
l_uintptr_t msgp;
l_long msgtyp;
} __packed;
int
linux_ipc(struct thread *td, struct linux_ipc_args *args)
{
switch (args->what & 0xFFFF) {
case LINUX_SEMOP: {
struct linux_semop_args a;
a.semid = args->arg1;
a.tsops = args->ptr;
a.nsops = args->arg2;
return (linux_semop(td, &a));
}
case LINUX_SEMGET: {
struct linux_semget_args a;
a.key = args->arg1;
a.nsems = args->arg2;
a.semflg = args->arg3;
return (linux_semget(td, &a));
}
case LINUX_SEMCTL: {
struct linux_semctl_args a;
int error;
a.semid = args->arg1;
a.semnum = args->arg2;
a.cmd = args->arg3;
error = copyin(args->ptr, &a.arg, sizeof(a.arg));
if (error)
return (error);
return (linux_semctl(td, &a));
}
case LINUX_MSGSND: {
struct linux_msgsnd_args a;
a.msqid = args->arg1;
a.msgp = args->ptr;
a.msgsz = args->arg2;
a.msgflg = args->arg3;
return (linux_msgsnd(td, &a));
}
case LINUX_MSGRCV: {
struct linux_msgrcv_args a;
a.msqid = args->arg1;
a.msgsz = args->arg2;
a.msgflg = args->arg3;
if ((args->what >> 16) == 0) {
struct l_ipc_kludge tmp;
int error;
if (args->ptr == 0)
return (EINVAL);
error = copyin(args->ptr, &tmp, sizeof(tmp));
if (error)
return (error);
a.msgp = PTRIN(tmp.msgp);
a.msgtyp = tmp.msgtyp;
} else {
a.msgp = args->ptr;
a.msgtyp = args->arg5;
}
return (linux_msgrcv(td, &a));
}
case LINUX_MSGGET: {
struct linux_msgget_args a;
a.key = args->arg1;
a.msgflg = args->arg2;
return (linux_msgget(td, &a));
}
case LINUX_MSGCTL: {
struct linux_msgctl_args a;
a.msqid = args->arg1;
a.cmd = args->arg2;
a.buf = args->ptr;
return (linux_msgctl(td, &a));
}
case LINUX_SHMAT: {
struct linux_shmat_args a;
a.shmid = args->arg1;
a.shmaddr = args->ptr;
a.shmflg = args->arg2;
a.raddr = PTRIN((l_uint)args->arg3);
return (linux_shmat(td, &a));
}
case LINUX_SHMDT: {
struct linux_shmdt_args a;
a.shmaddr = args->ptr;
return (linux_shmdt(td, &a));
}
case LINUX_SHMGET: {
struct linux_shmget_args a;
a.key = args->arg1;
a.size = args->arg2;
a.shmflg = args->arg3;
return (linux_shmget(td, &a));
}
case LINUX_SHMCTL: {
struct linux_shmctl_args a;
a.shmid = args->arg1;
a.cmd = args->arg2;
a.buf = args->ptr;
return (linux_shmctl(td, &a));
}
default:
break;
}
return (EINVAL);
}
int
linux_old_select(struct thread *td, struct linux_old_select_args *args)
{
struct l_old_select_argv linux_args;
struct linux_select_args newsel;
int error;
#ifdef DEBUG
if (ldebug(old_select))
printf(ARGS(old_select, "%p"), args->ptr);
#endif
error = copyin(args->ptr, &linux_args, sizeof(linux_args));
if (error)
return (error);
newsel.nfds = linux_args.nfds;
newsel.readfds = PTRIN(linux_args.readfds);
newsel.writefds = PTRIN(linux_args.writefds);
newsel.exceptfds = PTRIN(linux_args.exceptfds);
newsel.timeout = PTRIN(linux_args.timeout);
return (linux_select(td, &newsel));
}
int
linux_set_cloned_tls(struct thread *td, void *desc)
{
struct user_segment_descriptor sd;
struct l_user_desc info;
struct pcb *pcb;
int error;
int a[2];
error = copyin(desc, &info, sizeof(struct l_user_desc));
if (error) {
printf(LMSG("copyin failed!"));
} else {
/* We might copy out the entry_number as GUGS32_SEL. */
info.entry_number = GUGS32_SEL;
error = copyout(&info, desc, sizeof(struct l_user_desc));
if (error)
printf(LMSG("copyout failed!"));
a[0] = LINUX_LDT_entry_a(&info);
a[1] = LINUX_LDT_entry_b(&info);
memcpy(&sd, &a, sizeof(a));
#ifdef DEBUG
if (ldebug(clone))
printf("Segment created in clone with "
"CLONE_SETTLS: lobase: %x, hibase: %x, "
"lolimit: %x, hilimit: %x, type: %i, "
"dpl: %i, p: %i, xx: %i, long: %i, "
"def32: %i, gran: %i\n", sd.sd_lobase,
sd.sd_hibase, sd.sd_lolimit, sd.sd_hilimit,
sd.sd_type, sd.sd_dpl, sd.sd_p, sd.sd_xx,
sd.sd_long, sd.sd_def32, sd.sd_gran);
#endif
pcb = td->td_pcb;
pcb->pcb_gsbase = (register_t)info.base_addr;
/* XXXKIB pcb->pcb_gs32sd = sd; */
td->td_frame->tf_gs = GSEL(GUGS32_SEL, SEL_UPL);
set_pcb_flags(pcb, PCB_GS32BIT | PCB_32BIT);
}
return (error);
}
int
linux_set_upcall_kse(struct thread *td, register_t stack)
{
td->td_frame->tf_rsp = stack;
return (0);
}
#define STACK_SIZE (2 * 1024 * 1024)
#define GUARD_SIZE (4 * PAGE_SIZE)
int
linux_mmap2(struct thread *td, struct linux_mmap2_args *args)
{
#ifdef DEBUG
if (ldebug(mmap2))
printf(ARGS(mmap2, "0x%08x, %d, %d, 0x%08x, %d, %d"),
args->addr, args->len, args->prot,
args->flags, args->fd, args->pgoff);
#endif
return (linux_mmap_common(td, PTROUT(args->addr), args->len, args->prot,
args->flags, args->fd, (uint64_t)(uint32_t)args->pgoff *
PAGE_SIZE));
}
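/*
* Editor's note (worked example, not part of this change): mmap2()
* takes the file offset in pages, so the double cast above widens the
* 32-bit pgoff before multiplying. With 4 KB pages, pgoff = 0x00100000
* corresponds to a byte offset of 0x00100000 * 4096 = 0x100000000
* (4 GiB), which a 32-bit multiplication would truncate to 0.
*/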
int
linux_mmap(struct thread *td, struct linux_mmap_args *args)
{
int error;
struct l_mmap_argv linux_args;
error = copyin(args->ptr, &linux_args, sizeof(linux_args));
if (error)
return (error);
#ifdef DEBUG
if (ldebug(mmap))
printf(ARGS(mmap, "0x%08x, %d, %d, 0x%08x, %d, %d"),
linux_args.addr, linux_args.len, linux_args.prot,
linux_args.flags, linux_args.fd, linux_args.pgoff);
#endif
return (linux_mmap_common(td, linux_args.addr, linux_args.len,
linux_args.prot, linux_args.flags, linux_args.fd,
(uint32_t)linux_args.pgoff));
}
static int
linux_mmap_common(struct thread *td, l_uintptr_t addr, l_size_t len, l_int prot,
l_int flags, l_int fd, l_loff_t pos)
{
struct proc *p = td->td_proc;
struct mmap_args /* {
caddr_t addr;
size_t len;
int prot;
int flags;
int fd;
long pad;
off_t pos;
} */ bsd_args;
int error;
struct file *fp;
error = 0;
bsd_args.flags = 0;
fp = NULL;
/*
* Linux mmap(2):
* You must specify exactly one of MAP_SHARED and MAP_PRIVATE
*/
if (!((flags & LINUX_MAP_SHARED) ^ (flags & LINUX_MAP_PRIVATE)))
return (EINVAL);
if (flags & LINUX_MAP_SHARED)
bsd_args.flags |= MAP_SHARED;
if (flags & LINUX_MAP_PRIVATE)
bsd_args.flags |= MAP_PRIVATE;
if (flags & LINUX_MAP_FIXED)
bsd_args.flags |= MAP_FIXED;
if (flags & LINUX_MAP_ANON) {
/* Enforce pos to be on page boundary, then ignore. */
if ((pos & PAGE_MASK) != 0)
return (EINVAL);
pos = 0;
bsd_args.flags |= MAP_ANON;
} else
bsd_args.flags |= MAP_NOSYNC;
if (flags & LINUX_MAP_GROWSDOWN)
bsd_args.flags |= MAP_STACK;
/*
* PROT_READ, PROT_WRITE, or PROT_EXEC implies PROT_READ and PROT_EXEC
* on Linux/i386. We do this to ensure maximum compatibility.
* Linux/ia64 does the same in i386 emulation mode.
*/
bsd_args.prot = prot;
if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC))
bsd_args.prot |= PROT_READ | PROT_EXEC;
/* Linux does not check file descriptor when MAP_ANONYMOUS is set. */
bsd_args.fd = (bsd_args.flags & MAP_ANON) ? -1 : fd;
if (bsd_args.fd != -1) {
/*
* Linux follows Solaris mmap(2) description:
* The file descriptor fildes is opened with
* read permission, regardless of the
* protection options specified.
*/
if ((error = fget(td, bsd_args.fd, CAP_MMAP, &fp)) != 0)
return (error);
if (fp->f_type != DTYPE_VNODE) {
fdrop(fp, td);
return (EINVAL);
}
/* Linux mmap() just fails for O_WRONLY files */
if (!(fp->f_flag & FREAD)) {
fdrop(fp, td);
return (EACCES);
}
fdrop(fp, td);
}
if (flags & LINUX_MAP_GROWSDOWN) {
/*
* The Linux MAP_GROWSDOWN option does not limit auto
* growth of the region. Linux mmap with this option
* takes as addr the initial BOS, and as len, the initial
* region size. It can then grow down from addr without
* limit. However, Linux threads have an implicit internal
* limit to stack size of STACK_SIZE. It's just not
* enforced explicitly in Linux. But here we impose
* a limit of (STACK_SIZE - GUARD_SIZE) on the stack
* region, since we can do this with our mmap.
*
* Our mmap with MAP_STACK takes addr as the maximum
* downsize limit on BOS, and as len the max size of
* the region. It then maps the top SGROWSIZ bytes,
* and auto grows the region down, up to the limit
* in addr.
*
* If we don't use the MAP_STACK option, the effect
* of this code is to allocate a stack region of a
* fixed size of (STACK_SIZE - GUARD_SIZE).
*/
if ((caddr_t)PTRIN(addr) + len > p->p_vmspace->vm_maxsaddr) {
/*
* Some Linux apps will attempt to mmap
* thread stacks near the top of their
* address space. If their TOS is greater
* than vm_maxsaddr, vm_map_growstack()
* will confuse the thread stack with the
* process stack and deliver a SEGV if they
* attempt to grow the thread stack past their
* current stacksize rlimit. To avoid this,
* adjust vm_maxsaddr upwards to reflect
* the current stacksize rlimit rather
* than the maximum possible stacksize.
* It would be better to adjust the
* mmap'ed region, but some apps do not check
* mmap's return value.
*/
PROC_LOCK(p);
p->p_vmspace->vm_maxsaddr = (char *)LINUX32_USRSTACK -
lim_cur(p, RLIMIT_STACK);
PROC_UNLOCK(p);
}
/*
* This gives us our maximum stack size and a new BOS.
* If we're using VM_STACK, then mmap will just map
* the top SGROWSIZ bytes, and let the stack grow down
* to the limit at BOS. If we're not using VM_STACK
* we map the full stack, since we don't have a way
* to autogrow it.
*/
if (len > STACK_SIZE - GUARD_SIZE) {
bsd_args.addr = (caddr_t)PTRIN(addr);
bsd_args.len = len;
} else {
bsd_args.addr = (caddr_t)PTRIN(addr) -
(STACK_SIZE - GUARD_SIZE - len);
bsd_args.len = STACK_SIZE - GUARD_SIZE;
}
} else {
bsd_args.addr = (caddr_t)PTRIN(addr);
bsd_args.len = len;
}
bsd_args.pos = pos;
#ifdef DEBUG
if (ldebug(mmap))
printf("-> %s(%p, %d, %d, 0x%08x, %d, 0x%x)\n",
__func__,
(void *)bsd_args.addr, (int)bsd_args.len, bsd_args.prot,
bsd_args.flags, bsd_args.fd, (int)bsd_args.pos);
#endif
- error = mmap(td, &bsd_args);
+ error = sys_mmap(td, &bsd_args);
#ifdef DEBUG
if (ldebug(mmap))
printf("-> %s() return: 0x%x (0x%08x)\n",
__func__, error, (u_int)td->td_retval[0]);
#endif
return (error);
}
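/*
* Editor's note (worked example, not part of this change): with
* STACK_SIZE = 2 MB and GUARD_SIZE = 4 pages (16 KB with 4 KB pages),
* a MAP_GROWSDOWN request of addr = A and len = 128 KB takes the
* "len <= STACK_SIZE - GUARD_SIZE" branch above and becomes
*     bsd_args.addr = A - (2 MB - 16 KB - 128 KB)
*     bsd_args.len  = 2 MB - 16 KB
* so the top 128 KB of the MAP_STACK region covers exactly the range
* the application asked for, while the stack may still auto-grow down
* toward the new, lower base address.
*/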
int
linux_mprotect(struct thread *td, struct linux_mprotect_args *uap)
{
struct mprotect_args bsd_args;
bsd_args.addr = uap->addr;
bsd_args.len = uap->len;
bsd_args.prot = uap->prot;
if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC))
bsd_args.prot |= PROT_READ | PROT_EXEC;
- return (mprotect(td, &bsd_args));
+ return (sys_mprotect(td, &bsd_args));
}
int
linux_iopl(struct thread *td, struct linux_iopl_args *args)
{
int error;
if (args->level < 0 || args->level > 3)
return (EINVAL);
if ((error = priv_check(td, PRIV_IO)) != 0)
return (error);
if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
return (error);
td->td_frame->tf_rflags = (td->td_frame->tf_rflags & ~PSL_IOPL) |
(args->level * (PSL_IOPL / 3));
return (0);
}
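/*
* Editor's note (worked example, not part of this change): PSL_IOPL is
* the two-bit I/O privilege level field at bits 12-13 of the flags
* register (mask 0x3000), so PSL_IOPL / 3 == 0x1000 and the expression
* above stores args->level directly into that field; e.g. level 3
* yields 0x3000, allowing I/O instructions at user privilege.
*/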
int
linux_pipe(struct thread *td, struct linux_pipe_args *args)
{
int error;
int fildes[2];
#ifdef DEBUG
if (ldebug(pipe))
printf(ARGS(pipe, "*"));
#endif
error = kern_pipe(td, fildes);
if (error)
return (error);
/* XXX: Close descriptors on error. */
return (copyout(fildes, args->pipefds, sizeof fildes));
}
int
linux_sigaction(struct thread *td, struct linux_sigaction_args *args)
{
l_osigaction_t osa;
l_sigaction_t act, oact;
int error;
#ifdef DEBUG
if (ldebug(sigaction))
printf(ARGS(sigaction, "%d, %p, %p"),
args->sig, (void *)args->nsa, (void *)args->osa);
#endif
if (args->nsa != NULL) {
error = copyin(args->nsa, &osa, sizeof(l_osigaction_t));
if (error)
return (error);
act.lsa_handler = osa.lsa_handler;
act.lsa_flags = osa.lsa_flags;
act.lsa_restorer = osa.lsa_restorer;
LINUX_SIGEMPTYSET(act.lsa_mask);
act.lsa_mask.__bits[0] = osa.lsa_mask;
}
error = linux_do_sigaction(td, args->sig, args->nsa ? &act : NULL,
args->osa ? &oact : NULL);
if (args->osa != NULL && !error) {
osa.lsa_handler = oact.lsa_handler;
osa.lsa_flags = oact.lsa_flags;
osa.lsa_restorer = oact.lsa_restorer;
osa.lsa_mask = oact.lsa_mask.__bits[0];
error = copyout(&osa, args->osa, sizeof(l_osigaction_t));
}
return (error);
}
/*
* Linux has two extra args, restart and oldmask. We don't use these,
* but it seems that "restart" is actually a context pointer that
* enables the signal to happen with a different register set.
*/
int
linux_sigsuspend(struct thread *td, struct linux_sigsuspend_args *args)
{
sigset_t sigmask;
l_sigset_t mask;
#ifdef DEBUG
if (ldebug(sigsuspend))
printf(ARGS(sigsuspend, "%08lx"), (unsigned long)args->mask);
#endif
LINUX_SIGEMPTYSET(mask);
mask.__bits[0] = args->mask;
linux_to_bsd_sigset(&mask, &sigmask);
return (kern_sigsuspend(td, sigmask));
}
int
linux_rt_sigsuspend(struct thread *td, struct linux_rt_sigsuspend_args *uap)
{
l_sigset_t lmask;
sigset_t sigmask;
int error;
#ifdef DEBUG
if (ldebug(rt_sigsuspend))
printf(ARGS(rt_sigsuspend, "%p, %d"),
(void *)uap->newset, uap->sigsetsize);
#endif
if (uap->sigsetsize != sizeof(l_sigset_t))
return (EINVAL);
error = copyin(uap->newset, &lmask, sizeof(l_sigset_t));
if (error)
return (error);
linux_to_bsd_sigset(&lmask, &sigmask);
return (kern_sigsuspend(td, sigmask));
}
int
linux_pause(struct thread *td, struct linux_pause_args *args)
{
struct proc *p = td->td_proc;
sigset_t sigmask;
#ifdef DEBUG
if (ldebug(pause))
printf(ARGS(pause, ""));
#endif
PROC_LOCK(p);
sigmask = td->td_sigmask;
PROC_UNLOCK(p);
return (kern_sigsuspend(td, sigmask));
}
int
linux_sigaltstack(struct thread *td, struct linux_sigaltstack_args *uap)
{
stack_t ss, oss;
l_stack_t lss;
int error;
#ifdef DEBUG
if (ldebug(sigaltstack))
printf(ARGS(sigaltstack, "%p, %p"), uap->uss, uap->uoss);
#endif
if (uap->uss != NULL) {
error = copyin(uap->uss, &lss, sizeof(l_stack_t));
if (error)
return (error);
ss.ss_sp = PTRIN(lss.ss_sp);
ss.ss_size = lss.ss_size;
ss.ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags);
}
error = kern_sigaltstack(td, (uap->uss != NULL) ? &ss : NULL,
(uap->uoss != NULL) ? &oss : NULL);
if (!error && uap->uoss != NULL) {
lss.ss_sp = PTROUT(oss.ss_sp);
lss.ss_size = oss.ss_size;
lss.ss_flags = bsd_to_linux_sigaltstack(oss.ss_flags);
error = copyout(&lss, uap->uoss, sizeof(l_stack_t));
}
return (error);
}
int
linux_ftruncate64(struct thread *td, struct linux_ftruncate64_args *args)
{
struct ftruncate_args sa;
#ifdef DEBUG
if (ldebug(ftruncate64))
printf(ARGS(ftruncate64, "%u, %jd"), args->fd,
(intmax_t)args->length);
#endif
sa.fd = args->fd;
sa.length = args->length;
- return ftruncate(td, &sa);
+ return sys_ftruncate(td, &sa);
}
int
linux_gettimeofday(struct thread *td, struct linux_gettimeofday_args *uap)
{
struct timeval atv;
l_timeval atv32;
struct timezone rtz;
int error = 0;
if (uap->tp) {
microtime(&atv);
atv32.tv_sec = atv.tv_sec;
atv32.tv_usec = atv.tv_usec;
error = copyout(&atv32, uap->tp, sizeof(atv32));
}
if (error == 0 && uap->tzp != NULL) {
rtz.tz_minuteswest = tz_minuteswest;
rtz.tz_dsttime = tz_dsttime;
error = copyout(&rtz, uap->tzp, sizeof(rtz));
}
return (error);
}
int
linux_settimeofday(struct thread *td, struct linux_settimeofday_args *uap)
{
l_timeval atv32;
struct timeval atv, *tvp;
struct timezone atz, *tzp;
int error;
if (uap->tp) {
error = copyin(uap->tp, &atv32, sizeof(atv32));
if (error)
return (error);
atv.tv_sec = atv32.tv_sec;
atv.tv_usec = atv32.tv_usec;
tvp = &atv;
} else
tvp = NULL;
if (uap->tzp) {
error = copyin(uap->tzp, &atz, sizeof(atz));
if (error)
return (error);
tzp = &atz;
} else
tzp = NULL;
return (kern_settimeofday(td, tvp, tzp));
}
int
linux_getrusage(struct thread *td, struct linux_getrusage_args *uap)
{
struct l_rusage s32;
struct rusage s;
int error;
error = kern_getrusage(td, uap->who, &s);
if (error != 0)
return (error);
if (uap->rusage != NULL) {
bsd_to_linux_rusage(&s, &s32);
error = copyout(&s32, uap->rusage, sizeof(s32));
}
return (error);
}
int
linux_sched_rr_get_interval(struct thread *td,
struct linux_sched_rr_get_interval_args *uap)
{
struct timespec ts;
struct l_timespec ts32;
int error;
error = kern_sched_rr_get_interval(td, uap->pid, &ts);
if (error != 0)
return (error);
ts32.tv_sec = ts.tv_sec;
ts32.tv_nsec = ts.tv_nsec;
return (copyout(&ts32, uap->interval, sizeof(ts32)));
}
int
linux_set_thread_area(struct thread *td,
struct linux_set_thread_area_args *args)
{
struct l_user_desc info;
struct user_segment_descriptor sd;
struct pcb *pcb;
int a[2];
int error;
error = copyin(args->desc, &info, sizeof(struct l_user_desc));
if (error)
return (error);
#ifdef DEBUG
if (ldebug(set_thread_area))
printf(ARGS(set_thread_area, "%i, %x, %x, %i, %i, %i, "
"%i, %i, %i"), info.entry_number, info.base_addr,
info.limit, info.seg_32bit, info.contents,
info.read_exec_only, info.limit_in_pages,
info.seg_not_present, info.useable);
#endif
/*
* Semantics of Linux version: every thread in the system has an array
* of three TLS descriptors: the 1st is GLIBC TLS, the 2nd is WINE, the 3rd unknown.
* This syscall loads one of the selected TLS descriptors with a value
* and also loads GDT descriptors 6, 7 and 8 with the content of
* the per-thread descriptors.
*
* Semantics of FreeBSD version: I think we can ignore that Linux has
* three per-thread descriptors and use just the first one.
* The tls_array[] is used only in [gs]et_thread_area() syscalls and
* for loading the GDT descriptors. We use just one GDT descriptor
* for TLS, so we will load just one.
*
* XXX: This doesn't work when a user space process tries to use more
* than one TLS segment. Comment in the Linux source says wine might
* do this.
*/
/*
* GLIBC reads the current %gs and calls set_thread_area() with it.
* We should let GUDATA_SEL and GUGS32_SEL proceed as well because
* we use these segments.
*/
switch (info.entry_number) {
case GUGS32_SEL:
case GUDATA_SEL:
case 6:
case -1:
info.entry_number = GUGS32_SEL;
break;
default:
return (EINVAL);
}
/*
* We have to copy out the GDT entry we use.
*
* XXX: What if a user space program does not check the return value
* and tries to use 6, 7 or 8?
*/
error = copyout(&info, args->desc, sizeof(struct l_user_desc));
if (error)
return (error);
if (LINUX_LDT_empty(&info)) {
a[0] = 0;
a[1] = 0;
} else {
a[0] = LINUX_LDT_entry_a(&info);
a[1] = LINUX_LDT_entry_b(&info);
}
memcpy(&sd, &a, sizeof(a));
#ifdef DEBUG
if (ldebug(set_thread_area))
printf("Segment created in set_thread_area: "
"lobase: %x, hibase: %x, lolimit: %x, hilimit: %x, "
"type: %i, dpl: %i, p: %i, xx: %i, long: %i, "
"def32: %i, gran: %i\n",
sd.sd_lobase,
sd.sd_hibase,
sd.sd_lolimit,
sd.sd_hilimit,
sd.sd_type,
sd.sd_dpl,
sd.sd_p,
sd.sd_xx,
sd.sd_long,
sd.sd_def32,
sd.sd_gran);
#endif
pcb = td->td_pcb;
pcb->pcb_gsbase = (register_t)info.base_addr;
set_pcb_flags(pcb, PCB_32BIT | PCB_GS32BIT);
update_gdt_gsbase(td, info.base_addr);
return (0);
}
int
linux_wait4(struct thread *td, struct linux_wait4_args *args)
{
int error, options;
struct rusage ru, *rup;
struct l_rusage lru;
#ifdef DEBUG
if (ldebug(wait4))
printf(ARGS(wait4, "%d, %p, %d, %p"),
args->pid, (void *)args->status, args->options,
(void *)args->rusage);
#endif
options = (args->options & (WNOHANG | WUNTRACED));
/* WLINUXCLONE should be equal to __WCLONE, but we make sure */
if (args->options & __WCLONE)
options |= WLINUXCLONE;
if (args->rusage != NULL)
rup = &ru;
else
rup = NULL;
error = linux_common_wait(td, args->pid, args->status, options, rup);
if (error)
return (error);
if (args->rusage != NULL) {
bsd_to_linux_rusage(rup, &lru);
error = copyout(&lru, args->rusage, sizeof(lru));
}
return (error);
}
Index: head/sys/arm/arm/machdep.c
===================================================================
--- head/sys/arm/arm/machdep.c (revision 225616)
+++ head/sys/arm/arm/machdep.c (revision 225617)
@@ -1,709 +1,709 @@
/* $NetBSD: arm32_machdep.c,v 1.44 2004/03/24 15:34:47 atatat Exp $ */
/*-
* Copyright (c) 2004 Olivier Houchard
* Copyright (c) 1994-1998 Mark Brinicombe.
* Copyright (c) 1994 Brini.
* All rights reserved.
*
* This code is derived from software written for Brini by Mark Brinicombe
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Mark Brinicombe
* for the NetBSD Project.
* 4. The name of the company nor the name of the author may be used to
* endorse or promote products derived from this software without specific
* prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* Machine dependent functions for kernel setup
*
* Created : 17/09/94
* Updated : 18/04/01 updated for new wscons
*/
#include "opt_compat.h"
#include "opt_ddb.h"
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/uio.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <machine/armreg.h>
#include <machine/cpu.h>
#include <machine/machdep.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/pcb.h>
#include <machine/pmap.h>
#include <machine/reg.h>
#include <machine/trap.h>
#include <machine/undefined.h>
#include <machine/vmparam.h>
#include <machine/sysarch.h>
uint32_t cpu_reset_address = 0;
int cold = 1;
vm_offset_t vector_page;
long realmem = 0;
int (*_arm_memcpy)(void *, void *, int, int) = NULL;
int (*_arm_bzero)(void *, int, int) = NULL;
int _min_memcpy_size = 0;
int _min_bzero_size = 0;
extern int *end;
#ifdef DDB
extern vm_offset_t ksym_start, ksym_end;
#endif
void
sendsig(catcher, ksi, mask)
sig_t catcher;
ksiginfo_t *ksi;
sigset_t *mask;
{
struct thread *td;
struct proc *p;
struct trapframe *tf;
struct sigframe *fp, frame;
struct sigacts *psp;
int onstack;
int sig;
int code;
td = curthread;
p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
sig = ksi->ksi_signo;
code = ksi->ksi_code;
psp = p->p_sigacts;
mtx_assert(&psp->ps_mtx, MA_OWNED);
tf = td->td_frame;
onstack = sigonstack(tf->tf_usr_sp);
CTR4(KTR_SIG, "sendsig: td=%p (%s) catcher=%p sig=%d", td, p->p_comm,
catcher, sig);
/* Allocate and validate space for the signal handler context. */
if ((td->td_flags & TDP_ALTSTACK) != 0 && !(onstack) &&
SIGISMEMBER(psp->ps_sigonstack, sig)) {
fp = (struct sigframe *)(td->td_sigstk.ss_sp +
td->td_sigstk.ss_size);
#if defined(COMPAT_43)
td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
} else
fp = (struct sigframe *)td->td_frame->tf_usr_sp;
/* make room on the stack */
fp--;
/* make the stack aligned */
fp = (struct sigframe *)STACKALIGN(fp);
/* Populate the siginfo frame. */
get_mcontext(td, &frame.sf_uc.uc_mcontext, 0);
frame.sf_si = ksi->ksi_info;
frame.sf_uc.uc_sigmask = *mask;
frame.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK )
? ((onstack) ? SS_ONSTACK : 0) : SS_DISABLE;
frame.sf_uc.uc_stack = td->td_sigstk;
mtx_unlock(&psp->ps_mtx);
PROC_UNLOCK(td->td_proc);
/* Copy the sigframe out to the user's stack. */
if (copyout(&frame, fp, sizeof(*fp)) != 0) {
/* Process has trashed its stack. Kill it. */
CTR2(KTR_SIG, "sendsig: sigexit td=%p fp=%p", td, fp);
PROC_LOCK(p);
sigexit(td, SIGILL);
}
/* Translate the signal if appropriate. */
if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
/*
* Build context to run handler in. We invoke the handler
* directly, only returning via the trampoline. Note the
* trampoline version numbers are coordinated with machine-
* dependent code in libc.
*/
tf->tf_r0 = sig;
tf->tf_r1 = (register_t)&fp->sf_si;
tf->tf_r2 = (register_t)&fp->sf_uc;
/* the trampoline uses r5 as the uc address */
tf->tf_r5 = (register_t)&fp->sf_uc;
tf->tf_pc = (register_t)catcher;
tf->tf_usr_sp = (register_t)fp;
tf->tf_usr_lr = (register_t)(PS_STRINGS - *(p->p_sysent->sv_szsigcode));
CTR3(KTR_SIG, "sendsig: return td=%p pc=%#x sp=%#x", td, tf->tf_usr_lr,
tf->tf_usr_sp);
PROC_LOCK(p);
mtx_lock(&psp->ps_mtx);
}
struct kva_md_info kmi;
/*
* arm32_vector_init:
*
* Initialize the vector page, and select whether or not to
* relocate the vectors.
*
* NOTE: We expect the vector page to be mapped at its expected
* destination.
*/
extern unsigned int page0[], page0_data[];
void
arm_vector_init(vm_offset_t va, int which)
{
unsigned int *vectors = (int *) va;
unsigned int *vectors_data = vectors + (page0_data - page0);
int vec;
/*
* Loop through the vectors we're taking over, and copy the
* vector's insn and data word.
*/
for (vec = 0; vec < ARM_NVEC; vec++) {
if ((which & (1 << vec)) == 0) {
/* Don't want to take over this vector. */
continue;
}
vectors[vec] = page0[vec];
vectors_data[vec] = page0_data[vec];
}
/* Now sync the vectors. */
cpu_icache_sync_range(va, (ARM_NVEC * 2) * sizeof(u_int));
vector_page = va;
if (va == ARM_VECTORS_HIGH) {
/*
* Assume the MD caller knows what it's doing here, and
* really does want the vector page relocated.
*
* Note: This has to be done here (and not just in
* cpu_setup()) because the vector page needs to be
* accessible *before* cpu_startup() is called.
* Think ddb(9) ...
*
* NOTE: If the CPU control register is not readable,
* this will totally fail! We'll just assume that
* any system that has high vector support has a
* readable CPU control register, for now. If we
* ever encounter one that does not, we'll have to
* rethink this.
*/
cpu_control(CPU_CONTROL_VECRELOC, CPU_CONTROL_VECRELOC);
}
}
static void
cpu_startup(void *dummy)
{
struct pcb *pcb = thread0.td_pcb;
#ifndef ARM_CACHE_LOCK_ENABLE
vm_page_t m;
#endif
cpu_setup("");
identify_arm_cpu();
printf("real memory = %ju (%ju MB)\n", (uintmax_t)ptoa(physmem),
(uintmax_t)ptoa(physmem) / 1048576);
realmem = physmem;
/*
* Display the RAM layout.
*/
if (bootverbose) {
int indx;
printf("Physical memory chunk(s):\n");
for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
vm_paddr_t size;
size = phys_avail[indx + 1] - phys_avail[indx];
printf("%#08jx - %#08jx, %ju bytes (%ju pages)\n",
(uintmax_t)phys_avail[indx],
(uintmax_t)phys_avail[indx + 1] - 1,
(uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
}
}
vm_ksubmap_init(&kmi);
printf("avail memory = %ju (%ju MB)\n",
(uintmax_t)ptoa(cnt.v_free_count),
(uintmax_t)ptoa(cnt.v_free_count) / 1048576);
bufinit();
vm_pager_bufferinit();
pcb->un_32.pcb32_und_sp = (u_int)thread0.td_kstack +
USPACE_UNDEF_STACK_TOP;
pcb->un_32.pcb32_sp = (u_int)thread0.td_kstack +
USPACE_SVC_STACK_TOP;
vector_page_setprot(VM_PROT_READ);
pmap_set_pcb_pagedir(pmap_kernel(), pcb);
pmap_postinit();
#ifdef ARM_CACHE_LOCK_ENABLE
pmap_kenter_user(ARM_TP_ADDRESS, ARM_TP_ADDRESS);
arm_lock_cache_line(ARM_TP_ADDRESS);
#else
m = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_ZERO);
pmap_kenter_user(ARM_TP_ADDRESS, VM_PAGE_TO_PHYS(m));
#endif
}
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
/*
* Flush the D-cache for non-DMA I/O so that the I-cache can
* be made coherent later.
*/
void
cpu_flush_dcache(void *ptr, size_t len)
{
cpu_dcache_wb_range((uintptr_t)ptr, len);
cpu_l2cache_wb_range((uintptr_t)ptr, len);
}
/* Get current clock frequency for the given cpu id. */
int
cpu_est_clockrate(int cpu_id, uint64_t *rate)
{
return (ENXIO);
}
void
cpu_idle(int busy)
{
cpu_sleep(0);
}
int
cpu_idle_wakeup(int cpu)
{
return (0);
}
int
fill_regs(struct thread *td, struct reg *regs)
{
struct trapframe *tf = td->td_frame;
bcopy(&tf->tf_r0, regs->r, sizeof(regs->r));
regs->r_sp = tf->tf_usr_sp;
regs->r_lr = tf->tf_usr_lr;
regs->r_pc = tf->tf_pc;
regs->r_cpsr = tf->tf_spsr;
return (0);
}
int
fill_fpregs(struct thread *td, struct fpreg *regs)
{
bzero(regs, sizeof(*regs));
return (0);
}
int
set_regs(struct thread *td, struct reg *regs)
{
struct trapframe *tf = td->td_frame;
bcopy(regs->r, &tf->tf_r0, sizeof(regs->r));
tf->tf_usr_sp = regs->r_sp;
tf->tf_usr_lr = regs->r_lr;
tf->tf_pc = regs->r_pc;
tf->tf_spsr &= ~PSR_FLAGS;
tf->tf_spsr |= regs->r_cpsr & PSR_FLAGS;
return (0);
}
int
set_fpregs(struct thread *td, struct fpreg *regs)
{
return (0);
}
int
fill_dbregs(struct thread *td, struct dbreg *regs)
{
return (0);
}
int
set_dbregs(struct thread *td, struct dbreg *regs)
{
return (0);
}
static int
ptrace_read_int(struct thread *td, vm_offset_t addr, u_int32_t *v)
{
struct iovec iov;
struct uio uio;
PROC_LOCK_ASSERT(td->td_proc, MA_NOTOWNED);
iov.iov_base = (caddr_t) v;
iov.iov_len = sizeof(u_int32_t);
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = (off_t)addr;
uio.uio_resid = sizeof(u_int32_t);
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_rw = UIO_READ;
uio.uio_td = td;
return proc_rwmem(td->td_proc, &uio);
}
static int
ptrace_write_int(struct thread *td, vm_offset_t addr, u_int32_t v)
{
struct iovec iov;
struct uio uio;
PROC_LOCK_ASSERT(td->td_proc, MA_NOTOWNED);
iov.iov_base = (caddr_t) &v;
iov.iov_len = sizeof(u_int32_t);
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = (off_t)addr;
uio.uio_resid = sizeof(u_int32_t);
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_rw = UIO_WRITE;
uio.uio_td = td;
return proc_rwmem(td->td_proc, &uio);
}
int
ptrace_single_step(struct thread *td)
{
struct proc *p;
int error;
KASSERT(td->td_md.md_ptrace_instr == 0,
("Didn't clear single step"));
p = td->td_proc;
PROC_UNLOCK(p);
error = ptrace_read_int(td, td->td_frame->tf_pc + 4,
&td->td_md.md_ptrace_instr);
if (error)
goto out;
error = ptrace_write_int(td, td->td_frame->tf_pc + 4,
PTRACE_BREAKPOINT);
if (error)
td->td_md.md_ptrace_instr = 0;
td->td_md.md_ptrace_addr = td->td_frame->tf_pc + 4;
out:
PROC_LOCK(p);
return (error);
}
int
ptrace_clear_single_step(struct thread *td)
{
struct proc *p;
if (td->td_md.md_ptrace_instr) {
p = td->td_proc;
PROC_UNLOCK(p);
ptrace_write_int(td, td->td_md.md_ptrace_addr,
td->td_md.md_ptrace_instr);
PROC_LOCK(p);
td->td_md.md_ptrace_instr = 0;
}
return (0);
}
int
ptrace_set_pc(struct thread *td, unsigned long addr)
{
td->td_frame->tf_pc = addr;
return (0);
}
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{
}
void
spinlock_enter(void)
{
struct thread *td;
register_t cspr;
td = curthread;
if (td->td_md.md_spinlock_count == 0) {
cspr = disable_interrupts(I32_bit | F32_bit);
td->td_md.md_spinlock_count = 1;
td->td_md.md_saved_cspr = cspr;
} else
td->td_md.md_spinlock_count++;
critical_enter();
}
void
spinlock_exit(void)
{
struct thread *td;
register_t cspr;
td = curthread;
critical_exit();
cspr = td->td_md.md_saved_cspr;
td->td_md.md_spinlock_count--;
if (td->td_md.md_spinlock_count == 0)
restore_interrupts(cspr);
}
/*
* Clear registers on exec
*/
void
exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
struct trapframe *tf = td->td_frame;
memset(tf, 0, sizeof(*tf));
tf->tf_usr_sp = stack;
tf->tf_usr_lr = imgp->entry_addr;
tf->tf_svc_lr = 0x77777777;
tf->tf_pc = imgp->entry_addr;
tf->tf_spsr = PSR_USR32_MODE;
}
/*
* Get machine context.
*/
int
get_mcontext(struct thread *td, mcontext_t *mcp, int clear_ret)
{
struct trapframe *tf = td->td_frame;
__greg_t *gr = mcp->__gregs;
if (clear_ret & GET_MC_CLEAR_RET)
gr[_REG_R0] = 0;
else
gr[_REG_R0] = tf->tf_r0;
gr[_REG_R1] = tf->tf_r1;
gr[_REG_R2] = tf->tf_r2;
gr[_REG_R3] = tf->tf_r3;
gr[_REG_R4] = tf->tf_r4;
gr[_REG_R5] = tf->tf_r5;
gr[_REG_R6] = tf->tf_r6;
gr[_REG_R7] = tf->tf_r7;
gr[_REG_R8] = tf->tf_r8;
gr[_REG_R9] = tf->tf_r9;
gr[_REG_R10] = tf->tf_r10;
gr[_REG_R11] = tf->tf_r11;
gr[_REG_R12] = tf->tf_r12;
gr[_REG_SP] = tf->tf_usr_sp;
gr[_REG_LR] = tf->tf_usr_lr;
gr[_REG_PC] = tf->tf_pc;
gr[_REG_CPSR] = tf->tf_spsr;
return (0);
}
/*
* Set machine context.
*
* However, we don't set any but the user modifiable flags, and we won't
* touch the cs selector.
*/
int
set_mcontext(struct thread *td, const mcontext_t *mcp)
{
struct trapframe *tf = td->td_frame;
const __greg_t *gr = mcp->__gregs;
tf->tf_r0 = gr[_REG_R0];
tf->tf_r1 = gr[_REG_R1];
tf->tf_r2 = gr[_REG_R2];
tf->tf_r3 = gr[_REG_R3];
tf->tf_r4 = gr[_REG_R4];
tf->tf_r5 = gr[_REG_R5];
tf->tf_r6 = gr[_REG_R6];
tf->tf_r7 = gr[_REG_R7];
tf->tf_r8 = gr[_REG_R8];
tf->tf_r9 = gr[_REG_R9];
tf->tf_r10 = gr[_REG_R10];
tf->tf_r11 = gr[_REG_R11];
tf->tf_r12 = gr[_REG_R12];
tf->tf_usr_sp = gr[_REG_SP];
tf->tf_usr_lr = gr[_REG_LR];
tf->tf_pc = gr[_REG_PC];
tf->tf_spsr = gr[_REG_CPSR];
return (0);
}
/*
* MPSAFE
*/
int
-sigreturn(td, uap)
+sys_sigreturn(td, uap)
struct thread *td;
struct sigreturn_args /* {
const struct __ucontext *sigcntxp;
} */ *uap;
{
struct sigframe sf;
struct trapframe *tf;
int spsr;
if (uap == NULL)
return (EFAULT);
if (copyin(uap->sigcntxp, &sf, sizeof(sf)))
return (EFAULT);
/*
* Make sure the processor mode has not been tampered with and
* interrupts have not been disabled.
*/
spsr = sf.sf_uc.uc_mcontext.__gregs[_REG_CPSR];
if ((spsr & PSR_MODE) != PSR_USR32_MODE ||
(spsr & (I32_bit | F32_bit)) != 0)
return (EINVAL);
/* Restore register context. */
tf = td->td_frame;
set_mcontext(td, &sf.sf_uc.uc_mcontext);
/* Restore signal mask. */
kern_sigprocmask(td, SIG_SETMASK, &sf.sf_uc.uc_sigmask, NULL, 0);
return (EJUSTRETURN);
}
/*
* Construct a PCB from a trapframe. This is called from kdb_trap() where
* we want to start a backtrace from the function that caused us to enter
* the debugger. We have the context in the trapframe, but base the trace
* on the PCB. The PCB doesn't have to be perfect, as long as it contains
* enough for a backtrace.
*/
void
makectx(struct trapframe *tf, struct pcb *pcb)
{
pcb->un_32.pcb32_r8 = tf->tf_r8;
pcb->un_32.pcb32_r9 = tf->tf_r9;
pcb->un_32.pcb32_r10 = tf->tf_r10;
pcb->un_32.pcb32_r11 = tf->tf_r11;
pcb->un_32.pcb32_r12 = tf->tf_r12;
pcb->un_32.pcb32_pc = tf->tf_pc;
pcb->un_32.pcb32_lr = tf->tf_usr_lr;
pcb->un_32.pcb32_sp = tf->tf_usr_sp;
}
/*
* Fake up a boot descriptor table
*/
vm_offset_t
fake_preload_metadata(void)
{
#ifdef DDB
vm_offset_t zstart = 0, zend = 0;
#endif
vm_offset_t lastaddr;
int i = 0;
static uint32_t fake_preload[35];
fake_preload[i++] = MODINFO_NAME;
fake_preload[i++] = strlen("elf kernel") + 1;
strcpy((char*)&fake_preload[i++], "elf kernel");
i += 2;
fake_preload[i++] = MODINFO_TYPE;
fake_preload[i++] = strlen("elf kernel") + 1;
strcpy((char*)&fake_preload[i++], "elf kernel");
i += 2;
fake_preload[i++] = MODINFO_ADDR;
fake_preload[i++] = sizeof(vm_offset_t);
fake_preload[i++] = KERNVIRTADDR;
fake_preload[i++] = MODINFO_SIZE;
fake_preload[i++] = sizeof(uint32_t);
fake_preload[i++] = (uint32_t)&end - KERNVIRTADDR;
#ifdef DDB
if (*(uint32_t *)KERNVIRTADDR == MAGIC_TRAMP_NUMBER) {
fake_preload[i++] = MODINFO_METADATA|MODINFOMD_SSYM;
fake_preload[i++] = sizeof(vm_offset_t);
fake_preload[i++] = *(uint32_t *)(KERNVIRTADDR + 4);
fake_preload[i++] = MODINFO_METADATA|MODINFOMD_ESYM;
fake_preload[i++] = sizeof(vm_offset_t);
fake_preload[i++] = *(uint32_t *)(KERNVIRTADDR + 8);
lastaddr = *(uint32_t *)(KERNVIRTADDR + 8);
zend = lastaddr;
zstart = *(uint32_t *)(KERNVIRTADDR + 4);
ksym_start = zstart;
ksym_end = zend;
} else
#endif
lastaddr = (vm_offset_t)&end;
fake_preload[i++] = 0;
fake_preload[i] = 0;
preload_metadata = (void *)fake_preload;
return (lastaddr);
}
Index: head/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c (revision 225616)
+++ head/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c (revision 225617)
@@ -1,16518 +1,16518 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*
* $FreeBSD$
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
/*
* DTrace - Dynamic Tracing for Solaris
*
* This is the implementation of the Solaris Dynamic Tracing framework
* (DTrace). The user-visible interface to DTrace is described at length in
* the "Solaris Dynamic Tracing Guide". The interfaces between the libdtrace
* library, the in-kernel DTrace framework, and the DTrace providers are
* described in the block comments in the <sys/dtrace.h> header file. The
* internal architecture of DTrace is described in the block comments in the
* <sys/dtrace_impl.h> header file. The comments contained within the DTrace
* implementation very much assume mastery of all of these sources; if one has
* an unanswered question about the implementation, one should consult them
* first.
*
* The functions here are ordered roughly as follows:
*
* - Probe context functions
* - Probe hashing functions
* - Non-probe context utility functions
* - Matching functions
* - Provider-to-Framework API functions
* - Probe management functions
* - DIF object functions
* - Format functions
* - Predicate functions
* - ECB functions
* - Buffer functions
* - Enabling functions
* - DOF functions
* - Anonymous enabling functions
* - Consumer state functions
* - Helper functions
* - Hook functions
* - Driver cookbook functions
*
* Each group of functions begins with a block comment labelled the "DTrace
* [Group] Functions", allowing one to find each block by searching forward
* on capital-f functions.
*/
#include <sys/errno.h>
#if !defined(sun)
#include <sys/time.h>
#endif
#include <sys/stat.h>
#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/systm.h>
#if defined(sun)
#include <sys/ddi.h>
#include <sys/sunddi.h>
#endif
#include <sys/cpuvar.h>
#include <sys/kmem.h>
#if defined(sun)
#include <sys/strsubr.h>
#endif
#include <sys/sysmacros.h>
#include <sys/dtrace_impl.h>
#include <sys/atomic.h>
#include <sys/cmn_err.h>
#if defined(sun)
#include <sys/mutex_impl.h>
#include <sys/rwlock_impl.h>
#endif
#include <sys/ctf_api.h>
#if defined(sun)
#include <sys/panic.h>
#include <sys/priv_impl.h>
#endif
#include <sys/policy.h>
#if defined(sun)
#include <sys/cred_impl.h>
#include <sys/procfs_isa.h>
#endif
#include <sys/taskq.h>
#if defined(sun)
#include <sys/mkdev.h>
#include <sys/kdi.h>
#endif
#include <sys/zone.h>
#include <sys/socket.h>
#include <netinet/in.h>
/* FreeBSD includes: */
#if !defined(sun)
#include <sys/callout.h>
#include <sys/ctype.h>
#include <sys/limits.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/sysctl.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/rwlock.h>
#include <sys/sx.h>
#include <sys/dtrace_bsd.h>
#include <netinet/in.h>
#include "dtrace_cddl.h"
#include "dtrace_debug.c"
#endif
/*
* DTrace Tunable Variables
*
* The following variables may be tuned by adding a line to /etc/system that
* includes both the name of the DTrace module ("dtrace") and the name of the
* variable. For example:
*
* set dtrace:dtrace_destructive_disallow = 1
*
* In general, the only variables that one should be tuning this way are those
* that affect system-wide DTrace behavior, and for which the default behavior
* is undesirable. Most of these variables are tunable on a per-consumer
* basis using DTrace options, and need not be tuned on a system-wide basis.
* When tuning these variables, avoid pathological values; while some attempt
* is made to verify the integrity of these variables, they are not considered
* part of the supported interface to DTrace, and they are therefore not
* checked comprehensively. Further, these variables should not be tuned
* dynamically via "mdb -kw" or other means; they should only be tuned via
* /etc/system.
*/
int dtrace_destructive_disallow = 0;
dtrace_optval_t dtrace_nonroot_maxsize = (16 * 1024 * 1024);
size_t dtrace_difo_maxsize = (256 * 1024);
dtrace_optval_t dtrace_dof_maxsize = (256 * 1024);
size_t dtrace_global_maxsize = (16 * 1024);
size_t dtrace_actions_max = (16 * 1024);
size_t dtrace_retain_max = 1024;
dtrace_optval_t dtrace_helper_actions_max = 32;
dtrace_optval_t dtrace_helper_providers_max = 32;
dtrace_optval_t dtrace_dstate_defsize = (1 * 1024 * 1024);
size_t dtrace_strsize_default = 256;
dtrace_optval_t dtrace_cleanrate_default = 9900990; /* 101 hz */
dtrace_optval_t dtrace_cleanrate_min = 200000; /* 5000 hz */
dtrace_optval_t dtrace_cleanrate_max = (uint64_t)60 * NANOSEC; /* 1/minute */
dtrace_optval_t dtrace_aggrate_default = NANOSEC; /* 1 hz */
dtrace_optval_t dtrace_statusrate_default = NANOSEC; /* 1 hz */
dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC; /* 6/minute */
dtrace_optval_t dtrace_switchrate_default = NANOSEC; /* 1 hz */
dtrace_optval_t dtrace_nspec_default = 1;
dtrace_optval_t dtrace_specsize_default = 32 * 1024;
dtrace_optval_t dtrace_stackframes_default = 20;
dtrace_optval_t dtrace_ustackframes_default = 20;
dtrace_optval_t dtrace_jstackframes_default = 50;
dtrace_optval_t dtrace_jstackstrsize_default = 512;
int dtrace_msgdsize_max = 128;
hrtime_t dtrace_chill_max = 500 * (NANOSEC / MILLISEC); /* 500 ms */
hrtime_t dtrace_chill_interval = NANOSEC; /* 1000 ms */
int dtrace_devdepth_max = 32;
int dtrace_err_verbose;
hrtime_t dtrace_deadman_interval = NANOSEC;
hrtime_t dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;
hrtime_t dtrace_deadman_user = (hrtime_t)30 * NANOSEC;
/*
* DTrace External Variables
*
* As dtrace(7D) is a kernel module, any DTrace variables are obviously
* available to DTrace consumers via the backtick (`) syntax. One of these,
* dtrace_zero, is made deliberately so: it is provided as a source of
* well-known, zero-filled memory. While this variable is not documented,
* it is used by some translators as an implementation detail.
*/
const char dtrace_zero[256] = { 0 }; /* zero-filled memory */
/*
* DTrace Internal Variables
*/
#if defined(sun)
static dev_info_t *dtrace_devi; /* device info */
#endif
#if defined(sun)
static vmem_t *dtrace_arena; /* probe ID arena */
static vmem_t *dtrace_minor; /* minor number arena */
static taskq_t *dtrace_taskq; /* task queue */
#else
static struct unrhdr *dtrace_arena; /* Probe ID number. */
#endif
static dtrace_probe_t **dtrace_probes; /* array of all probes */
static int dtrace_nprobes; /* number of probes */
static dtrace_provider_t *dtrace_provider; /* provider list */
static dtrace_meta_t *dtrace_meta_pid; /* user-land meta provider */
static int dtrace_opens; /* number of opens */
static int dtrace_helpers; /* number of helpers */
#if defined(sun)
static void *dtrace_softstate; /* softstate pointer */
#endif
static dtrace_hash_t *dtrace_bymod; /* probes hashed by module */
static dtrace_hash_t *dtrace_byfunc; /* probes hashed by function */
static dtrace_hash_t *dtrace_byname; /* probes hashed by name */
static dtrace_toxrange_t *dtrace_toxrange; /* toxic range array */
static int dtrace_toxranges; /* number of toxic ranges */
static int dtrace_toxranges_max; /* size of toxic range array */
static dtrace_anon_t dtrace_anon; /* anonymous enabling */
static kmem_cache_t *dtrace_state_cache; /* cache for dynamic state */
static uint64_t dtrace_vtime_references; /* number of vtimestamp refs */
static kthread_t *dtrace_panicked; /* panicking thread */
static dtrace_ecb_t *dtrace_ecb_create_cache; /* cached created ECB */
static dtrace_genid_t dtrace_probegen; /* current probe generation */
static dtrace_helpers_t *dtrace_deferred_pid; /* deferred helper list */
static dtrace_enabling_t *dtrace_retained; /* list of retained enablings */
static dtrace_dynvar_t dtrace_dynhash_sink; /* end of dynamic hash chains */
#if !defined(sun)
static struct mtx dtrace_unr_mtx;
MTX_SYSINIT(dtrace_unr_mtx, &dtrace_unr_mtx, "Unique resource identifier", MTX_DEF);
int dtrace_in_probe; /* non-zero if executing a probe */
#if defined(__i386__) || defined(__amd64__)
uintptr_t dtrace_in_probe_addr; /* Address of invop when already in probe */
#endif
#endif
/*
* DTrace Locking
* DTrace is protected by three (relatively coarse-grained) locks:
*
* (1) dtrace_lock is required to manipulate essentially any DTrace state,
* including enabling state, probes, ECBs, consumer state, helper state,
* etc. Importantly, dtrace_lock is _not_ required when in probe context;
* probe context is lock-free -- synchronization is handled via the
* dtrace_sync() cross call mechanism.
*
* (2) dtrace_provider_lock is required when manipulating provider state, or
* when provider state must be held constant.
*
* (3) dtrace_meta_lock is required when manipulating meta provider state, or
* when meta provider state must be held constant.
*
* The lock ordering between these three locks is dtrace_meta_lock before
* dtrace_provider_lock before dtrace_lock. (In particular, there are
* several places where dtrace_provider_lock is held by the framework as it
* calls into the providers -- which then call back into the framework,
* grabbing dtrace_lock.)
*
* There are two other locks in the mix: mod_lock and cpu_lock. With respect
* to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
* role as a coarse-grained lock; it is acquired before both of these locks.
* With respect to dtrace_meta_lock, its behavior is stranger: cpu_lock must
* be acquired _between_ dtrace_meta_lock and any other DTrace locks.
* mod_lock is similar with respect to dtrace_provider_lock in that it must be
* acquired _between_ dtrace_provider_lock and dtrace_lock.
*/
static kmutex_t dtrace_lock; /* probe state lock */
static kmutex_t dtrace_provider_lock; /* provider state lock */
static kmutex_t dtrace_meta_lock; /* meta-provider state lock */
#if !defined(sun)
/* XXX FreeBSD hacks. */
static kmutex_t mod_lock;
#define cr_suid cr_svuid
#define cr_sgid cr_svgid
#define ipaddr_t in_addr_t
#define mod_modname pathname
#define vuprintf vprintf
#define ttoproc(_a) ((_a)->td_proc)
#define crgetzoneid(_a) 0
#define NCPU MAXCPU
#define SNOCD 0
#define CPU_ON_INTR(_a) 0
#define PRIV_EFFECTIVE (1 << 0)
#define PRIV_DTRACE_KERNEL (1 << 1)
#define PRIV_DTRACE_PROC (1 << 2)
#define PRIV_DTRACE_USER (1 << 3)
#define PRIV_PROC_OWNER (1 << 4)
#define PRIV_PROC_ZONE (1 << 5)
#define PRIV_ALL ~0
SYSCTL_NODE(_debug, OID_AUTO, dtrace, CTLFLAG_RD, 0, "DTrace Information");
#endif
#if defined(sun)
#define curcpu CPU->cpu_id
#endif
/*
* DTrace Provider Variables
*
* These are the variables relating to DTrace as a provider (that is, the
* provider of the BEGIN, END, and ERROR probes).
*/
static dtrace_pattr_t dtrace_provider_attr = {
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
};
static void
dtrace_nullop(void)
{}
static dtrace_pops_t dtrace_provider_ops = {
(void (*)(void *, dtrace_probedesc_t *))dtrace_nullop,
(void (*)(void *, modctl_t *))dtrace_nullop,
(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
NULL,
NULL,
NULL,
(void (*)(void *, dtrace_id_t, void *))dtrace_nullop
};
static dtrace_id_t dtrace_probeid_begin; /* special BEGIN probe */
static dtrace_id_t dtrace_probeid_end; /* special END probe */
dtrace_id_t dtrace_probeid_error; /* special ERROR probe */
/*
* DTrace Helper Tracing Variables
*/
uint32_t dtrace_helptrace_next = 0;
uint32_t dtrace_helptrace_nlocals;
char *dtrace_helptrace_buffer;
int dtrace_helptrace_bufsize = 512 * 1024;
#ifdef DEBUG
int dtrace_helptrace_enabled = 1;
#else
int dtrace_helptrace_enabled = 0;
#endif
/*
* DTrace Error Hashing
*
* On DEBUG kernels, DTrace will track the errors that it has seen in a hash
* table. This is very useful for checking coverage of tests that are
* expected to induce DIF or DOF processing errors, and may be useful for
* debugging problems in the DIF code generator or in DOF generation. The
* error hash may be examined with the ::dtrace_errhash MDB dcmd.
*/
#ifdef DEBUG
static dtrace_errhash_t dtrace_errhash[DTRACE_ERRHASHSZ];
static const char *dtrace_errlast;
static kthread_t *dtrace_errthread;
static kmutex_t dtrace_errlock;
#endif
/*
* DTrace Macros and Constants
*
* These are various macros that are useful in various spots in the
* implementation, along with a few random constants that have no meaning
* outside of the implementation. There is no real structure to this cpp
* mishmash -- but is there ever?
*/
#define DTRACE_HASHSTR(hash, probe) \
dtrace_hash_str(*((char **)((uintptr_t)(probe) + (hash)->dth_stroffs)))
#define DTRACE_HASHNEXT(hash, probe) \
(dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_nextoffs)
#define DTRACE_HASHPREV(hash, probe) \
(dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_prevoffs)
#define DTRACE_HASHEQ(hash, lhs, rhs) \
(strcmp(*((char **)((uintptr_t)(lhs) + (hash)->dth_stroffs)), \
*((char **)((uintptr_t)(rhs) + (hash)->dth_stroffs))) == 0)
#define DTRACE_AGGHASHSIZE_SLEW 17
#define DTRACE_V4MAPPED_OFFSET (sizeof (uint32_t) * 3)
/*
* The key for a thread-local variable consists of the lower 61 bits of the
* t_did, plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
* We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
* equal to a variable identifier. This is necessary (but not sufficient) to
* assure that global associative arrays never collide with thread-local
* variables. To guarantee that they cannot collide, we must also define the
* order for keying dynamic variables. That order is:
*
* [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
*
* Because the variable-key and the tls-key are in orthogonal spaces, there is
* no way for a global variable key signature to match a thread-local key
* signature.
*/
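/*
 * Editorial illustration (not part of the original source): per the macros
 * below, the resulting thread key packs into a single 64-bit word as
 *
 *   63           61 60                                              0
 *  +---------------+------------------------------------------------+
 *  | intr (3 bits) |  (thread id + DIF_VARIABLE_MAX) & (2^61 - 1)   |
 *  +---------------+------------------------------------------------+
 *
 * The value component is always offset by DIF_VARIABLE_MAX, which is what
 * keeps a thread key from ever equaling a variable identifier used to key
 * a global associative array.
 */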
#if defined(sun)
#define DTRACE_TLS_THRKEY(where) { \
uint_t intr = 0; \
uint_t actv = CPU->cpu_intr_actv >> (LOCK_LEVEL + 1); \
for (; actv; actv >>= 1) \
intr++; \
ASSERT(intr < (1 << 3)); \
(where) = ((curthread->t_did + DIF_VARIABLE_MAX) & \
(((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
}
#else
#define DTRACE_TLS_THRKEY(where) { \
solaris_cpu_t *_c = &solaris_cpu[curcpu]; \
uint_t intr = 0; \
uint_t actv = _c->cpu_intr_actv; \
for (; actv; actv >>= 1) \
intr++; \
ASSERT(intr < (1 << 3)); \
(where) = ((curthread->td_tid + DIF_VARIABLE_MAX) & \
(((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
}
#endif
#define DT_BSWAP_8(x) ((x) & 0xff)
#define DT_BSWAP_16(x) ((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
#define DT_BSWAP_32(x) ((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
#define DT_BSWAP_64(x) ((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))
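/*
 * Editorial worked example (not part of the original source): these macros
 * compose the byte swap from the bottom up, so DT_BSWAP_16(0x1234) yields
 * 0x3412 and DT_BSWAP_32(0x12345678) yields 0x78563412.
 */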
#define DT_MASK_LO 0x00000000FFFFFFFFULL
#define DTRACE_STORE(type, tomax, offset, what) \
*((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what);
#ifndef __i386
#define DTRACE_ALIGNCHECK(addr, size, flags) \
if (addr & (size - 1)) { \
*flags |= CPU_DTRACE_BADALIGN; \
cpu_core[curcpu].cpuc_dtrace_illval = addr; \
return (0); \
}
#else
#define DTRACE_ALIGNCHECK(addr, size, flags)
#endif
/*
* Test whether a range of memory starting at testaddr of size testsz falls
* within the range of memory described by addr, sz. We take care to avoid
* problems with overflow and underflow of the unsigned quantities, and
* disallow all negative sizes. Ranges of size 0 are allowed.
*/
#define DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
((testaddr) - (baseaddr) < (basesz) && \
(testaddr) + (testsz) - (baseaddr) <= (basesz) && \
(testaddr) + (testsz) >= (testaddr))
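/*
 * Editorial worked example (not part of the original source): with
 * baseaddr = 0x1000 and basesz = 0x100, a testaddr of 0xff0 makes
 * (testaddr - baseaddr) underflow to a huge unsigned value and fail the
 * first comparison; a testaddr of 0x10f0 with testsz 0x20 passes the first
 * comparison but fails the second (0x110 <= 0x100 is false) because the
 * range would run past the end; and the third comparison rejects a testsz
 * so large that (testaddr + testsz) wraps around the address space.
 */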
/*
* Test whether alloc_sz bytes will fit in the scratch region. We isolate
* alloc_sz on the righthand side of the comparison in order to avoid overflow
* or underflow in the comparison with it. This is simpler than the INRANGE
* check above, because we know that the dtms_scratch_ptr is valid in the
* range. Allocations of size zero are allowed.
*/
#define DTRACE_INSCRATCH(mstate, alloc_sz) \
((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
(mstate)->dtms_scratch_ptr >= (alloc_sz))
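/*
 * Editorial note (not part of the original source): the left-hand side is
 * simply the number of scratch bytes remaining, which is known to be
 * non-negative because dtms_scratch_ptr always lies within the scratch
 * region.  Writing the test as "remaining >= alloc_sz" rather than
 * "scratch_ptr + alloc_sz <= base + size" is what avoids overflow: even a
 * pathologically large alloc_sz cannot wrap the comparison.
 */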
#define DTRACE_LOADFUNC(bits) \
/*CSTYLED*/ \
uint##bits##_t \
dtrace_load##bits(uintptr_t addr) \
{ \
size_t size = bits / NBBY; \
/*CSTYLED*/ \
uint##bits##_t rval; \
int i; \
volatile uint16_t *flags = (volatile uint16_t *) \
&cpu_core[curcpu].cpuc_dtrace_flags; \
\
DTRACE_ALIGNCHECK(addr, size, flags); \
\
for (i = 0; i < dtrace_toxranges; i++) { \
if (addr >= dtrace_toxrange[i].dtt_limit) \
continue; \
\
if (addr + size <= dtrace_toxrange[i].dtt_base) \
continue; \
\
/* \
* This address falls within a toxic region; return 0. \
*/ \
*flags |= CPU_DTRACE_BADADDR; \
cpu_core[curcpu].cpuc_dtrace_illval = addr; \
return (0); \
} \
\
*flags |= CPU_DTRACE_NOFAULT; \
/*CSTYLED*/ \
rval = *((volatile uint##bits##_t *)addr); \
*flags &= ~CPU_DTRACE_NOFAULT; \
\
return (!(*flags & CPU_DTRACE_FAULT) ? rval : 0); \
}
#ifdef _LP64
#define dtrace_loadptr dtrace_load64
#else
#define dtrace_loadptr dtrace_load32
#endif
#define DTRACE_DYNHASH_FREE 0
#define DTRACE_DYNHASH_SINK 1
#define DTRACE_DYNHASH_VALID 2
#define DTRACE_MATCH_NEXT 0
#define DTRACE_MATCH_DONE 1
#define DTRACE_ANCHORED(probe) ((probe)->dtpr_func[0] != '\0')
#define DTRACE_STATE_ALIGN 64
#define DTRACE_FLAGS2FLT(flags) \
(((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR : \
((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP : \
((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO : \
((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV : \
((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV : \
((flags) & CPU_DTRACE_TUPOFLOW) ? DTRACEFLT_TUPOFLOW : \
((flags) & CPU_DTRACE_BADALIGN) ? DTRACEFLT_BADALIGN : \
((flags) & CPU_DTRACE_NOSCRATCH) ? DTRACEFLT_NOSCRATCH : \
((flags) & CPU_DTRACE_BADSTACK) ? DTRACEFLT_BADSTACK : \
DTRACEFLT_UNKNOWN)
#define DTRACEACT_ISSTRING(act) \
((act)->dta_kind == DTRACEACT_DIFEXPR && \
(act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)
/* Function prototype definitions: */
static size_t dtrace_strlen(const char *, size_t);
static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
static void dtrace_enabling_provide(dtrace_provider_t *);
static int dtrace_enabling_match(dtrace_enabling_t *, int *);
static void dtrace_enabling_matchall(void);
static dtrace_state_t *dtrace_anon_grab(void);
static uint64_t dtrace_helper(int, dtrace_mstate_t *,
dtrace_state_t *, uint64_t, uint64_t);
static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
static void dtrace_buffer_drop(dtrace_buffer_t *);
static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
dtrace_state_t *, dtrace_mstate_t *);
static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
dtrace_optval_t);
static int dtrace_ecb_create_enable(dtrace_probe_t *, void *);
static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
uint16_t dtrace_load16(uintptr_t);
uint32_t dtrace_load32(uintptr_t);
uint64_t dtrace_load64(uintptr_t);
uint8_t dtrace_load8(uintptr_t);
void dtrace_dynvar_clean(dtrace_dstate_t *);
dtrace_dynvar_t *dtrace_dynvar(dtrace_dstate_t *, uint_t, dtrace_key_t *,
size_t, dtrace_dynvar_op_t, dtrace_mstate_t *, dtrace_vstate_t *);
uintptr_t dtrace_dif_varstr(uintptr_t, dtrace_state_t *, dtrace_mstate_t *);
/*
* DTrace Probe Context Functions
*
* These functions are called from probe context. Because probe context is
 * any context in which C may be called, arbitrary locks may be held,
* interrupts may be disabled, we may be in arbitrary dispatched state, etc.
* As a result, functions called from probe context may only call other DTrace
* support functions -- they may not interact at all with the system at large.
* (Note that the ASSERT macro is made probe-context safe by redefining it in
* terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
* loads are to be performed from probe context, they _must_ be in terms of
* the safe dtrace_load*() variants.
*
* Some functions in this block are not actually called from probe context;
* for these functions, there will be a comment above the function reading
* "Note: not called from probe context."
*/
void
dtrace_panic(const char *format, ...)
{
va_list alist;
va_start(alist, format);
dtrace_vpanic(format, alist);
va_end(alist);
}
int
dtrace_assfail(const char *a, const char *f, int l)
{
dtrace_panic("assertion failed: %s, file: %s, line: %d", a, f, l);
/*
* We just need something here that even the most clever compiler
* cannot optimize away.
*/
return (a[(uintptr_t)f]);
}
/*
* Atomically increment a specified error counter from probe context.
*/
static void
dtrace_error(uint32_t *counter)
{
/*
* Most counters stored to in probe context are per-CPU counters.
* However, there are some error conditions that are sufficiently
* arcane that they don't merit per-CPU storage. If these counters
* are incremented concurrently on different CPUs, scalability will be
* adversely affected -- but we don't expect them to be white-hot in a
* correctly constructed enabling...
*/
uint32_t oval, nval;
do {
oval = *counter;
if ((nval = oval + 1) == 0) {
/*
* If the counter would wrap, set it to 1 -- assuring
* that the counter is never zero when we have seen
* errors. (The counter must be 32-bits because we
* aren't guaranteed a 64-bit compare&swap operation.)
* To save this code both the infamy of being fingered
* by a priggish news story and the indignity of being
* the target of a neo-puritan witch trial, we're
* carefully avoiding any colorful description of the
* likelihood of this condition -- but suffice it to
* say that it is only slightly more likely than the
* overflow of predicate cache IDs, as discussed in
* dtrace_predicate_create().
*/
nval = 1;
}
} while (dtrace_cas32(counter, oval, nval) != oval);
}
/*
* Use the DTRACE_LOADFUNC macro to define functions for each of loading a
* uint8_t, a uint16_t, a uint32_t and a uint64_t.
*/
DTRACE_LOADFUNC(8)
DTRACE_LOADFUNC(16)
DTRACE_LOADFUNC(32)
DTRACE_LOADFUNC(64)
static int
dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
{
if (dest < mstate->dtms_scratch_base)
return (0);
if (dest + size < dest)
return (0);
if (dest + size > mstate->dtms_scratch_ptr)
return (0);
return (1);
}
static int
dtrace_canstore_statvar(uint64_t addr, size_t sz,
dtrace_statvar_t **svars, int nsvars)
{
int i;
for (i = 0; i < nsvars; i++) {
dtrace_statvar_t *svar = svars[i];
if (svar == NULL || svar->dtsv_size == 0)
continue;
if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size))
return (1);
}
return (0);
}
/*
* Check to see if the address is within a memory region to which a store may
* be issued. This includes the DTrace scratch areas, and any DTrace variable
* region. The caller of dtrace_canstore() is responsible for performing any
* alignment checks that are needed before stores are actually executed.
*/
static int
dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
dtrace_vstate_t *vstate)
{
/*
* First, check to see if the address is in scratch space...
*/
if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
mstate->dtms_scratch_size))
return (1);
/*
* Now check to see if it's a dynamic variable. This check will pick
* up both thread-local variables and any global dynamically-allocated
* variables.
*/
if (DTRACE_INRANGE(addr, sz, (uintptr_t)vstate->dtvs_dynvars.dtds_base,
vstate->dtvs_dynvars.dtds_size)) {
dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
uintptr_t base = (uintptr_t)dstate->dtds_base +
(dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
uintptr_t chunkoffs;
/*
* Before we assume that we can store here, we need to make
* sure that it isn't in our metadata -- storing to our
* dynamic variable metadata would corrupt our state. For
* the range to not include any dynamic variable metadata,
* it must:
*
* (1) Start above the hash table that is at the base of
* the dynamic variable space
*
* (2) Have a starting chunk offset that is beyond the
* dtrace_dynvar_t that is at the base of every chunk
*
* (3) Not span a chunk boundary
*
*/
if (addr < base)
return (0);
chunkoffs = (addr - base) % dstate->dtds_chunksize;
if (chunkoffs < sizeof (dtrace_dynvar_t))
return (0);
if (chunkoffs + sz > dstate->dtds_chunksize)
return (0);
return (1);
}
/*
* Finally, check the static local and global variables. These checks
* take the longest, so we perform them last.
*/
if (dtrace_canstore_statvar(addr, sz,
vstate->dtvs_locals, vstate->dtvs_nlocals))
return (1);
if (dtrace_canstore_statvar(addr, sz,
vstate->dtvs_globals, vstate->dtvs_nglobals))
return (1);
return (0);
}
/*
* Convenience routine to check to see if the address is within a memory
* region in which a load may be issued given the user's privilege level;
* if not, it sets the appropriate error flags and loads 'addr' into the
* illegal value slot.
*
* DTrace subroutines (DIF_SUBR_*) should use this helper to implement
* appropriate memory access protection.
*/
static int
dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
dtrace_vstate_t *vstate)
{
volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
/*
* If we hold the privilege to read from kernel memory, then
* everything is readable.
*/
if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
return (1);
/*
* You can obviously read that which you can store.
*/
if (dtrace_canstore(addr, sz, mstate, vstate))
return (1);
/*
* We're allowed to read from our own string table.
*/
if (DTRACE_INRANGE(addr, sz, (uintptr_t)mstate->dtms_difo->dtdo_strtab,
mstate->dtms_difo->dtdo_strlen))
return (1);
DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
*illval = addr;
return (0);
}
/*
* Convenience routine to check to see if a given string is within a memory
* region in which a load may be issued given the user's privilege level;
* this exists so that we don't need to issue unnecessary dtrace_strlen()
* calls in the event that the user has all privileges.
*/
static int
dtrace_strcanload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
dtrace_vstate_t *vstate)
{
size_t strsz;
/*
* If we hold the privilege to read from kernel memory, then
* everything is readable.
*/
if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
return (1);
strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr, sz);
if (dtrace_canload(addr, strsz, mstate, vstate))
return (1);
return (0);
}
/*
* Convenience routine to check to see if a given variable is within a memory
* region in which a load may be issued given the user's privilege level.
*/
static int
dtrace_vcanload(void *src, dtrace_diftype_t *type, dtrace_mstate_t *mstate,
dtrace_vstate_t *vstate)
{
size_t sz;
ASSERT(type->dtdt_flags & DIF_TF_BYREF);
/*
* If we hold the privilege to read from kernel memory, then
* everything is readable.
*/
if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
return (1);
if (type->dtdt_kind == DIF_TYPE_STRING)
sz = dtrace_strlen(src,
vstate->dtvs_state->dts_options[DTRACEOPT_STRSIZE]) + 1;
else
sz = type->dtdt_size;
return (dtrace_canload((uintptr_t)src, sz, mstate, vstate));
}
/*
* Compare two strings using safe loads.
*/
static int
dtrace_strncmp(char *s1, char *s2, size_t limit)
{
uint8_t c1, c2;
volatile uint16_t *flags;
if (s1 == s2 || limit == 0)
return (0);
flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;
do {
if (s1 == NULL) {
c1 = '\0';
} else {
c1 = dtrace_load8((uintptr_t)s1++);
}
if (s2 == NULL) {
c2 = '\0';
} else {
c2 = dtrace_load8((uintptr_t)s2++);
}
if (c1 != c2)
return (c1 - c2);
} while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));
return (0);
}
/*
* Compute strlen(s) for a string using safe memory accesses. The additional
* len parameter is used to specify a maximum length to ensure completion.
*/
static size_t
dtrace_strlen(const char *s, size_t lim)
{
uint_t len;
for (len = 0; len != lim; len++) {
if (dtrace_load8((uintptr_t)s++) == '\0')
break;
}
return (len);
}
/*
* Check if an address falls within a toxic region.
*/
static int
dtrace_istoxic(uintptr_t kaddr, size_t size)
{
uintptr_t taddr, tsize;
int i;
for (i = 0; i < dtrace_toxranges; i++) {
taddr = dtrace_toxrange[i].dtt_base;
tsize = dtrace_toxrange[i].dtt_limit - taddr;
if (kaddr - taddr < tsize) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
cpu_core[curcpu].cpuc_dtrace_illval = kaddr;
return (1);
}
if (taddr - kaddr < size) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
cpu_core[curcpu].cpuc_dtrace_illval = taddr;
return (1);
}
}
return (0);
}
/*
* Copy src to dst using safe memory accesses. The src is assumed to be unsafe
* memory specified by the DIF program. The dst is assumed to be safe memory
* that we can store to directly because it is managed by DTrace. As with
* standard bcopy, overlapping copies are handled properly.
*/
static void
dtrace_bcopy(const void *src, void *dst, size_t len)
{
if (len != 0) {
uint8_t *s1 = dst;
const uint8_t *s2 = src;
if (s1 <= s2) {
do {
*s1++ = dtrace_load8((uintptr_t)s2++);
} while (--len != 0);
} else {
s2 += len;
s1 += len;
do {
*--s1 = dtrace_load8((uintptr_t)--s2);
} while (--len != 0);
}
}
}
/*
* Copy src to dst using safe memory accesses, up to either the specified
* length, or the point that a nul byte is encountered. The src is assumed to
* be unsafe memory specified by the DIF program. The dst is assumed to be
* safe memory that we can store to directly because it is managed by DTrace.
* Unlike dtrace_bcopy(), overlapping regions are not handled.
*/
static void
dtrace_strcpy(const void *src, void *dst, size_t len)
{
if (len != 0) {
uint8_t *s1 = dst, c;
const uint8_t *s2 = src;
do {
*s1++ = c = dtrace_load8((uintptr_t)s2++);
} while (--len != 0 && c != '\0');
}
}
/*
* Copy src to dst, deriving the size and type from the specified (BYREF)
* variable type. The src is assumed to be unsafe memory specified by the DIF
* program. The dst is assumed to be DTrace variable memory that is of the
* specified type; we assume that we can store to directly.
*/
static void
dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type)
{
ASSERT(type->dtdt_flags & DIF_TF_BYREF);
if (type->dtdt_kind == DIF_TYPE_STRING) {
dtrace_strcpy(src, dst, type->dtdt_size);
} else {
dtrace_bcopy(src, dst, type->dtdt_size);
}
}
/*
* Compare s1 to s2 using safe memory accesses. The s1 data is assumed to be
* unsafe memory specified by the DIF program. The s2 data is assumed to be
* safe memory that we can access directly because it is managed by DTrace.
*/
static int
dtrace_bcmp(const void *s1, const void *s2, size_t len)
{
volatile uint16_t *flags;
flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;
if (s1 == s2)
return (0);
if (s1 == NULL || s2 == NULL)
return (1);
if (s1 != s2 && len != 0) {
const uint8_t *ps1 = s1;
const uint8_t *ps2 = s2;
do {
if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
return (1);
} while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
}
return (0);
}
/*
* Zero the specified region using a simple byte-by-byte loop. Note that this
* is for safe DTrace-managed memory only.
*/
static void
dtrace_bzero(void *dst, size_t len)
{
uchar_t *cp;
for (cp = dst; len != 0; len--)
*cp++ = 0;
}
static void
dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
{
uint64_t result[2];
result[0] = addend1[0] + addend2[0];
result[1] = addend1[1] + addend2[1] +
(result[0] < addend1[0] || result[0] < addend2[0] ? 1 : 0);
sum[0] = result[0];
sum[1] = result[1];
}
/*
* Shift the 128-bit value in a by b. If b is positive, shift left.
* If b is negative, shift right.
*/
static void
dtrace_shift_128(uint64_t *a, int b)
{
uint64_t mask;
if (b == 0)
return;
if (b < 0) {
b = -b;
if (b >= 64) {
a[0] = a[1] >> (b - 64);
a[1] = 0;
} else {
a[0] >>= b;
mask = 1LL << (64 - b);
mask -= 1;
a[0] |= ((a[1] & mask) << (64 - b));
a[1] >>= b;
}
} else {
if (b >= 64) {
a[1] = a[0] << (b - 64);
a[0] = 0;
} else {
a[1] <<= b;
mask = a[0] >> (64 - b);
a[1] |= mask;
a[0] <<= b;
}
}
}
/*
* The basic idea is to break the 2 64-bit values into 4 32-bit values,
* use native multiplication on those, and then re-combine into the
* resulting 128-bit value.
*
* (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
* hi1 * hi2 << 64 +
* hi1 * lo2 << 32 +
* hi2 * lo1 << 32 +
* lo1 * lo2
*/
static void
dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
{
uint64_t hi1, hi2, lo1, lo2;
uint64_t tmp[2];
hi1 = factor1 >> 32;
hi2 = factor2 >> 32;
lo1 = factor1 & DT_MASK_LO;
lo2 = factor2 & DT_MASK_LO;
product[0] = lo1 * lo2;
product[1] = hi1 * hi2;
tmp[0] = hi1 * lo2;
tmp[1] = 0;
dtrace_shift_128(tmp, 32);
dtrace_add_128(product, tmp, product);
tmp[0] = hi2 * lo1;
tmp[1] = 0;
dtrace_shift_128(tmp, 32);
dtrace_add_128(product, tmp, product);
}
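/*
 * Editorial worked example (not part of the original source): for
 * factor1 == factor2 == 0x100000001 (2^32 + 1), the partial products are
 * lo1*lo2 == 1, hi1*hi2 == 1, and two cross terms of 1 each; after the
 * 32-bit shifts they combine to product[1] == 1 and
 * product[0] == 0x200000001, i.e. 2^64 + 2^33 + 1, as expected for
 * (2^32 + 1)^2.
 */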
/*
* This privilege check should be used by actions and subroutines to
* verify that the user credentials of the process that enabled the
 * invoking ECB match the target credentials.
*/
static int
dtrace_priv_proc_common_user(dtrace_state_t *state)
{
cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
/*
* We should always have a non-NULL state cred here, since if cred
* is null (anonymous tracing), we fast-path bypass this routine.
*/
ASSERT(s_cr != NULL);
if ((cr = CRED()) != NULL &&
s_cr->cr_uid == cr->cr_uid &&
s_cr->cr_uid == cr->cr_ruid &&
s_cr->cr_uid == cr->cr_suid &&
s_cr->cr_gid == cr->cr_gid &&
s_cr->cr_gid == cr->cr_rgid &&
s_cr->cr_gid == cr->cr_sgid)
return (1);
return (0);
}
/*
* This privilege check should be used by actions and subroutines to
* verify that the zone of the process that enabled the invoking ECB
 * matches the target credentials.
*/
static int
dtrace_priv_proc_common_zone(dtrace_state_t *state)
{
#if defined(sun)
cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
/*
* We should always have a non-NULL state cred here, since if cred
* is null (anonymous tracing), we fast-path bypass this routine.
*/
ASSERT(s_cr != NULL);
if ((cr = CRED()) != NULL &&
s_cr->cr_zone == cr->cr_zone)
return (1);
return (0);
#else
return (1);
#endif
}
/*
* This privilege check should be used by actions and subroutines to
* verify that the process has not setuid or changed credentials.
*/
static int
dtrace_priv_proc_common_nocd(void)
{
proc_t *proc;
if ((proc = ttoproc(curthread)) != NULL &&
!(proc->p_flag & SNOCD))
return (1);
return (0);
}
static int
dtrace_priv_proc_destructive(dtrace_state_t *state)
{
int action = state->dts_cred.dcr_action;
if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
dtrace_priv_proc_common_zone(state) == 0)
goto bad;
if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
dtrace_priv_proc_common_user(state) == 0)
goto bad;
if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
dtrace_priv_proc_common_nocd() == 0)
goto bad;
return (1);
bad:
cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
return (0);
}
static int
dtrace_priv_proc_control(dtrace_state_t *state)
{
if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
return (1);
if (dtrace_priv_proc_common_zone(state) &&
dtrace_priv_proc_common_user(state) &&
dtrace_priv_proc_common_nocd())
return (1);
cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
return (0);
}
static int
dtrace_priv_proc(dtrace_state_t *state)
{
if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
return (1);
cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
return (0);
}
static int
dtrace_priv_kernel(dtrace_state_t *state)
{
if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
return (1);
cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
return (0);
}
static int
dtrace_priv_kernel_destructive(dtrace_state_t *state)
{
if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
return (1);
cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
return (0);
}
/*
* Note: not called from probe context. This function is called
* asynchronously (and at a regular interval) from outside of probe context to
* clean the dirty dynamic variable lists on all CPUs. Dynamic variable
* cleaning is explained in detail in <sys/dtrace_impl.h>.
*/
void
dtrace_dynvar_clean(dtrace_dstate_t *dstate)
{
dtrace_dynvar_t *dirty;
dtrace_dstate_percpu_t *dcpu;
int i, work = 0;
for (i = 0; i < NCPU; i++) {
dcpu = &dstate->dtds_percpu[i];
ASSERT(dcpu->dtdsc_rinsing == NULL);
/*
* If the dirty list is NULL, there is no dirty work to do.
*/
if (dcpu->dtdsc_dirty == NULL)
continue;
/*
* If the clean list is non-NULL, then we're not going to do
* any work for this CPU -- it means that there has not been
* a dtrace_dynvar() allocation on this CPU (or from this CPU)
* since the last time we cleaned house.
*/
if (dcpu->dtdsc_clean != NULL)
continue;
work = 1;
/*
* Atomically move the dirty list aside.
*/
do {
dirty = dcpu->dtdsc_dirty;
/*
* Before we zap the dirty list, set the rinsing list.
* (This allows for a potential assertion in
* dtrace_dynvar(): if a free dynamic variable appears
* on a hash chain, either the dirty list or the
* rinsing list for some CPU must be non-NULL.)
*/
dcpu->dtdsc_rinsing = dirty;
dtrace_membar_producer();
} while (dtrace_casptr(&dcpu->dtdsc_dirty,
dirty, NULL) != dirty);
}
if (!work) {
/*
* We have no work to do; we can simply return.
*/
return;
}
dtrace_sync();
for (i = 0; i < NCPU; i++) {
dcpu = &dstate->dtds_percpu[i];
if (dcpu->dtdsc_rinsing == NULL)
continue;
/*
* We are now guaranteed that no hash chain contains a pointer
* into this dirty list; we can make it clean.
*/
ASSERT(dcpu->dtdsc_clean == NULL);
dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
dcpu->dtdsc_rinsing = NULL;
}
/*
* Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
* sure that all CPUs have seen all of the dtdsc_clean pointers.
* This prevents a race whereby a CPU incorrectly decides that
* the state should be something other than DTRACE_DSTATE_CLEAN
* after dtrace_dynvar_clean() has completed.
*/
dtrace_sync();
dstate->dtds_state = DTRACE_DSTATE_CLEAN;
}
/*
 * Depending on the value of the op parameter, this function looks up,
* allocates or deallocates an arbitrarily-keyed dynamic variable. If an
* allocation is requested, this function will return a pointer to a
* dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
* variable can be allocated. If NULL is returned, the appropriate counter
* will be incremented.
*/
dtrace_dynvar_t *
dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
uint64_t hashval = DTRACE_DYNHASH_VALID;
dtrace_dynhash_t *hash = dstate->dtds_hash;
dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
processorid_t me = curcpu, cpu = me;
dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
size_t bucket, ksize;
size_t chunksize = dstate->dtds_chunksize;
uintptr_t kdata, lock, nstate;
uint_t i;
ASSERT(nkeys != 0);
/*
* Hash the key. As with aggregations, we use Jenkins' "One-at-a-time"
* algorithm. For the by-value portions, we perform the algorithm in
* 16-bit chunks (as opposed to 8-bit chunks). This speeds things up a
* bit, and seems to have only a minute effect on distribution. For
* the by-reference data, we perform "One-at-a-time" iterating (safely)
* over each referenced byte. It's painful to do this, but it's much
* better than pathological hash distribution. The efficacy of the
* hashing algorithm (and a comparison with other algorithms) may be
* found by running the ::dtrace_dynstat MDB dcmd.
*/
for (i = 0; i < nkeys; i++) {
if (key[i].dttk_size == 0) {
uint64_t val = key[i].dttk_value;
hashval += (val >> 48) & 0xffff;
hashval += (hashval << 10);
hashval ^= (hashval >> 6);
hashval += (val >> 32) & 0xffff;
hashval += (hashval << 10);
hashval ^= (hashval >> 6);
hashval += (val >> 16) & 0xffff;
hashval += (hashval << 10);
hashval ^= (hashval >> 6);
hashval += val & 0xffff;
hashval += (hashval << 10);
hashval ^= (hashval >> 6);
} else {
/*
* This is incredibly painful, but it beats the hell
* out of the alternative.
*/
uint64_t j, size = key[i].dttk_size;
uintptr_t base = (uintptr_t)key[i].dttk_value;
if (!dtrace_canload(base, size, mstate, vstate))
break;
for (j = 0; j < size; j++) {
hashval += dtrace_load8(base + j);
hashval += (hashval << 10);
hashval ^= (hashval >> 6);
}
}
}
if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
return (NULL);
hashval += (hashval << 3);
hashval ^= (hashval >> 11);
hashval += (hashval << 15);
/*
* There is a remote chance (ideally, 1 in 2^31) that our hashval
* comes out to be one of our two sentinel hash values. If this
* actually happens, we set the hashval to be a value known to be a
* non-sentinel value.
*/
if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
hashval = DTRACE_DYNHASH_VALID;
/*
* Yes, it's painful to do a divide here. If the cycle count becomes
* important here, tricks can be pulled to reduce it. (However, it's
* critical that hash collisions be kept to an absolute minimum;
* they're much more painful than a divide.) It's better to have a
* solution that generates few collisions and still keeps things
* relatively simple.
*/
bucket = hashval % dstate->dtds_hashsize;
if (op == DTRACE_DYNVAR_DEALLOC) {
volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;
for (;;) {
while ((lock = *lockp) & 1)
continue;
if (dtrace_casptr((volatile void *)lockp,
(volatile void *)lock, (volatile void *)(lock + 1)) == (void *)lock)
break;
}
dtrace_membar_producer();
}
top:
prev = NULL;
lock = hash[bucket].dtdh_lock;
dtrace_membar_consumer();
start = hash[bucket].dtdh_chain;
ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
op != DTRACE_DYNVAR_DEALLOC));
for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
dtrace_key_t *dkey = &dtuple->dtt_key[0];
if (dvar->dtdv_hashval != hashval) {
if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
/*
* We've reached the sink, and therefore the
* end of the hash chain; we can kick out of
* the loop knowing that we have seen a valid
* snapshot of state.
*/
ASSERT(dvar->dtdv_next == NULL);
ASSERT(dvar == &dtrace_dynhash_sink);
break;
}
if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
/*
* We've gone off the rails: somewhere along
* the line, one of the members of this hash
* chain was deleted. Note that we could also
* detect this by simply letting this loop run
* to completion, as we would eventually hit
* the end of the dirty list. However, we
* want to avoid running the length of the
* dirty list unnecessarily (it might be quite
* long), so we catch this as early as
* possible by detecting the hash marker. In
* this case, we simply set dvar to NULL and
* break; the conditional after the loop will
* send us back to top.
*/
dvar = NULL;
break;
}
goto next;
}
if (dtuple->dtt_nkeys != nkeys)
goto next;
for (i = 0; i < nkeys; i++, dkey++) {
if (dkey->dttk_size != key[i].dttk_size)
goto next; /* size or type mismatch */
if (dkey->dttk_size != 0) {
if (dtrace_bcmp(
(void *)(uintptr_t)key[i].dttk_value,
(void *)(uintptr_t)dkey->dttk_value,
dkey->dttk_size))
goto next;
} else {
if (dkey->dttk_value != key[i].dttk_value)
goto next;
}
}
if (op != DTRACE_DYNVAR_DEALLOC)
return (dvar);
ASSERT(dvar->dtdv_next == NULL ||
dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);
if (prev != NULL) {
ASSERT(hash[bucket].dtdh_chain != dvar);
ASSERT(start != dvar);
ASSERT(prev->dtdv_next == dvar);
prev->dtdv_next = dvar->dtdv_next;
} else {
if (dtrace_casptr(&hash[bucket].dtdh_chain,
start, dvar->dtdv_next) != start) {
/*
* We have failed to atomically swing the
* hash table head pointer, presumably because
* of a conflicting allocation on another CPU.
* We need to reread the hash chain and try
* again.
*/
goto top;
}
}
dtrace_membar_producer();
/*
* Now set the hash value to indicate that it's free.
*/
ASSERT(hash[bucket].dtdh_chain != dvar);
dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
dtrace_membar_producer();
/*
* Set the next pointer to point at the dirty list, and
* atomically swing the dirty pointer to the newly freed dvar.
*/
do {
next = dcpu->dtdsc_dirty;
dvar->dtdv_next = next;
} while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);
/*
* Finally, unlock this hash bucket.
*/
ASSERT(hash[bucket].dtdh_lock == lock);
ASSERT(lock & 1);
hash[bucket].dtdh_lock++;
return (NULL);
next:
prev = dvar;
continue;
}
if (dvar == NULL) {
/*
* If dvar is NULL, it is because we went off the rails:
* one of the elements that we traversed in the hash chain
* was deleted while we were traversing it. In this case,
* we assert that we aren't doing a dealloc (deallocs lock
* the hash bucket to prevent themselves from racing with
* one another), and retry the hash chain traversal.
*/
ASSERT(op != DTRACE_DYNVAR_DEALLOC);
goto top;
}
if (op != DTRACE_DYNVAR_ALLOC) {
/*
* If we are not to allocate a new variable, we want to
* return NULL now. Before we return, check that the value
* of the lock word hasn't changed. If it has, we may have
* seen an inconsistent snapshot.
*/
if (op == DTRACE_DYNVAR_NOALLOC) {
if (hash[bucket].dtdh_lock != lock)
goto top;
} else {
ASSERT(op == DTRACE_DYNVAR_DEALLOC);
ASSERT(hash[bucket].dtdh_lock == lock);
ASSERT(lock & 1);
hash[bucket].dtdh_lock++;
}
return (NULL);
}
/*
* We need to allocate a new dynamic variable. The size we need is the
* size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
* size of any auxiliary key data (rounded up to 8-byte alignment) plus
* the size of any referred-to data (dsize). We then round the final
* size up to the chunksize for allocation.
*/
for (ksize = 0, i = 0; i < nkeys; i++)
ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
/*
* This should be pretty much impossible, but could happen if, say,
* strange DIF specified the tuple. Ideally, this should be an
* assertion and not an error condition -- but that requires that the
* chunksize calculation in dtrace_difo_chunksize() be absolutely
* bullet-proof. (That is, it must not be able to be fooled by
* malicious DIF.) Given the lack of backwards branches in DIF,
* solving this would presumably not amount to solving the Halting
* Problem -- but it still seems awfully hard.
*/
if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
ksize + dsize > chunksize) {
dcpu->dtdsc_drops++;
return (NULL);
}
nstate = DTRACE_DSTATE_EMPTY;
do {
retry:
free = dcpu->dtdsc_free;
if (free == NULL) {
dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
void *rval;
if (clean == NULL) {
/*
* We're out of dynamic variable space on
* this CPU. Unless we have tried all CPUs,
* we'll try to allocate from a different
* CPU.
*/
switch (dstate->dtds_state) {
case DTRACE_DSTATE_CLEAN: {
void *sp = &dstate->dtds_state;
if (++cpu >= NCPU)
cpu = 0;
if (dcpu->dtdsc_dirty != NULL &&
nstate == DTRACE_DSTATE_EMPTY)
nstate = DTRACE_DSTATE_DIRTY;
if (dcpu->dtdsc_rinsing != NULL)
nstate = DTRACE_DSTATE_RINSING;
dcpu = &dstate->dtds_percpu[cpu];
if (cpu != me)
goto retry;
(void) dtrace_cas32(sp,
DTRACE_DSTATE_CLEAN, nstate);
/*
* To increment the correct bean
* counter, take another lap.
*/
goto retry;
}
case DTRACE_DSTATE_DIRTY:
dcpu->dtdsc_dirty_drops++;
break;
case DTRACE_DSTATE_RINSING:
dcpu->dtdsc_rinsing_drops++;
break;
case DTRACE_DSTATE_EMPTY:
dcpu->dtdsc_drops++;
break;
}
DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
return (NULL);
}
/*
* The clean list appears to be non-empty. We want to
* move the clean list to the free list; we start by
* moving the clean pointer aside.
*/
if (dtrace_casptr(&dcpu->dtdsc_clean,
clean, NULL) != clean) {
/*
* We are in one of two situations:
*
* (a) The clean list was switched to the
* free list by another CPU.
*
* (b) The clean list was added to by the
* cleansing cyclic.
*
* In either of these situations, we can
* just reattempt the free list allocation.
*/
goto retry;
}
ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
/*
* Now we'll move the clean list to the free list.
* It's impossible for this to fail: the only way
* the free list can be updated is through this
* code path, and only one CPU can own the clean list.
* Thus, it would only be possible for this to fail if
* this code were racing with dtrace_dynvar_clean().
* (That is, if dtrace_dynvar_clean() updated the clean
* list, and we ended up racing to update the free
* list.) This race is prevented by the dtrace_sync()
* in dtrace_dynvar_clean() -- which flushes the
* owners of the clean lists out before resetting
* the clean lists.
*/
rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
ASSERT(rval == NULL);
goto retry;
}
dvar = free;
new_free = dvar->dtdv_next;
} while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);
/*
* We have now allocated a new chunk. We copy the tuple keys into the
* tuple array and copy any referenced key data into the data space
* following the tuple array. As we do this, we relocate dttk_value
* in the final tuple to point to the key data address in the chunk.
*/
kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
dvar->dtdv_data = (void *)(kdata + ksize);
dvar->dtdv_tuple.dtt_nkeys = nkeys;
for (i = 0; i < nkeys; i++) {
dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
size_t kesize = key[i].dttk_size;
if (kesize != 0) {
dtrace_bcopy(
(const void *)(uintptr_t)key[i].dttk_value,
(void *)kdata, kesize);
dkey->dttk_value = kdata;
kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
} else {
dkey->dttk_value = key[i].dttk_value;
}
dkey->dttk_size = kesize;
}
ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
dvar->dtdv_hashval = hashval;
dvar->dtdv_next = start;
if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
return (dvar);
/*
* The cas has failed. Either another CPU is adding an element to
* this hash chain, or another CPU is deleting an element from this
* hash chain. The simplest way to deal with both of these cases
* (though not necessarily the most efficient) is to free our
* allocated block and tail-call ourselves. Note that the free is
* to the dirty list and _not_ to the free list. This is to prevent
* races with allocators, above.
*/
dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
dtrace_membar_producer();
do {
free = dcpu->dtdsc_dirty;
dvar->dtdv_next = free;
} while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);
return (dtrace_dynvar(dstate, nkeys, key, dsize, op, mstate, vstate));
}
/*ARGSUSED*/
static void
dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
{
if ((int64_t)nval < (int64_t)*oval)
*oval = nval;
}
/*ARGSUSED*/
static void
dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
{
if ((int64_t)nval > (int64_t)*oval)
*oval = nval;
}
static void
dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
{
int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
int64_t val = (int64_t)nval;
if (val < 0) {
for (i = 0; i < zero; i++) {
if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
quanta[i] += incr;
return;
}
}
} else {
for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
quanta[i - 1] += incr;
return;
}
}
quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
return;
}
ASSERT(0);
}
static void
dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
{
uint64_t arg = *lquanta++;
int32_t base = DTRACE_LQUANTIZE_BASE(arg);
uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
int32_t val = (int32_t)nval, level;
ASSERT(step != 0);
ASSERT(levels != 0);
if (val < base) {
/*
* This is an underflow.
*/
lquanta[0] += incr;
return;
}
level = (val - base) / step;
if (level < levels) {
lquanta[level + 1] += incr;
return;
}
/*
* This is an overflow.
*/
lquanta[levels + 1] += incr;
}
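/*
 * Editorial worked example (not part of the original source): for an
 * lquantize() with base 0, step 10, and 5 levels, the buffer (after the
 * encoded argument word) holds seven counters: lquanta[0] for values below
 * 0 (underflow), lquanta[1..5] for the ranges [0,10), [10,20), ... [40,50),
 * and lquanta[6] for values of 50 and above (overflow).  A value of 23
 * computes level == 2 and therefore increments lquanta[3].
 */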
/*ARGSUSED*/
static void
dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
{
data[0]++;
data[1] += nval;
}
/*ARGSUSED*/
static void
dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
{
int64_t snval = (int64_t)nval;
uint64_t tmp[2];
data[0]++;
data[1] += nval;
/*
* What we want to say here is:
*
* data[2] += nval * nval;
*
* But given that nval is 64-bit, we could easily overflow, so
* we do this as 128-bit arithmetic.
*/
if (snval < 0)
snval = -snval;
dtrace_multiply_128((uint64_t)snval, (uint64_t)snval, tmp);
dtrace_add_128(data + 2, tmp, data + 2);
}
/*ARGSUSED*/
static void
dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
{
*oval = *oval + 1;
}
/*ARGSUSED*/
static void
dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
{
*oval += nval;
}
/*
* Aggregate given the tuple in the principal data buffer, and the aggregating
* action denoted by the specified dtrace_aggregation_t. The aggregation
* buffer is specified as the buf parameter. This routine does not return
* failure; if there is no space in the aggregation buffer, the data will be
* dropped, and a corresponding counter incremented.
*/
static void
dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
{
dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
uint32_t i, ndx, size, fsize;
uint32_t align = sizeof (uint64_t) - 1;
dtrace_aggbuffer_t *agb;
dtrace_aggkey_t *key;
uint32_t hashval = 0, limit, isstr;
caddr_t tomax, data, kdata;
dtrace_actkind_t action;
dtrace_action_t *act;
uintptr_t offs;
if (buf == NULL)
return;
if (!agg->dtag_hasarg) {
/*
* Currently, only quantize() and lquantize() take additional
* arguments, and they have the same semantics: an increment
* value that defaults to 1 when not present. If additional
* aggregating actions take arguments, the setting of the
* default argument value will presumably have to become more
* sophisticated...
*/
arg = 1;
}
action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
size = rec->dtrd_offset - agg->dtag_base;
fsize = size + rec->dtrd_size;
ASSERT(dbuf->dtb_tomax != NULL);
data = dbuf->dtb_tomax + offset + agg->dtag_base;
if ((tomax = buf->dtb_tomax) == NULL) {
dtrace_buffer_drop(buf);
return;
}
/*
* The metastructure is always at the bottom of the buffer.
*/
agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
sizeof (dtrace_aggbuffer_t));
if (buf->dtb_offset == 0) {
/*
* We just kludge up approximately 1/8th of the size to be
* buckets. If this guess ends up being routinely
* off-the-mark, we may need to dynamically readjust this
* based on past performance.
*/
uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);
if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
(uintptr_t)tomax || hashsize == 0) {
/*
* We've been given a ludicrously small buffer;
* increment our drop count and leave.
*/
dtrace_buffer_drop(buf);
return;
}
/*
 * And now, a pathetic attempt to try to get an odd (or
* perchance, a prime) hash size for better hash distribution.
*/
if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
hashsize -= DTRACE_AGGHASHSIZE_SLEW;
agb->dtagb_hashsize = hashsize;
agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
agb->dtagb_free = (uintptr_t)agb->dtagb_hash;
for (i = 0; i < agb->dtagb_hashsize; i++)
agb->dtagb_hash[i] = NULL;
}
ASSERT(agg->dtag_first != NULL);
ASSERT(agg->dtag_first->dta_intuple);
/*
* Calculate the hash value based on the key. Note that we _don't_
* include the aggid in the hashing (but we will store it as part of
* the key). The hashing algorithm is Bob Jenkins' "One-at-a-time"
* algorithm: a simple, quick algorithm that has no known funnels, and
* gets good distribution in practice. The efficacy of the hashing
* algorithm (and a comparison with other algorithms) may be found by
* running the ::dtrace_aggstat MDB dcmd.
*/
for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
i = act->dta_rec.dtrd_offset - agg->dtag_base;
limit = i + act->dta_rec.dtrd_size;
ASSERT(limit <= size);
isstr = DTRACEACT_ISSTRING(act);
for (; i < limit; i++) {
hashval += data[i];
hashval += (hashval << 10);
hashval ^= (hashval >> 6);
if (isstr && data[i] == '\0')
break;
}
}
hashval += (hashval << 3);
hashval ^= (hashval >> 11);
hashval += (hashval << 15);
/*
* Yes, the divide here is expensive -- but it's generally the least
* of the performance issues given the amount of data that we iterate
* over to compute hash values, compare data, etc.
*/
ndx = hashval % agb->dtagb_hashsize;
for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
ASSERT((caddr_t)key >= tomax);
ASSERT((caddr_t)key < tomax + buf->dtb_size);
if (hashval != key->dtak_hashval || key->dtak_size != size)
continue;
kdata = key->dtak_data;
ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);
for (act = agg->dtag_first; act->dta_intuple;
act = act->dta_next) {
i = act->dta_rec.dtrd_offset - agg->dtag_base;
limit = i + act->dta_rec.dtrd_size;
ASSERT(limit <= size);
isstr = DTRACEACT_ISSTRING(act);
for (; i < limit; i++) {
if (kdata[i] != data[i])
goto next;
if (isstr && data[i] == '\0')
break;
}
}
if (action != key->dtak_action) {
/*
* We are aggregating on the same value in the same
* aggregation with two different aggregating actions.
* (This should have been picked up in the compiler,
* so we may be dealing with errant or devious DIF.)
* This is an error condition; we indicate as much,
* and return.
*/
DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
return;
}
/*
* This is a hit: we need to apply the aggregator to
* the value at this key.
*/
agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
return;
next:
continue;
}
/*
* We didn't find it. We need to allocate some zero-filled space,
* link it into the hash table appropriately, and apply the aggregator
* to the (zero-filled) value.
*/
offs = buf->dtb_offset;
while (offs & (align - 1))
offs += sizeof (uint32_t);
/*
* If we don't have enough room to both allocate a new key _and_
* its associated data, increment the drop count and return.
*/
if ((uintptr_t)tomax + offs + fsize >
agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
dtrace_buffer_drop(buf);
return;
}
/*CONSTCOND*/
ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
agb->dtagb_free -= sizeof (dtrace_aggkey_t);
key->dtak_data = kdata = tomax + offs;
buf->dtb_offset = offs + fsize;
/*
* Now copy the data across.
*/
*((dtrace_aggid_t *)kdata) = agg->dtag_id;
for (i = sizeof (dtrace_aggid_t); i < size; i++)
kdata[i] = data[i];
/*
* Because strings are not zeroed out by default, we need to iterate
* looking for actions that store strings, and we need to explicitly
* pad these strings out with zeroes.
*/
for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
int nul;
if (!DTRACEACT_ISSTRING(act))
continue;
i = act->dta_rec.dtrd_offset - agg->dtag_base;
limit = i + act->dta_rec.dtrd_size;
ASSERT(limit <= size);
for (nul = 0; i < limit; i++) {
if (nul) {
kdata[i] = '\0';
continue;
}
if (data[i] != '\0')
continue;
nul = 1;
}
}
for (i = size; i < fsize; i++)
kdata[i] = 0;
key->dtak_hashval = hashval;
key->dtak_size = size;
key->dtak_action = action;
key->dtak_next = agb->dtagb_hash[ndx];
agb->dtagb_hash[ndx] = key;
/*
* Finally, apply the aggregator.
*/
*((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
}
/*
* Given consumer state, this routine finds a speculation in the INACTIVE
* state and transitions it into the ACTIVE state. If there is no speculation
* in the INACTIVE state, 0 is returned. In this case, no error counter is
* incremented -- it is up to the caller to take appropriate action.
*/
static int
dtrace_speculation(dtrace_state_t *state)
{
int i = 0;
dtrace_speculation_state_t current;
uint32_t *stat = &state->dts_speculations_unavail, count;
while (i < state->dts_nspeculations) {
dtrace_speculation_t *spec = &state->dts_speculations[i];
current = spec->dtsp_state;
if (current != DTRACESPEC_INACTIVE) {
if (current == DTRACESPEC_COMMITTINGMANY ||
current == DTRACESPEC_COMMITTING ||
current == DTRACESPEC_DISCARDING)
stat = &state->dts_speculations_busy;
i++;
continue;
}
if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
current, DTRACESPEC_ACTIVE) == current)
return (i + 1);
}
/*
* We couldn't find a speculation. If we found as much as a single
* busy speculation buffer, we'll attribute this failure as "busy"
* instead of "unavail".
*/
do {
count = *stat;
} while (dtrace_cas32(stat, count, count + 1) != count);
return (0);
}
/*
* This routine commits an active speculation. If the specified speculation
* is not in a valid state to perform a commit(), this routine will silently do
* nothing. The state of the specified speculation is transitioned according
* to the state transition diagram outlined in <sys/dtrace_impl.h>
*/
static void
dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
dtrace_specid_t which)
{
dtrace_speculation_t *spec;
dtrace_buffer_t *src, *dest;
uintptr_t daddr, saddr, dlimit;
dtrace_speculation_state_t current, new = 0;
intptr_t offs;
if (which == 0)
return;
if (which > state->dts_nspeculations) {
cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
return;
}
spec = &state->dts_speculations[which - 1];
src = &spec->dtsp_buffer[cpu];
dest = &state->dts_buffer[cpu];
do {
current = spec->dtsp_state;
if (current == DTRACESPEC_COMMITTINGMANY)
break;
switch (current) {
case DTRACESPEC_INACTIVE:
case DTRACESPEC_DISCARDING:
return;
case DTRACESPEC_COMMITTING:
/*
* This is only possible if we are (a) commit()'ing
* without having done a prior speculate() on this CPU
* and (b) racing with another commit() on a different
* CPU. There's nothing to do -- we just assert that
* our offset is 0.
*/
ASSERT(src->dtb_offset == 0);
return;
case DTRACESPEC_ACTIVE:
new = DTRACESPEC_COMMITTING;
break;
case DTRACESPEC_ACTIVEONE:
/*
* This speculation is active on one CPU. If our
* buffer offset is non-zero, we know that the one CPU
* must be us. Otherwise, we are committing on a
* different CPU from the speculate(), and we must
* rely on being asynchronously cleaned.
*/
if (src->dtb_offset != 0) {
new = DTRACESPEC_COMMITTING;
break;
}
/*FALLTHROUGH*/
case DTRACESPEC_ACTIVEMANY:
new = DTRACESPEC_COMMITTINGMANY;
break;
default:
ASSERT(0);
}
} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
current, new) != current);
/*
* We have set the state to indicate that we are committing this
* speculation. Now reserve the necessary space in the destination
* buffer.
*/
if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
sizeof (uint64_t), state, NULL)) < 0) {
dtrace_buffer_drop(dest);
goto out;
}
/*
* We have the space; copy the buffer across. (Note that this is a
 * highly suboptimal bcopy(); in the unlikely event that this becomes
* a serious performance issue, a high-performance DTrace-specific
* bcopy() should obviously be invented.)
*/
daddr = (uintptr_t)dest->dtb_tomax + offs;
dlimit = daddr + src->dtb_offset;
saddr = (uintptr_t)src->dtb_tomax;
/*
* First, the aligned portion.
*/
while (dlimit - daddr >= sizeof (uint64_t)) {
*((uint64_t *)daddr) = *((uint64_t *)saddr);
daddr += sizeof (uint64_t);
saddr += sizeof (uint64_t);
}
/*
* Now any left-over bit...
*/
while (dlimit - daddr)
*((uint8_t *)daddr++) = *((uint8_t *)saddr++);
/*
* Finally, commit the reserved space in the destination buffer.
*/
dest->dtb_offset = offs + src->dtb_offset;
out:
/*
* If we're lucky enough to be the only active CPU on this speculation
* buffer, we can just set the state back to DTRACESPEC_INACTIVE.
*/
if (current == DTRACESPEC_ACTIVE ||
(current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
ASSERT(rval == DTRACESPEC_COMMITTING);
}
src->dtb_offset = 0;
src->dtb_xamot_drops += src->dtb_drops;
src->dtb_drops = 0;
}
/*
* This routine discards an active speculation. If the specified speculation
* is not in a valid state to perform a discard(), this routine will silently
* do nothing. The state of the specified speculation is transitioned
* according to the state transition diagram outlined in <sys/dtrace_impl.h>
*/
static void
dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
dtrace_specid_t which)
{
dtrace_speculation_t *spec;
dtrace_speculation_state_t current, new = 0;
dtrace_buffer_t *buf;
if (which == 0)
return;
if (which > state->dts_nspeculations) {
cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
return;
}
spec = &state->dts_speculations[which - 1];
buf = &spec->dtsp_buffer[cpu];
do {
current = spec->dtsp_state;
switch (current) {
case DTRACESPEC_INACTIVE:
case DTRACESPEC_COMMITTINGMANY:
case DTRACESPEC_COMMITTING:
case DTRACESPEC_DISCARDING:
return;
case DTRACESPEC_ACTIVE:
case DTRACESPEC_ACTIVEMANY:
new = DTRACESPEC_DISCARDING;
break;
case DTRACESPEC_ACTIVEONE:
if (buf->dtb_offset != 0) {
new = DTRACESPEC_INACTIVE;
} else {
new = DTRACESPEC_DISCARDING;
}
break;
default:
ASSERT(0);
}
} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
current, new) != current);
buf->dtb_offset = 0;
buf->dtb_drops = 0;
}
/*
* Note: not called from probe context. This function is called
* asynchronously from cross call context to clean any speculations that are
* in the COMMITTINGMANY or DISCARDING states. These speculations may not be
* transitioned back to the INACTIVE state until all CPUs have cleaned the
* speculation.
*/
static void
dtrace_speculation_clean_here(dtrace_state_t *state)
{
dtrace_icookie_t cookie;
processorid_t cpu = curcpu;
dtrace_buffer_t *dest = &state->dts_buffer[cpu];
dtrace_specid_t i;
cookie = dtrace_interrupt_disable();
if (dest->dtb_tomax == NULL) {
dtrace_interrupt_enable(cookie);
return;
}
for (i = 0; i < state->dts_nspeculations; i++) {
dtrace_speculation_t *spec = &state->dts_speculations[i];
dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
if (src->dtb_tomax == NULL)
continue;
if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
src->dtb_offset = 0;
continue;
}
if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
continue;
if (src->dtb_offset == 0)
continue;
dtrace_speculation_commit(state, cpu, i + 1);
}
dtrace_interrupt_enable(cookie);
}
/*
* Note: not called from probe context. This function is called
* asynchronously (and at a regular interval) to clean any speculations that
* are in the COMMITTINGMANY or DISCARDING states. If it discovers that there
* is work to be done, it cross calls all CPUs to perform that work;
 * COMMITTINGMANY and DISCARDING speculations may not be transitioned back to the
* INACTIVE state until they have been cleaned by all CPUs.
*/
static void
dtrace_speculation_clean(dtrace_state_t *state)
{
int work = 0, rv;
dtrace_specid_t i;
for (i = 0; i < state->dts_nspeculations; i++) {
dtrace_speculation_t *spec = &state->dts_speculations[i];
ASSERT(!spec->dtsp_cleaning);
if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
continue;
work++;
spec->dtsp_cleaning = 1;
}
if (!work)
return;
dtrace_xcall(DTRACE_CPUALL,
(dtrace_xcall_t)dtrace_speculation_clean_here, state);
/*
* We now know that all CPUs have committed or discarded their
* speculation buffers, as appropriate. We can now set the state
* to inactive.
*/
for (i = 0; i < state->dts_nspeculations; i++) {
dtrace_speculation_t *spec = &state->dts_speculations[i];
dtrace_speculation_state_t current, new;
if (!spec->dtsp_cleaning)
continue;
current = spec->dtsp_state;
ASSERT(current == DTRACESPEC_DISCARDING ||
current == DTRACESPEC_COMMITTINGMANY);
new = DTRACESPEC_INACTIVE;
rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new);
ASSERT(rv == current);
spec->dtsp_cleaning = 0;
}
}
/*
* Called as part of a speculate() to get the speculative buffer associated
* with a given speculation. Returns NULL if the specified speculation is not
* in an ACTIVE state. If the speculation is in the ACTIVEONE state -- and
* the active CPU is not the specified CPU -- the speculation will be
* atomically transitioned into the ACTIVEMANY state.
*/
static dtrace_buffer_t *
dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
dtrace_specid_t which)
{
dtrace_speculation_t *spec;
dtrace_speculation_state_t current, new = 0;
dtrace_buffer_t *buf;
if (which == 0)
return (NULL);
if (which > state->dts_nspeculations) {
cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
return (NULL);
}
spec = &state->dts_speculations[which - 1];
buf = &spec->dtsp_buffer[cpuid];
do {
current = spec->dtsp_state;
switch (current) {
case DTRACESPEC_INACTIVE:
case DTRACESPEC_COMMITTINGMANY:
case DTRACESPEC_DISCARDING:
return (NULL);
case DTRACESPEC_COMMITTING:
ASSERT(buf->dtb_offset == 0);
return (NULL);
case DTRACESPEC_ACTIVEONE:
/*
* This speculation is currently active on one CPU.
* Check the offset in the buffer; if it's non-zero,
* that CPU must be us (and we leave the state alone).
* If it's zero, assume that we're starting on a new
* CPU -- and change the state to indicate that the
* speculation is active on more than one CPU.
*/
if (buf->dtb_offset != 0)
return (buf);
new = DTRACESPEC_ACTIVEMANY;
break;
case DTRACESPEC_ACTIVEMANY:
return (buf);
case DTRACESPEC_ACTIVE:
new = DTRACESPEC_ACTIVEONE;
break;
default:
ASSERT(0);
}
} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
current, new) != current);
ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
return (buf);
}
/*
* Return a string. In the event that the user lacks the privilege to access
* arbitrary kernel memory, we copy the string out to scratch memory so that we
* don't fail access checking.
*
* dtrace_dif_variable() uses this routine as a helper for various
* builtin values such as 'execname' and 'probefunc.'
*/
uintptr_t
dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
dtrace_mstate_t *mstate)
{
uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
uintptr_t ret;
size_t strsz;
/*
* The easy case: this probe is allowed to read all of memory, so
* we can just return this as a vanilla pointer.
*/
if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
return (addr);
/*
* This is the tougher case: we copy the string in question from
* kernel memory into scratch memory and return it that way: this
* ensures that we won't trip up when access checking tests the
* BYREF return value.
*/
strsz = dtrace_strlen((char *)addr, size) + 1;
if (mstate->dtms_scratch_ptr + strsz >
mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
return (0);
}
dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
strsz);
ret = mstate->dtms_scratch_ptr;
mstate->dtms_scratch_ptr += strsz;
return (ret);
}
/*
* Return a string from a memory address which is known to have one or
* more concatenated, individually zero-terminated, sub-strings.
* In the event that the user lacks the privilege to access
* arbitrary kernel memory, we copy the string out to scratch memory so that we
* don't fail access checking.
*
* dtrace_dif_variable() uses this routine as a helper for various
* builtin values such as 'execargs'.
*/
static uintptr_t
dtrace_dif_varstrz(uintptr_t addr, size_t strsz, dtrace_state_t *state,
dtrace_mstate_t *mstate)
{
char *p;
size_t i;
uintptr_t ret;
if (mstate->dtms_scratch_ptr + strsz >
mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
return (0);
}
dtrace_bcopy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
strsz);
/* Replace sub-string termination characters with a space. */
for (p = (char *) mstate->dtms_scratch_ptr, i = 0; i < strsz - 1;
p++, i++)
if (*p == '\0')
*p = ' ';
ret = mstate->dtms_scratch_ptr;
mstate->dtms_scratch_ptr += strsz;
return (ret);
}
/*
* This function implements the DIF emulator's variable lookups. The emulator
* passes a reserved variable identifier and optional built-in array index.
*/
static uint64_t
dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
uint64_t ndx)
{
/*
* If we're accessing one of the uncached arguments, we'll turn this
* into a reference in the args array.
*/
if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
ndx = v - DIF_VAR_ARG0;
v = DIF_VAR_ARGS;
}
switch (v) {
case DIF_VAR_ARGS:
ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
if (ndx >= sizeof (mstate->dtms_arg) /
sizeof (mstate->dtms_arg[0])) {
int aframes = mstate->dtms_probe->dtpr_aframes + 2;
dtrace_provider_t *pv;
uint64_t val;
pv = mstate->dtms_probe->dtpr_provider;
if (pv->dtpv_pops.dtps_getargval != NULL)
val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
mstate->dtms_probe->dtpr_id,
mstate->dtms_probe->dtpr_arg, ndx, aframes);
else
val = dtrace_getarg(ndx, aframes);
/*
* This is regrettably required to keep the compiler
* from tail-optimizing the call to dtrace_getarg().
* The condition always evaluates to true, but the
* compiler has no way of figuring that out a priori.
* (None of this would be necessary if the compiler
* could be relied upon to _always_ tail-optimize
* the call to dtrace_getarg() -- but it can't.)
*/
if (mstate->dtms_probe != NULL)
return (val);
ASSERT(0);
}
return (mstate->dtms_arg[ndx]);
#if defined(sun)
case DIF_VAR_UREGS: {
klwp_t *lwp;
if (!dtrace_priv_proc(state))
return (0);
if ((lwp = curthread->t_lwp) == NULL) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
cpu_core[curcpu].cpuc_dtrace_illval = NULL;
return (0);
}
return (dtrace_getreg(lwp->lwp_regs, ndx));
}
#else
case DIF_VAR_UREGS: {
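/*
 * On FreeBSD, the user-level register state for the current thread
 * is found in its trap frame.
 */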
struct trapframe *tframe;
if (!dtrace_priv_proc(state))
return (0);
if ((tframe = curthread->td_frame) == NULL) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
cpu_core[curcpu].cpuc_dtrace_illval = 0;
return (0);
}
return (dtrace_getreg(tframe, ndx));
}
#endif
case DIF_VAR_CURTHREAD:
if (!dtrace_priv_kernel(state))
return (0);
return ((uint64_t)(uintptr_t)curthread);
case DIF_VAR_TIMESTAMP:
if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
mstate->dtms_timestamp = dtrace_gethrtime();
mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
}
return (mstate->dtms_timestamp);
case DIF_VAR_VTIMESTAMP:
ASSERT(dtrace_vtime_references != 0);
return (curthread->t_dtrace_vtime);
case DIF_VAR_WALLTIMESTAMP:
if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
mstate->dtms_walltimestamp = dtrace_gethrestime();
mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;
}
return (mstate->dtms_walltimestamp);
#if defined(sun)
case DIF_VAR_IPL:
if (!dtrace_priv_kernel(state))
return (0);
if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
mstate->dtms_ipl = dtrace_getipl();
mstate->dtms_present |= DTRACE_MSTATE_IPL;
}
return (mstate->dtms_ipl);
#endif
case DIF_VAR_EPID:
ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
return (mstate->dtms_epid);
case DIF_VAR_ID:
ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
return (mstate->dtms_probe->dtpr_id);
case DIF_VAR_STACKDEPTH:
if (!dtrace_priv_kernel(state))
return (0);
if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
int aframes = mstate->dtms_probe->dtpr_aframes + 2;
mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
}
return (mstate->dtms_stackdepth);
case DIF_VAR_USTACKDEPTH:
if (!dtrace_priv_proc(state))
return (0);
if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
/*
* See comment in DIF_VAR_PID.
*/
if (DTRACE_ANCHORED(mstate->dtms_probe) &&
CPU_ON_INTR(CPU)) {
mstate->dtms_ustackdepth = 0;
} else {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
mstate->dtms_ustackdepth =
dtrace_getustackdepth();
DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
}
mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;
}
return (mstate->dtms_ustackdepth);
case DIF_VAR_CALLER:
if (!dtrace_priv_kernel(state))
return (0);
if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
int aframes = mstate->dtms_probe->dtpr_aframes + 2;
if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
/*
* If this is an unanchored probe, we are
* required to go through the slow path:
* dtrace_caller() only guarantees correct
* results for anchored probes.
*/
pc_t caller[2] = {0, 0};
dtrace_getpcstack(caller, 2, aframes,
(uint32_t *)(uintptr_t)mstate->dtms_arg[0]);
mstate->dtms_caller = caller[1];
} else if ((mstate->dtms_caller =
dtrace_caller(aframes)) == -1) {
/*
* We have failed to do this the quick way;
* we must resort to the slower approach of
* calling dtrace_getpcstack().
*/
pc_t caller = 0;
dtrace_getpcstack(&caller, 1, aframes, NULL);
mstate->dtms_caller = caller;
}
mstate->dtms_present |= DTRACE_MSTATE_CALLER;
}
return (mstate->dtms_caller);
case DIF_VAR_UCALLER:
if (!dtrace_priv_proc(state))
return (0);
if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
uint64_t ustack[3];
/*
* dtrace_getupcstack() fills in the first uint64_t
* with the current PID. The second uint64_t will
* be the program counter at user-level. The third
* uint64_t will contain the caller, which is what
* we're after.
*/
ustack[2] = 0;
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
dtrace_getupcstack(ustack, 3);
DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
mstate->dtms_ucaller = ustack[2];
mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
}
return (mstate->dtms_ucaller);
case DIF_VAR_PROBEPROV:
ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
return (dtrace_dif_varstr(
(uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
state, mstate));
case DIF_VAR_PROBEMOD:
ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
return (dtrace_dif_varstr(
(uintptr_t)mstate->dtms_probe->dtpr_mod,
state, mstate));
case DIF_VAR_PROBEFUNC:
ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
return (dtrace_dif_varstr(
(uintptr_t)mstate->dtms_probe->dtpr_func,
state, mstate));
case DIF_VAR_PROBENAME:
ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
return (dtrace_dif_varstr(
(uintptr_t)mstate->dtms_probe->dtpr_name,
state, mstate));
case DIF_VAR_PID:
if (!dtrace_priv_proc(state))
return (0);
#if defined(sun)
/*
* Note that we are assuming that an unanchored probe is
* always due to a high-level interrupt. (And we're assuming
* that there is only a single high level interrupt.)
*/
if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
return (pid0.pid_id);
/*
* It is always safe to dereference one's own t_procp pointer:
* it always points to a valid, allocated proc structure.
* Further, it is always safe to dereference the p_pidp member
* of one's own proc structure. (These are truisms because
* threads and processes don't clean up their own state --
* they leave that task to whomever reaps them.)
*/
return ((uint64_t)curthread->t_procp->p_pidp->pid_id);
#else
return ((uint64_t)curproc->p_pid);
#endif
case DIF_VAR_PPID:
if (!dtrace_priv_proc(state))
return (0);
#if defined(sun)
/*
* See comment in DIF_VAR_PID.
*/
if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
return (pid0.pid_id);
/*
* It is always safe to dereference one's own t_procp pointer:
* it always points to a valid, allocated proc structure.
* (This is true because threads don't clean up their own
* state -- they leave that task to whomever reaps them.)
*/
return ((uint64_t)curthread->t_procp->p_ppid);
#else
return ((uint64_t)curproc->p_pptr->p_pid);
#endif
case DIF_VAR_TID:
#if defined(sun)
/*
* See comment in DIF_VAR_PID.
*/
if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
return (0);
#endif
return ((uint64_t)curthread->t_tid);
case DIF_VAR_EXECARGS: {
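/*
 * p_args holds the argument strings as a single block of
 * NUL-separated sub-strings; dtrace_dif_varstrz() copies the block
 * into scratch and converts the separators to spaces.
 */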
struct pargs *p_args = curthread->td_proc->p_args;
if (p_args == NULL)
return (0);
return (dtrace_dif_varstrz(
(uintptr_t) p_args->ar_args, p_args->ar_length, state, mstate));
}
case DIF_VAR_EXECNAME:
#if defined(sun)
if (!dtrace_priv_proc(state))
return (0);
/*
* See comment in DIF_VAR_PID.
*/
if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
return ((uint64_t)(uintptr_t)p0.p_user.u_comm);
/*
* It is always safe to dereference one's own t_procp pointer:
* it always points to a valid, allocated proc structure.
* (This is true because threads don't clean up their own
* state -- they leave that task to whomever reaps them.)
*/
return (dtrace_dif_varstr(
(uintptr_t)curthread->t_procp->p_user.u_comm,
state, mstate));
#else
return (dtrace_dif_varstr(
(uintptr_t) curthread->td_proc->p_comm, state, mstate));
#endif
case DIF_VAR_ZONENAME:
#if defined(sun)
if (!dtrace_priv_proc(state))
return (0);
/*
* See comment in DIF_VAR_PID.
*/
if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
return ((uint64_t)(uintptr_t)p0.p_zone->zone_name);
/*
* It is always safe to dereference one's own t_procp pointer:
* it always points to a valid, allocated proc structure.
* (This is true because threads don't clean up their own
* state -- they leave that task to whomever reaps them.)
*/
return (dtrace_dif_varstr(
(uintptr_t)curthread->t_procp->p_zone->zone_name,
state, mstate));
#else
return (0);
#endif
case DIF_VAR_UID:
if (!dtrace_priv_proc(state))
return (0);
#if defined(sun)
/*
* See comment in DIF_VAR_PID.
*/
if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
return ((uint64_t)p0.p_cred->cr_uid);
#endif
/*
* It is always safe to dereference one's own t_procp pointer:
* it always points to a valid, allocated proc structure.
* (This is true because threads don't clean up their own
* state -- they leave that task to whomever reaps them.)
*
* Additionally, it is safe to dereference one's own process
* credential, since this is never NULL after process birth.
*/
return ((uint64_t)curthread->t_procp->p_cred->cr_uid);
case DIF_VAR_GID:
if (!dtrace_priv_proc(state))
return (0);
#if defined(sun)
/*
* See comment in DIF_VAR_PID.
*/
if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
return ((uint64_t)p0.p_cred->cr_gid);
#endif
/*
* It is always safe to dereference one's own t_procp pointer:
* it always points to a valid, allocated proc structure.
* (This is true because threads don't clean up their own
* state -- they leave that task to whomever reaps them.)
*
* Additionally, it is safe to dereference one's own process
* credential, since this is never NULL after process birth.
*/
return ((uint64_t)curthread->t_procp->p_cred->cr_gid);
case DIF_VAR_ERRNO: {
#if defined(sun)
klwp_t *lwp;
if (!dtrace_priv_proc(state))
return (0);
/*
* See comment in DIF_VAR_PID.
*/
if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
return (0);
/*
* It is always safe to dereference one's own t_lwp pointer in
* the event that this pointer is non-NULL. (This is true
* because threads and lwps don't clean up their own state --
* they leave that task to whomever reaps them.)
*/
if ((lwp = curthread->t_lwp) == NULL)
return (0);
return ((uint64_t)lwp->lwp_errno);
#else
return (curthread->td_errno);
#endif
}
default:
DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
return (0);
}
}
/*
* Emulate the execution of DIF subroutines invoked by the call opcode.
* Notice that we don't bother validating the proper number of arguments or
* their types in the tuple stack. This isn't needed because all argument
* interpretation is safe because of our load safety -- the worst that can
* happen is that a bogus program can obtain bogus results.
*/
static void
dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
dtrace_key_t *tupregs, int nargs,
dtrace_mstate_t *mstate, dtrace_state_t *state)
{
volatile uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
dtrace_vstate_t *vstate = &state->dts_vstate;
#if defined(sun)
union {
mutex_impl_t mi;
uint64_t mx;
} m;
union {
krwlock_t ri;
uintptr_t rw;
} r;
#else
struct thread *lowner;
union {
struct lock_object *li;
uintptr_t lx;
} l;
#endif
switch (subr) {
case DIF_SUBR_RAND:
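/*
 * Derive a cheap pseudo-random value from the high-resolution
 * timestamp using a linear-congruential style step; this is not
 * intended to be cryptographically strong.
 */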
regs[rd] = (dtrace_gethrtime() * 2416 + 374441) % 1771875;
break;
#if defined(sun)
case DIF_SUBR_MUTEX_OWNED:
if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
mstate, vstate)) {
regs[rd] = 0;
break;
}
m.mx = dtrace_load64(tupregs[0].dttk_value);
if (MUTEX_TYPE_ADAPTIVE(&m.mi))
regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
else
regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
break;
case DIF_SUBR_MUTEX_OWNER:
if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
mstate, vstate)) {
regs[rd] = 0;
break;
}
m.mx = dtrace_load64(tupregs[0].dttk_value);
if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&
MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)
regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);
else
regs[rd] = 0;
break;
case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
mstate, vstate)) {
regs[rd] = 0;
break;
}
m.mx = dtrace_load64(tupregs[0].dttk_value);
regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);
break;
case DIF_SUBR_MUTEX_TYPE_SPIN:
if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
mstate, vstate)) {
regs[rd] = 0;
break;
}
m.mx = dtrace_load64(tupregs[0].dttk_value);
regs[rd] = MUTEX_TYPE_SPIN(&m.mi);
break;
case DIF_SUBR_RW_READ_HELD: {
uintptr_t tmp;
if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
mstate, vstate)) {
regs[rd] = 0;
break;
}
r.rw = dtrace_loadptr(tupregs[0].dttk_value);
regs[rd] = _RW_READ_HELD(&r.ri, tmp);
break;
}
case DIF_SUBR_RW_WRITE_HELD:
if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
mstate, vstate)) {
regs[rd] = 0;
break;
}
r.rw = dtrace_loadptr(tupregs[0].dttk_value);
regs[rd] = _RW_WRITE_HELD(&r.ri);
break;
case DIF_SUBR_RW_ISWRITER:
if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
mstate, vstate)) {
regs[rd] = 0;
break;
}
r.rw = dtrace_loadptr(tupregs[0].dttk_value);
regs[rd] = _RW_ISWRITER(&r.ri);
break;
#else
case DIF_SUBR_MUTEX_OWNED:
if (!dtrace_canload(tupregs[0].dttk_value,
sizeof (struct lock_object), mstate, vstate)) {
regs[rd] = 0;
break;
}
l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
break;
case DIF_SUBR_MUTEX_OWNER:
if (!dtrace_canload(tupregs[0].dttk_value,
sizeof (struct lock_object), mstate, vstate)) {
regs[rd] = 0;
break;
}
l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
regs[rd] = (uintptr_t)lowner;
break;
case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
if (!dtrace_canload(tupregs[0].dttk_value, sizeof (struct mtx),
mstate, vstate)) {
regs[rd] = 0;
break;
}
l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
/* XXX - should be only LC_SLEEPABLE? */
regs[rd] = (LOCK_CLASS(l.li)->lc_flags &
(LC_SLEEPLOCK | LC_SLEEPABLE)) != 0;
break;
case DIF_SUBR_MUTEX_TYPE_SPIN:
if (!dtrace_canload(tupregs[0].dttk_value, sizeof (struct mtx),
mstate, vstate)) {
regs[rd] = 0;
break;
}
l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
regs[rd] = (LOCK_CLASS(l.li)->lc_flags & LC_SPINLOCK) != 0;
break;
case DIF_SUBR_RW_READ_HELD:
case DIF_SUBR_SX_SHARED_HELD:
if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
mstate, vstate)) {
regs[rd] = 0;
break;
}
l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner) &&
lowner == NULL;
break;
case DIF_SUBR_RW_WRITE_HELD:
case DIF_SUBR_SX_EXCLUSIVE_HELD:
if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
mstate, vstate)) {
regs[rd] = 0;
break;
}
l.lx = dtrace_loadptr(tupregs[0].dttk_value);
LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
regs[rd] = (lowner == curthread);
break;
case DIF_SUBR_RW_ISWRITER:
case DIF_SUBR_SX_ISEXCLUSIVE:
if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
mstate, vstate)) {
regs[rd] = 0;
break;
}
l.lx = dtrace_loadptr(tupregs[0].dttk_value);
regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner) &&
lowner != NULL;
break;
#endif /* ! defined(sun) */
case DIF_SUBR_BCOPY: {
/*
* We need to be sure that the destination is in the scratch
* region -- no other region is allowed.
*/
uintptr_t src = tupregs[0].dttk_value;
uintptr_t dest = tupregs[1].dttk_value;
size_t size = tupregs[2].dttk_value;
if (!dtrace_inscratch(dest, size, mstate)) {
*flags |= CPU_DTRACE_BADADDR;
*illval = regs[rd];
break;
}
if (!dtrace_canload(src, size, mstate, vstate)) {
regs[rd] = 0;
break;
}
dtrace_bcopy((void *)src, (void *)dest, size);
break;
}
case DIF_SUBR_ALLOCA:
case DIF_SUBR_COPYIN: {
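/*
 * Round the scratch pointer up to an 8-byte boundary so that the
 * returned allocation is suitably aligned; scratch_size accounts
 * for both the alignment padding and the requested size.
 */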
uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
uint64_t size =
tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value;
size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;
/*
* This action doesn't require any credential checks since
* probes will not activate in user contexts to which the
* enabling user does not have permissions.
*/
/*
* Rounding up the user allocation size could have overflowed
* a large, bogus allocation (like -1ULL) to 0.
*/
if (scratch_size < size ||
!DTRACE_INSCRATCH(mstate, scratch_size)) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
regs[rd] = 0;
break;
}
if (subr == DIF_SUBR_COPYIN) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
}
mstate->dtms_scratch_ptr += scratch_size;
regs[rd] = dest;
break;
}
case DIF_SUBR_COPYINTO: {
uint64_t size = tupregs[1].dttk_value;
uintptr_t dest = tupregs[2].dttk_value;
/*
* This action doesn't require any credential checks since
* probes will not activate in user contexts to which the
* enabling user does not have permissions.
*/
if (!dtrace_inscratch(dest, size, mstate)) {
*flags |= CPU_DTRACE_BADADDR;
*illval = regs[rd];
break;
}
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
break;
}
case DIF_SUBR_COPYINSTR: {
uintptr_t dest = mstate->dtms_scratch_ptr;
uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
if (nargs > 1 && tupregs[1].dttk_value < size)
size = tupregs[1].dttk_value + 1;
/*
* This action doesn't require any credential checks since
* probes will not activate in user contexts to which the
* enabling user does not have permissions.
*/
if (!DTRACE_INSCRATCH(mstate, size)) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
regs[rd] = 0;
break;
}
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
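/* Guarantee NUL termination even if the string filled the buffer. */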
((char *)dest)[size - 1] = '\0';
mstate->dtms_scratch_ptr += size;
regs[rd] = dest;
break;
}
#if defined(sun)
case DIF_SUBR_MSGSIZE:
case DIF_SUBR_MSGDSIZE: {
uintptr_t baddr = tupregs[0].dttk_value, daddr;
uintptr_t wptr, rptr;
size_t count = 0;
int cont = 0;
while (baddr != 0 && !(*flags & CPU_DTRACE_FAULT)) {
if (!dtrace_canload(baddr, sizeof (mblk_t), mstate,
vstate)) {
regs[rd] = 0;
break;
}
wptr = dtrace_loadptr(baddr +
offsetof(mblk_t, b_wptr));
rptr = dtrace_loadptr(baddr +
offsetof(mblk_t, b_rptr));
if (wptr < rptr) {
*flags |= CPU_DTRACE_BADADDR;
*illval = tupregs[0].dttk_value;
break;
}
daddr = dtrace_loadptr(baddr +
offsetof(mblk_t, b_datap));
baddr = dtrace_loadptr(baddr +
offsetof(mblk_t, b_cont));
/*
* We want to prevent against denial-of-service here,
* so we're only going to search the list for
* dtrace_msgdsize_max mblks.
*/
if (cont++ > dtrace_msgdsize_max) {
*flags |= CPU_DTRACE_ILLOP;
break;
}
if (subr == DIF_SUBR_MSGDSIZE) {
if (dtrace_load8(daddr +
offsetof(dblk_t, db_type)) != M_DATA)
continue;
}
count += wptr - rptr;
}
if (!(*flags & CPU_DTRACE_FAULT))
regs[rd] = count;
break;
}
#endif
case DIF_SUBR_PROGENYOF: {
pid_t pid = tupregs[0].dttk_value;
proc_t *p;
int rval = 0;
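/*
 * Walk up the chain of parent processes from the current process,
 * looking for the specified pid among our ancestors.
 */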
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
for (p = curthread->t_procp; p != NULL; p = p->p_parent) {
#if defined(sun)
if (p->p_pidp->pid_id == pid) {
#else
if (p->p_pid == pid) {
#endif
rval = 1;
break;
}
}
DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
regs[rd] = rval;
break;
}
case DIF_SUBR_SPECULATION:
regs[rd] = dtrace_speculation(state);
break;
case DIF_SUBR_COPYOUT: {
uintptr_t kaddr = tupregs[0].dttk_value;
uintptr_t uaddr = tupregs[1].dttk_value;
uint64_t size = tupregs[2].dttk_value;
if (!dtrace_destructive_disallow &&
dtrace_priv_proc_control(state) &&
!dtrace_istoxic(kaddr, size)) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
dtrace_copyout(kaddr, uaddr, size, flags);
DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
}
break;
}
case DIF_SUBR_COPYOUTSTR: {
uintptr_t kaddr = tupregs[0].dttk_value;
uintptr_t uaddr = tupregs[1].dttk_value;
uint64_t size = tupregs[2].dttk_value;
if (!dtrace_destructive_disallow &&
dtrace_priv_proc_control(state) &&
!dtrace_istoxic(kaddr, size)) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
dtrace_copyoutstr(kaddr, uaddr, size, flags);
DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
}
break;
}
case DIF_SUBR_STRLEN: {
size_t sz;
uintptr_t addr = (uintptr_t)tupregs[0].dttk_value;
sz = dtrace_strlen((char *)addr,
state->dts_options[DTRACEOPT_STRSIZE]);
if (!dtrace_canload(addr, sz + 1, mstate, vstate)) {
regs[rd] = 0;
break;
}
regs[rd] = sz;
break;
}
case DIF_SUBR_STRCHR:
case DIF_SUBR_STRRCHR: {
/*
* We're going to iterate over the string looking for the
* specified character. We will iterate until we have reached
* the string length or we have found the character. If this
* is DIF_SUBR_STRRCHR, we will look for the last occurrence
* of the specified character instead of the first.
*/
uintptr_t saddr = tupregs[0].dttk_value;
uintptr_t addr = tupregs[0].dttk_value;
uintptr_t limit = addr + state->dts_options[DTRACEOPT_STRSIZE];
char c, target = (char)tupregs[1].dttk_value;
for (regs[rd] = 0; addr < limit; addr++) {
if ((c = dtrace_load8(addr)) == target) {
regs[rd] = addr;
if (subr == DIF_SUBR_STRCHR)
break;
}
if (c == '\0')
break;
}
if (!dtrace_canload(saddr, addr - saddr, mstate, vstate)) {
regs[rd] = 0;
break;
}
break;
}
case DIF_SUBR_STRSTR:
case DIF_SUBR_INDEX:
case DIF_SUBR_RINDEX: {
/*
* We're going to iterate over the string looking for the
* specified string. We will iterate until we have reached
* the string length or we have found the string. (Yes, this
* is done in the most naive way possible -- but considering
* that the string we're searching for is likely to be
* relatively short, the complexity of Rabin-Karp or similar
* hardly seems merited.)
*/
char *addr = (char *)(uintptr_t)tupregs[0].dttk_value;
char *substr = (char *)(uintptr_t)tupregs[1].dttk_value;
uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
size_t len = dtrace_strlen(addr, size);
size_t sublen = dtrace_strlen(substr, size);
char *limit = addr + len, *orig = addr;
int notfound = subr == DIF_SUBR_STRSTR ? 0 : -1;
int inc = 1;
regs[rd] = notfound;
if (!dtrace_canload((uintptr_t)addr, len + 1, mstate, vstate)) {
regs[rd] = 0;
break;
}
if (!dtrace_canload((uintptr_t)substr, sublen + 1, mstate,
vstate)) {
regs[rd] = 0;
break;
}
/*
* strstr() and index()/rindex() have similar semantics if
* both strings are the empty string: strstr() returns a
* pointer to the (empty) string, and index() and rindex()
* both return index 0 (regardless of any position argument).
*/
if (sublen == 0 && len == 0) {
if (subr == DIF_SUBR_STRSTR)
regs[rd] = (uintptr_t)addr;
else
regs[rd] = 0;
break;
}
if (subr != DIF_SUBR_STRSTR) {
if (subr == DIF_SUBR_RINDEX) {
limit = orig - 1;
addr += len;
inc = -1;
}
/*
* Both index() and rindex() take an optional position
* argument that denotes the starting position.
*/
if (nargs == 3) {
int64_t pos = (int64_t)tupregs[2].dttk_value;
/*
* If the position argument to index() is
* negative, Perl implicitly clamps it at
* zero. This semantic is a little surprising
* given the special meaning of negative
* positions to similar Perl functions like
* substr(), but it appears to reflect a
* notion that index() can start from a
* negative index and increment its way up to
* the string. Given this notion, Perl's
* rindex() is at least self-consistent in
* that it implicitly clamps positions greater
* than the string length to be the string
* length. Where Perl completely loses
* coherence, however, is when the specified
* substring is the empty string (""). In
* this case, even if the position is
* negative, rindex() returns 0 -- and even if
* the position is greater than the length,
* index() returns the string length. These
* semantics violate the notion that index()
* should never return a value less than the
* specified position and that rindex() should
* never return a value greater than the
* specified position. (One assumes that
* these semantics are artifacts of Perl's
* implementation and not the results of
* deliberate design -- it beggars belief that
* even Larry Wall could desire such oddness.)
* While in the abstract one would wish for
* consistent position semantics across
* substr(), index() and rindex() -- or at the
* very least self-consistent position
* semantics for index() and rindex() -- we
* instead opt to keep with the extant Perl
* semantics, in all their broken glory. (Do
* we have more desire to maintain Perl's
* semantics than Perl does? Probably.)
*/
if (subr == DIF_SUBR_RINDEX) {
if (pos < 0) {
if (sublen == 0)
regs[rd] = 0;
break;
}
if (pos > len)
pos = len;
} else {
if (pos < 0)
pos = 0;
if (pos >= len) {
if (sublen == 0)
regs[rd] = len;
break;
}
}
addr = orig + pos;
}
}
for (regs[rd] = notfound; addr != limit; addr += inc) {
if (dtrace_strncmp(addr, substr, sublen) == 0) {
if (subr != DIF_SUBR_STRSTR) {
/*
* As D index() and rindex() are
* modeled on Perl (and not on awk),
* we return a zero-based (and not a
* one-based) index. (For you Perl
* weenies: no, we're not going to add
* $[ -- and shouldn't you be at a con
* or something?)
*/
regs[rd] = (uintptr_t)(addr - orig);
break;
}
ASSERT(subr == DIF_SUBR_STRSTR);
regs[rd] = (uintptr_t)addr;
break;
}
}
break;
}
case DIF_SUBR_STRTOK: {
uintptr_t addr = tupregs[0].dttk_value;
uintptr_t tokaddr = tupregs[1].dttk_value;
uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
uintptr_t limit, toklimit = tokaddr + size;
uint8_t c = 0, tokmap[32]; /* 256 / 8 */
char *dest = (char *)mstate->dtms_scratch_ptr;
int i;
/*
* Check both the token buffer and (later) the input buffer,
* since both could be non-scratch addresses.
*/
if (!dtrace_strcanload(tokaddr, size, mstate, vstate)) {
regs[rd] = 0;
break;
}
if (!DTRACE_INSCRATCH(mstate, size)) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
regs[rd] = 0;
break;
}
if (addr == 0) {
/*
* If the address specified is NULL, we use our saved
* strtok pointer from the mstate. Note that this
* means that the saved strtok pointer is _only_
* valid within multiple enablings of the same probe --
* it behaves like an implicit clause-local variable.
*/
addr = mstate->dtms_strtok;
} else {
/*
* If the user-specified address is non-NULL we must
* access check it. This is the only time we have
* a chance to do so, since this address may reside
* in the string table of this clause -- future calls
* (when we fetch addr from mstate->dtms_strtok)
* would fail this access check.
*/
if (!dtrace_strcanload(addr, size, mstate, vstate)) {
regs[rd] = 0;
break;
}
}
/*
* First, zero the token map, and then process the token
* string -- setting a bit in the map for every character
* found in the token string.
*/
for (i = 0; i < sizeof (tokmap); i++)
tokmap[i] = 0;
for (; tokaddr < toklimit; tokaddr++) {
if ((c = dtrace_load8(tokaddr)) == '\0')
break;
ASSERT((c >> 3) < sizeof (tokmap));
tokmap[c >> 3] |= (1 << (c & 0x7));
}
for (limit = addr + size; addr < limit; addr++) {
/*
* We're looking for a character that is _not_ contained
* in the token string.
*/
if ((c = dtrace_load8(addr)) == '\0')
break;
if (!(tokmap[c >> 3] & (1 << (c & 0x7))))
break;
}
if (c == '\0') {
/*
* We reached the end of the string without finding
* any character that was not in the token string.
* We return NULL in this case, and we set the saved
* address to NULL as well.
*/
regs[rd] = 0;
mstate->dtms_strtok = 0;
break;
}
/*
* From here on, we're copying into the destination string.
*/
for (i = 0; addr < limit && i < size - 1; addr++) {
if ((c = dtrace_load8(addr)) == '\0')
break;
if (tokmap[c >> 3] & (1 << (c & 0x7)))
break;
ASSERT(i < size);
dest[i++] = c;
}
ASSERT(i < size);
dest[i] = '\0';
regs[rd] = (uintptr_t)dest;
mstate->dtms_scratch_ptr += size;
mstate->dtms_strtok = addr;
break;
}
case DIF_SUBR_SUBSTR: {
uintptr_t s = tupregs[0].dttk_value;
uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
char *d = (char *)mstate->dtms_scratch_ptr;
int64_t index = (int64_t)tupregs[1].dttk_value;
int64_t remaining = (int64_t)tupregs[2].dttk_value;
size_t len = dtrace_strlen((char *)s, size);
int64_t i = 0;
if (!dtrace_canload(s, len + 1, mstate, vstate)) {
regs[rd] = 0;
break;
}
if (!DTRACE_INSCRATCH(mstate, size)) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
regs[rd] = 0;
break;
}
if (nargs <= 2)
remaining = (int64_t)size;
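/*
 * A negative index counts back from the end of the string, as with
 * Perl's substr().
 */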
if (index < 0) {
index += len;
if (index < 0 && index + remaining > 0) {
remaining += index;
index = 0;
}
}
if (index >= len || index < 0) {
remaining = 0;
} else if (remaining < 0) {
remaining += len - index;
} else if (index + remaining > size) {
remaining = size - index;
}
for (i = 0; i < remaining; i++) {
if ((d[i] = dtrace_load8(s + index + i)) == '\0')
break;
}
d[i] = '\0';
mstate->dtms_scratch_ptr += size;
regs[rd] = (uintptr_t)d;
break;
}
#if defined(sun)
case DIF_SUBR_GETMAJOR:
#ifdef _LP64
regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR64) & MAXMAJ64;
#else
regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR) & MAXMAJ;
#endif
break;
case DIF_SUBR_GETMINOR:
#ifdef _LP64
regs[rd] = tupregs[0].dttk_value & MAXMIN64;
#else
regs[rd] = tupregs[0].dttk_value & MAXMIN;
#endif
break;
case DIF_SUBR_DDI_PATHNAME: {
/*
* This one is a galactic mess. We are going to roughly
* emulate ddi_pathname(), but it's made more complicated
* by the fact that we (a) want to include the minor name and
* (b) must proceed iteratively instead of recursively.
*/
uintptr_t dest = mstate->dtms_scratch_ptr;
uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
char *start = (char *)dest, *end = start + size - 1;
uintptr_t daddr = tupregs[0].dttk_value;
int64_t minor = (int64_t)tupregs[1].dttk_value;
char *s;
int i, len, depth = 0;
/*
* Due to all the pointer jumping we do and context we must
* rely upon, we just mandate that the user must have kernel
* read privileges to use this routine.
*/
if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) == 0) {
*flags |= CPU_DTRACE_KPRIV;
*illval = daddr;
regs[rd] = 0;
}
if (!DTRACE_INSCRATCH(mstate, size)) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
regs[rd] = 0;
break;
}
*end = '\0';
/*
* We want to have a name for the minor. In order to do this,
* we need to walk the minor list from the devinfo. We want
* to be sure that we don't infinitely walk a circular list,
* so we check for circularity by sending a scout pointer
* ahead two elements for every element that we iterate over;
* if the list is circular, these will ultimately point to the
* same element. You may recognize this little trick as the
* answer to a stupid interview question -- one that always
* seems to be asked by those who had to have it laboriously
* explained to them, and who can't even concisely describe
* the conditions under which one would be forced to resort to
* this technique. Needless to say, those conditions are
* found here -- and probably only here. Is this the only use
* of this infamous trick in shipping, production code? If it
* isn't, it probably should be...
*/
if (minor != -1) {
uintptr_t maddr = dtrace_loadptr(daddr +
offsetof(struct dev_info, devi_minor));
uintptr_t next = offsetof(struct ddi_minor_data, next);
uintptr_t name = offsetof(struct ddi_minor_data,
d_minor) + offsetof(struct ddi_minor, name);
uintptr_t dev = offsetof(struct ddi_minor_data,
d_minor) + offsetof(struct ddi_minor, dev);
uintptr_t scout;
if (maddr != NULL)
scout = dtrace_loadptr(maddr + next);
while (maddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
uint64_t m;
#ifdef _LP64
m = dtrace_load64(maddr + dev) & MAXMIN64;
#else
m = dtrace_load32(maddr + dev) & MAXMIN;
#endif
if (m != minor) {
maddr = dtrace_loadptr(maddr + next);
if (scout == NULL)
continue;
scout = dtrace_loadptr(scout + next);
if (scout == NULL)
continue;
scout = dtrace_loadptr(scout + next);
if (scout == NULL)
continue;
if (scout == maddr) {
*flags |= CPU_DTRACE_ILLOP;
break;
}
continue;
}
/*
* We have the minor data. Now we need to
* copy the minor's name into the end of the
* pathname.
*/
s = (char *)dtrace_loadptr(maddr + name);
len = dtrace_strlen(s, size);
if (*flags & CPU_DTRACE_FAULT)
break;
if (len != 0) {
if ((end -= (len + 1)) < start)
break;
*end = ':';
}
for (i = 1; i <= len; i++)
end[i] = dtrace_load8((uintptr_t)s++);
break;
}
}
while (daddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
ddi_node_state_t devi_state;
devi_state = dtrace_load32(daddr +
offsetof(struct dev_info, devi_node_state));
if (*flags & CPU_DTRACE_FAULT)
break;
if (devi_state >= DS_INITIALIZED) {
s = (char *)dtrace_loadptr(daddr +
offsetof(struct dev_info, devi_addr));
len = dtrace_strlen(s, size);
if (*flags & CPU_DTRACE_FAULT)
break;
if (len != 0) {
if ((end -= (len + 1)) < start)
break;
*end = '@';
}
for (i = 1; i <= len; i++)
end[i] = dtrace_load8((uintptr_t)s++);
}
/*
* Now for the node name...
*/
s = (char *)dtrace_loadptr(daddr +
offsetof(struct dev_info, devi_node_name));
daddr = dtrace_loadptr(daddr +
offsetof(struct dev_info, devi_parent));
/*
* If our parent is NULL (that is, if we're the root
* node), we're going to use the special path
* "devices".
*/
if (daddr == 0)
s = "devices";
len = dtrace_strlen(s, size);
if (*flags & CPU_DTRACE_FAULT)
break;
if ((end -= (len + 1)) < start)
break;
for (i = 1; i <= len; i++)
end[i] = dtrace_load8((uintptr_t)s++);
*end = '/';
if (depth++ > dtrace_devdepth_max) {
*flags |= CPU_DTRACE_ILLOP;
break;
}
}
if (end < start)
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
if (daddr == 0) {
regs[rd] = (uintptr_t)end;
mstate->dtms_scratch_ptr += size;
}
break;
}
#endif
case DIF_SUBR_STRJOIN: {
char *d = (char *)mstate->dtms_scratch_ptr;
uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
uintptr_t s1 = tupregs[0].dttk_value;
uintptr_t s2 = tupregs[1].dttk_value;
int i = 0;
if (!dtrace_strcanload(s1, size, mstate, vstate) ||
!dtrace_strcanload(s2, size, mstate, vstate)) {
regs[rd] = 0;
break;
}
if (!DTRACE_INSCRATCH(mstate, size)) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
regs[rd] = 0;
break;
}
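/*
 * Copy the first string up to (but not including) its terminating
 * NUL, then append the second string (including its NUL). If the
 * concatenation would exceed the string size, flag NOSCRATCH and
 * fail.
 */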
for (;;) {
if (i >= size) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
regs[rd] = 0;
break;
}
if ((d[i++] = dtrace_load8(s1++)) == '\0') {
i--;
break;
}
}
for (;;) {
if (i >= size) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
regs[rd] = 0;
break;
}
if ((d[i++] = dtrace_load8(s2++)) == '\0')
break;
}
if (i < size) {
mstate->dtms_scratch_ptr += i;
regs[rd] = (uintptr_t)d;
}
break;
}
case DIF_SUBR_LLTOSTR: {
int64_t i = (int64_t)tupregs[0].dttk_value;
int64_t val = i < 0 ? i * -1 : i;
uint64_t size = 22; /* enough room for 2^64 in decimal */
char *end = (char *)mstate->dtms_scratch_ptr + size - 1;
if (!DTRACE_INSCRATCH(mstate, size)) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
regs[rd] = 0;
break;
}
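/*
 * Generate the decimal digits least-significant first, working
 * backwards from the end of the scratch buffer; the result pointer
 * is the first character actually written.
 */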
for (*end-- = '\0'; val; val /= 10)
*end-- = '0' + (val % 10);
if (i == 0)
*end-- = '0';
if (i < 0)
*end-- = '-';
regs[rd] = (uintptr_t)end + 1;
mstate->dtms_scratch_ptr += size;
break;
}
case DIF_SUBR_HTONS:
case DIF_SUBR_NTOHS:
#if BYTE_ORDER == BIG_ENDIAN
regs[rd] = (uint16_t)tupregs[0].dttk_value;
#else
regs[rd] = DT_BSWAP_16((uint16_t)tupregs[0].dttk_value);
#endif
break;
case DIF_SUBR_HTONL:
case DIF_SUBR_NTOHL:
#if BYTE_ORDER == BIG_ENDIAN
regs[rd] = (uint32_t)tupregs[0].dttk_value;
#else
regs[rd] = DT_BSWAP_32((uint32_t)tupregs[0].dttk_value);
#endif
break;
case DIF_SUBR_HTONLL:
case DIF_SUBR_NTOHLL:
#if BYTE_ORDER == BIG_ENDIAN
regs[rd] = (uint64_t)tupregs[0].dttk_value;
#else
regs[rd] = DT_BSWAP_64((uint64_t)tupregs[0].dttk_value);
#endif
break;
case DIF_SUBR_DIRNAME:
case DIF_SUBR_BASENAME: {
char *dest = (char *)mstate->dtms_scratch_ptr;
uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
uintptr_t src = tupregs[0].dttk_value;
int i, j, len = dtrace_strlen((char *)src, size);
int lastbase = -1, firstbase = -1, lastdir = -1;
int start, end;
if (!dtrace_canload(src, len + 1, mstate, vstate)) {
regs[rd] = 0;
break;
}
if (!DTRACE_INSCRATCH(mstate, size)) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
regs[rd] = 0;
break;
}
/*
* The basename and dirname for a zero-length string are
* defined to be "."
*/
if (len == 0) {
len = 1;
src = (uintptr_t)".";
}
/*
* Start from the back of the string, moving back toward the
* front until we see a character that isn't a slash. That
* character is the last character in the basename.
*/
for (i = len - 1; i >= 0; i--) {
if (dtrace_load8(src + i) != '/')
break;
}
if (i >= 0)
lastbase = i;
/*
* Starting from the last character in the basename, move
* towards the front until we find a slash. The character
* that we processed immediately before that is the first
* character in the basename.
*/
for (; i >= 0; i--) {
if (dtrace_load8(src + i) == '/')
break;
}
if (i >= 0)
firstbase = i + 1;
/*
* Now keep going until we find a non-slash character. That
* character is the last character in the dirname.
*/
for (; i >= 0; i--) {
if (dtrace_load8(src + i) != '/')
break;
}
if (i >= 0)
lastdir = i;
ASSERT(!(lastbase == -1 && firstbase != -1));
ASSERT(!(firstbase == -1 && lastdir != -1));
if (lastbase == -1) {
/*
* We didn't find a non-slash character. We know that
* the length is non-zero, so the whole string must be
* slashes. In either the dirname or the basename
* case, we return '/'.
*/
ASSERT(firstbase == -1);
firstbase = lastbase = lastdir = 0;
}
if (firstbase == -1) {
/*
* The entire string consists only of a basename
* component. If we're looking for dirname, we need
* to change our string to be just "."; if we're
* looking for a basename, we'll just set the first
* character of the basename to be 0.
*/
if (subr == DIF_SUBR_DIRNAME) {
ASSERT(lastdir == -1);
src = (uintptr_t)".";
lastdir = 0;
} else {
firstbase = 0;
}
}
if (subr == DIF_SUBR_DIRNAME) {
if (lastdir == -1) {
/*
* We know that we have a slash in the name --
* or lastdir would be set to 0, above. And
* because lastdir is -1, we know that this
* slash must be the first character. (That
* is, the full string must be of the form
* "/basename".) In this case, the last
* character of the directory name is 0.
*/
lastdir = 0;
}
start = 0;
end = lastdir;
} else {
ASSERT(subr == DIF_SUBR_BASENAME);
ASSERT(firstbase != -1 && lastbase != -1);
start = firstbase;
end = lastbase;
}
for (i = start, j = 0; i <= end && j < size - 1; i++, j++)
dest[j] = dtrace_load8(src + i);
dest[j] = '\0';
regs[rd] = (uintptr_t)dest;
mstate->dtms_scratch_ptr += size;
break;
}
case DIF_SUBR_CLEANPATH: {
char *dest = (char *)mstate->dtms_scratch_ptr, c;
uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
uintptr_t src = tupregs[0].dttk_value;
int i = 0, j = 0;
if (!dtrace_strcanload(src, size, mstate, vstate)) {
regs[rd] = 0;
break;
}
if (!DTRACE_INSCRATCH(mstate, size)) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
regs[rd] = 0;
break;
}
/*
* Move forward, loading each character.
*/
do {
c = dtrace_load8(src + i++);
next:
if (j + 5 >= size) /* 5 = sizeof ("/..c") */
break;
if (c != '/') {
dest[j++] = c;
continue;
}
c = dtrace_load8(src + i++);
if (c == '/') {
/*
* We have two slashes -- we can just advance
* to the next character.
*/
goto next;
}
if (c != '.') {
/*
* This is not "." and it's not ".." -- we can
* just store the "/" and this character and
* drive on.
*/
dest[j++] = '/';
dest[j++] = c;
continue;
}
c = dtrace_load8(src + i++);
if (c == '/') {
/*
* This is a "/./" component. We're not going
* to store anything in the destination buffer;
* we're just going to go to the next component.
*/
goto next;
}
if (c != '.') {
/*
* This is not ".." -- we can just store the
* "/." and this character and continue
* processing.
*/
dest[j++] = '/';
dest[j++] = '.';
dest[j++] = c;
continue;
}
c = dtrace_load8(src + i++);
if (c != '/' && c != '\0') {
/*
* This is not ".." -- it's "..[mumble]".
* We'll store the "/.." and this character
* and continue processing.
*/
dest[j++] = '/';
dest[j++] = '.';
dest[j++] = '.';
dest[j++] = c;
continue;
}
/*
* This is "/../" or "/..\0". We need to back up
* our destination pointer until we find a "/".
*/
i--;
while (j != 0 && dest[--j] != '/')
continue;
if (c == '\0')
dest[++j] = '/';
} while (c != '\0');
dest[j] = '\0';
regs[rd] = (uintptr_t)dest;
mstate->dtms_scratch_ptr += size;
break;
}
case DIF_SUBR_INET_NTOA:
case DIF_SUBR_INET_NTOA6:
case DIF_SUBR_INET_NTOP: {
size_t size;
int af, argi, i;
char *base, *end;
if (subr == DIF_SUBR_INET_NTOP) {
af = (int)tupregs[0].dttk_value;
argi = 1;
} else {
af = subr == DIF_SUBR_INET_NTOA ? AF_INET: AF_INET6;
argi = 0;
}
if (af == AF_INET) {
ipaddr_t ip4;
uint8_t *ptr8, val;
/*
* Safely load the IPv4 address.
*/
ip4 = dtrace_load32(tupregs[argi].dttk_value);
/*
* Check an IPv4 string will fit in scratch.
*/
size = INET_ADDRSTRLEN;
if (!DTRACE_INSCRATCH(mstate, size)) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
regs[rd] = 0;
break;
}
base = (char *)mstate->dtms_scratch_ptr;
end = (char *)mstate->dtms_scratch_ptr + size - 1;
/*
* Stringify as a dotted decimal quad.
*/
*end-- = '\0';
ptr8 = (uint8_t *)&ip4;
for (i = 3; i >= 0; i--) {
val = ptr8[i];
if (val == 0) {
*end-- = '0';
} else {
for (; val; val /= 10) {
*end-- = '0' + (val % 10);
}
}
if (i > 0)
*end-- = '.';
}
ASSERT(end + 1 >= base);
} else if (af == AF_INET6) {
struct in6_addr ip6;
int firstzero, tryzero, numzero, v6end;
uint16_t val;
const char digits[] = "0123456789abcdef";
/*
* Stringify using RFC 1884 convention 2 - 16 bit
* hexadecimal values with a zero-run compression.
* Lower case hexadecimal digits are used.
* eg, fe80::214:4fff:fe0b:76c8.
* The IPv4 embedded form is returned for inet_ntop,
* just the IPv4 string is returned for inet_ntoa6.
*/
/*
* Safely load the IPv6 address.
*/
dtrace_bcopy(
(void *)(uintptr_t)tupregs[argi].dttk_value,
(void *)(uintptr_t)&ip6, sizeof (struct in6_addr));
/*
* Check an IPv6 string will fit in scratch.
*/
size = INET6_ADDRSTRLEN;
if (!DTRACE_INSCRATCH(mstate, size)) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
regs[rd] = 0;
break;
}
base = (char *)mstate->dtms_scratch_ptr;
end = (char *)mstate->dtms_scratch_ptr + size - 1;
*end-- = '\0';
/*
* Find the longest run of 16 bit zero values
* for the single allowed zero compression - "::".
*/
firstzero = -1;
tryzero = -1;
numzero = 1;
for (i = 0; i < sizeof (struct in6_addr); i++) {
#if defined(sun)
if (ip6._S6_un._S6_u8[i] == 0 &&
#else
if (ip6.__u6_addr.__u6_addr8[i] == 0 &&
#endif
tryzero == -1 && i % 2 == 0) {
tryzero = i;
continue;
}
if (tryzero != -1 &&
#if defined(sun)
(ip6._S6_un._S6_u8[i] != 0 ||
#else
(ip6.__u6_addr.__u6_addr8[i] != 0 ||
#endif
i == sizeof (struct in6_addr) - 1)) {
if (i - tryzero <= numzero) {
tryzero = -1;
continue;
}
firstzero = tryzero;
numzero = i - i % 2 - tryzero;
tryzero = -1;
#if defined(sun)
if (ip6._S6_un._S6_u8[i] == 0 &&
#else
if (ip6.__u6_addr.__u6_addr8[i] == 0 &&
#endif
i == sizeof (struct in6_addr) - 1)
numzero += 2;
}
}
ASSERT(firstzero + numzero <= sizeof (struct in6_addr));
/*
* Check for an IPv4 embedded address.
*/
v6end = sizeof (struct in6_addr) - 2;
if (IN6_IS_ADDR_V4MAPPED(&ip6) ||
IN6_IS_ADDR_V4COMPAT(&ip6)) {
for (i = sizeof (struct in6_addr) - 1;
i >= DTRACE_V4MAPPED_OFFSET; i--) {
ASSERT(end >= base);
#if defined(sun)
val = ip6._S6_un._S6_u8[i];
#else
val = ip6.__u6_addr.__u6_addr8[i];
#endif
if (val == 0) {
*end-- = '0';
} else {
for (; val; val /= 10) {
*end-- = '0' + val % 10;
}
}
if (i > DTRACE_V4MAPPED_OFFSET)
*end-- = '.';
}
if (subr == DIF_SUBR_INET_NTOA6)
goto inetout;
/*
* Set v6end to skip the IPv4 address that
* we have already stringified.
*/
v6end = 10;
}
/*
* Build the IPv6 string by working through the
* address in reverse.
*/
for (i = v6end; i >= 0; i -= 2) {
ASSERT(end >= base);
if (i == firstzero + numzero - 2) {
*end-- = ':';
*end-- = ':';
i -= numzero - 2;
continue;
}
if (i < 14 && i != firstzero - 2)
*end-- = ':';
#if defined(sun)
val = (ip6._S6_un._S6_u8[i] << 8) +
ip6._S6_un._S6_u8[i + 1];
#else
val = (ip6.__u6_addr.__u6_addr8[i] << 8) +
ip6.__u6_addr.__u6_addr8[i + 1];
#endif
if (val == 0) {
*end-- = '0';
} else {
for (; val; val /= 16) {
*end-- = digits[val % 16];
}
}
}
ASSERT(end + 1 >= base);
} else {
/*
* The user didn't use AF_INET or AF_INET6.
*/
DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
regs[rd] = 0;
break;
}
inetout: regs[rd] = (uintptr_t)end + 1;
mstate->dtms_scratch_ptr += size;
break;
}
case DIF_SUBR_MEMREF: {
uintptr_t size = 2 * sizeof(uintptr_t);
uintptr_t *memref = (uintptr_t *) P2ROUNDUP(mstate->dtms_scratch_ptr, sizeof(uintptr_t));
size_t scratch_size = ((uintptr_t) memref - mstate->dtms_scratch_ptr) + size;
/* address and length */
memref[0] = tupregs[0].dttk_value;
memref[1] = tupregs[1].dttk_value;
regs[rd] = (uintptr_t) memref;
mstate->dtms_scratch_ptr += scratch_size;
break;
}
case DIF_SUBR_TYPEREF: {
uintptr_t size = 4 * sizeof(uintptr_t);
uintptr_t *typeref = (uintptr_t *) P2ROUNDUP(mstate->dtms_scratch_ptr, sizeof(uintptr_t));
size_t scratch_size = ((uintptr_t) typeref - mstate->dtms_scratch_ptr) + size;
/* address, num_elements, type_str, type_len */
typeref[0] = tupregs[0].dttk_value;
typeref[1] = tupregs[1].dttk_value;
typeref[2] = tupregs[2].dttk_value;
typeref[3] = tupregs[3].dttk_value;
regs[rd] = (uintptr_t) typeref;
mstate->dtms_scratch_ptr += scratch_size;
break;
}
}
}
/*
* Emulate the execution of DTrace IR instructions specified by the given
* DIF object. This function is deliberately void of assertions as all of
* the necessary checks are handled by a call to dtrace_difo_validate().
*/
static uint64_t
dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
dtrace_vstate_t *vstate, dtrace_state_t *state)
{
const dif_instr_t *text = difo->dtdo_buf;
const uint_t textlen = difo->dtdo_len;
const char *strtab = difo->dtdo_strtab;
const uint64_t *inttab = difo->dtdo_inttab;
uint64_t rval = 0;
dtrace_statvar_t *svar;
dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
dtrace_difv_t *v;
volatile uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
uint64_t regs[DIF_DIR_NREGS];
uint64_t *tmp;
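/*
 * Condition-code flags (negative, zero, overflow, carry) set by the
 * cmp, tst and scmp opcodes and consumed by the conditional branches.
 */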
uint8_t cc_n = 0, cc_z = 0, cc_v = 0, cc_c = 0;
int64_t cc_r;
uint_t pc = 0, id, opc = 0;
uint8_t ttop = 0;
dif_instr_t instr;
uint_t r1, r2, rd;
/*
* We stash the current DIF object into the machine state: we need it
* for subsequent access checking.
*/
mstate->dtms_difo = difo;
regs[DIF_REG_R0] = 0; /* %r0 is fixed at zero */
while (pc < textlen && !(*flags & CPU_DTRACE_FAULT)) {
opc = pc;
instr = text[pc++];
r1 = DIF_INSTR_R1(instr);
r2 = DIF_INSTR_R2(instr);
rd = DIF_INSTR_RD(instr);
switch (DIF_INSTR_OP(instr)) {
case DIF_OP_OR:
regs[rd] = regs[r1] | regs[r2];
break;
case DIF_OP_XOR:
regs[rd] = regs[r1] ^ regs[r2];
break;
case DIF_OP_AND:
regs[rd] = regs[r1] & regs[r2];
break;
case DIF_OP_SLL:
regs[rd] = regs[r1] << regs[r2];
break;
case DIF_OP_SRL:
regs[rd] = regs[r1] >> regs[r2];
break;
case DIF_OP_SUB:
regs[rd] = regs[r1] - regs[r2];
break;
case DIF_OP_ADD:
regs[rd] = regs[r1] + regs[r2];
break;
case DIF_OP_MUL:
regs[rd] = regs[r1] * regs[r2];
break;
case DIF_OP_SDIV:
if (regs[r2] == 0) {
regs[rd] = 0;
*flags |= CPU_DTRACE_DIVZERO;
} else {
regs[rd] = (int64_t)regs[r1] /
(int64_t)regs[r2];
}
break;
case DIF_OP_UDIV:
if (regs[r2] == 0) {
regs[rd] = 0;
*flags |= CPU_DTRACE_DIVZERO;
} else {
regs[rd] = regs[r1] / regs[r2];
}
break;
case DIF_OP_SREM:
if (regs[r2] == 0) {
regs[rd] = 0;
*flags |= CPU_DTRACE_DIVZERO;
} else {
regs[rd] = (int64_t)regs[r1] %
(int64_t)regs[r2];
}
break;
case DIF_OP_UREM:
if (regs[r2] == 0) {
regs[rd] = 0;
*flags |= CPU_DTRACE_DIVZERO;
} else {
regs[rd] = regs[r1] % regs[r2];
}
break;
case DIF_OP_NOT:
regs[rd] = ~regs[r1];
break;
case DIF_OP_MOV:
regs[rd] = regs[r1];
break;
case DIF_OP_CMP:
cc_r = regs[r1] - regs[r2];
cc_n = cc_r < 0;
cc_z = cc_r == 0;
cc_v = 0;
cc_c = regs[r1] < regs[r2];
break;
case DIF_OP_TST:
cc_n = cc_v = cc_c = 0;
cc_z = regs[r1] == 0;
break;
case DIF_OP_BA:
pc = DIF_INSTR_LABEL(instr);
break;
case DIF_OP_BE:
if (cc_z)
pc = DIF_INSTR_LABEL(instr);
break;
case DIF_OP_BNE:
if (cc_z == 0)
pc = DIF_INSTR_LABEL(instr);
break;
case DIF_OP_BG:
if ((cc_z | (cc_n ^ cc_v)) == 0)
pc = DIF_INSTR_LABEL(instr);
break;
case DIF_OP_BGU:
if ((cc_c | cc_z) == 0)
pc = DIF_INSTR_LABEL(instr);
break;
case DIF_OP_BGE:
if ((cc_n ^ cc_v) == 0)
pc = DIF_INSTR_LABEL(instr);
break;
case DIF_OP_BGEU:
if (cc_c == 0)
pc = DIF_INSTR_LABEL(instr);
break;
case DIF_OP_BL:
if (cc_n ^ cc_v)
pc = DIF_INSTR_LABEL(instr);
break;
case DIF_OP_BLU:
if (cc_c)
pc = DIF_INSTR_LABEL(instr);
break;
case DIF_OP_BLE:
if (cc_z | (cc_n ^ cc_v))
pc = DIF_INSTR_LABEL(instr);
break;
case DIF_OP_BLEU:
if (cc_c | cc_z)
pc = DIF_INSTR_LABEL(instr);
break;
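/*
 * The RLD* variants first verify that the source address lies within
 * memory that DTrace itself controls; on failure they flag KPRIV and
 * record the offending address, otherwise they fall through to the
 * corresponding unchecked load.
 */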
case DIF_OP_RLDSB:
if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
*flags |= CPU_DTRACE_KPRIV;
*illval = regs[r1];
break;
}
/*FALLTHROUGH*/
case DIF_OP_LDSB:
regs[rd] = (int8_t)dtrace_load8(regs[r1]);
break;
case DIF_OP_RLDSH:
if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
*flags |= CPU_DTRACE_KPRIV;
*illval = regs[r1];
break;
}
/*FALLTHROUGH*/
case DIF_OP_LDSH:
regs[rd] = (int16_t)dtrace_load16(regs[r1]);
break;
case DIF_OP_RLDSW:
if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
*flags |= CPU_DTRACE_KPRIV;
*illval = regs[r1];
break;
}
/*FALLTHROUGH*/
case DIF_OP_LDSW:
regs[rd] = (int32_t)dtrace_load32(regs[r1]);
break;
case DIF_OP_RLDUB:
if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
*flags |= CPU_DTRACE_KPRIV;
*illval = regs[r1];
break;
}
/*FALLTHROUGH*/
case DIF_OP_LDUB:
regs[rd] = dtrace_load8(regs[r1]);
break;
case DIF_OP_RLDUH:
if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
*flags |= CPU_DTRACE_KPRIV;
*illval = regs[r1];
break;
}
/*FALLTHROUGH*/
case DIF_OP_LDUH:
regs[rd] = dtrace_load16(regs[r1]);
break;
case DIF_OP_RLDUW:
if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
*flags |= CPU_DTRACE_KPRIV;
*illval = regs[r1];
break;
}
/*FALLTHROUGH*/
case DIF_OP_LDUW:
regs[rd] = dtrace_load32(regs[r1]);
break;
case DIF_OP_RLDX:
if (!dtrace_canstore(regs[r1], 8, mstate, vstate)) {
*flags |= CPU_DTRACE_KPRIV;
*illval = regs[r1];
break;
}
/*FALLTHROUGH*/
case DIF_OP_LDX:
regs[rd] = dtrace_load64(regs[r1]);
break;
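/*
 * The ULD* opcodes load from the user address space via the
 * dtrace_fuword*() routines.
 */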
case DIF_OP_ULDSB:
regs[rd] = (int8_t)
dtrace_fuword8((void *)(uintptr_t)regs[r1]);
break;
case DIF_OP_ULDSH:
regs[rd] = (int16_t)
dtrace_fuword16((void *)(uintptr_t)regs[r1]);
break;
case DIF_OP_ULDSW:
regs[rd] = (int32_t)
dtrace_fuword32((void *)(uintptr_t)regs[r1]);
break;
case DIF_OP_ULDUB:
regs[rd] =
dtrace_fuword8((void *)(uintptr_t)regs[r1]);
break;
case DIF_OP_ULDUH:
regs[rd] =
dtrace_fuword16((void *)(uintptr_t)regs[r1]);
break;
case DIF_OP_ULDUW:
regs[rd] =
dtrace_fuword32((void *)(uintptr_t)regs[r1]);
break;
case DIF_OP_ULDX:
regs[rd] =
dtrace_fuword64((void *)(uintptr_t)regs[r1]);
break;
case DIF_OP_RET:
rval = regs[rd];
pc = textlen;
break;
case DIF_OP_NOP:
break;
case DIF_OP_SETX:
regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];
break;
case DIF_OP_SETS:
regs[rd] = (uint64_t)(uintptr_t)
(strtab + DIF_INSTR_STRING(instr));
break;
case DIF_OP_SCMP: {
size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
uintptr_t s1 = regs[r1];
uintptr_t s2 = regs[r2];
if (s1 != 0 &&
!dtrace_strcanload(s1, sz, mstate, vstate))
break;
if (s2 != 0 &&
!dtrace_strcanload(s2, sz, mstate, vstate))
break;
cc_r = dtrace_strncmp((char *)s1, (char *)s2, sz);
cc_n = cc_r < 0;
cc_z = cc_r == 0;
cc_v = cc_c = 0;
break;
}
case DIF_OP_LDGA:
regs[rd] = dtrace_dif_variable(mstate, state,
r1, regs[r2]);
break;
case DIF_OP_LDGS:
id = DIF_INSTR_VAR(instr);
if (id >= DIF_VAR_OTHER_UBASE) {
uintptr_t a;
id -= DIF_VAR_OTHER_UBASE;
svar = vstate->dtvs_globals[id];
ASSERT(svar != NULL);
v = &svar->dtsv_var;
if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {
regs[rd] = svar->dtsv_data;
break;
}
a = (uintptr_t)svar->dtsv_data;
if (*(uint8_t *)a == UINT8_MAX) {
/*
* If the 0th byte is set to UINT8_MAX
* then this is to be treated as a
* reference to a NULL variable.
*/
regs[rd] = 0;
} else {
regs[rd] = a + sizeof (uint64_t);
}
break;
}
regs[rd] = dtrace_dif_variable(mstate, state, id, 0);
break;
case DIF_OP_STGS:
id = DIF_INSTR_VAR(instr);
ASSERT(id >= DIF_VAR_OTHER_UBASE);
id -= DIF_VAR_OTHER_UBASE;
svar = vstate->dtvs_globals[id];
ASSERT(svar != NULL);
v = &svar->dtsv_var;
if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
uintptr_t a = (uintptr_t)svar->dtsv_data;
ASSERT(a != 0);
ASSERT(svar->dtsv_size != 0);
if (regs[rd] == 0) {
*(uint8_t *)a = UINT8_MAX;
break;
} else {
*(uint8_t *)a = 0;
a += sizeof (uint64_t);
}
if (!dtrace_vcanload(
(void *)(uintptr_t)regs[rd], &v->dtdv_type,
mstate, vstate))
break;
dtrace_vcopy((void *)(uintptr_t)regs[rd],
(void *)a, &v->dtdv_type);
break;
}
svar->dtsv_data = regs[rd];
break;
case DIF_OP_LDTA:
/*
* There are no DTrace built-in thread-local arrays at
* present. This opcode is saved for future work.
*/
*flags |= CPU_DTRACE_ILLOP;
regs[rd] = 0;
break;
case DIF_OP_LDLS:
id = DIF_INSTR_VAR(instr);
if (id < DIF_VAR_OTHER_UBASE) {
/*
* For now, this has no meaning.
*/
regs[rd] = 0;
break;
}
id -= DIF_VAR_OTHER_UBASE;
ASSERT(id < vstate->dtvs_nlocals);
ASSERT(vstate->dtvs_locals != NULL);
svar = vstate->dtvs_locals[id];
ASSERT(svar != NULL);
v = &svar->dtsv_var;
if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
uintptr_t a = (uintptr_t)svar->dtsv_data;
size_t sz = v->dtdv_type.dtdt_size;
sz += sizeof (uint64_t);
ASSERT(svar->dtsv_size == NCPU * sz);
a += curcpu * sz;
if (*(uint8_t *)a == UINT8_MAX) {
/*
* If the 0th byte is set to UINT8_MAX
* then this is to be treated as a
* reference to a NULL variable.
*/
regs[rd] = 0;
} else {
regs[rd] = a + sizeof (uint64_t);
}
break;
}
ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
regs[rd] = tmp[curcpu];
break;
case DIF_OP_STLS:
id = DIF_INSTR_VAR(instr);
ASSERT(id >= DIF_VAR_OTHER_UBASE);
id -= DIF_VAR_OTHER_UBASE;
ASSERT(id < vstate->dtvs_nlocals);
ASSERT(vstate->dtvs_locals != NULL);
svar = vstate->dtvs_locals[id];
ASSERT(svar != NULL);
v = &svar->dtsv_var;
if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
uintptr_t a = (uintptr_t)svar->dtsv_data;
size_t sz = v->dtdv_type.dtdt_size;
sz += sizeof (uint64_t);
ASSERT(svar->dtsv_size == NCPU * sz);
a += curcpu * sz;
if (regs[rd] == 0) {
*(uint8_t *)a = UINT8_MAX;
break;
} else {
*(uint8_t *)a = 0;
a += sizeof (uint64_t);
}
if (!dtrace_vcanload(
(void *)(uintptr_t)regs[rd], &v->dtdv_type,
mstate, vstate))
break;
dtrace_vcopy((void *)(uintptr_t)regs[rd],
(void *)a, &v->dtdv_type);
break;
}
ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
tmp[curcpu] = regs[rd];
break;
case DIF_OP_LDTS: {
dtrace_dynvar_t *dvar;
dtrace_key_t *key;
id = DIF_INSTR_VAR(instr);
ASSERT(id >= DIF_VAR_OTHER_UBASE);
id -= DIF_VAR_OTHER_UBASE;
v = &vstate->dtvs_tlocals[id];
key = &tupregs[DIF_DTR_NREGS];
key[0].dttk_value = (uint64_t)id;
key[0].dttk_size = 0;
DTRACE_TLS_THRKEY(key[1].dttk_value);
key[1].dttk_size = 0;
dvar = dtrace_dynvar(dstate, 2, key,
sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC,
mstate, vstate);
if (dvar == NULL) {
regs[rd] = 0;
break;
}
if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
} else {
regs[rd] = *((uint64_t *)dvar->dtdv_data);
}
break;
}
case DIF_OP_STTS: {
dtrace_dynvar_t *dvar;
dtrace_key_t *key;
id = DIF_INSTR_VAR(instr);
ASSERT(id >= DIF_VAR_OTHER_UBASE);
id -= DIF_VAR_OTHER_UBASE;
key = &tupregs[DIF_DTR_NREGS];
key[0].dttk_value = (uint64_t)id;
key[0].dttk_size = 0;
DTRACE_TLS_THRKEY(key[1].dttk_value);
key[1].dttk_size = 0;
v = &vstate->dtvs_tlocals[id];
dvar = dtrace_dynvar(dstate, 2, key,
v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
v->dtdv_type.dtdt_size : sizeof (uint64_t),
regs[rd] ? DTRACE_DYNVAR_ALLOC :
DTRACE_DYNVAR_DEALLOC, mstate, vstate);
/*
* Given that we're storing to thread-local data,
* we need to flush our predicate cache.
*/
curthread->t_predcache = 0;
if (dvar == NULL)
break;
if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
if (!dtrace_vcanload(
(void *)(uintptr_t)regs[rd],
&v->dtdv_type, mstate, vstate))
break;
dtrace_vcopy((void *)(uintptr_t)regs[rd],
dvar->dtdv_data, &v->dtdv_type);
} else {
*((uint64_t *)dvar->dtdv_data) = regs[rd];
}
break;
}
case DIF_OP_SRA:
regs[rd] = (int64_t)regs[r1] >> regs[r2];
break;
case DIF_OP_CALL:
dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,
regs, tupregs, ttop, mstate, state);
break;
case DIF_OP_PUSHTR:
if (ttop == DIF_DTR_NREGS) {
*flags |= CPU_DTRACE_TUPOFLOW;
break;
}
if (r1 == DIF_TYPE_STRING) {
/*
* If this is a string type and the size is 0,
* we'll use the system-wide default string
* size. Note that we are _not_ looking at
* the value of the DTRACEOPT_STRSIZE option;
* had this been set, we would expect to have
* a non-zero size value in the "pushtr".
*/
tupregs[ttop].dttk_size =
dtrace_strlen((char *)(uintptr_t)regs[rd],
regs[r2] ? regs[r2] :
dtrace_strsize_default) + 1;
} else {
tupregs[ttop].dttk_size = regs[r2];
}
tupregs[ttop++].dttk_value = regs[rd];
break;
case DIF_OP_PUSHTV:
if (ttop == DIF_DTR_NREGS) {
*flags |= CPU_DTRACE_TUPOFLOW;
break;
}
tupregs[ttop].dttk_value = regs[rd];
tupregs[ttop++].dttk_size = 0;
break;
case DIF_OP_POPTS:
if (ttop != 0)
ttop--;
break;
case DIF_OP_FLUSHTS:
ttop = 0;
break;
case DIF_OP_LDGAA:
case DIF_OP_LDTAA: {
dtrace_dynvar_t *dvar;
dtrace_key_t *key = tupregs;
uint_t nkeys = ttop;
id = DIF_INSTR_VAR(instr);
ASSERT(id >= DIF_VAR_OTHER_UBASE);
id -= DIF_VAR_OTHER_UBASE;
key[nkeys].dttk_value = (uint64_t)id;
key[nkeys++].dttk_size = 0;
if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
key[nkeys++].dttk_size = 0;
v = &vstate->dtvs_tlocals[id];
} else {
v = &vstate->dtvs_globals[id]->dtsv_var;
}
dvar = dtrace_dynvar(dstate, nkeys, key,
v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
v->dtdv_type.dtdt_size : sizeof (uint64_t),
DTRACE_DYNVAR_NOALLOC, mstate, vstate);
if (dvar == NULL) {
regs[rd] = 0;
break;
}
if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
} else {
regs[rd] = *((uint64_t *)dvar->dtdv_data);
}
break;
}
case DIF_OP_STGAA:
case DIF_OP_STTAA: {
dtrace_dynvar_t *dvar;
dtrace_key_t *key = tupregs;
uint_t nkeys = ttop;
id = DIF_INSTR_VAR(instr);
ASSERT(id >= DIF_VAR_OTHER_UBASE);
id -= DIF_VAR_OTHER_UBASE;
key[nkeys].dttk_value = (uint64_t)id;
key[nkeys++].dttk_size = 0;
if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) {
DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
key[nkeys++].dttk_size = 0;
v = &vstate->dtvs_tlocals[id];
} else {
v = &vstate->dtvs_globals[id]->dtsv_var;
}
dvar = dtrace_dynvar(dstate, nkeys, key,
v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
v->dtdv_type.dtdt_size : sizeof (uint64_t),
regs[rd] ? DTRACE_DYNVAR_ALLOC :
DTRACE_DYNVAR_DEALLOC, mstate, vstate);
if (dvar == NULL)
break;
if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
if (!dtrace_vcanload(
(void *)(uintptr_t)regs[rd], &v->dtdv_type,
mstate, vstate))
break;
dtrace_vcopy((void *)(uintptr_t)regs[rd],
dvar->dtdv_data, &v->dtdv_type);
} else {
*((uint64_t *)dvar->dtdv_data) = regs[rd];
}
break;
}
case DIF_OP_ALLOCS: {
uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];
/*
* Rounding up the user allocation size could have
* overflowed large, bogus allocations (like -1ULL) to
* 0.
*/
if (size < regs[r1] ||
!DTRACE_INSCRATCH(mstate, size)) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
regs[rd] = 0;
break;
}
dtrace_bzero((void *) mstate->dtms_scratch_ptr, size);
mstate->dtms_scratch_ptr += size;
regs[rd] = ptr;
break;
}
case DIF_OP_COPYS:
if (!dtrace_canstore(regs[rd], regs[r2],
mstate, vstate)) {
*flags |= CPU_DTRACE_BADADDR;
*illval = regs[rd];
break;
}
if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
break;
dtrace_bcopy((void *)(uintptr_t)regs[r1],
(void *)(uintptr_t)regs[rd], (size_t)regs[r2]);
break;
case DIF_OP_STB:
if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) {
*flags |= CPU_DTRACE_BADADDR;
*illval = regs[rd];
break;
}
*((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1];
break;
case DIF_OP_STH:
if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) {
*flags |= CPU_DTRACE_BADADDR;
*illval = regs[rd];
break;
}
if (regs[rd] & 1) {
*flags |= CPU_DTRACE_BADALIGN;
*illval = regs[rd];
break;
}
*((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1];
break;
case DIF_OP_STW:
if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) {
*flags |= CPU_DTRACE_BADADDR;
*illval = regs[rd];
break;
}
if (regs[rd] & 3) {
*flags |= CPU_DTRACE_BADALIGN;
*illval = regs[rd];
break;
}
*((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1];
break;
case DIF_OP_STX:
if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) {
*flags |= CPU_DTRACE_BADADDR;
*illval = regs[rd];
break;
}
if (regs[rd] & 7) {
*flags |= CPU_DTRACE_BADALIGN;
*illval = regs[rd];
break;
}
*((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];
break;
}
}
if (!(*flags & CPU_DTRACE_FAULT))
return (rval);
mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);
mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS;
return (0);
}
static void
dtrace_action_breakpoint(dtrace_ecb_t *ecb)
{
dtrace_probe_t *probe = ecb->dte_probe;
dtrace_provider_t *prov = probe->dtpr_provider;
char c[DTRACE_FULLNAMELEN + 80], *str;
char *msg = "dtrace: breakpoint action at probe ";
char *ecbmsg = " (ecb ";
uintptr_t mask = (0xf << (sizeof (uintptr_t) * NBBY / 4));
uintptr_t val = (uintptr_t)ecb;
int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0;
if (dtrace_destructive_disallow)
return;
/*
* It's impossible to be taking action on the NULL probe.
*/
ASSERT(probe != NULL);
/*
* This is a poor man's (destitute man's?) sprintf(): we want to
* print the provider name, module name, function name and name of
* the probe, along with the hex address of the ECB with the breakpoint
* action -- all of which we must place in the character buffer by
* hand.
*/
while (*msg != '\0')
c[i++] = *msg++;
for (str = prov->dtpv_name; *str != '\0'; str++)
c[i++] = *str;
c[i++] = ':';
for (str = probe->dtpr_mod; *str != '\0'; str++)
c[i++] = *str;
c[i++] = ':';
for (str = probe->dtpr_func; *str != '\0'; str++)
c[i++] = *str;
c[i++] = ':';
for (str = probe->dtpr_name; *str != '\0'; str++)
c[i++] = *str;
while (*ecbmsg != '\0')
c[i++] = *ecbmsg++;
while (shift >= 0) {
mask = (uintptr_t)0xf << shift;
if (val >= ((uintptr_t)1 << shift))
c[i++] = "0123456789abcdef"[(val & mask) >> shift];
shift -= 4;
}
c[i++] = ')';
c[i] = '\0';
#if defined(sun)
debug_enter(c);
#else
kdb_enter(KDB_WHY_DTRACE, "breakpoint action");
#endif
}
static void
dtrace_action_panic(dtrace_ecb_t *ecb)
{
dtrace_probe_t *probe = ecb->dte_probe;
/*
* It's impossible to be taking action on the NULL probe.
*/
ASSERT(probe != NULL);
if (dtrace_destructive_disallow)
return;
if (dtrace_panicked != NULL)
return;
if (dtrace_casptr(&dtrace_panicked, NULL, curthread) != NULL)
return;
/*
* We won the right to panic. (We want to be sure that only one
* thread calls panic() from dtrace_probe(), and that panic() is
* called exactly once.)
*/
dtrace_panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",
probe->dtpr_provider->dtpv_name, probe->dtpr_mod,
probe->dtpr_func, probe->dtpr_name, (void *)ecb);
}
static void
dtrace_action_raise(uint64_t sig)
{
if (dtrace_destructive_disallow)
return;
if (sig >= NSIG) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
return;
}
#if defined(sun)
/*
* raise() has a queue depth of 1 -- we ignore all subsequent
* invocations of the raise() action.
*/
if (curthread->t_dtrace_sig == 0)
curthread->t_dtrace_sig = (uint8_t)sig;
curthread->t_sig_check = 1;
aston(curthread);
#else
struct proc *p = curproc;
PROC_LOCK(p);
- psignal(p, sig);
+ kern_psignal(p, sig);
PROC_UNLOCK(p);
#endif
}
static void
dtrace_action_stop(void)
{
if (dtrace_destructive_disallow)
return;
#if defined(sun)
if (!curthread->t_dtrace_stop) {
curthread->t_dtrace_stop = 1;
curthread->t_sig_check = 1;
aston(curthread);
}
#else
struct proc *p = curproc;
PROC_LOCK(p);
- psignal(p, SIGSTOP);
+ kern_psignal(p, SIGSTOP);
PROC_UNLOCK(p);
#endif
}
static void
dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
{
hrtime_t now;
volatile uint16_t *flags;
#if defined(sun)
cpu_t *cpu = CPU;
#else
cpu_t *cpu = &solaris_cpu[curcpu];
#endif
if (dtrace_destructive_disallow)
return;
flags = (volatile uint16_t *)&cpu_core[cpu->cpu_id].cpuc_dtrace_flags;
now = dtrace_gethrtime();
if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) {
/*
* We need to advance the mark to the current time.
*/
cpu->cpu_dtrace_chillmark = now;
cpu->cpu_dtrace_chilled = 0;
}
/*
* Now check to see if the requested chill time would take us over
* the maximum amount of time allowed in the chill interval. (Or
* worse, if the calculation itself induces overflow.)
*/
if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max ||
cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) {
*flags |= CPU_DTRACE_ILLOP;
return;
}
while (dtrace_gethrtime() - now < val)
continue;
/*
* Normally, we assure that the value of the variable "timestamp" does
* not change within an ECB. The presence of chill() represents an
* exception to this rule, however.
*/
mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;
cpu->cpu_dtrace_chilled += val;
}
static void
dtrace_action_ustack(dtrace_mstate_t *mstate, dtrace_state_t *state,
uint64_t *buf, uint64_t arg)
{
int nframes = DTRACE_USTACK_NFRAMES(arg);
int strsize = DTRACE_USTACK_STRSIZE(arg);
uint64_t *pcs = &buf[1], *fps;
char *str = (char *)&pcs[nframes];
int size, offs = 0, i, j;
uintptr_t old = mstate->dtms_scratch_ptr, saved;
uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
char *sym;
/*
* Should be taking a faster path if string space has not been
* allocated.
*/
ASSERT(strsize != 0);
/*
* We will first allocate some temporary space for the frame pointers.
*/
fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
size = (uintptr_t)fps - mstate->dtms_scratch_ptr +
(nframes * sizeof (uint64_t));
if (!DTRACE_INSCRATCH(mstate, size)) {
/*
* Not enough room for our frame pointers -- need to indicate
* that we ran out of scratch space.
*/
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
return;
}
mstate->dtms_scratch_ptr += size;
saved = mstate->dtms_scratch_ptr;
/*
* Now get a stack with both program counters and frame pointers.
*/
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
dtrace_getufpstack(buf, fps, nframes + 1);
DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
/*
* If that faulted, we're cooked.
*/
if (*flags & CPU_DTRACE_FAULT)
goto out;
/*
* Now we want to walk up the stack, calling the USTACK helper. For
* each iteration, we restore the scratch pointer.
*/
for (i = 0; i < nframes; i++) {
mstate->dtms_scratch_ptr = saved;
if (offs >= strsize)
break;
sym = (char *)(uintptr_t)dtrace_helper(
DTRACE_HELPER_ACTION_USTACK,
mstate, state, pcs[i], fps[i]);
/*
* If we faulted while running the helper, we're going to
* clear the fault and null out the corresponding string.
*/
if (*flags & CPU_DTRACE_FAULT) {
*flags &= ~CPU_DTRACE_FAULT;
str[offs++] = '\0';
continue;
}
if (sym == NULL) {
str[offs++] = '\0';
continue;
}
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
/*
* Now copy in the string that the helper returned to us.
*/
for (j = 0; offs + j < strsize; j++) {
if ((str[offs + j] = sym[j]) == '\0')
break;
}
DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
offs += j + 1;
}
if (offs >= strsize) {
/*
* If we didn't have room for all of the strings, we don't
* abort processing -- this needn't be a fatal error -- but we
* still want to increment a counter (dts_stkstroverflows) to
* allow this condition to be warned about. (If this is from
* a jstack() action, it is easily tuned via jstackstrsize.)
*/
dtrace_error(&state->dts_stkstroverflows);
}
while (offs < strsize)
str[offs++] = '\0';
out:
mstate->dtms_scratch_ptr = old;
}
/*
* If you're looking for the epicenter of DTrace, you just found it. This
* is the function called by the provider to fire a probe -- from which all
* subsequent probe-context DTrace activity emanates.
*/
void
dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1,
uintptr_t arg2, uintptr_t arg3, uintptr_t arg4)
{
processorid_t cpuid;
dtrace_icookie_t cookie;
dtrace_probe_t *probe;
dtrace_mstate_t mstate;
dtrace_ecb_t *ecb;
dtrace_action_t *act;
intptr_t offs;
size_t size;
int vtime, onintr;
volatile uint16_t *flags;
hrtime_t now;
#if defined(sun)
/*
* Kick out immediately if this CPU is still being born (in which case
* curthread will be set to -1) or the current thread can't allow
* probes in its current context.
*/
if (((uintptr_t)curthread & 1) || (curthread->t_flag & T_DONTDTRACE))
return;
#endif
cookie = dtrace_interrupt_disable();
probe = dtrace_probes[id - 1];
cpuid = curcpu;
onintr = CPU_ON_INTR(CPU);
if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE &&
probe->dtpr_predcache == curthread->t_predcache) {
/*
* We have hit in the predicate cache; we know that
* this predicate would evaluate to be false.
*/
dtrace_interrupt_enable(cookie);
return;
}
#if defined(sun)
if (panic_quiesce) {
#else
if (panicstr != NULL) {
#endif
/*
* We don't trace anything if we're panicking.
*/
dtrace_interrupt_enable(cookie);
return;
}
now = dtrace_gethrtime();
vtime = dtrace_vtime_references != 0;
if (vtime && curthread->t_dtrace_start)
curthread->t_dtrace_vtime += now - curthread->t_dtrace_start;
mstate.dtms_difo = NULL;
mstate.dtms_probe = probe;
mstate.dtms_strtok = 0;
mstate.dtms_arg[0] = arg0;
mstate.dtms_arg[1] = arg1;
mstate.dtms_arg[2] = arg2;
mstate.dtms_arg[3] = arg3;
mstate.dtms_arg[4] = arg4;
flags = (volatile uint16_t *)&cpu_core[cpuid].cpuc_dtrace_flags;
for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
dtrace_predicate_t *pred = ecb->dte_predicate;
dtrace_state_t *state = ecb->dte_state;
dtrace_buffer_t *buf = &state->dts_buffer[cpuid];
dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid];
dtrace_vstate_t *vstate = &state->dts_vstate;
dtrace_provider_t *prov = probe->dtpr_provider;
int committed = 0;
caddr_t tomax;
/*
* A little subtlety with the following (seemingly innocuous)
* declaration of the automatic 'val': by looking at the
* code, you might think that it could be declared in the
* action processing loop, below. (That is, it's only used in
* the action processing loop.) However, it must be declared
* out of that scope because in the case of DIF expression
* arguments to aggregating actions, one iteration of the
* action loop will use the last iteration's value.
*/
uint64_t val = 0;
mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
*flags &= ~CPU_DTRACE_ERROR;
if (prov == dtrace_provider) {
/*
* If dtrace itself is the provider of this probe,
* we're only going to continue processing the ECB if
* arg0 (the dtrace_state_t) is equal to the ECB's
* creating state. (This prevents disjoint consumers
* from seeing one another's metaprobes.)
*/
if (arg0 != (uint64_t)(uintptr_t)state)
continue;
}
if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE) {
/*
* We're not currently active. If our provider isn't
* the dtrace pseudo provider, we're not interested.
*/
if (prov != dtrace_provider)
continue;
/*
* Now we must further check if we are in the BEGIN
* probe. If we are, we will only continue processing
* if we're still in WARMUP -- if one BEGIN enabling
* has invoked the exit() action, we don't want to
* evaluate subsequent BEGIN enablings.
*/
if (probe->dtpr_id == dtrace_probeid_begin &&
state->dts_activity != DTRACE_ACTIVITY_WARMUP) {
ASSERT(state->dts_activity ==
DTRACE_ACTIVITY_DRAINING);
continue;
}
}
if (ecb->dte_cond) {
/*
* If the dte_cond bits indicate that this
* consumer is only allowed to see user-mode firings
* of this probe, call the provider's dtps_usermode()
* entry point to check that the probe was fired
* while in a user context. Skip this ECB if that's
* not the case.
*/
if ((ecb->dte_cond & DTRACE_COND_USERMODE) &&
prov->dtpv_pops.dtps_usermode(prov->dtpv_arg,
probe->dtpr_id, probe->dtpr_arg) == 0)
continue;
#if defined(sun)
/*
* This is more subtle than it looks. We have to be
* absolutely certain that CRED() isn't going to
* change out from under us so it's only legit to
* examine that structure if we're in constrained
* situations. Currently, the only time we'll perform this
* check is if a non-super-user has enabled the
* profile or syscall providers -- providers that
* allow visibility of all processes. For the
* profile case, the check above will ensure that
* we're examining a user context.
*/
if (ecb->dte_cond & DTRACE_COND_OWNER) {
cred_t *cr;
cred_t *s_cr =
ecb->dte_state->dts_cred.dcr_cred;
proc_t *proc;
ASSERT(s_cr != NULL);
if ((cr = CRED()) == NULL ||
s_cr->cr_uid != cr->cr_uid ||
s_cr->cr_uid != cr->cr_ruid ||
s_cr->cr_uid != cr->cr_suid ||
s_cr->cr_gid != cr->cr_gid ||
s_cr->cr_gid != cr->cr_rgid ||
s_cr->cr_gid != cr->cr_sgid ||
(proc = ttoproc(curthread)) == NULL ||
(proc->p_flag & SNOCD))
continue;
}
if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
cred_t *cr;
cred_t *s_cr =
ecb->dte_state->dts_cred.dcr_cred;
ASSERT(s_cr != NULL);
if ((cr = CRED()) == NULL ||
s_cr->cr_zone->zone_id !=
cr->cr_zone->zone_id)
continue;
}
#endif
}
if (now - state->dts_alive > dtrace_deadman_timeout) {
/*
* We seem to be dead. Unless we (a) have kernel
* destructive permissions, (b) have explicitly enabled
* destructive actions and (c) destructive actions have
* not been disabled, we're going to transition into
* the KILLED state, from which no further processing
* on this state will be performed.
*/
if (!dtrace_priv_kernel_destructive(state) ||
!state->dts_cred.dcr_destructive ||
dtrace_destructive_disallow) {
void *activity = &state->dts_activity;
dtrace_activity_t current;
do {
current = state->dts_activity;
} while (dtrace_cas32(activity, current,
DTRACE_ACTIVITY_KILLED) != current);
continue;
}
}
if ((offs = dtrace_buffer_reserve(buf, ecb->dte_needed,
ecb->dte_alignment, state, &mstate)) < 0)
continue;
tomax = buf->dtb_tomax;
ASSERT(tomax != NULL);
if (ecb->dte_size != 0)
DTRACE_STORE(uint32_t, tomax, offs, ecb->dte_epid);
mstate.dtms_epid = ecb->dte_epid;
mstate.dtms_present |= DTRACE_MSTATE_EPID;
if (state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)
mstate.dtms_access = DTRACE_ACCESS_KERNEL;
else
mstate.dtms_access = 0;
if (pred != NULL) {
dtrace_difo_t *dp = pred->dtp_difo;
int rval;
rval = dtrace_dif_emulate(dp, &mstate, vstate, state);
if (!(*flags & CPU_DTRACE_ERROR) && !rval) {
dtrace_cacheid_t cid = probe->dtpr_predcache;
if (cid != DTRACE_CACHEIDNONE && !onintr) {
/*
* Update the predicate cache...
*/
ASSERT(cid == pred->dtp_cacheid);
curthread->t_predcache = cid;
}
continue;
}
}
for (act = ecb->dte_action; !(*flags & CPU_DTRACE_ERROR) &&
act != NULL; act = act->dta_next) {
size_t valoffs;
dtrace_difo_t *dp;
dtrace_recdesc_t *rec = &act->dta_rec;
size = rec->dtrd_size;
valoffs = offs + rec->dtrd_offset;
if (DTRACEACT_ISAGG(act->dta_kind)) {
uint64_t v = 0xbad;
dtrace_aggregation_t *agg;
agg = (dtrace_aggregation_t *)act;
if ((dp = act->dta_difo) != NULL)
v = dtrace_dif_emulate(dp,
&mstate, vstate, state);
if (*flags & CPU_DTRACE_ERROR)
continue;
/*
* Note that we always pass the expression
* value from the previous iteration of the
* action loop. This value will only be used
* if there is an expression argument to the
* aggregating action, denoted by the
* dtag_hasarg field.
*/
dtrace_aggregate(agg, buf,
offs, aggbuf, v, val);
continue;
}
switch (act->dta_kind) {
case DTRACEACT_STOP:
if (dtrace_priv_proc_destructive(state))
dtrace_action_stop();
continue;
case DTRACEACT_BREAKPOINT:
if (dtrace_priv_kernel_destructive(state))
dtrace_action_breakpoint(ecb);
continue;
case DTRACEACT_PANIC:
if (dtrace_priv_kernel_destructive(state))
dtrace_action_panic(ecb);
continue;
case DTRACEACT_STACK:
if (!dtrace_priv_kernel(state))
continue;
dtrace_getpcstack((pc_t *)(tomax + valoffs),
size / sizeof (pc_t), probe->dtpr_aframes,
DTRACE_ANCHORED(probe) ? NULL :
(uint32_t *)arg0);
continue;
case DTRACEACT_JSTACK:
case DTRACEACT_USTACK:
if (!dtrace_priv_proc(state))
continue;
/*
* See comment in DIF_VAR_PID.
*/
if (DTRACE_ANCHORED(mstate.dtms_probe) &&
CPU_ON_INTR(CPU)) {
int depth = DTRACE_USTACK_NFRAMES(
rec->dtrd_arg) + 1;
dtrace_bzero((void *)(tomax + valoffs),
DTRACE_USTACK_STRSIZE(rec->dtrd_arg)
+ depth * sizeof (uint64_t));
continue;
}
if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0 &&
curproc->p_dtrace_helpers != NULL) {
/*
* This is the slow path -- we have
* allocated string space, and we're
* getting the stack of a process that
* has helpers. Call into a separate
* routine to perform this processing.
*/
dtrace_action_ustack(&mstate, state,
(uint64_t *)(tomax + valoffs),
rec->dtrd_arg);
continue;
}
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
dtrace_getupcstack((uint64_t *)
(tomax + valoffs),
DTRACE_USTACK_NFRAMES(rec->dtrd_arg) + 1);
DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
continue;
default:
break;
}
dp = act->dta_difo;
ASSERT(dp != NULL);
val = dtrace_dif_emulate(dp, &mstate, vstate, state);
if (*flags & CPU_DTRACE_ERROR)
continue;
switch (act->dta_kind) {
case DTRACEACT_SPECULATE:
ASSERT(buf == &state->dts_buffer[cpuid]);
buf = dtrace_speculation_buffer(state,
cpuid, val);
if (buf == NULL) {
*flags |= CPU_DTRACE_DROP;
continue;
}
offs = dtrace_buffer_reserve(buf,
ecb->dte_needed, ecb->dte_alignment,
state, NULL);
if (offs < 0) {
*flags |= CPU_DTRACE_DROP;
continue;
}
tomax = buf->dtb_tomax;
ASSERT(tomax != NULL);
if (ecb->dte_size != 0)
DTRACE_STORE(uint32_t, tomax, offs,
ecb->dte_epid);
continue;
case DTRACEACT_PRINTM: {
/* The DIF returns a 'memref'. */
uintptr_t *memref = (uintptr_t *)(uintptr_t) val;
/* Get the size from the memref. */
size = memref[1];
/*
* Check if the size exceeds the allocated
* buffer size.
*/
if (size + sizeof(uintptr_t) > dp->dtdo_rtype.dtdt_size) {
/* Flag a drop! */
*flags |= CPU_DTRACE_DROP;
continue;
}
/* Store the size in the buffer first. */
DTRACE_STORE(uintptr_t, tomax,
valoffs, size);
/*
* Offset the buffer address to the start
* of the data.
*/
valoffs += sizeof(uintptr_t);
/*
* Reset to the memory address rather than
* the memref array, then let the BYREF
* code below do the work to store the
* memory data in the buffer.
*/
val = memref[0];
break;
}
case DTRACEACT_PRINTT: {
/* The DIF returns a 'typeref'. */
uintptr_t *typeref = (uintptr_t *)(uintptr_t) val;
char c = '\0' + 1;
size_t s;
/*
* Get the type string length and round it
* up so that the data that follows is
* aligned for easy access.
*/
size_t typs = strlen((char *) typeref[2]) + 1;
typs = roundup(typs, sizeof(uintptr_t));
/*
* Get the size from the typeref using the
* number of elements and the type size.
*/
size = typeref[1] * typeref[3];
/*
* Check if the size exceeds the allocated
* buffer size.
*/
if (size + typs + 2 * sizeof(uintptr_t) > dp->dtdo_rtype.dtdt_size) {
/* Flag a drop! */
*flags |= CPU_DTRACE_DROP;
}
/* Store the size in the buffer first. */
DTRACE_STORE(uintptr_t, tomax,
valoffs, size);
valoffs += sizeof(uintptr_t);
/* Store the type size in the buffer. */
DTRACE_STORE(uintptr_t, tomax,
valoffs, typeref[3]);
valoffs += sizeof(uintptr_t);
val = typeref[2];
for (s = 0; s < typs; s++) {
if (c != '\0')
c = dtrace_load8(val++);
DTRACE_STORE(uint8_t, tomax,
valoffs++, c);
}
/*
* Reset to the memory address rather than
* the typeref array, then let the BYREF
* code below do the work to store the
* memory data in the buffer.
*/
val = typeref[0];
break;
}
case DTRACEACT_CHILL:
if (dtrace_priv_kernel_destructive(state))
dtrace_action_chill(&mstate, val);
continue;
case DTRACEACT_RAISE:
if (dtrace_priv_proc_destructive(state))
dtrace_action_raise(val);
continue;
case DTRACEACT_COMMIT:
ASSERT(!committed);
/*
* We need to commit our buffer state.
*/
if (ecb->dte_size)
buf->dtb_offset = offs + ecb->dte_size;
buf = &state->dts_buffer[cpuid];
dtrace_speculation_commit(state, cpuid, val);
committed = 1;
continue;
case DTRACEACT_DISCARD:
dtrace_speculation_discard(state, cpuid, val);
continue;
case DTRACEACT_DIFEXPR:
case DTRACEACT_LIBACT:
case DTRACEACT_PRINTF:
case DTRACEACT_PRINTA:
case DTRACEACT_SYSTEM:
case DTRACEACT_FREOPEN:
break;
case DTRACEACT_SYM:
case DTRACEACT_MOD:
if (!dtrace_priv_kernel(state))
continue;
break;
case DTRACEACT_USYM:
case DTRACEACT_UMOD:
case DTRACEACT_UADDR: {
#if defined(sun)
struct pid *pid = curthread->t_procp->p_pidp;
#endif
if (!dtrace_priv_proc(state))
continue;
DTRACE_STORE(uint64_t, tomax,
#if defined(sun)
valoffs, (uint64_t)pid->pid_id);
#else
valoffs, (uint64_t) curproc->p_pid);
#endif
DTRACE_STORE(uint64_t, tomax,
valoffs + sizeof (uint64_t), val);
continue;
}
case DTRACEACT_EXIT: {
/*
* For the exit action, we are going to attempt
* to atomically set our activity to be
* draining. If this fails (either because
* another CPU has beat us to the exit action,
* or because our current activity is something
* other than ACTIVE or WARMUP), we will
* continue. This assures that the exit action
* can be successfully recorded at most once
* when we're in the ACTIVE state. If we're
* encountering the exit() action while in
* COOLDOWN, however, we want to honor the new
* status code. (We know that we're the only
* thread in COOLDOWN, so there is no race.)
*/
void *activity = &state->dts_activity;
dtrace_activity_t current = state->dts_activity;
if (current == DTRACE_ACTIVITY_COOLDOWN)
break;
if (current != DTRACE_ACTIVITY_WARMUP)
current = DTRACE_ACTIVITY_ACTIVE;
if (dtrace_cas32(activity, current,
DTRACE_ACTIVITY_DRAINING) != current) {
*flags |= CPU_DTRACE_DROP;
continue;
}
break;
}
default:
ASSERT(0);
}
if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF) {
uintptr_t end = valoffs + size;
if (!dtrace_vcanload((void *)(uintptr_t)val,
&dp->dtdo_rtype, &mstate, vstate))
continue;
/*
* If this is a string, we're going to only
* load until we find the zero byte -- after
* which we'll store zero bytes.
*/
if (dp->dtdo_rtype.dtdt_kind ==
DIF_TYPE_STRING) {
char c = '\0' + 1;
int intuple = act->dta_intuple;
size_t s;
for (s = 0; s < size; s++) {
if (c != '\0')
c = dtrace_load8(val++);
DTRACE_STORE(uint8_t, tomax,
valoffs++, c);
if (c == '\0' && intuple)
break;
}
continue;
}
while (valoffs < end) {
DTRACE_STORE(uint8_t, tomax, valoffs++,
dtrace_load8(val++));
}
continue;
}
switch (size) {
case 0:
break;
case sizeof (uint8_t):
DTRACE_STORE(uint8_t, tomax, valoffs, val);
break;
case sizeof (uint16_t):
DTRACE_STORE(uint16_t, tomax, valoffs, val);
break;
case sizeof (uint32_t):
DTRACE_STORE(uint32_t, tomax, valoffs, val);
break;
case sizeof (uint64_t):
DTRACE_STORE(uint64_t, tomax, valoffs, val);
break;
default:
/*
* Any other size should have been returned by
* reference, not by value.
*/
ASSERT(0);
break;
}
}
if (*flags & CPU_DTRACE_DROP)
continue;
if (*flags & CPU_DTRACE_FAULT) {
int ndx;
dtrace_action_t *err;
buf->dtb_errors++;
if (probe->dtpr_id == dtrace_probeid_error) {
/*
* There's nothing we can do -- we had an
* error on the error probe. We bump an
* error counter to at least indicate that
* this condition happened.
*/
dtrace_error(&state->dts_dblerrors);
continue;
}
if (vtime) {
/*
* Before recursing on dtrace_probe(), we
* need to explicitly clear out our start
* time to prevent it from being accumulated
* into t_dtrace_vtime.
*/
curthread->t_dtrace_start = 0;
}
/*
* Iterate over the actions to figure out which action
* we were processing when we experienced the error.
* Note that act points _past_ the faulting action; if
* act is ecb->dte_action, the fault was in the
* predicate, if it's ecb->dte_action->dta_next it's
* in action #1, and so on.
*/
for (err = ecb->dte_action, ndx = 0;
err != act; err = err->dta_next, ndx++)
continue;
dtrace_probe_error(state, ecb->dte_epid, ndx,
(mstate.dtms_present & DTRACE_MSTATE_FLTOFFS) ?
mstate.dtms_fltoffs : -1, DTRACE_FLAGS2FLT(*flags),
cpu_core[cpuid].cpuc_dtrace_illval);
continue;
}
if (!committed)
buf->dtb_offset = offs + ecb->dte_size;
}
if (vtime)
curthread->t_dtrace_start = dtrace_gethrtime();
dtrace_interrupt_enable(cookie);
}
/*
* DTrace Probe Hashing Functions
*
* The functions in this section (and indeed, the functions in remaining
* sections) are not _called_ from probe context. (Any exceptions to this are
* marked with a "Note:".) Rather, they are called from elsewhere in the
* DTrace framework to look up probes in, add probes to, and remove probes from
* the DTrace probe hashes. (Each probe is hashed by each element of the
* probe tuple -- allowing for fast lookups, regardless of what was
* specified.)
*/
static uint_t
dtrace_hash_str(const char *p)
{
unsigned int g;
uint_t hval = 0;
while (*p) {
hval = (hval << 4) + *p++;
if ((g = (hval & 0xf0000000)) != 0)
hval ^= g >> 24;
hval &= ~g;
}
return (hval);
}
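/*
 * Editorial sketch, not part of the original source: dtrace_hash_str() is
 * essentially the classic PJW/ELF string hash -- shift in four bits per
 * character and fold any bits that spill into the top nibble back down.
 * For short strings the fold never fires, e.g.:
 *
 *	dtrace_hash_str("ab")
 *	    'a' (97):  hval = (0 << 4) + 97  = 97
 *	    'b' (98):  hval = (97 << 4) + 98 = 1650
 *
 * The fold (hval ^= g >> 24; hval &= ~g) only matters once the accumulated
 * value reaches bit 28 or above.
 */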
static dtrace_hash_t *
dtrace_hash_create(uintptr_t stroffs, uintptr_t nextoffs, uintptr_t prevoffs)
{
dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP);
hash->dth_stroffs = stroffs;
hash->dth_nextoffs = nextoffs;
hash->dth_prevoffs = prevoffs;
hash->dth_size = 1;
hash->dth_mask = hash->dth_size - 1;
hash->dth_tab = kmem_zalloc(hash->dth_size *
sizeof (dtrace_hashbucket_t *), KM_SLEEP);
return (hash);
}
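/*
 * Hedged sketch (editorial addition): the stroffs/nextoffs/prevoffs
 * arguments are byte offsets into dtrace_probe_t, so the per-module hash
 * would be created along these lines (dtpr_prevmod is assumed by analogy
 * with the dtpr_nextmod field used elsewhere in this file):
 *
 *	dtrace_bymod = dtrace_hash_create(
 *	    offsetof(dtrace_probe_t, dtpr_mod),
 *	    offsetof(dtrace_probe_t, dtpr_nextmod),
 *	    offsetof(dtrace_probe_t, dtpr_prevmod));
 *
 * DTRACE_HASHSTR() and DTRACE_HASHNEXT()/DTRACE_HASHPREV() then use those
 * offsets to reach the string and the chain pointers inside each probe.
 */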
static void
dtrace_hash_destroy(dtrace_hash_t *hash)
{
#ifdef DEBUG
int i;
for (i = 0; i < hash->dth_size; i++)
ASSERT(hash->dth_tab[i] == NULL);
#endif
kmem_free(hash->dth_tab,
hash->dth_size * sizeof (dtrace_hashbucket_t *));
kmem_free(hash, sizeof (dtrace_hash_t));
}
static void
dtrace_hash_resize(dtrace_hash_t *hash)
{
int size = hash->dth_size, i, ndx;
int new_size = hash->dth_size << 1;
int new_mask = new_size - 1;
dtrace_hashbucket_t **new_tab, *bucket, *next;
ASSERT((new_size & new_mask) == 0);
new_tab = kmem_zalloc(new_size * sizeof (void *), KM_SLEEP);
for (i = 0; i < size; i++) {
for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) {
dtrace_probe_t *probe = bucket->dthb_chain;
ASSERT(probe != NULL);
ndx = DTRACE_HASHSTR(hash, probe) & new_mask;
next = bucket->dthb_next;
bucket->dthb_next = new_tab[ndx];
new_tab[ndx] = bucket;
}
}
kmem_free(hash->dth_tab, hash->dth_size * sizeof (void *));
hash->dth_tab = new_tab;
hash->dth_size = new_size;
hash->dth_mask = new_mask;
}
static void
dtrace_hash_add(dtrace_hash_t *hash, dtrace_probe_t *new)
{
int hashval = DTRACE_HASHSTR(hash, new);
int ndx = hashval & hash->dth_mask;
dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
dtrace_probe_t **nextp, **prevp;
for (; bucket != NULL; bucket = bucket->dthb_next) {
if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new))
goto add;
}
if ((hash->dth_nbuckets >> 1) > hash->dth_size) {
dtrace_hash_resize(hash);
dtrace_hash_add(hash, new);
return;
}
bucket = kmem_zalloc(sizeof (dtrace_hashbucket_t), KM_SLEEP);
bucket->dthb_next = hash->dth_tab[ndx];
hash->dth_tab[ndx] = bucket;
hash->dth_nbuckets++;
add:
nextp = DTRACE_HASHNEXT(hash, new);
ASSERT(*nextp == NULL && *(DTRACE_HASHPREV(hash, new)) == NULL);
*nextp = bucket->dthb_chain;
if (bucket->dthb_chain != NULL) {
prevp = DTRACE_HASHPREV(hash, bucket->dthb_chain);
ASSERT(*prevp == NULL);
*prevp = new;
}
bucket->dthb_chain = new;
bucket->dthb_len++;
}
static dtrace_probe_t *
dtrace_hash_lookup(dtrace_hash_t *hash, dtrace_probe_t *template)
{
int hashval = DTRACE_HASHSTR(hash, template);
int ndx = hashval & hash->dth_mask;
dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
for (; bucket != NULL; bucket = bucket->dthb_next) {
if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
return (bucket->dthb_chain);
}
return (NULL);
}
static int
dtrace_hash_collisions(dtrace_hash_t *hash, dtrace_probe_t *template)
{
int hashval = DTRACE_HASHSTR(hash, template);
int ndx = hashval & hash->dth_mask;
dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
for (; bucket != NULL; bucket = bucket->dthb_next) {
if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
return (bucket->dthb_len);
}
return (0);
}
static void
dtrace_hash_remove(dtrace_hash_t *hash, dtrace_probe_t *probe)
{
int ndx = DTRACE_HASHSTR(hash, probe) & hash->dth_mask;
dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
dtrace_probe_t **prevp = DTRACE_HASHPREV(hash, probe);
dtrace_probe_t **nextp = DTRACE_HASHNEXT(hash, probe);
/*
* Find the bucket that we're removing this probe from.
*/
for (; bucket != NULL; bucket = bucket->dthb_next) {
if (DTRACE_HASHEQ(hash, bucket->dthb_chain, probe))
break;
}
ASSERT(bucket != NULL);
if (*prevp == NULL) {
if (*nextp == NULL) {
/*
* The removed probe was the only probe on this
* bucket; we need to remove the bucket.
*/
dtrace_hashbucket_t *b = hash->dth_tab[ndx];
ASSERT(bucket->dthb_chain == probe);
ASSERT(b != NULL);
if (b == bucket) {
hash->dth_tab[ndx] = bucket->dthb_next;
} else {
while (b->dthb_next != bucket)
b = b->dthb_next;
b->dthb_next = bucket->dthb_next;
}
ASSERT(hash->dth_nbuckets > 0);
hash->dth_nbuckets--;
kmem_free(bucket, sizeof (dtrace_hashbucket_t));
return;
}
bucket->dthb_chain = *nextp;
} else {
*(DTRACE_HASHNEXT(hash, *prevp)) = *nextp;
}
if (*nextp != NULL)
*(DTRACE_HASHPREV(hash, *nextp)) = *prevp;
}
/*
* DTrace Utility Functions
*
* These are random utility functions that are _not_ called from probe context.
*/
static int
dtrace_badattr(const dtrace_attribute_t *a)
{
return (a->dtat_name > DTRACE_STABILITY_MAX ||
a->dtat_data > DTRACE_STABILITY_MAX ||
a->dtat_class > DTRACE_CLASS_MAX);
}
/*
* Return a duplicate copy of a string. If the specified string is NULL,
* this function returns a zero-length string.
*/
static char *
dtrace_strdup(const char *str)
{
char *new = kmem_zalloc((str != NULL ? strlen(str) : 0) + 1, KM_SLEEP);
if (str != NULL)
(void) strcpy(new, str);
return (new);
}
#define DTRACE_ISALPHA(c) \
(((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
static int
dtrace_badname(const char *s)
{
char c;
if (s == NULL || (c = *s++) == '\0')
return (0);
if (!DTRACE_ISALPHA(c) && c != '-' && c != '_' && c != '.')
return (1);
while ((c = *s++) != '\0') {
if (!DTRACE_ISALPHA(c) && (c < '0' || c > '9') &&
c != '-' && c != '_' && c != '.' && c != '`')
return (1);
}
return (0);
}
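/*
 * A few hedged examples (editorial addition) of what dtrace_badname()
 * accepts and rejects:
 *
 *	dtrace_badname("fbt")        == 0	valid name
 *	dtrace_badname("my_probe-1") == 0	digits allowed after the
 *						first character
 *	dtrace_badname("1probe")     == 1	must start with a letter,
 *						'-', '_' or '.'
 *	dtrace_badname("bad name")   == 1	spaces are not permitted
 *
 * NULL and the empty string are, by design, not considered bad names.
 */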
static void
dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)
{
uint32_t priv;
#if defined(sun)
if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
/*
* For DTRACE_PRIV_ALL, the uid and zoneid don't matter.
*/
priv = DTRACE_PRIV_ALL;
} else {
*uidp = crgetuid(cr);
*zoneidp = crgetzoneid(cr);
priv = 0;
if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
priv |= DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER;
else if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE))
priv |= DTRACE_PRIV_USER;
if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE))
priv |= DTRACE_PRIV_PROC;
if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
priv |= DTRACE_PRIV_OWNER;
if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
priv |= DTRACE_PRIV_ZONEOWNER;
}
#else
priv = DTRACE_PRIV_ALL;
#endif
*privp = priv;
}
#ifdef DTRACE_ERRDEBUG
static void
dtrace_errdebug(const char *str)
{
int hval = dtrace_hash_str(str) % DTRACE_ERRHASHSZ;
int occupied = 0;
mutex_enter(&dtrace_errlock);
dtrace_errlast = str;
dtrace_errthread = curthread;
while (occupied++ < DTRACE_ERRHASHSZ) {
if (dtrace_errhash[hval].dter_msg == str) {
dtrace_errhash[hval].dter_count++;
goto out;
}
if (dtrace_errhash[hval].dter_msg != NULL) {
hval = (hval + 1) % DTRACE_ERRHASHSZ;
continue;
}
dtrace_errhash[hval].dter_msg = str;
dtrace_errhash[hval].dter_count = 1;
goto out;
}
panic("dtrace: undersized error hash");
out:
mutex_exit(&dtrace_errlock);
}
#endif
/*
* DTrace Matching Functions
*
* These functions are used to match groups of probes, given some elements of
* a probe tuple, or some globbed expressions for elements of a probe tuple.
*/
static int
dtrace_match_priv(const dtrace_probe_t *prp, uint32_t priv, uid_t uid,
zoneid_t zoneid)
{
if (priv != DTRACE_PRIV_ALL) {
uint32_t ppriv = prp->dtpr_provider->dtpv_priv.dtpp_flags;
uint32_t match = priv & ppriv;
/*
* No PRIV_DTRACE_* privileges...
*/
if ((priv & (DTRACE_PRIV_PROC | DTRACE_PRIV_USER |
DTRACE_PRIV_KERNEL)) == 0)
return (0);
/*
* No matching bits, but there were bits to match...
*/
if (match == 0 && ppriv != 0)
return (0);
/*
* Need to have permissions to the process, but don't...
*/
if (((ppriv & ~match) & DTRACE_PRIV_OWNER) != 0 &&
uid != prp->dtpr_provider->dtpv_priv.dtpp_uid) {
return (0);
}
/*
* Need to be in the same zone unless we possess the
* privilege to examine all zones.
*/
if (((ppriv & ~match) & DTRACE_PRIV_ZONEOWNER) != 0 &&
zoneid != prp->dtpr_provider->dtpv_priv.dtpp_zoneid) {
return (0);
}
}
return (1);
}
/*
* dtrace_match_probe compares a dtrace_probe_t to a pre-compiled key, which
* consists of input pattern strings and an ops-vector to evaluate them.
* This function returns >0 for match, 0 for no match, and <0 for error.
*/
static int
dtrace_match_probe(const dtrace_probe_t *prp, const dtrace_probekey_t *pkp,
uint32_t priv, uid_t uid, zoneid_t zoneid)
{
dtrace_provider_t *pvp = prp->dtpr_provider;
int rv;
if (pvp->dtpv_defunct)
return (0);
if ((rv = pkp->dtpk_pmatch(pvp->dtpv_name, pkp->dtpk_prov, 0)) <= 0)
return (rv);
if ((rv = pkp->dtpk_mmatch(prp->dtpr_mod, pkp->dtpk_mod, 0)) <= 0)
return (rv);
if ((rv = pkp->dtpk_fmatch(prp->dtpr_func, pkp->dtpk_func, 0)) <= 0)
return (rv);
if ((rv = pkp->dtpk_nmatch(prp->dtpr_name, pkp->dtpk_name, 0)) <= 0)
return (rv);
if (dtrace_match_priv(prp, priv, uid, zoneid) == 0)
return (0);
return (rv);
}
/*
* dtrace_match_glob() is a safe kernel implementation of the gmatch(3GEN)
* interface for matching a glob pattern 'p' to an input string 's'. Unlike
* libc's version, the kernel version only applies to 8-bit ASCII strings.
* In addition, all of the recursion cases except for '*' matching have been
* unwound. For '*', we still implement recursive evaluation, but a depth
* counter is maintained and matching is aborted if we recurse too deep.
* The function returns 0 if no match, >0 if match, and <0 if recursion error.
*/
static int
dtrace_match_glob(const char *s, const char *p, int depth)
{
const char *olds;
char s1, c;
int gs;
if (depth > DTRACE_PROBEKEY_MAXDEPTH)
return (-1);
if (s == NULL)
s = ""; /* treat NULL as empty string */
top:
olds = s;
s1 = *s++;
if (p == NULL)
return (0);
if ((c = *p++) == '\0')
return (s1 == '\0');
switch (c) {
case '[': {
int ok = 0, notflag = 0;
char lc = '\0';
if (s1 == '\0')
return (0);
if (*p == '!') {
notflag = 1;
p++;
}
if ((c = *p++) == '\0')
return (0);
do {
if (c == '-' && lc != '\0' && *p != ']') {
if ((c = *p++) == '\0')
return (0);
if (c == '\\' && (c = *p++) == '\0')
return (0);
if (notflag) {
if (s1 < lc || s1 > c)
ok++;
else
return (0);
} else if (lc <= s1 && s1 <= c)
ok++;
} else if (c == '\\' && (c = *p++) == '\0')
return (0);
lc = c; /* save left-hand 'c' for next iteration */
if (notflag) {
if (s1 != c)
ok++;
else
return (0);
} else if (s1 == c)
ok++;
if ((c = *p++) == '\0')
return (0);
} while (c != ']');
if (ok)
goto top;
return (0);
}
case '\\':
if ((c = *p++) == '\0')
return (0);
/*FALLTHRU*/
default:
if (c != s1)
return (0);
/*FALLTHRU*/
case '?':
if (s1 != '\0')
goto top;
return (0);
case '*':
while (*p == '*')
p++; /* consecutive *'s are identical to a single one */
if (*p == '\0')
return (1);
for (s = olds; *s != '\0'; s++) {
if ((gs = dtrace_match_glob(s, p, depth + 1)) != 0)
return (gs);
}
return (0);
}
}
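/*
 * Hedged examples (editorial addition) of the matcher's semantics as
 * described above -- >0 for match, 0 for no match, <0 on recursion error:
 *
 *	dtrace_match_glob("read", "read", 0)      > 0
 *	dtrace_match_glob("readv", "read", 0)    == 0	no implicit '*'
 *	dtrace_match_glob("readv", "read*", 0)    > 0
 *	dtrace_match_glob("read", "r?a[cd]", 0)   > 0	'?' is any char,
 *							'[cd]' one of a set
 *	dtrace_match_glob(NULL, "*", 0)           > 0	NULL treated as ""
 */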
/*ARGSUSED*/
static int
dtrace_match_string(const char *s, const char *p, int depth)
{
return (s != NULL && strcmp(s, p) == 0);
}
/*ARGSUSED*/
static int
dtrace_match_nul(const char *s, const char *p, int depth)
{
return (1); /* always match the empty pattern */
}
/*ARGSUSED*/
static int
dtrace_match_nonzero(const char *s, const char *p, int depth)
{
return (s != NULL && s[0] != '\0');
}
static int
dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *), void *arg)
{
dtrace_probe_t template, *probe;
dtrace_hash_t *hash = NULL;
int len, best = INT_MAX, nmatched = 0;
dtrace_id_t i;
ASSERT(MUTEX_HELD(&dtrace_lock));
/*
* If the probe ID is specified in the key, just lookup by ID and
* invoke the match callback once if a matching probe is found.
*/
if (pkp->dtpk_id != DTRACE_IDNONE) {
if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL &&
dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) {
(void) (*matched)(probe, arg);
nmatched++;
}
return (nmatched);
}
template.dtpr_mod = (char *)pkp->dtpk_mod;
template.dtpr_func = (char *)pkp->dtpk_func;
template.dtpr_name = (char *)pkp->dtpk_name;
/*
* We want to find the most distinct of the module name, function
* name, and name. So for each one that is not a glob pattern or
* empty string, we perform a lookup in the corresponding hash and
* use the hash table with the fewest collisions to do our search.
*/
if (pkp->dtpk_mmatch == &dtrace_match_string &&
(len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) {
best = len;
hash = dtrace_bymod;
}
if (pkp->dtpk_fmatch == &dtrace_match_string &&
(len = dtrace_hash_collisions(dtrace_byfunc, &template)) < best) {
best = len;
hash = dtrace_byfunc;
}
if (pkp->dtpk_nmatch == &dtrace_match_string &&
(len = dtrace_hash_collisions(dtrace_byname, &template)) < best) {
best = len;
hash = dtrace_byname;
}
/*
* If we did not select a hash table, iterate over every probe and
* invoke our callback for each one that matches our input probe key.
*/
if (hash == NULL) {
for (i = 0; i < dtrace_nprobes; i++) {
if ((probe = dtrace_probes[i]) == NULL ||
dtrace_match_probe(probe, pkp, priv, uid,
zoneid) <= 0)
continue;
nmatched++;
if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT)
break;
}
return (nmatched);
}
/*
* If we selected a hash table, iterate over each probe of the same key
* name and invoke the callback for every probe that matches the other
* attributes of our input probe key.
*/
for (probe = dtrace_hash_lookup(hash, &template); probe != NULL;
probe = *(DTRACE_HASHNEXT(hash, probe))) {
if (dtrace_match_probe(probe, pkp, priv, uid, zoneid) <= 0)
continue;
nmatched++;
if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT)
break;
}
return (nmatched);
}
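/*
 * Editorial sketch of the hash selection above, with made-up collision
 * counts: for a description like fbt:kernel:*:entry, the module ("kernel")
 * and name ("entry") keys are plain strings while the function key is a
 * glob, so only dtrace_bymod and dtrace_byname are candidates.  If, say,
 * dtrace_hash_collisions() reports thousands of probes named "entry" but
 * far fewer in module "kernel", dtrace_bymod wins and the remaining key
 * fields are matched only against that (shorter) chain.
 */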
/*
* Return the function pointer dtrace_probecmp() should use to compare the
* specified pattern with a string. For NULL or empty patterns, we select
* dtrace_match_nul(). For glob pattern strings, we use dtrace_match_glob().
* For non-empty non-glob strings, we use dtrace_match_string().
*/
static dtrace_probekey_f *
dtrace_probekey_func(const char *p)
{
char c;
if (p == NULL || *p == '\0')
return (&dtrace_match_nul);
while ((c = *p++) != '\0') {
if (c == '[' || c == '?' || c == '*' || c == '\\')
return (&dtrace_match_glob);
}
return (&dtrace_match_string);
}
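/*
 * Hedged examples (editorial addition):
 *
 *	dtrace_probekey_func(NULL)       == &dtrace_match_nul
 *	dtrace_probekey_func("")         == &dtrace_match_nul
 *	dtrace_probekey_func("read*")    == &dtrace_match_glob
 *	dtrace_probekey_func("malloc")   == &dtrace_match_string
 */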
/*
* Build a probe comparison key for use with dtrace_match_probe() from the
* given probe description. By convention, a null key only matches anchored
* probes: if each field is the empty string, reset dtpk_fmatch to
* dtrace_match_nonzero().
*/
static void
dtrace_probekey(dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp)
{
pkp->dtpk_prov = pdp->dtpd_provider;
pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider);
pkp->dtpk_mod = pdp->dtpd_mod;
pkp->dtpk_mmatch = dtrace_probekey_func(pdp->dtpd_mod);
pkp->dtpk_func = pdp->dtpd_func;
pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func);
pkp->dtpk_name = pdp->dtpd_name;
pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name);
pkp->dtpk_id = pdp->dtpd_id;
if (pkp->dtpk_id == DTRACE_IDNONE &&
pkp->dtpk_pmatch == &dtrace_match_nul &&
pkp->dtpk_mmatch == &dtrace_match_nul &&
pkp->dtpk_fmatch == &dtrace_match_nul &&
pkp->dtpk_nmatch == &dtrace_match_nul)
pkp->dtpk_fmatch = &dtrace_match_nonzero;
}
/*
* DTrace Provider-to-Framework API Functions
*
* These functions implement much of the Provider-to-Framework API, as
* described in <sys/dtrace.h>. The parts of the API not in this section are
* the functions in the API for probe management (found below), and
* dtrace_probe() itself (found above).
*/
/*
* Register the calling provider with the DTrace framework. This should
* generally be called by DTrace providers in their attach(9E) entry point.
*/
int
dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
cred_t *cr, const dtrace_pops_t *pops, void *arg, dtrace_provider_id_t *idp)
{
dtrace_provider_t *provider;
if (name == NULL || pap == NULL || pops == NULL || idp == NULL) {
cmn_err(CE_WARN, "failed to register provider '%s': invalid "
"arguments", name ? name : "<NULL>");
return (EINVAL);
}
if (name[0] == '\0' || dtrace_badname(name)) {
cmn_err(CE_WARN, "failed to register provider '%s': invalid "
"provider name", name);
return (EINVAL);
}
if ((pops->dtps_provide == NULL && pops->dtps_provide_module == NULL) ||
pops->dtps_enable == NULL || pops->dtps_disable == NULL ||
pops->dtps_destroy == NULL ||
((pops->dtps_resume == NULL) != (pops->dtps_suspend == NULL))) {
cmn_err(CE_WARN, "failed to register provider '%s': invalid "
"provider ops", name);
return (EINVAL);
}
if (dtrace_badattr(&pap->dtpa_provider) ||
dtrace_badattr(&pap->dtpa_mod) ||
dtrace_badattr(&pap->dtpa_func) ||
dtrace_badattr(&pap->dtpa_name) ||
dtrace_badattr(&pap->dtpa_args)) {
cmn_err(CE_WARN, "failed to register provider '%s': invalid "
"provider attributes", name);
return (EINVAL);
}
if (priv & ~DTRACE_PRIV_ALL) {
cmn_err(CE_WARN, "failed to register provider '%s': invalid "
"privilege attributes", name);
return (EINVAL);
}
if ((priv & DTRACE_PRIV_KERNEL) &&
(priv & (DTRACE_PRIV_USER | DTRACE_PRIV_OWNER)) &&
pops->dtps_usermode == NULL) {
cmn_err(CE_WARN, "failed to register provider '%s': need "
"dtps_usermode() op for given privilege attributes", name);
return (EINVAL);
}
provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP);
provider->dtpv_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
(void) strcpy(provider->dtpv_name, name);
provider->dtpv_attr = *pap;
provider->dtpv_priv.dtpp_flags = priv;
if (cr != NULL) {
provider->dtpv_priv.dtpp_uid = crgetuid(cr);
provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
}
provider->dtpv_pops = *pops;
if (pops->dtps_provide == NULL) {
ASSERT(pops->dtps_provide_module != NULL);
provider->dtpv_pops.dtps_provide =
(void (*)(void *, dtrace_probedesc_t *))dtrace_nullop;
}
if (pops->dtps_provide_module == NULL) {
ASSERT(pops->dtps_provide != NULL);
provider->dtpv_pops.dtps_provide_module =
(void (*)(void *, modctl_t *))dtrace_nullop;
}
if (pops->dtps_suspend == NULL) {
ASSERT(pops->dtps_resume == NULL);
provider->dtpv_pops.dtps_suspend =
(void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
provider->dtpv_pops.dtps_resume =
(void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
}
provider->dtpv_arg = arg;
*idp = (dtrace_provider_id_t)provider;
if (pops == &dtrace_provider_ops) {
ASSERT(MUTEX_HELD(&dtrace_provider_lock));
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(dtrace_anon.dta_enabling == NULL);
/*
* We make sure that the DTrace provider is at the head of
* the provider chain.
*/
provider->dtpv_next = dtrace_provider;
dtrace_provider = provider;
return (0);
}
mutex_enter(&dtrace_provider_lock);
mutex_enter(&dtrace_lock);
/*
* If there is at least one provider registered, we'll add this
* provider after the first provider.
*/
if (dtrace_provider != NULL) {
provider->dtpv_next = dtrace_provider->dtpv_next;
dtrace_provider->dtpv_next = provider;
} else {
dtrace_provider = provider;
}
if (dtrace_retained != NULL) {
dtrace_enabling_provide(provider);
/*
* Now we need to call dtrace_enabling_matchall() -- which
* will acquire cpu_lock and dtrace_lock. We therefore need
* to drop all of our locks before calling into it...
*/
mutex_exit(&dtrace_lock);
mutex_exit(&dtrace_provider_lock);
dtrace_enabling_matchall();
return (0);
}
mutex_exit(&dtrace_lock);
mutex_exit(&dtrace_provider_lock);
return (0);
}
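/*
 * Hedged sketch (editorial addition) of how a provider would typically
 * register itself from its attach routine.  The "noop_*" names and the
 * attribute structure are hypothetical; only the ops validated above are
 * shown, and a real provider would supply its other ops (argument
 * description and the like) as needed:
 *
 *	static dtrace_pops_t noop_pops = {
 *		.dtps_provide = noop_provide,
 *		.dtps_enable  = noop_enable,
 *		.dtps_disable = noop_disable,
 *		.dtps_destroy = noop_destroy,
 *	};
 *	static dtrace_provider_id_t noop_id;
 *
 *	error = dtrace_register("noop", &noop_attr, DTRACE_PRIV_USER,
 *	    NULL, &noop_pops, NULL, &noop_id);
 *
 * A matching dtrace_unregister(noop_id) belongs in the provider's detach
 * path.
 */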
/*
* Unregister the specified provider from the DTrace framework. This should
* generally be called by DTrace providers in their detach(9E) entry point.
*/
int
dtrace_unregister(dtrace_provider_id_t id)
{
dtrace_provider_t *old = (dtrace_provider_t *)id;
dtrace_provider_t *prev = NULL;
int i, self = 0;
dtrace_probe_t *probe, *first = NULL;
if (old->dtpv_pops.dtps_enable ==
(void (*)(void *, dtrace_id_t, void *))dtrace_nullop) {
/*
* If DTrace itself is the provider, we're called with locks
* already held.
*/
ASSERT(old == dtrace_provider);
#if defined(sun)
ASSERT(dtrace_devi != NULL);
#endif
ASSERT(MUTEX_HELD(&dtrace_provider_lock));
ASSERT(MUTEX_HELD(&dtrace_lock));
self = 1;
if (dtrace_provider->dtpv_next != NULL) {
/*
* There's another provider here; return failure.
*/
return (EBUSY);
}
} else {
mutex_enter(&dtrace_provider_lock);
mutex_enter(&mod_lock);
mutex_enter(&dtrace_lock);
}
/*
* If anyone has /dev/dtrace open, or if there are anonymous enabled
* probes, we refuse to let providers slither away, unless this
* provider has already been explicitly invalidated.
*/
if (!old->dtpv_defunct &&
(dtrace_opens || (dtrace_anon.dta_state != NULL &&
dtrace_anon.dta_state->dts_necbs > 0))) {
if (!self) {
mutex_exit(&dtrace_lock);
mutex_exit(&mod_lock);
mutex_exit(&dtrace_provider_lock);
}
return (EBUSY);
}
/*
* Attempt to destroy the probes associated with this provider.
*/
for (i = 0; i < dtrace_nprobes; i++) {
if ((probe = dtrace_probes[i]) == NULL)
continue;
if (probe->dtpr_provider != old)
continue;
if (probe->dtpr_ecb == NULL)
continue;
/*
* We have at least one ECB; we can't remove this provider.
*/
if (!self) {
mutex_exit(&dtrace_lock);
mutex_exit(&mod_lock);
mutex_exit(&dtrace_provider_lock);
}
return (EBUSY);
}
/*
* All of the probes for this provider are disabled; we can safely
* remove all of them from their hash chains and from the probe array.
*/
for (i = 0; i < dtrace_nprobes; i++) {
if ((probe = dtrace_probes[i]) == NULL)
continue;
if (probe->dtpr_provider != old)
continue;
dtrace_probes[i] = NULL;
dtrace_hash_remove(dtrace_bymod, probe);
dtrace_hash_remove(dtrace_byfunc, probe);
dtrace_hash_remove(dtrace_byname, probe);
if (first == NULL) {
first = probe;
probe->dtpr_nextmod = NULL;
} else {
probe->dtpr_nextmod = first;
first = probe;
}
}
/*
* The provider's probes have been removed from the hash chains and
* from the probe array. Now issue a dtrace_sync() to be sure that
* everyone has cleared out from any probe array processing.
*/
dtrace_sync();
for (probe = first; probe != NULL; probe = first) {
first = probe->dtpr_nextmod;
old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id,
probe->dtpr_arg);
kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
#if defined(sun)
vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
#else
free_unr(dtrace_arena, probe->dtpr_id);
#endif
kmem_free(probe, sizeof (dtrace_probe_t));
}
if ((prev = dtrace_provider) == old) {
#if defined(sun)
ASSERT(self || dtrace_devi == NULL);
ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);
#endif
dtrace_provider = old->dtpv_next;
} else {
while (prev != NULL && prev->dtpv_next != old)
prev = prev->dtpv_next;
if (prev == NULL) {
panic("attempt to unregister non-existent "
"dtrace provider %p\n", (void *)id);
}
prev->dtpv_next = old->dtpv_next;
}
if (!self) {
mutex_exit(&dtrace_lock);
mutex_exit(&mod_lock);
mutex_exit(&dtrace_provider_lock);
}
kmem_free(old->dtpv_name, strlen(old->dtpv_name) + 1);
kmem_free(old, sizeof (dtrace_provider_t));
return (0);
}
/*
* Invalidate the specified provider. All subsequent probe lookups for the
* specified provider will fail, but its probes will not be removed.
*/
void
dtrace_invalidate(dtrace_provider_id_t id)
{
dtrace_provider_t *pvp = (dtrace_provider_t *)id;
ASSERT(pvp->dtpv_pops.dtps_enable !=
(void (*)(void *, dtrace_id_t, void *))dtrace_nullop);
mutex_enter(&dtrace_provider_lock);
mutex_enter(&dtrace_lock);
pvp->dtpv_defunct = 1;
mutex_exit(&dtrace_lock);
mutex_exit(&dtrace_provider_lock);
}
/*
* Indicate whether or not DTrace has attached.
*/
int
dtrace_attached(void)
{
/*
* dtrace_provider will be non-NULL iff the DTrace driver has
* attached. (It's non-NULL because DTrace is always itself a
* provider.)
*/
return (dtrace_provider != NULL);
}
/*
* Remove all the unenabled probes for the given provider. This function is
* not unlike dtrace_unregister(), except that it doesn't remove the provider
* -- just as many of its associated probes as it can.
*/
int
dtrace_condense(dtrace_provider_id_t id)
{
dtrace_provider_t *prov = (dtrace_provider_t *)id;
int i;
dtrace_probe_t *probe;
/*
* Make sure this isn't the dtrace provider itself.
*/
ASSERT(prov->dtpv_pops.dtps_enable !=
(void (*)(void *, dtrace_id_t, void *))dtrace_nullop);
mutex_enter(&dtrace_provider_lock);
mutex_enter(&dtrace_lock);
/*
* Attempt to destroy the probes associated with this provider.
*/
for (i = 0; i < dtrace_nprobes; i++) {
if ((probe = dtrace_probes[i]) == NULL)
continue;
if (probe->dtpr_provider != prov)
continue;
if (probe->dtpr_ecb != NULL)
continue;
dtrace_probes[i] = NULL;
dtrace_hash_remove(dtrace_bymod, probe);
dtrace_hash_remove(dtrace_byfunc, probe);
dtrace_hash_remove(dtrace_byname, probe);
prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, i + 1,
probe->dtpr_arg);
kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
kmem_free(probe, sizeof (dtrace_probe_t));
#if defined(sun)
vmem_free(dtrace_arena, (void *)((uintptr_t)i + 1), 1);
#else
free_unr(dtrace_arena, i + 1);
#endif
}
mutex_exit(&dtrace_lock);
mutex_exit(&dtrace_provider_lock);
return (0);
}
/*
* DTrace Probe Management Functions
*
* The functions in this section perform the DTrace probe management,
* including functions to create probes, look-up probes, and call into the
* providers to request that probes be provided. Some of these functions are
* in the Provider-to-Framework API; these functions can be identified by the
* fact that they are not declared "static".
*/
/*
* Create a probe with the specified module name, function name, and name.
*/
dtrace_id_t
dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,
const char *func, const char *name, int aframes, void *arg)
{
dtrace_probe_t *probe, **probes;
dtrace_provider_t *provider = (dtrace_provider_t *)prov;
dtrace_id_t id;
if (provider == dtrace_provider) {
ASSERT(MUTEX_HELD(&dtrace_lock));
} else {
mutex_enter(&dtrace_lock);
}
#if defined(sun)
id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,
VM_BESTFIT | VM_SLEEP);
#else
id = alloc_unr(dtrace_arena);
#endif
probe = kmem_zalloc(sizeof (dtrace_probe_t), KM_SLEEP);
probe->dtpr_id = id;
probe->dtpr_gen = dtrace_probegen++;
probe->dtpr_mod = dtrace_strdup(mod);
probe->dtpr_func = dtrace_strdup(func);
probe->dtpr_name = dtrace_strdup(name);
probe->dtpr_arg = arg;
probe->dtpr_aframes = aframes;
probe->dtpr_provider = provider;
dtrace_hash_add(dtrace_bymod, probe);
dtrace_hash_add(dtrace_byfunc, probe);
dtrace_hash_add(dtrace_byname, probe);
if (id - 1 >= dtrace_nprobes) {
size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *);
size_t nsize = osize << 1;
if (nsize == 0) {
ASSERT(osize == 0);
ASSERT(dtrace_probes == NULL);
nsize = sizeof (dtrace_probe_t *);
}
probes = kmem_zalloc(nsize, KM_SLEEP);
if (dtrace_probes == NULL) {
ASSERT(osize == 0);
dtrace_probes = probes;
dtrace_nprobes = 1;
} else {
dtrace_probe_t **oprobes = dtrace_probes;
bcopy(oprobes, probes, osize);
dtrace_membar_producer();
dtrace_probes = probes;
dtrace_sync();
/*
* All CPUs are now seeing the new probes array; we can
* safely free the old array.
*/
kmem_free(oprobes, osize);
dtrace_nprobes <<= 1;
}
ASSERT(id - 1 < dtrace_nprobes);
}
ASSERT(dtrace_probes[id - 1] == NULL);
dtrace_probes[id - 1] = probe;
if (provider != dtrace_provider)
mutex_exit(&dtrace_lock);
return (id);
}
static dtrace_probe_t *
dtrace_probe_lookup_id(dtrace_id_t id)
{
ASSERT(MUTEX_HELD(&dtrace_lock));
if (id == 0 || id > dtrace_nprobes)
return (NULL);
return (dtrace_probes[id - 1]);
}
static int
dtrace_probe_lookup_match(dtrace_probe_t *probe, void *arg)
{
*((dtrace_id_t *)arg) = probe->dtpr_id;
return (DTRACE_MATCH_DONE);
}
/*
* Look up a probe based on provider and one or more of module name, function
* name and probe name.
*/
dtrace_id_t
dtrace_probe_lookup(dtrace_provider_id_t prid, char *mod,
char *func, char *name)
{
dtrace_probekey_t pkey;
dtrace_id_t id;
int match;
pkey.dtpk_prov = ((dtrace_provider_t *)prid)->dtpv_name;
pkey.dtpk_pmatch = &dtrace_match_string;
pkey.dtpk_mod = mod;
pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
pkey.dtpk_func = func;
pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
pkey.dtpk_name = name;
pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
pkey.dtpk_id = DTRACE_IDNONE;
mutex_enter(&dtrace_lock);
match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0,
dtrace_probe_lookup_match, &id);
mutex_exit(&dtrace_lock);
ASSERT(match == 1 || match == 0);
return (match ? id : 0);
}
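/*
 * Editor's note: a minimal sketch (not part of the original source) of how a
 * provider might use the two Provider-to-Framework calls above from its
 * provide entry point.  The provider id myprov_id, the function name and the
 * probe strings are hypothetical; only dtrace_probe_lookup() and
 * dtrace_probe_create() are real framework calls.
 */
#if 0
static void
myprov_provide(void *arg, dtrace_probedesc_t *desc)
{
        /*
         * Avoid duplicates: dtrace_probe_lookup() returns a non-zero id if a
         * probe with this identity already exists.
         */
        if (dtrace_probe_lookup(myprov_id, "mymod", "myfunc", "entry") != 0)
                return;

        /*
         * The returned id is handed back to the provider later, e.g. in its
         * dtps_enable() and dtps_destroy() callbacks.
         */
        (void) dtrace_probe_create(myprov_id, "mymod", "myfunc", "entry",
            0 /* aframes */, NULL /* provider-private argument */);
}
#endif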
/*
* Returns the probe argument associated with the specified probe.
*/
void *
dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t pid)
{
dtrace_probe_t *probe;
void *rval = NULL;
mutex_enter(&dtrace_lock);
if ((probe = dtrace_probe_lookup_id(pid)) != NULL &&
probe->dtpr_provider == (dtrace_provider_t *)id)
rval = probe->dtpr_arg;
mutex_exit(&dtrace_lock);
return (rval);
}
/*
* Copy a probe into a probe description.
*/
static void
dtrace_probe_description(const dtrace_probe_t *prp, dtrace_probedesc_t *pdp)
{
bzero(pdp, sizeof (dtrace_probedesc_t));
pdp->dtpd_id = prp->dtpr_id;
(void) strncpy(pdp->dtpd_provider,
prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN - 1);
(void) strncpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN - 1);
(void) strncpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN - 1);
(void) strncpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN - 1);
}
#if !defined(sun)
static int
dtrace_probe_provide_cb(linker_file_t lf, void *arg)
{
dtrace_provider_t *prv = (dtrace_provider_t *) arg;
prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, lf);
return(0);
}
#endif
/*
* Called to indicate that a probe -- or probes -- should be provided by a
* specified provider. If the specified description is NULL, the provider will
* be told to provide all of its probes. (This is done whenever a new
* consumer comes along, or whenever a retained enabling is to be matched.) If
* the specified description is non-NULL, the provider is given the
* opportunity to dynamically provide the specified probe, allowing providers
* to support the creation of probes on-the-fly. (So-called _autocreated_
* probes.) If the provider is NULL, the operations will be applied to all
* providers; if the provider is non-NULL the operations will only be applied
* to the specified provider. The dtrace_provider_lock must be held, and the
* dtrace_lock must _not_ be held -- the provider's dtps_provide() operation
* will need to grab the dtrace_lock when it reenters the framework through
* dtrace_probe_lookup(), dtrace_probe_create(), etc.
*/
static void
dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
{
#if defined(sun)
modctl_t *ctl;
#endif
int all = 0;
ASSERT(MUTEX_HELD(&dtrace_provider_lock));
if (prv == NULL) {
all = 1;
prv = dtrace_provider;
}
do {
/*
* First, call the blanket provide operation.
*/
prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);
/*
* Now call the per-module provide operation. We will grab
* mod_lock to prevent the list from being modified. Note
* that this also prevents the mod_busy bits from changing.
* (mod_busy can only be changed with mod_lock held.)
*/
mutex_enter(&mod_lock);
#if defined(sun)
ctl = &modules;
do {
if (ctl->mod_busy || ctl->mod_mp == NULL)
continue;
prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
} while ((ctl = ctl->mod_next) != &modules);
#else
(void) linker_file_foreach(dtrace_probe_provide_cb, prv);
#endif
mutex_exit(&mod_lock);
} while (all && (prv = prv->dtpv_next) != NULL);
}
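/*
 * Editor's note: a short sketch (not part of the original source) of the
 * locking discipline documented above: callers of dtrace_probe_provide()
 * hold dtrace_provider_lock but not dtrace_lock, since providers may
 * re-enter the framework (dtrace_probe_lookup(), dtrace_probe_create())
 * while providing.
 */
#if 0
        mutex_enter(&dtrace_provider_lock);
        /* dtrace_lock must not be held here */
        dtrace_probe_provide(NULL, NULL);       /* all probes, all providers */
        mutex_exit(&dtrace_provider_lock);
#endif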
#if defined(sun)
/*
* Iterate over each probe, and call the Framework-to-Provider API function
* denoted by offs.
*/
static void
dtrace_probe_foreach(uintptr_t offs)
{
dtrace_provider_t *prov;
void (*func)(void *, dtrace_id_t, void *);
dtrace_probe_t *probe;
dtrace_icookie_t cookie;
int i;
/*
* We disable interrupts to walk through the probe array. This is
* safe -- the dtrace_sync() in dtrace_unregister() assures that we
* won't see stale data.
*/
cookie = dtrace_interrupt_disable();
for (i = 0; i < dtrace_nprobes; i++) {
if ((probe = dtrace_probes[i]) == NULL)
continue;
if (probe->dtpr_ecb == NULL) {
/*
* This probe isn't enabled -- don't call the function.
*/
continue;
}
prov = probe->dtpr_provider;
func = *((void(**)(void *, dtrace_id_t, void *))
((uintptr_t)&prov->dtpv_pops + offs));
func(prov->dtpv_arg, i + 1, probe->dtpr_arg);
}
dtrace_interrupt_enable(cookie);
}
#endif
static int
dtrace_probe_enable(dtrace_probedesc_t *desc, dtrace_enabling_t *enab)
{
dtrace_probekey_t pkey;
uint32_t priv;
uid_t uid;
zoneid_t zoneid;
ASSERT(MUTEX_HELD(&dtrace_lock));
dtrace_ecb_create_cache = NULL;
if (desc == NULL) {
/*
* If we're passed a NULL description, we're being asked to
* create an ECB with a NULL probe.
*/
(void) dtrace_ecb_create_enable(NULL, enab);
return (0);
}
dtrace_probekey(desc, &pkey);
dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
&priv, &uid, &zoneid);
return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable,
enab));
}
/*
* DTrace Helper Provider Functions
*/
static void
dtrace_dofattr2attr(dtrace_attribute_t *attr, const dof_attr_t dofattr)
{
attr->dtat_name = DOF_ATTR_NAME(dofattr);
attr->dtat_data = DOF_ATTR_DATA(dofattr);
attr->dtat_class = DOF_ATTR_CLASS(dofattr);
}
static void
dtrace_dofprov2hprov(dtrace_helper_provdesc_t *hprov,
const dof_provider_t *dofprov, char *strtab)
{
hprov->dthpv_provname = strtab + dofprov->dofpv_name;
dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_provider,
dofprov->dofpv_provattr);
dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_mod,
dofprov->dofpv_modattr);
dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_func,
dofprov->dofpv_funcattr);
dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_name,
dofprov->dofpv_nameattr);
dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_args,
dofprov->dofpv_argsattr);
}
static void
dtrace_helper_provide_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
{
uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
dof_hdr_t *dof = (dof_hdr_t *)daddr;
dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
dof_provider_t *provider;
dof_probe_t *probe;
uint32_t *off, *enoff;
uint8_t *arg;
char *strtab;
uint_t i, nprobes;
dtrace_helper_provdesc_t dhpv;
dtrace_helper_probedesc_t dhpb;
dtrace_meta_t *meta = dtrace_meta_pid;
dtrace_mops_t *mops = &meta->dtm_mops;
void *parg;
provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
provider->dofpv_strtab * dof->dofh_secsize);
prb_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
provider->dofpv_probes * dof->dofh_secsize);
arg_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
provider->dofpv_prargs * dof->dofh_secsize);
off_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
provider->dofpv_proffs * dof->dofh_secsize);
strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
off = (uint32_t *)(uintptr_t)(daddr + off_sec->dofs_offset);
arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
enoff = NULL;
/*
* See dtrace_helper_provider_validate().
*/
if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
provider->dofpv_prenoffs != DOF_SECT_NONE) {
enoff_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
provider->dofpv_prenoffs * dof->dofh_secsize);
enoff = (uint32_t *)(uintptr_t)(daddr + enoff_sec->dofs_offset);
}
nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
/*
* Create the provider.
*/
dtrace_dofprov2hprov(&dhpv, provider, strtab);
if ((parg = mops->dtms_provide_pid(meta->dtm_arg, &dhpv, pid)) == NULL)
return;
meta->dtm_count++;
/*
* Create the probes.
*/
for (i = 0; i < nprobes; i++) {
probe = (dof_probe_t *)(uintptr_t)(daddr +
prb_sec->dofs_offset + i * prb_sec->dofs_entsize);
dhpb.dthpb_mod = dhp->dofhp_mod;
dhpb.dthpb_func = strtab + probe->dofpr_func;
dhpb.dthpb_name = strtab + probe->dofpr_name;
dhpb.dthpb_base = probe->dofpr_addr;
dhpb.dthpb_offs = off + probe->dofpr_offidx;
dhpb.dthpb_noffs = probe->dofpr_noffs;
if (enoff != NULL) {
dhpb.dthpb_enoffs = enoff + probe->dofpr_enoffidx;
dhpb.dthpb_nenoffs = probe->dofpr_nenoffs;
} else {
dhpb.dthpb_enoffs = NULL;
dhpb.dthpb_nenoffs = 0;
}
dhpb.dthpb_args = arg + probe->dofpr_argidx;
dhpb.dthpb_nargc = probe->dofpr_nargc;
dhpb.dthpb_xargc = probe->dofpr_xargc;
dhpb.dthpb_ntypes = strtab + probe->dofpr_nargv;
dhpb.dthpb_xtypes = strtab + probe->dofpr_xargv;
mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb);
}
}
static void
dtrace_helper_provide(dof_helper_t *dhp, pid_t pid)
{
uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
dof_hdr_t *dof = (dof_hdr_t *)daddr;
int i;
ASSERT(MUTEX_HELD(&dtrace_meta_lock));
for (i = 0; i < dof->dofh_secnum; i++) {
dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
dof->dofh_secoff + i * dof->dofh_secsize);
if (sec->dofs_type != DOF_SECT_PROVIDER)
continue;
dtrace_helper_provide_one(dhp, sec, pid);
}
/*
* We may have just created probes, so we must now rematch against
* any retained enablings. Note that this call will acquire both
* cpu_lock and dtrace_lock; the fact that we are holding
* dtrace_meta_lock now is what defines the ordering with respect to
* these three locks.
*/
dtrace_enabling_matchall();
}
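/*
 * Editor's note: a small helper-style sketch (not part of the original
 * source) of the DOF section arithmetic used by the loop above: the i-th
 * section header sits at a fixed stride from dofh_secoff.  The helper name
 * is hypothetical.
 */
#if 0
static dof_sec_t *
dof_sec_by_index(dof_hdr_t *dof, uint_t i)
{
        uintptr_t daddr = (uintptr_t)dof;

        return ((dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
            i * dof->dofh_secsize));
}
#endif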
static void
dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
{
uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
dof_hdr_t *dof = (dof_hdr_t *)daddr;
dof_sec_t *str_sec;
dof_provider_t *provider;
char *strtab;
dtrace_helper_provdesc_t dhpv;
dtrace_meta_t *meta = dtrace_meta_pid;
dtrace_mops_t *mops = &meta->dtm_mops;
provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
provider->dofpv_strtab * dof->dofh_secsize);
strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
/*
* Create the provider.
*/
dtrace_dofprov2hprov(&dhpv, provider, strtab);
mops->dtms_remove_pid(meta->dtm_arg, &dhpv, pid);
meta->dtm_count--;
}
static void
dtrace_helper_provider_remove(dof_helper_t *dhp, pid_t pid)
{
uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
dof_hdr_t *dof = (dof_hdr_t *)daddr;
int i;
ASSERT(MUTEX_HELD(&dtrace_meta_lock));
for (i = 0; i < dof->dofh_secnum; i++) {
dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
dof->dofh_secoff + i * dof->dofh_secsize);
if (sec->dofs_type != DOF_SECT_PROVIDER)
continue;
dtrace_helper_provider_remove_one(dhp, sec, pid);
}
}
/*
* DTrace Meta Provider-to-Framework API Functions
*
* These functions implement the Meta Provider-to-Framework API, as described
* in <sys/dtrace.h>.
*/
int
dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg,
dtrace_meta_provider_id_t *idp)
{
dtrace_meta_t *meta;
dtrace_helpers_t *help, *next;
int i;
*idp = DTRACE_METAPROVNONE;
/*
* We strictly don't need the name, but we hold onto it for
* debuggability. All hail error queues!
*/
if (name == NULL) {
cmn_err(CE_WARN, "failed to register meta-provider: "
"invalid name");
return (EINVAL);
}
if (mops == NULL ||
mops->dtms_create_probe == NULL ||
mops->dtms_provide_pid == NULL ||
mops->dtms_remove_pid == NULL) {
cmn_err(CE_WARN, "failed to register meta-register %s: "
"invalid ops", name);
return (EINVAL);
}
meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);
meta->dtm_mops = *mops;
meta->dtm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
(void) strcpy(meta->dtm_name, name);
meta->dtm_arg = arg;
mutex_enter(&dtrace_meta_lock);
mutex_enter(&dtrace_lock);
if (dtrace_meta_pid != NULL) {
mutex_exit(&dtrace_lock);
mutex_exit(&dtrace_meta_lock);
cmn_err(CE_WARN, "failed to register meta-register %s: "
"user-land meta-provider exists", name);
kmem_free(meta->dtm_name, strlen(meta->dtm_name) + 1);
kmem_free(meta, sizeof (dtrace_meta_t));
return (EINVAL);
}
dtrace_meta_pid = meta;
*idp = (dtrace_meta_provider_id_t)meta;
/*
* If there are providers and probes ready to go, pass them
* off to the new meta provider now.
*/
help = dtrace_deferred_pid;
dtrace_deferred_pid = NULL;
mutex_exit(&dtrace_lock);
while (help != NULL) {
for (i = 0; i < help->dthps_nprovs; i++) {
dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
help->dthps_pid);
}
next = help->dthps_next;
help->dthps_next = NULL;
help->dthps_prev = NULL;
help->dthps_deferred = 0;
help = next;
}
mutex_exit(&dtrace_meta_lock);
return (0);
}
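/*
 * Editor's note: a hedged sketch (not part of the original source) of how a
 * meta-provider might register itself.  The mymeta_* names are hypothetical;
 * the callback shapes follow the call sites in dtrace_helper_provide_one()
 * and dtrace_helper_provider_remove_one(), and all three ops must be
 * non-NULL or registration fails with EINVAL.
 */
#if 0
static void *mymeta_provide_pid(void *, dtrace_helper_provdesc_t *, pid_t);
static void mymeta_remove_pid(void *, dtrace_helper_provdesc_t *, pid_t);
static void mymeta_create_probe(void *, void *, dtrace_helper_probedesc_t *);

static dtrace_mops_t mymeta_mops = {
        .dtms_create_probe = mymeta_create_probe,
        .dtms_provide_pid = mymeta_provide_pid,
        .dtms_remove_pid = mymeta_remove_pid,
};

static dtrace_meta_provider_id_t mymeta_id;

static int
mymeta_attach(void)
{
        return (dtrace_meta_register("mymeta", &mymeta_mops, NULL,
            &mymeta_id));
}
#endif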
int
dtrace_meta_unregister(dtrace_meta_provider_id_t id)
{
dtrace_meta_t **pp, *old = (dtrace_meta_t *)id;
mutex_enter(&dtrace_meta_lock);
mutex_enter(&dtrace_lock);
if (old == dtrace_meta_pid) {
pp = &dtrace_meta_pid;
} else {
panic("attempt to unregister non-existent "
"dtrace meta-provider %p\n", (void *)old);
}
if (old->dtm_count != 0) {
mutex_exit(&dtrace_lock);
mutex_exit(&dtrace_meta_lock);
return (EBUSY);
}
*pp = NULL;
mutex_exit(&dtrace_lock);
mutex_exit(&dtrace_meta_lock);
kmem_free(old->dtm_name, strlen(old->dtm_name) + 1);
kmem_free(old, sizeof (dtrace_meta_t));
return (0);
}
/*
* DTrace DIF Object Functions
*/
static int
dtrace_difo_err(uint_t pc, const char *format, ...)
{
if (dtrace_err_verbose) {
va_list alist;
(void) uprintf("dtrace DIF object error: [%u]: ", pc);
va_start(alist, format);
(void) vuprintf(format, alist);
va_end(alist);
}
#ifdef DTRACE_ERRDEBUG
dtrace_errdebug(format);
#endif
return (1);
}
/*
* Validate a DTrace DIF object by checking the IR instructions. The following
* rules are currently enforced by dtrace_difo_validate():
*
* 1. Each instruction must have a valid opcode
* 2. Each register, string, variable, or subroutine reference must be valid
* 3. No instruction can modify register %r0 (must be zero)
* 4. All instruction reserved bits must be set to zero
* 5. The last instruction must be a "ret" instruction
* 6. All branch targets must reference a valid instruction _after_ the branch
*/
static int
dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
cred_t *cr)
{
int err = 0, i;
int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
int kcheckload;
uint_t pc;
kcheckload = cr == NULL ||
(vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0;
dp->dtdo_destructive = 0;
for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
dif_instr_t instr = dp->dtdo_buf[pc];
uint_t r1 = DIF_INSTR_R1(instr);
uint_t r2 = DIF_INSTR_R2(instr);
uint_t rd = DIF_INSTR_RD(instr);
uint_t rs = DIF_INSTR_RS(instr);
uint_t label = DIF_INSTR_LABEL(instr);
uint_t v = DIF_INSTR_VAR(instr);
uint_t subr = DIF_INSTR_SUBR(instr);
uint_t type = DIF_INSTR_TYPE(instr);
uint_t op = DIF_INSTR_OP(instr);
switch (op) {
case DIF_OP_OR:
case DIF_OP_XOR:
case DIF_OP_AND:
case DIF_OP_SLL:
case DIF_OP_SRL:
case DIF_OP_SRA:
case DIF_OP_SUB:
case DIF_OP_ADD:
case DIF_OP_MUL:
case DIF_OP_SDIV:
case DIF_OP_UDIV:
case DIF_OP_SREM:
case DIF_OP_UREM:
case DIF_OP_COPYS:
if (r1 >= nregs)
err += efunc(pc, "invalid register %u\n", r1);
if (r2 >= nregs)
err += efunc(pc, "invalid register %u\n", r2);
if (rd >= nregs)
err += efunc(pc, "invalid register %u\n", rd);
if (rd == 0)
err += efunc(pc, "cannot write to %r0\n");
break;
case DIF_OP_NOT:
case DIF_OP_MOV:
case DIF_OP_ALLOCS:
if (r1 >= nregs)
err += efunc(pc, "invalid register %u\n", r1);
if (r2 != 0)
err += efunc(pc, "non-zero reserved bits\n");
if (rd >= nregs)
err += efunc(pc, "invalid register %u\n", rd);
if (rd == 0)
err += efunc(pc, "cannot write to %r0\n");
break;
case DIF_OP_LDSB:
case DIF_OP_LDSH:
case DIF_OP_LDSW:
case DIF_OP_LDUB:
case DIF_OP_LDUH:
case DIF_OP_LDUW:
case DIF_OP_LDX:
if (r1 >= nregs)
err += efunc(pc, "invalid register %u\n", r1);
if (r2 != 0)
err += efunc(pc, "non-zero reserved bits\n");
if (rd >= nregs)
err += efunc(pc, "invalid register %u\n", rd);
if (rd == 0)
err += efunc(pc, "cannot write to %r0\n");
if (kcheckload)
dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op +
DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd);
break;
case DIF_OP_RLDSB:
case DIF_OP_RLDSH:
case DIF_OP_RLDSW:
case DIF_OP_RLDUB:
case DIF_OP_RLDUH:
case DIF_OP_RLDUW:
case DIF_OP_RLDX:
if (r1 >= nregs)
err += efunc(pc, "invalid register %u\n", r1);
if (r2 != 0)
err += efunc(pc, "non-zero reserved bits\n");
if (rd >= nregs)
err += efunc(pc, "invalid register %u\n", rd);
if (rd == 0)
err += efunc(pc, "cannot write to %r0\n");
break;
case DIF_OP_ULDSB:
case DIF_OP_ULDSH:
case DIF_OP_ULDSW:
case DIF_OP_ULDUB:
case DIF_OP_ULDUH:
case DIF_OP_ULDUW:
case DIF_OP_ULDX:
if (r1 >= nregs)
err += efunc(pc, "invalid register %u\n", r1);
if (r2 != 0)
err += efunc(pc, "non-zero reserved bits\n");
if (rd >= nregs)
err += efunc(pc, "invalid register %u\n", rd);
if (rd == 0)
err += efunc(pc, "cannot write to %r0\n");
break;
case DIF_OP_STB:
case DIF_OP_STH:
case DIF_OP_STW:
case DIF_OP_STX:
if (r1 >= nregs)
err += efunc(pc, "invalid register %u\n", r1);
if (r2 != 0)
err += efunc(pc, "non-zero reserved bits\n");
if (rd >= nregs)
err += efunc(pc, "invalid register %u\n", rd);
if (rd == 0)
err += efunc(pc, "cannot write to 0 address\n");
break;
case DIF_OP_CMP:
case DIF_OP_SCMP:
if (r1 >= nregs)
err += efunc(pc, "invalid register %u\n", r1);
if (r2 >= nregs)
err += efunc(pc, "invalid register %u\n", r2);
if (rd != 0)
err += efunc(pc, "non-zero reserved bits\n");
break;
case DIF_OP_TST:
if (r1 >= nregs)
err += efunc(pc, "invalid register %u\n", r1);
if (r2 != 0 || rd != 0)
err += efunc(pc, "non-zero reserved bits\n");
break;
case DIF_OP_BA:
case DIF_OP_BE:
case DIF_OP_BNE:
case DIF_OP_BG:
case DIF_OP_BGU:
case DIF_OP_BGE:
case DIF_OP_BGEU:
case DIF_OP_BL:
case DIF_OP_BLU:
case DIF_OP_BLE:
case DIF_OP_BLEU:
if (label >= dp->dtdo_len) {
err += efunc(pc, "invalid branch target %u\n",
label);
}
if (label <= pc) {
err += efunc(pc, "backward branch to %u\n",
label);
}
break;
case DIF_OP_RET:
if (r1 != 0 || r2 != 0)
err += efunc(pc, "non-zero reserved bits\n");
if (rd >= nregs)
err += efunc(pc, "invalid register %u\n", rd);
break;
case DIF_OP_NOP:
case DIF_OP_POPTS:
case DIF_OP_FLUSHTS:
if (r1 != 0 || r2 != 0 || rd != 0)
err += efunc(pc, "non-zero reserved bits\n");
break;
case DIF_OP_SETX:
if (DIF_INSTR_INTEGER(instr) >= dp->dtdo_intlen) {
err += efunc(pc, "invalid integer ref %u\n",
DIF_INSTR_INTEGER(instr));
}
if (rd >= nregs)
err += efunc(pc, "invalid register %u\n", rd);
if (rd == 0)
err += efunc(pc, "cannot write to %r0\n");
break;
case DIF_OP_SETS:
if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {
err += efunc(pc, "invalid string ref %u\n",
DIF_INSTR_STRING(instr));
}
if (rd >= nregs)
err += efunc(pc, "invalid register %u\n", rd);
if (rd == 0)
err += efunc(pc, "cannot write to %r0\n");
break;
case DIF_OP_LDGA:
case DIF_OP_LDTA:
if (r1 > DIF_VAR_ARRAY_MAX)
err += efunc(pc, "invalid array %u\n", r1);
if (r2 >= nregs)
err += efunc(pc, "invalid register %u\n", r2);
if (rd >= nregs)
err += efunc(pc, "invalid register %u\n", rd);
if (rd == 0)
err += efunc(pc, "cannot write to %r0\n");
break;
case DIF_OP_LDGS:
case DIF_OP_LDTS:
case DIF_OP_LDLS:
case DIF_OP_LDGAA:
case DIF_OP_LDTAA:
if (v < DIF_VAR_OTHER_MIN || v > DIF_VAR_OTHER_MAX)
err += efunc(pc, "invalid variable %u\n", v);
if (rd >= nregs)
err += efunc(pc, "invalid register %u\n", rd);
if (rd == 0)
err += efunc(pc, "cannot write to %r0\n");
break;
case DIF_OP_STGS:
case DIF_OP_STTS:
case DIF_OP_STLS:
case DIF_OP_STGAA:
case DIF_OP_STTAA:
if (v < DIF_VAR_OTHER_UBASE || v > DIF_VAR_OTHER_MAX)
err += efunc(pc, "invalid variable %u\n", v);
if (rs >= nregs)
err += efunc(pc, "invalid register %u\n", rd);
break;
case DIF_OP_CALL:
if (subr > DIF_SUBR_MAX)
err += efunc(pc, "invalid subr %u\n", subr);
if (rd >= nregs)
err += efunc(pc, "invalid register %u\n", rd);
if (rd == 0)
err += efunc(pc, "cannot write to %r0\n");
if (subr == DIF_SUBR_COPYOUT ||
subr == DIF_SUBR_COPYOUTSTR) {
dp->dtdo_destructive = 1;
}
break;
case DIF_OP_PUSHTR:
if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
err += efunc(pc, "invalid ref type %u\n", type);
if (r2 >= nregs)
err += efunc(pc, "invalid register %u\n", r2);
if (rs >= nregs)
err += efunc(pc, "invalid register %u\n", rs);
break;
case DIF_OP_PUSHTV:
if (type != DIF_TYPE_CTF)
err += efunc(pc, "invalid val type %u\n", type);
if (r2 >= nregs)
err += efunc(pc, "invalid register %u\n", r2);
if (rs >= nregs)
err += efunc(pc, "invalid register %u\n", rs);
break;
default:
err += efunc(pc, "invalid opcode %u\n",
DIF_INSTR_OP(instr));
}
}
if (dp->dtdo_len != 0 &&
DIF_INSTR_OP(dp->dtdo_buf[dp->dtdo_len - 1]) != DIF_OP_RET) {
err += efunc(dp->dtdo_len - 1,
"expected 'ret' as last DIF instruction\n");
}
if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF)) {
/*
* If we're not returning by reference, the size must be either
* 0 or the size of one of the base types.
*/
switch (dp->dtdo_rtype.dtdt_size) {
case 0:
case sizeof (uint8_t):
case sizeof (uint16_t):
case sizeof (uint32_t):
case sizeof (uint64_t):
break;
default:
err += efunc(dp->dtdo_len - 1, "bad return size");
}
}
for (i = 0; i < dp->dtdo_varlen && err == 0; i++) {
dtrace_difv_t *v = &dp->dtdo_vartab[i], *existing = NULL;
dtrace_diftype_t *vt, *et;
uint_t id, ndx;
if (v->dtdv_scope != DIFV_SCOPE_GLOBAL &&
v->dtdv_scope != DIFV_SCOPE_THREAD &&
v->dtdv_scope != DIFV_SCOPE_LOCAL) {
err += efunc(i, "unrecognized variable scope %d\n",
v->dtdv_scope);
break;
}
if (v->dtdv_kind != DIFV_KIND_ARRAY &&
v->dtdv_kind != DIFV_KIND_SCALAR) {
err += efunc(i, "unrecognized variable type %d\n",
v->dtdv_kind);
break;
}
if ((id = v->dtdv_id) > DIF_VARIABLE_MAX) {
err += efunc(i, "%d exceeds variable id limit\n", id);
break;
}
if (id < DIF_VAR_OTHER_UBASE)
continue;
/*
* For user-defined variables, we need to check that this
* definition is identical to any previous definition that we
* encountered.
*/
ndx = id - DIF_VAR_OTHER_UBASE;
switch (v->dtdv_scope) {
case DIFV_SCOPE_GLOBAL:
if (ndx < vstate->dtvs_nglobals) {
dtrace_statvar_t *svar;
if ((svar = vstate->dtvs_globals[ndx]) != NULL)
existing = &svar->dtsv_var;
}
break;
case DIFV_SCOPE_THREAD:
if (ndx < vstate->dtvs_ntlocals)
existing = &vstate->dtvs_tlocals[ndx];
break;
case DIFV_SCOPE_LOCAL:
if (ndx < vstate->dtvs_nlocals) {
dtrace_statvar_t *svar;
if ((svar = vstate->dtvs_locals[ndx]) != NULL)
existing = &svar->dtsv_var;
}
break;
}
vt = &v->dtdv_type;
if (vt->dtdt_flags & DIF_TF_BYREF) {
if (vt->dtdt_size == 0) {
err += efunc(i, "zero-sized variable\n");
break;
}
if (v->dtdv_scope == DIFV_SCOPE_GLOBAL &&
vt->dtdt_size > dtrace_global_maxsize) {
err += efunc(i, "oversized by-ref global\n");
break;
}
}
if (existing == NULL || existing->dtdv_id == 0)
continue;
ASSERT(existing->dtdv_id == v->dtdv_id);
ASSERT(existing->dtdv_scope == v->dtdv_scope);
if (existing->dtdv_kind != v->dtdv_kind)
err += efunc(i, "%d changed variable kind\n", id);
et = &existing->dtdv_type;
if (vt->dtdt_flags != et->dtdt_flags) {
err += efunc(i, "%d changed variable type flags\n", id);
break;
}
if (vt->dtdt_size != 0 && vt->dtdt_size != et->dtdt_size) {
err += efunc(i, "%d changed variable type size\n", id);
break;
}
}
return (err);
}
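/*
 * Editor's note: a minimal sketch (not part of the original source) showing
 * how an individual DIF instruction is decoded into the fields checked
 * above.  The helper name is hypothetical; the DIF_INSTR_*() accessors are
 * the same ones used by dtrace_difo_validate().
 */
#if 0
static int
dif_alu_regs_ok(dif_instr_t instr, uint_t nregs)
{
        uint_t r1 = DIF_INSTR_R1(instr);
        uint_t r2 = DIF_INSTR_R2(instr);
        uint_t rd = DIF_INSTR_RD(instr);

        /*
         * Rules 2 and 3 for a simple ALU opcode such as DIF_OP_ADD: all
         * registers must be in range and %r0 must not be written.
         */
        return (r1 < nregs && r2 < nregs && rd < nregs && rd != 0);
}
#endif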
/*
* Validate a DTrace DIF object that is to be used as a helper. Helpers
* are much more constrained than normal DIFOs. Specifically, they may
* not:
*
* 1. Make calls to subroutines other than copyin(), copyinstr() or
* miscellaneous string routines
* 2. Access DTrace variables other than the args[] array, and the
* curthread, pid, ppid, tid, execname, zonename, uid and gid variables.
* 3. Have thread-local variables.
* 4. Have dynamic variables.
*/
static int
dtrace_difo_validate_helper(dtrace_difo_t *dp)
{
int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
int err = 0;
uint_t pc;
for (pc = 0; pc < dp->dtdo_len; pc++) {
dif_instr_t instr = dp->dtdo_buf[pc];
uint_t v = DIF_INSTR_VAR(instr);
uint_t subr = DIF_INSTR_SUBR(instr);
uint_t op = DIF_INSTR_OP(instr);
switch (op) {
case DIF_OP_OR:
case DIF_OP_XOR:
case DIF_OP_AND:
case DIF_OP_SLL:
case DIF_OP_SRL:
case DIF_OP_SRA:
case DIF_OP_SUB:
case DIF_OP_ADD:
case DIF_OP_MUL:
case DIF_OP_SDIV:
case DIF_OP_UDIV:
case DIF_OP_SREM:
case DIF_OP_UREM:
case DIF_OP_COPYS:
case DIF_OP_NOT:
case DIF_OP_MOV:
case DIF_OP_RLDSB:
case DIF_OP_RLDSH:
case DIF_OP_RLDSW:
case DIF_OP_RLDUB:
case DIF_OP_RLDUH:
case DIF_OP_RLDUW:
case DIF_OP_RLDX:
case DIF_OP_ULDSB:
case DIF_OP_ULDSH:
case DIF_OP_ULDSW:
case DIF_OP_ULDUB:
case DIF_OP_ULDUH:
case DIF_OP_ULDUW:
case DIF_OP_ULDX:
case DIF_OP_STB:
case DIF_OP_STH:
case DIF_OP_STW:
case DIF_OP_STX:
case DIF_OP_ALLOCS:
case DIF_OP_CMP:
case DIF_OP_SCMP:
case DIF_OP_TST:
case DIF_OP_BA:
case DIF_OP_BE:
case DIF_OP_BNE:
case DIF_OP_BG:
case DIF_OP_BGU:
case DIF_OP_BGE:
case DIF_OP_BGEU:
case DIF_OP_BL:
case DIF_OP_BLU:
case DIF_OP_BLE:
case DIF_OP_BLEU:
case DIF_OP_RET:
case DIF_OP_NOP:
case DIF_OP_POPTS:
case DIF_OP_FLUSHTS:
case DIF_OP_SETX:
case DIF_OP_SETS:
case DIF_OP_LDGA:
case DIF_OP_LDLS:
case DIF_OP_STGS:
case DIF_OP_STLS:
case DIF_OP_PUSHTR:
case DIF_OP_PUSHTV:
break;
case DIF_OP_LDGS:
if (v >= DIF_VAR_OTHER_UBASE)
break;
if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9)
break;
if (v == DIF_VAR_CURTHREAD || v == DIF_VAR_PID ||
v == DIF_VAR_PPID || v == DIF_VAR_TID ||
v == DIF_VAR_EXECARGS ||
v == DIF_VAR_EXECNAME || v == DIF_VAR_ZONENAME ||
v == DIF_VAR_UID || v == DIF_VAR_GID)
break;
err += efunc(pc, "illegal variable %u\n", v);
break;
case DIF_OP_LDTA:
case DIF_OP_LDTS:
case DIF_OP_LDGAA:
case DIF_OP_LDTAA:
err += efunc(pc, "illegal dynamic variable load\n");
break;
case DIF_OP_STTS:
case DIF_OP_STGAA:
case DIF_OP_STTAA:
err += efunc(pc, "illegal dynamic variable store\n");
break;
case DIF_OP_CALL:
if (subr == DIF_SUBR_ALLOCA ||
subr == DIF_SUBR_BCOPY ||
subr == DIF_SUBR_COPYIN ||
subr == DIF_SUBR_COPYINTO ||
subr == DIF_SUBR_COPYINSTR ||
subr == DIF_SUBR_INDEX ||
subr == DIF_SUBR_INET_NTOA ||
subr == DIF_SUBR_INET_NTOA6 ||
subr == DIF_SUBR_INET_NTOP ||
subr == DIF_SUBR_LLTOSTR ||
subr == DIF_SUBR_RINDEX ||
subr == DIF_SUBR_STRCHR ||
subr == DIF_SUBR_STRJOIN ||
subr == DIF_SUBR_STRRCHR ||
subr == DIF_SUBR_STRSTR ||
subr == DIF_SUBR_HTONS ||
subr == DIF_SUBR_HTONL ||
subr == DIF_SUBR_HTONLL ||
subr == DIF_SUBR_NTOHS ||
subr == DIF_SUBR_NTOHL ||
subr == DIF_SUBR_NTOHLL ||
subr == DIF_SUBR_MEMREF ||
subr == DIF_SUBR_TYPEREF)
break;
err += efunc(pc, "invalid subr %u\n", subr);
break;
default:
err += efunc(pc, "invalid opcode %u\n",
DIF_INSTR_OP(instr));
}
}
return (err);
}
/*
* Returns 1 if the expression in the DIF object can be cached on a per-thread
* basis; 0 if not.
*/
static int
dtrace_difo_cacheable(dtrace_difo_t *dp)
{
int i;
if (dp == NULL)
return (0);
for (i = 0; i < dp->dtdo_varlen; i++) {
dtrace_difv_t *v = &dp->dtdo_vartab[i];
if (v->dtdv_scope != DIFV_SCOPE_GLOBAL)
continue;
switch (v->dtdv_id) {
case DIF_VAR_CURTHREAD:
case DIF_VAR_PID:
case DIF_VAR_TID:
case DIF_VAR_EXECARGS:
case DIF_VAR_EXECNAME:
case DIF_VAR_ZONENAME:
break;
default:
return (0);
}
}
/*
* This DIF object may be cacheable. Now we need to look for any
* array loading instructions, any memory loading instructions, or
* any stores to thread-local variables.
*/
for (i = 0; i < dp->dtdo_len; i++) {
uint_t op = DIF_INSTR_OP(dp->dtdo_buf[i]);
if ((op >= DIF_OP_LDSB && op <= DIF_OP_LDX) ||
(op >= DIF_OP_ULDSB && op <= DIF_OP_ULDX) ||
(op >= DIF_OP_RLDSB && op <= DIF_OP_RLDX) ||
op == DIF_OP_LDGA || op == DIF_OP_STTS)
return (0);
}
return (1);
}
static void
dtrace_difo_hold(dtrace_difo_t *dp)
{
int i;
ASSERT(MUTEX_HELD(&dtrace_lock));
dp->dtdo_refcnt++;
ASSERT(dp->dtdo_refcnt != 0);
/*
* We need to check this DIF object for references to the variable
* DIF_VAR_VTIMESTAMP.
*/
for (i = 0; i < dp->dtdo_varlen; i++) {
dtrace_difv_t *v = &dp->dtdo_vartab[i];
if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
continue;
if (dtrace_vtime_references++ == 0)
dtrace_vtime_enable();
}
}
/*
* This routine calculates the dynamic variable chunksize for a given DIF
* object. The calculation is not fool-proof, and can probably be tricked by
* malicious DIF -- but it works for all compiler-generated DIF. Because this
* calculation is likely imperfect, dtrace_dynvar() is able to gracefully fail
* if a dynamic variable size exceeds the chunksize.
*/
static void
dtrace_difo_chunksize(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
{
uint64_t sval = 0;
dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
const dif_instr_t *text = dp->dtdo_buf;
uint_t pc, srd = 0;
uint_t ttop = 0;
size_t size, ksize;
uint_t id, i;
for (pc = 0; pc < dp->dtdo_len; pc++) {
dif_instr_t instr = text[pc];
uint_t op = DIF_INSTR_OP(instr);
uint_t rd = DIF_INSTR_RD(instr);
uint_t r1 = DIF_INSTR_R1(instr);
uint_t nkeys = 0;
uchar_t scope = 0;
dtrace_key_t *key = tupregs;
switch (op) {
case DIF_OP_SETX:
sval = dp->dtdo_inttab[DIF_INSTR_INTEGER(instr)];
srd = rd;
continue;
case DIF_OP_STTS:
key = &tupregs[DIF_DTR_NREGS];
key[0].dttk_size = 0;
key[1].dttk_size = 0;
nkeys = 2;
scope = DIFV_SCOPE_THREAD;
break;
case DIF_OP_STGAA:
case DIF_OP_STTAA:
nkeys = ttop;
if (DIF_INSTR_OP(instr) == DIF_OP_STTAA)
key[nkeys++].dttk_size = 0;
key[nkeys++].dttk_size = 0;
if (op == DIF_OP_STTAA) {
scope = DIFV_SCOPE_THREAD;
} else {
scope = DIFV_SCOPE_GLOBAL;
}
break;
case DIF_OP_PUSHTR:
if (ttop == DIF_DTR_NREGS)
return;
if ((srd == 0 || sval == 0) && r1 == DIF_TYPE_STRING) {
/*
* If the register for the size of the "pushtr"
* is %r0 (or the value is 0) and the type is
* a string, we'll use the system-wide default
* string size.
*/
tupregs[ttop++].dttk_size =
dtrace_strsize_default;
} else {
if (srd == 0)
return;
tupregs[ttop++].dttk_size = sval;
}
break;
case DIF_OP_PUSHTV:
if (ttop == DIF_DTR_NREGS)
return;
tupregs[ttop++].dttk_size = 0;
break;
case DIF_OP_FLUSHTS:
ttop = 0;
break;
case DIF_OP_POPTS:
if (ttop != 0)
ttop--;
break;
}
sval = 0;
srd = 0;
if (nkeys == 0)
continue;
/*
* We have a dynamic variable allocation; calculate its size.
*/
for (ksize = 0, i = 0; i < nkeys; i++)
ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
size = sizeof (dtrace_dynvar_t);
size += sizeof (dtrace_key_t) * (nkeys - 1);
size += ksize;
/*
* Now we need to determine the size of the stored data.
*/
id = DIF_INSTR_VAR(instr);
for (i = 0; i < dp->dtdo_varlen; i++) {
dtrace_difv_t *v = &dp->dtdo_vartab[i];
if (v->dtdv_id == id && v->dtdv_scope == scope) {
size += v->dtdv_type.dtdt_size;
break;
}
}
if (i == dp->dtdo_varlen)
return;
/*
* We have the size. If this is larger than the chunk size
* for our dynamic variable state, reset the chunk size.
*/
size = P2ROUNDUP(size, sizeof (uint64_t));
if (size > vstate->dtvs_dynvars.dtds_chunksize)
vstate->dtvs_dynvars.dtds_chunksize = size;
}
}
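/*
 * Editor's note: a worked trace (not part of the original source) of the
 * size computed above for a plain thread-local store (DIF_OP_STTS, e.g.
 * "self->x = ..."), whose two keys -- the thread pointer and the variable
 * id -- contribute no key data.  The variable v stands for the matching
 * vartab entry, as in the loop above.
 */
#if 0
        size_t vsize = v->dtdv_type.dtdt_size; /* stored-value size for self->x */
        size_t size;

        size = sizeof (dtrace_dynvar_t) +
            sizeof (dtrace_key_t) * (2 - 1) +   /* nkeys - 1; nkeys == 2 */
            0 +                                 /* ksize: both key sizes are 0 */
            vsize;
        size = P2ROUNDUP(size, sizeof (uint64_t));      /* candidate chunksize */
#endif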
static void
dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
{
int i, oldsvars, osz, nsz, otlocals, ntlocals;
uint_t id;
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(dp->dtdo_buf != NULL && dp->dtdo_len != 0);
for (i = 0; i < dp->dtdo_varlen; i++) {
dtrace_difv_t *v = &dp->dtdo_vartab[i];
dtrace_statvar_t *svar, ***svarp = NULL;
size_t dsize = 0;
uint8_t scope = v->dtdv_scope;
int *np = NULL;
if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
continue;
id -= DIF_VAR_OTHER_UBASE;
switch (scope) {
case DIFV_SCOPE_THREAD:
while (id >= (otlocals = vstate->dtvs_ntlocals)) {
dtrace_difv_t *tlocals;
if ((ntlocals = (otlocals << 1)) == 0)
ntlocals = 1;
osz = otlocals * sizeof (dtrace_difv_t);
nsz = ntlocals * sizeof (dtrace_difv_t);
tlocals = kmem_zalloc(nsz, KM_SLEEP);
if (osz != 0) {
bcopy(vstate->dtvs_tlocals,
tlocals, osz);
kmem_free(vstate->dtvs_tlocals, osz);
}
vstate->dtvs_tlocals = tlocals;
vstate->dtvs_ntlocals = ntlocals;
}
vstate->dtvs_tlocals[id] = *v;
continue;
case DIFV_SCOPE_LOCAL:
np = &vstate->dtvs_nlocals;
svarp = &vstate->dtvs_locals;
if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
dsize = NCPU * (v->dtdv_type.dtdt_size +
sizeof (uint64_t));
else
dsize = NCPU * sizeof (uint64_t);
break;
case DIFV_SCOPE_GLOBAL:
np = &vstate->dtvs_nglobals;
svarp = &vstate->dtvs_globals;
if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
dsize = v->dtdv_type.dtdt_size +
sizeof (uint64_t);
break;
default:
ASSERT(0);
}
while (id >= (oldsvars = *np)) {
dtrace_statvar_t **statics;
int newsvars, oldsize, newsize;
if ((newsvars = (oldsvars << 1)) == 0)
newsvars = 1;
oldsize = oldsvars * sizeof (dtrace_statvar_t *);
newsize = newsvars * sizeof (dtrace_statvar_t *);
statics = kmem_zalloc(newsize, KM_SLEEP);
if (oldsize != 0) {
bcopy(*svarp, statics, oldsize);
kmem_free(*svarp, oldsize);
}
*svarp = statics;
*np = newsvars;
}
if ((svar = (*svarp)[id]) == NULL) {
svar = kmem_zalloc(sizeof (dtrace_statvar_t), KM_SLEEP);
svar->dtsv_var = *v;
if ((svar->dtsv_size = dsize) != 0) {
svar->dtsv_data = (uint64_t)(uintptr_t)
kmem_zalloc(dsize, KM_SLEEP);
}
(*svarp)[id] = svar;
}
svar->dtsv_refcnt++;
}
dtrace_difo_chunksize(dp, vstate);
dtrace_difo_hold(dp);
}
static dtrace_difo_t *
dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
{
dtrace_difo_t *new;
size_t sz;
ASSERT(dp->dtdo_buf != NULL);
ASSERT(dp->dtdo_refcnt != 0);
new = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
ASSERT(dp->dtdo_buf != NULL);
sz = dp->dtdo_len * sizeof (dif_instr_t);
new->dtdo_buf = kmem_alloc(sz, KM_SLEEP);
bcopy(dp->dtdo_buf, new->dtdo_buf, sz);
new->dtdo_len = dp->dtdo_len;
if (dp->dtdo_strtab != NULL) {
ASSERT(dp->dtdo_strlen != 0);
new->dtdo_strtab = kmem_alloc(dp->dtdo_strlen, KM_SLEEP);
bcopy(dp->dtdo_strtab, new->dtdo_strtab, dp->dtdo_strlen);
new->dtdo_strlen = dp->dtdo_strlen;
}
if (dp->dtdo_inttab != NULL) {
ASSERT(dp->dtdo_intlen != 0);
sz = dp->dtdo_intlen * sizeof (uint64_t);
new->dtdo_inttab = kmem_alloc(sz, KM_SLEEP);
bcopy(dp->dtdo_inttab, new->dtdo_inttab, sz);
new->dtdo_intlen = dp->dtdo_intlen;
}
if (dp->dtdo_vartab != NULL) {
ASSERT(dp->dtdo_varlen != 0);
sz = dp->dtdo_varlen * sizeof (dtrace_difv_t);
new->dtdo_vartab = kmem_alloc(sz, KM_SLEEP);
bcopy(dp->dtdo_vartab, new->dtdo_vartab, sz);
new->dtdo_varlen = dp->dtdo_varlen;
}
dtrace_difo_init(new, vstate);
return (new);
}
static void
dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
{
int i;
ASSERT(dp->dtdo_refcnt == 0);
for (i = 0; i < dp->dtdo_varlen; i++) {
dtrace_difv_t *v = &dp->dtdo_vartab[i];
dtrace_statvar_t *svar, **svarp = NULL;
uint_t id;
uint8_t scope = v->dtdv_scope;
int *np = NULL;
switch (scope) {
case DIFV_SCOPE_THREAD:
continue;
case DIFV_SCOPE_LOCAL:
np = &vstate->dtvs_nlocals;
svarp = vstate->dtvs_locals;
break;
case DIFV_SCOPE_GLOBAL:
np = &vstate->dtvs_nglobals;
svarp = vstate->dtvs_globals;
break;
default:
ASSERT(0);
}
if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
continue;
id -= DIF_VAR_OTHER_UBASE;
ASSERT(id < *np);
svar = svarp[id];
ASSERT(svar != NULL);
ASSERT(svar->dtsv_refcnt > 0);
if (--svar->dtsv_refcnt > 0)
continue;
if (svar->dtsv_size != 0) {
ASSERT(svar->dtsv_data != 0);
kmem_free((void *)(uintptr_t)svar->dtsv_data,
svar->dtsv_size);
}
kmem_free(svar, sizeof (dtrace_statvar_t));
svarp[id] = NULL;
}
if (dp->dtdo_buf != NULL)
kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
if (dp->dtdo_inttab != NULL)
kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
if (dp->dtdo_strtab != NULL)
kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
if (dp->dtdo_vartab != NULL)
kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
kmem_free(dp, sizeof (dtrace_difo_t));
}
static void
dtrace_difo_release(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
{
int i;
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(dp->dtdo_refcnt != 0);
for (i = 0; i < dp->dtdo_varlen; i++) {
dtrace_difv_t *v = &dp->dtdo_vartab[i];
if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
continue;
ASSERT(dtrace_vtime_references > 0);
if (--dtrace_vtime_references == 0)
dtrace_vtime_disable();
}
if (--dp->dtdo_refcnt == 0)
dtrace_difo_destroy(dp, vstate);
}
/*
* DTrace Format Functions
*/
static uint16_t
dtrace_format_add(dtrace_state_t *state, char *str)
{
char *fmt, **new;
uint16_t ndx, len = strlen(str) + 1;
fmt = kmem_zalloc(len, KM_SLEEP);
bcopy(str, fmt, len);
for (ndx = 0; ndx < state->dts_nformats; ndx++) {
if (state->dts_formats[ndx] == NULL) {
state->dts_formats[ndx] = fmt;
return (ndx + 1);
}
}
if (state->dts_nformats == USHRT_MAX) {
/*
* This is only likely if a denial-of-service attack is being
* attempted. As such, it's okay to fail silently here.
*/
kmem_free(fmt, len);
return (0);
}
/*
* For simplicity, we always resize the formats array to be exactly the
* number of formats.
*/
ndx = state->dts_nformats++;
new = kmem_alloc((ndx + 1) * sizeof (char *), KM_SLEEP);
if (state->dts_formats != NULL) {
ASSERT(ndx != 0);
bcopy(state->dts_formats, new, ndx * sizeof (char *));
kmem_free(state->dts_formats, ndx * sizeof (char *));
}
state->dts_formats = new;
state->dts_formats[ndx] = fmt;
return (ndx + 1);
}
static void
dtrace_format_remove(dtrace_state_t *state, uint16_t format)
{
char *fmt;
ASSERT(state->dts_formats != NULL);
ASSERT(format <= state->dts_nformats);
ASSERT(state->dts_formats[format - 1] != NULL);
fmt = state->dts_formats[format - 1];
kmem_free(fmt, strlen(fmt) + 1);
state->dts_formats[format - 1] = NULL;
}
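/*
 * Editor's note: a short usage sketch (not part of the original source) for
 * the two routines above.  Format handles are 1-based indices into
 * dts_formats; 0 means dtrace_format_add() failed (table full).
 */
#if 0
        uint16_t fmt;

        if ((fmt = dtrace_format_add(state, "cpu %d took %d usecs\n")) == 0)
                return (ENOSPC);        /* hypothetical error handling */
        /* ... record fmt in the action, as dtrace_ecb_action_add() does ... */
        dtrace_format_remove(state, fmt);
#endif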
static void
dtrace_format_destroy(dtrace_state_t *state)
{
int i;
if (state->dts_nformats == 0) {
ASSERT(state->dts_formats == NULL);
return;
}
ASSERT(state->dts_formats != NULL);
for (i = 0; i < state->dts_nformats; i++) {
char *fmt = state->dts_formats[i];
if (fmt == NULL)
continue;
kmem_free(fmt, strlen(fmt) + 1);
}
kmem_free(state->dts_formats, state->dts_nformats * sizeof (char *));
state->dts_nformats = 0;
state->dts_formats = NULL;
}
/*
* DTrace Predicate Functions
*/
static dtrace_predicate_t *
dtrace_predicate_create(dtrace_difo_t *dp)
{
dtrace_predicate_t *pred;
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(dp->dtdo_refcnt != 0);
pred = kmem_zalloc(sizeof (dtrace_predicate_t), KM_SLEEP);
pred->dtp_difo = dp;
pred->dtp_refcnt = 1;
if (!dtrace_difo_cacheable(dp))
return (pred);
if (dtrace_predcache_id == DTRACE_CACHEIDNONE) {
/*
* This is only theoretically possible -- we have had 2^32
* cacheable predicates on this machine. We cannot allow any
* more predicates to become cacheable: as unlikely as it is,
* there may be a thread caching a (now stale) predicate cache
* ID. (N.B.: the temptation is being successfully resisted to
* have this cmn_err() "Holy shit -- we executed this code!")
*/
return (pred);
}
pred->dtp_cacheid = dtrace_predcache_id++;
return (pred);
}
static void
dtrace_predicate_hold(dtrace_predicate_t *pred)
{
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(pred->dtp_difo != NULL && pred->dtp_difo->dtdo_refcnt != 0);
ASSERT(pred->dtp_refcnt > 0);
pred->dtp_refcnt++;
}
static void
dtrace_predicate_release(dtrace_predicate_t *pred, dtrace_vstate_t *vstate)
{
dtrace_difo_t *dp = pred->dtp_difo;
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(dp != NULL && dp->dtdo_refcnt != 0);
ASSERT(pred->dtp_refcnt > 0);
if (--pred->dtp_refcnt == 0) {
dtrace_difo_release(pred->dtp_difo, vstate);
kmem_free(pred, sizeof (dtrace_predicate_t));
}
}
/*
* DTrace Action Description Functions
*/
static dtrace_actdesc_t *
dtrace_actdesc_create(dtrace_actkind_t kind, uint32_t ntuple,
uint64_t uarg, uint64_t arg)
{
dtrace_actdesc_t *act;
#if defined(sun)
ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != NULL &&
arg >= KERNELBASE) || (arg == NULL && kind == DTRACEACT_PRINTA));
#endif
act = kmem_zalloc(sizeof (dtrace_actdesc_t), KM_SLEEP);
act->dtad_kind = kind;
act->dtad_ntuple = ntuple;
act->dtad_uarg = uarg;
act->dtad_arg = arg;
act->dtad_refcnt = 1;
return (act);
}
static void
dtrace_actdesc_hold(dtrace_actdesc_t *act)
{
ASSERT(act->dtad_refcnt >= 1);
act->dtad_refcnt++;
}
static void
dtrace_actdesc_release(dtrace_actdesc_t *act, dtrace_vstate_t *vstate)
{
dtrace_actkind_t kind = act->dtad_kind;
dtrace_difo_t *dp;
ASSERT(act->dtad_refcnt >= 1);
if (--act->dtad_refcnt != 0)
return;
if ((dp = act->dtad_difo) != NULL)
dtrace_difo_release(dp, vstate);
if (DTRACEACT_ISPRINTFLIKE(kind)) {
char *str = (char *)(uintptr_t)act->dtad_arg;
#if defined(sun)
ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) ||
(str == NULL && act->dtad_kind == DTRACEACT_PRINTA));
#endif
if (str != NULL)
kmem_free(str, strlen(str) + 1);
}
kmem_free(act, sizeof (dtrace_actdesc_t));
}
/*
* DTrace ECB Functions
*/
static dtrace_ecb_t *
dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
{
dtrace_ecb_t *ecb;
dtrace_epid_t epid;
ASSERT(MUTEX_HELD(&dtrace_lock));
ecb = kmem_zalloc(sizeof (dtrace_ecb_t), KM_SLEEP);
ecb->dte_predicate = NULL;
ecb->dte_probe = probe;
/*
* The default size is the size of the default action: recording
* the epid.
*/
ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
ecb->dte_alignment = sizeof (dtrace_epid_t);
epid = state->dts_epid++;
if (epid - 1 >= state->dts_necbs) {
dtrace_ecb_t **oecbs = state->dts_ecbs, **ecbs;
int necbs = state->dts_necbs << 1;
ASSERT(epid == state->dts_necbs + 1);
if (necbs == 0) {
ASSERT(oecbs == NULL);
necbs = 1;
}
ecbs = kmem_zalloc(necbs * sizeof (*ecbs), KM_SLEEP);
if (oecbs != NULL)
bcopy(oecbs, ecbs, state->dts_necbs * sizeof (*ecbs));
dtrace_membar_producer();
state->dts_ecbs = ecbs;
if (oecbs != NULL) {
/*
* If this state is active, we must dtrace_sync()
* before we can free the old dts_ecbs array: we're
* coming in hot, and there may be active ring
* buffer processing (which indexes into the dts_ecbs
* array) on another CPU.
*/
if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
dtrace_sync();
kmem_free(oecbs, state->dts_necbs * sizeof (*ecbs));
}
dtrace_membar_producer();
state->dts_necbs = necbs;
}
ecb->dte_state = state;
ASSERT(state->dts_ecbs[epid - 1] == NULL);
dtrace_membar_producer();
state->dts_ecbs[(ecb->dte_epid = epid) - 1] = ecb;
return (ecb);
}
static void
dtrace_ecb_enable(dtrace_ecb_t *ecb)
{
dtrace_probe_t *probe = ecb->dte_probe;
ASSERT(MUTEX_HELD(&cpu_lock));
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(ecb->dte_next == NULL);
if (probe == NULL) {
/*
* This is the NULL probe -- there's nothing to do.
*/
return;
}
if (probe->dtpr_ecb == NULL) {
dtrace_provider_t *prov = probe->dtpr_provider;
/*
* We're the first ECB on this probe.
*/
probe->dtpr_ecb = probe->dtpr_ecb_last = ecb;
if (ecb->dte_predicate != NULL)
probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid;
prov->dtpv_pops.dtps_enable(prov->dtpv_arg,
probe->dtpr_id, probe->dtpr_arg);
} else {
/*
* This probe is already active. Swing the last pointer to
* point to the new ECB, and issue a dtrace_sync() to assure
* that all CPUs have seen the change.
*/
ASSERT(probe->dtpr_ecb_last != NULL);
probe->dtpr_ecb_last->dte_next = ecb;
probe->dtpr_ecb_last = ecb;
probe->dtpr_predcache = 0;
dtrace_sync();
}
}
static void
dtrace_ecb_resize(dtrace_ecb_t *ecb)
{
uint32_t maxalign = sizeof (dtrace_epid_t);
uint32_t align = sizeof (uint8_t), offs, diff;
dtrace_action_t *act;
int wastuple = 0;
uint32_t aggbase = UINT32_MAX;
dtrace_state_t *state = ecb->dte_state;
/*
* If we record anything, we always record the epid. (And we always
* record it first.)
*/
offs = sizeof (dtrace_epid_t);
ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
dtrace_recdesc_t *rec = &act->dta_rec;
if ((align = rec->dtrd_alignment) > maxalign)
maxalign = align;
if (!wastuple && act->dta_intuple) {
/*
* This is the first record in a tuple. Align the
* offset to be at offset 4 in an 8-byte aligned
* block.
*/
diff = offs + sizeof (dtrace_aggid_t);
if ((diff = (diff & (sizeof (uint64_t) - 1))))
offs += sizeof (uint64_t) - diff;
aggbase = offs - sizeof (dtrace_aggid_t);
ASSERT(!(aggbase & (sizeof (uint64_t) - 1)));
}
/*LINTED*/
if (rec->dtrd_size != 0 && (diff = (offs & (align - 1)))) {
/*
* The current offset is not properly aligned; align it.
*/
offs += align - diff;
}
rec->dtrd_offset = offs;
if (offs + rec->dtrd_size > ecb->dte_needed) {
ecb->dte_needed = offs + rec->dtrd_size;
if (ecb->dte_needed > state->dts_needed)
state->dts_needed = ecb->dte_needed;
}
if (DTRACEACT_ISAGG(act->dta_kind)) {
dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
dtrace_action_t *first = agg->dtag_first, *prev;
ASSERT(rec->dtrd_size != 0 && first != NULL);
ASSERT(wastuple);
ASSERT(aggbase != UINT32_MAX);
agg->dtag_base = aggbase;
while ((prev = first->dta_prev) != NULL &&
DTRACEACT_ISAGG(prev->dta_kind)) {
agg = (dtrace_aggregation_t *)prev;
first = agg->dtag_first;
}
if (prev != NULL) {
offs = prev->dta_rec.dtrd_offset +
prev->dta_rec.dtrd_size;
} else {
offs = sizeof (dtrace_epid_t);
}
wastuple = 0;
} else {
if (!act->dta_intuple)
ecb->dte_size = offs + rec->dtrd_size;
offs += rec->dtrd_size;
}
wastuple = act->dta_intuple;
}
if ((act = ecb->dte_action) != NULL &&
!(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
ecb->dte_size == sizeof (dtrace_epid_t)) {
/*
* If the size is still sizeof (dtrace_epid_t), then all
* actions store no data; set the size to 0.
*/
ecb->dte_alignment = maxalign;
ecb->dte_size = 0;
/*
* If the needed space is still sizeof (dtrace_epid_t), then
* all actions need no additional space; set the needed
* size to 0.
*/
if (ecb->dte_needed == sizeof (dtrace_epid_t))
ecb->dte_needed = 0;
return;
}
/*
* Set our alignment, and make sure that the dte_size and dte_needed
* are aligned to the size of an EPID.
*/
ecb->dte_alignment = maxalign;
ecb->dte_size = (ecb->dte_size + (sizeof (dtrace_epid_t) - 1)) &
~(sizeof (dtrace_epid_t) - 1);
ecb->dte_needed = (ecb->dte_needed + (sizeof (dtrace_epid_t) - 1)) &
~(sizeof (dtrace_epid_t) - 1);
ASSERT(ecb->dte_size <= ecb->dte_needed);
}
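/*
 * Editor's note: a tiny sketch (not part of the original source) of the
 * round-up-to-EPID-size idiom used above, assuming a 4-byte dtrace_epid_t:
 * 13 bytes of record data round up to 16.
 */
#if 0
        uint32_t sz = 13;

        sz = (sz + (sizeof (dtrace_epid_t) - 1)) & ~(sizeof (dtrace_epid_t) - 1);
        ASSERT(sz == 16);
#endif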
static dtrace_action_t *
dtrace_ecb_aggregation_create(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
{
dtrace_aggregation_t *agg;
size_t size = sizeof (uint64_t);
int ntuple = desc->dtad_ntuple;
dtrace_action_t *act;
dtrace_recdesc_t *frec;
dtrace_aggid_t aggid;
dtrace_state_t *state = ecb->dte_state;
agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP);
agg->dtag_ecb = ecb;
ASSERT(DTRACEACT_ISAGG(desc->dtad_kind));
switch (desc->dtad_kind) {
case DTRACEAGG_MIN:
agg->dtag_initial = INT64_MAX;
agg->dtag_aggregate = dtrace_aggregate_min;
break;
case DTRACEAGG_MAX:
agg->dtag_initial = INT64_MIN;
agg->dtag_aggregate = dtrace_aggregate_max;
break;
case DTRACEAGG_COUNT:
agg->dtag_aggregate = dtrace_aggregate_count;
break;
case DTRACEAGG_QUANTIZE:
agg->dtag_aggregate = dtrace_aggregate_quantize;
size = (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) *
sizeof (uint64_t);
break;
case DTRACEAGG_LQUANTIZE: {
uint16_t step = DTRACE_LQUANTIZE_STEP(desc->dtad_arg);
uint16_t levels = DTRACE_LQUANTIZE_LEVELS(desc->dtad_arg);
agg->dtag_initial = desc->dtad_arg;
agg->dtag_aggregate = dtrace_aggregate_lquantize;
if (step == 0 || levels == 0)
goto err;
size = levels * sizeof (uint64_t) + 3 * sizeof (uint64_t);
break;
}
case DTRACEAGG_AVG:
agg->dtag_aggregate = dtrace_aggregate_avg;
size = sizeof (uint64_t) * 2;
break;
case DTRACEAGG_STDDEV:
agg->dtag_aggregate = dtrace_aggregate_stddev;
size = sizeof (uint64_t) * 4;
break;
case DTRACEAGG_SUM:
agg->dtag_aggregate = dtrace_aggregate_sum;
break;
default:
goto err;
}
agg->dtag_action.dta_rec.dtrd_size = size;
if (ntuple == 0)
goto err;
/*
* We must make sure that we have enough actions for the n-tuple.
*/
for (act = ecb->dte_action_last; act != NULL; act = act->dta_prev) {
if (DTRACEACT_ISAGG(act->dta_kind))
break;
if (--ntuple == 0) {
/*
* This is the action with which our n-tuple begins.
*/
agg->dtag_first = act;
goto success;
}
}
/*
* This n-tuple is short by ntuple elements. Return failure.
*/
ASSERT(ntuple != 0);
err:
kmem_free(agg, sizeof (dtrace_aggregation_t));
return (NULL);
success:
/*
* If the last action in the tuple has a size of zero, it's actually
* an expression argument for the aggregating action.
*/
ASSERT(ecb->dte_action_last != NULL);
act = ecb->dte_action_last;
if (act->dta_kind == DTRACEACT_DIFEXPR) {
ASSERT(act->dta_difo != NULL);
if (act->dta_difo->dtdo_rtype.dtdt_size == 0)
agg->dtag_hasarg = 1;
}
/*
* We need to allocate an id for this aggregation.
*/
#if defined(sun)
aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,
VM_BESTFIT | VM_SLEEP);
#else
aggid = alloc_unr(state->dts_aggid_arena);
#endif
if (aggid - 1 >= state->dts_naggregations) {
dtrace_aggregation_t **oaggs = state->dts_aggregations;
dtrace_aggregation_t **aggs;
int naggs = state->dts_naggregations << 1;
int onaggs = state->dts_naggregations;
ASSERT(aggid == state->dts_naggregations + 1);
if (naggs == 0) {
ASSERT(oaggs == NULL);
naggs = 1;
}
aggs = kmem_zalloc(naggs * sizeof (*aggs), KM_SLEEP);
if (oaggs != NULL) {
bcopy(oaggs, aggs, onaggs * sizeof (*aggs));
kmem_free(oaggs, onaggs * sizeof (*aggs));
}
state->dts_aggregations = aggs;
state->dts_naggregations = naggs;
}
ASSERT(state->dts_aggregations[aggid - 1] == NULL);
state->dts_aggregations[(agg->dtag_id = aggid) - 1] = agg;
frec = &agg->dtag_first->dta_rec;
if (frec->dtrd_alignment < sizeof (dtrace_aggid_t))
frec->dtrd_alignment = sizeof (dtrace_aggid_t);
for (act = agg->dtag_first; act != NULL; act = act->dta_next) {
ASSERT(!act->dta_intuple);
act->dta_intuple = 1;
}
return (&agg->dtag_action);
}
static void
dtrace_ecb_aggregation_destroy(dtrace_ecb_t *ecb, dtrace_action_t *act)
{
dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
dtrace_state_t *state = ecb->dte_state;
dtrace_aggid_t aggid = agg->dtag_id;
ASSERT(DTRACEACT_ISAGG(act->dta_kind));
#if defined(sun)
vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);
#else
free_unr(state->dts_aggid_arena, aggid);
#endif
ASSERT(state->dts_aggregations[aggid - 1] == agg);
state->dts_aggregations[aggid - 1] = NULL;
kmem_free(agg, sizeof (dtrace_aggregation_t));
}
static int
dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
{
dtrace_action_t *action, *last;
dtrace_difo_t *dp = desc->dtad_difo;
uint32_t size = 0, align = sizeof (uint8_t), mask;
uint16_t format = 0;
dtrace_recdesc_t *rec;
dtrace_state_t *state = ecb->dte_state;
dtrace_optval_t *opt = state->dts_options, nframes = 0, strsize;
uint64_t arg = desc->dtad_arg;
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(ecb->dte_action == NULL || ecb->dte_action->dta_refcnt == 1);
if (DTRACEACT_ISAGG(desc->dtad_kind)) {
/*
* If this is an aggregating action, there must be neither
* a speculate nor a commit on the action chain.
*/
dtrace_action_t *act;
for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
if (act->dta_kind == DTRACEACT_COMMIT)
return (EINVAL);
if (act->dta_kind == DTRACEACT_SPECULATE)
return (EINVAL);
}
action = dtrace_ecb_aggregation_create(ecb, desc);
if (action == NULL)
return (EINVAL);
} else {
if (DTRACEACT_ISDESTRUCTIVE(desc->dtad_kind) ||
(desc->dtad_kind == DTRACEACT_DIFEXPR &&
dp != NULL && dp->dtdo_destructive)) {
state->dts_destructive = 1;
}
switch (desc->dtad_kind) {
case DTRACEACT_PRINTF:
case DTRACEACT_PRINTA:
case DTRACEACT_SYSTEM:
case DTRACEACT_FREOPEN:
/*
* We know that our arg is a string -- turn it into a
* format.
*/
if (arg == 0) {
ASSERT(desc->dtad_kind == DTRACEACT_PRINTA);
format = 0;
} else {
ASSERT(arg != 0);
#if defined(sun)
ASSERT(arg > KERNELBASE);
#endif
format = dtrace_format_add(state,
(char *)(uintptr_t)arg);
}
/*FALLTHROUGH*/
case DTRACEACT_LIBACT:
case DTRACEACT_DIFEXPR:
if (dp == NULL)
return (EINVAL);
if ((size = dp->dtdo_rtype.dtdt_size) != 0)
break;
if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
return (EINVAL);
size = opt[DTRACEOPT_STRSIZE];
}
break;
case DTRACEACT_STACK:
if ((nframes = arg) == 0) {
nframes = opt[DTRACEOPT_STACKFRAMES];
ASSERT(nframes > 0);
arg = nframes;
}
size = nframes * sizeof (pc_t);
break;
case DTRACEACT_JSTACK:
if ((strsize = DTRACE_USTACK_STRSIZE(arg)) == 0)
strsize = opt[DTRACEOPT_JSTACKSTRSIZE];
if ((nframes = DTRACE_USTACK_NFRAMES(arg)) == 0)
nframes = opt[DTRACEOPT_JSTACKFRAMES];
arg = DTRACE_USTACK_ARG(nframes, strsize);
/*FALLTHROUGH*/
case DTRACEACT_USTACK:
if (desc->dtad_kind != DTRACEACT_JSTACK &&
(nframes = DTRACE_USTACK_NFRAMES(arg)) == 0) {
strsize = DTRACE_USTACK_STRSIZE(arg);
nframes = opt[DTRACEOPT_USTACKFRAMES];
ASSERT(nframes > 0);
arg = DTRACE_USTACK_ARG(nframes, strsize);
}
/*
* Save a slot for the pid.
*/
size = (nframes + 1) * sizeof (uint64_t);
size += DTRACE_USTACK_STRSIZE(arg);
size = P2ROUNDUP(size, (uint32_t)(sizeof (uintptr_t)));
break;
case DTRACEACT_SYM:
case DTRACEACT_MOD:
if (dp == NULL || ((size = dp->dtdo_rtype.dtdt_size) !=
sizeof (uint64_t)) ||
(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
return (EINVAL);
break;
case DTRACEACT_USYM:
case DTRACEACT_UMOD:
case DTRACEACT_UADDR:
if (dp == NULL ||
(dp->dtdo_rtype.dtdt_size != sizeof (uint64_t)) ||
(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
return (EINVAL);
/*
* We have a slot for the pid, plus a slot for the
* argument. To keep things simple (aligned with
* bitness-neutral sizing), we store each as a 64-bit
* quantity.
*/
size = 2 * sizeof (uint64_t);
break;
case DTRACEACT_STOP:
case DTRACEACT_BREAKPOINT:
case DTRACEACT_PANIC:
break;
case DTRACEACT_CHILL:
case DTRACEACT_DISCARD:
case DTRACEACT_RAISE:
if (dp == NULL)
return (EINVAL);
break;
case DTRACEACT_EXIT:
if (dp == NULL ||
(size = dp->dtdo_rtype.dtdt_size) != sizeof (int) ||
(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
return (EINVAL);
break;
case DTRACEACT_SPECULATE:
if (ecb->dte_size > sizeof (dtrace_epid_t))
return (EINVAL);
if (dp == NULL)
return (EINVAL);
state->dts_speculates = 1;
break;
case DTRACEACT_PRINTM:
size = dp->dtdo_rtype.dtdt_size;
break;
case DTRACEACT_PRINTT:
size = dp->dtdo_rtype.dtdt_size;
break;
case DTRACEACT_COMMIT: {
dtrace_action_t *act = ecb->dte_action;
for (; act != NULL; act = act->dta_next) {
if (act->dta_kind == DTRACEACT_COMMIT)
return (EINVAL);
}
if (dp == NULL)
return (EINVAL);
break;
}
default:
return (EINVAL);
}
if (size != 0 || desc->dtad_kind == DTRACEACT_SPECULATE) {
/*
* If this is a data-storing action or a speculate,
* we must be sure that there isn't a commit on the
* action chain.
*/
dtrace_action_t *act = ecb->dte_action;
for (; act != NULL; act = act->dta_next) {
if (act->dta_kind == DTRACEACT_COMMIT)
return (EINVAL);
}
}
action = kmem_zalloc(sizeof (dtrace_action_t), KM_SLEEP);
action->dta_rec.dtrd_size = size;
}
action->dta_refcnt = 1;
rec = &action->dta_rec;
size = rec->dtrd_size;
for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0; mask >>= 1) {
if (!(size & mask)) {
align = mask + 1;
break;
}
}
action->dta_kind = desc->dtad_kind;
if ((action->dta_difo = dp) != NULL)
dtrace_difo_hold(dp);
rec->dtrd_action = action->dta_kind;
rec->dtrd_arg = arg;
rec->dtrd_uarg = desc->dtad_uarg;
rec->dtrd_alignment = (uint16_t)align;
rec->dtrd_format = format;
if ((last = ecb->dte_action_last) != NULL) {
ASSERT(ecb->dte_action != NULL);
action->dta_prev = last;
last->dta_next = action;
} else {
ASSERT(ecb->dte_action == NULL);
ecb->dte_action = action;
}
ecb->dte_action_last = action;
return (0);
}
static void
dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
{
dtrace_action_t *act = ecb->dte_action, *next;
dtrace_vstate_t *vstate = &ecb->dte_state->dts_vstate;
dtrace_difo_t *dp;
uint16_t format;
if (act != NULL && act->dta_refcnt > 1) {
ASSERT(act->dta_next == NULL || act->dta_next->dta_refcnt == 1);
act->dta_refcnt--;
} else {
for (; act != NULL; act = next) {
next = act->dta_next;
ASSERT(next != NULL || act == ecb->dte_action_last);
ASSERT(act->dta_refcnt == 1);
if ((format = act->dta_rec.dtrd_format) != 0)
dtrace_format_remove(ecb->dte_state, format);
if ((dp = act->dta_difo) != NULL)
dtrace_difo_release(dp, vstate);
if (DTRACEACT_ISAGG(act->dta_kind)) {
dtrace_ecb_aggregation_destroy(ecb, act);
} else {
kmem_free(act, sizeof (dtrace_action_t));
}
}
}
ecb->dte_action = NULL;
ecb->dte_action_last = NULL;
ecb->dte_size = sizeof (dtrace_epid_t);
}
static void
dtrace_ecb_disable(dtrace_ecb_t *ecb)
{
/*
* We disable the ECB by removing it from its probe.
*/
dtrace_ecb_t *pecb, *prev = NULL;
dtrace_probe_t *probe = ecb->dte_probe;
ASSERT(MUTEX_HELD(&dtrace_lock));
if (probe == NULL) {
/*
* This is the NULL probe; there is nothing to disable.
*/
return;
}
for (pecb = probe->dtpr_ecb; pecb != NULL; pecb = pecb->dte_next) {
if (pecb == ecb)
break;
prev = pecb;
}
ASSERT(pecb != NULL);
if (prev == NULL) {
probe->dtpr_ecb = ecb->dte_next;
} else {
prev->dte_next = ecb->dte_next;
}
if (ecb == probe->dtpr_ecb_last) {
ASSERT(ecb->dte_next == NULL);
probe->dtpr_ecb_last = prev;
}
/*
* The ECB has been disconnected from the probe; now sync to assure
* that all CPUs have seen the change before returning.
*/
dtrace_sync();
if (probe->dtpr_ecb == NULL) {
/*
* That was the last ECB on the probe; clear the predicate
* cache ID for the probe, disable it and sync one more time
* to assure that we'll never hit it again.
*/
dtrace_provider_t *prov = probe->dtpr_provider;
ASSERT(ecb->dte_next == NULL);
ASSERT(probe->dtpr_ecb_last == NULL);
probe->dtpr_predcache = DTRACE_CACHEIDNONE;
prov->dtpv_pops.dtps_disable(prov->dtpv_arg,
probe->dtpr_id, probe->dtpr_arg);
dtrace_sync();
} else {
/*
* There is at least one ECB remaining on the probe. If there
* is _exactly_ one, set the probe's predicate cache ID to be
* the predicate cache ID of the remaining ECB.
*/
ASSERT(probe->dtpr_ecb_last != NULL);
ASSERT(probe->dtpr_predcache == DTRACE_CACHEIDNONE);
if (probe->dtpr_ecb == probe->dtpr_ecb_last) {
dtrace_predicate_t *p = probe->dtpr_ecb->dte_predicate;
ASSERT(probe->dtpr_ecb->dte_next == NULL);
if (p != NULL)
probe->dtpr_predcache = p->dtp_cacheid;
}
ecb->dte_next = NULL;
}
}
static void
dtrace_ecb_destroy(dtrace_ecb_t *ecb)
{
dtrace_state_t *state = ecb->dte_state;
dtrace_vstate_t *vstate = &state->dts_vstate;
dtrace_predicate_t *pred;
dtrace_epid_t epid = ecb->dte_epid;
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(ecb->dte_next == NULL);
ASSERT(ecb->dte_probe == NULL || ecb->dte_probe->dtpr_ecb != ecb);
if ((pred = ecb->dte_predicate) != NULL)
dtrace_predicate_release(pred, vstate);
dtrace_ecb_action_remove(ecb);
ASSERT(state->dts_ecbs[epid - 1] == ecb);
state->dts_ecbs[epid - 1] = NULL;
kmem_free(ecb, sizeof (dtrace_ecb_t));
}
static dtrace_ecb_t *
dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe,
dtrace_enabling_t *enab)
{
dtrace_ecb_t *ecb;
dtrace_predicate_t *pred;
dtrace_actdesc_t *act;
dtrace_provider_t *prov;
dtrace_ecbdesc_t *desc = enab->dten_current;
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(state != NULL);
ecb = dtrace_ecb_add(state, probe);
ecb->dte_uarg = desc->dted_uarg;
if ((pred = desc->dted_pred.dtpdd_predicate) != NULL) {
dtrace_predicate_hold(pred);
ecb->dte_predicate = pred;
}
if (probe != NULL) {
/*
* If the provider shows more leg than the consumer is old
* enough to see, we need to enable the appropriate implicit
* predicate bits to prevent the ecb from activating at
* revealing times.
*
* Providers specifying DTRACE_PRIV_USER at register time
* are stating that they need the /proc-style privilege
* model to be enforced, and this is what DTRACE_COND_OWNER
* and DTRACE_COND_ZONEOWNER will then do at probe time.
*/
prov = probe->dtpr_provider;
if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLPROC) &&
(prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
ecb->dte_cond |= DTRACE_COND_OWNER;
if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLZONE) &&
(prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
ecb->dte_cond |= DTRACE_COND_ZONEOWNER;
/*
* If the provider shows us kernel innards and the user
* is lacking sufficient privilege, enable the
* DTRACE_COND_USERMODE implicit predicate.
*/
if (!(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) &&
(prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_KERNEL))
ecb->dte_cond |= DTRACE_COND_USERMODE;
}
if (dtrace_ecb_create_cache != NULL) {
/*
* If we have a cached ecb, we'll use its action list instead
* of creating our own (saving both time and space).
*/
dtrace_ecb_t *cached = dtrace_ecb_create_cache;
dtrace_action_t *act = cached->dte_action;
if (act != NULL) {
ASSERT(act->dta_refcnt > 0);
act->dta_refcnt++;
ecb->dte_action = act;
ecb->dte_action_last = cached->dte_action_last;
ecb->dte_needed = cached->dte_needed;
ecb->dte_size = cached->dte_size;
ecb->dte_alignment = cached->dte_alignment;
}
return (ecb);
}
for (act = desc->dted_action; act != NULL; act = act->dtad_next) {
if ((enab->dten_error = dtrace_ecb_action_add(ecb, act)) != 0) {
dtrace_ecb_destroy(ecb);
return (NULL);
}
}
dtrace_ecb_resize(ecb);
return (dtrace_ecb_create_cache = ecb);
}
static int
dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg)
{
dtrace_ecb_t *ecb;
dtrace_enabling_t *enab = arg;
dtrace_state_t *state = enab->dten_vstate->dtvs_state;
ASSERT(state != NULL);
if (probe != NULL && probe->dtpr_gen < enab->dten_probegen) {
/*
* This probe was created in a generation for which this
* enabling has previously created ECBs; we don't want to
* enable it again, so just kick out.
*/
return (DTRACE_MATCH_NEXT);
}
if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL)
return (DTRACE_MATCH_DONE);
dtrace_ecb_enable(ecb);
return (DTRACE_MATCH_NEXT);
}
static dtrace_ecb_t *
dtrace_epid2ecb(dtrace_state_t *state, dtrace_epid_t id)
{
dtrace_ecb_t *ecb;
ASSERT(MUTEX_HELD(&dtrace_lock));
if (id == 0 || id > state->dts_necbs)
return (NULL);
ASSERT(state->dts_necbs > 0 && state->dts_ecbs != NULL);
ASSERT((ecb = state->dts_ecbs[id - 1]) == NULL || ecb->dte_epid == id);
return (state->dts_ecbs[id - 1]);
}
static dtrace_aggregation_t *
dtrace_aggid2agg(dtrace_state_t *state, dtrace_aggid_t id)
{
dtrace_aggregation_t *agg;
ASSERT(MUTEX_HELD(&dtrace_lock));
if (id == 0 || id > state->dts_naggregations)
return (NULL);
ASSERT(state->dts_naggregations > 0 && state->dts_aggregations != NULL);
ASSERT((agg = state->dts_aggregations[id - 1]) == NULL ||
agg->dtag_id == id);
return (state->dts_aggregations[id - 1]);
}
/*
* DTrace Buffer Functions
*
* The following functions manipulate DTrace buffers. Most of these functions
* are called in the context of establishing or processing consumer state;
* exceptions are explicitly noted.
*/
/*
* Note: called from cross call context. This function switches the two
* buffers on a given CPU. The atomicity of this operation is assured by
* disabling interrupts while the actual switch takes place; the disabling of
* interrupts serializes the execution with any execution of dtrace_probe() on
* the same CPU.
*/
static void
dtrace_buffer_switch(dtrace_buffer_t *buf)
{
caddr_t tomax = buf->dtb_tomax;
caddr_t xamot = buf->dtb_xamot;
dtrace_icookie_t cookie;
ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
cookie = dtrace_interrupt_disable();
buf->dtb_tomax = xamot;
buf->dtb_xamot = tomax;
buf->dtb_xamot_drops = buf->dtb_drops;
buf->dtb_xamot_offset = buf->dtb_offset;
buf->dtb_xamot_errors = buf->dtb_errors;
buf->dtb_xamot_flags = buf->dtb_flags;
buf->dtb_offset = 0;
buf->dtb_drops = 0;
buf->dtb_errors = 0;
buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
dtrace_interrupt_enable(cookie);
}
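/*
 * Purely illustrative (not a statement of every caller): the switch above
 * is typically driven from cross call context elsewhere in this file,
 * along the lines of
 *
 *	dtrace_xcall(cpu, (dtrace_xcall_t)dtrace_buffer_switch, buf);
 *
 * so that the swap of dtb_tomax and dtb_xamot runs on the CPU that owns
 * the buffer, with interrupts disabled for the duration of the swap.
 */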
/*
* Note: called from cross call context. This function activates a buffer
* on a CPU. As with dtrace_buffer_switch(), the atomicity of the operation
* is guaranteed by the disabling of interrupts.
*/
static void
dtrace_buffer_activate(dtrace_state_t *state)
{
dtrace_buffer_t *buf;
dtrace_icookie_t cookie = dtrace_interrupt_disable();
buf = &state->dts_buffer[curcpu];
if (buf->dtb_tomax != NULL) {
/*
* We might like to assert that the buffer is marked inactive,
* but this isn't necessarily true: the buffer for the CPU
* that processes the BEGIN probe has its buffer activated
* manually. In this case, we take the (harmless) action of
* re-clearing the INACTIVE bit.
*/
buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
}
dtrace_interrupt_enable(cookie);
}
static int
dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags,
processorid_t cpu)
{
#if defined(sun)
cpu_t *cp;
#endif
dtrace_buffer_t *buf;
#if defined(sun)
ASSERT(MUTEX_HELD(&cpu_lock));
ASSERT(MUTEX_HELD(&dtrace_lock));
if (size > dtrace_nonroot_maxsize &&
!PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
return (EFBIG);
cp = cpu_list;
do {
if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
continue;
buf = &bufs[cp->cpu_id];
/*
* If there is already a buffer allocated for this CPU, it
* is only possible that this is a DR event. In this case,
* the buffer size must match our specified size.
*/
if (buf->dtb_tomax != NULL) {
ASSERT(buf->dtb_size == size);
continue;
}
ASSERT(buf->dtb_xamot == NULL);
if ((buf->dtb_tomax = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
goto err;
buf->dtb_size = size;
buf->dtb_flags = flags;
buf->dtb_offset = 0;
buf->dtb_drops = 0;
if (flags & DTRACEBUF_NOSWITCH)
continue;
if ((buf->dtb_xamot = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
goto err;
} while ((cp = cp->cpu_next) != cpu_list);
return (0);
err:
cp = cpu_list;
do {
if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
continue;
buf = &bufs[cp->cpu_id];
if (buf->dtb_xamot != NULL) {
ASSERT(buf->dtb_tomax != NULL);
ASSERT(buf->dtb_size == size);
kmem_free(buf->dtb_xamot, size);
}
if (buf->dtb_tomax != NULL) {
ASSERT(buf->dtb_size == size);
kmem_free(buf->dtb_tomax, size);
}
buf->dtb_tomax = NULL;
buf->dtb_xamot = NULL;
buf->dtb_size = 0;
} while ((cp = cp->cpu_next) != cpu_list);
return (ENOMEM);
#else
int i;
#if defined(__amd64__)
/*
* FreeBSD isn't good at limiting the amount of memory we
* ask to malloc, so let's place a limit here before trying
* to do something that might well end in tears at bedtime.
*/
if (size > physmem * PAGE_SIZE / (128 * (mp_maxid + 1)))
return(ENOMEM);
#endif
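/*
 * A rough illustration of the limit above, with hypothetical numbers:
 * given 4GB of physical memory (physmem * PAGE_SIZE) and 8 CPUs
 * (mp_maxid + 1 == 8), the largest per-CPU buffer accepted here is
 * 4GB / (128 * 8) = 4MB.
 */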
ASSERT(MUTEX_HELD(&dtrace_lock));
CPU_FOREACH(i) {
if (cpu != DTRACE_CPUALL && cpu != i)
continue;
buf = &bufs[i];
/*
* If there is already a buffer allocated for this CPU, it
* is only possible that this is a DR event. In this case,
* the buffer size must match our specified size.
*/
if (buf->dtb_tomax != NULL) {
ASSERT(buf->dtb_size == size);
continue;
}
ASSERT(buf->dtb_xamot == NULL);
if ((buf->dtb_tomax = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
goto err;
buf->dtb_size = size;
buf->dtb_flags = flags;
buf->dtb_offset = 0;
buf->dtb_drops = 0;
if (flags & DTRACEBUF_NOSWITCH)
continue;
if ((buf->dtb_xamot = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
goto err;
}
return (0);
err:
/*
* Error allocating memory, so free the buffers that were
* allocated before the failed allocation.
*/
CPU_FOREACH(i) {
if (cpu != DTRACE_CPUALL && cpu != i)
continue;
buf = &bufs[i];
if (buf->dtb_xamot != NULL) {
ASSERT(buf->dtb_tomax != NULL);
ASSERT(buf->dtb_size == size);
kmem_free(buf->dtb_xamot, size);
}
if (buf->dtb_tomax != NULL) {
ASSERT(buf->dtb_size == size);
kmem_free(buf->dtb_tomax, size);
}
buf->dtb_tomax = NULL;
buf->dtb_xamot = NULL;
buf->dtb_size = 0;
}
return (ENOMEM);
#endif
}
/*
* Note: called from probe context. This function just increments the drop
* count on a buffer. It has been made a function to allow for the
* possibility of understanding the source of mysterious drop counts. (A
* problem for which one may be particularly disappointed that DTrace cannot
* be used to understand DTrace.)
*/
static void
dtrace_buffer_drop(dtrace_buffer_t *buf)
{
buf->dtb_drops++;
}
/*
* Note: called from probe context. This function is called to reserve space
* in a buffer. If mstate is non-NULL, sets the scratch base and size in the
* mstate. Returns the new offset in the buffer, or a negative value if an
* error has occurred.
*/
static intptr_t
dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
dtrace_state_t *state, dtrace_mstate_t *mstate)
{
intptr_t offs = buf->dtb_offset, soffs;
intptr_t woffs;
caddr_t tomax;
size_t total;
if (buf->dtb_flags & DTRACEBUF_INACTIVE)
return (-1);
if ((tomax = buf->dtb_tomax) == NULL) {
dtrace_buffer_drop(buf);
return (-1);
}
if (!(buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL))) {
while (offs & (align - 1)) {
/*
* Assert that our alignment is off by a number which
* is itself sizeof (uint32_t) aligned.
*/
ASSERT(!((align - (offs & (align - 1))) &
(sizeof (uint32_t) - 1)));
DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
offs += sizeof (uint32_t);
}
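/*
 * A worked example of the padding loop above (illustrative numbers only):
 * with offs = 12 and align = 8, a single 4-byte DTRACE_EPIDNONE pad word
 * is stored at offset 12 and offs advances to 16, at which point the
 * requested alignment is satisfied.
 */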
if ((soffs = offs + needed) > buf->dtb_size) {
dtrace_buffer_drop(buf);
return (-1);
}
if (mstate == NULL)
return (offs);
mstate->dtms_scratch_base = (uintptr_t)tomax + soffs;
mstate->dtms_scratch_size = buf->dtb_size - soffs;
mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
return (offs);
}
if (buf->dtb_flags & DTRACEBUF_FILL) {
if (state->dts_activity != DTRACE_ACTIVITY_COOLDOWN &&
(buf->dtb_flags & DTRACEBUF_FULL))
return (-1);
goto out;
}
total = needed + (offs & (align - 1));
/*
* For a ring buffer, life is quite a bit more complicated. Before
* we can store any padding, we need to adjust our wrapping offset.
* (If we've never before wrapped or we're not about to, no adjustment
* is required.)
*/
if ((buf->dtb_flags & DTRACEBUF_WRAPPED) ||
offs + total > buf->dtb_size) {
woffs = buf->dtb_xamot_offset;
if (offs + total > buf->dtb_size) {
/*
* We can't fit in the end of the buffer. First, a
* sanity check that we can fit in the buffer at all.
*/
if (total > buf->dtb_size) {
dtrace_buffer_drop(buf);
return (-1);
}
/*
* We're going to be storing at the top of the buffer,
* so now we need to deal with the wrapped offset. We
* only reset our wrapped offset to 0 if it is
* currently greater than the current offset. If it
* is less than the current offset, it is because a
* previous allocation induced a wrap -- but the
* allocation didn't subsequently take the space due
* to an error or false predicate evaluation. In this
* case, we'll just leave the wrapped offset alone: if
* the wrapped offset hasn't been advanced far enough
* for this allocation, it will be adjusted in the
* lower loop.
*/
if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
if (woffs >= offs)
woffs = 0;
} else {
woffs = 0;
}
/*
* Now we know that we're going to be storing to the
* top of the buffer and that there is room for us
* there. We need to clear the buffer from the current
* offset to the end (there may be old gunk there).
*/
while (offs < buf->dtb_size)
tomax[offs++] = 0;
/*
* We need to set our offset to zero. And because we
* are wrapping, we need to set the bit indicating as
* much. We can also adjust our needed space back
* down to the space required by the ECB -- we know
* that the top of the buffer is aligned.
*/
offs = 0;
total = needed;
buf->dtb_flags |= DTRACEBUF_WRAPPED;
} else {
/*
* There is room for us in the buffer, so we simply
* need to check the wrapped offset.
*/
if (woffs < offs) {
/*
* The wrapped offset is less than the offset.
* This can happen if we allocated buffer space
* that induced a wrap, but then we didn't
* subsequently take the space due to an error
* or false predicate evaluation. This is
* okay; we know that _this_ allocation isn't
* going to induce a wrap. We still can't
* reset the wrapped offset to be zero,
* however: the space may have been trashed in
* the previous failed probe attempt. But at
* least the wrapped offset doesn't need to
* be adjusted at all...
*/
goto out;
}
}
while (offs + total > woffs) {
dtrace_epid_t epid = *(uint32_t *)(tomax + woffs);
size_t size;
if (epid == DTRACE_EPIDNONE) {
size = sizeof (uint32_t);
} else {
ASSERT(epid <= state->dts_necbs);
ASSERT(state->dts_ecbs[epid - 1] != NULL);
size = state->dts_ecbs[epid - 1]->dte_size;
}
ASSERT(woffs + size <= buf->dtb_size);
ASSERT(size != 0);
if (woffs + size == buf->dtb_size) {
/*
* We've reached the end of the buffer; we want
* to set the wrapped offset to 0 and break
* out. However, if the offs is 0, then we're
* in a strange edge-condition: the amount of
* space that we want to reserve plus the size
* of the record that we're overwriting is
* greater than the size of the buffer. This
* is problematic because if we reserve the
* space but subsequently don't consume it (due
* to a failed predicate or error) the wrapped
* offset will be 0 -- yet the EPID at offset 0
* will not be committed. This situation is
* relatively easy to deal with: if we're in
* this case, the buffer is indistinguishable
* from one that hasn't wrapped; we need only
* finish the job by clearing the wrapped bit,
* explicitly setting the offset to be 0, and
* zero'ing out the old data in the buffer.
*/
if (offs == 0) {
buf->dtb_flags &= ~DTRACEBUF_WRAPPED;
buf->dtb_offset = 0;
woffs = total;
while (woffs < buf->dtb_size)
tomax[woffs++] = 0;
}
woffs = 0;
break;
}
woffs += size;
}
/*
* We have a wrapped offset. It may be that the wrapped offset
* has become zero -- that's okay.
*/
buf->dtb_xamot_offset = woffs;
}
out:
/*
* Now we can plow the buffer with any necessary padding.
*/
while (offs & (align - 1)) {
/*
* Assert that our alignment is off by a number which
* is itself sizeof (uint32_t) aligned.
*/
ASSERT(!((align - (offs & (align - 1))) &
(sizeof (uint32_t) - 1)));
DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
offs += sizeof (uint32_t);
}
if (buf->dtb_flags & DTRACEBUF_FILL) {
if (offs + needed > buf->dtb_size - state->dts_reserve) {
buf->dtb_flags |= DTRACEBUF_FULL;
return (-1);
}
}
if (mstate == NULL)
return (offs);
/*
* For ring buffers and fill buffers, the scratch space is always
* the inactive buffer.
*/
mstate->dtms_scratch_base = (uintptr_t)buf->dtb_xamot;
mstate->dtms_scratch_size = buf->dtb_size;
mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
return (offs);
}
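/*
 * An illustrative walk through the ring-buffer path above, with
 * hypothetical numbers: given dtb_size = 64, offs = 56 and an aligned
 * reservation of total = 16, the record cannot fit at the end of the
 * buffer.  The stale bytes at offsets 56 through 63 are zeroed, offs is
 * reset to 0 and DTRACEBUF_WRAPPED is set; records at the top of the
 * buffer are then consumed until the wrapped offset is at least the size
 * of the new reservation, and the resulting wrapped offset is recorded
 * in dtb_xamot_offset.
 */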
static void
dtrace_buffer_polish(dtrace_buffer_t *buf)
{
ASSERT(buf->dtb_flags & DTRACEBUF_RING);
ASSERT(MUTEX_HELD(&dtrace_lock));
if (!(buf->dtb_flags & DTRACEBUF_WRAPPED))
return;
/*
* We need to polish the ring buffer. There are three cases:
*
* - The first (and presumably most common) is that there is no gap
* between the buffer offset and the wrapped offset. In this case,
* there is nothing in the buffer that isn't valid data; we can
* mark the buffer as polished and return.
*
* - The second (less common than the first but still more common
* than the third) is that there is a gap between the buffer offset
* and the wrapped offset, and the wrapped offset is larger than the
* buffer offset. This can happen because of an alignment issue, or
* can happen because of a call to dtrace_buffer_reserve() that
* didn't subsequently consume the buffer space. In this case,
* we need to zero the data from the buffer offset to the wrapped
* offset.
*
* - The third (and least common) is that there is a gap between the
* buffer offset and the wrapped offset, but the wrapped offset is
* _less_ than the buffer offset. This can only happen because a
* call to dtrace_buffer_reserve() induced a wrap, but the space
* was not subsequently consumed. In this case, we need to zero the
* space from the offset to the end of the buffer _and_ from the
* top of the buffer to the wrapped offset.
*/
if (buf->dtb_offset < buf->dtb_xamot_offset) {
bzero(buf->dtb_tomax + buf->dtb_offset,
buf->dtb_xamot_offset - buf->dtb_offset);
}
if (buf->dtb_offset > buf->dtb_xamot_offset) {
bzero(buf->dtb_tomax + buf->dtb_offset,
buf->dtb_size - buf->dtb_offset);
bzero(buf->dtb_tomax, buf->dtb_xamot_offset);
}
}
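/*
 * A concrete instance of the third case described above (illustrative
 * numbers): with dtb_size = 64, dtb_offset = 56 and dtb_xamot_offset = 8,
 * the second bzero() clears bytes 56 through 63 and the third clears
 * bytes 0 through 7, leaving only committed records between the wrapped
 * offset and the buffer offset.
 */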
static void
dtrace_buffer_free(dtrace_buffer_t *bufs)
{
int i;
for (i = 0; i < NCPU; i++) {
dtrace_buffer_t *buf = &bufs[i];
if (buf->dtb_tomax == NULL) {
ASSERT(buf->dtb_xamot == NULL);
ASSERT(buf->dtb_size == 0);
continue;
}
if (buf->dtb_xamot != NULL) {
ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
kmem_free(buf->dtb_xamot, buf->dtb_size);
}
kmem_free(buf->dtb_tomax, buf->dtb_size);
buf->dtb_size = 0;
buf->dtb_tomax = NULL;
buf->dtb_xamot = NULL;
}
}
/*
* DTrace Enabling Functions
*/
static dtrace_enabling_t *
dtrace_enabling_create(dtrace_vstate_t *vstate)
{
dtrace_enabling_t *enab;
enab = kmem_zalloc(sizeof (dtrace_enabling_t), KM_SLEEP);
enab->dten_vstate = vstate;
return (enab);
}
static void
dtrace_enabling_add(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb)
{
dtrace_ecbdesc_t **ndesc;
size_t osize, nsize;
/*
* We can't add to enablings after we've enabled them, or after we've
* retained them.
*/
ASSERT(enab->dten_probegen == 0);
ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
if (enab->dten_ndesc < enab->dten_maxdesc) {
enab->dten_desc[enab->dten_ndesc++] = ecb;
return;
}
osize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);
if (enab->dten_maxdesc == 0) {
enab->dten_maxdesc = 1;
} else {
enab->dten_maxdesc <<= 1;
}
ASSERT(enab->dten_ndesc < enab->dten_maxdesc);
nsize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);
ndesc = kmem_zalloc(nsize, KM_SLEEP);
bcopy(enab->dten_desc, ndesc, osize);
if (enab->dten_desc != NULL)
kmem_free(enab->dten_desc, osize);
enab->dten_desc = ndesc;
enab->dten_desc[enab->dten_ndesc++] = ecb;
}
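/*
 * A note on the growth policy above: dten_maxdesc doubles on each
 * reallocation (1, 2, 4, 8, ...), so adding N ECB descriptions performs
 * O(N) total copy work even though an individual addition occasionally
 * has to bcopy() the entire array into a larger one.
 */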
static void
dtrace_enabling_addlike(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb,
dtrace_probedesc_t *pd)
{
dtrace_ecbdesc_t *new;
dtrace_predicate_t *pred;
dtrace_actdesc_t *act;
/*
* We're going to create a new ECB description that matches the
* specified ECB in every way, but has the specified probe description.
*/
new = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
if ((pred = ecb->dted_pred.dtpdd_predicate) != NULL)
dtrace_predicate_hold(pred);
for (act = ecb->dted_action; act != NULL; act = act->dtad_next)
dtrace_actdesc_hold(act);
new->dted_action = ecb->dted_action;
new->dted_pred = ecb->dted_pred;
new->dted_probe = *pd;
new->dted_uarg = ecb->dted_uarg;
dtrace_enabling_add(enab, new);
}
static void
dtrace_enabling_dump(dtrace_enabling_t *enab)
{
int i;
for (i = 0; i < enab->dten_ndesc; i++) {
dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe;
cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i,
desc->dtpd_provider, desc->dtpd_mod,
desc->dtpd_func, desc->dtpd_name);
}
}
static void
dtrace_enabling_destroy(dtrace_enabling_t *enab)
{
int i;
dtrace_ecbdesc_t *ep;
dtrace_vstate_t *vstate = enab->dten_vstate;
ASSERT(MUTEX_HELD(&dtrace_lock));
for (i = 0; i < enab->dten_ndesc; i++) {
dtrace_actdesc_t *act, *next;
dtrace_predicate_t *pred;
ep = enab->dten_desc[i];
if ((pred = ep->dted_pred.dtpdd_predicate) != NULL)
dtrace_predicate_release(pred, vstate);
for (act = ep->dted_action; act != NULL; act = next) {
next = act->dtad_next;
dtrace_actdesc_release(act, vstate);
}
kmem_free(ep, sizeof (dtrace_ecbdesc_t));
}
if (enab->dten_desc != NULL)
kmem_free(enab->dten_desc,
enab->dten_maxdesc * sizeof (dtrace_enabling_t *));
/*
* If this was a retained enabling, decrement the dts_nretained count
* and take it off of the dtrace_retained list.
*/
if (enab->dten_prev != NULL || enab->dten_next != NULL ||
dtrace_retained == enab) {
ASSERT(enab->dten_vstate->dtvs_state != NULL);
ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
enab->dten_vstate->dtvs_state->dts_nretained--;
}
if (enab->dten_prev == NULL) {
if (dtrace_retained == enab) {
dtrace_retained = enab->dten_next;
if (dtrace_retained != NULL)
dtrace_retained->dten_prev = NULL;
}
} else {
ASSERT(enab != dtrace_retained);
ASSERT(dtrace_retained != NULL);
enab->dten_prev->dten_next = enab->dten_next;
}
if (enab->dten_next != NULL) {
ASSERT(dtrace_retained != NULL);
enab->dten_next->dten_prev = enab->dten_prev;
}
kmem_free(enab, sizeof (dtrace_enabling_t));
}
static int
dtrace_enabling_retain(dtrace_enabling_t *enab)
{
dtrace_state_t *state;
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
ASSERT(enab->dten_vstate != NULL);
state = enab->dten_vstate->dtvs_state;
ASSERT(state != NULL);
/*
* We only allow each state to retain dtrace_retain_max enablings.
*/
if (state->dts_nretained >= dtrace_retain_max)
return (ENOSPC);
state->dts_nretained++;
if (dtrace_retained == NULL) {
dtrace_retained = enab;
return (0);
}
enab->dten_next = dtrace_retained;
dtrace_retained->dten_prev = enab;
dtrace_retained = enab;
return (0);
}
static int
dtrace_enabling_replicate(dtrace_state_t *state, dtrace_probedesc_t *match,
dtrace_probedesc_t *create)
{
dtrace_enabling_t *new, *enab;
int found = 0, err = ENOENT;
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(strlen(match->dtpd_provider) < DTRACE_PROVNAMELEN);
ASSERT(strlen(match->dtpd_mod) < DTRACE_MODNAMELEN);
ASSERT(strlen(match->dtpd_func) < DTRACE_FUNCNAMELEN);
ASSERT(strlen(match->dtpd_name) < DTRACE_NAMELEN);
new = dtrace_enabling_create(&state->dts_vstate);
/*
* Iterate over all retained enablings, looking for enablings that
* match the specified state.
*/
for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
int i;
/*
* dtvs_state can only be NULL for helper enablings -- and
* helper enablings can't be retained.
*/
ASSERT(enab->dten_vstate->dtvs_state != NULL);
if (enab->dten_vstate->dtvs_state != state)
continue;
/*
* Now iterate over each probe description; we're looking for
* an exact match to the specified probe description.
*/
for (i = 0; i < enab->dten_ndesc; i++) {
dtrace_ecbdesc_t *ep = enab->dten_desc[i];
dtrace_probedesc_t *pd = &ep->dted_probe;
if (strcmp(pd->dtpd_provider, match->dtpd_provider))
continue;
if (strcmp(pd->dtpd_mod, match->dtpd_mod))
continue;
if (strcmp(pd->dtpd_func, match->dtpd_func))
continue;
if (strcmp(pd->dtpd_name, match->dtpd_name))
continue;
/*
* We have a winning probe! Add it to our growing
* enabling.
*/
found = 1;
dtrace_enabling_addlike(new, ep, create);
}
}
if (!found || (err = dtrace_enabling_retain(new)) != 0) {
dtrace_enabling_destroy(new);
return (err);
}
return (0);
}
static void
dtrace_enabling_retract(dtrace_state_t *state)
{
dtrace_enabling_t *enab, *next;
ASSERT(MUTEX_HELD(&dtrace_lock));
/*
* Iterate over all retained enablings, destroy the enablings retained
* for the specified state.
*/
for (enab = dtrace_retained; enab != NULL; enab = next) {
next = enab->dten_next;
/*
* dtvs_state can only be NULL for helper enablings -- and
* helper enablings can't be retained.
*/
ASSERT(enab->dten_vstate->dtvs_state != NULL);
if (enab->dten_vstate->dtvs_state == state) {
ASSERT(state->dts_nretained > 0);
dtrace_enabling_destroy(enab);
}
}
ASSERT(state->dts_nretained == 0);
}
static int
dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched)
{
int i = 0;
int matched = 0;
ASSERT(MUTEX_HELD(&cpu_lock));
ASSERT(MUTEX_HELD(&dtrace_lock));
for (i = 0; i < enab->dten_ndesc; i++) {
dtrace_ecbdesc_t *ep = enab->dten_desc[i];
enab->dten_current = ep;
enab->dten_error = 0;
matched += dtrace_probe_enable(&ep->dted_probe, enab);
if (enab->dten_error != 0) {
/*
* If we get an error half-way through enabling the
* probes, we kick out -- perhaps with some number of
* them enabled. Leaving enabled probes enabled may
* be slightly confusing for user-level, but we expect
* that no one will attempt to actually drive on in
* the face of such errors. If this is an anonymous
* enabling (indicated with a NULL nmatched pointer),
* we cmn_err() a message. We aren't expecting to
* get such an error -- to the extent that it can exist at all,
* it would be a result of corrupted DOF in the driver
* properties.
*/
if (nmatched == NULL) {
cmn_err(CE_WARN, "dtrace_enabling_match() "
"error on %p: %d", (void *)ep,
enab->dten_error);
}
return (enab->dten_error);
}
}
enab->dten_probegen = dtrace_probegen;
if (nmatched != NULL)
*nmatched = matched;
return (0);
}
static void
dtrace_enabling_matchall(void)
{
dtrace_enabling_t *enab;
mutex_enter(&cpu_lock);
mutex_enter(&dtrace_lock);
/*
* Iterate over all retained enablings to see if any probes match
* against them. We only perform this operation on enablings for which
* we have sufficient permissions by virtue of being in the global zone
* or in the same zone as the DTrace client. Because we can be called
* after dtrace_detach() has been called, we cannot assert that there
* are retained enablings. We can safely load from dtrace_retained,
* however: the taskq_destroy() at the end of dtrace_detach() will
* block pending our completion.
*/
for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
#if defined(sun)
cred_t *cr = enab->dten_vstate->dtvs_state->dts_cred.dcr_cred;
if (INGLOBALZONE(curproc) || getzoneid() == crgetzoneid(cr))
#endif
(void) dtrace_enabling_match(enab, NULL);
}
mutex_exit(&dtrace_lock);
mutex_exit(&cpu_lock);
}
/*
* If an enabling is to be enabled without having matched probes (that is, if
* dtrace_state_go() is to be called on the underlying dtrace_state_t), the
* enabling must be _primed_ by creating an ECB for every ECB description.
* This must be done to assure that we know the number of speculations, the
* number of aggregations, the minimum buffer size needed, etc. before we
* transition out of DTRACE_ACTIVITY_INACTIVE. To do this without actually
* enabling any probes, we create ECBs for every ECB description, but with a
* NULL probe -- which is exactly what this function does.
*/
static void
dtrace_enabling_prime(dtrace_state_t *state)
{
dtrace_enabling_t *enab;
int i;
for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
ASSERT(enab->dten_vstate->dtvs_state != NULL);
if (enab->dten_vstate->dtvs_state != state)
continue;
/*
* We don't want to prime an enabling more than once, lest
* we allow a malicious user to induce resource exhaustion.
* (The ECBs that result from priming an enabling aren't
* leaked -- but they also aren't deallocated until the
* consumer state is destroyed.)
*/
if (enab->dten_primed)
continue;
for (i = 0; i < enab->dten_ndesc; i++) {
enab->dten_current = enab->dten_desc[i];
(void) dtrace_probe_enable(NULL, enab);
}
enab->dten_primed = 1;
}
}
/*
* Called to indicate that probes should be provided due to retained
* enablings. This is implemented in terms of dtrace_probe_provide(), but it
* must take an initial lap through the enabling calling the dtps_provide()
* entry point explicitly to allow for autocreated probes.
*/
static void
dtrace_enabling_provide(dtrace_provider_t *prv)
{
int i, all = 0;
dtrace_probedesc_t desc;
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(MUTEX_HELD(&dtrace_provider_lock));
if (prv == NULL) {
all = 1;
prv = dtrace_provider;
}
do {
dtrace_enabling_t *enab = dtrace_retained;
void *parg = prv->dtpv_arg;
for (; enab != NULL; enab = enab->dten_next) {
for (i = 0; i < enab->dten_ndesc; i++) {
desc = enab->dten_desc[i]->dted_probe;
mutex_exit(&dtrace_lock);
prv->dtpv_pops.dtps_provide(parg, &desc);
mutex_enter(&dtrace_lock);
}
}
} while (all && (prv = prv->dtpv_next) != NULL);
mutex_exit(&dtrace_lock);
dtrace_probe_provide(NULL, all ? NULL : prv);
mutex_enter(&dtrace_lock);
}
/*
* DTrace DOF Functions
*/
/*ARGSUSED*/
static void
dtrace_dof_error(dof_hdr_t *dof, const char *str)
{
if (dtrace_err_verbose)
cmn_err(CE_WARN, "failed to process DOF: %s", str);
#ifdef DTRACE_ERRDEBUG
dtrace_errdebug(str);
#endif
}
/*
* Create DOF out of a currently enabled state. Right now, we only create
* DOF containing the run-time options -- but this could be expanded to create
* complete DOF representing the enabled state.
*/
static dof_hdr_t *
dtrace_dof_create(dtrace_state_t *state)
{
dof_hdr_t *dof;
dof_sec_t *sec;
dof_optdesc_t *opt;
int i, len = sizeof (dof_hdr_t) +
roundup(sizeof (dof_sec_t), sizeof (uint64_t)) +
sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
ASSERT(MUTEX_HELD(&dtrace_lock));
dof = kmem_zalloc(len, KM_SLEEP);
dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0;
dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1;
dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2;
dof->dofh_ident[DOF_ID_MAG3] = DOF_MAG_MAG3;
dof->dofh_ident[DOF_ID_MODEL] = DOF_MODEL_NATIVE;
dof->dofh_ident[DOF_ID_ENCODING] = DOF_ENCODE_NATIVE;
dof->dofh_ident[DOF_ID_VERSION] = DOF_VERSION;
dof->dofh_ident[DOF_ID_DIFVERS] = DIF_VERSION;
dof->dofh_ident[DOF_ID_DIFIREG] = DIF_DIR_NREGS;
dof->dofh_ident[DOF_ID_DIFTREG] = DIF_DTR_NREGS;
dof->dofh_flags = 0;
dof->dofh_hdrsize = sizeof (dof_hdr_t);
dof->dofh_secsize = sizeof (dof_sec_t);
dof->dofh_secnum = 1; /* only DOF_SECT_OPTDESC */
dof->dofh_secoff = sizeof (dof_hdr_t);
dof->dofh_loadsz = len;
dof->dofh_filesz = len;
dof->dofh_pad = 0;
/*
* Fill in the option section header...
*/
sec = (dof_sec_t *)((uintptr_t)dof + sizeof (dof_hdr_t));
sec->dofs_type = DOF_SECT_OPTDESC;
sec->dofs_align = sizeof (uint64_t);
sec->dofs_flags = DOF_SECF_LOAD;
sec->dofs_entsize = sizeof (dof_optdesc_t);
opt = (dof_optdesc_t *)((uintptr_t)sec +
roundup(sizeof (dof_sec_t), sizeof (uint64_t)));
sec->dofs_offset = (uintptr_t)opt - (uintptr_t)dof;
sec->dofs_size = sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
for (i = 0; i < DTRACEOPT_MAX; i++) {
opt[i].dofo_option = i;
opt[i].dofo_strtab = DOF_SECIDX_NONE;
opt[i].dofo_value = state->dts_options[i];
}
return (dof);
}
static dof_hdr_t *
dtrace_dof_copyin(uintptr_t uarg, int *errp)
{
dof_hdr_t hdr, *dof;
ASSERT(!MUTEX_HELD(&dtrace_lock));
/*
* First, we're going to copyin() the sizeof (dof_hdr_t).
*/
if (copyin((void *)uarg, &hdr, sizeof (hdr)) != 0) {
dtrace_dof_error(NULL, "failed to copyin DOF header");
*errp = EFAULT;
return (NULL);
}
/*
* Now we'll allocate the entire DOF and copy it in -- provided
* that the length isn't outrageous.
*/
if (hdr.dofh_loadsz >= dtrace_dof_maxsize) {
dtrace_dof_error(&hdr, "load size exceeds maximum");
*errp = E2BIG;
return (NULL);
}
if (hdr.dofh_loadsz < sizeof (hdr)) {
dtrace_dof_error(&hdr, "invalid load size");
*errp = EINVAL;
return (NULL);
}
dof = kmem_alloc(hdr.dofh_loadsz, KM_SLEEP);
if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0) {
kmem_free(dof, hdr.dofh_loadsz);
*errp = EFAULT;
return (NULL);
}
return (dof);
}
#if !defined(sun)
static __inline uchar_t
dtrace_dof_char(char c) {
switch (c) {
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
return (c - '0');
case 'A':
case 'B':
case 'C':
case 'D':
case 'E':
case 'F':
return (c - 'A' + 10);
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
return (c - 'a' + 10);
}
/* Should not reach here. */
return (0);
}
#endif
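/*
 * dtrace_dof_char() is used below to decode DOF that has been stashed in
 * the kernel environment as a hex string.  As a worked example with a
 * hypothetical input, the character pair "7f" decodes to
 *
 *	(dtrace_dof_char('7') << 4) | dtrace_dof_char('f') == 0x7f
 *
 * so an environment value of 2 * len characters yields len bytes of DOF.
 */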
static dof_hdr_t *
dtrace_dof_property(const char *name)
{
uchar_t *buf;
uint64_t loadsz;
unsigned int len, i;
dof_hdr_t *dof;
#if defined(sun)
/*
* Unfortunately, arrays of values in .conf files are always (and
* only) interpreted to be integer arrays. We must read our DOF
* as an integer array, and then squeeze it into a byte array.
*/
if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dtrace_devi, 0,
(char *)name, (int **)&buf, &len) != DDI_PROP_SUCCESS)
return (NULL);
for (i = 0; i < len; i++)
buf[i] = (uchar_t)(((int *)buf)[i]);
if (len < sizeof (dof_hdr_t)) {
ddi_prop_free(buf);
dtrace_dof_error(NULL, "truncated header");
return (NULL);
}
if (len < (loadsz = ((dof_hdr_t *)buf)->dofh_loadsz)) {
ddi_prop_free(buf);
dtrace_dof_error(NULL, "truncated DOF");
return (NULL);
}
if (loadsz >= dtrace_dof_maxsize) {
ddi_prop_free(buf);
dtrace_dof_error(NULL, "oversized DOF");
return (NULL);
}
dof = kmem_alloc(loadsz, KM_SLEEP);
bcopy(buf, dof, loadsz);
ddi_prop_free(buf);
#else
char *p;
char *p_env;
if ((p_env = getenv(name)) == NULL)
return (NULL);
len = strlen(p_env) / 2;
buf = kmem_alloc(len, KM_SLEEP);
dof = (dof_hdr_t *) buf;
p = p_env;
for (i = 0; i < len; i++) {
buf[i] = (dtrace_dof_char(p[0]) << 4) |
dtrace_dof_char(p[1]);
p += 2;
}
freeenv(p_env);
if (len < sizeof (dof_hdr_t)) {
kmem_free(buf, 0);
dtrace_dof_error(NULL, "truncated header");
return (NULL);
}
if (len < (loadsz = dof->dofh_loadsz)) {
kmem_free(buf, 0);
dtrace_dof_error(NULL, "truncated DOF");
return (NULL);
}
if (loadsz >= dtrace_dof_maxsize) {
kmem_free(buf, 0);
dtrace_dof_error(NULL, "oversized DOF");
return (NULL);
}
#endif
return (dof);
}
static void
dtrace_dof_destroy(dof_hdr_t *dof)
{
kmem_free(dof, dof->dofh_loadsz);
}
/*
* Return the dof_sec_t pointer corresponding to a given section index. If the
* index is not valid, dtrace_dof_error() is called and NULL is returned. If
* a type other than DOF_SECT_NONE is specified, the header is checked against
* this type and NULL is returned if the types do not match.
*/
static dof_sec_t *
dtrace_dof_sect(dof_hdr_t *dof, uint32_t type, dof_secidx_t i)
{
dof_sec_t *sec = (dof_sec_t *)(uintptr_t)
((uintptr_t)dof + dof->dofh_secoff + i * dof->dofh_secsize);
if (i >= dof->dofh_secnum) {
dtrace_dof_error(dof, "referenced section index is invalid");
return (NULL);
}
if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
dtrace_dof_error(dof, "referenced section is not loadable");
return (NULL);
}
if (type != DOF_SECT_NONE && type != sec->dofs_type) {
dtrace_dof_error(dof, "referenced section is the wrong type");
return (NULL);
}
return (sec);
}
static dtrace_probedesc_t *
dtrace_dof_probedesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_probedesc_t *desc)
{
dof_probedesc_t *probe;
dof_sec_t *strtab;
uintptr_t daddr = (uintptr_t)dof;
uintptr_t str;
size_t size;
if (sec->dofs_type != DOF_SECT_PROBEDESC) {
dtrace_dof_error(dof, "invalid probe section");
return (NULL);
}
if (sec->dofs_align != sizeof (dof_secidx_t)) {
dtrace_dof_error(dof, "bad alignment in probe description");
return (NULL);
}
if (sec->dofs_offset + sizeof (dof_probedesc_t) > dof->dofh_loadsz) {
dtrace_dof_error(dof, "truncated probe description");
return (NULL);
}
probe = (dof_probedesc_t *)(uintptr_t)(daddr + sec->dofs_offset);
strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, probe->dofp_strtab);
if (strtab == NULL)
return (NULL);
str = daddr + strtab->dofs_offset;
size = strtab->dofs_size;
if (probe->dofp_provider >= strtab->dofs_size) {
dtrace_dof_error(dof, "corrupt probe provider");
return (NULL);
}
(void) strncpy(desc->dtpd_provider,
(char *)(str + probe->dofp_provider),
MIN(DTRACE_PROVNAMELEN - 1, size - probe->dofp_provider));
if (probe->dofp_mod >= strtab->dofs_size) {
dtrace_dof_error(dof, "corrupt probe module");
return (NULL);
}
(void) strncpy(desc->dtpd_mod, (char *)(str + probe->dofp_mod),
MIN(DTRACE_MODNAMELEN - 1, size - probe->dofp_mod));
if (probe->dofp_func >= strtab->dofs_size) {
dtrace_dof_error(dof, "corrupt probe function");
return (NULL);
}
(void) strncpy(desc->dtpd_func, (char *)(str + probe->dofp_func),
MIN(DTRACE_FUNCNAMELEN - 1, size - probe->dofp_func));
if (probe->dofp_name >= strtab->dofs_size) {
dtrace_dof_error(dof, "corrupt probe name");
return (NULL);
}
(void) strncpy(desc->dtpd_name, (char *)(str + probe->dofp_name),
MIN(DTRACE_NAMELEN - 1, size - probe->dofp_name));
return (desc);
}
static dtrace_difo_t *
dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
cred_t *cr)
{
dtrace_difo_t *dp;
size_t ttl = 0;
dof_difohdr_t *dofd;
uintptr_t daddr = (uintptr_t)dof;
size_t max = dtrace_difo_maxsize;
int i, l, n;
static const struct {
int section;
int bufoffs;
int lenoffs;
int entsize;
int align;
const char *msg;
} difo[] = {
{ DOF_SECT_DIF, offsetof(dtrace_difo_t, dtdo_buf),
offsetof(dtrace_difo_t, dtdo_len), sizeof (dif_instr_t),
sizeof (dif_instr_t), "multiple DIF sections" },
{ DOF_SECT_INTTAB, offsetof(dtrace_difo_t, dtdo_inttab),
offsetof(dtrace_difo_t, dtdo_intlen), sizeof (uint64_t),
sizeof (uint64_t), "multiple integer tables" },
{ DOF_SECT_STRTAB, offsetof(dtrace_difo_t, dtdo_strtab),
offsetof(dtrace_difo_t, dtdo_strlen), 0,
sizeof (char), "multiple string tables" },
{ DOF_SECT_VARTAB, offsetof(dtrace_difo_t, dtdo_vartab),
offsetof(dtrace_difo_t, dtdo_varlen), sizeof (dtrace_difv_t),
sizeof (uint_t), "multiple variable tables" },
{ DOF_SECT_NONE, 0, 0, 0, 0, NULL }
};
if (sec->dofs_type != DOF_SECT_DIFOHDR) {
dtrace_dof_error(dof, "invalid DIFO header section");
return (NULL);
}
if (sec->dofs_align != sizeof (dof_secidx_t)) {
dtrace_dof_error(dof, "bad alignment in DIFO header");
return (NULL);
}
if (sec->dofs_size < sizeof (dof_difohdr_t) ||
sec->dofs_size % sizeof (dof_secidx_t)) {
dtrace_dof_error(dof, "bad size in DIFO header");
return (NULL);
}
dofd = (dof_difohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
n = (sec->dofs_size - sizeof (*dofd)) / sizeof (dof_secidx_t) + 1;
dp = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
dp->dtdo_rtype = dofd->dofd_rtype;
for (l = 0; l < n; l++) {
dof_sec_t *subsec;
void **bufp;
uint32_t *lenp;
if ((subsec = dtrace_dof_sect(dof, DOF_SECT_NONE,
dofd->dofd_links[l])) == NULL)
goto err; /* invalid section link */
if (ttl + subsec->dofs_size > max) {
dtrace_dof_error(dof, "exceeds maximum size");
goto err;
}
ttl += subsec->dofs_size;
for (i = 0; difo[i].section != DOF_SECT_NONE; i++) {
if (subsec->dofs_type != difo[i].section)
continue;
if (!(subsec->dofs_flags & DOF_SECF_LOAD)) {
dtrace_dof_error(dof, "section not loaded");
goto err;
}
if (subsec->dofs_align != difo[i].align) {
dtrace_dof_error(dof, "bad alignment");
goto err;
}
bufp = (void **)((uintptr_t)dp + difo[i].bufoffs);
lenp = (uint32_t *)((uintptr_t)dp + difo[i].lenoffs);
if (*bufp != NULL) {
dtrace_dof_error(dof, difo[i].msg);
goto err;
}
if (difo[i].entsize != subsec->dofs_entsize) {
dtrace_dof_error(dof, "entry size mismatch");
goto err;
}
if (subsec->dofs_entsize != 0 &&
(subsec->dofs_size % subsec->dofs_entsize) != 0) {
dtrace_dof_error(dof, "corrupt entry size");
goto err;
}
*lenp = subsec->dofs_size;
*bufp = kmem_alloc(subsec->dofs_size, KM_SLEEP);
bcopy((char *)(uintptr_t)(daddr + subsec->dofs_offset),
*bufp, subsec->dofs_size);
if (subsec->dofs_entsize != 0)
*lenp /= subsec->dofs_entsize;
break;
}
/*
* If we encounter a loadable DIFO sub-section that is not
* known to us, assume this is a broken program and fail.
*/
if (difo[i].section == DOF_SECT_NONE &&
(subsec->dofs_flags & DOF_SECF_LOAD)) {
dtrace_dof_error(dof, "unrecognized DIFO subsection");
goto err;
}
}
if (dp->dtdo_buf == NULL) {
/*
* We can't have a DIF object without DIF text.
*/
dtrace_dof_error(dof, "missing DIF text");
goto err;
}
/*
* Before we validate the DIF object, run through the variable table
* looking for the strings -- if any of their sizes are unset, we'll set
* their size to be the system-wide default string size. Note that
* this should _not_ happen if the "strsize" option has been set --
* in this case, the compiler should have set the size to reflect the
* setting of the option.
*/
for (i = 0; i < dp->dtdo_varlen; i++) {
dtrace_difv_t *v = &dp->dtdo_vartab[i];
dtrace_diftype_t *t = &v->dtdv_type;
if (v->dtdv_id < DIF_VAR_OTHER_UBASE)
continue;
if (t->dtdt_kind == DIF_TYPE_STRING && t->dtdt_size == 0)
t->dtdt_size = dtrace_strsize_default;
}
if (dtrace_difo_validate(dp, vstate, DIF_DIR_NREGS, cr) != 0)
goto err;
dtrace_difo_init(dp, vstate);
return (dp);
err:
kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
kmem_free(dp, sizeof (dtrace_difo_t));
return (NULL);
}
static dtrace_predicate_t *
dtrace_dof_predicate(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
cred_t *cr)
{
dtrace_difo_t *dp;
if ((dp = dtrace_dof_difo(dof, sec, vstate, cr)) == NULL)
return (NULL);
return (dtrace_predicate_create(dp));
}
static dtrace_actdesc_t *
dtrace_dof_actdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
cred_t *cr)
{
dtrace_actdesc_t *act, *first = NULL, *last = NULL, *next;
dof_actdesc_t *desc;
dof_sec_t *difosec;
size_t offs;
uintptr_t daddr = (uintptr_t)dof;
uint64_t arg;
dtrace_actkind_t kind;
if (sec->dofs_type != DOF_SECT_ACTDESC) {
dtrace_dof_error(dof, "invalid action section");
return (NULL);
}
if (sec->dofs_offset + sizeof (dof_actdesc_t) > dof->dofh_loadsz) {
dtrace_dof_error(dof, "truncated action description");
return (NULL);
}
if (sec->dofs_align != sizeof (uint64_t)) {
dtrace_dof_error(dof, "bad alignment in action description");
return (NULL);
}
if (sec->dofs_size < sec->dofs_entsize) {
dtrace_dof_error(dof, "section entry size exceeds total size");
return (NULL);
}
if (sec->dofs_entsize != sizeof (dof_actdesc_t)) {
dtrace_dof_error(dof, "bad entry size in action description");
return (NULL);
}
if (sec->dofs_size / sec->dofs_entsize > dtrace_actions_max) {
dtrace_dof_error(dof, "actions exceed dtrace_actions_max");
return (NULL);
}
for (offs = 0; offs < sec->dofs_size; offs += sec->dofs_entsize) {
desc = (dof_actdesc_t *)(daddr +
(uintptr_t)sec->dofs_offset + offs);
kind = (dtrace_actkind_t)desc->dofa_kind;
if (DTRACEACT_ISPRINTFLIKE(kind) &&
(kind != DTRACEACT_PRINTA ||
desc->dofa_strtab != DOF_SECIDX_NONE)) {
dof_sec_t *strtab;
char *str, *fmt;
uint64_t i;
/*
* printf()-like actions must have a format string.
*/
if ((strtab = dtrace_dof_sect(dof,
DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL)
goto err;
str = (char *)((uintptr_t)dof +
(uintptr_t)strtab->dofs_offset);
for (i = desc->dofa_arg; i < strtab->dofs_size; i++) {
if (str[i] == '\0')
break;
}
if (i >= strtab->dofs_size) {
dtrace_dof_error(dof, "bogus format string");
goto err;
}
if (i == desc->dofa_arg) {
dtrace_dof_error(dof, "empty format string");
goto err;
}
i -= desc->dofa_arg;
fmt = kmem_alloc(i + 1, KM_SLEEP);
bcopy(&str[desc->dofa_arg], fmt, i + 1);
arg = (uint64_t)(uintptr_t)fmt;
} else {
if (kind == DTRACEACT_PRINTA) {
ASSERT(desc->dofa_strtab == DOF_SECIDX_NONE);
arg = 0;
} else {
arg = desc->dofa_arg;
}
}
act = dtrace_actdesc_create(kind, desc->dofa_ntuple,
desc->dofa_uarg, arg);
if (last != NULL) {
last->dtad_next = act;
} else {
first = act;
}
last = act;
if (desc->dofa_difo == DOF_SECIDX_NONE)
continue;
if ((difosec = dtrace_dof_sect(dof,
DOF_SECT_DIFOHDR, desc->dofa_difo)) == NULL)
goto err;
act->dtad_difo = dtrace_dof_difo(dof, difosec, vstate, cr);
if (act->dtad_difo == NULL)
goto err;
}
ASSERT(first != NULL);
return (first);
err:
for (act = first; act != NULL; act = next) {
next = act->dtad_next;
dtrace_actdesc_release(act, vstate);
}
return (NULL);
}
static dtrace_ecbdesc_t *
dtrace_dof_ecbdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
cred_t *cr)
{
dtrace_ecbdesc_t *ep;
dof_ecbdesc_t *ecb;
dtrace_probedesc_t *desc;
dtrace_predicate_t *pred = NULL;
if (sec->dofs_size < sizeof (dof_ecbdesc_t)) {
dtrace_dof_error(dof, "truncated ECB description");
return (NULL);
}
if (sec->dofs_align != sizeof (uint64_t)) {
dtrace_dof_error(dof, "bad alignment in ECB description");
return (NULL);
}
ecb = (dof_ecbdesc_t *)((uintptr_t)dof + (uintptr_t)sec->dofs_offset);
sec = dtrace_dof_sect(dof, DOF_SECT_PROBEDESC, ecb->dofe_probes);
if (sec == NULL)
return (NULL);
ep = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
ep->dted_uarg = ecb->dofe_uarg;
desc = &ep->dted_probe;
if (dtrace_dof_probedesc(dof, sec, desc) == NULL)
goto err;
if (ecb->dofe_pred != DOF_SECIDX_NONE) {
if ((sec = dtrace_dof_sect(dof,
DOF_SECT_DIFOHDR, ecb->dofe_pred)) == NULL)
goto err;
if ((pred = dtrace_dof_predicate(dof, sec, vstate, cr)) == NULL)
goto err;
ep->dted_pred.dtpdd_predicate = pred;
}
if (ecb->dofe_actions != DOF_SECIDX_NONE) {
if ((sec = dtrace_dof_sect(dof,
DOF_SECT_ACTDESC, ecb->dofe_actions)) == NULL)
goto err;
ep->dted_action = dtrace_dof_actdesc(dof, sec, vstate, cr);
if (ep->dted_action == NULL)
goto err;
}
return (ep);
err:
if (pred != NULL)
dtrace_predicate_release(pred, vstate);
kmem_free(ep, sizeof (dtrace_ecbdesc_t));
return (NULL);
}
/*
* Apply the relocations from the specified 'sec' (a DOF_SECT_URELHDR) to the
* specified DOF. At present, this amounts to simply adding 'ubase' to the
* site of any user SETX relocations to account for the load object base address.
* In the future, if we need other relocations, this function can be extended.
*/
static int
dtrace_dof_relocate(dof_hdr_t *dof, dof_sec_t *sec, uint64_t ubase)
{
uintptr_t daddr = (uintptr_t)dof;
dof_relohdr_t *dofr =
(dof_relohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
dof_sec_t *ss, *rs, *ts;
dof_relodesc_t *r;
uint_t i, n;
if (sec->dofs_size < sizeof (dof_relohdr_t) ||
sec->dofs_align != sizeof (dof_secidx_t)) {
dtrace_dof_error(dof, "invalid relocation header");
return (-1);
}
ss = dtrace_dof_sect(dof, DOF_SECT_STRTAB, dofr->dofr_strtab);
rs = dtrace_dof_sect(dof, DOF_SECT_RELTAB, dofr->dofr_relsec);
ts = dtrace_dof_sect(dof, DOF_SECT_NONE, dofr->dofr_tgtsec);
if (ss == NULL || rs == NULL || ts == NULL)
return (-1); /* dtrace_dof_error() has been called already */
if (rs->dofs_entsize < sizeof (dof_relodesc_t) ||
rs->dofs_align != sizeof (uint64_t)) {
dtrace_dof_error(dof, "invalid relocation section");
return (-1);
}
r = (dof_relodesc_t *)(uintptr_t)(daddr + rs->dofs_offset);
n = rs->dofs_size / rs->dofs_entsize;
for (i = 0; i < n; i++) {
uintptr_t taddr = daddr + ts->dofs_offset + r->dofr_offset;
switch (r->dofr_type) {
case DOF_RELO_NONE:
break;
case DOF_RELO_SETX:
if (r->dofr_offset >= ts->dofs_size || r->dofr_offset +
sizeof (uint64_t) > ts->dofs_size) {
dtrace_dof_error(dof, "bad relocation offset");
return (-1);
}
if (!IS_P2ALIGNED(taddr, sizeof (uint64_t))) {
dtrace_dof_error(dof, "misaligned setx relo");
return (-1);
}
*(uint64_t *)taddr += ubase;
break;
default:
dtrace_dof_error(dof, "invalid relocation type");
return (-1);
}
r = (dof_relodesc_t *)((uintptr_t)r + rs->dofs_entsize);
}
return (0);
}
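/*
 * For illustration, with hypothetical values: if the load object carrying
 * the DOF is mapped at ubase = 0x800000000 and a DOF_RELO_SETX entry
 * targets offset 0x10 of the target section, the 64-bit value stored at
 * that offset simply has 0x800000000 added to it, converting a link-time
 * offset into a run-time address.
 */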
/*
* The dof_hdr_t passed to dtrace_dof_slurp() should be a partially validated
* header: it should be at the front of a memory region that is at least
* sizeof (dof_hdr_t) in size -- and then at least dof_hdr.dofh_loadsz in
* size. It need not be validated in any other way.
*/
static int
dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
dtrace_enabling_t **enabp, uint64_t ubase, int noprobes)
{
uint64_t len = dof->dofh_loadsz, seclen;
uintptr_t daddr = (uintptr_t)dof;
dtrace_ecbdesc_t *ep;
dtrace_enabling_t *enab;
uint_t i;
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(dof->dofh_loadsz >= sizeof (dof_hdr_t));
/*
* Check the DOF header identification bytes. In addition to checking
* valid settings, we also verify that unused bits/bytes are zeroed so
* we can use them later without fear of regressing existing binaries.
*/
if (bcmp(&dof->dofh_ident[DOF_ID_MAG0],
DOF_MAG_STRING, DOF_MAG_STRLEN) != 0) {
dtrace_dof_error(dof, "DOF magic string mismatch");
return (-1);
}
if (dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_ILP32 &&
dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_LP64) {
dtrace_dof_error(dof, "DOF has invalid data model");
return (-1);
}
if (dof->dofh_ident[DOF_ID_ENCODING] != DOF_ENCODE_NATIVE) {
dtrace_dof_error(dof, "DOF encoding mismatch");
return (-1);
}
if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_2) {
dtrace_dof_error(dof, "DOF version mismatch");
return (-1);
}
if (dof->dofh_ident[DOF_ID_DIFVERS] != DIF_VERSION_2) {
dtrace_dof_error(dof, "DOF uses unsupported instruction set");
return (-1);
}
if (dof->dofh_ident[DOF_ID_DIFIREG] > DIF_DIR_NREGS) {
dtrace_dof_error(dof, "DOF uses too many integer registers");
return (-1);
}
if (dof->dofh_ident[DOF_ID_DIFTREG] > DIF_DTR_NREGS) {
dtrace_dof_error(dof, "DOF uses too many tuple registers");
return (-1);
}
for (i = DOF_ID_PAD; i < DOF_ID_SIZE; i++) {
if (dof->dofh_ident[i] != 0) {
dtrace_dof_error(dof, "DOF has invalid ident byte set");
return (-1);
}
}
if (dof->dofh_flags & ~DOF_FL_VALID) {
dtrace_dof_error(dof, "DOF has invalid flag bits set");
return (-1);
}
if (dof->dofh_secsize == 0) {
dtrace_dof_error(dof, "zero section header size");
return (-1);
}
/*
* Check that the section headers don't exceed the amount of DOF
* data. Note that we cast the section size and number of sections
* to uint64_t's to prevent possible overflow in the multiplication.
*/
seclen = (uint64_t)dof->dofh_secnum * (uint64_t)dof->dofh_secsize;
if (dof->dofh_secoff > len || seclen > len ||
dof->dofh_secoff + seclen > len) {
dtrace_dof_error(dof, "truncated section headers");
return (-1);
}
if (!IS_P2ALIGNED(dof->dofh_secoff, sizeof (uint64_t))) {
dtrace_dof_error(dof, "misaligned section headers");
return (-1);
}
if (!IS_P2ALIGNED(dof->dofh_secsize, sizeof (uint64_t))) {
dtrace_dof_error(dof, "misaligned section size");
return (-1);
}
/*
* Take an initial pass through the section headers to be sure that
* the headers don't have stray offsets. If the 'noprobes' flag is
* set, do not permit sections relating to providers, probes, or args.
*/
for (i = 0; i < dof->dofh_secnum; i++) {
dof_sec_t *sec = (dof_sec_t *)(daddr +
(uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
if (noprobes) {
switch (sec->dofs_type) {
case DOF_SECT_PROVIDER:
case DOF_SECT_PROBES:
case DOF_SECT_PRARGS:
case DOF_SECT_PROFFS:
dtrace_dof_error(dof, "illegal sections "
"for enabling");
return (-1);
}
}
if (!(sec->dofs_flags & DOF_SECF_LOAD))
continue; /* just ignore non-loadable sections */
if (sec->dofs_align & (sec->dofs_align - 1)) {
dtrace_dof_error(dof, "bad section alignment");
return (-1);
}
if (sec->dofs_offset & (sec->dofs_align - 1)) {
dtrace_dof_error(dof, "misaligned section");
return (-1);
}
if (sec->dofs_offset > len || sec->dofs_size > len ||
sec->dofs_offset + sec->dofs_size > len) {
dtrace_dof_error(dof, "corrupt section header");
return (-1);
}
if (sec->dofs_type == DOF_SECT_STRTAB && *((char *)daddr +
sec->dofs_offset + sec->dofs_size - 1) != '\0') {
dtrace_dof_error(dof, "non-terminating string table");
return (-1);
}
}
/*
* Take a second pass through the sections and locate and perform any
* relocations that are present. We do this after the first pass to
* be sure that all sections have had their headers validated.
*/
for (i = 0; i < dof->dofh_secnum; i++) {
dof_sec_t *sec = (dof_sec_t *)(daddr +
(uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
if (!(sec->dofs_flags & DOF_SECF_LOAD))
continue; /* skip sections that are not loadable */
switch (sec->dofs_type) {
case DOF_SECT_URELHDR:
if (dtrace_dof_relocate(dof, sec, ubase) != 0)
return (-1);
break;
}
}
if ((enab = *enabp) == NULL)
enab = *enabp = dtrace_enabling_create(vstate);
for (i = 0; i < dof->dofh_secnum; i++) {
dof_sec_t *sec = (dof_sec_t *)(daddr +
(uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
if (sec->dofs_type != DOF_SECT_ECBDESC)
continue;
if ((ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr)) == NULL) {
dtrace_enabling_destroy(enab);
*enabp = NULL;
return (-1);
}
dtrace_enabling_add(enab, ep);
}
return (0);
}
/*
* Process DOF for any options. This routine assumes that the DOF has been
* at least processed by dtrace_dof_slurp().
*/
static int
dtrace_dof_options(dof_hdr_t *dof, dtrace_state_t *state)
{
int i, rval;
uint32_t entsize;
size_t offs;
dof_optdesc_t *desc;
for (i = 0; i < dof->dofh_secnum; i++) {
dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof +
(uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
if (sec->dofs_type != DOF_SECT_OPTDESC)
continue;
if (sec->dofs_align != sizeof (uint64_t)) {
dtrace_dof_error(dof, "bad alignment in "
"option description");
return (EINVAL);
}
if ((entsize = sec->dofs_entsize) == 0) {
dtrace_dof_error(dof, "zeroed option entry size");
return (EINVAL);
}
if (entsize < sizeof (dof_optdesc_t)) {
dtrace_dof_error(dof, "bad option entry size");
return (EINVAL);
}
for (offs = 0; offs < sec->dofs_size; offs += entsize) {
desc = (dof_optdesc_t *)((uintptr_t)dof +
(uintptr_t)sec->dofs_offset + offs);
if (desc->dofo_strtab != DOF_SECIDX_NONE) {
dtrace_dof_error(dof, "non-zero option string");
return (EINVAL);
}
if (desc->dofo_value == DTRACEOPT_UNSET) {
dtrace_dof_error(dof, "unset option");
return (EINVAL);
}
if ((rval = dtrace_state_option(state,
desc->dofo_option, desc->dofo_value)) != 0) {
dtrace_dof_error(dof, "rejected option");
return (rval);
}
}
}
return (0);
}
/*
* DTrace Consumer State Functions
*/
static int
dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
{
size_t hashsize, maxper, min, chunksize = dstate->dtds_chunksize;
void *base;
uintptr_t limit;
dtrace_dynvar_t *dvar, *next, *start;
int i;
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(dstate->dtds_base == NULL && dstate->dtds_percpu == NULL);
bzero(dstate, sizeof (dtrace_dstate_t));
if ((dstate->dtds_chunksize = chunksize) == 0)
dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
if (size < (min = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
size = min;
if ((base = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
return (ENOMEM);
dstate->dtds_size = size;
dstate->dtds_base = base;
dstate->dtds_percpu = kmem_cache_alloc(dtrace_state_cache, KM_SLEEP);
bzero(dstate->dtds_percpu, NCPU * sizeof (dtrace_dstate_percpu_t));
hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));
if (hashsize != 1 && (hashsize & 1))
hashsize--;
dstate->dtds_hashsize = hashsize;
dstate->dtds_hash = dstate->dtds_base;
/*
* Set all of our hash buckets to point to the single sink, and (if
* it hasn't already been set), set the sink's hash value to be the
* sink sentinel value. The sink is needed for dynamic variable
* lookups to know that they have iterated over an entire, valid hash
* chain.
*/
for (i = 0; i < hashsize; i++)
dstate->dtds_hash[i].dtdh_chain = &dtrace_dynhash_sink;
if (dtrace_dynhash_sink.dtdv_hashval != DTRACE_DYNHASH_SINK)
dtrace_dynhash_sink.dtdv_hashval = DTRACE_DYNHASH_SINK;
/*
* Determine number of active CPUs. Divide free list evenly among
* active CPUs.
*/
start = (dtrace_dynvar_t *)
((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
limit = (uintptr_t)base + size;
maxper = (limit - (uintptr_t)start) / NCPU;
maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
#if !defined(sun)
CPU_FOREACH(i) {
#else
for (i = 0; i < NCPU; i++) {
#endif
dstate->dtds_percpu[i].dtdsc_free = dvar = start;
/*
* If we don't even have enough chunks to make it once through
* NCPUs, we're just going to allocate everything to the first
* CPU. And if we're on the last CPU, we're going to allocate
* whatever is left over. In either case, we set the limit to
* be the limit of the dynamic variable space.
*/
if (maxper == 0 || i == NCPU - 1) {
limit = (uintptr_t)base + size;
start = NULL;
} else {
limit = (uintptr_t)start + maxper;
start = (dtrace_dynvar_t *)limit;
}
ASSERT(limit <= (uintptr_t)base + size);
for (;;) {
next = (dtrace_dynvar_t *)((uintptr_t)dvar +
dstate->dtds_chunksize);
if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
break;
dvar->dtdv_next = next;
dvar = next;
}
if (maxper == 0)
break;
}
return (0);
}
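/*
 * A sizing sketch for the arena carved up above, with purely hypothetical
 * numbers: assuming size = 64KB, dtds_chunksize = 256 and
 * sizeof (dtrace_dynhash_t) == 16, hashsize starts at 65536 / 272 = 240
 * and is decremented to 239 to keep it odd.  The space following the hash
 * table is then split into per-CPU free lists of dtds_chunksize-sized
 * dynamic variable chunks, with any remainder going to the last CPU.
 */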
static void
dtrace_dstate_fini(dtrace_dstate_t *dstate)
{
ASSERT(MUTEX_HELD(&cpu_lock));
if (dstate->dtds_base == NULL)
return;
kmem_free(dstate->dtds_base, dstate->dtds_size);
kmem_cache_free(dtrace_state_cache, dstate->dtds_percpu);
}
static void
dtrace_vstate_fini(dtrace_vstate_t *vstate)
{
/*
* Logical XOR, where are you?
*/
ASSERT((vstate->dtvs_nglobals == 0) ^ (vstate->dtvs_globals != NULL));
if (vstate->dtvs_nglobals > 0) {
kmem_free(vstate->dtvs_globals, vstate->dtvs_nglobals *
sizeof (dtrace_statvar_t *));
}
if (vstate->dtvs_ntlocals > 0) {
kmem_free(vstate->dtvs_tlocals, vstate->dtvs_ntlocals *
sizeof (dtrace_difv_t));
}
ASSERT((vstate->dtvs_nlocals == 0) ^ (vstate->dtvs_locals != NULL));
if (vstate->dtvs_nlocals > 0) {
kmem_free(vstate->dtvs_locals, vstate->dtvs_nlocals *
sizeof (dtrace_statvar_t *));
}
}
#if defined(sun)
static void
dtrace_state_clean(dtrace_state_t *state)
{
if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
return;
dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
dtrace_speculation_clean(state);
}
static void
dtrace_state_deadman(dtrace_state_t *state)
{
hrtime_t now;
dtrace_sync();
now = dtrace_gethrtime();
if (state != dtrace_anon.dta_state &&
now - state->dts_laststatus >= dtrace_deadman_user)
return;
/*
* We must be sure that dts_alive never appears to be less than the
* value upon entry to dtrace_state_deadman(), and because we lack a
* dtrace_cas64(), we cannot store to it atomically. We thus instead
* store INT64_MAX to it, followed by a memory barrier, followed by
* the new value. This assures that dts_alive never appears to be
* less than its true value, regardless of the order in which the
* stores to the underlying storage are issued.
*/
state->dts_alive = INT64_MAX;
dtrace_membar_producer();
state->dts_alive = now;
}
#else
static void
dtrace_state_clean(void *arg)
{
dtrace_state_t *state = arg;
dtrace_optval_t *opt = state->dts_options;
if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
return;
dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
dtrace_speculation_clean(state);
callout_reset(&state->dts_cleaner, hz * opt[DTRACEOPT_CLEANRATE] / NANOSEC,
dtrace_state_clean, state);
}
static void
dtrace_state_deadman(void *arg)
{
dtrace_state_t *state = arg;
hrtime_t now;
dtrace_sync();
dtrace_debug_output();
now = dtrace_gethrtime();
if (state != dtrace_anon.dta_state &&
now - state->dts_laststatus >= dtrace_deadman_user)
return;
/*
* We must be sure that dts_alive never appears to be less than the
* value upon entry to dtrace_state_deadman(), and because we lack a
* dtrace_cas64(), we cannot store to it atomically. We thus instead
* store INT64_MAX to it, followed by a memory barrier, followed by
* the new value. This assures that dts_alive never appears to be
* less than its true value, regardless of the order in which the
* stores to the underlying storage are issued.
*/
state->dts_alive = INT64_MAX;
dtrace_membar_producer();
state->dts_alive = now;
callout_reset(&state->dts_deadman, hz * dtrace_deadman_interval / NANOSEC,
dtrace_state_deadman, state);
}
#endif
static dtrace_state_t *
#if defined(sun)
dtrace_state_create(dev_t *devp, cred_t *cr)
#else
dtrace_state_create(struct cdev *dev)
#endif
{
#if defined(sun)
minor_t minor;
major_t major;
#else
cred_t *cr = NULL;
int m = 0;
#endif
char c[30];
dtrace_state_t *state;
dtrace_optval_t *opt;
int bufsize = NCPU * sizeof (dtrace_buffer_t), i;
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(MUTEX_HELD(&cpu_lock));
#if defined(sun)
minor = (minor_t)(uintptr_t)vmem_alloc(dtrace_minor, 1,
VM_BESTFIT | VM_SLEEP);
if (ddi_soft_state_zalloc(dtrace_softstate, minor) != DDI_SUCCESS) {
vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
return (NULL);
}
state = ddi_get_soft_state(dtrace_softstate, minor);
#else
if (dev != NULL) {
cr = dev->si_cred;
m = dev2unit(dev);
}
/* Allocate memory for the state. */
state = kmem_zalloc(sizeof(dtrace_state_t), KM_SLEEP);
#endif
state->dts_epid = DTRACE_EPIDNONE + 1;
(void) snprintf(c, sizeof (c), "dtrace_aggid_%d", m);
#if defined(sun)
state->dts_aggid_arena = vmem_create(c, (void *)1, UINT32_MAX, 1,
NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
if (devp != NULL) {
major = getemajor(*devp);
} else {
major = ddi_driver_major(dtrace_devi);
}
state->dts_dev = makedevice(major, minor);
if (devp != NULL)
*devp = state->dts_dev;
#else
state->dts_aggid_arena = new_unrhdr(1, INT_MAX, &dtrace_unr_mtx);
state->dts_dev = dev;
#endif
/*
* We allocate NCPU buffers. On the one hand, this can be quite
* a bit of memory per instance (nearly 36K on a Starcat). On the
* other hand, it saves an additional memory reference in the probe
* path.
*/
state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
#if defined(sun)
state->dts_cleaner = CYCLIC_NONE;
state->dts_deadman = CYCLIC_NONE;
#else
callout_init(&state->dts_cleaner, CALLOUT_MPSAFE);
callout_init(&state->dts_deadman, CALLOUT_MPSAFE);
#endif
state->dts_vstate.dtvs_state = state;
for (i = 0; i < DTRACEOPT_MAX; i++)
state->dts_options[i] = DTRACEOPT_UNSET;
/*
* Set the default options.
*/
opt = state->dts_options;
opt[DTRACEOPT_BUFPOLICY] = DTRACEOPT_BUFPOLICY_SWITCH;
opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_AUTO;
opt[DTRACEOPT_NSPEC] = dtrace_nspec_default;
opt[DTRACEOPT_SPECSIZE] = dtrace_specsize_default;
opt[DTRACEOPT_CPU] = (dtrace_optval_t)DTRACE_CPUALL;
opt[DTRACEOPT_STRSIZE] = dtrace_strsize_default;
opt[DTRACEOPT_STACKFRAMES] = dtrace_stackframes_default;
opt[DTRACEOPT_USTACKFRAMES] = dtrace_ustackframes_default;
opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_default;
opt[DTRACEOPT_AGGRATE] = dtrace_aggrate_default;
opt[DTRACEOPT_SWITCHRATE] = dtrace_switchrate_default;
opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default;
opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default;
opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default;
state->dts_activity = DTRACE_ACTIVITY_INACTIVE;
/*
* Depending on the user credentials, we set flag bits which alter probe
* visibility or the amount of destructiveness allowed. In the case of
* actual anonymous tracing, or the possession of all privileges, all of
* the normal checks are bypassed.
*/
if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
state->dts_cred.dcr_action = DTRACE_CRA_ALL;
} else {
/*
* Set up the credentials for this instantiation. We take a
* hold on the credential to prevent it from disappearing on
* us; this in turn prevents the zone_t referenced by this
* credential from disappearing. This means that we can
* examine the credential and the zone from probe context.
*/
crhold(cr);
state->dts_cred.dcr_cred = cr;
/*
* CRA_PROC means "we have *some* privilege for dtrace" and
* unlocks the use of variables like pid, zonename, etc.
*/
if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE) ||
PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
state->dts_cred.dcr_action |= DTRACE_CRA_PROC;
}
/*
* dtrace_user allows use of syscall and profile providers.
* If the user also has proc_owner and/or proc_zone, we
* extend the scope to include additional visibility and
* destructive power.
*/
if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) {
if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) {
state->dts_cred.dcr_visible |=
DTRACE_CRV_ALLPROC;
state->dts_cred.dcr_action |=
DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
}
if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) {
state->dts_cred.dcr_visible |=
DTRACE_CRV_ALLZONE;
state->dts_cred.dcr_action |=
DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
}
/*
* If we have all privs in whatever zone this is,
* we can do destructive things to processes which
* have altered credentials.
*/
#if defined(sun)
if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
cr->cr_zone->zone_privset)) {
state->dts_cred.dcr_action |=
DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
}
#endif
}
/*
* Holding the dtrace_kernel privilege also implies that
* the user has the dtrace_user privilege from a visibility
* perspective. But without further privileges, some
* destructive actions are not available.
*/
if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) {
/*
* Make all probes in all zones visible. However,
* this doesn't mean that all actions become available
* to all zones.
*/
state->dts_cred.dcr_visible |= DTRACE_CRV_KERNEL |
DTRACE_CRV_ALLPROC | DTRACE_CRV_ALLZONE;
state->dts_cred.dcr_action |= DTRACE_CRA_KERNEL |
DTRACE_CRA_PROC;
/*
* Holding proc_owner means that destructive actions
* for *this* zone are allowed.
*/
if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
state->dts_cred.dcr_action |=
DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
/*
* Holding proc_zone means that destructive actions
* for this user/group ID in all zones is allowed.
*/
if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
state->dts_cred.dcr_action |=
DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
#if defined(sun)
/*
* If we have all privs in whatever zone this is,
* we can do destructive things to processes which
* have altered credentials.
*/
if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
cr->cr_zone->zone_privset)) {
state->dts_cred.dcr_action |=
DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
}
#endif
}
/*
* Holding the dtrace_proc privilege gives control over fasttrap
* and pid providers. We need to grant wider destructive
* privileges in the event that the user has proc_owner and/or
* proc_zone.
*/
if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
state->dts_cred.dcr_action |=
DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
state->dts_cred.dcr_action |=
DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
}
}
return (state);
}
static int
dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which)
{
dtrace_optval_t *opt = state->dts_options, size;
processorid_t cpu = 0;
int flags = 0, rval;
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(MUTEX_HELD(&cpu_lock));
ASSERT(which < DTRACEOPT_MAX);
ASSERT(state->dts_activity == DTRACE_ACTIVITY_INACTIVE ||
(state == dtrace_anon.dta_state &&
state->dts_activity == DTRACE_ACTIVITY_ACTIVE));
if (opt[which] == DTRACEOPT_UNSET || opt[which] == 0)
return (0);
if (opt[DTRACEOPT_CPU] != DTRACEOPT_UNSET)
cpu = opt[DTRACEOPT_CPU];
if (which == DTRACEOPT_SPECSIZE)
flags |= DTRACEBUF_NOSWITCH;
if (which == DTRACEOPT_BUFSIZE) {
if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_RING)
flags |= DTRACEBUF_RING;
if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_FILL)
flags |= DTRACEBUF_FILL;
if (state != dtrace_anon.dta_state ||
state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
flags |= DTRACEBUF_INACTIVE;
}
for (size = opt[which]; size >= sizeof (uint64_t); size >>= 1) {
/*
* The size must be 8-byte aligned. If the size is not 8-byte
* aligned, drop it down by the difference.
*/
if (size & (sizeof (uint64_t) - 1))
size -= size & (sizeof (uint64_t) - 1);
if (size < state->dts_reserve) {
/*
* Buffers must always be large enough to accommodate
* their prereserved space. We return E2BIG instead
* of ENOMEM in this case to allow for user-level
* software to differentiate the cases.
*/
return (E2BIG);
}
rval = dtrace_buffer_alloc(buf, size, flags, cpu);
if (rval != ENOMEM) {
opt[which] = size;
return (rval);
}
if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
return (rval);
}
return (ENOMEM);
}
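/*
* Allocate the principal buffer, the aggregation buffer and any
* speculation buffers according to the current option values.
*/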
static int
dtrace_state_buffers(dtrace_state_t *state)
{
dtrace_speculation_t *spec = state->dts_speculations;
int rval, i;
if ((rval = dtrace_state_buffer(state, state->dts_buffer,
DTRACEOPT_BUFSIZE)) != 0)
return (rval);
if ((rval = dtrace_state_buffer(state, state->dts_aggbuffer,
DTRACEOPT_AGGSIZE)) != 0)
return (rval);
for (i = 0; i < state->dts_nspeculations; i++) {
if ((rval = dtrace_state_buffer(state,
spec[i].dtsp_buffer, DTRACEOPT_SPECSIZE)) != 0)
return (rval);
}
return (0);
}
static void
dtrace_state_prereserve(dtrace_state_t *state)
{
dtrace_ecb_t *ecb;
dtrace_probe_t *probe;
state->dts_reserve = 0;
if (state->dts_options[DTRACEOPT_BUFPOLICY] != DTRACEOPT_BUFPOLICY_FILL)
return;
/*
* If our buffer policy is a "fill" buffer policy, we need to set the
* prereserved space to be the space required by the END probes.
*/
probe = dtrace_probes[dtrace_probeid_end - 1];
ASSERT(probe != NULL);
for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
if (ecb->dte_state != state)
continue;
state->dts_reserve += ecb->dte_needed + ecb->dte_alignment;
}
}
static int
dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
{
dtrace_optval_t *opt = state->dts_options, sz, nspec;
dtrace_speculation_t *spec;
dtrace_buffer_t *buf;
#if defined(sun)
cyc_handler_t hdlr;
cyc_time_t when;
#endif
int rval = 0, i, bufsize = NCPU * sizeof (dtrace_buffer_t);
dtrace_icookie_t cookie;
mutex_enter(&cpu_lock);
mutex_enter(&dtrace_lock);
if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
rval = EBUSY;
goto out;
}
/*
* Before we can perform any checks, we must prime all of the
* retained enablings that correspond to this state.
*/
dtrace_enabling_prime(state);
if (state->dts_destructive && !state->dts_cred.dcr_destructive) {
rval = EACCES;
goto out;
}
dtrace_state_prereserve(state);
/*
* Now we want to try to allocate our speculations.
* We do not automatically resize the number of speculations; if
* this fails, we will fail the operation.
*/
nspec = opt[DTRACEOPT_NSPEC];
ASSERT(nspec != DTRACEOPT_UNSET);
if (nspec > INT_MAX) {
rval = ENOMEM;
goto out;
}
spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t), KM_NOSLEEP);
if (spec == NULL) {
rval = ENOMEM;
goto out;
}
state->dts_speculations = spec;
state->dts_nspeculations = (int)nspec;
for (i = 0; i < nspec; i++) {
if ((buf = kmem_zalloc(bufsize, KM_NOSLEEP)) == NULL) {
rval = ENOMEM;
goto err;
}
spec[i].dtsp_buffer = buf;
}
if (opt[DTRACEOPT_GRABANON] != DTRACEOPT_UNSET) {
if (dtrace_anon.dta_state == NULL) {
rval = ENOENT;
goto out;
}
if (state->dts_necbs != 0) {
rval = EALREADY;
goto out;
}
state->dts_anon = dtrace_anon_grab();
ASSERT(state->dts_anon != NULL);
state = state->dts_anon;
/*
* We want "grabanon" to be set in the grabbed state, so we'll
* copy that option value from the grabbing state into the
* grabbed state.
*/
state->dts_options[DTRACEOPT_GRABANON] =
opt[DTRACEOPT_GRABANON];
*cpu = dtrace_anon.dta_beganon;
/*
* If the anonymous state is active (as it almost certainly
* is if the anonymous enabling ultimately matched anything),
* we don't allow any further option processing -- but we
* don't return failure.
*/
if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
goto out;
}
if (opt[DTRACEOPT_AGGSIZE] != DTRACEOPT_UNSET &&
opt[DTRACEOPT_AGGSIZE] != 0) {
if (state->dts_aggregations == NULL) {
/*
* We're not going to create an aggregation buffer
* because we don't have any ECBs that contain
* aggregations -- set this option to 0.
*/
opt[DTRACEOPT_AGGSIZE] = 0;
} else {
/*
* If we have an aggregation buffer, we must also have
* a buffer to use as scratch.
*/
if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET ||
opt[DTRACEOPT_BUFSIZE] < state->dts_needed) {
opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
}
}
}
if (opt[DTRACEOPT_SPECSIZE] != DTRACEOPT_UNSET &&
opt[DTRACEOPT_SPECSIZE] != 0) {
if (!state->dts_speculates) {
/*
* We're not going to create speculation buffers
* because we don't have any ECBs that actually
* speculate -- set the speculation size to 0.
*/
opt[DTRACEOPT_SPECSIZE] = 0;
}
}
/*
* The bare minimum size for any buffer that we're actually going to
* do anything to is sizeof (uint64_t).
*/
sz = sizeof (uint64_t);
if ((state->dts_needed != 0 && opt[DTRACEOPT_BUFSIZE] < sz) ||
(state->dts_speculates && opt[DTRACEOPT_SPECSIZE] < sz) ||
(state->dts_aggregations != NULL && opt[DTRACEOPT_AGGSIZE] < sz)) {
/*
* A buffer size has been explicitly set to 0 (or to a size
* that will be adjusted to 0) and we need the space -- we
* need to return failure. We return ENOSPC to differentiate
* it from failing to allocate a buffer due to failure to meet
* the reserve (for which we return E2BIG).
*/
rval = ENOSPC;
goto out;
}
if ((rval = dtrace_state_buffers(state)) != 0)
goto err;
if ((sz = opt[DTRACEOPT_DYNVARSIZE]) == DTRACEOPT_UNSET)
sz = dtrace_dstate_defsize;
do {
rval = dtrace_dstate_init(&state->dts_vstate.dtvs_dynvars, sz);
if (rval == 0)
break;
if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
goto err;
} while (sz >>= 1);
opt[DTRACEOPT_DYNVARSIZE] = sz;
if (rval != 0)
goto err;
if (opt[DTRACEOPT_STATUSRATE] > dtrace_statusrate_max)
opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_max;
if (opt[DTRACEOPT_CLEANRATE] == 0)
opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
if (opt[DTRACEOPT_CLEANRATE] < dtrace_cleanrate_min)
opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_min;
if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max)
opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
#if defined(sun)
hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
hdlr.cyh_arg = state;
hdlr.cyh_level = CY_LOW_LEVEL;
when.cyt_when = 0;
when.cyt_interval = opt[DTRACEOPT_CLEANRATE];
state->dts_cleaner = cyclic_add(&hdlr, &when);
hdlr.cyh_func = (cyc_func_t)dtrace_state_deadman;
hdlr.cyh_arg = state;
hdlr.cyh_level = CY_LOW_LEVEL;
when.cyt_when = 0;
when.cyt_interval = dtrace_deadman_interval;
state->dts_deadman = cyclic_add(&hdlr, &when);
#else
callout_reset(&state->dts_cleaner, hz * opt[DTRACEOPT_CLEANRATE] / NANOSEC,
dtrace_state_clean, state);
callout_reset(&state->dts_deadman, hz * dtrace_deadman_interval / NANOSEC,
dtrace_state_deadman, state);
#endif
state->dts_activity = DTRACE_ACTIVITY_WARMUP;
/*
* Now it's time to actually fire the BEGIN probe. We need to disable
* interrupts here both to record the CPU on which we fired the BEGIN
* probe (the data from this CPU will be processed first at user
* level) and to manually activate the buffer for this CPU.
*/
cookie = dtrace_interrupt_disable();
*cpu = curcpu;
ASSERT(state->dts_buffer[*cpu].dtb_flags & DTRACEBUF_INACTIVE);
state->dts_buffer[*cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;
dtrace_probe(dtrace_probeid_begin,
(uint64_t)(uintptr_t)state, 0, 0, 0, 0);
dtrace_interrupt_enable(cookie);
/*
* We may have had an exit action from a BEGIN probe; only change our
* state to ACTIVE if we're still in WARMUP.
*/
ASSERT(state->dts_activity == DTRACE_ACTIVITY_WARMUP ||
state->dts_activity == DTRACE_ACTIVITY_DRAINING);
if (state->dts_activity == DTRACE_ACTIVITY_WARMUP)
state->dts_activity = DTRACE_ACTIVITY_ACTIVE;
/*
* Regardless of whether we're now in ACTIVE or DRAINING, we
* want each CPU to transition its principal buffer out of the
* INACTIVE state. Doing this assures that no CPU will suddenly begin
* processing an ECB halfway down a probe's ECB chain; all CPUs will
* atomically transition from processing none of a state's ECBs to
* processing all of them.
*/
dtrace_xcall(DTRACE_CPUALL,
(dtrace_xcall_t)dtrace_buffer_activate, state);
goto out;
err:
dtrace_buffer_free(state->dts_buffer);
dtrace_buffer_free(state->dts_aggbuffer);
if ((nspec = state->dts_nspeculations) == 0) {
ASSERT(state->dts_speculations == NULL);
goto out;
}
spec = state->dts_speculations;
ASSERT(spec != NULL);
for (i = 0; i < state->dts_nspeculations; i++) {
if ((buf = spec[i].dtsp_buffer) == NULL)
break;
dtrace_buffer_free(buf);
kmem_free(buf, bufsize);
}
kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
state->dts_nspeculations = 0;
state->dts_speculations = NULL;
out:
mutex_exit(&dtrace_lock);
mutex_exit(&cpu_lock);
return (rval);
}
static int
dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu)
{
dtrace_icookie_t cookie;
ASSERT(MUTEX_HELD(&dtrace_lock));
if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE &&
state->dts_activity != DTRACE_ACTIVITY_DRAINING)
return (EINVAL);
/*
* We'll set the activity to DTRACE_ACTIVITY_DRAINING, and issue a sync
* to be sure that every CPU has seen it. See below for the details
* on why this is done.
*/
state->dts_activity = DTRACE_ACTIVITY_DRAINING;
dtrace_sync();
/*
* By this point, it is impossible for any CPU to be still processing
* with DTRACE_ACTIVITY_ACTIVE. We can thus set our activity to
* DTRACE_ACTIVITY_COOLDOWN and know that we're not racing with any
* other CPU in dtrace_buffer_reserve(). This allows dtrace_probe()
* and callees to know that the activity is DTRACE_ACTIVITY_COOLDOWN
* iff we're in the END probe.
*/
state->dts_activity = DTRACE_ACTIVITY_COOLDOWN;
dtrace_sync();
ASSERT(state->dts_activity == DTRACE_ACTIVITY_COOLDOWN);
/*
* Finally, we can release the reserve and call the END probe. We
* disable interrupts across calling the END probe to allow us to
* return the CPU on which we actually called the END probe. This
* allows user-land to be sure that this CPU's principal buffer is
* processed last.
*/
state->dts_reserve = 0;
cookie = dtrace_interrupt_disable();
*cpu = curcpu;
dtrace_probe(dtrace_probeid_end,
(uint64_t)(uintptr_t)state, 0, 0, 0, 0);
dtrace_interrupt_enable(cookie);
state->dts_activity = DTRACE_ACTIVITY_STOPPED;
dtrace_sync();
return (0);
}
static int
dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
dtrace_optval_t val)
{
ASSERT(MUTEX_HELD(&dtrace_lock));
if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
return (EBUSY);
if (option >= DTRACEOPT_MAX)
return (EINVAL);
if (option != DTRACEOPT_CPU && val < 0)
return (EINVAL);
switch (option) {
case DTRACEOPT_DESTRUCTIVE:
if (dtrace_destructive_disallow)
return (EACCES);
state->dts_cred.dcr_destructive = 1;
break;
case DTRACEOPT_BUFSIZE:
case DTRACEOPT_DYNVARSIZE:
case DTRACEOPT_AGGSIZE:
case DTRACEOPT_SPECSIZE:
case DTRACEOPT_STRSIZE:
if (val < 0)
return (EINVAL);
if (val >= LONG_MAX) {
/*
* If this is an otherwise negative value, set it to
* the highest multiple of 128m less than LONG_MAX.
* Technically, we're adjusting the size without
* regard to the buffer resizing policy, but in fact,
* this has no effect -- if we set the buffer size to
* ~LONG_MAX and the buffer policy is ultimately set to
* be "manual", the buffer allocation is guaranteed to
* fail, if only because the allocation requires two
* buffers. (We set the size to the highest
* multiple of 128m because it ensures that the size
* will remain a multiple of a megabyte when
* repeatedly halved -- all the way down to 15m.)
*/
val = LONG_MAX - (1 << 27) + 1;
}
}
state->dts_options[option] = val;
return (0);
}
static void
dtrace_state_destroy(dtrace_state_t *state)
{
dtrace_ecb_t *ecb;
dtrace_vstate_t *vstate = &state->dts_vstate;
#if defined(sun)
minor_t minor = getminor(state->dts_dev);
#endif
int i, bufsize = NCPU * sizeof (dtrace_buffer_t);
dtrace_speculation_t *spec = state->dts_speculations;
int nspec = state->dts_nspeculations;
uint32_t match;
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(MUTEX_HELD(&cpu_lock));
/*
* First, retract any retained enablings for this state.
*/
dtrace_enabling_retract(state);
ASSERT(state->dts_nretained == 0);
if (state->dts_activity == DTRACE_ACTIVITY_ACTIVE ||
state->dts_activity == DTRACE_ACTIVITY_DRAINING) {
/*
* We have managed to come into dtrace_state_destroy() on a
* hot enabling -- almost certainly because of a disorderly
* shutdown of a consumer. (That is, a consumer that is
* exiting without having called dtrace_stop().) In this case,
* we're going to set our activity to be KILLED, and then
* issue a sync to be sure that everyone is out of probe
* context before we start blowing away ECBs.
*/
state->dts_activity = DTRACE_ACTIVITY_KILLED;
dtrace_sync();
}
/*
* Release the credential hold we took in dtrace_state_create().
*/
if (state->dts_cred.dcr_cred != NULL)
crfree(state->dts_cred.dcr_cred);
/*
* Now we can safely disable and destroy any enabled probes. Because
* any DTRACE_PRIV_KERNEL probes may actually be slowing our progress
* (especially if they're all enabled), we take two passes through the
* ECBs: in the first, we disable just DTRACE_PRIV_KERNEL probes, and
* in the second we disable whatever is left over.
*/
for (match = DTRACE_PRIV_KERNEL; ; match = 0) {
for (i = 0; i < state->dts_necbs; i++) {
if ((ecb = state->dts_ecbs[i]) == NULL)
continue;
if (match && ecb->dte_probe != NULL) {
dtrace_probe_t *probe = ecb->dte_probe;
dtrace_provider_t *prov = probe->dtpr_provider;
if (!(prov->dtpv_priv.dtpp_flags & match))
continue;
}
dtrace_ecb_disable(ecb);
dtrace_ecb_destroy(ecb);
}
if (!match)
break;
}
/*
* Before we free the buffers, perform one more sync to assure that
* every CPU is out of probe context.
*/
dtrace_sync();
dtrace_buffer_free(state->dts_buffer);
dtrace_buffer_free(state->dts_aggbuffer);
for (i = 0; i < nspec; i++)
dtrace_buffer_free(spec[i].dtsp_buffer);
#if defined(sun)
if (state->dts_cleaner != CYCLIC_NONE)
cyclic_remove(state->dts_cleaner);
if (state->dts_deadman != CYCLIC_NONE)
cyclic_remove(state->dts_deadman);
#else
callout_stop(&state->dts_cleaner);
callout_drain(&state->dts_cleaner);
callout_stop(&state->dts_deadman);
callout_drain(&state->dts_deadman);
#endif
dtrace_dstate_fini(&vstate->dtvs_dynvars);
dtrace_vstate_fini(vstate);
if (state->dts_ecbs != NULL)
kmem_free(state->dts_ecbs, state->dts_necbs * sizeof (dtrace_ecb_t *));
if (state->dts_aggregations != NULL) {
#ifdef DEBUG
for (i = 0; i < state->dts_naggregations; i++)
ASSERT(state->dts_aggregations[i] == NULL);
#endif
ASSERT(state->dts_naggregations > 0);
kmem_free(state->dts_aggregations,
state->dts_naggregations * sizeof (dtrace_aggregation_t *));
}
kmem_free(state->dts_buffer, bufsize);
kmem_free(state->dts_aggbuffer, bufsize);
for (i = 0; i < nspec; i++)
kmem_free(spec[i].dtsp_buffer, bufsize);
if (spec != NULL)
kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
dtrace_format_destroy(state);
if (state->dts_aggid_arena != NULL) {
#if defined(sun)
vmem_destroy(state->dts_aggid_arena);
#else
delete_unrhdr(state->dts_aggid_arena);
#endif
state->dts_aggid_arena = NULL;
}
#if defined(sun)
ddi_soft_state_free(dtrace_softstate, minor);
vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
#endif
}
/*
* DTrace Anonymous Enabling Functions
*/
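/*
* Detach the anonymous state from dtrace_anon, destroying the anonymous
* enabling and returning the state (or NULL if there is none) to the
* caller, which assumes ownership of it.
*/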
static dtrace_state_t *
dtrace_anon_grab(void)
{
dtrace_state_t *state;
ASSERT(MUTEX_HELD(&dtrace_lock));
if ((state = dtrace_anon.dta_state) == NULL) {
ASSERT(dtrace_anon.dta_enabling == NULL);
return (NULL);
}
ASSERT(dtrace_anon.dta_enabling != NULL);
ASSERT(dtrace_retained != NULL);
dtrace_enabling_destroy(dtrace_anon.dta_enabling);
dtrace_anon.dta_enabling = NULL;
dtrace_anon.dta_state = NULL;
return (state);
}
static void
dtrace_anon_property(void)
{
int i, rv;
dtrace_state_t *state;
dof_hdr_t *dof;
char c[32]; /* enough for "dof-data-" + digits */
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(MUTEX_HELD(&cpu_lock));
for (i = 0; ; i++) {
(void) snprintf(c, sizeof (c), "dof-data-%d", i);
dtrace_err_verbose = 1;
if ((dof = dtrace_dof_property(c)) == NULL) {
dtrace_err_verbose = 0;
break;
}
#if defined(sun)
/*
* We want to create anonymous state, so we need to transition
* the kernel debugger to indicate that DTrace is active. If
* this fails (e.g. because the debugger has modified text in
* some way), we won't continue with the processing.
*/
if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
cmn_err(CE_NOTE, "kernel debugger active; anonymous "
"enabling ignored.");
dtrace_dof_destroy(dof);
break;
}
#endif
/*
* If we haven't allocated an anonymous state, we'll do so now.
*/
if ((state = dtrace_anon.dta_state) == NULL) {
#if defined(sun)
state = dtrace_state_create(NULL, NULL);
#else
state = dtrace_state_create(NULL);
#endif
dtrace_anon.dta_state = state;
if (state == NULL) {
/*
* This basically shouldn't happen: the only
* failure mode from dtrace_state_create() is a
* failure of ddi_soft_state_zalloc() that
* itself should never happen. Still, the
* interface allows for a failure mode, and
* we want to fail as gracefully as possible:
* we'll emit an error message and cease
* processing anonymous state in this case.
*/
cmn_err(CE_WARN, "failed to create "
"anonymous state");
dtrace_dof_destroy(dof);
break;
}
}
rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
&dtrace_anon.dta_enabling, 0, B_TRUE);
if (rv == 0)
rv = dtrace_dof_options(dof, state);
dtrace_err_verbose = 0;
dtrace_dof_destroy(dof);
if (rv != 0) {
/*
* This is malformed DOF; chuck any anonymous state
* that we created.
*/
ASSERT(dtrace_anon.dta_enabling == NULL);
dtrace_state_destroy(state);
dtrace_anon.dta_state = NULL;
break;
}
ASSERT(dtrace_anon.dta_enabling != NULL);
}
if (dtrace_anon.dta_enabling != NULL) {
int rval;
/*
* dtrace_enabling_retain() can only fail because we are
* trying to retain more enablings than are allowed -- but
* we only have one anonymous enabling, and we are guaranteed
* to be allowed at least one retained enabling; we assert
* that dtrace_enabling_retain() returns success.
*/
rval = dtrace_enabling_retain(dtrace_anon.dta_enabling);
ASSERT(rval == 0);
dtrace_enabling_dump(dtrace_anon.dta_enabling);
}
}
/*
* DTrace Helper Functions
*/
static void
dtrace_helper_trace(dtrace_helper_action_t *helper,
dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
{
uint32_t size, next, nnext, i;
dtrace_helptrace_t *ent;
uint16_t flags = cpu_core[curcpu].cpuc_dtrace_flags;
if (!dtrace_helptrace_enabled)
return;
ASSERT(vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);
/*
* What would a tracing framework be without its own tracing
* framework? (Well, a hell of a lot simpler, for starters...)
*/
size = sizeof (dtrace_helptrace_t) + dtrace_helptrace_nlocals *
sizeof (uint64_t) - sizeof (uint64_t);
/*
* Iterate until we can allocate a slot in the trace buffer.
*/
do {
next = dtrace_helptrace_next;
if (next + size < dtrace_helptrace_bufsize) {
nnext = next + size;
} else {
nnext = size;
}
} while (dtrace_cas32(&dtrace_helptrace_next, next, nnext) != next);
/*
* We have our slot; fill it in.
*/
if (nnext == size)
next = 0;
ent = (dtrace_helptrace_t *)&dtrace_helptrace_buffer[next];
ent->dtht_helper = helper;
ent->dtht_where = where;
ent->dtht_nlocals = vstate->dtvs_nlocals;
ent->dtht_fltoffs = (mstate->dtms_present & DTRACE_MSTATE_FLTOFFS) ?
mstate->dtms_fltoffs : -1;
ent->dtht_fault = DTRACE_FLAGS2FLT(flags);
ent->dtht_illval = cpu_core[curcpu].cpuc_dtrace_illval;
for (i = 0; i < vstate->dtvs_nlocals; i++) {
dtrace_statvar_t *svar;
if ((svar = vstate->dtvs_locals[i]) == NULL)
continue;
ASSERT(svar->dtsv_size >= NCPU * sizeof (uint64_t));
ent->dtht_locals[i] =
((uint64_t *)(uintptr_t)svar->dtsv_data)[curcpu];
}
}
static uint64_t
dtrace_helper(int which, dtrace_mstate_t *mstate,
dtrace_state_t *state, uint64_t arg0, uint64_t arg1)
{
uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
uint64_t sarg0 = mstate->dtms_arg[0];
uint64_t sarg1 = mstate->dtms_arg[1];
uint64_t rval = 0;
dtrace_helpers_t *helpers = curproc->p_dtrace_helpers;
dtrace_helper_action_t *helper;
dtrace_vstate_t *vstate;
dtrace_difo_t *pred;
int i, trace = dtrace_helptrace_enabled;
ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);
if (helpers == NULL)
return (0);
if ((helper = helpers->dthps_actions[which]) == NULL)
return (0);
vstate = &helpers->dthps_vstate;
mstate->dtms_arg[0] = arg0;
mstate->dtms_arg[1] = arg1;
/*
* Now iterate over each helper. If its predicate evaluates to 'true',
* we'll call the corresponding actions. Note that the below calls
* to dtrace_dif_emulate() may set faults in machine state. This is
* okay: our caller (the outer dtrace_dif_emulate()) will simply plow
* the stored DIF offset with its own (which is the desired behavior).
* Also, note the calls to dtrace_dif_emulate() may allocate scratch
* from machine state; this is okay, too.
*/
for (; helper != NULL; helper = helper->dtha_next) {
if ((pred = helper->dtha_predicate) != NULL) {
if (trace)
dtrace_helper_trace(helper, mstate, vstate, 0);
if (!dtrace_dif_emulate(pred, mstate, vstate, state))
goto next;
if (*flags & CPU_DTRACE_FAULT)
goto err;
}
for (i = 0; i < helper->dtha_nactions; i++) {
if (trace)
dtrace_helper_trace(helper,
mstate, vstate, i + 1);
rval = dtrace_dif_emulate(helper->dtha_actions[i],
mstate, vstate, state);
if (*flags & CPU_DTRACE_FAULT)
goto err;
}
next:
if (trace)
dtrace_helper_trace(helper, mstate, vstate,
DTRACE_HELPTRACE_NEXT);
}
if (trace)
dtrace_helper_trace(helper, mstate, vstate,
DTRACE_HELPTRACE_DONE);
/*
* Restore the arg0 that we saved upon entry.
*/
mstate->dtms_arg[0] = sarg0;
mstate->dtms_arg[1] = sarg1;
return (rval);
err:
if (trace)
dtrace_helper_trace(helper, mstate, vstate,
DTRACE_HELPTRACE_ERR);
/*
* Restore the arg0 that we saved upon entry.
*/
mstate->dtms_arg[0] = sarg0;
mstate->dtms_arg[1] = sarg1;
return (0);
}
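/*
* Release the predicate and action DIFOs held by a helper action and
* free the helper action itself.
*/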
static void
dtrace_helper_action_destroy(dtrace_helper_action_t *helper,
dtrace_vstate_t *vstate)
{
int i;
if (helper->dtha_predicate != NULL)
dtrace_difo_release(helper->dtha_predicate, vstate);
for (i = 0; i < helper->dtha_nactions; i++) {
ASSERT(helper->dtha_actions[i] != NULL);
dtrace_difo_release(helper->dtha_actions[i], vstate);
}
kmem_free(helper->dtha_actions,
helper->dtha_nactions * sizeof (dtrace_difo_t *));
kmem_free(helper, sizeof (dtrace_helper_action_t));
}
static int
dtrace_helper_destroygen(int gen)
{
proc_t *p = curproc;
dtrace_helpers_t *help = p->p_dtrace_helpers;
dtrace_vstate_t *vstate;
int i;
ASSERT(MUTEX_HELD(&dtrace_lock));
if (help == NULL || gen > help->dthps_generation)
return (EINVAL);
vstate = &help->dthps_vstate;
for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
dtrace_helper_action_t *last = NULL, *h, *next;
for (h = help->dthps_actions[i]; h != NULL; h = next) {
next = h->dtha_next;
if (h->dtha_generation == gen) {
if (last != NULL) {
last->dtha_next = next;
} else {
help->dthps_actions[i] = next;
}
dtrace_helper_action_destroy(h, vstate);
} else {
last = h;
}
}
}
/*
* Iterate until we've cleared out all helper providers with the
* given generation number.
*/
for (;;) {
dtrace_helper_provider_t *prov;
/*
* Look for a helper provider with the right generation. We
* have to start back at the beginning of the list each time
* because we drop dtrace_lock. It's unlikely that we'll make
* more than two passes.
*/
for (i = 0; i < help->dthps_nprovs; i++) {
prov = help->dthps_provs[i];
if (prov->dthp_generation == gen)
break;
}
/*
* If there were no matches, we're done.
*/
if (i == help->dthps_nprovs)
break;
/*
* Move the last helper provider into this slot.
*/
help->dthps_nprovs--;
help->dthps_provs[i] = help->dthps_provs[help->dthps_nprovs];
help->dthps_provs[help->dthps_nprovs] = NULL;
mutex_exit(&dtrace_lock);
/*
* If we have a meta provider, remove this helper provider.
*/
mutex_enter(&dtrace_meta_lock);
if (dtrace_meta_pid != NULL) {
ASSERT(dtrace_deferred_pid == NULL);
dtrace_helper_provider_remove(&prov->dthp_prov,
p->p_pid);
}
mutex_exit(&dtrace_meta_lock);
dtrace_helper_provider_destroy(prov);
mutex_enter(&dtrace_lock);
}
return (0);
}
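/*
* Validate that a helper's predicate and all of its actions are legal in
* helper context; returns non-zero iff the helper validates successfully.
*/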
static int
dtrace_helper_validate(dtrace_helper_action_t *helper)
{
int err = 0, i;
dtrace_difo_t *dp;
if ((dp = helper->dtha_predicate) != NULL)
err += dtrace_difo_validate_helper(dp);
for (i = 0; i < helper->dtha_nactions; i++)
err += dtrace_difo_validate_helper(helper->dtha_actions[i]);
return (err == 0);
}
static int
dtrace_helper_action_add(int which, dtrace_ecbdesc_t *ep)
{
dtrace_helpers_t *help;
dtrace_helper_action_t *helper, *last;
dtrace_actdesc_t *act;
dtrace_vstate_t *vstate;
dtrace_predicate_t *pred;
int count = 0, nactions = 0, i;
if (which < 0 || which >= DTRACE_NHELPER_ACTIONS)
return (EINVAL);
help = curproc->p_dtrace_helpers;
last = help->dthps_actions[which];
vstate = &help->dthps_vstate;
for (count = 0; last != NULL; last = last->dtha_next) {
count++;
if (last->dtha_next == NULL)
break;
}
/*
* If we already have dtrace_helper_actions_max helper actions for this
* helper action type, we'll refuse to add a new one.
*/
if (count >= dtrace_helper_actions_max)
return (ENOSPC);
helper = kmem_zalloc(sizeof (dtrace_helper_action_t), KM_SLEEP);
helper->dtha_generation = help->dthps_generation;
if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) {
ASSERT(pred->dtp_difo != NULL);
dtrace_difo_hold(pred->dtp_difo);
helper->dtha_predicate = pred->dtp_difo;
}
for (act = ep->dted_action; act != NULL; act = act->dtad_next) {
if (act->dtad_kind != DTRACEACT_DIFEXPR)
goto err;
if (act->dtad_difo == NULL)
goto err;
nactions++;
}
helper->dtha_actions = kmem_zalloc(sizeof (dtrace_difo_t *) *
(helper->dtha_nactions = nactions), KM_SLEEP);
for (act = ep->dted_action, i = 0; act != NULL; act = act->dtad_next) {
dtrace_difo_hold(act->dtad_difo);
helper->dtha_actions[i++] = act->dtad_difo;
}
if (!dtrace_helper_validate(helper))
goto err;
if (last == NULL) {
help->dthps_actions[which] = helper;
} else {
last->dtha_next = helper;
}
if (vstate->dtvs_nlocals > dtrace_helptrace_nlocals) {
dtrace_helptrace_nlocals = vstate->dtvs_nlocals;
dtrace_helptrace_next = 0;
}
return (0);
err:
dtrace_helper_action_destroy(helper, vstate);
return (EINVAL);
}
static void
dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,
dof_helper_t *dofhp)
{
ASSERT(MUTEX_NOT_HELD(&dtrace_lock));
mutex_enter(&dtrace_meta_lock);
mutex_enter(&dtrace_lock);
if (!dtrace_attached() || dtrace_meta_pid == NULL) {
/*
* If the dtrace module is loaded but not attached, or if
* there isn't a meta provider registered to deal with
* these provider descriptions, we need to postpone creating
* the actual providers until later.
*/
if (help->dthps_next == NULL && help->dthps_prev == NULL &&
dtrace_deferred_pid != help) {
help->dthps_deferred = 1;
help->dthps_pid = p->p_pid;
help->dthps_next = dtrace_deferred_pid;
help->dthps_prev = NULL;
if (dtrace_deferred_pid != NULL)
dtrace_deferred_pid->dthps_prev = help;
dtrace_deferred_pid = help;
}
mutex_exit(&dtrace_lock);
} else if (dofhp != NULL) {
/*
* If the dtrace module is loaded and we have a particular
* helper provider description, pass that off to the
* meta provider.
*/
mutex_exit(&dtrace_lock);
dtrace_helper_provide(dofhp, p->p_pid);
} else {
/*
* Otherwise, just pass all the helper provider descriptions
* off to the meta provider.
*/
int i;
mutex_exit(&dtrace_lock);
for (i = 0; i < help->dthps_nprovs; i++) {
dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
p->p_pid);
}
}
mutex_exit(&dtrace_meta_lock);
}
static int
dtrace_helper_provider_add(dof_helper_t *dofhp, int gen)
{
dtrace_helpers_t *help;
dtrace_helper_provider_t *hprov, **tmp_provs;
uint_t tmp_maxprovs, i;
ASSERT(MUTEX_HELD(&dtrace_lock));
help = curproc->p_dtrace_helpers;
ASSERT(help != NULL);
/*
* If we already have dtrace_helper_providers_max helper providers,
* we refuse to add a new one.
*/
if (help->dthps_nprovs >= dtrace_helper_providers_max)
return (ENOSPC);
/*
* Check to make sure this isn't a duplicate.
*/
for (i = 0; i < help->dthps_nprovs; i++) {
if (dofhp->dofhp_addr ==
help->dthps_provs[i]->dthp_prov.dofhp_addr)
return (EALREADY);
}
hprov = kmem_zalloc(sizeof (dtrace_helper_provider_t), KM_SLEEP);
hprov->dthp_prov = *dofhp;
hprov->dthp_ref = 1;
hprov->dthp_generation = gen;
/*
* Allocate a bigger table for helper providers if it's already full.
*/
if (help->dthps_maxprovs == help->dthps_nprovs) {
tmp_maxprovs = help->dthps_maxprovs;
tmp_provs = help->dthps_provs;
if (help->dthps_maxprovs == 0)
help->dthps_maxprovs = 2;
else
help->dthps_maxprovs *= 2;
if (help->dthps_maxprovs > dtrace_helper_providers_max)
help->dthps_maxprovs = dtrace_helper_providers_max;
ASSERT(tmp_maxprovs < help->dthps_maxprovs);
help->dthps_provs = kmem_zalloc(help->dthps_maxprovs *
sizeof (dtrace_helper_provider_t *), KM_SLEEP);
if (tmp_provs != NULL) {
bcopy(tmp_provs, help->dthps_provs, tmp_maxprovs *
sizeof (dtrace_helper_provider_t *));
kmem_free(tmp_provs, tmp_maxprovs *
sizeof (dtrace_helper_provider_t *));
}
}
help->dthps_provs[help->dthps_nprovs] = hprov;
help->dthps_nprovs++;
return (0);
}
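/*
* Drop a reference on a helper provider; when the last reference is
* released, destroy the provider's DOF and free the provider itself.
*/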
static void
dtrace_helper_provider_destroy(dtrace_helper_provider_t *hprov)
{
mutex_enter(&dtrace_lock);
if (--hprov->dthp_ref == 0) {
dof_hdr_t *dof;
mutex_exit(&dtrace_lock);
dof = (dof_hdr_t *)(uintptr_t)hprov->dthp_prov.dofhp_dof;
dtrace_dof_destroy(dof);
kmem_free(hprov, sizeof (dtrace_helper_provider_t));
} else {
mutex_exit(&dtrace_lock);
}
}
static int
dtrace_helper_provider_validate(dof_hdr_t *dof, dof_sec_t *sec)
{
uintptr_t daddr = (uintptr_t)dof;
dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
dof_provider_t *provider;
dof_probe_t *probe;
uint8_t *arg;
char *strtab, *typestr;
dof_stridx_t typeidx;
size_t typesz;
uint_t nprobes, j, k;
ASSERT(sec->dofs_type == DOF_SECT_PROVIDER);
if (sec->dofs_offset & (sizeof (uint_t) - 1)) {
dtrace_dof_error(dof, "misaligned section offset");
return (-1);
}
/*
* The section needs to be large enough to contain the DOF provider
* structure appropriate for the given version.
*/
if (sec->dofs_size <
((dof->dofh_ident[DOF_ID_VERSION] == DOF_VERSION_1) ?
offsetof(dof_provider_t, dofpv_prenoffs) :
sizeof (dof_provider_t))) {
dtrace_dof_error(dof, "provider section too small");
return (-1);
}
provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
str_sec = dtrace_dof_sect(dof, DOF_SECT_STRTAB, provider->dofpv_strtab);
prb_sec = dtrace_dof_sect(dof, DOF_SECT_PROBES, provider->dofpv_probes);
arg_sec = dtrace_dof_sect(dof, DOF_SECT_PRARGS, provider->dofpv_prargs);
off_sec = dtrace_dof_sect(dof, DOF_SECT_PROFFS, provider->dofpv_proffs);
if (str_sec == NULL || prb_sec == NULL ||
arg_sec == NULL || off_sec == NULL)
return (-1);
enoff_sec = NULL;
if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
provider->dofpv_prenoffs != DOF_SECT_NONE &&
(enoff_sec = dtrace_dof_sect(dof, DOF_SECT_PRENOFFS,
provider->dofpv_prenoffs)) == NULL)
return (-1);
strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
if (provider->dofpv_name >= str_sec->dofs_size ||
strlen(strtab + provider->dofpv_name) >= DTRACE_PROVNAMELEN) {
dtrace_dof_error(dof, "invalid provider name");
return (-1);
}
if (prb_sec->dofs_entsize == 0 ||
prb_sec->dofs_entsize > prb_sec->dofs_size) {
dtrace_dof_error(dof, "invalid entry size");
return (-1);
}
if (prb_sec->dofs_entsize & (sizeof (uintptr_t) - 1)) {
dtrace_dof_error(dof, "misaligned entry size");
return (-1);
}
if (off_sec->dofs_entsize != sizeof (uint32_t)) {
dtrace_dof_error(dof, "invalid entry size");
return (-1);
}
if (off_sec->dofs_offset & (sizeof (uint32_t) - 1)) {
dtrace_dof_error(dof, "misaligned section offset");
return (-1);
}
if (arg_sec->dofs_entsize != sizeof (uint8_t)) {
dtrace_dof_error(dof, "invalid entry size");
return (-1);
}
arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
/*
* Take a pass through the probes to check for errors.
*/
for (j = 0; j < nprobes; j++) {
probe = (dof_probe_t *)(uintptr_t)(daddr +
prb_sec->dofs_offset + j * prb_sec->dofs_entsize);
if (probe->dofpr_func >= str_sec->dofs_size) {
dtrace_dof_error(dof, "invalid function name");
return (-1);
}
if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
dtrace_dof_error(dof, "function name too long");
return (-1);
}
if (probe->dofpr_name >= str_sec->dofs_size ||
strlen(strtab + probe->dofpr_name) >= DTRACE_NAMELEN) {
dtrace_dof_error(dof, "invalid probe name");
return (-1);
}
/*
* The offset count must not wrap the index, and the offsets
* must also not overflow the section's data.
*/
if (probe->dofpr_offidx + probe->dofpr_noffs <
probe->dofpr_offidx ||
(probe->dofpr_offidx + probe->dofpr_noffs) *
off_sec->dofs_entsize > off_sec->dofs_size) {
dtrace_dof_error(dof, "invalid probe offset");
return (-1);
}
if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1) {
/*
* If there's no is-enabled offset section, make sure
* there aren't any is-enabled offsets. Otherwise
* perform the same checks as for probe offsets
* (immediately above).
*/
if (enoff_sec == NULL) {
if (probe->dofpr_enoffidx != 0 ||
probe->dofpr_nenoffs != 0) {
dtrace_dof_error(dof, "is-enabled "
"offsets with null section");
return (-1);
}
} else if (probe->dofpr_enoffidx +
probe->dofpr_nenoffs < probe->dofpr_enoffidx ||
(probe->dofpr_enoffidx + probe->dofpr_nenoffs) *
enoff_sec->dofs_entsize > enoff_sec->dofs_size) {
dtrace_dof_error(dof, "invalid is-enabled "
"offset");
return (-1);
}
if (probe->dofpr_noffs + probe->dofpr_nenoffs == 0) {
dtrace_dof_error(dof, "zero probe and "
"is-enabled offsets");
return (-1);
}
} else if (probe->dofpr_noffs == 0) {
dtrace_dof_error(dof, "zero probe offsets");
return (-1);
}
if (probe->dofpr_argidx + probe->dofpr_xargc <
probe->dofpr_argidx ||
(probe->dofpr_argidx + probe->dofpr_xargc) *
arg_sec->dofs_entsize > arg_sec->dofs_size) {
dtrace_dof_error(dof, "invalid args");
return (-1);
}
typeidx = probe->dofpr_nargv;
typestr = strtab + probe->dofpr_nargv;
for (k = 0; k < probe->dofpr_nargc; k++) {
if (typeidx >= str_sec->dofs_size) {
dtrace_dof_error(dof, "bad "
"native argument type");
return (-1);
}
typesz = strlen(typestr) + 1;
if (typesz > DTRACE_ARGTYPELEN) {
dtrace_dof_error(dof, "native "
"argument type too long");
return (-1);
}
typeidx += typesz;
typestr += typesz;
}
typeidx = probe->dofpr_xargv;
typestr = strtab + probe->dofpr_xargv;
for (k = 0; k < probe->dofpr_xargc; k++) {
if (arg[probe->dofpr_argidx + k] > probe->dofpr_nargc) {
dtrace_dof_error(dof, "bad "
"native argument index");
return (-1);
}
if (typeidx >= str_sec->dofs_size) {
dtrace_dof_error(dof, "bad "
"translated argument type");
return (-1);
}
typesz = strlen(typestr) + 1;
if (typesz > DTRACE_ARGTYPELEN) {
dtrace_dof_error(dof, "translated argument "
"type too long");
return (-1);
}
typeidx += typesz;
typestr += typesz;
}
}
return (0);
}
static int
dtrace_helper_slurp(dof_hdr_t *dof, dof_helper_t *dhp)
{
dtrace_helpers_t *help;
dtrace_vstate_t *vstate;
dtrace_enabling_t *enab = NULL;
int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1;
uintptr_t daddr = (uintptr_t)dof;
ASSERT(MUTEX_HELD(&dtrace_lock));
if ((help = curproc->p_dtrace_helpers) == NULL)
help = dtrace_helpers_create(curproc);
vstate = &help->dthps_vstate;
if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab,
dhp != NULL ? dhp->dofhp_addr : 0, B_FALSE)) != 0) {
dtrace_dof_destroy(dof);
return (rv);
}
/*
* Look for helper providers and validate their descriptions.
*/
if (dhp != NULL) {
for (i = 0; i < dof->dofh_secnum; i++) {
dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
dof->dofh_secoff + i * dof->dofh_secsize);
if (sec->dofs_type != DOF_SECT_PROVIDER)
continue;
if (dtrace_helper_provider_validate(dof, sec) != 0) {
dtrace_enabling_destroy(enab);
dtrace_dof_destroy(dof);
return (-1);
}
nprovs++;
}
}
/*
* Now we need to walk through the ECB descriptions in the enabling.
*/
for (i = 0; i < enab->dten_ndesc; i++) {
dtrace_ecbdesc_t *ep = enab->dten_desc[i];
dtrace_probedesc_t *desc = &ep->dted_probe;
if (strcmp(desc->dtpd_provider, "dtrace") != 0)
continue;
if (strcmp(desc->dtpd_mod, "helper") != 0)
continue;
if (strcmp(desc->dtpd_func, "ustack") != 0)
continue;
if ((rv = dtrace_helper_action_add(DTRACE_HELPER_ACTION_USTACK,
ep)) != 0) {
/*
* Adding this helper action failed -- we are now going
* to rip out the entire generation and return failure.
*/
(void) dtrace_helper_destroygen(help->dthps_generation);
dtrace_enabling_destroy(enab);
dtrace_dof_destroy(dof);
return (-1);
}
nhelpers++;
}
if (nhelpers < enab->dten_ndesc)
dtrace_dof_error(dof, "unmatched helpers");
gen = help->dthps_generation++;
dtrace_enabling_destroy(enab);
if (dhp != NULL && nprovs > 0) {
dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;
if (dtrace_helper_provider_add(dhp, gen) == 0) {
mutex_exit(&dtrace_lock);
dtrace_helper_provider_register(curproc, help, dhp);
mutex_enter(&dtrace_lock);
destroy = 0;
}
}
if (destroy)
dtrace_dof_destroy(dof);
return (gen);
}
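/*
* Allocate a helpers structure for the specified process and attach it to
* the process's p_dtrace_helpers.
*/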
static dtrace_helpers_t *
dtrace_helpers_create(proc_t *p)
{
dtrace_helpers_t *help;
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(p->p_dtrace_helpers == NULL);
help = kmem_zalloc(sizeof (dtrace_helpers_t), KM_SLEEP);
help->dthps_actions = kmem_zalloc(sizeof (dtrace_helper_action_t *) *
DTRACE_NHELPER_ACTIONS, KM_SLEEP);
p->p_dtrace_helpers = help;
dtrace_helpers++;
return (help);
}
#if defined(sun)
static
#endif
void
dtrace_helpers_destroy(proc_t *p)
{
dtrace_helpers_t *help;
dtrace_vstate_t *vstate;
#if defined(sun)
proc_t *p = curproc;
#endif
int i;
mutex_enter(&dtrace_lock);
ASSERT(p->p_dtrace_helpers != NULL);
ASSERT(dtrace_helpers > 0);
help = p->p_dtrace_helpers;
vstate = &help->dthps_vstate;
/*
* We're now going to lose the help from this process.
*/
p->p_dtrace_helpers = NULL;
dtrace_sync();
/*
* Destroy the helper actions.
*/
for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
dtrace_helper_action_t *h, *next;
for (h = help->dthps_actions[i]; h != NULL; h = next) {
next = h->dtha_next;
dtrace_helper_action_destroy(h, vstate);
h = next;
}
}
mutex_exit(&dtrace_lock);
/*
* Destroy the helper providers.
*/
if (help->dthps_maxprovs > 0) {
mutex_enter(&dtrace_meta_lock);
if (dtrace_meta_pid != NULL) {
ASSERT(dtrace_deferred_pid == NULL);
for (i = 0; i < help->dthps_nprovs; i++) {
dtrace_helper_provider_remove(
&help->dthps_provs[i]->dthp_prov, p->p_pid);
}
} else {
mutex_enter(&dtrace_lock);
ASSERT(help->dthps_deferred == 0 ||
help->dthps_next != NULL ||
help->dthps_prev != NULL ||
help == dtrace_deferred_pid);
/*
* Remove the helper from the deferred list.
*/
if (help->dthps_next != NULL)
help->dthps_next->dthps_prev = help->dthps_prev;
if (help->dthps_prev != NULL)
help->dthps_prev->dthps_next = help->dthps_next;
if (dtrace_deferred_pid == help) {
dtrace_deferred_pid = help->dthps_next;
ASSERT(help->dthps_prev == NULL);
}
mutex_exit(&dtrace_lock);
}
mutex_exit(&dtrace_meta_lock);
for (i = 0; i < help->dthps_nprovs; i++) {
dtrace_helper_provider_destroy(help->dthps_provs[i]);
}
kmem_free(help->dthps_provs, help->dthps_maxprovs *
sizeof (dtrace_helper_provider_t *));
}
mutex_enter(&dtrace_lock);
dtrace_vstate_fini(&help->dthps_vstate);
kmem_free(help->dthps_actions,
sizeof (dtrace_helper_action_t *) * DTRACE_NHELPER_ACTIONS);
kmem_free(help, sizeof (dtrace_helpers_t));
--dtrace_helpers;
mutex_exit(&dtrace_lock);
}
#if defined(sun)
static
#endif
void
dtrace_helpers_duplicate(proc_t *from, proc_t *to)
{
dtrace_helpers_t *help, *newhelp;
dtrace_helper_action_t *helper, *new, *last;
dtrace_difo_t *dp;
dtrace_vstate_t *vstate;
int i, j, sz, hasprovs = 0;
mutex_enter(&dtrace_lock);
ASSERT(from->p_dtrace_helpers != NULL);
ASSERT(dtrace_helpers > 0);
help = from->p_dtrace_helpers;
newhelp = dtrace_helpers_create(to);
ASSERT(to->p_dtrace_helpers != NULL);
newhelp->dthps_generation = help->dthps_generation;
vstate = &newhelp->dthps_vstate;
/*
* Duplicate the helper actions.
*/
for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
if ((helper = help->dthps_actions[i]) == NULL)
continue;
for (last = NULL; helper != NULL; helper = helper->dtha_next) {
new = kmem_zalloc(sizeof (dtrace_helper_action_t),
KM_SLEEP);
new->dtha_generation = helper->dtha_generation;
if ((dp = helper->dtha_predicate) != NULL) {
dp = dtrace_difo_duplicate(dp, vstate);
new->dtha_predicate = dp;
}
new->dtha_nactions = helper->dtha_nactions;
sz = sizeof (dtrace_difo_t *) * new->dtha_nactions;
new->dtha_actions = kmem_alloc(sz, KM_SLEEP);
for (j = 0; j < new->dtha_nactions; j++) {
dtrace_difo_t *dp = helper->dtha_actions[j];
ASSERT(dp != NULL);
dp = dtrace_difo_duplicate(dp, vstate);
new->dtha_actions[j] = dp;
}
if (last != NULL) {
last->dtha_next = new;
} else {
newhelp->dthps_actions[i] = new;
}
last = new;
}
}
/*
* Duplicate the helper providers and register them with the
* DTrace framework.
*/
if (help->dthps_nprovs > 0) {
newhelp->dthps_nprovs = help->dthps_nprovs;
newhelp->dthps_maxprovs = help->dthps_nprovs;
newhelp->dthps_provs = kmem_alloc(newhelp->dthps_nprovs *
sizeof (dtrace_helper_provider_t *), KM_SLEEP);
for (i = 0; i < newhelp->dthps_nprovs; i++) {
newhelp->dthps_provs[i] = help->dthps_provs[i];
newhelp->dthps_provs[i]->dthp_ref++;
}
hasprovs = 1;
}
mutex_exit(&dtrace_lock);
if (hasprovs)
dtrace_helper_provider_register(to, newhelp, NULL);
}
#if defined(sun)
/*
* DTrace Hook Functions
*/
static void
dtrace_module_loaded(modctl_t *ctl)
{
dtrace_provider_t *prv;
mutex_enter(&dtrace_provider_lock);
mutex_enter(&mod_lock);
ASSERT(ctl->mod_busy);
/*
* We're going to call each provider's per-module provide operation
* specifying only this module.
*/
for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
mutex_exit(&mod_lock);
mutex_exit(&dtrace_provider_lock);
/*
* If we have any retained enablings, we need to match against them.
* Enabling probes requires that cpu_lock be held, and we cannot hold
* cpu_lock here -- it is legal for cpu_lock to be held when loading a
* module. (In particular, this happens when loading scheduling
* classes.) So if we have any retained enablings, we need to dispatch
* our task queue to do the match for us.
*/
mutex_enter(&dtrace_lock);
if (dtrace_retained == NULL) {
mutex_exit(&dtrace_lock);
return;
}
(void) taskq_dispatch(dtrace_taskq,
(task_func_t *)dtrace_enabling_matchall, NULL, TQ_SLEEP);
mutex_exit(&dtrace_lock);
/*
* And now, for a little heuristic sleaze: in general, we want to
* match modules as soon as they load. However, we cannot guarantee
* this, because it would lead us to the lock ordering violation
* outlined above. The common case, of course, is that cpu_lock is
* _not_ held -- so we delay here for a clock tick, hoping that that's
* long enough for the task queue to do its work. If it's not, it's
* not a serious problem -- it just means that the module that we
* just loaded may not be immediately instrumentable.
*/
delay(1);
}
static void
dtrace_module_unloaded(modctl_t *ctl)
{
dtrace_probe_t template, *probe, *first, *next;
dtrace_provider_t *prov;
template.dtpr_mod = ctl->mod_modname;
mutex_enter(&dtrace_provider_lock);
mutex_enter(&mod_lock);
mutex_enter(&dtrace_lock);
if (dtrace_bymod == NULL) {
/*
* The DTrace module is loaded (obviously) but not attached;
* we don't have any work to do.
*/
mutex_exit(&dtrace_provider_lock);
mutex_exit(&mod_lock);
mutex_exit(&dtrace_lock);
return;
}
for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template);
probe != NULL; probe = probe->dtpr_nextmod) {
if (probe->dtpr_ecb != NULL) {
mutex_exit(&dtrace_provider_lock);
mutex_exit(&mod_lock);
mutex_exit(&dtrace_lock);
/*
* This shouldn't _actually_ be possible -- we're
* unloading a module that has an enabled probe in it.
* (It's normally up to the provider to make sure that
* this can't happen.) However, because dtps_enable()
* doesn't have a failure mode, there can be an
* enable/unload race. Upshot: we don't want to
* assert, but we're not going to disable the
* probe, either.
*/
if (dtrace_err_verbose) {
cmn_err(CE_WARN, "unloaded module '%s' had "
"enabled probes", ctl->mod_modname);
}
return;
}
}
probe = first;
for (first = NULL; probe != NULL; probe = next) {
ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe);
dtrace_probes[probe->dtpr_id - 1] = NULL;
next = probe->dtpr_nextmod;
dtrace_hash_remove(dtrace_bymod, probe);
dtrace_hash_remove(dtrace_byfunc, probe);
dtrace_hash_remove(dtrace_byname, probe);
if (first == NULL) {
first = probe;
probe->dtpr_nextmod = NULL;
} else {
probe->dtpr_nextmod = first;
first = probe;
}
}
/*
* We've removed all of the module's probes from the hash chains and
* from the probe array. Now issue a dtrace_sync() to be sure that
* everyone has cleared out from any probe array processing.
*/
dtrace_sync();
for (probe = first; probe != NULL; probe = first) {
first = probe->dtpr_nextmod;
prov = probe->dtpr_provider;
prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
probe->dtpr_arg);
kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
kmem_free(probe, sizeof (dtrace_probe_t));
}
mutex_exit(&dtrace_lock);
mutex_exit(&mod_lock);
mutex_exit(&dtrace_provider_lock);
}
static void
dtrace_suspend(void)
{
dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
}
static void
dtrace_resume(void)
{
dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume));
}
#endif
static int
dtrace_cpu_setup(cpu_setup_t what, processorid_t cpu)
{
ASSERT(MUTEX_HELD(&cpu_lock));
mutex_enter(&dtrace_lock);
switch (what) {
case CPU_CONFIG: {
dtrace_state_t *state;
dtrace_optval_t *opt, rs, c;
/*
* For now, we only allocate a new buffer for anonymous state.
*/
if ((state = dtrace_anon.dta_state) == NULL)
break;
if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
break;
opt = state->dts_options;
c = opt[DTRACEOPT_CPU];
if (c != DTRACE_CPUALL && c != DTRACEOPT_UNSET && c != cpu)
break;
/*
* Regardless of what the actual policy is, we're going to
* temporarily set our resize policy to be manual. We're
* also going to temporarily set our CPU option to denote
* the newly configured CPU.
*/
rs = opt[DTRACEOPT_BUFRESIZE];
opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_MANUAL;
opt[DTRACEOPT_CPU] = (dtrace_optval_t)cpu;
(void) dtrace_state_buffers(state);
opt[DTRACEOPT_BUFRESIZE] = rs;
opt[DTRACEOPT_CPU] = c;
break;
}
case CPU_UNCONFIG:
/*
* We don't free the buffer in the CPU_UNCONFIG case. (The
* buffer will be freed when the consumer exits.)
*/
break;
default:
break;
}
mutex_exit(&dtrace_lock);
return (0);
}
#if defined(sun)
static void
dtrace_cpu_setup_initial(processorid_t cpu)
{
(void) dtrace_cpu_setup(CPU_CONFIG, cpu);
}
#endif
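/*
* Record a new toxic address range, doubling the size of the toxic range
* array if it is already full.
*/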
static void
dtrace_toxrange_add(uintptr_t base, uintptr_t limit)
{
if (dtrace_toxranges >= dtrace_toxranges_max) {
int osize, nsize;
dtrace_toxrange_t *range;
osize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
if (osize == 0) {
ASSERT(dtrace_toxrange == NULL);
ASSERT(dtrace_toxranges_max == 0);
dtrace_toxranges_max = 1;
} else {
dtrace_toxranges_max <<= 1;
}
nsize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
range = kmem_zalloc(nsize, KM_SLEEP);
if (dtrace_toxrange != NULL) {
ASSERT(osize != 0);
bcopy(dtrace_toxrange, range, osize);
kmem_free(dtrace_toxrange, osize);
}
dtrace_toxrange = range;
}
ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == 0);
ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == 0);
dtrace_toxrange[dtrace_toxranges].dtt_base = base;
dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
dtrace_toxranges++;
}
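/*
 * A minimal sketch of how the toxic-range array above grows, assuming
 * three consecutive calls starting from an empty table:
 *
 * call 1: max 0 -> 1, allocate 1 * sizeof (dtrace_toxrange_t)
 * call 2: max 1 -> 2, allocate 2 entries, bcopy 1, free the old array
 * call 3: max 2 -> 4, allocate 4 entries, bcopy 2, free the old array
 *
 * i.e. the capacity doubles on demand and existing entries are preserved.
 */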
/*
* DTrace Driver Cookbook Functions
*/
#if defined(sun)
/*ARGSUSED*/
static int
dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
dtrace_provider_id_t id;
dtrace_state_t *state = NULL;
dtrace_enabling_t *enab;
mutex_enter(&cpu_lock);
mutex_enter(&dtrace_provider_lock);
mutex_enter(&dtrace_lock);
if (ddi_soft_state_init(&dtrace_softstate,
sizeof (dtrace_state_t), 0) != 0) {
cmn_err(CE_NOTE, "/dev/dtrace failed to initialize soft state");
mutex_exit(&cpu_lock);
mutex_exit(&dtrace_provider_lock);
mutex_exit(&dtrace_lock);
return (DDI_FAILURE);
}
if (ddi_create_minor_node(devi, DTRACEMNR_DTRACE, S_IFCHR,
DTRACEMNRN_DTRACE, DDI_PSEUDO, NULL) == DDI_FAILURE ||
ddi_create_minor_node(devi, DTRACEMNR_HELPER, S_IFCHR,
DTRACEMNRN_HELPER, DDI_PSEUDO, NULL) == DDI_FAILURE) {
cmn_err(CE_NOTE, "/dev/dtrace couldn't create minor nodes");
ddi_remove_minor_node(devi, NULL);
ddi_soft_state_fini(&dtrace_softstate);
mutex_exit(&cpu_lock);
mutex_exit(&dtrace_provider_lock);
mutex_exit(&dtrace_lock);
return (DDI_FAILURE);
}
ddi_report_dev(devi);
dtrace_devi = devi;
dtrace_modload = dtrace_module_loaded;
dtrace_modunload = dtrace_module_unloaded;
dtrace_cpu_init = dtrace_cpu_setup_initial;
dtrace_helpers_cleanup = dtrace_helpers_destroy;
dtrace_helpers_fork = dtrace_helpers_duplicate;
dtrace_cpustart_init = dtrace_suspend;
dtrace_cpustart_fini = dtrace_resume;
dtrace_debugger_init = dtrace_suspend;
dtrace_debugger_fini = dtrace_resume;
register_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
ASSERT(MUTEX_HELD(&cpu_lock));
dtrace_arena = vmem_create("dtrace", (void *)1, UINT32_MAX, 1,
NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
dtrace_minor = vmem_create("dtrace_minor", (void *)DTRACEMNRN_CLONE,
UINT32_MAX - DTRACEMNRN_CLONE, 1, NULL, NULL, NULL, 0,
VM_SLEEP | VMC_IDENTIFIER);
dtrace_taskq = taskq_create("dtrace_taskq", 1, maxclsyspri,
1, INT_MAX, 0);
dtrace_state_cache = kmem_cache_create("dtrace_state_cache",
sizeof (dtrace_dstate_percpu_t) * NCPU, DTRACE_STATE_ALIGN,
NULL, NULL, NULL, NULL, NULL, 0);
ASSERT(MUTEX_HELD(&cpu_lock));
dtrace_bymod = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_mod),
offsetof(dtrace_probe_t, dtpr_nextmod),
offsetof(dtrace_probe_t, dtpr_prevmod));
dtrace_byfunc = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_func),
offsetof(dtrace_probe_t, dtpr_nextfunc),
offsetof(dtrace_probe_t, dtpr_prevfunc));
dtrace_byname = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_name),
offsetof(dtrace_probe_t, dtpr_nextname),
offsetof(dtrace_probe_t, dtpr_prevname));
if (dtrace_retain_max < 1) {
cmn_err(CE_WARN, "illegal value (%lu) for dtrace_retain_max; "
"setting to 1", dtrace_retain_max);
dtrace_retain_max = 1;
}
/*
* Now discover our toxic ranges.
*/
dtrace_toxic_ranges(dtrace_toxrange_add);
/*
* Before we register ourselves as a provider to our own framework,
* we would like to assert that dtrace_provider is NULL -- but that's
* not true if we were loaded as a dependency of a DTrace provider.
* Once we've registered, we can assert that dtrace_provider is our
* pseudo provider.
*/
(void) dtrace_register("dtrace", &dtrace_provider_attr,
DTRACE_PRIV_NONE, 0, &dtrace_provider_ops, NULL, &id);
ASSERT(dtrace_provider != NULL);
ASSERT((dtrace_provider_id_t)dtrace_provider == id);
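/*
 * Create the three probes published by the framework itself:
 * dtrace:::BEGIN, dtrace:::END and dtrace:::ERROR.
 */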
dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
dtrace_provider, NULL, NULL, "BEGIN", 0, NULL);
dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
dtrace_provider, NULL, NULL, "END", 0, NULL);
dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
dtrace_provider, NULL, NULL, "ERROR", 1, NULL);
dtrace_anon_property();
mutex_exit(&cpu_lock);
/*
* If DTrace helper tracing is enabled, we need to allocate the
* trace buffer and initialize the values.
*/
if (dtrace_helptrace_enabled) {
ASSERT(dtrace_helptrace_buffer == NULL);
dtrace_helptrace_buffer =
kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
dtrace_helptrace_next = 0;
}
/*
* If there are already providers, we must ask them to provide their
* probes, and then match any anonymous enabling against them. Note
* that there should be no other retained enablings at this time: the
* only retained enabling should be the anonymous enabling.
*/
if (dtrace_anon.dta_enabling != NULL) {
ASSERT(dtrace_retained == dtrace_anon.dta_enabling);
dtrace_enabling_provide(NULL);
state = dtrace_anon.dta_state;
/*
* We couldn't hold cpu_lock across the above call to
* dtrace_enabling_provide(), but we must hold it to actually
* enable the probes. We have to drop all of our locks, pick
* up cpu_lock, and regain our locks before matching the
* retained anonymous enabling.
*/
mutex_exit(&dtrace_lock);
mutex_exit(&dtrace_provider_lock);
mutex_enter(&cpu_lock);
mutex_enter(&dtrace_provider_lock);
mutex_enter(&dtrace_lock);
if ((enab = dtrace_anon.dta_enabling) != NULL)
(void) dtrace_enabling_match(enab, NULL);
mutex_exit(&cpu_lock);
}
mutex_exit(&dtrace_lock);
mutex_exit(&dtrace_provider_lock);
if (state != NULL) {
/*
* If we created any anonymous state, set it going now.
*/
(void) dtrace_state_go(state, &dtrace_anon.dta_beganon);
}
return (DDI_SUCCESS);
}
#endif
#if !defined(sun)
#if __FreeBSD_version >= 800039
static void
dtrace_dtr(void *data __unused)
{
}
#endif
#endif
/*ARGSUSED*/
static int
#if defined(sun)
dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
#else
dtrace_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
#endif
{
dtrace_state_t *state;
uint32_t priv;
uid_t uid;
zoneid_t zoneid;
#if defined(sun)
if (getminor(*devp) == DTRACEMNRN_HELPER)
return (0);
/*
* If this wasn't an open with the "helper" minor, then it must be
* the "dtrace" minor.
*/
ASSERT(getminor(*devp) == DTRACEMNRN_DTRACE);
#else
cred_t *cred_p = NULL;
#if __FreeBSD_version < 800039
/*
* The first minor device is the one that is cloned so there is
* nothing more to do here.
*/
if (dev2unit(dev) == 0)
return 0;
/*
* Devices are cloned, so if the DTrace state has already
* been allocated, that means this device belongs to a
* different client. Each client should open '/dev/dtrace'
* to get a cloned device.
*/
if (dev->si_drv1 != NULL)
return (EBUSY);
#endif
cred_p = dev->si_cred;
#endif
/*
* If no DTRACE_PRIV_* bits are set in the credential, then the
* caller lacks sufficient permission to do anything with DTrace.
*/
dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
if (priv == DTRACE_PRIV_NONE) {
#if !defined(sun)
#if __FreeBSD_version < 800039
/* Destroy the cloned device. */
destroy_dev(dev);
#endif
#endif
return (EACCES);
}
/*
* Ask all providers to provide all their probes.
*/
mutex_enter(&dtrace_provider_lock);
dtrace_probe_provide(NULL, NULL);
mutex_exit(&dtrace_provider_lock);
mutex_enter(&cpu_lock);
mutex_enter(&dtrace_lock);
dtrace_opens++;
dtrace_membar_producer();
#if defined(sun)
/*
* If the kernel debugger is active (that is, if the kernel debugger
* modified text in some way), we won't allow the open.
*/
if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
dtrace_opens--;
mutex_exit(&cpu_lock);
mutex_exit(&dtrace_lock);
return (EBUSY);
}
state = dtrace_state_create(devp, cred_p);
#else
state = dtrace_state_create(dev);
#if __FreeBSD_version < 800039
dev->si_drv1 = state;
#else
devfs_set_cdevpriv(state, dtrace_dtr);
#endif
#endif
mutex_exit(&cpu_lock);
if (state == NULL) {
#if defined(sun)
if (--dtrace_opens == 0)
(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
#else
--dtrace_opens;
#endif
mutex_exit(&dtrace_lock);
#if !defined(sun)
#if __FreeBSD_version < 800039
/* Destroy the cloned device. */
destroy_dev(dev);
#endif
#endif
return (EAGAIN);
}
mutex_exit(&dtrace_lock);
return (0);
}
/*ARGSUSED*/
static int
#if defined(sun)
dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
#else
dtrace_close(struct cdev *dev, int flags, int fmt __unused, struct thread *td)
#endif
{
#if defined(sun)
minor_t minor = getminor(dev);
dtrace_state_t *state;
if (minor == DTRACEMNRN_HELPER)
return (0);
state = ddi_get_soft_state(dtrace_softstate, minor);
#else
#if __FreeBSD_version < 800039
dtrace_state_t *state = dev->si_drv1;
/* Check if this is not a cloned device. */
if (dev2unit(dev) == 0)
return (0);
#else
dtrace_state_t *state;
devfs_get_cdevpriv((void **) &state);
#endif
#endif
mutex_enter(&cpu_lock);
mutex_enter(&dtrace_lock);
if (state != NULL) {
if (state->dts_anon) {
/*
* There is anonymous state. Destroy that first.
*/
ASSERT(dtrace_anon.dta_state == NULL);
dtrace_state_destroy(state->dts_anon);
}
dtrace_state_destroy(state);
#if !defined(sun)
kmem_free(state, 0);
#if __FreeBSD_version < 800039
dev->si_drv1 = NULL;
#else
devfs_clear_cdevpriv();
#endif
#endif
}
ASSERT(dtrace_opens > 0);
#if defined(sun)
if (--dtrace_opens == 0)
(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
#else
--dtrace_opens;
#endif
mutex_exit(&dtrace_lock);
mutex_exit(&cpu_lock);
#if __FreeBSD_version < 800039
/* Schedule this cloned device to be destroyed. */
destroy_dev_sched(dev);
#endif
return (0);
}
#if defined(sun)
/*ARGSUSED*/
static int
dtrace_ioctl_helper(int cmd, intptr_t arg, int *rv)
{
int rval;
dof_helper_t help, *dhp = NULL;
switch (cmd) {
case DTRACEHIOC_ADDDOF:
if (copyin((void *)arg, &help, sizeof (help)) != 0) {
dtrace_dof_error(NULL, "failed to copyin DOF helper");
return (EFAULT);
}
dhp = &help;
arg = (intptr_t)help.dofhp_dof;
/*FALLTHROUGH*/
case DTRACEHIOC_ADD: {
dof_hdr_t *dof = dtrace_dof_copyin(arg, &rval);
if (dof == NULL)
return (rval);
mutex_enter(&dtrace_lock);
/*
* dtrace_helper_slurp() takes responsibility for the dof --
* it may free it now or it may save it and free it later.
*/
if ((rval = dtrace_helper_slurp(dof, dhp)) != -1) {
*rv = rval;
rval = 0;
} else {
rval = EINVAL;
}
mutex_exit(&dtrace_lock);
return (rval);
}
case DTRACEHIOC_REMOVE: {
mutex_enter(&dtrace_lock);
rval = dtrace_helper_destroygen(arg);
mutex_exit(&dtrace_lock);
return (rval);
}
default:
break;
}
return (ENOTTY);
}
/*ARGSUSED*/
static int
dtrace_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
{
minor_t minor = getminor(dev);
dtrace_state_t *state;
int rval;
if (minor == DTRACEMNRN_HELPER)
return (dtrace_ioctl_helper(cmd, arg, rv));
state = ddi_get_soft_state(dtrace_softstate, minor);
if (state->dts_anon) {
ASSERT(dtrace_anon.dta_state == NULL);
state = state->dts_anon;
}
switch (cmd) {
case DTRACEIOC_PROVIDER: {
dtrace_providerdesc_t pvd;
dtrace_provider_t *pvp;
if (copyin((void *)arg, &pvd, sizeof (pvd)) != 0)
return (EFAULT);
pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0';
mutex_enter(&dtrace_provider_lock);
for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
if (strcmp(pvp->dtpv_name, pvd.dtvd_name) == 0)
break;
}
mutex_exit(&dtrace_provider_lock);
if (pvp == NULL)
return (ESRCH);
bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t));
bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t));
if (copyout(&pvd, (void *)arg, sizeof (pvd)) != 0)
return (EFAULT);
return (0);
}
case DTRACEIOC_EPROBE: {
dtrace_eprobedesc_t epdesc;
dtrace_ecb_t *ecb;
dtrace_action_t *act;
void *buf;
size_t size;
uintptr_t dest;
int nrecs;
if (copyin((void *)arg, &epdesc, sizeof (epdesc)) != 0)
return (EFAULT);
mutex_enter(&dtrace_lock);
if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) {
mutex_exit(&dtrace_lock);
return (EINVAL);
}
if (ecb->dte_probe == NULL) {
mutex_exit(&dtrace_lock);
return (EINVAL);
}
epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
epdesc.dtepd_uarg = ecb->dte_uarg;
epdesc.dtepd_size = ecb->dte_size;
nrecs = epdesc.dtepd_nrecs;
epdesc.dtepd_nrecs = 0;
for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
continue;
epdesc.dtepd_nrecs++;
}
/*
* Now that we have the size, we need to allocate a temporary
* buffer in which to store the complete description. We need
* the temporary buffer to be able to drop dtrace_lock()
* across the copyout(), below.
*/
size = sizeof (dtrace_eprobedesc_t) +
(epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));
buf = kmem_alloc(size, KM_SLEEP);
dest = (uintptr_t)buf;
bcopy(&epdesc, (void *)dest, sizeof (epdesc));
dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]);
for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
continue;
if (nrecs-- == 0)
break;
bcopy(&act->dta_rec, (void *)dest,
sizeof (dtrace_recdesc_t));
dest += sizeof (dtrace_recdesc_t);
}
mutex_exit(&dtrace_lock);
if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
kmem_free(buf, size);
return (EFAULT);
}
kmem_free(buf, size);
return (0);
}
case DTRACEIOC_AGGDESC: {
dtrace_aggdesc_t aggdesc;
dtrace_action_t *act;
dtrace_aggregation_t *agg;
int nrecs;
uint32_t offs;
dtrace_recdesc_t *lrec;
void *buf;
size_t size;
uintptr_t dest;
if (copyin((void *)arg, &aggdesc, sizeof (aggdesc)) != 0)
return (EFAULT);
mutex_enter(&dtrace_lock);
if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) {
mutex_exit(&dtrace_lock);
return (EINVAL);
}
aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;
nrecs = aggdesc.dtagd_nrecs;
aggdesc.dtagd_nrecs = 0;
offs = agg->dtag_base;
lrec = &agg->dtag_action.dta_rec;
aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;
for (act = agg->dtag_first; ; act = act->dta_next) {
ASSERT(act->dta_intuple ||
DTRACEACT_ISAGG(act->dta_kind));
/*
* If this action has a record size of zero, it
* denotes an argument to the aggregating action.
* Because the presence of this record doesn't (or
* shouldn't) affect the way the data is interpreted,
* we don't copy it out to spare user level the confusion of dealing
* confusion of dealing with a zero-length record.
*/
if (act->dta_rec.dtrd_size == 0) {
ASSERT(agg->dtag_hasarg);
continue;
}
aggdesc.dtagd_nrecs++;
if (act == &agg->dtag_action)
break;
}
/*
* Now that we have the size, we need to allocate a temporary
* buffer in which to store the complete description. We need
* the temporary buffer to be able to drop dtrace_lock()
* across the copyout(), below.
*/
size = sizeof (dtrace_aggdesc_t) +
(aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));
buf = kmem_alloc(size, KM_SLEEP);
dest = (uintptr_t)buf;
bcopy(&aggdesc, (void *)dest, sizeof (aggdesc));
dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]);
for (act = agg->dtag_first; ; act = act->dta_next) {
dtrace_recdesc_t rec = act->dta_rec;
/*
* See the comment in the above loop for why we pass
* over zero-length records.
*/
if (rec.dtrd_size == 0) {
ASSERT(agg->dtag_hasarg);
continue;
}
if (nrecs-- == 0)
break;
rec.dtrd_offset -= offs;
bcopy(&rec, (void *)dest, sizeof (rec));
dest += sizeof (dtrace_recdesc_t);
if (act == &agg->dtag_action)
break;
}
mutex_exit(&dtrace_lock);
if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
kmem_free(buf, size);
return (EFAULT);
}
kmem_free(buf, size);
return (0);
}
case DTRACEIOC_ENABLE: {
dof_hdr_t *dof;
dtrace_enabling_t *enab = NULL;
dtrace_vstate_t *vstate;
int err = 0;
*rv = 0;
/*
* If a NULL argument has been passed, we take this as our
* cue to reevaluate our enablings.
*/
if (arg == NULL) {
dtrace_enabling_matchall();
return (0);
}
if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL)
return (rval);
mutex_enter(&cpu_lock);
mutex_enter(&dtrace_lock);
vstate = &state->dts_vstate;
if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
mutex_exit(&dtrace_lock);
mutex_exit(&cpu_lock);
dtrace_dof_destroy(dof);
return (EBUSY);
}
if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) {
mutex_exit(&dtrace_lock);
mutex_exit(&cpu_lock);
dtrace_dof_destroy(dof);
return (EINVAL);
}
if ((rval = dtrace_dof_options(dof, state)) != 0) {
dtrace_enabling_destroy(enab);
mutex_exit(&dtrace_lock);
mutex_exit(&cpu_lock);
dtrace_dof_destroy(dof);
return (rval);
}
if ((err = dtrace_enabling_match(enab, rv)) == 0) {
err = dtrace_enabling_retain(enab);
} else {
dtrace_enabling_destroy(enab);
}
mutex_exit(&cpu_lock);
mutex_exit(&dtrace_lock);
dtrace_dof_destroy(dof);
return (err);
}
case DTRACEIOC_REPLICATE: {
dtrace_repldesc_t desc;
dtrace_probedesc_t *match = &desc.dtrpd_match;
dtrace_probedesc_t *create = &desc.dtrpd_create;
int err;
if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
return (EFAULT);
match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
match->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
create->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
mutex_enter(&dtrace_lock);
err = dtrace_enabling_replicate(state, match, create);
mutex_exit(&dtrace_lock);
return (err);
}
case DTRACEIOC_PROBEMATCH:
case DTRACEIOC_PROBES: {
dtrace_probe_t *probe = NULL;
dtrace_probedesc_t desc;
dtrace_probekey_t pkey;
dtrace_id_t i;
int m = 0;
uint32_t priv;
uid_t uid;
zoneid_t zoneid;
if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
return (EFAULT);
desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0';
/*
* Before we attempt to match this probe, we want to give
* all providers the opportunity to provide it.
*/
if (desc.dtpd_id == DTRACE_IDNONE) {
mutex_enter(&dtrace_provider_lock);
dtrace_probe_provide(&desc, NULL);
mutex_exit(&dtrace_provider_lock);
desc.dtpd_id++;
}
if (cmd == DTRACEIOC_PROBEMATCH) {
dtrace_probekey(&desc, &pkey);
pkey.dtpk_id = DTRACE_IDNONE;
}
dtrace_cred2priv(cr, &priv, &uid, &zoneid);
mutex_enter(&dtrace_lock);
if (cmd == DTRACEIOC_PROBEMATCH) {
for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
if ((probe = dtrace_probes[i - 1]) != NULL &&
(m = dtrace_match_probe(probe, &pkey,
priv, uid, zoneid)) != 0)
break;
}
if (m < 0) {
mutex_exit(&dtrace_lock);
return (EINVAL);
}
} else {
for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
if ((probe = dtrace_probes[i - 1]) != NULL &&
dtrace_match_priv(probe, priv, uid, zoneid))
break;
}
}
if (probe == NULL) {
mutex_exit(&dtrace_lock);
return (ESRCH);
}
dtrace_probe_description(probe, &desc);
mutex_exit(&dtrace_lock);
if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
return (EFAULT);
return (0);
}
case DTRACEIOC_PROBEARG: {
dtrace_argdesc_t desc;
dtrace_probe_t *probe;
dtrace_provider_t *prov;
if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
return (EFAULT);
if (desc.dtargd_id == DTRACE_IDNONE)
return (EINVAL);
if (desc.dtargd_ndx == DTRACE_ARGNONE)
return (EINVAL);
mutex_enter(&dtrace_provider_lock);
mutex_enter(&mod_lock);
mutex_enter(&dtrace_lock);
if (desc.dtargd_id > dtrace_nprobes) {
mutex_exit(&dtrace_lock);
mutex_exit(&mod_lock);
mutex_exit(&dtrace_provider_lock);
return (EINVAL);
}
if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) {
mutex_exit(&dtrace_lock);
mutex_exit(&mod_lock);
mutex_exit(&dtrace_provider_lock);
return (EINVAL);
}
mutex_exit(&dtrace_lock);
prov = probe->dtpr_provider;
if (prov->dtpv_pops.dtps_getargdesc == NULL) {
/*
* There isn't any typed information for this probe.
* Set the argument number to DTRACE_ARGNONE.
*/
desc.dtargd_ndx = DTRACE_ARGNONE;
} else {
desc.dtargd_native[0] = '\0';
desc.dtargd_xlate[0] = '\0';
desc.dtargd_mapping = desc.dtargd_ndx;
prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
probe->dtpr_id, probe->dtpr_arg, &desc);
}
mutex_exit(&mod_lock);
mutex_exit(&dtrace_provider_lock);
if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
return (EFAULT);
return (0);
}
case DTRACEIOC_GO: {
processorid_t cpuid;
rval = dtrace_state_go(state, &cpuid);
if (rval != 0)
return (rval);
if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
return (EFAULT);
return (0);
}
case DTRACEIOC_STOP: {
processorid_t cpuid;
mutex_enter(&dtrace_lock);
rval = dtrace_state_stop(state, &cpuid);
mutex_exit(&dtrace_lock);
if (rval != 0)
return (rval);
if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
return (EFAULT);
return (0);
}
case DTRACEIOC_DOFGET: {
dof_hdr_t hdr, *dof;
uint64_t len;
if (copyin((void *)arg, &hdr, sizeof (hdr)) != 0)
return (EFAULT);
mutex_enter(&dtrace_lock);
dof = dtrace_dof_create(state);
mutex_exit(&dtrace_lock);
len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
rval = copyout(dof, (void *)arg, len);
dtrace_dof_destroy(dof);
return (rval == 0 ? 0 : EFAULT);
}
case DTRACEIOC_AGGSNAP:
case DTRACEIOC_BUFSNAP: {
dtrace_bufdesc_t desc;
caddr_t cached;
dtrace_buffer_t *buf;
if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
return (EFAULT);
if (desc.dtbd_cpu < 0 || desc.dtbd_cpu >= NCPU)
return (EINVAL);
mutex_enter(&dtrace_lock);
if (cmd == DTRACEIOC_BUFSNAP) {
buf = &state->dts_buffer[desc.dtbd_cpu];
} else {
buf = &state->dts_aggbuffer[desc.dtbd_cpu];
}
if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) {
size_t sz = buf->dtb_offset;
if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
mutex_exit(&dtrace_lock);
return (EBUSY);
}
/*
* If this buffer has already been consumed, we're
* going to indicate that there's nothing left here
* to consume.
*/
if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
mutex_exit(&dtrace_lock);
desc.dtbd_size = 0;
desc.dtbd_drops = 0;
desc.dtbd_errors = 0;
desc.dtbd_oldest = 0;
sz = sizeof (desc);
if (copyout(&desc, (void *)arg, sz) != 0)
return (EFAULT);
return (0);
}
/*
* If this is a ring buffer that has wrapped, we want
* to copy the whole thing out.
*/
if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
dtrace_buffer_polish(buf);
sz = buf->dtb_size;
}
if (copyout(buf->dtb_tomax, desc.dtbd_data, sz) != 0) {
mutex_exit(&dtrace_lock);
return (EFAULT);
}
desc.dtbd_size = sz;
desc.dtbd_drops = buf->dtb_drops;
desc.dtbd_errors = buf->dtb_errors;
desc.dtbd_oldest = buf->dtb_xamot_offset;
mutex_exit(&dtrace_lock);
if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
return (EFAULT);
buf->dtb_flags |= DTRACEBUF_CONSUMED;
return (0);
}
if (buf->dtb_tomax == NULL) {
ASSERT(buf->dtb_xamot == NULL);
mutex_exit(&dtrace_lock);
return (ENOENT);
}
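/*
 * Remember which buffer is currently active; if the cross call below
 * really runs dtrace_buffer_switch() on the target CPU, dtb_tomax will
 * change and the snapshot to copy out will then be in dtb_xamot.
 */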
cached = buf->dtb_tomax;
ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
dtrace_xcall(desc.dtbd_cpu,
(dtrace_xcall_t)dtrace_buffer_switch, buf);
state->dts_errors += buf->dtb_xamot_errors;
/*
* If the buffers did not actually switch, then the cross call
* did not take place -- presumably because the given CPU is
* not in the ready set. If this is the case, we'll return
* ENOENT.
*/
if (buf->dtb_tomax == cached) {
ASSERT(buf->dtb_xamot != cached);
mutex_exit(&dtrace_lock);
return (ENOENT);
}
ASSERT(cached == buf->dtb_xamot);
/*
* We have our snapshot; now copy it out.
*/
if (copyout(buf->dtb_xamot, desc.dtbd_data,
buf->dtb_xamot_offset) != 0) {
mutex_exit(&dtrace_lock);
return (EFAULT);
}
desc.dtbd_size = buf->dtb_xamot_offset;
desc.dtbd_drops = buf->dtb_xamot_drops;
desc.dtbd_errors = buf->dtb_xamot_errors;
desc.dtbd_oldest = 0;
mutex_exit(&dtrace_lock);
/*
* Finally, copy out the buffer description.
*/
if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
return (EFAULT);
return (0);
}
case DTRACEIOC_CONF: {
dtrace_conf_t conf;
bzero(&conf, sizeof (conf));
conf.dtc_difversion = DIF_VERSION;
conf.dtc_difintregs = DIF_DIR_NREGS;
conf.dtc_diftupregs = DIF_DTR_NREGS;
conf.dtc_ctfmodel = CTF_MODEL_NATIVE;
if (copyout(&conf, (void *)arg, sizeof (conf)) != 0)
return (EFAULT);
return (0);
}
case DTRACEIOC_STATUS: {
dtrace_status_t stat;
dtrace_dstate_t *dstate;
int i, j;
uint64_t nerrs;
/*
* See the comment in dtrace_state_deadman() for the reason
* for setting dts_laststatus to INT64_MAX before setting
* it to the correct value.
*/
state->dts_laststatus = INT64_MAX;
dtrace_membar_producer();
state->dts_laststatus = dtrace_gethrtime();
bzero(&stat, sizeof (stat));
mutex_enter(&dtrace_lock);
if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
mutex_exit(&dtrace_lock);
return (ENOENT);
}
if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
stat.dtst_exiting = 1;
nerrs = state->dts_errors;
dstate = &state->dts_vstate.dtvs_dynvars;
for (i = 0; i < NCPU; i++) {
dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i];
stat.dtst_dyndrops += dcpu->dtdsc_drops;
stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;
if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
stat.dtst_filled++;
nerrs += state->dts_buffer[i].dtb_errors;
for (j = 0; j < state->dts_nspeculations; j++) {
dtrace_speculation_t *spec;
dtrace_buffer_t *buf;
spec = &state->dts_speculations[j];
buf = &spec->dtsp_buffer[i];
stat.dtst_specdrops += buf->dtb_xamot_drops;
}
}
stat.dtst_specdrops_busy = state->dts_speculations_busy;
stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
stat.dtst_stkstroverflows = state->dts_stkstroverflows;
stat.dtst_dblerrors = state->dts_dblerrors;
stat.dtst_killed =
(state->dts_activity == DTRACE_ACTIVITY_KILLED);
stat.dtst_errors = nerrs;
mutex_exit(&dtrace_lock);
if (copyout(&stat, (void *)arg, sizeof (stat)) != 0)
return (EFAULT);
return (0);
}
case DTRACEIOC_FORMAT: {
dtrace_fmtdesc_t fmt;
char *str;
int len;
if (copyin((void *)arg, &fmt, sizeof (fmt)) != 0)
return (EFAULT);
mutex_enter(&dtrace_lock);
if (fmt.dtfd_format == 0 ||
fmt.dtfd_format > state->dts_nformats) {
mutex_exit(&dtrace_lock);
return (EINVAL);
}
/*
* Format strings are allocated contiguously and they are
* never freed; if a format index is less than the number
* of formats, we can assert that the format map is non-NULL
* and that the format for the specified index is non-NULL.
*/
ASSERT(state->dts_formats != NULL);
str = state->dts_formats[fmt.dtfd_format - 1];
ASSERT(str != NULL);
len = strlen(str) + 1;
if (len > fmt.dtfd_length) {
fmt.dtfd_length = len;
if (copyout(&fmt, (void *)arg, sizeof (fmt)) != 0) {
mutex_exit(&dtrace_lock);
return (EINVAL);
}
} else {
if (copyout(str, fmt.dtfd_string, len) != 0) {
mutex_exit(&dtrace_lock);
return (EINVAL);
}
}
mutex_exit(&dtrace_lock);
return (0);
}
default:
break;
}
return (ENOTTY);
}
/*ARGSUSED*/
static int
dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
dtrace_state_t *state;
switch (cmd) {
case DDI_DETACH:
break;
case DDI_SUSPEND:
return (DDI_SUCCESS);
default:
return (DDI_FAILURE);
}
mutex_enter(&cpu_lock);
mutex_enter(&dtrace_provider_lock);
mutex_enter(&dtrace_lock);
ASSERT(dtrace_opens == 0);
if (dtrace_helpers > 0) {
mutex_exit(&dtrace_provider_lock);
mutex_exit(&dtrace_lock);
mutex_exit(&cpu_lock);
return (DDI_FAILURE);
}
if (dtrace_unregister((dtrace_provider_id_t)dtrace_provider) != 0) {
mutex_exit(&dtrace_provider_lock);
mutex_exit(&dtrace_lock);
mutex_exit(&cpu_lock);
return (DDI_FAILURE);
}
dtrace_provider = NULL;
if ((state = dtrace_anon_grab()) != NULL) {
/*
* If there were ECBs on this state, the provider should not
* have been allowed to detach; assert that there are none.
*/
ASSERT(state->dts_necbs == 0);
dtrace_state_destroy(state);
/*
* If we're being detached with anonymous state, we need to
* indicate to the kernel debugger that DTrace is now inactive.
*/
(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
}
bzero(&dtrace_anon, sizeof (dtrace_anon_t));
unregister_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
dtrace_cpu_init = NULL;
dtrace_helpers_cleanup = NULL;
dtrace_helpers_fork = NULL;
dtrace_cpustart_init = NULL;
dtrace_cpustart_fini = NULL;
dtrace_debugger_init = NULL;
dtrace_debugger_fini = NULL;
dtrace_modload = NULL;
dtrace_modunload = NULL;
mutex_exit(&cpu_lock);
if (dtrace_helptrace_enabled) {
kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_bufsize);
dtrace_helptrace_buffer = NULL;
}
kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
dtrace_probes = NULL;
dtrace_nprobes = 0;
dtrace_hash_destroy(dtrace_bymod);
dtrace_hash_destroy(dtrace_byfunc);
dtrace_hash_destroy(dtrace_byname);
dtrace_bymod = NULL;
dtrace_byfunc = NULL;
dtrace_byname = NULL;
kmem_cache_destroy(dtrace_state_cache);
vmem_destroy(dtrace_minor);
vmem_destroy(dtrace_arena);
if (dtrace_toxrange != NULL) {
kmem_free(dtrace_toxrange,
dtrace_toxranges_max * sizeof (dtrace_toxrange_t));
dtrace_toxrange = NULL;
dtrace_toxranges = 0;
dtrace_toxranges_max = 0;
}
ddi_remove_minor_node(dtrace_devi, NULL);
dtrace_devi = NULL;
ddi_soft_state_fini(&dtrace_softstate);
ASSERT(dtrace_vtime_references == 0);
ASSERT(dtrace_opens == 0);
ASSERT(dtrace_retained == NULL);
mutex_exit(&dtrace_lock);
mutex_exit(&dtrace_provider_lock);
/*
* We don't destroy the task queue until after we have dropped our
* locks (taskq_destroy() may block on running tasks). To prevent
* attempting to do work after we have effectively detached but before
* the task queue has been destroyed, all tasks dispatched via the
* task queue must check that DTrace is still attached before
* performing any operation.
*/
taskq_destroy(dtrace_taskq);
dtrace_taskq = NULL;
return (DDI_SUCCESS);
}
#endif
#if defined(sun)
/*ARGSUSED*/
static int
dtrace_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
int error;
switch (infocmd) {
case DDI_INFO_DEVT2DEVINFO:
*result = (void *)dtrace_devi;
error = DDI_SUCCESS;
break;
case DDI_INFO_DEVT2INSTANCE:
*result = (void *)0;
error = DDI_SUCCESS;
break;
default:
error = DDI_FAILURE;
}
return (error);
}
#endif
#if defined(sun)
static struct cb_ops dtrace_cb_ops = {
dtrace_open, /* open */
dtrace_close, /* close */
nulldev, /* strategy */
nulldev, /* print */
nodev, /* dump */
nodev, /* read */
nodev, /* write */
dtrace_ioctl, /* ioctl */
nodev, /* devmap */
nodev, /* mmap */
nodev, /* segmap */
nochpoll, /* poll */
ddi_prop_op, /* cb_prop_op */
0, /* streamtab */
D_NEW | D_MP /* Driver compatibility flag */
};
static struct dev_ops dtrace_ops = {
DEVO_REV, /* devo_rev */
0, /* refcnt */
dtrace_info, /* get_dev_info */
nulldev, /* identify */
nulldev, /* probe */
dtrace_attach, /* attach */
dtrace_detach, /* detach */
nodev, /* reset */
&dtrace_cb_ops, /* driver operations */
NULL, /* bus operations */
nodev /* dev power */
};
static struct modldrv modldrv = {
&mod_driverops, /* module type (this is a pseudo driver) */
"Dynamic Tracing", /* name of module */
&dtrace_ops, /* driver ops */
};
static struct modlinkage modlinkage = {
MODREV_1,
(void *)&modldrv,
NULL
};
int
_init(void)
{
return (mod_install(&modlinkage));
}
int
_info(struct modinfo *modinfop)
{
return (mod_info(&modlinkage, modinfop));
}
int
_fini(void)
{
return (mod_remove(&modlinkage));
}
#else
static d_ioctl_t dtrace_ioctl;
static d_ioctl_t dtrace_ioctl_helper;
static void dtrace_load(void *);
static int dtrace_unload(void);
#if __FreeBSD_version < 800039
static void dtrace_clone(void *, struct ucred *, char *, int , struct cdev **);
static struct clonedevs *dtrace_clones; /* Ptr to the array of cloned devices. */
static eventhandler_tag eh_tag; /* Event handler tag. */
#else
static struct cdev *dtrace_dev;
static struct cdev *helper_dev;
#endif
void dtrace_invop_init(void);
void dtrace_invop_uninit(void);
static struct cdevsw dtrace_cdevsw = {
.d_version = D_VERSION,
.d_flags = D_TRACKCLOSE | D_NEEDMINOR,
.d_close = dtrace_close,
.d_ioctl = dtrace_ioctl,
.d_open = dtrace_open,
.d_name = "dtrace",
};
static struct cdevsw helper_cdevsw = {
.d_version = D_VERSION,
.d_flags = D_TRACKCLOSE | D_NEEDMINOR,
.d_ioctl = dtrace_ioctl_helper,
.d_name = "helper",
};
#include <dtrace_anon.c>
#if __FreeBSD_version < 800039
#include <dtrace_clone.c>
#endif
#include <dtrace_ioctl.c>
#include <dtrace_load.c>
#include <dtrace_modevent.c>
#include <dtrace_sysctl.c>
#include <dtrace_unload.c>
#include <dtrace_vtime.c>
#include <dtrace_hacks.c>
#include <dtrace_isa.c>
SYSINIT(dtrace_load, SI_SUB_DTRACE, SI_ORDER_FIRST, dtrace_load, NULL);
SYSUNINIT(dtrace_unload, SI_SUB_DTRACE, SI_ORDER_FIRST, dtrace_unload, NULL);
SYSINIT(dtrace_anon_init, SI_SUB_DTRACE_ANON, SI_ORDER_FIRST, dtrace_anon_init, NULL);
DEV_MODULE(dtrace, dtrace_modevent, NULL);
MODULE_VERSION(dtrace, 1);
MODULE_DEPEND(dtrace, cyclic, 1, 1, 1);
MODULE_DEPEND(dtrace, opensolaris, 1, 1, 1);
#endif
Index: head/sys/cddl/contrib/opensolaris/uts/sparc/dtrace/fasttrap_isa.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/sparc/dtrace/fasttrap_isa.c (revision 225616)
+++ head/sys/cddl/contrib/opensolaris/uts/sparc/dtrace/fasttrap_isa.c (revision 225617)
@@ -1,1597 +1,1597 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/fasttrap_isa.h>
#include <sys/fasttrap_impl.h>
#include <sys/dtrace.h>
#include <sys/dtrace_impl.h>
#include <sys/cmn_err.h>
#include <sys/frame.h>
#include <sys/stack.h>
#include <sys/sysmacros.h>
#include <sys/trap.h>
#include <v9/sys/machpcb.h>
#include <v9/sys/privregs.h>
/*
* Lossless User-Land Tracing on SPARC
* -----------------------------------
*
* The Basic Idea
*
* The most important design constraint is, of course, correct execution of
* the user thread above all else. The next most important goal is rapid
* execution. We combine execution of instructions in user-land with
* emulation of certain instructions in the kernel to aim for complete
* correctness and maximal performance.
*
* We take advantage of the split PC/NPC architecture to speed up logical
* single-stepping; when we copy an instruction out to the scratch space in
* the ulwp_t structure (held in the %g7 register on SPARC), we can
* effectively single step by setting the PC to our scratch space and leaving
* the NPC alone. This executes the replaced instruction and then continues
* on without having to reenter the kernel as with single-stepping. The
* obvious caveat is for instructions whose execution is PC dependent --
* branches, call and link instructions (call and jmpl), and the rdpc
* instruction. These instructions cannot be executed in the manner described
* so they must be emulated in the kernel.
*
* Emulation for this small set of instructions is fairly simple; the most
* difficult part being emulating branch conditions.
*
*
* A Cache Heavy Portfolio
*
* It's important to note at this time that copying an instruction out to the
* ulwp_t scratch space in user-land is rather complicated. SPARC has
* separate data and instruction caches so any writes to the D$ (using a
* store instruction for example) aren't necessarily reflected in the I$.
* The flush instruction can be used to synchronize the two and must be used
* for any self-modifying code, but the flush instruction only applies to the
* primary address space (the absence of a flusha analogue to the flush
* instruction that accepts an ASI argument is an obvious omission from SPARC
* v9 where the notion of the alternate address space was introduced on
* SPARC). To correctly copy out the instruction we must use a block store
* that doesn't allocate in the D$ and ensures synchronization with the I$;
* see dtrace_blksuword32() for the implementation (this function uses
* ASI_BLK_COMMIT_S to write a block through the secondary ASI in the manner
* described). Refer to the UltraSPARC I/II manual for details on the
* ASI_BLK_COMMIT_S ASI.
*
*
* Return Subtleties
*
* When we're firing a return probe we need to expose the value returned by
* the function being traced. Since the function can set the return value
* in its last instruction, we need to fire the return probe only _after_
* the effects of the instruction are apparent. For instructions that we
* emulate, we can call dtrace_probe() after we've performed the emulation;
* for instructions that we execute after we return to user-land, we set
* %pc to the instruction we copied out (as described above) and set %npc
* to a trap instruction stashed in the ulwp_t structure. After the traced
* instruction is executed, the trap instruction returns control to the
* kernel where we can fire the return probe.
*
* This need for a second trap in cases where we execute the traced
* instruction makes it all the more important to emulate the most common
* instructions to avoid the second trip in and out of the kernel.
*
*
* Making it Fast
*
* Since copying out an instruction is neither simple nor inexpensive for the
* CPU, we should attempt to avoid doing it in as many cases as possible.
* Since function entry and return are usually the most interesting probe
* sites, we attempt to tune the performance of the fasttrap provider around
* instructions typically in those places.
*
* Looking at a bunch of functions in libraries and executables reveals that
* most functions begin with either a save or a sethi (to set up a larger
* argument to the save) and end with a restore or an or (in the case of leaf
* functions). To try to improve performance, we emulate all of these
* instructions in the kernel.
*
* The save and restore instructions are a little tricky since they perform
* register window manipulation. Rather than trying to tinker with the
* register windows from the kernel, we emulate the implicit add that takes
* place as part of those instructions and set the %pc to point to a simple
* save or restore we've hidden in the ulwp_t structure. If we're in a return
* probe and want to make it seem as though the tracepoint has been completely
* executed, we need to remember that we've pulled this trick with restore and
* pull registers from the previous window (the one that we'll switch to once
* the simple store instruction is executed) rather than the current one. This
* is why in the case of emulating a restore we set the DTrace CPU flag
* CPU_DTRACE_FAKERESTORE before calling dtrace_probe() for the return probes
* (see fasttrap_return_common()).
*/
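/*
 * A minimal sketch of the logical single-step described above, assuming
 * "scratch" is the per-thread scratch space reached through %g7 (see the
 * FASTTRAP_T_COMMON case in fasttrap_pid_probe() below):
 *
 *	scratch[0] = <traced instruction>;	copied out with
 *						dtrace_blksuword32()
 *	rp->r_pc = (uintptr_t)scratch;		execute the copy first...
 *	(rp->r_npc is left untouched)		...then flow on to the
 *						original successor
 */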
#define OP(x) ((x) >> 30)
#define OP2(x) (((x) >> 22) & 0x07)
#define OP3(x) (((x) >> 19) & 0x3f)
#define RCOND(x) (((x) >> 25) & 0x07)
#define COND(x) (((x) >> 25) & 0x0f)
#define A(x) (((x) >> 29) & 0x01)
#define I(x) (((x) >> 13) & 0x01)
#define RD(x) (((x) >> 25) & 0x1f)
#define RS1(x) (((x) >> 14) & 0x1f)
#define RS2(x) (((x) >> 0) & 0x1f)
#define CC(x) (((x) >> 20) & 0x03)
#define DISP16(x) ((((x) >> 6) & 0xc000) | ((x) & 0x3fff))
#define DISP22(x) ((x) & 0x3fffff)
#define DISP19(x) ((x) & 0x7ffff)
#define DISP30(x) ((x) & 0x3fffffff)
#define SW_TRAP(x) ((x) & 0x7f)
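/*
 * The macros above pull fields out of a raw 32-bit SPARC instruction
 * word. DISP16() is the only non-obvious one: the 16-bit displacement of
 * the branch-on-register format is split across bits 21:20 (high part)
 * and bits 13:0 (low part), and the macro stitches the two back together.
 */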
#define OP3_OR 0x02
#define OP3_RD 0x28
#define OP3_JMPL 0x38
#define OP3_RETURN 0x39
#define OP3_TCC 0x3a
#define OP3_SAVE 0x3c
#define OP3_RESTORE 0x3d
#define OP3_PREFETCH 0x2d
#define OP3_CASA 0x3c
#define OP3_PREFETCHA 0x3d
#define OP3_CASXA 0x3e
#define OP2_ILLTRAP 0x0
#define OP2_BPcc 0x1
#define OP2_Bicc 0x2
#define OP2_BPr 0x3
#define OP2_SETHI 0x4
#define OP2_FBPfcc 0x5
#define OP2_FBfcc 0x6
#define R_G0 0
#define R_O0 8
#define R_SP 14
#define R_I0 24
#define R_I1 25
#define R_I2 26
#define R_I3 27
#define R_I4 28
/*
* Check the comment in fasttrap.h when changing these offsets or adding
* new instructions.
*/
#define FASTTRAP_OFF_SAVE 64
#define FASTTRAP_OFF_RESTORE 68
#define FASTTRAP_OFF_FTRET 72
#define FASTTRAP_OFF_RETURN 76
#define BREAKPOINT_INSTR 0x91d02001 /* ta 1 */
/*
* Tunable to let users turn off the fancy save instruction optimization.
* If a program is non-ABI compliant, there's a possibility that the save
* instruction optimization could cause an error.
*/
int fasttrap_optimize_save = 1;
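/*
 * Fetch the argno'th argument at a probe site. The first six arguments
 * live in %o0..%o5, which are contiguous in struct regs -- hence the
 * (&rp->r_o0)[argno] indexing below. Later arguments are read from the
 * argument dump area of the caller's frame under NOFAULT protection,
 * since the user stack may not be mapped.
 */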
static uint64_t
fasttrap_anarg(struct regs *rp, int argno)
{
uint64_t value;
if (argno < 6)
return ((&rp->r_o0)[argno]);
if (curproc->p_model == DATAMODEL_NATIVE) {
struct frame *fr = (struct frame *)(rp->r_sp + STACK_BIAS);
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
value = dtrace_fulword(&fr->fr_argd[argno]);
DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR |
CPU_DTRACE_BADALIGN);
} else {
struct frame32 *fr = (struct frame32 *)rp->r_sp;
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
value = dtrace_fuword32(&fr->fr_argd[argno]);
DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR |
CPU_DTRACE_BADALIGN);
}
return (value);
}
static ulong_t fasttrap_getreg(struct regs *, uint_t);
static void fasttrap_putreg(struct regs *, uint_t, ulong_t);
static void
fasttrap_usdt_args(fasttrap_probe_t *probe, struct regs *rp,
uint_t fake_restore, int argc, uintptr_t *argv)
{
int i, x, cap = MIN(argc, probe->ftp_nargs);
int inc = (fake_restore ? 16 : 0);
/*
* The only way we'll hit the fake_restore case is if a USDT probe is
* invoked as a tail-call. While it wouldn't be incorrect, we can
* avoid a call to fasttrap_getreg(), and safely use rp->r_sp
* directly since a tail-call can't be made if the invoked function
* would use the argument dump space (i.e. if there were more than
* 6 arguments). We take this shortcut because unconditionally rooting
* around for R_FP (R_SP + 16) would be unnecessarily painful.
*/
if (curproc->p_model == DATAMODEL_NATIVE) {
struct frame *fr = (struct frame *)(rp->r_sp + STACK_BIAS);
uintptr_t v;
for (i = 0; i < cap; i++) {
x = probe->ftp_argmap[i];
if (x < 6)
argv[i] = fasttrap_getreg(rp, R_O0 + x + inc);
else if (fasttrap_fulword(&fr->fr_argd[x], &v) != 0)
argv[i] = 0;
}
} else {
struct frame32 *fr = (struct frame32 *)rp->r_sp;
uint32_t v;
for (i = 0; i < cap; i++) {
x = probe->ftp_argmap[i];
if (x < 6)
argv[i] = fasttrap_getreg(rp, R_O0 + x + inc);
else if (fasttrap_fuword32(&fr->fr_argd[x], &v) != 0)
argv[i] = 0;
}
}
for (; i < argc; i++) {
argv[i] = 0;
}
}
static void
fasttrap_return_common(struct regs *rp, uintptr_t pc, pid_t pid,
uint_t fake_restore)
{
fasttrap_tracepoint_t *tp;
fasttrap_bucket_t *bucket;
fasttrap_id_t *id;
kmutex_t *pid_mtx;
dtrace_icookie_t cookie;
pid_mtx = &cpu_core[CPU->cpu_id].cpuc_pid_lock;
mutex_enter(pid_mtx);
bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)];
for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) {
if (pid == tp->ftt_pid && pc == tp->ftt_pc &&
tp->ftt_proc->ftpc_acount != 0)
break;
}
/*
* Don't sweat it if we can't find the tracepoint again; unlike
* when we're in fasttrap_pid_probe(), finding the tracepoint here
* is not essential to the correct execution of the process.
*/
if (tp == NULL || tp->ftt_retids == NULL) {
mutex_exit(pid_mtx);
return;
}
for (id = tp->ftt_retids; id != NULL; id = id->fti_next) {
fasttrap_probe_t *probe = id->fti_probe;
if (id->fti_ptype == DTFTP_POST_OFFSETS) {
if (probe->ftp_argmap != NULL && fake_restore) {
uintptr_t t[5];
fasttrap_usdt_args(probe, rp, fake_restore,
sizeof (t) / sizeof (t[0]), t);
cookie = dtrace_interrupt_disable();
DTRACE_CPUFLAG_SET(CPU_DTRACE_FAKERESTORE);
dtrace_probe(probe->ftp_id, t[0], t[1],
t[2], t[3], t[4]);
DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_FAKERESTORE);
dtrace_interrupt_enable(cookie);
} else if (probe->ftp_argmap != NULL) {
uintptr_t t[5];
fasttrap_usdt_args(probe, rp, fake_restore,
sizeof (t) / sizeof (t[0]), t);
dtrace_probe(probe->ftp_id, t[0], t[1],
t[2], t[3], t[4]);
} else if (fake_restore) {
uintptr_t arg0 = fasttrap_getreg(rp, R_I0);
uintptr_t arg1 = fasttrap_getreg(rp, R_I1);
uintptr_t arg2 = fasttrap_getreg(rp, R_I2);
uintptr_t arg3 = fasttrap_getreg(rp, R_I3);
uintptr_t arg4 = fasttrap_getreg(rp, R_I4);
cookie = dtrace_interrupt_disable();
DTRACE_CPUFLAG_SET(CPU_DTRACE_FAKERESTORE);
dtrace_probe(probe->ftp_id, arg0, arg1,
arg2, arg3, arg4);
DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_FAKERESTORE);
dtrace_interrupt_enable(cookie);
} else {
dtrace_probe(probe->ftp_id, rp->r_o0, rp->r_o1,
rp->r_o2, rp->r_o3, rp->r_o4);
}
continue;
}
/*
* If this is only a possible return point, we must
* be looking at a potential tail call in leaf context.
* If the %npc is still within this function, then we
* must have misidentified a jmpl as a tail-call when it
* is, in fact, part of a jump table. It would be nice to
* remove this tracepoint, but this is neither the time
* nor the place.
*/
if ((tp->ftt_flags & FASTTRAP_F_RETMAYBE) &&
rp->r_npc - probe->ftp_faddr < probe->ftp_fsize)
continue;
/*
* It's possible for a function to branch to the delay slot
* of an instruction that we've identified as a return site.
* We can detect this spurious return probe activation by
* observing that in this case %npc will be %pc + 4 and %npc
* will be inside the current function (unless the user is
* doing _crazy_ instruction picking in which case there's
* very little we can do). The second check is important
* in case the last instructions of a function make a tail-
* call to the function located immediately after it.
*/
if (rp->r_npc == rp->r_pc + 4 &&
rp->r_npc - probe->ftp_faddr < probe->ftp_fsize)
continue;
/*
* The first argument is the offset of return tracepoint
* in the function; the remaining arguments are the return
* values.
*
* If fake_restore is set, we need to pull the return values
* out of the %i's rather than the %o's -- a little trickier.
*/
if (!fake_restore) {
dtrace_probe(probe->ftp_id, pc - probe->ftp_faddr,
rp->r_o0, rp->r_o1, rp->r_o2, rp->r_o3);
} else {
uintptr_t arg0 = fasttrap_getreg(rp, R_I0);
uintptr_t arg1 = fasttrap_getreg(rp, R_I1);
uintptr_t arg2 = fasttrap_getreg(rp, R_I2);
uintptr_t arg3 = fasttrap_getreg(rp, R_I3);
cookie = dtrace_interrupt_disable();
DTRACE_CPUFLAG_SET(CPU_DTRACE_FAKERESTORE);
dtrace_probe(probe->ftp_id, pc - probe->ftp_faddr,
arg0, arg1, arg2, arg3);
DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_FAKERESTORE);
dtrace_interrupt_enable(cookie);
}
}
mutex_exit(pid_mtx);
}
int
fasttrap_pid_probe(struct regs *rp)
{
proc_t *p = curproc;
fasttrap_tracepoint_t *tp, tp_local;
fasttrap_id_t *id;
pid_t pid;
uintptr_t pc = rp->r_pc;
uintptr_t npc = rp->r_npc;
uintptr_t orig_pc = pc;
fasttrap_bucket_t *bucket;
kmutex_t *pid_mtx;
uint_t fake_restore = 0, is_enabled = 0;
dtrace_icookie_t cookie;
/*
* It's possible that a user (in a veritable orgy of bad planning)
* could redirect this thread's flow of control before it reached the
* return probe fasttrap. In this case we need to kill the process
* since it's in an unrecoverable state.
*/
if (curthread->t_dtrace_step) {
ASSERT(curthread->t_dtrace_on);
fasttrap_sigtrap(p, curthread, pc);
return (0);
}
/*
* Clear all user tracing flags.
*/
curthread->t_dtrace_ft = 0;
curthread->t_dtrace_pc = 0;
curthread->t_dtrace_npc = 0;
curthread->t_dtrace_scrpc = 0;
curthread->t_dtrace_astpc = 0;
/*
* Treat a child created by a call to vfork(2) as if it were its
* parent. We know that there's only one thread of control in such a
* process: this one.
*/
while (p->p_flag & SVFORK) {
p = p->p_parent;
}
pid = p->p_pid;
pid_mtx = &cpu_core[CPU->cpu_id].cpuc_pid_lock;
mutex_enter(pid_mtx);
bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)];
/*
* Lookup the tracepoint that the process just hit.
*/
for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) {
if (pid == tp->ftt_pid && pc == tp->ftt_pc &&
tp->ftt_proc->ftpc_acount != 0)
break;
}
/*
* If we couldn't find a matching tracepoint, either a tracepoint has
* been inserted without using the pid<pid> ioctl interface (see
* fasttrap_ioctl), or somehow we have mislaid this tracepoint.
*/
if (tp == NULL) {
mutex_exit(pid_mtx);
return (-1);
}
for (id = tp->ftt_ids; id != NULL; id = id->fti_next) {
fasttrap_probe_t *probe = id->fti_probe;
int isentry = (id->fti_ptype == DTFTP_ENTRY);
if (id->fti_ptype == DTFTP_IS_ENABLED) {
is_enabled = 1;
continue;
}
/*
* We note that this was an entry probe to help ustack() find
* the first caller.
*/
if (isentry) {
cookie = dtrace_interrupt_disable();
DTRACE_CPUFLAG_SET(CPU_DTRACE_ENTRY);
}
dtrace_probe(probe->ftp_id, rp->r_o0, rp->r_o1, rp->r_o2,
rp->r_o3, rp->r_o4);
if (isentry) {
DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_ENTRY);
dtrace_interrupt_enable(cookie);
}
}
/*
* We're about to do a bunch of work so we cache a local copy of
* the tracepoint to emulate the instruction, and then find the
* tracepoint again later if we need to light up any return probes.
*/
tp_local = *tp;
mutex_exit(pid_mtx);
tp = &tp_local;
/*
* If there's an is-enabled probe connected to this tracepoint it
* means that there was a 'mov %g0, %o0' instruction that was placed
* there by DTrace when the binary was linked. As this probe is, in
* fact, enabled, we need to stuff 1 into %o0. Accordingly, we can
* bypass all the instruction emulation logic since we know the
* inevitable result. It's possible that a user could construct a
* scenario where the 'is-enabled' probe was on some other
* instruction, but that would be a rather exotic way to shoot oneself
* in the foot.
*/
if (is_enabled) {
rp->r_o0 = 1;
pc = rp->r_npc;
npc = pc + 4;
goto done;
}
/*
* We emulate certain types of instructions to ensure correctness
* (in the case of position dependent instructions) or optimize
* common cases. The rest we have the thread execute back in user-
* land.
*/
switch (tp->ftt_type) {
case FASTTRAP_T_SAVE:
{
int32_t imm;
/*
* This is an optimization to let us handle function entry
* probes more efficiently. Many functions begin with a save
* instruction that follows the pattern:
* save %sp, <imm>, %sp
*
* Meanwhile, we've stashed the instruction:
* save %g1, %g0, %sp
*
* off of %g7, so all we have to do is stick the right value
* into %g1 and reset %pc to point to the instruction we've
* cleverly hidden (%npc should not be touched).
*/
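/*
 * The shift pair below sign-extends the 13-bit immediate (simm13)
 * field held in bits 12:0 of the instruction word: shifting left by
 * 19 moves the immediate's sign bit into bit 31, and the arithmetic
 * shift right brings the value back down with the sign propagated.
 * The same idiom appears in the other emulation cases below.
 */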
imm = tp->ftt_instr << 19;
imm >>= 19;
rp->r_g1 = rp->r_sp + imm;
pc = rp->r_g7 + FASTTRAP_OFF_SAVE;
break;
}
case FASTTRAP_T_RESTORE:
{
ulong_t value;
uint_t rd;
/*
* This is an optimization to let us handle function
* return probes more efficiently. Most non-leaf functions
* end with the sequence:
* ret
* restore <reg>, <reg_or_imm>, %oX
*
* We've stashed the instruction:
* restore %g0, %g0, %g0
*
* off of %g7 so we just need to place the correct value
* in the right %i register (since after our fake-o
* restore, the %i's will become the %o's) and set the %pc
* to point to our hidden restore. We also set fake_restore to
* let fasttrap_return_common() know that it will find the
* return values in the %i's rather than the %o's.
*/
if (I(tp->ftt_instr)) {
int32_t imm;
imm = tp->ftt_instr << 19;
imm >>= 19;
value = fasttrap_getreg(rp, RS1(tp->ftt_instr)) + imm;
} else {
value = fasttrap_getreg(rp, RS1(tp->ftt_instr)) +
fasttrap_getreg(rp, RS2(tp->ftt_instr));
}
/*
* Convert %o's to %i's; leave %g's as they are.
*/
rd = RD(tp->ftt_instr);
fasttrap_putreg(rp, ((rd & 0x18) == 0x8) ? rd + 16 : rd, value);
pc = rp->r_g7 + FASTTRAP_OFF_RESTORE;
fake_restore = 1;
break;
}
case FASTTRAP_T_RETURN:
{
uintptr_t target;
/*
* A return instruction is like a jmpl (without the link
* part) that executes an implicit restore. We've stashed
* the instruction:
* return %o0
*
* off of %g7 so we just need to place the target in %o0
* and set the %pc to point to the stashed return instruction.
* We use %o0 since that register disappears after the return
* executes, erasing any evidence of this tampering.
*/
if (I(tp->ftt_instr)) {
int32_t imm;
imm = tp->ftt_instr << 19;
imm >>= 19;
target = fasttrap_getreg(rp, RS1(tp->ftt_instr)) + imm;
} else {
target = fasttrap_getreg(rp, RS1(tp->ftt_instr)) +
fasttrap_getreg(rp, RS2(tp->ftt_instr));
}
fasttrap_putreg(rp, R_O0, target);
pc = rp->r_g7 + FASTTRAP_OFF_RETURN;
fake_restore = 1;
break;
}
case FASTTRAP_T_OR:
{
ulong_t value;
if (I(tp->ftt_instr)) {
int32_t imm;
imm = tp->ftt_instr << 19;
imm >>= 19;
value = fasttrap_getreg(rp, RS1(tp->ftt_instr)) | imm;
} else {
value = fasttrap_getreg(rp, RS1(tp->ftt_instr)) |
fasttrap_getreg(rp, RS2(tp->ftt_instr));
}
fasttrap_putreg(rp, RD(tp->ftt_instr), value);
pc = rp->r_npc;
npc = pc + 4;
break;
}
case FASTTRAP_T_SETHI:
if (RD(tp->ftt_instr) != R_G0) {
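/*
 * sethi keeps its 22-bit immediate in the low bits of the
 * instruction word, so shifting the word left by 10 discards the
 * opcode fields and yields exactly imm22 << 10 -- the value the
 * original instruction would have deposited in its destination
 * register (with the low 10 bits clear).
 */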
uint32_t imm32 = tp->ftt_instr << 10;
fasttrap_putreg(rp, RD(tp->ftt_instr), (ulong_t)imm32);
}
pc = rp->r_npc;
npc = pc + 4;
break;
case FASTTRAP_T_CCR:
{
uint_t c, v, z, n, taken;
uint_t ccr = rp->r_tstate >> TSTATE_CCR_SHIFT;
if (tp->ftt_cc != 0)
ccr >>= 4;
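/*
 * The low four bits of ccr now hold the condition codes the branch
 * tests (icc, or xcc after the shift above), laid out n:z:v:c from
 * bit 3 down to bit 0.
 */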
c = (ccr >> 0) & 1;
v = (ccr >> 1) & 1;
z = (ccr >> 2) & 1;
n = (ccr >> 3) & 1;
switch (tp->ftt_code) {
case 0x0: /* BN */
taken = 0; break;
case 0x1: /* BE */
taken = z; break;
case 0x2: /* BLE */
taken = z | (n ^ v); break;
case 0x3: /* BL */
taken = n ^ v; break;
case 0x4: /* BLEU */
taken = c | z; break;
case 0x5: /* BCS (BLU) */
taken = c; break;
case 0x6: /* BNEG */
taken = n; break;
case 0x7: /* BVS */
taken = v; break;
case 0x8: /* BA */
/*
* We handle the BA case differently since the annul
* bit means something slightly different.
*/
panic("fasttrap: mishandled a branch");
taken = 1; break;
case 0x9: /* BNE */
taken = ~z; break;
case 0xa: /* BG */
taken = ~(z | (n ^ v)); break;
case 0xb: /* BGE */
taken = ~(n ^ v); break;
case 0xc: /* BGU */
taken = ~(c | z); break;
case 0xd: /* BCC (BGEU) */
taken = ~c; break;
case 0xe: /* BPOS */
taken = ~n; break;
case 0xf: /* BVC */
taken = ~v; break;
}
if (taken & 1) {
pc = rp->r_npc;
npc = tp->ftt_dest;
} else if (tp->ftt_flags & FASTTRAP_F_ANNUL) {
/*
* Untaken annulled branches don't execute the
* instruction in the delay slot.
*/
pc = rp->r_npc + 4;
npc = pc + 4;
} else {
pc = rp->r_npc;
npc = pc + 4;
}
break;
}
case FASTTRAP_T_FCC:
{
uint_t fcc;
uint_t taken;
uint64_t fsr;
dtrace_getfsr(&fsr);
if (tp->ftt_cc == 0) {
fcc = (fsr >> 10) & 0x3;
} else {
uint_t shift;
ASSERT(tp->ftt_cc <= 3);
shift = 30 + tp->ftt_cc * 2;
fcc = (fsr >> shift) & 0x3;
}
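/*
 * fcc encodes the result of the most recent floating-point compare:
 * 0 = equal, 1 = less, 2 = greater, 3 = unordered. Each mask below
 * has one bit per fcc value, so (1 << fcc) & mask is non-zero exactly
 * when the branch should be taken for that result.
 */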
switch (tp->ftt_code) {
case 0x0: /* FBN */
taken = (1 << fcc) & (0|0|0|0); break;
case 0x1: /* FBNE */
taken = (1 << fcc) & (8|4|2|0); break;
case 0x2: /* FBLG */
taken = (1 << fcc) & (0|4|2|0); break;
case 0x3: /* FBUL */
taken = (1 << fcc) & (8|0|2|0); break;
case 0x4: /* FBL */
taken = (1 << fcc) & (0|0|2|0); break;
case 0x5: /* FBUG */
taken = (1 << fcc) & (8|4|0|0); break;
case 0x6: /* FBG */
taken = (1 << fcc) & (0|4|0|0); break;
case 0x7: /* FBU */
taken = (1 << fcc) & (8|0|0|0); break;
case 0x8: /* FBA */
/*
* We handle the FBA case differently since the annul
* bit means something slightly different.
*/
panic("fasttrap: mishandled a branch");
taken = (1 << fcc) & (8|4|2|1); break;
case 0x9: /* FBE */
taken = (1 << fcc) & (0|0|0|1); break;
case 0xa: /* FBUE */
taken = (1 << fcc) & (8|0|0|1); break;
case 0xb: /* FBGE */
taken = (1 << fcc) & (0|4|0|1); break;
case 0xc: /* FBUGE */
taken = (1 << fcc) & (8|4|0|1); break;
case 0xd: /* FBLE */
taken = (1 << fcc) & (0|0|2|1); break;
case 0xe: /* FBULE */
taken = (1 << fcc) & (8|0|2|1); break;
case 0xf: /* FBO */
taken = (1 << fcc) & (0|4|2|1); break;
}
if (taken) {
pc = rp->r_npc;
npc = tp->ftt_dest;
} else if (tp->ftt_flags & FASTTRAP_F_ANNUL) {
/*
* Untaken annulled branches don't execute the
* instruction in the delay slot.
*/
pc = rp->r_npc + 4;
npc = pc + 4;
} else {
pc = rp->r_npc;
npc = pc + 4;
}
break;
}
case FASTTRAP_T_REG:
{
int64_t value;
uint_t taken;
uint_t reg = RS1(tp->ftt_instr);
/*
* An ILP32 process shouldn't be using a branch predicated on
* an %i or an %l since it would violate the ABI. It's a
* violation of the ABI because we can't ensure deterministic
* behavior. We should have identified this case when we
* enabled the probe.
*/
ASSERT(p->p_model == DATAMODEL_LP64 || reg < 16);
value = (int64_t)fasttrap_getreg(rp, reg);
switch (tp->ftt_code) {
case 0x1: /* BRZ */
taken = (value == 0); break;
case 0x2: /* BRLEZ */
taken = (value <= 0); break;
case 0x3: /* BRLZ */
taken = (value < 0); break;
case 0x5: /* BRNZ */
taken = (value != 0); break;
case 0x6: /* BRGZ */
taken = (value > 0); break;
case 0x7: /* BRGEZ */
taken = (value >= 0); break;
default:
case 0x0:
case 0x4:
panic("fasttrap: mishandled a branch");
}
if (taken) {
pc = rp->r_npc;
npc = tp->ftt_dest;
} else if (tp->ftt_flags & FASTTRAP_F_ANNUL) {
/*
* Untaken annulled branches don't execute the
* instruction in the delay slot.
*/
pc = rp->r_npc + 4;
npc = pc + 4;
} else {
pc = rp->r_npc;
npc = pc + 4;
}
break;
}
case FASTTRAP_T_ALWAYS:
/*
* BAs, BA,As...
*/
if (tp->ftt_flags & FASTTRAP_F_ANNUL) {
/*
* Annulled branch always instructions never execute
* the instruction in the delay slot.
*/
pc = tp->ftt_dest;
npc = tp->ftt_dest + 4;
} else {
pc = rp->r_npc;
npc = tp->ftt_dest;
}
break;
case FASTTRAP_T_RDPC:
fasttrap_putreg(rp, RD(tp->ftt_instr), rp->r_pc);
pc = rp->r_npc;
npc = pc + 4;
break;
case FASTTRAP_T_CALL:
/*
* It's a call _and_ link, remember...
*/
rp->r_o7 = rp->r_pc;
pc = rp->r_npc;
npc = tp->ftt_dest;
break;
case FASTTRAP_T_JMPL:
pc = rp->r_npc;
if (I(tp->ftt_instr)) {
uint_t rs1 = RS1(tp->ftt_instr);
int32_t imm;
imm = tp->ftt_instr << 19;
imm >>= 19;
npc = fasttrap_getreg(rp, rs1) + imm;
} else {
uint_t rs1 = RS1(tp->ftt_instr);
uint_t rs2 = RS2(tp->ftt_instr);
npc = fasttrap_getreg(rp, rs1) +
fasttrap_getreg(rp, rs2);
}
/*
* Do the link part of the jump-and-link instruction.
*/
fasttrap_putreg(rp, RD(tp->ftt_instr), rp->r_pc);
break;
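The JMPL case sign-extends the 13-bit simm13 field by shifting it up to bit 31 and arithmetically shifting it back down. A small standalone sketch of the same idiom (it assumes arithmetic right shift of signed values, as the kernel code does):

#include <stdint.h>
#include <stdio.h>

/* Sign-extend the low 13 bits (simm13) of a SPARC instruction word. */
static int32_t
simm13(uint32_t instr)
{
	int32_t imm = (int32_t)(instr << 19);	/* bit 12 -> bit 31 */

	return (imm >> 19);			/* arithmetic shift back */
}

int
main(void)
{
	printf("%d\n", simm13(0x00000fff));	/* 4095 */
	printf("%d\n", simm13(0x00001fff));	/* -1 */
	return (0);
}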
case FASTTRAP_T_COMMON:
{
curthread->t_dtrace_scrpc = rp->r_g7;
curthread->t_dtrace_astpc = rp->r_g7 + FASTTRAP_OFF_FTRET;
/*
* Copy the instruction to a reserved location in the
* user-land thread structure, then set the PC to that
* location and leave the NPC alone. We take pains to ensure
* consistency in the instruction stream (See SPARC
* Architecture Manual Version 9, sections 8.4.7, A.20, and
* H.1.6; UltraSPARC I/II User's Manual, sections 3.1.1.1,
* and 13.6.4) by using the ASI ASI_BLK_COMMIT_S to copy the
* instruction into the user's address space without
* bypassing the I$. There's no AS_USER version of this ASI
* (as exists for other ASIs), so we use the lofault
* mechanism to catch faults.
*/
if (dtrace_blksuword32(rp->r_g7, &tp->ftt_instr, 1) == -1) {
/*
* If the copyout fails, then the process's state
* is not consistent (the effects of the traced
* instruction will never be seen). This process
* cannot be allowed to continue execution.
*/
fasttrap_sigtrap(curproc, curthread, pc);
return (0);
}
curthread->t_dtrace_pc = pc;
curthread->t_dtrace_npc = npc;
curthread->t_dtrace_on = 1;
pc = curthread->t_dtrace_scrpc;
if (tp->ftt_retids != NULL) {
curthread->t_dtrace_step = 1;
curthread->t_dtrace_ret = 1;
npc = curthread->t_dtrace_astpc;
}
break;
}
default:
panic("fasttrap: mishandled an instruction");
}
/*
* This bit me in the ass a couple of times, so let's toss this
* in as a cursory sanity check.
*/
ASSERT(pc != rp->r_g7 + 4);
ASSERT(pc != rp->r_g7 + 8);
done:
/*
* If there were no return probes when we first found the tracepoint,
* we should feel no obligation to honor any return probes that were
* subsequently enabled -- they'll just have to wait until the next
* time around.
*/
if (tp->ftt_retids != NULL) {
/*
* We need to wait until the results of the instruction are
* apparent before invoking any return probes. If this
* instruction was emulated we can just call
* fasttrap_return_common(); if it needs to be executed, we
* need to wait until we return to the kernel.
*/
if (tp->ftt_type != FASTTRAP_T_COMMON) {
fasttrap_return_common(rp, orig_pc, pid, fake_restore);
} else {
ASSERT(curthread->t_dtrace_ret != 0);
ASSERT(curthread->t_dtrace_pc == orig_pc);
ASSERT(curthread->t_dtrace_scrpc == rp->r_g7);
ASSERT(npc == curthread->t_dtrace_astpc);
}
}
ASSERT(pc != 0);
rp->r_pc = pc;
rp->r_npc = npc;
return (0);
}
int
fasttrap_return_probe(struct regs *rp)
{
proc_t *p = ttoproc(curthread);
pid_t pid;
uintptr_t pc = curthread->t_dtrace_pc;
uintptr_t npc = curthread->t_dtrace_npc;
curthread->t_dtrace_pc = 0;
curthread->t_dtrace_npc = 0;
curthread->t_dtrace_scrpc = 0;
curthread->t_dtrace_astpc = 0;
/*
* Treat a child created by a call to vfork(2) as if it were its
* parent. We know there's only one thread of control in such a
* process: this one.
*/
while (p->p_flag & SVFORK) {
p = p->p_parent;
}
/*
* We set the %pc and %npc to their values when the traced
* instruction was initially executed so that it appears to
* dtrace_probe() that we're on the original instruction, and so that
* the user can't easily detect our complex web of lies.
* dtrace_return_probe() (our caller) will correctly set %pc and %npc
* after we return.
*/
rp->r_pc = pc;
rp->r_npc = npc;
pid = p->p_pid;
fasttrap_return_common(rp, pc, pid, 0);
return (0);
}
int
fasttrap_tracepoint_install(proc_t *p, fasttrap_tracepoint_t *tp)
{
fasttrap_instr_t instr = FASTTRAP_INSTR;
if (uwrite(p, &instr, 4, tp->ftt_pc) != 0)
return (-1);
return (0);
}
int
fasttrap_tracepoint_remove(proc_t *p, fasttrap_tracepoint_t *tp)
{
fasttrap_instr_t instr;
/*
* Distinguish between read or write failures and a changed
* instruction.
*/
if (uread(p, &instr, 4, tp->ftt_pc) != 0)
return (0);
if (instr != FASTTRAP_INSTR && instr != BREAKPOINT_INSTR)
return (0);
if (uwrite(p, &tp->ftt_instr, 4, tp->ftt_pc) != 0)
return (-1);
return (0);
}
int
fasttrap_tracepoint_init(proc_t *p, fasttrap_tracepoint_t *tp, uintptr_t pc,
fasttrap_probe_type_t type)
{
uint32_t instr;
int32_t disp;
/*
* Read the instruction at the given address out of the process's
* address space. We don't have to worry about a debugger
* changing this instruction before we overwrite it with our trap
* instruction since P_PR_LOCK is set.
*/
if (uread(p, &instr, 4, pc) != 0)
return (-1);
/*
* Decode the instruction to fill in the probe flags. We can have
* the process execute most instructions on its own using a pc/npc
* trick, but pc-relative control transfers present a problem since
* we're relocating the instruction. We emulate these instructions
* in the kernel. We assume a default type and overwrite that as
* needed.
*
* pc-relative instructions must be emulated for correctness;
* other instructions (which represent a large set of commonly traced
* instructions) are emulated or otherwise optimized for performance.
*/
tp->ftt_type = FASTTRAP_T_COMMON;
if (OP(instr) == 1) {
/*
* Call instructions.
*/
tp->ftt_type = FASTTRAP_T_CALL;
disp = DISP30(instr) << 2;
tp->ftt_dest = pc + (intptr_t)disp;
} else if (OP(instr) == 0) {
/*
* Branch instructions.
*
* Unconditional branches need careful attention when they're
* annulled: annulled unconditional branches never execute
* the instruction in the delay slot.
*/
switch (OP2(instr)) {
case OP2_ILLTRAP:
case 0x7:
/*
* The compiler may place an illtrap after a call to
* a function that returns a structure. In the case of
* a returned structure, the compiler places an illtrap
* whose const22 field is the size of the returned
* structure immediately following the delay slot of
* the call. To stay out of the way, we refuse to
* place tracepoints on top of illtrap instructions.
*
* This is one of the dumbest architectural decisions
* I've ever had to work around.
*
* We also identify the only illegal op2 value (See
* SPARC Architecture Manual Version 9, E.2 table 31).
*/
return (-1);
case OP2_BPcc:
if (COND(instr) == 8) {
tp->ftt_type = FASTTRAP_T_ALWAYS;
} else {
/*
* Check for an illegal instruction.
*/
if (CC(instr) & 1)
return (-1);
tp->ftt_type = FASTTRAP_T_CCR;
tp->ftt_cc = CC(instr);
tp->ftt_code = COND(instr);
}
if (A(instr) != 0)
tp->ftt_flags |= FASTTRAP_F_ANNUL;
disp = DISP19(instr);
disp <<= 13;
disp >>= 11;
tp->ftt_dest = pc + (intptr_t)disp;
break;
case OP2_Bicc:
if (COND(instr) == 8) {
tp->ftt_type = FASTTRAP_T_ALWAYS;
} else {
tp->ftt_type = FASTTRAP_T_CCR;
tp->ftt_cc = 0;
tp->ftt_code = COND(instr);
}
if (A(instr) != 0)
tp->ftt_flags |= FASTTRAP_F_ANNUL;
disp = DISP22(instr);
disp <<= 10;
disp >>= 8;
tp->ftt_dest = pc + (intptr_t)disp;
break;
case OP2_BPr:
/*
* Check for an illegal instruction.
*/
if ((RCOND(instr) & 3) == 0)
return (-1);
/*
* It's a violation of the v8plus ABI to use a
* register-predicated branch in a 32-bit app if
* the register used is an %l or an %i (%gs and %os
* are legit because they're not saved to the stack
* in 32-bit words when we take a trap).
*/
if (p->p_model == DATAMODEL_ILP32 && RS1(instr) >= 16)
return (-1);
tp->ftt_type = FASTTRAP_T_REG;
if (A(instr) != 0)
tp->ftt_flags |= FASTTRAP_F_ANNUL;
disp = DISP16(instr);
disp <<= 16;
disp >>= 14;
tp->ftt_dest = pc + (intptr_t)disp;
tp->ftt_code = RCOND(instr);
break;
case OP2_SETHI:
tp->ftt_type = FASTTRAP_T_SETHI;
break;
case OP2_FBPfcc:
if (COND(instr) == 8) {
tp->ftt_type = FASTTRAP_T_ALWAYS;
} else {
tp->ftt_type = FASTTRAP_T_FCC;
tp->ftt_cc = CC(instr);
tp->ftt_code = COND(instr);
}
if (A(instr) != 0)
tp->ftt_flags |= FASTTRAP_F_ANNUL;
disp = DISP19(instr);
disp <<= 13;
disp >>= 11;
tp->ftt_dest = pc + (intptr_t)disp;
break;
case OP2_FBfcc:
if (COND(instr) == 8) {
tp->ftt_type = FASTTRAP_T_ALWAYS;
} else {
tp->ftt_type = FASTTRAP_T_FCC;
tp->ftt_cc = 0;
tp->ftt_code = COND(instr);
}
if (A(instr) != 0)
tp->ftt_flags |= FASTTRAP_F_ANNUL;
disp = DISP22(instr);
disp <<= 10;
disp >>= 8;
tp->ftt_dest = pc + (intptr_t)disp;
break;
}
} else if (OP(instr) == 2) {
switch (OP3(instr)) {
case OP3_RETURN:
tp->ftt_type = FASTTRAP_T_RETURN;
break;
case OP3_JMPL:
tp->ftt_type = FASTTRAP_T_JMPL;
break;
case OP3_RD:
if (RS1(instr) == 5)
tp->ftt_type = FASTTRAP_T_RDPC;
break;
case OP3_SAVE:
/*
* We optimize for save instructions at function
* entry; see the comment in fasttrap_pid_probe()
* (near FASTTRAP_T_SAVE) for details.
*/
if (fasttrap_optimize_save != 0 &&
type == DTFTP_ENTRY &&
I(instr) == 1 && RD(instr) == R_SP)
tp->ftt_type = FASTTRAP_T_SAVE;
break;
case OP3_RESTORE:
/*
* We optimize restore instructions at function
* return; see the comment in fasttrap_pid_probe()
* (near FASTTRAP_T_RESTORE) for details.
*
* rd must be an %o or %g register.
*/
if ((RD(instr) & 0x10) == 0)
tp->ftt_type = FASTTRAP_T_RESTORE;
break;
case OP3_OR:
/*
* A large proportion of instructions in the delay
* slot of retl instructions are or's, so we emulate
* these downstairs as an optimization.
*/
tp->ftt_type = FASTTRAP_T_OR;
break;
case OP3_TCC:
/*
* Breakpoint instructions are effectively position-
* dependent since the debugger uses the %pc value
* to look up which breakpoint was executed. As a
* result, we can't actually instrument breakpoints.
*/
if (SW_TRAP(instr) == ST_BREAKPOINT)
return (-1);
break;
case 0x19:
case 0x1d:
case 0x29:
case 0x33:
case 0x3f:
/*
* Identify illegal instructions (See SPARC
* Architecture Manual Version 9, E.2 table 32).
*/
return (-1);
}
} else if (OP(instr) == 3) {
uint32_t op3 = OP3(instr);
/*
* Identify illegal instructions (See SPARC Architecture
* Manual Version 9, E.2 table 33).
*/
if ((op3 & 0x28) == 0x28) {
if (op3 != OP3_PREFETCH && op3 != OP3_CASA &&
op3 != OP3_PREFETCHA && op3 != OP3_CASXA)
return (-1);
} else {
if ((op3 & 0x0f) == 0x0c || (op3 & 0x3b) == 0x31)
return (-1);
}
}
tp->ftt_instr = instr;
/*
* We don't know how this tracepoint is going to be used, but in case
* it's used as part of a function return probe, we need to indicate
* whether it's always a return site or only potentially a return
* site. If it's part of a return probe, it's always going to be a
* return from that function if it's a restore instruction or if
* the previous instruction was a return. If we could reliably
* distinguish jump tables from return sites, this wouldn't be
* necessary.
*/
if (tp->ftt_type != FASTTRAP_T_RESTORE &&
(uread(p, &instr, 4, pc - sizeof (instr)) != 0 ||
!(OP(instr) == 2 && OP3(instr) == OP3_RETURN)))
tp->ftt_flags |= FASTTRAP_F_RETMAYBE;
return (0);
}
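The Bicc/BPcc/BPr decoders above use a similar shift pair (for example "disp <<= 13; disp >>= 11;") that sign-extends the word displacement and multiplies it by 4 in one step, producing a byte offset from the instruction word offset. A hedged standalone sketch of that arithmetic:

#include <stdint.h>
#include <stdio.h>

/*
 * Sign-extend an N-bit word displacement and convert it to a byte
 * offset (x4), the net effect of the "disp <<= 13; disp >>= 11;" pairs.
 */
static int32_t
branch_offset(uint32_t disp, int bits)
{
	int32_t v = (int32_t)(disp << (32 - bits));	/* sign bit -> bit 31 */

	return (v >> (32 - bits - 2));			/* sign-extend, keep x4 */
}

int
main(void)
{
	printf("%d\n", branch_offset(0x7ffff, 19));	/* -1 word -> -4 bytes */
	printf("%d\n", branch_offset(3, 22));		/* +3 words -> +12 bytes */
	return (0);
}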
/*ARGSUSED*/
uint64_t
fasttrap_pid_getarg(void *arg, dtrace_id_t id, void *parg, int argno,
int aframes)
{
return (fasttrap_anarg(ttolwp(curthread)->lwp_regs, argno));
}
/*ARGSUSED*/
uint64_t
fasttrap_usdt_getarg(void *arg, dtrace_id_t id, void *parg, int argno,
int aframes)
{
return (fasttrap_anarg(ttolwp(curthread)->lwp_regs, argno));
}
static uint64_t fasttrap_getreg_fast_cnt;
static uint64_t fasttrap_getreg_mpcb_cnt;
static uint64_t fasttrap_getreg_slow_cnt;
static ulong_t
fasttrap_getreg(struct regs *rp, uint_t reg)
{
ulong_t value;
dtrace_icookie_t cookie;
struct machpcb *mpcb;
extern ulong_t dtrace_getreg_win(uint_t, uint_t);
/*
* We have the %os and %gs in our struct regs, but if we need to
* snag a %l or %i we need to go scrounging around in the process's
* address space.
*/
if (reg == 0)
return (0);
if (reg < 16)
return ((&rp->r_g1)[reg - 1]);
/*
* Before we look at the user's stack, we'll check the register
* windows to see if the information we want is in there.
*/
cookie = dtrace_interrupt_disable();
if (dtrace_getotherwin() > 0) {
value = dtrace_getreg_win(reg, 1);
dtrace_interrupt_enable(cookie);
atomic_add_64(&fasttrap_getreg_fast_cnt, 1);
return (value);
}
dtrace_interrupt_enable(cookie);
/*
* First check the machpcb structure to see if we've already read
* in the register window we're looking for; if we haven't (and
* we probably haven't), try to copy in the value of the register.
*/
/* LINTED - alignment */
mpcb = (struct machpcb *)((caddr_t)rp - REGOFF);
if (get_udatamodel() == DATAMODEL_NATIVE) {
struct frame *fr = (struct frame *)(rp->r_sp + STACK_BIAS);
if (mpcb->mpcb_wbcnt > 0) {
struct rwindow *rwin = (void *)mpcb->mpcb_wbuf;
int i = mpcb->mpcb_wbcnt;
do {
i--;
if ((long)mpcb->mpcb_spbuf[i] != rp->r_sp)
continue;
atomic_add_64(&fasttrap_getreg_mpcb_cnt, 1);
return (rwin[i].rw_local[reg - 16]);
} while (i > 0);
}
if (fasttrap_fulword(&fr->fr_local[reg - 16], &value) != 0)
goto err;
} else {
struct frame32 *fr =
(struct frame32 *)(uintptr_t)(caddr32_t)rp->r_sp;
uint32_t *v32 = (uint32_t *)&value;
if (mpcb->mpcb_wbcnt > 0) {
struct rwindow32 *rwin = (void *)mpcb->mpcb_wbuf;
int i = mpcb->mpcb_wbcnt;
do {
i--;
if ((long)mpcb->mpcb_spbuf[i] != rp->r_sp)
continue;
atomic_add_64(&fasttrap_getreg_mpcb_cnt, 1);
return (rwin[i].rw_local[reg - 16]);
} while (i > 0);
}
if (fasttrap_fuword32(&fr->fr_local[reg - 16], &v32[1]) != 0)
goto err;
v32[0] = 0;
}
atomic_add_64(&fasttrap_getreg_slow_cnt, 1);
return (value);
err:
/*
* If the copy in failed, the process will be in an irrecoverable
* state, and we have no choice but to kill it.
*/
- psignal(ttoproc(curthread), SIGILL);
+ kern_psignal(ttoproc(curthread), SIGILL);
return (0);
}
static uint64_t fasttrap_putreg_fast_cnt;
static uint64_t fasttrap_putreg_mpcb_cnt;
static uint64_t fasttrap_putreg_slow_cnt;
static void
fasttrap_putreg(struct regs *rp, uint_t reg, ulong_t value)
{
dtrace_icookie_t cookie;
struct machpcb *mpcb;
extern void dtrace_putreg_win(uint_t, ulong_t);
if (reg == 0)
return;
if (reg < 16) {
(&rp->r_g1)[reg - 1] = value;
return;
}
/*
* If the user process is still using some register windows, we
* can just place the value in the correct window.
*/
cookie = dtrace_interrupt_disable();
if (dtrace_getotherwin() > 0) {
dtrace_putreg_win(reg, value);
dtrace_interrupt_enable(cookie);
atomic_add_64(&fasttrap_putreg_fast_cnt, 1);
return;
}
dtrace_interrupt_enable(cookie);
/*
* First see if there's a copy of the register window in the
* machpcb structure that we can modify; if there isn't, try to
* copy out the value. If that fails, we try to create a new
* register window in the machpcb structure. While this isn't
* _precisely_ the intended use of the machpcb structure, it
* can't cause any problems since we know at this point in the
* code that all of the user's data have been flushed out of the
* register file (since %otherwin is 0).
*/
/* LINTED - alignment */
mpcb = (struct machpcb *)((caddr_t)rp - REGOFF);
if (get_udatamodel() == DATAMODEL_NATIVE) {
struct frame *fr = (struct frame *)(rp->r_sp + STACK_BIAS);
/* LINTED - alignment */
struct rwindow *rwin = (struct rwindow *)mpcb->mpcb_wbuf;
if (mpcb->mpcb_wbcnt > 0) {
int i = mpcb->mpcb_wbcnt;
do {
i--;
if ((long)mpcb->mpcb_spbuf[i] != rp->r_sp)
continue;
rwin[i].rw_local[reg - 16] = value;
atomic_add_64(&fasttrap_putreg_mpcb_cnt, 1);
return;
} while (i > 0);
}
if (fasttrap_sulword(&fr->fr_local[reg - 16], value) != 0) {
if (mpcb->mpcb_wbcnt >= MAXWIN || copyin(fr,
&rwin[mpcb->mpcb_wbcnt], sizeof (*rwin)) != 0)
goto err;
rwin[mpcb->mpcb_wbcnt].rw_local[reg - 16] = value;
mpcb->mpcb_spbuf[mpcb->mpcb_wbcnt] = (caddr_t)rp->r_sp;
mpcb->mpcb_wbcnt++;
atomic_add_64(&fasttrap_putreg_mpcb_cnt, 1);
return;
}
} else {
struct frame32 *fr =
(struct frame32 *)(uintptr_t)(caddr32_t)rp->r_sp;
/* LINTED - alignment */
struct rwindow32 *rwin = (struct rwindow32 *)mpcb->mpcb_wbuf;
uint32_t v32 = (uint32_t)value;
if (mpcb->mpcb_wbcnt > 0) {
int i = mpcb->mpcb_wbcnt;
do {
i--;
if ((long)mpcb->mpcb_spbuf[i] != rp->r_sp)
continue;
rwin[i].rw_local[reg - 16] = v32;
atomic_add_64(&fasttrap_putreg_mpcb_cnt, 1);
return;
} while (i > 0);
}
if (fasttrap_suword32(&fr->fr_local[reg - 16], v32) != 0) {
if (mpcb->mpcb_wbcnt >= MAXWIN || copyin(fr,
&rwin[mpcb->mpcb_wbcnt], sizeof (*rwin)) != 0)
goto err;
rwin[mpcb->mpcb_wbcnt].rw_local[reg - 16] = v32;
mpcb->mpcb_spbuf[mpcb->mpcb_wbcnt] = (caddr_t)rp->r_sp;
mpcb->mpcb_wbcnt++;
atomic_add_64(&fasttrap_putreg_mpcb_cnt, 1);
return;
}
}
atomic_add_64(&fasttrap_putreg_slow_cnt, 1);
return;
err:
/*
* If we couldn't record this register's value, the process is in an
* irrecoverable state and we have no choice but to euthanize it.
*/
- psignal(ttoproc(curthread), SIGILL);
+ kern_psignal(ttoproc(curthread), SIGILL);
}
Index: head/sys/compat/freebsd32/freebsd32_ioctl.c
===================================================================
--- head/sys/compat/freebsd32/freebsd32_ioctl.c (revision 225616)
+++ head/sys/compat/freebsd32/freebsd32_ioctl.c (revision 225617)
@@ -1,404 +1,404 @@
/*-
* Copyright (c) 2008 David E. O'Brien
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the author nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include <sys/param.h>
#include <sys/capability.h>
#include <sys/cdio.h>
#include <sys/fcntl.h>
#include <sys/filio.h>
#include <sys/file.h>
#include <sys/ioccom.h>
#include <sys/malloc.h>
#include <sys/mdioctl.h>
#include <sys/memrange.h>
#include <sys/pciio.h>
#include <sys/proc.h>
#include <sys/syscall.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <compat/freebsd32/freebsd32.h>
#include <compat/freebsd32/freebsd32_ioctl.h>
#include <compat/freebsd32/freebsd32_proto.h>
/* Cannot get the exact size on 64-bit due to the alignment of the entire struct. */
CTASSERT((sizeof(struct md_ioctl32)+4) == 436);
CTASSERT(sizeof(struct ioc_read_toc_entry32) == 8);
CTASSERT(sizeof(struct ioc_toc_header32) == 4);
CTASSERT(sizeof(struct mem_range_op32) == 12);
CTASSERT(sizeof(struct pci_conf_io32) == 36);
CTASSERT(sizeof(struct pci_match_conf32) == 44);
CTASSERT(sizeof(struct pci_conf32) == 44);
static int
freebsd32_ioctl_md(struct thread *td, struct freebsd32_ioctl_args *uap,
struct file *fp)
{
struct md_ioctl mdv;
struct md_ioctl32 md32;
u_long com = 0;
int i, error;
if (uap->com & IOC_IN) {
if ((error = copyin(uap->data, &md32, sizeof(md32)))) {
return (error);
}
CP(md32, mdv, md_version);
CP(md32, mdv, md_unit);
CP(md32, mdv, md_type);
PTRIN_CP(md32, mdv, md_file);
CP(md32, mdv, md_mediasize);
CP(md32, mdv, md_sectorsize);
CP(md32, mdv, md_options);
CP(md32, mdv, md_base);
CP(md32, mdv, md_fwheads);
CP(md32, mdv, md_fwsectors);
} else if (uap->com & IOC_OUT) {
/*
* Zero the buffer so the user always
* gets back something deterministic.
*/
bzero(&mdv, sizeof mdv);
}
switch (uap->com) {
case MDIOCATTACH_32:
com = MDIOCATTACH;
break;
case MDIOCDETACH_32:
com = MDIOCDETACH;
break;
case MDIOCQUERY_32:
com = MDIOCQUERY;
break;
case MDIOCLIST_32:
com = MDIOCLIST;
break;
default:
panic("%s: unknown MDIOC %#x", __func__, uap->com);
}
error = fo_ioctl(fp, com, (caddr_t)&mdv, td->td_ucred, td);
if (error == 0 && (com & IOC_OUT)) {
CP(mdv, md32, md_version);
CP(mdv, md32, md_unit);
CP(mdv, md32, md_type);
PTROUT_CP(mdv, md32, md_file);
CP(mdv, md32, md_mediasize);
CP(mdv, md32, md_sectorsize);
CP(mdv, md32, md_options);
CP(mdv, md32, md_base);
CP(mdv, md32, md_fwheads);
CP(mdv, md32, md_fwsectors);
if (com == MDIOCLIST) {
/*
* Use MDNPAD, and not MDNPAD32. Padding is
* allocated and used by the compat32 ABI.
*/
for (i = 0; i < MDNPAD; i++)
CP(mdv, md32, md_pad[i]);
}
error = copyout(&md32, uap->data, sizeof(md32));
}
return error;
}
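freebsd32_ioctl_md() is the usual compat32 thunk: copy in the packed 32-bit layout, widen it field by field (pointers via PTRIN) into the native structure, run the native handler, and narrow the result back out. A self-contained sketch of the widening step, using illustrative structures and a FIELD_CP macro rather than the kernel's CP/PTRIN definitions:

#include <stdint.h>
#include <stdio.h>

/* Illustrative 32-bit and native layouts; not the kernel's md_ioctl. */
struct md_req32 {
	uint32_t	unit;
	uint32_t	file;		/* 32-bit user pointer */
	uint32_t	mediasize;
};

struct md_req {
	int		unit;
	char		*file;
	uint64_t	mediasize;
};

#define	FIELD_CP(src, dst, fld)	do { (dst).fld = (src).fld; } while (0)
#define	PTR_IN(v)		((void *)(uintptr_t)(v))

int
main(void)
{
	struct md_req32 r32 = { 3, 0x1000, 512 };
	struct md_req r;

	FIELD_CP(r32, r, unit);
	r.file = PTR_IN(r32.file);	/* widen the 32-bit user pointer */
	FIELD_CP(r32, r, mediasize);

	printf("unit=%d file=%p mediasize=%llu\n",
	    r.unit, (void *)r.file, (unsigned long long)r.mediasize);
	return (0);
}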
static int
freebsd32_ioctl_ioc_toc_header(struct thread *td,
struct freebsd32_ioctl_args *uap, struct file *fp)
{
struct ioc_toc_header toch;
struct ioc_toc_header32 toch32;
int error;
if ((error = copyin(uap->data, &toch32, sizeof(toch32))))
return (error);
CP(toch32, toch, len);
CP(toch32, toch, starting_track);
CP(toch32, toch, ending_track);
error = fo_ioctl(fp, CDIOREADTOCHEADER, (caddr_t)&toch,
td->td_ucred, td);
return (error);
}
static int
freebsd32_ioctl_ioc_read_toc(struct thread *td,
struct freebsd32_ioctl_args *uap, struct file *fp)
{
struct ioc_read_toc_entry toce;
struct ioc_read_toc_entry32 toce32;
int error;
if ((error = copyin(uap->data, &toce32, sizeof(toce32))))
return (error);
CP(toce32, toce, address_format);
CP(toce32, toce, starting_track);
CP(toce32, toce, data_len);
PTRIN_CP(toce32, toce, data);
if ((error = fo_ioctl(fp, CDIOREADTOCENTRYS, (caddr_t)&toce,
td->td_ucred, td))) {
CP(toce, toce32, address_format);
CP(toce, toce32, starting_track);
CP(toce, toce32, data_len);
PTROUT_CP(toce, toce32, data);
error = copyout(&toce32, uap->data, sizeof(toce32));
}
return error;
}
static int
freebsd32_ioctl_fiodgname(struct thread *td,
struct freebsd32_ioctl_args *uap, struct file *fp)
{
struct fiodgname_arg fgn;
struct fiodgname_arg32 fgn32;
int error;
if ((error = copyin(uap->data, &fgn32, sizeof fgn32)) != 0)
return (error);
CP(fgn32, fgn, len);
PTRIN_CP(fgn32, fgn, buf);
error = fo_ioctl(fp, FIODGNAME, (caddr_t)&fgn, td->td_ucred, td);
return (error);
}
static int
freebsd32_ioctl_memrange(struct thread *td,
struct freebsd32_ioctl_args *uap, struct file *fp)
{
struct mem_range_op mro;
struct mem_range_op32 mro32;
int error;
u_long com;
if ((error = copyin(uap->data, &mro32, sizeof(mro32))) != 0)
return (error);
PTRIN_CP(mro32, mro, mo_desc);
CP(mro32, mro, mo_arg[0]);
CP(mro32, mro, mo_arg[1]);
com = 0;
switch (uap->com) {
case MEMRANGE_GET32:
com = MEMRANGE_GET;
break;
case MEMRANGE_SET32:
com = MEMRANGE_SET;
break;
default:
panic("%s: unknown MEMRANGE %#x", __func__, uap->com);
}
if ((error = fo_ioctl(fp, com, (caddr_t)&mro, td->td_ucred, td)) != 0)
return (error);
if ((com & IOC_OUT) != 0) {
CP(mro, mro32, mo_arg[0]);
CP(mro, mro32, mo_arg[1]);
error = copyout(&mro32, uap->data, sizeof(mro32));
}
return (error);
}
static int
freebsd32_ioctl_pciocgetconf(struct thread *td,
struct freebsd32_ioctl_args *uap, struct file *fp)
{
struct pci_conf_io pci;
struct pci_conf_io32 pci32;
struct pci_match_conf32 pmc32;
struct pci_match_conf32 *pmc32p;
struct pci_match_conf pmc;
struct pci_match_conf *pmcp;
struct pci_conf32 pc32;
struct pci_conf32 *pc32p;
struct pci_conf pc;
struct pci_conf *pcp;
u_int32_t i;
u_int32_t npat_to_convert;
u_int32_t nmatch_to_convert;
vm_offset_t addr;
int error;
if ((error = copyin(uap->data, &pci32, sizeof(pci32))) != 0)
return (error);
CP(pci32, pci, num_patterns);
CP(pci32, pci, offset);
CP(pci32, pci, generation);
npat_to_convert = pci32.pat_buf_len / sizeof(struct pci_match_conf32);
pci.pat_buf_len = npat_to_convert * sizeof(struct pci_match_conf);
pci.patterns = NULL;
nmatch_to_convert = pci32.match_buf_len / sizeof(struct pci_conf32);
pci.match_buf_len = nmatch_to_convert * sizeof(struct pci_conf);
pci.matches = NULL;
if ((error = copyout_map(td, &addr, pci.pat_buf_len)) != 0)
goto cleanup;
pci.patterns = (struct pci_match_conf *)addr;
if ((error = copyout_map(td, &addr, pci.match_buf_len)) != 0)
goto cleanup;
pci.matches = (struct pci_conf *)addr;
npat_to_convert = min(npat_to_convert, pci.num_patterns);
for (i = 0, pmc32p = (struct pci_match_conf32 *)PTRIN(pci32.patterns),
pmcp = pci.patterns;
i < npat_to_convert; i++, pmc32p++, pmcp++) {
if ((error = copyin(pmc32p, &pmc32, sizeof(pmc32))) != 0)
goto cleanup;
CP(pmc32,pmc,pc_sel);
strlcpy(pmc.pd_name, pmc32.pd_name, sizeof(pmc.pd_name));
CP(pmc32,pmc,pd_unit);
CP(pmc32,pmc,pc_vendor);
CP(pmc32,pmc,pc_device);
CP(pmc32,pmc,pc_class);
CP(pmc32,pmc,flags);
if ((error = copyout(&pmc, pmcp, sizeof(pmc))) != 0)
goto cleanup;
}
if ((error = fo_ioctl(fp, PCIOCGETCONF, (caddr_t)&pci,
td->td_ucred, td)) != 0)
goto cleanup;
nmatch_to_convert = min(nmatch_to_convert, pci.num_matches);
for (i = 0, pcp = pci.matches,
pc32p = (struct pci_conf32 *)PTRIN(pci32.matches);
i < nmatch_to_convert; i++, pcp++, pc32p++) {
if ((error = copyin(pcp, &pc, sizeof(pc))) != 0)
goto cleanup;
CP(pc,pc32,pc_sel);
CP(pc,pc32,pc_hdr);
CP(pc,pc32,pc_subvendor);
CP(pc,pc32,pc_subdevice);
CP(pc,pc32,pc_vendor);
CP(pc,pc32,pc_device);
CP(pc,pc32,pc_class);
CP(pc,pc32,pc_subclass);
CP(pc,pc32,pc_progif);
CP(pc,pc32,pc_revid);
strlcpy(pc32.pd_name, pc.pd_name, sizeof(pc32.pd_name));
CP(pc,pc32,pd_unit);
if ((error = copyout(&pc32, pc32p, sizeof(pc32))) != 0)
goto cleanup;
}
CP(pci, pci32, num_matches);
CP(pci, pci32, offset);
CP(pci, pci32, generation);
CP(pci, pci32, status);
error = copyout(&pci32, uap->data, sizeof(pci32));
cleanup:
if (pci.patterns)
copyout_unmap(td, (vm_offset_t)pci.patterns, pci.pat_buf_len);
if (pci.matches)
copyout_unmap(td, (vm_offset_t)pci.matches, pci.match_buf_len);
return (error);
}
int
freebsd32_ioctl(struct thread *td, struct freebsd32_ioctl_args *uap)
{
struct ioctl_args ap /*{
int fd;
u_long com;
caddr_t data;
}*/ ;
struct file *fp;
int error;
if ((error = fget(td, uap->fd, CAP_IOCTL, &fp)) != 0)
return (error);
if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
fdrop(fp, td);
return (EBADF);
}
switch (uap->com) {
case MDIOCATTACH_32: /* FALLTHROUGH */
case MDIOCDETACH_32: /* FALLTHROUGH */
case MDIOCQUERY_32: /* FALLTHROUGH */
case MDIOCLIST_32:
error = freebsd32_ioctl_md(td, uap, fp);
break;
case CDIOREADTOCENTRYS_32:
error = freebsd32_ioctl_ioc_read_toc(td, uap, fp);
break;
case CDIOREADTOCHEADER_32:
error = freebsd32_ioctl_ioc_toc_header(td, uap, fp);
break;
case FIODGNAME_32:
error = freebsd32_ioctl_fiodgname(td, uap, fp);
break;
case MEMRANGE_GET32: /* FALLTHROUGH */
case MEMRANGE_SET32:
error = freebsd32_ioctl_memrange(td, uap, fp);
break;
case PCIOCGETCONF_32:
error = freebsd32_ioctl_pciocgetconf(td, uap, fp);
break;
default:
fdrop(fp, td);
ap.fd = uap->fd;
ap.com = uap->com;
PTRIN_CP(*uap, ap, data);
- return ioctl(td, &ap);
+ return sys_ioctl(td, &ap);
}
fdrop(fp, td);
return error;
}
Index: head/sys/compat/freebsd32/freebsd32_misc.c
===================================================================
--- head/sys/compat/freebsd32/freebsd32_misc.c (revision 225616)
+++ head/sys/compat/freebsd32/freebsd32_misc.c (revision 225617)
@@ -1,2817 +1,2817 @@
/*-
* Copyright (c) 2002 Doug Rabson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_inet.h"
#include "opt_inet6.h"
#define __ELF_WORD_SIZE 32
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/clock.h>
#include <sys/exec.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/imgact.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/file.h> /* Must come after sys/malloc.h */
#include <sys/imgact.h>
#include <sys/mbuf.h>
#include <sys/mman.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/reboot.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/selinfo.h>
#include <sys/eventvar.h> /* Must come after sys/selinfo.h */
#include <sys/pipe.h> /* Must come after sys/selinfo.h */
#include <sys/signal.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/thr.h>
#include <sys/unistd.h>
#include <sys/ucontext.h>
#include <sys/vnode.h>
#include <sys/wait.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <sys/sem.h>
#include <sys/shm.h>
#ifdef INET
#include <netinet/in.h>
#endif
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <machine/cpu.h>
#include <machine/elf.h>
#include <security/audit/audit.h>
#include <compat/freebsd32/freebsd32_util.h>
#include <compat/freebsd32/freebsd32.h>
#include <compat/freebsd32/freebsd32_ipc.h>
#include <compat/freebsd32/freebsd32_signal.h>
#include <compat/freebsd32/freebsd32_proto.h>
CTASSERT(sizeof(struct timeval32) == 8);
CTASSERT(sizeof(struct timespec32) == 8);
CTASSERT(sizeof(struct itimerval32) == 16);
CTASSERT(sizeof(struct statfs32) == 256);
CTASSERT(sizeof(struct rusage32) == 72);
CTASSERT(sizeof(struct sigaltstack32) == 12);
CTASSERT(sizeof(struct kevent32) == 20);
CTASSERT(sizeof(struct iovec32) == 8);
CTASSERT(sizeof(struct msghdr32) == 28);
CTASSERT(sizeof(struct stat32) == 96);
CTASSERT(sizeof(struct sigaction32) == 24);
static int freebsd32_kevent_copyout(void *arg, struct kevent *kevp, int count);
static int freebsd32_kevent_copyin(void *arg, struct kevent *kevp, int count);
#if BYTE_ORDER == BIG_ENDIAN
#define PAIR32TO64(type, name) ((name ## 2) | ((type)(name ## 1) << 32))
#define RETVAL_HI 0
#define RETVAL_LO 1
#else
#define PAIR32TO64(type, name) ((name ## 1) | ((type)(name ## 2) << 32))
#define RETVAL_HI 1
#define RETVAL_LO 0
#endif
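A 32-bit process passes a 64-bit off_t as two 32-bit syscall arguments, and PAIR32TO64 above reassembles them, with the operand order chosen by byte order. A small sketch of the little-endian case (name1 carries the low half, as in the #else branch):

#include <stdint.h>
#include <stdio.h>

/* Little-endian reassembly: low word first, as in the #else branch. */
static int64_t
pair32to64_le(uint32_t lo, uint32_t hi)
{
	return ((int64_t)((uint64_t)lo | ((uint64_t)hi << 32)));
}

int
main(void)
{
	int64_t off = pair32to64_le(0x00000004u, 0x00000001u);

	printf("0x%llx\n", (unsigned long long)off);	/* 0x100000004 */
	return (0);
}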
void
freebsd32_rusage_out(const struct rusage *s, struct rusage32 *s32)
{
TV_CP(*s, *s32, ru_utime);
TV_CP(*s, *s32, ru_stime);
CP(*s, *s32, ru_maxrss);
CP(*s, *s32, ru_ixrss);
CP(*s, *s32, ru_idrss);
CP(*s, *s32, ru_isrss);
CP(*s, *s32, ru_minflt);
CP(*s, *s32, ru_majflt);
CP(*s, *s32, ru_nswap);
CP(*s, *s32, ru_inblock);
CP(*s, *s32, ru_oublock);
CP(*s, *s32, ru_msgsnd);
CP(*s, *s32, ru_msgrcv);
CP(*s, *s32, ru_nsignals);
CP(*s, *s32, ru_nvcsw);
CP(*s, *s32, ru_nivcsw);
}
int
freebsd32_wait4(struct thread *td, struct freebsd32_wait4_args *uap)
{
int error, status;
struct rusage32 ru32;
struct rusage ru, *rup;
if (uap->rusage != NULL)
rup = &ru;
else
rup = NULL;
error = kern_wait(td, uap->pid, &status, uap->options, rup);
if (error)
return (error);
if (uap->status != NULL)
error = copyout(&status, uap->status, sizeof(status));
if (uap->rusage != NULL && error == 0) {
freebsd32_rusage_out(&ru, &ru32);
error = copyout(&ru32, uap->rusage, sizeof(ru32));
}
return (error);
}
#ifdef COMPAT_FREEBSD4
static void
copy_statfs(struct statfs *in, struct statfs32 *out)
{
statfs_scale_blocks(in, INT32_MAX);
bzero(out, sizeof(*out));
CP(*in, *out, f_bsize);
out->f_iosize = MIN(in->f_iosize, INT32_MAX);
CP(*in, *out, f_blocks);
CP(*in, *out, f_bfree);
CP(*in, *out, f_bavail);
out->f_files = MIN(in->f_files, INT32_MAX);
out->f_ffree = MIN(in->f_ffree, INT32_MAX);
CP(*in, *out, f_fsid);
CP(*in, *out, f_owner);
CP(*in, *out, f_type);
CP(*in, *out, f_flags);
out->f_syncwrites = MIN(in->f_syncwrites, INT32_MAX);
out->f_asyncwrites = MIN(in->f_asyncwrites, INT32_MAX);
strlcpy(out->f_fstypename,
in->f_fstypename, MFSNAMELEN);
strlcpy(out->f_mntonname,
in->f_mntonname, min(MNAMELEN, FREEBSD4_MNAMELEN));
out->f_syncreads = MIN(in->f_syncreads, INT32_MAX);
out->f_asyncreads = MIN(in->f_asyncreads, INT32_MAX);
strlcpy(out->f_mntfromname,
in->f_mntfromname, min(MNAMELEN, FREEBSD4_MNAMELEN));
}
#endif
#ifdef COMPAT_FREEBSD4
int
freebsd4_freebsd32_getfsstat(struct thread *td, struct freebsd4_freebsd32_getfsstat_args *uap)
{
struct statfs *buf, *sp;
struct statfs32 stat32;
size_t count, size;
int error;
count = uap->bufsize / sizeof(struct statfs32);
size = count * sizeof(struct statfs);
error = kern_getfsstat(td, &buf, size, UIO_SYSSPACE, uap->flags);
if (size > 0) {
count = td->td_retval[0];
sp = buf;
while (count > 0 && error == 0) {
copy_statfs(sp, &stat32);
error = copyout(&stat32, uap->buf, sizeof(stat32));
sp++;
uap->buf++;
count--;
}
free(buf, M_TEMP);
}
return (error);
}
#endif
int
freebsd32_sigaltstack(struct thread *td,
struct freebsd32_sigaltstack_args *uap)
{
struct sigaltstack32 s32;
struct sigaltstack ss, oss, *ssp;
int error;
if (uap->ss != NULL) {
error = copyin(uap->ss, &s32, sizeof(s32));
if (error)
return (error);
PTRIN_CP(s32, ss, ss_sp);
CP(s32, ss, ss_size);
CP(s32, ss, ss_flags);
ssp = &ss;
} else
ssp = NULL;
error = kern_sigaltstack(td, ssp, &oss);
if (error == 0 && uap->oss != NULL) {
PTROUT_CP(oss, s32, ss_sp);
CP(oss, s32, ss_size);
CP(oss, s32, ss_flags);
error = copyout(&s32, uap->oss, sizeof(s32));
}
return (error);
}
/*
* Custom version of exec_copyin_args() so that we can translate
* the pointers.
*/
int
freebsd32_exec_copyin_args(struct image_args *args, char *fname,
enum uio_seg segflg, u_int32_t *argv, u_int32_t *envv)
{
char *argp, *envp;
u_int32_t *p32, arg;
size_t length;
int error;
bzero(args, sizeof(*args));
if (argv == NULL)
return (EFAULT);
/*
* Allocate demand-paged memory for the file name, argument, and
* environment strings.
*/
error = exec_alloc_args(args);
if (error != 0)
return (error);
/*
* Copy the file name.
*/
if (fname != NULL) {
args->fname = args->buf;
error = (segflg == UIO_SYSSPACE) ?
copystr(fname, args->fname, PATH_MAX, &length) :
copyinstr(fname, args->fname, PATH_MAX, &length);
if (error != 0)
goto err_exit;
} else
length = 0;
args->begin_argv = args->buf + length;
args->endp = args->begin_argv;
args->stringspace = ARG_MAX;
/*
* extract arguments first
*/
p32 = argv;
for (;;) {
error = copyin(p32++, &arg, sizeof(arg));
if (error)
goto err_exit;
if (arg == 0)
break;
argp = PTRIN(arg);
error = copyinstr(argp, args->endp, args->stringspace, &length);
if (error) {
if (error == ENAMETOOLONG)
error = E2BIG;
goto err_exit;
}
args->stringspace -= length;
args->endp += length;
args->argc++;
}
args->begin_envv = args->endp;
/*
* extract environment strings
*/
if (envv) {
p32 = envv;
for (;;) {
error = copyin(p32++, &arg, sizeof(arg));
if (error)
goto err_exit;
if (arg == 0)
break;
envp = PTRIN(arg);
error = copyinstr(envp, args->endp, args->stringspace,
&length);
if (error) {
if (error == ENAMETOOLONG)
error = E2BIG;
goto err_exit;
}
args->stringspace -= length;
args->endp += length;
args->envc++;
}
}
return (0);
err_exit:
exec_free_args(args);
return (error);
}
int
freebsd32_execve(struct thread *td, struct freebsd32_execve_args *uap)
{
struct image_args eargs;
int error;
error = freebsd32_exec_copyin_args(&eargs, uap->fname, UIO_USERSPACE,
uap->argv, uap->envv);
if (error == 0)
error = kern_execve(td, &eargs, NULL);
return (error);
}
int
freebsd32_fexecve(struct thread *td, struct freebsd32_fexecve_args *uap)
{
struct image_args eargs;
int error;
error = freebsd32_exec_copyin_args(&eargs, NULL, UIO_SYSSPACE,
uap->argv, uap->envv);
if (error == 0) {
eargs.fd = uap->fd;
error = kern_execve(td, &eargs, NULL);
}
return (error);
}
#ifdef __ia64__
static int
freebsd32_mmap_partial(struct thread *td, vm_offset_t start, vm_offset_t end,
int prot, int fd, off_t pos)
{
vm_map_t map;
vm_map_entry_t entry;
int rv;
map = &td->td_proc->p_vmspace->vm_map;
if (fd != -1)
prot |= VM_PROT_WRITE;
if (vm_map_lookup_entry(map, start, &entry)) {
if ((entry->protection & prot) != prot) {
rv = vm_map_protect(map,
trunc_page(start),
round_page(end),
entry->protection | prot,
FALSE);
if (rv != KERN_SUCCESS)
return (EINVAL);
}
} else {
vm_offset_t addr = trunc_page(start);
rv = vm_map_find(map, 0, 0,
&addr, PAGE_SIZE, FALSE, prot,
VM_PROT_ALL, 0);
if (rv != KERN_SUCCESS)
return (EINVAL);
}
if (fd != -1) {
struct pread_args r;
r.fd = fd;
r.buf = (void *) start;
r.nbyte = end - start;
r.offset = pos;
- return (pread(td, &r));
+ return (sys_pread(td, &r));
} else {
while (start < end) {
subyte((void *) start, 0);
start++;
}
return (0);
}
}
#endif
int
freebsd32_mmap(struct thread *td, struct freebsd32_mmap_args *uap)
{
struct mmap_args ap;
vm_offset_t addr = (vm_offset_t) uap->addr;
vm_size_t len = uap->len;
int prot = uap->prot;
int flags = uap->flags;
int fd = uap->fd;
off_t pos = PAIR32TO64(off_t,uap->pos);
#ifdef __ia64__
vm_size_t pageoff;
int error;
/*
* Attempt to handle page size hassles.
*/
pageoff = (pos & PAGE_MASK);
if (flags & MAP_FIXED) {
vm_offset_t start, end;
start = addr;
end = addr + len;
if (start != trunc_page(start)) {
error = freebsd32_mmap_partial(td, start,
round_page(start), prot,
fd, pos);
if (fd != -1)
pos += round_page(start) - start;
start = round_page(start);
}
if (end != round_page(end)) {
vm_offset_t t = trunc_page(end);
error = freebsd32_mmap_partial(td, t, end,
prot, fd,
pos + t - start);
end = trunc_page(end);
}
if (end > start && fd != -1 && (pos & PAGE_MASK)) {
/*
* We can't map this region at all. The specified
* address doesn't have the same alignment as the file
* position. Fake the mapping by simply reading the
* entire region into memory. First we need to make
* sure the region exists.
*/
vm_map_t map;
struct pread_args r;
int rv;
prot |= VM_PROT_WRITE;
map = &td->td_proc->p_vmspace->vm_map;
rv = vm_map_remove(map, start, end);
if (rv != KERN_SUCCESS)
return (EINVAL);
rv = vm_map_find(map, 0, 0,
&start, end - start, FALSE,
prot, VM_PROT_ALL, 0);
if (rv != KERN_SUCCESS)
return (EINVAL);
r.fd = fd;
r.buf = (void *) start;
r.nbyte = end - start;
r.offset = pos;
- error = pread(td, &r);
+ error = sys_pread(td, &r);
if (error)
return (error);
td->td_retval[0] = addr;
return (0);
}
if (end == start) {
/*
* After dealing with the ragged ends, there
* might be none left.
*/
td->td_retval[0] = addr;
return (0);
}
addr = start;
len = end - start;
}
#endif
ap.addr = (void *) addr;
ap.len = len;
ap.prot = prot;
ap.flags = flags;
ap.fd = fd;
ap.pos = pos;
- return (mmap(td, &ap));
+ return (sys_mmap(td, &ap));
}
#ifdef COMPAT_FREEBSD6
int
freebsd6_freebsd32_mmap(struct thread *td, struct freebsd6_freebsd32_mmap_args *uap)
{
struct freebsd32_mmap_args ap;
ap.addr = uap->addr;
ap.len = uap->len;
ap.prot = uap->prot;
ap.flags = uap->flags;
ap.fd = uap->fd;
ap.pos1 = uap->pos1;
ap.pos2 = uap->pos2;
return (freebsd32_mmap(td, &ap));
}
#endif
int
freebsd32_setitimer(struct thread *td, struct freebsd32_setitimer_args *uap)
{
struct itimerval itv, oitv, *itvp;
struct itimerval32 i32;
int error;
if (uap->itv != NULL) {
error = copyin(uap->itv, &i32, sizeof(i32));
if (error)
return (error);
TV_CP(i32, itv, it_interval);
TV_CP(i32, itv, it_value);
itvp = &itv;
} else
itvp = NULL;
error = kern_setitimer(td, uap->which, itvp, &oitv);
if (error || uap->oitv == NULL)
return (error);
TV_CP(oitv, i32, it_interval);
TV_CP(oitv, i32, it_value);
return (copyout(&i32, uap->oitv, sizeof(i32)));
}
int
freebsd32_getitimer(struct thread *td, struct freebsd32_getitimer_args *uap)
{
struct itimerval itv;
struct itimerval32 i32;
int error;
error = kern_getitimer(td, uap->which, &itv);
if (error || uap->itv == NULL)
return (error);
TV_CP(itv, i32, it_interval);
TV_CP(itv, i32, it_value);
return (copyout(&i32, uap->itv, sizeof(i32)));
}
int
freebsd32_select(struct thread *td, struct freebsd32_select_args *uap)
{
struct timeval32 tv32;
struct timeval tv, *tvp;
int error;
if (uap->tv != NULL) {
error = copyin(uap->tv, &tv32, sizeof(tv32));
if (error)
return (error);
CP(tv32, tv, tv_sec);
CP(tv32, tv, tv_usec);
tvp = &tv;
} else
tvp = NULL;
/*
* XXX Do pointers need PTRIN()?
*/
return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
sizeof(int32_t) * 8));
}
int
freebsd32_pselect(struct thread *td, struct freebsd32_pselect_args *uap)
{
struct timespec32 ts32;
struct timespec ts;
struct timeval tv, *tvp;
sigset_t set, *uset;
int error;
if (uap->ts != NULL) {
error = copyin(uap->ts, &ts32, sizeof(ts32));
if (error != 0)
return (error);
CP(ts32, ts, tv_sec);
CP(ts32, ts, tv_nsec);
TIMESPEC_TO_TIMEVAL(&tv, &ts);
tvp = &tv;
} else
tvp = NULL;
if (uap->sm != NULL) {
error = copyin(uap->sm, &set, sizeof(set));
if (error != 0)
return (error);
uset = &set;
} else
uset = NULL;
/*
* XXX Do pointers need PTRIN()?
*/
error = kern_pselect(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
uset, sizeof(int32_t) * 8);
return (error);
}
/*
* Copy 'count' items into the destination list pointed to by uap->eventlist.
*/
static int
freebsd32_kevent_copyout(void *arg, struct kevent *kevp, int count)
{
struct freebsd32_kevent_args *uap;
struct kevent32 ks32[KQ_NEVENTS];
int i, error = 0;
KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
uap = (struct freebsd32_kevent_args *)arg;
for (i = 0; i < count; i++) {
CP(kevp[i], ks32[i], ident);
CP(kevp[i], ks32[i], filter);
CP(kevp[i], ks32[i], flags);
CP(kevp[i], ks32[i], fflags);
CP(kevp[i], ks32[i], data);
PTROUT_CP(kevp[i], ks32[i], udata);
}
error = copyout(ks32, uap->eventlist, count * sizeof *ks32);
if (error == 0)
uap->eventlist += count;
return (error);
}
/*
* Copy 'count' items from the list pointed to by uap->changelist.
*/
static int
freebsd32_kevent_copyin(void *arg, struct kevent *kevp, int count)
{
struct freebsd32_kevent_args *uap;
struct kevent32 ks32[KQ_NEVENTS];
int i, error = 0;
KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
uap = (struct freebsd32_kevent_args *)arg;
error = copyin(uap->changelist, ks32, count * sizeof *ks32);
if (error)
goto done;
uap->changelist += count;
for (i = 0; i < count; i++) {
CP(ks32[i], kevp[i], ident);
CP(ks32[i], kevp[i], filter);
CP(ks32[i], kevp[i], flags);
CP(ks32[i], kevp[i], fflags);
CP(ks32[i], kevp[i], data);
PTRIN_CP(ks32[i], kevp[i], udata);
}
done:
return (error);
}
int
freebsd32_kevent(struct thread *td, struct freebsd32_kevent_args *uap)
{
struct timespec32 ts32;
struct timespec ts, *tsp;
struct kevent_copyops k_ops = { uap,
freebsd32_kevent_copyout,
freebsd32_kevent_copyin};
int error;
if (uap->timeout) {
error = copyin(uap->timeout, &ts32, sizeof(ts32));
if (error)
return (error);
CP(ts32, ts, tv_sec);
CP(ts32, ts, tv_nsec);
tsp = &ts;
} else
tsp = NULL;
error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
&k_ops, tsp);
return (error);
}
int
freebsd32_gettimeofday(struct thread *td,
struct freebsd32_gettimeofday_args *uap)
{
struct timeval atv;
struct timeval32 atv32;
struct timezone rtz;
int error = 0;
if (uap->tp) {
microtime(&atv);
CP(atv, atv32, tv_sec);
CP(atv, atv32, tv_usec);
error = copyout(&atv32, uap->tp, sizeof (atv32));
}
if (error == 0 && uap->tzp != NULL) {
rtz.tz_minuteswest = tz_minuteswest;
rtz.tz_dsttime = tz_dsttime;
error = copyout(&rtz, uap->tzp, sizeof (rtz));
}
return (error);
}
int
freebsd32_getrusage(struct thread *td, struct freebsd32_getrusage_args *uap)
{
struct rusage32 s32;
struct rusage s;
int error;
error = kern_getrusage(td, uap->who, &s);
if (error)
return (error);
if (uap->rusage != NULL) {
freebsd32_rusage_out(&s, &s32);
error = copyout(&s32, uap->rusage, sizeof(s32));
}
return (error);
}
static int
freebsd32_copyinuio(struct iovec32 *iovp, u_int iovcnt, struct uio **uiop)
{
struct iovec32 iov32;
struct iovec *iov;
struct uio *uio;
u_int iovlen;
int error, i;
*uiop = NULL;
if (iovcnt > UIO_MAXIOV)
return (EINVAL);
iovlen = iovcnt * sizeof(struct iovec);
uio = malloc(iovlen + sizeof *uio, M_IOV, M_WAITOK);
iov = (struct iovec *)(uio + 1);
for (i = 0; i < iovcnt; i++) {
error = copyin(&iovp[i], &iov32, sizeof(struct iovec32));
if (error) {
free(uio, M_IOV);
return (error);
}
iov[i].iov_base = PTRIN(iov32.iov_base);
iov[i].iov_len = iov32.iov_len;
}
uio->uio_iov = iov;
uio->uio_iovcnt = iovcnt;
uio->uio_segflg = UIO_USERSPACE;
uio->uio_offset = -1;
uio->uio_resid = 0;
for (i = 0; i < iovcnt; i++) {
if (iov->iov_len > INT_MAX - uio->uio_resid) {
free(uio, M_IOV);
return (EINVAL);
}
uio->uio_resid += iov->iov_len;
iov++;
}
*uiop = uio;
return (0);
}
int
freebsd32_readv(struct thread *td, struct freebsd32_readv_args *uap)
{
struct uio *auio;
int error;
error = freebsd32_copyinuio(uap->iovp, uap->iovcnt, &auio);
if (error)
return (error);
error = kern_readv(td, uap->fd, auio);
free(auio, M_IOV);
return (error);
}
int
freebsd32_writev(struct thread *td, struct freebsd32_writev_args *uap)
{
struct uio *auio;
int error;
error = freebsd32_copyinuio(uap->iovp, uap->iovcnt, &auio);
if (error)
return (error);
error = kern_writev(td, uap->fd, auio);
free(auio, M_IOV);
return (error);
}
int
freebsd32_preadv(struct thread *td, struct freebsd32_preadv_args *uap)
{
struct uio *auio;
int error;
error = freebsd32_copyinuio(uap->iovp, uap->iovcnt, &auio);
if (error)
return (error);
error = kern_preadv(td, uap->fd, auio, PAIR32TO64(off_t,uap->offset));
free(auio, M_IOV);
return (error);
}
int
freebsd32_pwritev(struct thread *td, struct freebsd32_pwritev_args *uap)
{
struct uio *auio;
int error;
error = freebsd32_copyinuio(uap->iovp, uap->iovcnt, &auio);
if (error)
return (error);
error = kern_pwritev(td, uap->fd, auio, PAIR32TO64(off_t,uap->offset));
free(auio, M_IOV);
return (error);
}
int
freebsd32_copyiniov(struct iovec32 *iovp32, u_int iovcnt, struct iovec **iovp,
int error)
{
struct iovec32 iov32;
struct iovec *iov;
u_int iovlen;
int i;
*iovp = NULL;
if (iovcnt > UIO_MAXIOV)
return (error);
iovlen = iovcnt * sizeof(struct iovec);
iov = malloc(iovlen, M_IOV, M_WAITOK);
for (i = 0; i < iovcnt; i++) {
error = copyin(&iovp32[i], &iov32, sizeof(struct iovec32));
if (error) {
free(iov, M_IOV);
return (error);
}
iov[i].iov_base = PTRIN(iov32.iov_base);
iov[i].iov_len = iov32.iov_len;
}
*iovp = iov;
return (0);
}
static int
freebsd32_copyinmsghdr(struct msghdr32 *msg32, struct msghdr *msg)
{
struct msghdr32 m32;
int error;
error = copyin(msg32, &m32, sizeof(m32));
if (error)
return (error);
msg->msg_name = PTRIN(m32.msg_name);
msg->msg_namelen = m32.msg_namelen;
msg->msg_iov = PTRIN(m32.msg_iov);
msg->msg_iovlen = m32.msg_iovlen;
msg->msg_control = PTRIN(m32.msg_control);
msg->msg_controllen = m32.msg_controllen;
msg->msg_flags = m32.msg_flags;
return (0);
}
static int
freebsd32_copyoutmsghdr(struct msghdr *msg, struct msghdr32 *msg32)
{
struct msghdr32 m32;
int error;
m32.msg_name = PTROUT(msg->msg_name);
m32.msg_namelen = msg->msg_namelen;
m32.msg_iov = PTROUT(msg->msg_iov);
m32.msg_iovlen = msg->msg_iovlen;
m32.msg_control = PTROUT(msg->msg_control);
m32.msg_controllen = msg->msg_controllen;
m32.msg_flags = msg->msg_flags;
error = copyout(&m32, msg32, sizeof(m32));
return (error);
}
#define FREEBSD32_ALIGNBYTES (sizeof(int) - 1)
#define FREEBSD32_ALIGN(p) \
(((u_long)(p) + FREEBSD32_ALIGNBYTES) & ~FREEBSD32_ALIGNBYTES)
#define FREEBSD32_CMSG_SPACE(l) \
(FREEBSD32_ALIGN(sizeof(struct cmsghdr)) + FREEBSD32_ALIGN(l))
#define FREEBSD32_CMSG_DATA(cmsg) ((unsigned char *)(cmsg) + \
FREEBSD32_ALIGN(sizeof(struct cmsghdr)))
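These macros exist because the 32-bit ABI aligns control-message data to sizeof(int), so cmsg lengths differ from the native layout and incoming and outgoing messages have to be repacked. A standalone copy of the arithmetic, assuming a 12-byte cmsghdr purely for illustration:

#include <stdio.h>

#define	ALIGN32BYTES	(sizeof(int) - 1)
#define	ALIGN32(l)	(((unsigned long)(l) + ALIGN32BYTES) & ~ALIGN32BYTES)
#define	CMSG32_SPACE(l)	(ALIGN32(12) + ALIGN32(l))	/* 12: assumed cmsghdr size */

int
main(void)
{
	/* Space needed for a single int of control data (e.g. a passed fd). */
	printf("%lu\n", (unsigned long)CMSG32_SPACE(sizeof(int)));	/* 16 */
	return (0);
}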
static int
freebsd32_copy_msg_out(struct msghdr *msg, struct mbuf *control)
{
struct cmsghdr *cm;
void *data;
socklen_t clen, datalen;
int error;
caddr_t ctlbuf;
int len, maxlen, copylen;
struct mbuf *m;
error = 0;
len = msg->msg_controllen;
maxlen = msg->msg_controllen;
msg->msg_controllen = 0;
m = control;
ctlbuf = msg->msg_control;
while (m && len > 0) {
cm = mtod(m, struct cmsghdr *);
clen = m->m_len;
while (cm != NULL) {
if (sizeof(struct cmsghdr) > clen ||
cm->cmsg_len > clen) {
error = EINVAL;
break;
}
data = CMSG_DATA(cm);
datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
/* Adjust message length */
cm->cmsg_len = FREEBSD32_ALIGN(sizeof(struct cmsghdr)) +
datalen;
/* Copy cmsghdr */
copylen = sizeof(struct cmsghdr);
if (len < copylen) {
msg->msg_flags |= MSG_CTRUNC;
copylen = len;
}
error = copyout(cm,ctlbuf,copylen);
if (error)
goto exit;
ctlbuf += FREEBSD32_ALIGN(copylen);
len -= FREEBSD32_ALIGN(copylen);
if (len <= 0)
break;
/* Copy data */
copylen = datalen;
if (len < copylen) {
msg->msg_flags |= MSG_CTRUNC;
copylen = len;
}
error = copyout(data,ctlbuf,copylen);
if (error)
goto exit;
ctlbuf += FREEBSD32_ALIGN(copylen);
len -= FREEBSD32_ALIGN(copylen);
if (CMSG_SPACE(datalen) < clen) {
clen -= CMSG_SPACE(datalen);
cm = (struct cmsghdr *)
((caddr_t)cm + CMSG_SPACE(datalen));
} else {
clen = 0;
cm = NULL;
}
}
m = m->m_next;
}
msg->msg_controllen = (len <= 0) ? maxlen : ctlbuf - (caddr_t)msg->msg_control;
exit:
return (error);
}
int
freebsd32_recvmsg(td, uap)
struct thread *td;
struct freebsd32_recvmsg_args /* {
int s;
struct msghdr32 *msg;
int flags;
} */ *uap;
{
struct msghdr msg;
struct msghdr32 m32;
struct iovec *uiov, *iov;
struct mbuf *control = NULL;
struct mbuf **controlp;
int error;
error = copyin(uap->msg, &m32, sizeof(m32));
if (error)
return (error);
error = freebsd32_copyinmsghdr(uap->msg, &msg);
if (error)
return (error);
error = freebsd32_copyiniov(PTRIN(m32.msg_iov), m32.msg_iovlen, &iov,
EMSGSIZE);
if (error)
return (error);
msg.msg_flags = uap->flags;
uiov = msg.msg_iov;
msg.msg_iov = iov;
controlp = (msg.msg_control != NULL) ? &control : NULL;
error = kern_recvit(td, uap->s, &msg, UIO_USERSPACE, controlp);
if (error == 0) {
msg.msg_iov = uiov;
if (control != NULL)
error = freebsd32_copy_msg_out(&msg, control);
else
msg.msg_controllen = 0;
if (error == 0)
error = freebsd32_copyoutmsghdr(&msg, uap->msg);
}
free(iov, M_IOV);
if (control != NULL)
m_freem(control);
return (error);
}
static int
freebsd32_convert_msg_in(struct mbuf **controlp)
{
struct mbuf *control = *controlp;
struct cmsghdr *cm = mtod(control, struct cmsghdr *);
void *data;
socklen_t clen = control->m_len, datalen;
int error;
error = 0;
*controlp = NULL;
while (cm != NULL) {
if (sizeof(struct cmsghdr) > clen || cm->cmsg_len > clen) {
error = EINVAL;
break;
}
data = FREEBSD32_CMSG_DATA(cm);
datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
*controlp = sbcreatecontrol(data, datalen, cm->cmsg_type,
cm->cmsg_level);
controlp = &(*controlp)->m_next;
if (FREEBSD32_CMSG_SPACE(datalen) < clen) {
clen -= FREEBSD32_CMSG_SPACE(datalen);
cm = (struct cmsghdr *)
((caddr_t)cm + FREEBSD32_CMSG_SPACE(datalen));
} else {
clen = 0;
cm = NULL;
}
}
m_freem(control);
return (error);
}
int
freebsd32_sendmsg(struct thread *td,
struct freebsd32_sendmsg_args *uap)
{
struct msghdr msg;
struct msghdr32 m32;
struct iovec *iov;
struct mbuf *control = NULL;
struct sockaddr *to = NULL;
int error;
error = copyin(uap->msg, &m32, sizeof(m32));
if (error)
return (error);
error = freebsd32_copyinmsghdr(uap->msg, &msg);
if (error)
return (error);
error = freebsd32_copyiniov(PTRIN(m32.msg_iov), m32.msg_iovlen, &iov,
EMSGSIZE);
if (error)
return (error);
msg.msg_iov = iov;
if (msg.msg_name != NULL) {
error = getsockaddr(&to, msg.msg_name, msg.msg_namelen);
if (error) {
to = NULL;
goto out;
}
msg.msg_name = to;
}
if (msg.msg_control) {
if (msg.msg_controllen < sizeof(struct cmsghdr)) {
error = EINVAL;
goto out;
}
error = sockargs(&control, msg.msg_control,
msg.msg_controllen, MT_CONTROL);
if (error)
goto out;
error = freebsd32_convert_msg_in(&control);
if (error)
goto out;
}
error = kern_sendit(td, uap->s, &msg, uap->flags, control,
UIO_USERSPACE);
out:
free(iov, M_IOV);
if (to)
free(to, M_SONAME);
return (error);
}
int
freebsd32_recvfrom(struct thread *td,
struct freebsd32_recvfrom_args *uap)
{
struct msghdr msg;
struct iovec aiov;
int error;
if (uap->fromlenaddr) {
error = copyin(PTRIN(uap->fromlenaddr), &msg.msg_namelen,
sizeof(msg.msg_namelen));
if (error)
return (error);
} else {
msg.msg_namelen = 0;
}
msg.msg_name = PTRIN(uap->from);
msg.msg_iov = &aiov;
msg.msg_iovlen = 1;
aiov.iov_base = PTRIN(uap->buf);
aiov.iov_len = uap->len;
msg.msg_control = NULL;
msg.msg_flags = uap->flags;
error = kern_recvit(td, uap->s, &msg, UIO_USERSPACE, NULL);
if (error == 0 && uap->fromlenaddr)
error = copyout(&msg.msg_namelen, PTRIN(uap->fromlenaddr),
sizeof (msg.msg_namelen));
return (error);
}
int
freebsd32_settimeofday(struct thread *td,
struct freebsd32_settimeofday_args *uap)
{
struct timeval32 tv32;
struct timeval tv, *tvp;
struct timezone tz, *tzp;
int error;
if (uap->tv) {
error = copyin(uap->tv, &tv32, sizeof(tv32));
if (error)
return (error);
CP(tv32, tv, tv_sec);
CP(tv32, tv, tv_usec);
tvp = &tv;
} else
tvp = NULL;
if (uap->tzp) {
error = copyin(uap->tzp, &tz, sizeof(tz));
if (error)
return (error);
tzp = &tz;
} else
tzp = NULL;
return (kern_settimeofday(td, tvp, tzp));
}
int
freebsd32_utimes(struct thread *td, struct freebsd32_utimes_args *uap)
{
struct timeval32 s32[2];
struct timeval s[2], *sp;
int error;
if (uap->tptr != NULL) {
error = copyin(uap->tptr, s32, sizeof(s32));
if (error)
return (error);
CP(s32[0], s[0], tv_sec);
CP(s32[0], s[0], tv_usec);
CP(s32[1], s[1], tv_sec);
CP(s32[1], s[1], tv_usec);
sp = s;
} else
sp = NULL;
return (kern_utimes(td, uap->path, UIO_USERSPACE, sp, UIO_SYSSPACE));
}
int
freebsd32_lutimes(struct thread *td, struct freebsd32_lutimes_args *uap)
{
struct timeval32 s32[2];
struct timeval s[2], *sp;
int error;
if (uap->tptr != NULL) {
error = copyin(uap->tptr, s32, sizeof(s32));
if (error)
return (error);
CP(s32[0], s[0], tv_sec);
CP(s32[0], s[0], tv_usec);
CP(s32[1], s[1], tv_sec);
CP(s32[1], s[1], tv_usec);
sp = s;
} else
sp = NULL;
return (kern_lutimes(td, uap->path, UIO_USERSPACE, sp, UIO_SYSSPACE));
}
int
freebsd32_futimes(struct thread *td, struct freebsd32_futimes_args *uap)
{
struct timeval32 s32[2];
struct timeval s[2], *sp;
int error;
if (uap->tptr != NULL) {
error = copyin(uap->tptr, s32, sizeof(s32));
if (error)
return (error);
CP(s32[0], s[0], tv_sec);
CP(s32[0], s[0], tv_usec);
CP(s32[1], s[1], tv_sec);
CP(s32[1], s[1], tv_usec);
sp = s;
} else
sp = NULL;
return (kern_futimes(td, uap->fd, sp, UIO_SYSSPACE));
}
int
freebsd32_futimesat(struct thread *td, struct freebsd32_futimesat_args *uap)
{
struct timeval32 s32[2];
struct timeval s[2], *sp;
int error;
if (uap->times != NULL) {
error = copyin(uap->times, s32, sizeof(s32));
if (error)
return (error);
CP(s32[0], s[0], tv_sec);
CP(s32[0], s[0], tv_usec);
CP(s32[1], s[1], tv_sec);
CP(s32[1], s[1], tv_usec);
sp = s;
} else
sp = NULL;
return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE,
sp, UIO_SYSSPACE));
}
int
freebsd32_adjtime(struct thread *td, struct freebsd32_adjtime_args *uap)
{
struct timeval32 tv32;
struct timeval delta, olddelta, *deltap;
int error;
if (uap->delta) {
error = copyin(uap->delta, &tv32, sizeof(tv32));
if (error)
return (error);
CP(tv32, delta, tv_sec);
CP(tv32, delta, tv_usec);
deltap = &delta;
} else
deltap = NULL;
error = kern_adjtime(td, deltap, &olddelta);
if (uap->olddelta && error == 0) {
CP(olddelta, tv32, tv_sec);
CP(olddelta, tv32, tv_usec);
error = copyout(&tv32, uap->olddelta, sizeof(tv32));
}
return (error);
}
#ifdef COMPAT_FREEBSD4
int
freebsd4_freebsd32_statfs(struct thread *td, struct freebsd4_freebsd32_statfs_args *uap)
{
struct statfs32 s32;
struct statfs s;
int error;
error = kern_statfs(td, uap->path, UIO_USERSPACE, &s);
if (error)
return (error);
copy_statfs(&s, &s32);
return (copyout(&s32, uap->buf, sizeof(s32)));
}
#endif
#ifdef COMPAT_FREEBSD4
int
freebsd4_freebsd32_fstatfs(struct thread *td, struct freebsd4_freebsd32_fstatfs_args *uap)
{
struct statfs32 s32;
struct statfs s;
int error;
error = kern_fstatfs(td, uap->fd, &s);
if (error)
return (error);
copy_statfs(&s, &s32);
return (copyout(&s32, uap->buf, sizeof(s32)));
}
#endif
#ifdef COMPAT_FREEBSD4
int
freebsd4_freebsd32_fhstatfs(struct thread *td, struct freebsd4_freebsd32_fhstatfs_args *uap)
{
struct statfs32 s32;
struct statfs s;
fhandle_t fh;
int error;
if ((error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t))) != 0)
return (error);
error = kern_fhstatfs(td, fh, &s);
if (error)
return (error);
copy_statfs(&s, &s32);
return (copyout(&s32, uap->buf, sizeof(s32)));
}
#endif
int
freebsd32_pread(struct thread *td, struct freebsd32_pread_args *uap)
{
struct pread_args ap;
ap.fd = uap->fd;
ap.buf = uap->buf;
ap.nbyte = uap->nbyte;
ap.offset = PAIR32TO64(off_t,uap->offset);
- return (pread(td, &ap));
+ return (sys_pread(td, &ap));
}
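/*
 * Illustrative sketch, not part of this change: PAIR32TO64() is assumed
 * to reassemble a 64-bit argument from the two 32-bit halves that the
 * 32-bit syscall ABI passes; on a little-endian target that is roughly
 * the made-up helper below.
 */
#if 0
static __inline off_t
example_pair32to64(uint32_t lo, uint32_t hi)
{

	return ((off_t)lo | ((off_t)hi << 32));
}
#endif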
int
freebsd32_pwrite(struct thread *td, struct freebsd32_pwrite_args *uap)
{
struct pwrite_args ap;
ap.fd = uap->fd;
ap.buf = uap->buf;
ap.nbyte = uap->nbyte;
ap.offset = PAIR32TO64(off_t,uap->offset);
- return (pwrite(td, &ap));
+ return (sys_pwrite(td, &ap));
}
#ifdef COMPAT_43
int
ofreebsd32_lseek(struct thread *td, struct ofreebsd32_lseek_args *uap)
{
struct lseek_args nuap;
nuap.fd = uap->fd;
nuap.offset = uap->offset;
nuap.whence = uap->whence;
- return (lseek(td, &nuap));
+ return (sys_lseek(td, &nuap));
}
#endif
int
freebsd32_lseek(struct thread *td, struct freebsd32_lseek_args *uap)
{
int error;
struct lseek_args ap;
off_t pos;
ap.fd = uap->fd;
ap.offset = PAIR32TO64(off_t,uap->offset);
ap.whence = uap->whence;
- error = lseek(td, &ap);
+ error = sys_lseek(td, &ap);
/* Expand the quad return into two parts for eax and edx */
pos = *(off_t *)(td->td_retval);
td->td_retval[RETVAL_LO] = pos & 0xffffffff; /* %eax */
td->td_retval[RETVAL_HI] = pos >> 32; /* %edx */
return error;
}
int
freebsd32_truncate(struct thread *td, struct freebsd32_truncate_args *uap)
{
struct truncate_args ap;
ap.path = uap->path;
ap.length = PAIR32TO64(off_t,uap->length);
- return (truncate(td, &ap));
+ return (sys_truncate(td, &ap));
}
int
freebsd32_ftruncate(struct thread *td, struct freebsd32_ftruncate_args *uap)
{
struct ftruncate_args ap;
ap.fd = uap->fd;
ap.length = PAIR32TO64(off_t,uap->length);
- return (ftruncate(td, &ap));
+ return (sys_ftruncate(td, &ap));
}
#ifdef COMPAT_43
int
ofreebsd32_getdirentries(struct thread *td,
struct ofreebsd32_getdirentries_args *uap)
{
struct ogetdirentries_args ap;
int error;
long loff;
int32_t loff_cut;
ap.fd = uap->fd;
ap.buf = uap->buf;
ap.count = uap->count;
ap.basep = NULL;
error = kern_ogetdirentries(td, &ap, &loff);
if (error == 0) {
loff_cut = loff;
error = copyout(&loff_cut, uap->basep, sizeof(int32_t));
}
return (error);
}
#endif
int
freebsd32_getdirentries(struct thread *td,
struct freebsd32_getdirentries_args *uap)
{
long base;
int32_t base32;
int error;
error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base);
if (error)
return (error);
if (uap->basep != NULL) {
base32 = base;
error = copyout(&base32, uap->basep, sizeof(int32_t));
}
return (error);
}
#ifdef COMPAT_FREEBSD6
/* versions with the 'int pad' argument */
int
freebsd6_freebsd32_pread(struct thread *td, struct freebsd6_freebsd32_pread_args *uap)
{
struct pread_args ap;
ap.fd = uap->fd;
ap.buf = uap->buf;
ap.nbyte = uap->nbyte;
ap.offset = PAIR32TO64(off_t,uap->offset);
- return (pread(td, &ap));
+ return (sys_pread(td, &ap));
}
int
freebsd6_freebsd32_pwrite(struct thread *td, struct freebsd6_freebsd32_pwrite_args *uap)
{
struct pwrite_args ap;
ap.fd = uap->fd;
ap.buf = uap->buf;
ap.nbyte = uap->nbyte;
ap.offset = PAIR32TO64(off_t,uap->offset);
- return (pwrite(td, &ap));
+ return (sys_pwrite(td, &ap));
}
int
freebsd6_freebsd32_lseek(struct thread *td, struct freebsd6_freebsd32_lseek_args *uap)
{
int error;
struct lseek_args ap;
off_t pos;
ap.fd = uap->fd;
ap.offset = PAIR32TO64(off_t,uap->offset);
ap.whence = uap->whence;
- error = lseek(td, &ap);
+ error = sys_lseek(td, &ap);
/* Expand the quad return into two parts for eax and edx */
pos = *(off_t *)(td->td_retval);
td->td_retval[RETVAL_LO] = pos & 0xffffffff; /* %eax */
td->td_retval[RETVAL_HI] = pos >> 32; /* %edx */
return error;
}
int
freebsd6_freebsd32_truncate(struct thread *td, struct freebsd6_freebsd32_truncate_args *uap)
{
struct truncate_args ap;
ap.path = uap->path;
ap.length = PAIR32TO64(off_t,uap->length);
- return (truncate(td, &ap));
+ return (sys_truncate(td, &ap));
}
int
freebsd6_freebsd32_ftruncate(struct thread *td, struct freebsd6_freebsd32_ftruncate_args *uap)
{
struct ftruncate_args ap;
ap.fd = uap->fd;
ap.length = PAIR32TO64(off_t,uap->length);
- return (ftruncate(td, &ap));
+ return (sys_ftruncate(td, &ap));
}
#endif /* COMPAT_FREEBSD6 */
struct sf_hdtr32 {
uint32_t headers;
int hdr_cnt;
uint32_t trailers;
int trl_cnt;
};
static int
freebsd32_do_sendfile(struct thread *td,
struct freebsd32_sendfile_args *uap, int compat)
{
struct sendfile_args ap;
struct sf_hdtr32 hdtr32;
struct sf_hdtr hdtr;
struct uio *hdr_uio, *trl_uio;
struct iovec32 *iov32;
int error;
hdr_uio = trl_uio = NULL;
ap.fd = uap->fd;
ap.s = uap->s;
ap.offset = PAIR32TO64(off_t,uap->offset);
ap.nbytes = uap->nbytes;
ap.hdtr = (struct sf_hdtr *)uap->hdtr; /* XXX not used */
ap.sbytes = uap->sbytes;
ap.flags = uap->flags;
if (uap->hdtr != NULL) {
error = copyin(uap->hdtr, &hdtr32, sizeof(hdtr32));
if (error)
goto out;
PTRIN_CP(hdtr32, hdtr, headers);
CP(hdtr32, hdtr, hdr_cnt);
PTRIN_CP(hdtr32, hdtr, trailers);
CP(hdtr32, hdtr, trl_cnt);
if (hdtr.headers != NULL) {
iov32 = PTRIN(hdtr32.headers);
error = freebsd32_copyinuio(iov32,
hdtr32.hdr_cnt, &hdr_uio);
if (error)
goto out;
}
if (hdtr.trailers != NULL) {
iov32 = PTRIN(hdtr32.trailers);
error = freebsd32_copyinuio(iov32,
hdtr32.trl_cnt, &trl_uio);
if (error)
goto out;
}
}
error = kern_sendfile(td, &ap, hdr_uio, trl_uio, compat);
out:
if (hdr_uio)
free(hdr_uio, M_IOV);
if (trl_uio)
free(trl_uio, M_IOV);
return (error);
}
#ifdef COMPAT_FREEBSD4
int
freebsd4_freebsd32_sendfile(struct thread *td,
struct freebsd4_freebsd32_sendfile_args *uap)
{
return (freebsd32_do_sendfile(td,
(struct freebsd32_sendfile_args *)uap, 1));
}
#endif
int
freebsd32_sendfile(struct thread *td, struct freebsd32_sendfile_args *uap)
{
return (freebsd32_do_sendfile(td, uap, 0));
}
static void
copy_stat(struct stat *in, struct stat32 *out)
{
CP(*in, *out, st_dev);
CP(*in, *out, st_ino);
CP(*in, *out, st_mode);
CP(*in, *out, st_nlink);
CP(*in, *out, st_uid);
CP(*in, *out, st_gid);
CP(*in, *out, st_rdev);
TS_CP(*in, *out, st_atim);
TS_CP(*in, *out, st_mtim);
TS_CP(*in, *out, st_ctim);
CP(*in, *out, st_size);
CP(*in, *out, st_blocks);
CP(*in, *out, st_blksize);
CP(*in, *out, st_flags);
CP(*in, *out, st_gen);
TS_CP(*in, *out, st_birthtim);
}
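/*
 * Illustrative sketch, not part of this change: TS_CP() used above is
 * assumed to copy a struct timespec member field by field so that the
 * native tv_sec narrows into the 32-bit layout, roughly as the made-up
 * macro below.
 */
#if 0
#define	EXAMPLE_TS_CP(src, dst, fld)	do {				\
	CP((src).fld, (dst).fld, tv_sec);				\
	CP((src).fld, (dst).fld, tv_nsec);				\
} while (0)
#endif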
#ifdef COMPAT_43
static void
copy_ostat(struct stat *in, struct ostat32 *out)
{
CP(*in, *out, st_dev);
CP(*in, *out, st_ino);
CP(*in, *out, st_mode);
CP(*in, *out, st_nlink);
CP(*in, *out, st_uid);
CP(*in, *out, st_gid);
CP(*in, *out, st_rdev);
CP(*in, *out, st_size);
TS_CP(*in, *out, st_atim);
TS_CP(*in, *out, st_mtim);
TS_CP(*in, *out, st_ctim);
CP(*in, *out, st_blksize);
CP(*in, *out, st_blocks);
CP(*in, *out, st_flags);
CP(*in, *out, st_gen);
}
#endif
int
freebsd32_stat(struct thread *td, struct freebsd32_stat_args *uap)
{
struct stat sb;
struct stat32 sb32;
int error;
error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
if (error)
return (error);
copy_stat(&sb, &sb32);
error = copyout(&sb32, uap->ub, sizeof (sb32));
return (error);
}
#ifdef COMPAT_43
int
ofreebsd32_stat(struct thread *td, struct ofreebsd32_stat_args *uap)
{
struct stat sb;
struct ostat32 sb32;
int error;
error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
if (error)
return (error);
copy_ostat(&sb, &sb32);
error = copyout(&sb32, uap->ub, sizeof (sb32));
return (error);
}
#endif
int
freebsd32_fstat(struct thread *td, struct freebsd32_fstat_args *uap)
{
struct stat ub;
struct stat32 ub32;
int error;
error = kern_fstat(td, uap->fd, &ub);
if (error)
return (error);
copy_stat(&ub, &ub32);
error = copyout(&ub32, uap->ub, sizeof(ub32));
return (error);
}
#ifdef COMPAT_43
int
ofreebsd32_fstat(struct thread *td, struct ofreebsd32_fstat_args *uap)
{
struct stat ub;
struct ostat32 ub32;
int error;
error = kern_fstat(td, uap->fd, &ub);
if (error)
return (error);
copy_ostat(&ub, &ub32);
error = copyout(&ub32, uap->ub, sizeof(ub32));
return (error);
}
#endif
int
freebsd32_fstatat(struct thread *td, struct freebsd32_fstatat_args *uap)
{
struct stat ub;
struct stat32 ub32;
int error;
error = kern_statat(td, uap->flag, uap->fd, uap->path, UIO_USERSPACE, &ub);
if (error)
return (error);
copy_stat(&ub, &ub32);
error = copyout(&ub32, uap->buf, sizeof(ub32));
return (error);
}
int
freebsd32_lstat(struct thread *td, struct freebsd32_lstat_args *uap)
{
struct stat sb;
struct stat32 sb32;
int error;
error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
if (error)
return (error);
copy_stat(&sb, &sb32);
error = copyout(&sb32, uap->ub, sizeof (sb32));
return (error);
}
#ifdef COMPAT_43
int
ofreebsd32_lstat(struct thread *td, struct ofreebsd32_lstat_args *uap)
{
struct stat sb;
struct ostat32 sb32;
int error;
error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
if (error)
return (error);
copy_ostat(&sb, &sb32);
error = copyout(&sb32, uap->ub, sizeof (sb32));
return (error);
}
#endif
int
freebsd32_sysctl(struct thread *td, struct freebsd32_sysctl_args *uap)
{
int error, name[CTL_MAXNAME];
size_t j, oldlen;
if (uap->namelen > CTL_MAXNAME || uap->namelen < 2)
return (EINVAL);
error = copyin(uap->name, name, uap->namelen * sizeof(int));
if (error)
return (error);
if (uap->oldlenp)
oldlen = fuword32(uap->oldlenp);
else
oldlen = 0;
error = userland_sysctl(td, name, uap->namelen,
uap->old, &oldlen, 1,
uap->new, uap->newlen, &j, SCTL_MASK32);
if (error && error != ENOMEM)
return (error);
if (uap->oldlenp)
suword32(uap->oldlenp, j);
return (0);
}
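/*
 * Illustrative note, not part of this change: fuword32()/suword32() are
 * assumed to fetch and store a single 32-bit word at a user-space
 * address (failing with -1 on a fault), which lets the wrapper above
 * round-trip the 32-bit *oldlenp without a full copyin()/copyout().
 */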
int
freebsd32_jail(struct thread *td, struct freebsd32_jail_args *uap)
{
uint32_t version;
int error;
struct jail j;
error = copyin(uap->jail, &version, sizeof(uint32_t));
if (error)
return (error);
switch (version) {
case 0:
{
/* FreeBSD single IPv4 jails. */
struct jail32_v0 j32_v0;
bzero(&j, sizeof(struct jail));
error = copyin(uap->jail, &j32_v0, sizeof(struct jail32_v0));
if (error)
return (error);
CP(j32_v0, j, version);
PTRIN_CP(j32_v0, j, path);
PTRIN_CP(j32_v0, j, hostname);
j.ip4s = j32_v0.ip_number;
break;
}
case 1:
/*
* Version 1 was used by multi-IPv4 jail implementations
* that never made it into the official kernel.
*/
return (EINVAL);
case 2: /* JAIL_API_VERSION */
{
/* FreeBSD multi-IPv4/IPv6,noIP jails. */
struct jail32 j32;
error = copyin(uap->jail, &j32, sizeof(struct jail32));
if (error)
return (error);
CP(j32, j, version);
PTRIN_CP(j32, j, path);
PTRIN_CP(j32, j, hostname);
PTRIN_CP(j32, j, jailname);
CP(j32, j, ip4s);
CP(j32, j, ip6s);
PTRIN_CP(j32, j, ip4);
PTRIN_CP(j32, j, ip6);
break;
}
default:
/* Sci-Fi jails are not supported, sorry. */
return (EINVAL);
}
return (kern_jail(td, &j));
}
int
freebsd32_jail_set(struct thread *td, struct freebsd32_jail_set_args *uap)
{
struct uio *auio;
int error;
/* Check that we have an even number of iovecs. */
if (uap->iovcnt & 1)
return (EINVAL);
error = freebsd32_copyinuio(uap->iovp, uap->iovcnt, &auio);
if (error)
return (error);
error = kern_jail_set(td, auio, uap->flags);
free(auio, M_IOV);
return (error);
}
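/*
 * Illustrative sketch, not part of this change: freebsd32_copyinuio()
 * is assumed to build a native struct uio from an array of 32-bit
 * iovecs, widening each entry much as the made-up helper below does.
 */
#if 0
static int
example_iovec32_to_iovec(const struct iovec32 *iovp32, struct iovec *iov)
{
	struct iovec32 iov32;
	int error;

	error = copyin(iovp32, &iov32, sizeof(iov32));
	if (error != 0)
		return (error);
	iov->iov_base = PTRIN(iov32.iov_base);
	iov->iov_len = iov32.iov_len;
	return (0);
}
#endif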
int
freebsd32_jail_get(struct thread *td, struct freebsd32_jail_get_args *uap)
{
struct iovec32 iov32;
struct uio *auio;
int error, i;
/* Check that we have an even number of iovecs. */
if (uap->iovcnt & 1)
return (EINVAL);
error = freebsd32_copyinuio(uap->iovp, uap->iovcnt, &auio);
if (error)
return (error);
error = kern_jail_get(td, auio, uap->flags);
if (error == 0)
for (i = 0; i < uap->iovcnt; i++) {
PTROUT_CP(auio->uio_iov[i], iov32, iov_base);
CP(auio->uio_iov[i], iov32, iov_len);
error = copyout(&iov32, uap->iovp + i, sizeof(iov32));
if (error != 0)
break;
}
free(auio, M_IOV);
return (error);
}
int
freebsd32_sigaction(struct thread *td, struct freebsd32_sigaction_args *uap)
{
struct sigaction32 s32;
struct sigaction sa, osa, *sap;
int error;
if (uap->act) {
error = copyin(uap->act, &s32, sizeof(s32));
if (error)
return (error);
sa.sa_handler = PTRIN(s32.sa_u);
CP(s32, sa, sa_flags);
CP(s32, sa, sa_mask);
sap = &sa;
} else
sap = NULL;
error = kern_sigaction(td, uap->sig, sap, &osa, 0);
if (error == 0 && uap->oact != NULL) {
s32.sa_u = PTROUT(osa.sa_handler);
CP(osa, s32, sa_flags);
CP(osa, s32, sa_mask);
error = copyout(&s32, uap->oact, sizeof(s32));
}
return (error);
}
#ifdef COMPAT_FREEBSD4
int
freebsd4_freebsd32_sigaction(struct thread *td,
struct freebsd4_freebsd32_sigaction_args *uap)
{
struct sigaction32 s32;
struct sigaction sa, osa, *sap;
int error;
if (uap->act) {
error = copyin(uap->act, &s32, sizeof(s32));
if (error)
return (error);
sa.sa_handler = PTRIN(s32.sa_u);
CP(s32, sa, sa_flags);
CP(s32, sa, sa_mask);
sap = &sa;
} else
sap = NULL;
error = kern_sigaction(td, uap->sig, sap, &osa, KSA_FREEBSD4);
if (error == 0 && uap->oact != NULL) {
s32.sa_u = PTROUT(osa.sa_handler);
CP(osa, s32, sa_flags);
CP(osa, s32, sa_mask);
error = copyout(&s32, uap->oact, sizeof(s32));
}
return (error);
}
#endif
#ifdef COMPAT_43
struct osigaction32 {
u_int32_t sa_u;
osigset_t sa_mask;
int sa_flags;
};
#define ONSIG 32
int
ofreebsd32_sigaction(struct thread *td,
struct ofreebsd32_sigaction_args *uap)
{
struct osigaction32 s32;
struct sigaction sa, osa, *sap;
int error;
if (uap->signum <= 0 || uap->signum >= ONSIG)
return (EINVAL);
if (uap->nsa) {
error = copyin(uap->nsa, &s32, sizeof(s32));
if (error)
return (error);
sa.sa_handler = PTRIN(s32.sa_u);
CP(s32, sa, sa_flags);
OSIG2SIG(s32.sa_mask, sa.sa_mask);
sap = &sa;
} else
sap = NULL;
error = kern_sigaction(td, uap->signum, sap, &osa, KSA_OSIGSET);
if (error == 0 && uap->osa != NULL) {
s32.sa_u = PTROUT(osa.sa_handler);
CP(osa, s32, sa_flags);
SIG2OSIG(osa.sa_mask, s32.sa_mask);
error = copyout(&s32, uap->osa, sizeof(s32));
}
return (error);
}
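/*
 * Illustrative sketch, not part of this change: OSIG2SIG()/SIG2OSIG()
 * used by the old-style signal calls are assumed to convert between the
 * single-word 4.3BSD signal mask and the current sigset_t, roughly as
 * the made-up macros below.
 */
#if 0
#define	EXAMPLE_OSIG2SIG(osig, sig)	do {				\
	SIGEMPTYSET(sig);						\
	(sig).__bits[0] = (osig);					\
} while (0)
#define	EXAMPLE_SIG2OSIG(sig, osig)	((osig) = (sig).__bits[0])
#endif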
int
ofreebsd32_sigprocmask(struct thread *td,
struct ofreebsd32_sigprocmask_args *uap)
{
sigset_t set, oset;
int error;
OSIG2SIG(uap->mask, set);
error = kern_sigprocmask(td, uap->how, &set, &oset, SIGPROCMASK_OLD);
SIG2OSIG(oset, td->td_retval[0]);
return (error);
}
int
ofreebsd32_sigpending(struct thread *td,
struct ofreebsd32_sigpending_args *uap)
{
struct proc *p = td->td_proc;
sigset_t siglist;
PROC_LOCK(p);
siglist = p->p_siglist;
SIGSETOR(siglist, td->td_siglist);
PROC_UNLOCK(p);
SIG2OSIG(siglist, td->td_retval[0]);
return (0);
}
struct sigvec32 {
u_int32_t sv_handler;
int sv_mask;
int sv_flags;
};
int
ofreebsd32_sigvec(struct thread *td,
struct ofreebsd32_sigvec_args *uap)
{
struct sigvec32 vec;
struct sigaction sa, osa, *sap;
int error;
if (uap->signum <= 0 || uap->signum >= ONSIG)
return (EINVAL);
if (uap->nsv) {
error = copyin(uap->nsv, &vec, sizeof(vec));
if (error)
return (error);
sa.sa_handler = PTRIN(vec.sv_handler);
OSIG2SIG(vec.sv_mask, sa.sa_mask);
sa.sa_flags = vec.sv_flags;
sa.sa_flags ^= SA_RESTART;
sap = &sa;
} else
sap = NULL;
error = kern_sigaction(td, uap->signum, sap, &osa, KSA_OSIGSET);
if (error == 0 && uap->osv != NULL) {
vec.sv_handler = PTROUT(osa.sa_handler);
SIG2OSIG(osa.sa_mask, vec.sv_mask);
vec.sv_flags = osa.sa_flags;
vec.sv_flags &= ~SA_NOCLDWAIT;
vec.sv_flags ^= SA_RESTART;
error = copyout(&vec, uap->osv, sizeof(vec));
}
return (error);
}
int
ofreebsd32_sigblock(struct thread *td,
struct ofreebsd32_sigblock_args *uap)
{
sigset_t set, oset;
OSIG2SIG(uap->mask, set);
kern_sigprocmask(td, SIG_BLOCK, &set, &oset, 0);
SIG2OSIG(oset, td->td_retval[0]);
return (0);
}
int
ofreebsd32_sigsetmask(struct thread *td,
struct ofreebsd32_sigsetmask_args *uap)
{
sigset_t set, oset;
OSIG2SIG(uap->mask, set);
kern_sigprocmask(td, SIG_SETMASK, &set, &oset, 0);
SIG2OSIG(oset, td->td_retval[0]);
return (0);
}
int
ofreebsd32_sigsuspend(struct thread *td,
struct ofreebsd32_sigsuspend_args *uap)
{
sigset_t mask;
OSIG2SIG(uap->mask, mask);
return (kern_sigsuspend(td, mask));
}
struct sigstack32 {
u_int32_t ss_sp;
int ss_onstack;
};
int
ofreebsd32_sigstack(struct thread *td,
struct ofreebsd32_sigstack_args *uap)
{
struct sigstack32 s32;
struct sigstack nss, oss;
int error = 0, unss;
if (uap->nss != NULL) {
error = copyin(uap->nss, &s32, sizeof(s32));
if (error)
return (error);
nss.ss_sp = PTRIN(s32.ss_sp);
CP(s32, nss, ss_onstack);
unss = 1;
} else {
unss = 0;
}
oss.ss_sp = td->td_sigstk.ss_sp;
oss.ss_onstack = sigonstack(cpu_getstack(td));
if (unss) {
td->td_sigstk.ss_sp = nss.ss_sp;
td->td_sigstk.ss_size = 0;
td->td_sigstk.ss_flags |= (nss.ss_onstack & SS_ONSTACK);
td->td_pflags |= TDP_ALTSTACK;
}
if (uap->oss != NULL) {
s32.ss_sp = PTROUT(oss.ss_sp);
CP(oss, s32, ss_onstack);
error = copyout(&s32, uap->oss, sizeof(s32));
}
return (error);
}
#endif
int
freebsd32_nanosleep(struct thread *td, struct freebsd32_nanosleep_args *uap)
{
struct timespec32 rmt32, rqt32;
struct timespec rmt, rqt;
int error;
error = copyin(uap->rqtp, &rqt32, sizeof(rqt32));
if (error)
return (error);
CP(rqt32, rqt, tv_sec);
CP(rqt32, rqt, tv_nsec);
if (uap->rmtp &&
!useracc((caddr_t)uap->rmtp, sizeof(rmt), VM_PROT_WRITE))
return (EFAULT);
error = kern_nanosleep(td, &rqt, &rmt);
if (error && uap->rmtp) {
int error2;
CP(rmt, rmt32, tv_sec);
CP(rmt, rmt32, tv_nsec);
error2 = copyout(&rmt32, uap->rmtp, sizeof(rmt32));
if (error2)
error = error2;
}
return (error);
}
int
freebsd32_clock_gettime(struct thread *td,
struct freebsd32_clock_gettime_args *uap)
{
struct timespec ats;
struct timespec32 ats32;
int error;
error = kern_clock_gettime(td, uap->clock_id, &ats);
if (error == 0) {
CP(ats, ats32, tv_sec);
CP(ats, ats32, tv_nsec);
error = copyout(&ats32, uap->tp, sizeof(ats32));
}
return (error);
}
int
freebsd32_clock_settime(struct thread *td,
struct freebsd32_clock_settime_args *uap)
{
struct timespec ats;
struct timespec32 ats32;
int error;
error = copyin(uap->tp, &ats32, sizeof(ats32));
if (error)
return (error);
CP(ats32, ats, tv_sec);
CP(ats32, ats, tv_nsec);
return (kern_clock_settime(td, uap->clock_id, &ats));
}
int
freebsd32_clock_getres(struct thread *td,
struct freebsd32_clock_getres_args *uap)
{
struct timespec ts;
struct timespec32 ts32;
int error;
if (uap->tp == NULL)
return (0);
error = kern_clock_getres(td, uap->clock_id, &ts);
if (error == 0) {
CP(ts, ts32, tv_sec);
CP(ts, ts32, tv_nsec);
error = copyout(&ts32, uap->tp, sizeof(ts32));
}
return (error);
}
int
freebsd32_thr_new(struct thread *td,
struct freebsd32_thr_new_args *uap)
{
struct thr_param32 param32;
struct thr_param param;
int error;
if (uap->param_size < 0 ||
uap->param_size > sizeof(struct thr_param32))
return (EINVAL);
bzero(&param, sizeof(struct thr_param));
bzero(&param32, sizeof(struct thr_param32));
error = copyin(uap->param, &param32, uap->param_size);
if (error != 0)
return (error);
param.start_func = PTRIN(param32.start_func);
param.arg = PTRIN(param32.arg);
param.stack_base = PTRIN(param32.stack_base);
param.stack_size = param32.stack_size;
param.tls_base = PTRIN(param32.tls_base);
param.tls_size = param32.tls_size;
param.child_tid = PTRIN(param32.child_tid);
param.parent_tid = PTRIN(param32.parent_tid);
param.flags = param32.flags;
param.rtp = PTRIN(param32.rtp);
param.spare[0] = PTRIN(param32.spare[0]);
param.spare[1] = PTRIN(param32.spare[1]);
param.spare[2] = PTRIN(param32.spare[2]);
return (kern_thr_new(td, &param));
}
int
freebsd32_thr_suspend(struct thread *td, struct freebsd32_thr_suspend_args *uap)
{
struct timespec32 ts32;
struct timespec ts, *tsp;
int error;
error = 0;
tsp = NULL;
if (uap->timeout != NULL) {
error = copyin((const void *)uap->timeout, (void *)&ts32,
sizeof(struct timespec32));
if (error != 0)
return (error);
ts.tv_sec = ts32.tv_sec;
ts.tv_nsec = ts32.tv_nsec;
tsp = &ts;
}
return (kern_thr_suspend(td, tsp));
}
void
siginfo_to_siginfo32(const siginfo_t *src, struct siginfo32 *dst)
{
bzero(dst, sizeof(*dst));
dst->si_signo = src->si_signo;
dst->si_errno = src->si_errno;
dst->si_code = src->si_code;
dst->si_pid = src->si_pid;
dst->si_uid = src->si_uid;
dst->si_status = src->si_status;
dst->si_addr = (uintptr_t)src->si_addr;
dst->si_value.sigval_int = src->si_value.sival_int;
dst->si_timerid = src->si_timerid;
dst->si_overrun = src->si_overrun;
}
int
freebsd32_sigtimedwait(struct thread *td, struct freebsd32_sigtimedwait_args *uap)
{
struct timespec32 ts32;
struct timespec ts;
struct timespec *timeout;
sigset_t set;
ksiginfo_t ksi;
struct siginfo32 si32;
int error;
if (uap->timeout) {
error = copyin(uap->timeout, &ts32, sizeof(ts32));
if (error)
return (error);
ts.tv_sec = ts32.tv_sec;
ts.tv_nsec = ts32.tv_nsec;
timeout = &ts;
} else
timeout = NULL;
error = copyin(uap->set, &set, sizeof(set));
if (error)
return (error);
error = kern_sigtimedwait(td, set, &ksi, timeout);
if (error)
return (error);
if (uap->info) {
siginfo_to_siginfo32(&ksi.ksi_info, &si32);
error = copyout(&si32, uap->info, sizeof(struct siginfo32));
}
if (error == 0)
td->td_retval[0] = ksi.ksi_signo;
return (error);
}
/*
* MPSAFE
*/
int
freebsd32_sigwaitinfo(struct thread *td, struct freebsd32_sigwaitinfo_args *uap)
{
ksiginfo_t ksi;
struct siginfo32 si32;
sigset_t set;
int error;
error = copyin(uap->set, &set, sizeof(set));
if (error)
return (error);
error = kern_sigtimedwait(td, set, &ksi, NULL);
if (error)
return (error);
if (uap->info) {
siginfo_to_siginfo32(&ksi.ksi_info, &si32);
error = copyout(&si32, uap->info, sizeof(struct siginfo32));
}
if (error == 0)
td->td_retval[0] = ksi.ksi_signo;
return (error);
}
int
freebsd32_cpuset_setid(struct thread *td,
struct freebsd32_cpuset_setid_args *uap)
{
struct cpuset_setid_args ap;
ap.which = uap->which;
ap.id = PAIR32TO64(id_t,uap->id);
ap.setid = uap->setid;
- return (cpuset_setid(td, &ap));
+ return (sys_cpuset_setid(td, &ap));
}
int
freebsd32_cpuset_getid(struct thread *td,
struct freebsd32_cpuset_getid_args *uap)
{
struct cpuset_getid_args ap;
ap.level = uap->level;
ap.which = uap->which;
ap.id = PAIR32TO64(id_t,uap->id);
ap.setid = uap->setid;
- return (cpuset_getid(td, &ap));
+ return (sys_cpuset_getid(td, &ap));
}
int
freebsd32_cpuset_getaffinity(struct thread *td,
struct freebsd32_cpuset_getaffinity_args *uap)
{
struct cpuset_getaffinity_args ap;
ap.level = uap->level;
ap.which = uap->which;
ap.id = PAIR32TO64(id_t,uap->id);
ap.cpusetsize = uap->cpusetsize;
ap.mask = uap->mask;
- return (cpuset_getaffinity(td, &ap));
+ return (sys_cpuset_getaffinity(td, &ap));
}
int
freebsd32_cpuset_setaffinity(struct thread *td,
struct freebsd32_cpuset_setaffinity_args *uap)
{
struct cpuset_setaffinity_args ap;
ap.level = uap->level;
ap.which = uap->which;
ap.id = PAIR32TO64(id_t,uap->id);
ap.cpusetsize = uap->cpusetsize;
ap.mask = uap->mask;
- return (cpuset_setaffinity(td, &ap));
+ return (sys_cpuset_setaffinity(td, &ap));
}
int
freebsd32_nmount(struct thread *td,
struct freebsd32_nmount_args /* {
struct iovec *iovp;
unsigned int iovcnt;
int flags;
} */ *uap)
{
struct uio *auio;
int error;
AUDIT_ARG_FFLAGS(uap->flags);
/*
* Filter out MNT_ROOTFS. We do not want clients of nmount() in
* userspace to set this flag, but we must filter it out if we want
* MNT_UPDATE on the root file system to work.
* MNT_ROOTFS should only be set by the kernel when mounting its
* root file system.
*/
uap->flags &= ~MNT_ROOTFS;
/*
* Check that we have an even number of iovecs
* and that we have at least two options.
*/
if ((uap->iovcnt & 1) || (uap->iovcnt < 4))
return (EINVAL);
error = freebsd32_copyinuio(uap->iovp, uap->iovcnt, &auio);
if (error)
return (error);
error = vfs_donmount(td, uap->flags, auio);
free(auio, M_IOV);
return error;
}
#if 0
int
freebsd32_xxx(struct thread *td, struct freebsd32_xxx_args *uap)
{
struct yyy32 *p32, s32;
struct yyy *p = NULL, s;
struct xxx_arg ap;
int error;
if (uap->zzz) {
error = copyin(uap->zzz, &s32, sizeof(s32));
if (error)
return (error);
/* translate in */
p = &s;
}
error = kern_xxx(td, p);
if (error)
return (error);
if (uap->zzz) {
/* translate out */
error = copyout(&s32, p32, sizeof(s32));
}
return (error);
}
#endif
int
syscall32_register(int *offset, struct sysent *new_sysent,
struct sysent *old_sysent)
{
if (*offset == NO_SYSCALL) {
int i;
for (i = 1; i < SYS_MAXSYSCALL; ++i)
if (freebsd32_sysent[i].sy_call ==
(sy_call_t *)lkmnosys)
break;
if (i == SYS_MAXSYSCALL)
return (ENFILE);
*offset = i;
} else if (*offset < 0 || *offset >= SYS_MAXSYSCALL)
return (EINVAL);
else if (freebsd32_sysent[*offset].sy_call != (sy_call_t *)lkmnosys &&
freebsd32_sysent[*offset].sy_call != (sy_call_t *)lkmressys)
return (EEXIST);
*old_sysent = freebsd32_sysent[*offset];
freebsd32_sysent[*offset] = *new_sysent;
return 0;
}
int
syscall32_deregister(int *offset, struct sysent *old_sysent)
{
if (*offset)
freebsd32_sysent[*offset] = *old_sysent;
return 0;
}
int
syscall32_module_handler(struct module *mod, int what, void *arg)
{
struct syscall_module_data *data = (struct syscall_module_data*)arg;
modspecific_t ms;
int error;
switch (what) {
case MOD_LOAD:
error = syscall32_register(data->offset, data->new_sysent,
&data->old_sysent);
if (error) {
/* Leave a mark so we know to safely unload below. */
data->offset = NULL;
return error;
}
ms.intval = *data->offset;
MOD_XLOCK;
module_setspecific(mod, &ms);
MOD_XUNLOCK;
if (data->chainevh)
error = data->chainevh(mod, what, data->chainarg);
return (error);
case MOD_UNLOAD:
/*
* MOD_LOAD failed, so just return without calling the
* chained handler since we didn't pass along the MOD_LOAD
* event.
*/
if (data->offset == NULL)
return (0);
if (data->chainevh) {
error = data->chainevh(mod, what, data->chainarg);
if (error)
return (error);
}
error = syscall32_deregister(data->offset, &data->old_sysent);
return (error);
default:
error = EOPNOTSUPP;
if (data->chainevh)
error = data->chainevh(mod, what, data->chainarg);
return (error);
}
}
int
syscall32_helper_register(struct syscall_helper_data *sd)
{
struct syscall_helper_data *sd1;
int error;
for (sd1 = sd; sd1->syscall_no != NO_SYSCALL; sd1++) {
error = syscall32_register(&sd1->syscall_no, &sd1->new_sysent,
&sd1->old_sysent);
if (error != 0) {
syscall32_helper_unregister(sd);
return (error);
}
sd1->registered = 1;
}
return (0);
}
int
syscall32_helper_unregister(struct syscall_helper_data *sd)
{
struct syscall_helper_data *sd1;
for (sd1 = sd; sd1->registered != 0; sd1++) {
syscall32_deregister(&sd1->syscall_no, &sd1->old_sysent);
sd1->registered = 0;
}
return (0);
}
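/*
 * Illustrative sketch, not part of this change: a module would normally
 * describe its 32-bit entry points in a syscall_helper_data table
 * terminated by SYSCALL_INIT_LAST and hand it to
 * syscall32_helper_register() at load time.  The syscall name below is
 * made up for illustration.
 */
#if 0
static struct syscall_helper_data example32_syscalls[] = {
	SYSCALL32_INIT_HELPER(freebsd32_example),
	SYSCALL_INIT_LAST
};

static int
example32_modinit(void)
{

	return (syscall32_helper_register(example32_syscalls));
}
#endif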
register_t *
freebsd32_copyout_strings(struct image_params *imgp)
{
int argc, envc, i;
u_int32_t *vectp;
char *stringp, *destp;
u_int32_t *stack_base;
struct freebsd32_ps_strings *arginfo;
char canary[sizeof(long) * 8];
int32_t pagesizes32[MAXPAGESIZES];
size_t execpath_len;
int szsigcode;
/*
* Calculate string base and vector table pointers.
* Also deal with signal trampoline code for this exec type.
*/
if (imgp->execpath != NULL && imgp->auxargs != NULL)
execpath_len = strlen(imgp->execpath) + 1;
else
execpath_len = 0;
arginfo = (struct freebsd32_ps_strings *)curproc->p_sysent->
sv_psstrings;
if (imgp->proc->p_sysent->sv_sigcode_base == 0)
szsigcode = *(imgp->proc->p_sysent->sv_szsigcode);
else
szsigcode = 0;
destp = (caddr_t)arginfo - szsigcode - SPARE_USRSPACE -
roundup(execpath_len, sizeof(char *)) -
roundup(sizeof(canary), sizeof(char *)) -
roundup(sizeof(pagesizes32), sizeof(char *)) -
roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *));
/*
* install sigcode
*/
if (szsigcode != 0)
copyout(imgp->proc->p_sysent->sv_sigcode,
((caddr_t)arginfo - szsigcode), szsigcode);
/*
* Copy the image path for the rtld.
*/
if (execpath_len != 0) {
imgp->execpathp = (uintptr_t)arginfo - szsigcode - execpath_len;
copyout(imgp->execpath, (void *)imgp->execpathp,
execpath_len);
}
/*
* Prepare the canary for SSP.
*/
arc4rand(canary, sizeof(canary), 0);
imgp->canary = (uintptr_t)arginfo - szsigcode - execpath_len -
sizeof(canary);
copyout(canary, (void *)imgp->canary, sizeof(canary));
imgp->canarylen = sizeof(canary);
/*
* Prepare the pagesizes array.
*/
for (i = 0; i < MAXPAGESIZES; i++)
pagesizes32[i] = (uint32_t)pagesizes[i];
imgp->pagesizes = (uintptr_t)arginfo - szsigcode - execpath_len -
roundup(sizeof(canary), sizeof(char *)) - sizeof(pagesizes32);
copyout(pagesizes32, (void *)imgp->pagesizes, sizeof(pagesizes32));
imgp->pagesizeslen = sizeof(pagesizes32);
/*
* If we have a valid auxargs ptr, prepare some room
* on the stack.
*/
if (imgp->auxargs) {
/*
* 'AT_COUNT*2' is the size of the ELF auxargs data. This is kept
* for backward compatibility.
*/
imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size
: (AT_COUNT * 2);
/*
* The '+ 2' is for the null pointers at the end of each of
* the arg and env vector sets, and imgp->auxarg_size is room
* for the arguments of the runtime loader.
*/
vectp = (u_int32_t *) (destp - (imgp->args->argc +
imgp->args->envc + 2 + imgp->auxarg_size + execpath_len) *
sizeof(u_int32_t));
} else
/*
* The '+ 2' is for the null pointers at the end of each of
* the arg and env vector sets
*/
vectp = (u_int32_t *)
(destp - (imgp->args->argc + imgp->args->envc + 2) * sizeof(u_int32_t));
/*
* vectp also becomes our initial stack base
*/
stack_base = vectp;
stringp = imgp->args->begin_argv;
argc = imgp->args->argc;
envc = imgp->args->envc;
/*
* Copy out strings - arguments and environment.
*/
copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
/*
* Fill in "ps_strings" struct for ps, w, etc.
*/
suword32(&arginfo->ps_argvstr, (u_int32_t)(intptr_t)vectp);
suword32(&arginfo->ps_nargvstr, argc);
/*
* Fill in argument portion of vector table.
*/
for (; argc > 0; --argc) {
suword32(vectp++, (u_int32_t)(intptr_t)destp);
while (*stringp++ != 0)
destp++;
destp++;
}
/* a null vector table pointer separates the argp's from the envp's */
suword32(vectp++, 0);
suword32(&arginfo->ps_envstr, (u_int32_t)(intptr_t)vectp);
suword32(&arginfo->ps_nenvstr, envc);
/*
* Fill in environment portion of vector table.
*/
for (; envc > 0; --envc) {
suword32(vectp++, (u_int32_t)(intptr_t)destp);
while (*stringp++ != 0)
destp++;
destp++;
}
/* end of vector table is a null pointer */
suword32(vectp, 0);
return ((register_t *)stack_base);
}
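/*
 * Illustrative layout note, derived from the code above: working down
 * from the 32-bit ps_strings area, freebsd32_copyout_strings() places
 * the signal trampoline (if any), the image path for rtld, the SSP
 * canary, the pagesizes32[] array, then the argument and environment
 * strings at destp, and finally the argv/envp vector table at vectp,
 * which becomes the new stack base returned to the caller.
 */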
int
freebsd32_kldstat(struct thread *td, struct freebsd32_kldstat_args *uap)
{
struct kld_file_stat stat;
struct kld32_file_stat stat32;
int error, version;
if ((error = copyin(&uap->stat->version, &version, sizeof(version)))
!= 0)
return (error);
if (version != sizeof(struct kld32_file_stat_1) &&
version != sizeof(struct kld32_file_stat))
return (EINVAL);
error = kern_kldstat(td, uap->fileid, &stat);
if (error != 0)
return (error);
bcopy(&stat.name[0], &stat32.name[0], sizeof(stat.name));
CP(stat, stat32, refs);
CP(stat, stat32, id);
PTROUT_CP(stat, stat32, address);
CP(stat, stat32, size);
bcopy(&stat.pathname[0], &stat32.pathname[0], sizeof(stat.pathname));
return (copyout(&stat32, uap->stat, version));
}
int
freebsd32_posix_fallocate(struct thread *td,
struct freebsd32_posix_fallocate_args *uap)
{
struct posix_fallocate_args ap;
ap.fd = uap->fd;
ap.offset = (uap->offsetlo | ((off_t)uap->offsethi << 32));
ap.len = (uap->lenlo | ((off_t)uap->lenhi << 32));
- return (posix_fallocate(td, &ap));
+ return (sys_posix_fallocate(td, &ap));
}
Index: head/sys/compat/freebsd32/freebsd32_util.h
===================================================================
--- head/sys/compat/freebsd32/freebsd32_util.h (revision 225616)
+++ head/sys/compat/freebsd32/freebsd32_util.h (revision 225617)
@@ -1,109 +1,118 @@
/*-
* Copyright (c) 1998-1999 Andrew Gallatin
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _COMPAT_FREEBSD32_FREEBSD32_UTIL_H_
#define _COMPAT_FREEBSD32_FREEBSD32_UTIL_H_
#include <sys/cdefs.h>
#include <sys/exec.h>
#include <sys/sysent.h>
#include <sys/uio.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
struct freebsd32_ps_strings {
u_int32_t ps_argvstr; /* first of 0 or more argument strings */
int ps_nargvstr; /* the number of argument strings */
u_int32_t ps_envstr; /* first of 0 or more environment strings */
int ps_nenvstr; /* the number of environment strings */
};
#if defined(__amd64__) || defined(__ia64__)
#include <compat/ia32/ia32_util.h>
#endif
#define FREEBSD32_PS_STRINGS \
(FREEBSD32_USRSTACK - sizeof(struct freebsd32_ps_strings))
extern struct sysent freebsd32_sysent[];
#define SYSCALL32_MODULE(name, offset, new_sysent, evh, arg) \
static struct syscall_module_data name##_syscall32_mod = { \
evh, arg, offset, new_sysent, { 0, NULL } \
}; \
\
static moduledata_t name##32_mod = { \
"sys32/" #name, \
syscall32_module_handler, \
&name##_syscall32_mod \
}; \
DECLARE_MODULE(name##32, name##32_mod, SI_SUB_SYSCALLS, SI_ORDER_MIDDLE)
#define SYSCALL32_MODULE_HELPER(syscallname) \
static int syscallname##_syscall32 = FREEBSD32_SYS_##syscallname; \
static struct sysent syscallname##_sysent32 = { \
(sizeof(struct syscallname ## _args ) \
/ sizeof(register_t)), \
(sy_call_t *)& syscallname \
}; \
SYSCALL32_MODULE(syscallname, \
& syscallname##_syscall32, & syscallname##_sysent32,\
NULL, NULL);
#define SYSCALL32_INIT_HELPER(syscallname) { \
.new_sysent = { \
.sy_narg = (sizeof(struct syscallname ## _args ) \
/ sizeof(register_t)), \
.sy_call = (sy_call_t *)& syscallname, \
}, \
.syscall_no = FREEBSD32_SYS_##syscallname \
}
+#define SYSCALL32_INIT_HELPER_COMPAT(syscallname) { \
+ .new_sysent = { \
+ .sy_narg = (sizeof(struct syscallname ## _args ) \
+ / sizeof(register_t)), \
+ .sy_call = (sy_call_t *)& sys_ ## syscallname, \
+ }, \
+ .syscall_no = FREEBSD32_SYS_##syscallname \
+}
+
int syscall32_register(int *offset, struct sysent *new_sysent,
struct sysent *old_sysent);
int syscall32_deregister(int *offset, struct sysent *old_sysent);
int syscall32_module_handler(struct module *mod, int what, void *arg);
int syscall32_helper_register(struct syscall_helper_data *sd);
int syscall32_helper_unregister(struct syscall_helper_data *sd);
struct iovec32;
struct rusage32;
register_t *freebsd32_copyout_strings(struct image_params *imgp);
int freebsd32_copyiniov(struct iovec32 *iovp, u_int iovcnt,
struct iovec **iov, int error);
void freebsd32_rusage_out(const struct rusage *s, struct rusage32 *s32);
struct image_args;
int freebsd32_exec_copyin_args(struct image_args *args, char *fname,
enum uio_seg segflg, u_int32_t *argv, u_int32_t *envv);
#endif /* !_COMPAT_FREEBSD32_FREEBSD32_UTIL_H_ */
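/*
 * Illustrative sketch, not part of this change: SYSCALL32_INIT_HELPER()
 * points a table entry at the handler named by its argument (typically
 * a freebsd32_*() wrapper), while the new SYSCALL32_INIT_HELPER_COMPAT()
 * resolves to the sys_-prefixed native handler, matching the sys_*
 * renames elsewhere in this revision.  The syscall names below are made
 * up for illustration.
 */
#if 0
static struct syscall_helper_data example32_syscalls[] = {
	SYSCALL32_INIT_HELPER(freebsd32_example),	/* 32-bit wrapper */
	SYSCALL32_INIT_HELPER_COMPAT(example),		/* native sys_example() */
	SYSCALL_INIT_LAST
};
#endif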
Index: head/sys/compat/linux/linux_emul.c
===================================================================
--- head/sys/compat/linux/linux_emul.c (revision 225616)
+++ head/sys/compat/linux/linux_emul.c (revision 225617)
@@ -1,372 +1,372 @@
/*-
* Copyright (c) 2006 Roman Divacky
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/imgact.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/sx.h>
#include <sys/proc.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/unistd.h>
#ifdef COMPAT_LINUX32
#include <machine/../linux32/linux.h>
#include <machine/../linux32/linux32_proto.h>
#else
#include <machine/../linux/linux.h>
#include <machine/../linux/linux_proto.h>
#endif
#include <compat/linux/linux_emul.h>
#include <compat/linux/linux_futex.h>
struct sx emul_shared_lock;
struct mtx emul_lock;
/* This returns a locked reference to the emuldata entry (if found). */
struct linux_emuldata *
em_find(struct proc *p, int locked)
{
struct linux_emuldata *em;
if (locked == EMUL_DOLOCK)
EMUL_LOCK(&emul_lock);
em = p->p_emuldata;
if (em == NULL && locked == EMUL_DOLOCK)
EMUL_UNLOCK(&emul_lock);
return (em);
}
int
linux_proc_init(struct thread *td, pid_t child, int flags)
{
struct linux_emuldata *em, *p_em;
struct proc *p;
if (child != 0) {
/* non-exec call */
em = malloc(sizeof *em, M_LINUX, M_WAITOK | M_ZERO);
em->pid = child;
em->pdeath_signal = 0;
em->flags = 0;
em->robust_futexes = NULL;
if (flags & LINUX_CLONE_THREAD) {
/* handled later in the code */
} else {
struct linux_emuldata_shared *s;
s = malloc(sizeof *s, M_LINUX, M_WAITOK | M_ZERO);
s->refs = 1;
s->group_pid = child;
LIST_INIT(&s->threads);
em->shared = s;
}
} else {
/* lookup the old one */
em = em_find(td->td_proc, EMUL_DOLOCK);
KASSERT(em != NULL, ("proc_init: emuldata not found in exec case.\n"));
}
em->child_clear_tid = NULL;
em->child_set_tid = NULL;
/*
* Allocate the shared struct only in the clone()/fork cases; in the
* clone() case td is the calling proc and child is the pid of the
* newly created proc.
*/
if (child != 0) {
if (flags & LINUX_CLONE_THREAD) {
/* lookup the parent */
/*
* We don't have to lock the p_em because
* it is waiting for us in linux_clone, so
* there is no chance of it changing the
* p_em->shared address.
*/
p_em = em_find(td->td_proc, EMUL_DONTLOCK);
KASSERT(p_em != NULL, ("proc_init: parent emuldata not found for CLONE_THREAD\n"));
em->shared = p_em->shared;
EMUL_SHARED_WLOCK(&emul_shared_lock);
em->shared->refs++;
EMUL_SHARED_WUNLOCK(&emul_shared_lock);
} else {
/*
* handled earlier to avoid malloc(M_WAITOK) with
* rwlock held
*/
}
}
if (child != 0) {
EMUL_SHARED_WLOCK(&emul_shared_lock);
LIST_INSERT_HEAD(&em->shared->threads, em, threads);
EMUL_SHARED_WUNLOCK(&emul_shared_lock);
p = pfind(child);
KASSERT(p != NULL, ("process not found in proc_init\n"));
p->p_emuldata = em;
PROC_UNLOCK(p);
} else
EMUL_UNLOCK(&emul_lock);
return (0);
}
void
linux_proc_exit(void *arg __unused, struct proc *p)
{
struct linux_emuldata *em;
int error, shared_flags, shared_xstat;
struct thread *td = FIRST_THREAD_IN_PROC(p);
int *child_clear_tid;
struct proc *q, *nq;
if (__predict_true(p->p_sysent != &elf_linux_sysvec))
return;
release_futexes(p);
/* find the emuldata */
em = em_find(p, EMUL_DOLOCK);
KASSERT(em != NULL, ("proc_exit: emuldata not found.\n"));
/* reparent all procs that are not a thread leader to initproc */
if (em->shared->group_pid != p->p_pid) {
child_clear_tid = em->child_clear_tid;
EMUL_UNLOCK(&emul_lock);
sx_xlock(&proctree_lock);
wakeup(initproc);
PROC_LOCK(p);
proc_reparent(p, initproc);
p->p_sigparent = SIGCHLD;
PROC_UNLOCK(p);
sx_xunlock(&proctree_lock);
} else {
child_clear_tid = em->child_clear_tid;
EMUL_UNLOCK(&emul_lock);
}
EMUL_SHARED_WLOCK(&emul_shared_lock);
shared_flags = em->shared->flags;
shared_xstat = em->shared->xstat;
LIST_REMOVE(em, threads);
em->shared->refs--;
if (em->shared->refs == 0) {
EMUL_SHARED_WUNLOCK(&emul_shared_lock);
free(em->shared, M_LINUX);
} else
EMUL_SHARED_WUNLOCK(&emul_shared_lock);
if ((shared_flags & EMUL_SHARED_HASXSTAT) != 0)
p->p_xstat = shared_xstat;
if (child_clear_tid != NULL) {
struct linux_sys_futex_args cup;
int null = 0;
error = copyout(&null, child_clear_tid, sizeof(null));
if (error) {
free(em, M_LINUX);
return;
}
/* futexes stuff */
cup.uaddr = child_clear_tid;
cup.op = LINUX_FUTEX_WAKE;
cup.val = 0x7fffffff; /* Awake everyone */
cup.timeout = NULL;
cup.uaddr2 = NULL;
cup.val3 = 0;
error = linux_sys_futex(FIRST_THREAD_IN_PROC(p), &cup);
/*
* This cannot happen at the moment, and if it does happen it
* probably means there is a user-space bug.
*/
if (error)
printf(LMSG("futex stuff in proc_exit failed.\n"));
}
/* clean the stuff up */
free(em, M_LINUX);
/* this is a little weird but rewritten from exit1() */
sx_xlock(&proctree_lock);
q = LIST_FIRST(&p->p_children);
for (; q != NULL; q = nq) {
nq = LIST_NEXT(q, p_sibling);
if (q->p_flag & P_WEXIT)
continue;
if (__predict_false(q->p_sysent != &elf_linux_sysvec))
continue;
em = em_find(q, EMUL_DOLOCK);
KASSERT(em != NULL, ("linux_reparent: emuldata not found: %i\n", q->p_pid));
PROC_LOCK(q);
if ((q->p_flag & P_WEXIT) == 0 && em->pdeath_signal != 0) {
- psignal(q, em->pdeath_signal);
+ kern_psignal(q, em->pdeath_signal);
}
PROC_UNLOCK(q);
EMUL_UNLOCK(&emul_lock);
}
sx_xunlock(&proctree_lock);
}
/*
* This is used in the case of a transition from a FreeBSD binary exec'ing a Linux
* binary; in this case we create a Linux emuldata proc entry with the pid of the
* currently running process.
*/
void
linux_proc_exec(void *arg __unused, struct proc *p, struct image_params *imgp)
{
if (__predict_false(imgp->sysent == &elf_linux_sysvec
&& p->p_sysent != &elf_linux_sysvec))
linux_proc_init(FIRST_THREAD_IN_PROC(p), p->p_pid, 0);
if (__predict_false((p->p_sysent->sv_flags & SV_ABI_MASK) ==
SV_ABI_LINUX))
/* Kill threads regardless of imgp->sysent value */
linux_kill_threads(FIRST_THREAD_IN_PROC(p), SIGKILL);
if (__predict_false(imgp->sysent != &elf_linux_sysvec
&& p->p_sysent == &elf_linux_sysvec)) {
struct linux_emuldata *em;
/*
* XXX: There's a race here because we set p->p_emuldata to NULL,
* but the process is still counted as a Linux one for a short
* time, so some other process might reference it, try to access
* its p->p_emuldata, and panic on a NULL dereference.
*/
em = em_find(p, EMUL_DONTLOCK);
KASSERT(em != NULL, ("proc_exec: emuldata not found.\n"));
EMUL_SHARED_WLOCK(&emul_shared_lock);
LIST_REMOVE(em, threads);
PROC_LOCK(p);
p->p_emuldata = NULL;
PROC_UNLOCK(p);
em->shared->refs--;
if (em->shared->refs == 0) {
EMUL_SHARED_WUNLOCK(&emul_shared_lock);
free(em->shared, M_LINUX);
} else
EMUL_SHARED_WUNLOCK(&emul_shared_lock);
free(em, M_LINUX);
}
}
void
linux_schedtail(struct thread *td)
{
struct linux_emuldata *em;
struct proc *p;
int error = 0;
int *child_set_tid;
p = td->td_proc;
/* find the emuldata */
em = em_find(p, EMUL_DOLOCK);
KASSERT(em != NULL, ("linux_schedtail: emuldata not found.\n"));
child_set_tid = em->child_set_tid;
EMUL_UNLOCK(&emul_lock);
if (child_set_tid != NULL)
error = copyout(&p->p_pid, (int *)child_set_tid,
sizeof(p->p_pid));
return;
}
int
linux_set_tid_address(struct thread *td, struct linux_set_tid_address_args *args)
{
struct linux_emuldata *em;
#ifdef DEBUG
if (ldebug(set_tid_address))
printf(ARGS(set_tid_address, "%p"), args->tidptr);
#endif
/* find the emuldata */
em = em_find(td->td_proc, EMUL_DOLOCK);
KASSERT(em != NULL, ("set_tid_address: emuldata not found.\n"));
em->child_clear_tid = args->tidptr;
td->td_retval[0] = td->td_proc->p_pid;
EMUL_UNLOCK(&emul_lock);
return 0;
}
void
linux_kill_threads(struct thread *td, int sig)
{
struct linux_emuldata *em, *td_em, *tmp_em;
struct proc *sp;
td_em = em_find(td->td_proc, EMUL_DONTLOCK);
KASSERT(td_em != NULL, ("linux_kill_threads: emuldata not found.\n"));
EMUL_SHARED_RLOCK(&emul_shared_lock);
LIST_FOREACH_SAFE(em, &td_em->shared->threads, threads, tmp_em) {
if (em->pid == td_em->pid)
continue;
sp = pfind(em->pid);
if ((sp->p_flag & P_WEXIT) == 0)
- psignal(sp, sig);
+ kern_psignal(sp, sig);
PROC_UNLOCK(sp);
#ifdef DEBUG
printf(LMSG("linux_kill_threads: kill PID %d\n"), em->pid);
#endif
}
EMUL_SHARED_RUNLOCK(&emul_shared_lock);
}
Index: head/sys/compat/linux/linux_file.c
===================================================================
--- head/sys/compat/linux/linux_file.c (revision 225616)
+++ head/sys/compat/linux/linux_file.c (revision 225617)
@@ -1,1532 +1,1532 @@
/*-
* Copyright (c) 1994-1995 Søren Schmidt
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capability.h>
#include <sys/conf.h>
#include <sys/dirent.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>
#include <sys/tty.h>
#include <sys/unistd.h>
#include <sys/vnode.h>
#include <security/mac/mac_framework.h>
#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/ufsmount.h>
#ifdef COMPAT_LINUX32
#include <machine/../linux32/linux.h>
#include <machine/../linux32/linux32_proto.h>
#else
#include <machine/../linux/linux.h>
#include <machine/../linux/linux_proto.h>
#endif
#include <compat/linux/linux_util.h>
#include <compat/linux/linux_file.h>
int
linux_creat(struct thread *td, struct linux_creat_args *args)
{
char *path;
int error;
LCONVPATHEXIST(td, args->path, &path);
#ifdef DEBUG
if (ldebug(creat))
printf(ARGS(creat, "%s, %d"), path, args->mode);
#endif
error = kern_open(td, path, UIO_SYSSPACE, O_WRONLY | O_CREAT | O_TRUNC,
args->mode);
LFREEPATH(path);
return (error);
}
static int
linux_common_open(struct thread *td, int dirfd, char *path, int l_flags, int mode)
{
struct proc *p = td->td_proc;
struct file *fp;
int fd;
int bsd_flags, error;
bsd_flags = 0;
switch (l_flags & LINUX_O_ACCMODE) {
case LINUX_O_WRONLY:
bsd_flags |= O_WRONLY;
break;
case LINUX_O_RDWR:
bsd_flags |= O_RDWR;
break;
default:
bsd_flags |= O_RDONLY;
}
if (l_flags & LINUX_O_NDELAY)
bsd_flags |= O_NONBLOCK;
if (l_flags & LINUX_O_APPEND)
bsd_flags |= O_APPEND;
if (l_flags & LINUX_O_SYNC)
bsd_flags |= O_FSYNC;
if (l_flags & LINUX_O_NONBLOCK)
bsd_flags |= O_NONBLOCK;
if (l_flags & LINUX_FASYNC)
bsd_flags |= O_ASYNC;
if (l_flags & LINUX_O_CREAT)
bsd_flags |= O_CREAT;
if (l_flags & LINUX_O_TRUNC)
bsd_flags |= O_TRUNC;
if (l_flags & LINUX_O_EXCL)
bsd_flags |= O_EXCL;
if (l_flags & LINUX_O_NOCTTY)
bsd_flags |= O_NOCTTY;
if (l_flags & LINUX_O_DIRECT)
bsd_flags |= O_DIRECT;
if (l_flags & LINUX_O_NOFOLLOW)
bsd_flags |= O_NOFOLLOW;
if (l_flags & LINUX_O_DIRECTORY)
bsd_flags |= O_DIRECTORY;
/* XXX LINUX_O_NOATIME: cannot easily be implemented. */
error = kern_openat(td, dirfd, path, UIO_SYSSPACE, bsd_flags, mode);
if (!error) {
fd = td->td_retval[0];
/*
* XXX In between kern_open() and fget(), another process
* sharing the same filedesc could use that fd, bypassing the
* checks below.
*/
error = fget(td, fd, CAP_IOCTL, &fp);
if (!error) {
sx_slock(&proctree_lock);
PROC_LOCK(p);
if (!(bsd_flags & O_NOCTTY) &&
SESS_LEADER(p) && !(p->p_flag & P_CONTROLT)) {
PROC_UNLOCK(p);
sx_unlock(&proctree_lock);
if (fp->f_type == DTYPE_VNODE)
(void) fo_ioctl(fp, TIOCSCTTY, (caddr_t) 0,
td->td_ucred, td);
} else {
PROC_UNLOCK(p);
sx_sunlock(&proctree_lock);
}
fdrop(fp, td);
/*
* XXX as above, fdrop()/kern_close() pair is racy.
*/
if (error)
kern_close(td, fd);
}
}
#ifdef DEBUG
if (ldebug(open))
printf(LMSG("open returns error %d"), error);
#endif
LFREEPATH(path);
return (error);
}
int
linux_openat(struct thread *td, struct linux_openat_args *args)
{
char *path;
int dfd;
dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
if (args->flags & LINUX_O_CREAT)
LCONVPATH_AT(td, args->filename, &path, 1, dfd);
else
LCONVPATH_AT(td, args->filename, &path, 0, dfd);
#ifdef DEBUG
if (ldebug(openat))
printf(ARGS(openat, "%i, %s, 0x%x, 0x%x"), args->dfd,
path, args->flags, args->mode);
#endif
return (linux_common_open(td, dfd, path, args->flags, args->mode));
}
int
linux_open(struct thread *td, struct linux_open_args *args)
{
char *path;
if (args->flags & LINUX_O_CREAT)
LCONVPATHCREAT(td, args->path, &path);
else
LCONVPATHEXIST(td, args->path, &path);
#ifdef DEBUG
if (ldebug(open))
printf(ARGS(open, "%s, 0x%x, 0x%x"),
path, args->flags, args->mode);
#endif
return (linux_common_open(td, AT_FDCWD, path, args->flags, args->mode));
}
int
linux_lseek(struct thread *td, struct linux_lseek_args *args)
{
struct lseek_args /* {
int fd;
int pad;
off_t offset;
int whence;
} */ tmp_args;
int error;
#ifdef DEBUG
if (ldebug(lseek))
printf(ARGS(lseek, "%d, %ld, %d"),
args->fdes, (long)args->off, args->whence);
#endif
tmp_args.fd = args->fdes;
tmp_args.offset = (off_t)args->off;
tmp_args.whence = args->whence;
- error = lseek(td, &tmp_args);
+ error = sys_lseek(td, &tmp_args);
return error;
}
int
linux_llseek(struct thread *td, struct linux_llseek_args *args)
{
struct lseek_args bsd_args;
int error;
off_t off;
#ifdef DEBUG
if (ldebug(llseek))
printf(ARGS(llseek, "%d, %d:%d, %d"),
args->fd, args->ohigh, args->olow, args->whence);
#endif
off = (args->olow) | (((off_t) args->ohigh) << 32);
bsd_args.fd = args->fd;
bsd_args.offset = off;
bsd_args.whence = args->whence;
- if ((error = lseek(td, &bsd_args)))
+ if ((error = sys_lseek(td, &bsd_args)))
return error;
if ((error = copyout(td->td_retval, args->res, sizeof (off_t))))
return error;
td->td_retval[0] = 0;
return 0;
}
int
linux_readdir(struct thread *td, struct linux_readdir_args *args)
{
struct linux_getdents_args lda;
lda.fd = args->fd;
lda.dent = args->dent;
lda.count = 1;
return linux_getdents(td, &lda);
}
/*
* Note that linux_getdents(2) and linux_getdents64(2) have the same
* arguments. They only differ in the definition of struct dirent they
* operate on. We use this to share the code, with the exception of
* accessing struct dirent. Note that linux_readdir(2) is implemented
* by means of linux_getdents(2). In this case we never operate on
* struct dirent64 and thus don't need to handle it...
*/
struct l_dirent {
l_ulong d_ino;
l_off_t d_off;
l_ushort d_reclen;
char d_name[LINUX_NAME_MAX + 1];
};
struct l_dirent64 {
uint64_t d_ino;
int64_t d_off;
l_ushort d_reclen;
u_char d_type;
char d_name[LINUX_NAME_MAX + 1];
};
/*
* Linux uses the last byte in the dirent buffer to store d_type,
* at least glibc-2.7 requires it. That is why l_dirent is padded with 2 bytes.
*/
#define LINUX_RECLEN(namlen) \
roundup((offsetof(struct l_dirent, d_name) + (namlen) + 2), \
sizeof(l_ulong))
#define LINUX_RECLEN64(namlen) \
roundup((offsetof(struct l_dirent64, d_name) + (namlen) + 1), \
sizeof(uint64_t))
#define LINUX_MAXRECLEN max(LINUX_RECLEN(LINUX_NAME_MAX), \
LINUX_RECLEN64(LINUX_NAME_MAX))
#define LINUX_DIRBLKSIZ 512
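/*
 * Worked example, illustrative only and assuming the 32-bit layout
 * (4-byte l_ulong and l_off_t, 2-byte l_ushort, so d_name starts at
 * offset 10): LINUX_RECLEN(5) = roundup(10 + 5 + 2, 4) = 20, leaving
 * room for the name, its terminating NUL, and the final byte of the
 * record that getdents_common() later fills with d_type.
 */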
static int
getdents_common(struct thread *td, struct linux_getdents64_args *args,
int is64bit)
{
struct dirent *bdp;
struct vnode *vp;
caddr_t inp, buf; /* BSD-format */
int len, reclen; /* BSD-format */
caddr_t outp; /* Linux-format */
int resid, linuxreclen=0; /* Linux-format */
caddr_t lbuf; /* Linux-format */
struct file *fp;
struct uio auio;
struct iovec aiov;
off_t off;
struct l_dirent *linux_dirent;
struct l_dirent64 *linux_dirent64;
int buflen, error, eofflag, nbytes, justone;
u_long *cookies = NULL, *cookiep;
int ncookies, vfslocked;
nbytes = args->count;
if (nbytes == 1) {
/* readdir(2) case. Always struct dirent. */
if (is64bit)
return (EINVAL);
nbytes = sizeof(*linux_dirent);
justone = 1;
} else
justone = 0;
if ((error = getvnode(td->td_proc->p_fd, args->fd, CAP_READ, &fp)) != 0)
return (error);
if ((fp->f_flag & FREAD) == 0) {
fdrop(fp, td);
return (EBADF);
}
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
if (vp->v_type != VDIR) {
VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
return (EINVAL);
}
off = fp->f_offset;
buflen = max(LINUX_DIRBLKSIZ, nbytes);
buflen = min(buflen, MAXBSIZE);
buf = malloc(buflen, M_TEMP, M_WAITOK);
lbuf = malloc(LINUX_MAXRECLEN, M_TEMP, M_WAITOK | M_ZERO);
vn_lock(vp, LK_SHARED | LK_RETRY);
aiov.iov_base = buf;
aiov.iov_len = buflen;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_rw = UIO_READ;
auio.uio_segflg = UIO_SYSSPACE;
auio.uio_td = td;
auio.uio_resid = buflen;
auio.uio_offset = off;
if (cookies) {
free(cookies, M_TEMP);
cookies = NULL;
}
#ifdef MAC
/*
* Do directory search MAC check using non-cached credentials.
*/
if ((error = mac_vnode_check_readdir(td->td_ucred, vp)))
goto out;
#endif /* MAC */
if ((error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, &ncookies,
&cookies)))
goto out;
inp = buf;
outp = (caddr_t)args->dirent;
resid = nbytes;
if ((len = buflen - auio.uio_resid) <= 0)
goto eof;
cookiep = cookies;
if (cookies) {
/*
* When using cookies, the vfs has the option of reading from
* a different offset than that supplied (UFS truncates the
* offset to a block boundary to make sure that it never reads
* partway through a directory entry, even if the directory
* has been compacted).
*/
while (len > 0 && ncookies > 0 && *cookiep <= off) {
bdp = (struct dirent *) inp;
len -= bdp->d_reclen;
inp += bdp->d_reclen;
cookiep++;
ncookies--;
}
}
while (len > 0) {
if (cookiep && ncookies == 0)
break;
bdp = (struct dirent *) inp;
reclen = bdp->d_reclen;
if (reclen & 3) {
error = EFAULT;
goto out;
}
if (bdp->d_fileno == 0) {
inp += reclen;
if (cookiep) {
off = *cookiep++;
ncookies--;
} else
off += reclen;
len -= reclen;
continue;
}
linuxreclen = (is64bit)
? LINUX_RECLEN64(bdp->d_namlen)
: LINUX_RECLEN(bdp->d_namlen);
if (reclen > len || resid < linuxreclen) {
outp++;
break;
}
if (justone) {
/* readdir(2) case. */
linux_dirent = (struct l_dirent*)lbuf;
linux_dirent->d_ino = bdp->d_fileno;
linux_dirent->d_off = (l_off_t)linuxreclen;
linux_dirent->d_reclen = (l_ushort)bdp->d_namlen;
strlcpy(linux_dirent->d_name, bdp->d_name,
linuxreclen - offsetof(struct l_dirent, d_name));
error = copyout(linux_dirent, outp, linuxreclen);
}
if (is64bit) {
linux_dirent64 = (struct l_dirent64*)lbuf;
linux_dirent64->d_ino = bdp->d_fileno;
linux_dirent64->d_off = (cookiep)
? (l_off_t)*cookiep
: (l_off_t)(off + reclen);
linux_dirent64->d_reclen = (l_ushort)linuxreclen;
linux_dirent64->d_type = bdp->d_type;
strlcpy(linux_dirent64->d_name, bdp->d_name,
linuxreclen - offsetof(struct l_dirent64, d_name));
error = copyout(linux_dirent64, outp, linuxreclen);
} else if (!justone) {
linux_dirent = (struct l_dirent*)lbuf;
linux_dirent->d_ino = bdp->d_fileno;
linux_dirent->d_off = (cookiep)
? (l_off_t)*cookiep
: (l_off_t)(off + reclen);
linux_dirent->d_reclen = (l_ushort)linuxreclen;
/*
* Copy d_type to the last byte of the l_dirent buffer.
*/
lbuf[linuxreclen-1] = bdp->d_type;
strlcpy(linux_dirent->d_name, bdp->d_name,
linuxreclen - offsetof(struct l_dirent, d_name)-1);
error = copyout(linux_dirent, outp, linuxreclen);
}
if (error)
goto out;
inp += reclen;
if (cookiep) {
off = *cookiep++;
ncookies--;
} else
off += reclen;
outp += linuxreclen;
resid -= linuxreclen;
len -= reclen;
if (justone)
break;
}
if (outp == (caddr_t)args->dirent) {
nbytes = resid;
goto eof;
}
fp->f_offset = off;
if (justone)
nbytes = resid + linuxreclen;
eof:
td->td_retval[0] = nbytes - resid;
out:
if (cookies)
free(cookies, M_TEMP);
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
free(buf, M_TEMP);
free(lbuf, M_TEMP);
return (error);
}
int
linux_getdents(struct thread *td, struct linux_getdents_args *args)
{
#ifdef DEBUG
if (ldebug(getdents))
printf(ARGS(getdents, "%d, *, %d"), args->fd, args->count);
#endif
return (getdents_common(td, (struct linux_getdents64_args*)args, 0));
}
int
linux_getdents64(struct thread *td, struct linux_getdents64_args *args)
{
#ifdef DEBUG
if (ldebug(getdents64))
printf(ARGS(getdents64, "%d, *, %d"), args->fd, args->count);
#endif
return (getdents_common(td, args, 1));
}
/*
* These exist mainly for hooks for doing /compat/linux translation.
*/
int
linux_access(struct thread *td, struct linux_access_args *args)
{
char *path;
int error;
/* linux convention */
if (args->flags & ~(F_OK | X_OK | W_OK | R_OK))
return (EINVAL);
LCONVPATHEXIST(td, args->path, &path);
#ifdef DEBUG
if (ldebug(access))
printf(ARGS(access, "%s, %d"), path, args->flags);
#endif
error = kern_access(td, path, UIO_SYSSPACE, args->flags);
LFREEPATH(path);
return (error);
}
int
linux_faccessat(struct thread *td, struct linux_faccessat_args *args)
{
char *path;
int error, dfd;
/* linux convention */
if (args->mode & ~(F_OK | X_OK | W_OK | R_OK))
return (EINVAL);
dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
LCONVPATHEXIST_AT(td, args->filename, &path, dfd);
#ifdef DEBUG
if (ldebug(access))
printf(ARGS(access, "%s, %d"), path, args->mode);
#endif
error = kern_accessat(td, dfd, path, UIO_SYSSPACE, 0 /* XXX */,
args->mode);
LFREEPATH(path);
return (error);
}
int
linux_unlink(struct thread *td, struct linux_unlink_args *args)
{
char *path;
int error;
struct stat st;
LCONVPATHEXIST(td, args->path, &path);
#ifdef DEBUG
if (ldebug(unlink))
printf(ARGS(unlink, "%s"), path);
#endif
error = kern_unlink(td, path, UIO_SYSSPACE);
if (error == EPERM)
/* Introduce POSIX noncompliant behaviour of Linux */
if (kern_stat(td, path, UIO_SYSSPACE, &st) == 0)
if (S_ISDIR(st.st_mode))
error = EISDIR;
LFREEPATH(path);
return (error);
}
int
linux_unlinkat(struct thread *td, struct linux_unlinkat_args *args)
{
char *path;
int error, dfd;
struct stat st;
if (args->flag & ~LINUX_AT_REMOVEDIR)
return (EINVAL);
dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
LCONVPATHEXIST_AT(td, args->pathname, &path, dfd);
#ifdef DEBUG
if (ldebug(unlinkat))
printf(ARGS(unlinkat, "%s"), path);
#endif
if (args->flag & LINUX_AT_REMOVEDIR)
error = kern_rmdirat(td, dfd, path, UIO_SYSSPACE);
else
error = kern_unlinkat(td, dfd, path, UIO_SYSSPACE, 0);
if (error == EPERM && !(args->flag & LINUX_AT_REMOVEDIR)) {
/* Introduce POSIX noncompliant behaviour of Linux */
if (kern_statat(td, AT_SYMLINK_NOFOLLOW, dfd, path,
UIO_SYSSPACE, &st) == 0 && S_ISDIR(st.st_mode))
error = EISDIR;
}
LFREEPATH(path);
return (error);
}
int
linux_chdir(struct thread *td, struct linux_chdir_args *args)
{
char *path;
int error;
LCONVPATHEXIST(td, args->path, &path);
#ifdef DEBUG
if (ldebug(chdir))
printf(ARGS(chdir, "%s"), path);
#endif
error = kern_chdir(td, path, UIO_SYSSPACE);
LFREEPATH(path);
return (error);
}
int
linux_chmod(struct thread *td, struct linux_chmod_args *args)
{
char *path;
int error;
LCONVPATHEXIST(td, args->path, &path);
#ifdef DEBUG
if (ldebug(chmod))
printf(ARGS(chmod, "%s, %d"), path, args->mode);
#endif
error = kern_chmod(td, path, UIO_SYSSPACE, args->mode);
LFREEPATH(path);
return (error);
}
int
linux_fchmodat(struct thread *td, struct linux_fchmodat_args *args)
{
char *path;
int error, dfd;
dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
LCONVPATHEXIST_AT(td, args->filename, &path, dfd);
#ifdef DEBUG
if (ldebug(fchmodat))
printf(ARGS(fchmodat, "%s, %d"), path, args->mode);
#endif
error = kern_fchmodat(td, dfd, path, UIO_SYSSPACE, args->mode, 0);
LFREEPATH(path);
return (error);
}
int
linux_mkdir(struct thread *td, struct linux_mkdir_args *args)
{
char *path;
int error;
LCONVPATHCREAT(td, args->path, &path);
#ifdef DEBUG
if (ldebug(mkdir))
printf(ARGS(mkdir, "%s, %d"), path, args->mode);
#endif
error = kern_mkdir(td, path, UIO_SYSSPACE, args->mode);
LFREEPATH(path);
return (error);
}
int
linux_mkdirat(struct thread *td, struct linux_mkdirat_args *args)
{
char *path;
int error, dfd;
dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
LCONVPATHCREAT_AT(td, args->pathname, &path, dfd);
#ifdef DEBUG
if (ldebug(mkdirat))
printf(ARGS(mkdirat, "%s, %d"), path, args->mode);
#endif
error = kern_mkdirat(td, dfd, path, UIO_SYSSPACE, args->mode);
LFREEPATH(path);
return (error);
}
int
linux_rmdir(struct thread *td, struct linux_rmdir_args *args)
{
char *path;
int error;
LCONVPATHEXIST(td, args->path, &path);
#ifdef DEBUG
if (ldebug(rmdir))
printf(ARGS(rmdir, "%s"), path);
#endif
error = kern_rmdir(td, path, UIO_SYSSPACE);
LFREEPATH(path);
return (error);
}
int
linux_rename(struct thread *td, struct linux_rename_args *args)
{
char *from, *to;
int error;
LCONVPATHEXIST(td, args->from, &from);
/* Expand LCONVPATHCREATE so that `from' can be freed on errors */
error = linux_emul_convpath(td, args->to, UIO_USERSPACE, &to, 1, AT_FDCWD);
if (to == NULL) {
LFREEPATH(from);
return (error);
}
#ifdef DEBUG
if (ldebug(rename))
printf(ARGS(rename, "%s, %s"), from, to);
#endif
error = kern_rename(td, from, to, UIO_SYSSPACE);
LFREEPATH(from);
LFREEPATH(to);
return (error);
}
int
linux_renameat(struct thread *td, struct linux_renameat_args *args)
{
char *from, *to;
int error, olddfd, newdfd;
olddfd = (args->olddfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->olddfd;
newdfd = (args->newdfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->newdfd;
LCONVPATHEXIST_AT(td, args->oldname, &from, olddfd);
/* Expand LCONVPATHCREATE so that `from' can be freed on errors */
error = linux_emul_convpath(td, args->newname, UIO_USERSPACE, &to, 1, newdfd);
if (to == NULL) {
LFREEPATH(from);
return (error);
}
#ifdef DEBUG
if (ldebug(renameat))
printf(ARGS(renameat, "%s, %s"), from, to);
#endif
error = kern_renameat(td, olddfd, from, newdfd, to, UIO_SYSSPACE);
LFREEPATH(from);
LFREEPATH(to);
return (error);
}
int
linux_symlink(struct thread *td, struct linux_symlink_args *args)
{
char *path, *to;
int error;
LCONVPATHEXIST(td, args->path, &path);
/* Expand LCONVPATHCREATE so that `path' can be freed on errors */
error = linux_emul_convpath(td, args->to, UIO_USERSPACE, &to, 1, AT_FDCWD);
if (to == NULL) {
LFREEPATH(path);
return (error);
}
#ifdef DEBUG
if (ldebug(symlink))
printf(ARGS(symlink, "%s, %s"), path, to);
#endif
error = kern_symlink(td, path, to, UIO_SYSSPACE);
LFREEPATH(path);
LFREEPATH(to);
return (error);
}
int
linux_symlinkat(struct thread *td, struct linux_symlinkat_args *args)
{
char *path, *to;
int error, dfd;
dfd = (args->newdfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->newdfd;
LCONVPATHEXIST_AT(td, args->oldname, &path, dfd);
/* Expand LCONVPATHCREATE so that `path' can be freed on errors */
error = linux_emul_convpath(td, args->newname, UIO_USERSPACE, &to, 1, dfd);
if (to == NULL) {
LFREEPATH(path);
return (error);
}
#ifdef DEBUG
if (ldebug(symlinkat))
printf(ARGS(symlinkat, "%s, %s"), path, to);
#endif
error = kern_symlinkat(td, path, dfd, to, UIO_SYSSPACE);
LFREEPATH(path);
LFREEPATH(to);
return (error);
}
int
linux_readlink(struct thread *td, struct linux_readlink_args *args)
{
char *name;
int error;
LCONVPATHEXIST(td, args->name, &name);
#ifdef DEBUG
if (ldebug(readlink))
printf(ARGS(readlink, "%s, %p, %d"), name, (void *)args->buf,
args->count);
#endif
error = kern_readlink(td, name, UIO_SYSSPACE, args->buf, UIO_USERSPACE,
args->count);
LFREEPATH(name);
return (error);
}
int
linux_readlinkat(struct thread *td, struct linux_readlinkat_args *args)
{
char *name;
int error, dfd;
dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
LCONVPATHEXIST_AT(td, args->path, &name, dfd);
#ifdef DEBUG
if (ldebug(readlinkat))
printf(ARGS(readlinkat, "%s, %p, %d"), name, (void *)args->buf,
args->bufsiz);
#endif
error = kern_readlinkat(td, dfd, name, UIO_SYSSPACE, args->buf,
UIO_USERSPACE, args->bufsiz);
LFREEPATH(name);
return (error);
}
int
linux_truncate(struct thread *td, struct linux_truncate_args *args)
{
char *path;
int error;
LCONVPATHEXIST(td, args->path, &path);
#ifdef DEBUG
if (ldebug(truncate))
printf(ARGS(truncate, "%s, %ld"), path, (long)args->length);
#endif
error = kern_truncate(td, path, UIO_SYSSPACE, args->length);
LFREEPATH(path);
return (error);
}
int
linux_truncate64(struct thread *td, struct linux_truncate64_args *args)
{
char *path;
int error;
LCONVPATHEXIST(td, args->path, &path);
#ifdef DEBUG
if (ldebug(truncate64))
printf(ARGS(truncate64, "%s, %jd"), path, args->length);
#endif
error = kern_truncate(td, path, UIO_SYSSPACE, args->length);
LFREEPATH(path);
return (error);
}
int
linux_ftruncate(struct thread *td, struct linux_ftruncate_args *args)
{
struct ftruncate_args /* {
int fd;
int pad;
off_t length;
} */ nuap;
nuap.fd = args->fd;
nuap.length = args->length;
- return (ftruncate(td, &nuap));
+ return (sys_ftruncate(td, &nuap));
}
int
linux_link(struct thread *td, struct linux_link_args *args)
{
char *path, *to;
int error;
LCONVPATHEXIST(td, args->path, &path);
/* Expand LCONVPATHCREATE so that `path' can be freed on errors */
error = linux_emul_convpath(td, args->to, UIO_USERSPACE, &to, 1, AT_FDCWD);
if (to == NULL) {
LFREEPATH(path);
return (error);
}
#ifdef DEBUG
if (ldebug(link))
printf(ARGS(link, "%s, %s"), path, to);
#endif
error = kern_link(td, path, to, UIO_SYSSPACE);
LFREEPATH(path);
LFREEPATH(to);
return (error);
}
int
linux_linkat(struct thread *td, struct linux_linkat_args *args)
{
char *path, *to;
int error, olddfd, newdfd;
/*
* Linux defines a flags argument for linkat(2), but no flag values
* are accepted here; anything non-zero is rejected.
*/
if (args->flags != 0)
return (EINVAL);
olddfd = (args->olddfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->olddfd;
newdfd = (args->newdfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->newdfd;
LCONVPATHEXIST_AT(td, args->oldname, &path, olddfd);
/* Expand LCONVPATHCREATE so that `path' can be freed on errors */
error = linux_emul_convpath(td, args->newname, UIO_USERSPACE, &to, 1, newdfd);
if (to == NULL) {
LFREEPATH(path);
return (error);
}
#ifdef DEBUG
if (ldebug(linkat))
printf(ARGS(linkat, "%i, %s, %i, %s, %i"), args->olddfd, path,
args->newdfd, to, args->flags);
#endif
error = kern_linkat(td, olddfd, newdfd, path, to, UIO_SYSSPACE, FOLLOW);
LFREEPATH(path);
LFREEPATH(to);
return (error);
}
int
linux_fdatasync(td, uap)
struct thread *td;
struct linux_fdatasync_args *uap;
{
struct fsync_args bsd;
bsd.fd = uap->fd;
- return fsync(td, &bsd);
+ return sys_fsync(td, &bsd);
}
int
linux_pread(td, uap)
struct thread *td;
struct linux_pread_args *uap;
{
struct pread_args bsd;
struct vnode *vp;
int error;
bsd.fd = uap->fd;
bsd.buf = uap->buf;
bsd.nbyte = uap->nbyte;
bsd.offset = uap->offset;
- error = pread(td, &bsd);
+ error = sys_pread(td, &bsd);
if (error == 0) {
/* This seems to violate POSIX but linux does it */
if ((error = fgetvp(td, uap->fd, CAP_READ, &vp)) != 0)
return (error);
if (vp->v_type == VDIR) {
vrele(vp);
return (EISDIR);
}
vrele(vp);
}
return (error);
}
int
linux_pwrite(td, uap)
struct thread *td;
struct linux_pwrite_args *uap;
{
struct pwrite_args bsd;
bsd.fd = uap->fd;
bsd.buf = uap->buf;
bsd.nbyte = uap->nbyte;
bsd.offset = uap->offset;
- return pwrite(td, &bsd);
+ return sys_pwrite(td, &bsd);
}
int
linux_mount(struct thread *td, struct linux_mount_args *args)
{
struct ufs_args ufs;
char fstypename[MFSNAMELEN];
char mntonname[MNAMELEN], mntfromname[MNAMELEN];
int error;
int fsflags;
void *fsdata;
error = copyinstr(args->filesystemtype, fstypename, MFSNAMELEN - 1,
NULL);
if (error)
return (error);
error = copyinstr(args->specialfile, mntfromname, MNAMELEN - 1, NULL);
if (error)
return (error);
error = copyinstr(args->dir, mntonname, MNAMELEN - 1, NULL);
if (error)
return (error);
#ifdef DEBUG
if (ldebug(mount))
printf(ARGS(mount, "%s, %s, %s"),
fstypename, mntfromname, mntonname);
#endif
if (strcmp(fstypename, "ext2") == 0) {
strcpy(fstypename, "ext2fs");
fsdata = &ufs;
ufs.fspec = mntfromname;
#define DEFAULT_ROOTID -2
ufs.export.ex_root = DEFAULT_ROOTID;
ufs.export.ex_flags =
args->rwflag & LINUX_MS_RDONLY ? MNT_EXRDONLY : 0;
} else if (strcmp(fstypename, "proc") == 0) {
strcpy(fstypename, "linprocfs");
fsdata = NULL;
} else if (strcmp(fstypename, "vfat") == 0) {
strcpy(fstypename, "msdosfs");
fsdata = NULL;
} else {
return (ENODEV);
}
fsflags = 0;
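/*
 * Linux mount(2) callers traditionally pass the MS_MGC_VAL magic
 * (0xc0ed0000) in the upper 16 bits of the flags word; only interpret
 * the flag bits when that magic is present.
 */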
if ((args->rwflag & 0xffff0000) == 0xc0ed0000) {
/*
* Linux SYNC flag is not included; the closest equivalent
* FreeBSD has is !ASYNC, which is our default.
*/
if (args->rwflag & LINUX_MS_RDONLY)
fsflags |= MNT_RDONLY;
if (args->rwflag & LINUX_MS_NOSUID)
fsflags |= MNT_NOSUID;
if (args->rwflag & LINUX_MS_NOEXEC)
fsflags |= MNT_NOEXEC;
if (args->rwflag & LINUX_MS_REMOUNT)
fsflags |= MNT_UPDATE;
}
if (strcmp(fstypename, "linprocfs") == 0) {
error = kernel_vmount(fsflags,
"fstype", fstypename,
"fspath", mntonname,
NULL);
} else if (strcmp(fstypename, "msdosfs") == 0) {
error = kernel_vmount(fsflags,
"fstype", fstypename,
"fspath", mntonname,
"from", mntfromname,
NULL);
} else
error = EOPNOTSUPP;
return (error);
}
int
linux_oldumount(struct thread *td, struct linux_oldumount_args *args)
{
struct linux_umount_args args2;
args2.path = args->path;
args2.flags = 0;
return (linux_umount(td, &args2));
}
int
linux_umount(struct thread *td, struct linux_umount_args *args)
{
struct unmount_args bsd;
bsd.path = args->path;
bsd.flags = args->flags; /* XXX correct? */
- return (unmount(td, &bsd));
+ return (sys_unmount(td, &bsd));
}
/*
* fcntl family of syscalls
*/
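/*
 * The Linux and FreeBSD flock structures differ in field sizes and
 * ordering, and under COMPAT_LINUX32 the Linux variant is packed to
 * match the 32-bit i386 layout; the helpers below translate between
 * the two representations.
 */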
struct l_flock {
l_short l_type;
l_short l_whence;
l_off_t l_start;
l_off_t l_len;
l_pid_t l_pid;
}
#if defined(__amd64__) && defined(COMPAT_LINUX32)
__packed
#endif
;
static void
linux_to_bsd_flock(struct l_flock *linux_flock, struct flock *bsd_flock)
{
switch (linux_flock->l_type) {
case LINUX_F_RDLCK:
bsd_flock->l_type = F_RDLCK;
break;
case LINUX_F_WRLCK:
bsd_flock->l_type = F_WRLCK;
break;
case LINUX_F_UNLCK:
bsd_flock->l_type = F_UNLCK;
break;
default:
bsd_flock->l_type = -1;
break;
}
bsd_flock->l_whence = linux_flock->l_whence;
bsd_flock->l_start = (off_t)linux_flock->l_start;
bsd_flock->l_len = (off_t)linux_flock->l_len;
bsd_flock->l_pid = (pid_t)linux_flock->l_pid;
bsd_flock->l_sysid = 0;
}
static void
bsd_to_linux_flock(struct flock *bsd_flock, struct l_flock *linux_flock)
{
switch (bsd_flock->l_type) {
case F_RDLCK:
linux_flock->l_type = LINUX_F_RDLCK;
break;
case F_WRLCK:
linux_flock->l_type = LINUX_F_WRLCK;
break;
case F_UNLCK:
linux_flock->l_type = LINUX_F_UNLCK;
break;
}
linux_flock->l_whence = bsd_flock->l_whence;
linux_flock->l_start = (l_off_t)bsd_flock->l_start;
linux_flock->l_len = (l_off_t)bsd_flock->l_len;
linux_flock->l_pid = (l_pid_t)bsd_flock->l_pid;
}
#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
struct l_flock64 {
l_short l_type;
l_short l_whence;
l_loff_t l_start;
l_loff_t l_len;
l_pid_t l_pid;
}
#if defined(__amd64__) && defined(COMPAT_LINUX32)
__packed
#endif
;
static void
linux_to_bsd_flock64(struct l_flock64 *linux_flock, struct flock *bsd_flock)
{
switch (linux_flock->l_type) {
case LINUX_F_RDLCK:
bsd_flock->l_type = F_RDLCK;
break;
case LINUX_F_WRLCK:
bsd_flock->l_type = F_WRLCK;
break;
case LINUX_F_UNLCK:
bsd_flock->l_type = F_UNLCK;
break;
default:
bsd_flock->l_type = -1;
break;
}
bsd_flock->l_whence = linux_flock->l_whence;
bsd_flock->l_start = (off_t)linux_flock->l_start;
bsd_flock->l_len = (off_t)linux_flock->l_len;
bsd_flock->l_pid = (pid_t)linux_flock->l_pid;
bsd_flock->l_sysid = 0;
}
static void
bsd_to_linux_flock64(struct flock *bsd_flock, struct l_flock64 *linux_flock)
{
switch (bsd_flock->l_type) {
case F_RDLCK:
linux_flock->l_type = LINUX_F_RDLCK;
break;
case F_WRLCK:
linux_flock->l_type = LINUX_F_WRLCK;
break;
case F_UNLCK:
linux_flock->l_type = LINUX_F_UNLCK;
break;
}
linux_flock->l_whence = bsd_flock->l_whence;
linux_flock->l_start = (l_loff_t)bsd_flock->l_start;
linux_flock->l_len = (l_loff_t)bsd_flock->l_len;
linux_flock->l_pid = (l_pid_t)bsd_flock->l_pid;
}
#endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
static int
fcntl_common(struct thread *td, struct linux_fcntl64_args *args)
{
struct l_flock linux_flock;
struct flock bsd_flock;
struct file *fp;
long arg;
int error, result;
switch (args->cmd) {
case LINUX_F_DUPFD:
return (kern_fcntl(td, args->fd, F_DUPFD, args->arg));
case LINUX_F_GETFD:
return (kern_fcntl(td, args->fd, F_GETFD, 0));
case LINUX_F_SETFD:
return (kern_fcntl(td, args->fd, F_SETFD, args->arg));
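/*
 * The file status flags use different bit values on Linux, so F_GETFL
 * results and F_SETFL arguments are translated flag by flag rather
 * than passed through.
 */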
case LINUX_F_GETFL:
error = kern_fcntl(td, args->fd, F_GETFL, 0);
result = td->td_retval[0];
td->td_retval[0] = 0;
if (result & O_RDONLY)
td->td_retval[0] |= LINUX_O_RDONLY;
if (result & O_WRONLY)
td->td_retval[0] |= LINUX_O_WRONLY;
if (result & O_RDWR)
td->td_retval[0] |= LINUX_O_RDWR;
if (result & O_NDELAY)
td->td_retval[0] |= LINUX_O_NONBLOCK;
if (result & O_APPEND)
td->td_retval[0] |= LINUX_O_APPEND;
if (result & O_FSYNC)
td->td_retval[0] |= LINUX_O_SYNC;
if (result & O_ASYNC)
td->td_retval[0] |= LINUX_FASYNC;
#ifdef LINUX_O_NOFOLLOW
if (result & O_NOFOLLOW)
td->td_retval[0] |= LINUX_O_NOFOLLOW;
#endif
#ifdef LINUX_O_DIRECT
if (result & O_DIRECT)
td->td_retval[0] |= LINUX_O_DIRECT;
#endif
return (error);
case LINUX_F_SETFL:
arg = 0;
if (args->arg & LINUX_O_NDELAY)
arg |= O_NONBLOCK;
if (args->arg & LINUX_O_APPEND)
arg |= O_APPEND;
if (args->arg & LINUX_O_SYNC)
arg |= O_FSYNC;
if (args->arg & LINUX_FASYNC)
arg |= O_ASYNC;
#ifdef LINUX_O_NOFOLLOW
if (args->arg & LINUX_O_NOFOLLOW)
arg |= O_NOFOLLOW;
#endif
#ifdef LINUX_O_DIRECT
if (args->arg & LINUX_O_DIRECT)
arg |= O_DIRECT;
#endif
return (kern_fcntl(td, args->fd, F_SETFL, arg));
case LINUX_F_GETLK:
error = copyin((void *)args->arg, &linux_flock,
sizeof(linux_flock));
if (error)
return (error);
linux_to_bsd_flock(&linux_flock, &bsd_flock);
error = kern_fcntl(td, args->fd, F_GETLK, (intptr_t)&bsd_flock);
if (error)
return (error);
bsd_to_linux_flock(&bsd_flock, &linux_flock);
return (copyout(&linux_flock, (void *)args->arg,
sizeof(linux_flock)));
case LINUX_F_SETLK:
error = copyin((void *)args->arg, &linux_flock,
sizeof(linux_flock));
if (error)
return (error);
linux_to_bsd_flock(&linux_flock, &bsd_flock);
return (kern_fcntl(td, args->fd, F_SETLK,
(intptr_t)&bsd_flock));
case LINUX_F_SETLKW:
error = copyin((void *)args->arg, &linux_flock,
sizeof(linux_flock));
if (error)
return (error);
linux_to_bsd_flock(&linux_flock, &bsd_flock);
return (kern_fcntl(td, args->fd, F_SETLKW,
(intptr_t)&bsd_flock));
case LINUX_F_GETOWN:
return (kern_fcntl(td, args->fd, F_GETOWN, 0));
case LINUX_F_SETOWN:
/*
* XXX some Linux applications depend on F_SETOWN having no
* significant effect for pipes (SIGIO is not delivered for
* pipes under Linux-2.2.35 at least).
*/
error = fget(td, args->fd, CAP_FCNTL, &fp);
if (error)
return (error);
if (fp->f_type == DTYPE_PIPE) {
fdrop(fp, td);
return (EINVAL);
}
fdrop(fp, td);
return (kern_fcntl(td, args->fd, F_SETOWN, args->arg));
}
return (EINVAL);
}
int
linux_fcntl(struct thread *td, struct linux_fcntl_args *args)
{
struct linux_fcntl64_args args64;
#ifdef DEBUG
if (ldebug(fcntl))
printf(ARGS(fcntl, "%d, %08x, *"), args->fd, args->cmd);
#endif
args64.fd = args->fd;
args64.cmd = args->cmd;
args64.arg = args->arg;
return (fcntl_common(td, &args64));
}
#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
int
linux_fcntl64(struct thread *td, struct linux_fcntl64_args *args)
{
struct l_flock64 linux_flock;
struct flock bsd_flock;
int error;
#ifdef DEBUG
if (ldebug(fcntl64))
printf(ARGS(fcntl64, "%d, %08x, *"), args->fd, args->cmd);
#endif
switch (args->cmd) {
case LINUX_F_GETLK64:
error = copyin((void *)args->arg, &linux_flock,
sizeof(linux_flock));
if (error)
return (error);
linux_to_bsd_flock64(&linux_flock, &bsd_flock);
error = kern_fcntl(td, args->fd, F_GETLK, (intptr_t)&bsd_flock);
if (error)
return (error);
bsd_to_linux_flock64(&bsd_flock, &linux_flock);
return (copyout(&linux_flock, (void *)args->arg,
sizeof(linux_flock)));
case LINUX_F_SETLK64:
error = copyin((void *)args->arg, &linux_flock,
sizeof(linux_flock));
if (error)
return (error);
linux_to_bsd_flock64(&linux_flock, &bsd_flock);
return (kern_fcntl(td, args->fd, F_SETLK,
(intptr_t)&bsd_flock));
case LINUX_F_SETLKW64:
error = copyin((void *)args->arg, &linux_flock,
sizeof(linux_flock));
if (error)
return (error);
linux_to_bsd_flock64(&linux_flock, &bsd_flock);
return (kern_fcntl(td, args->fd, F_SETLKW,
(intptr_t)&bsd_flock));
}
return (fcntl_common(td, args));
}
#endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
int
linux_chown(struct thread *td, struct linux_chown_args *args)
{
char *path;
int error;
LCONVPATHEXIST(td, args->path, &path);
#ifdef DEBUG
if (ldebug(chown))
printf(ARGS(chown, "%s, %d, %d"), path, args->uid, args->gid);
#endif
error = kern_chown(td, path, UIO_SYSSPACE, args->uid, args->gid);
LFREEPATH(path);
return (error);
}
int
linux_fchownat(struct thread *td, struct linux_fchownat_args *args)
{
char *path;
int error, dfd, follow;
if (args->flag & ~LINUX_AT_SYMLINK_NOFOLLOW)
return (EINVAL);
dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
LCONVPATHEXIST_AT(td, args->filename, &path, dfd);
#ifdef DEBUG
if (ldebug(fchownat))
printf(ARGS(fchownat, "%s, %d, %d"), path, args->uid, args->gid);
#endif
follow = (args->flag & LINUX_AT_SYMLINK_NOFOLLOW) == 0 ? 0 :
AT_SYMLINK_NOFOLLOW;
error = kern_fchownat(td, dfd, path, UIO_SYSSPACE, args->uid, args->gid,
follow);
LFREEPATH(path);
return (error);
}
int
linux_lchown(struct thread *td, struct linux_lchown_args *args)
{
char *path;
int error;
LCONVPATHEXIST(td, args->path, &path);
#ifdef DEBUG
if (ldebug(lchown))
printf(ARGS(lchown, "%s, %d, %d"), path, args->uid, args->gid);
#endif
error = kern_lchown(td, path, UIO_SYSSPACE, args->uid, args->gid);
LFREEPATH(path);
return (error);
}
Index: head/sys/compat/linux/linux_ioctl.c
===================================================================
--- head/sys/compat/linux/linux_ioctl.c (revision 225616)
+++ head/sys/compat/linux/linux_ioctl.c (revision 225617)
@@ -1,3531 +1,3531 @@
/*-
* Copyright (c) 1994-1995 Søren Schmidt
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "opt_compat.h"
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/capability.h>
#include <sys/cdio.h>
#include <sys/dvdio.h>
#include <sys/conf.h>
#include <sys/disk.h>
#include <sys/consio.h>
#include <sys/ctype.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/jail.h>
#include <sys/kbio.h>
#include <sys/kernel.h>
#include <sys/linker_set.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/sbuf.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/soundcard.h>
#include <sys/stdint.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/tty.h>
#include <sys/uio.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/resourcevar.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <net/if_types.h>
#include <net/vnet.h>
#include <dev/usb/usb_ioctl.h>
#ifdef COMPAT_LINUX32
#include <machine/../linux32/linux.h>
#include <machine/../linux32/linux32_proto.h>
#else
#include <machine/../linux/linux.h>
#include <machine/../linux/linux_proto.h>
#endif
#include <compat/linux/linux_ioctl.h>
#include <compat/linux/linux_mib.h>
#include <compat/linux/linux_socket.h>
#include <compat/linux/linux_util.h>
#include <compat/linux/linux_videodev.h>
#include <compat/linux/linux_videodev_compat.h>
#include <compat/linux/linux_videodev2.h>
#include <compat/linux/linux_videodev2_compat.h>
CTASSERT(LINUX_IFNAMSIZ == IFNAMSIZ);
FEATURE(linuxulator_v4l, "V4L ioctl wrapper support in the linuxulator");
FEATURE(linuxulator_v4l2, "V4L2 ioctl wrapper support in the linuxulator");
static linux_ioctl_function_t linux_ioctl_cdrom;
static linux_ioctl_function_t linux_ioctl_vfat;
static linux_ioctl_function_t linux_ioctl_console;
static linux_ioctl_function_t linux_ioctl_hdio;
static linux_ioctl_function_t linux_ioctl_disk;
static linux_ioctl_function_t linux_ioctl_socket;
static linux_ioctl_function_t linux_ioctl_sound;
static linux_ioctl_function_t linux_ioctl_termio;
static linux_ioctl_function_t linux_ioctl_private;
static linux_ioctl_function_t linux_ioctl_drm;
static linux_ioctl_function_t linux_ioctl_sg;
static linux_ioctl_function_t linux_ioctl_v4l;
static linux_ioctl_function_t linux_ioctl_v4l2;
static linux_ioctl_function_t linux_ioctl_special;
static linux_ioctl_function_t linux_ioctl_fbsd_usb;
static struct linux_ioctl_handler cdrom_handler =
{ linux_ioctl_cdrom, LINUX_IOCTL_CDROM_MIN, LINUX_IOCTL_CDROM_MAX };
static struct linux_ioctl_handler vfat_handler =
{ linux_ioctl_vfat, LINUX_IOCTL_VFAT_MIN, LINUX_IOCTL_VFAT_MAX };
static struct linux_ioctl_handler console_handler =
{ linux_ioctl_console, LINUX_IOCTL_CONSOLE_MIN, LINUX_IOCTL_CONSOLE_MAX };
static struct linux_ioctl_handler hdio_handler =
{ linux_ioctl_hdio, LINUX_IOCTL_HDIO_MIN, LINUX_IOCTL_HDIO_MAX };
static struct linux_ioctl_handler disk_handler =
{ linux_ioctl_disk, LINUX_IOCTL_DISK_MIN, LINUX_IOCTL_DISK_MAX };
static struct linux_ioctl_handler socket_handler =
{ linux_ioctl_socket, LINUX_IOCTL_SOCKET_MIN, LINUX_IOCTL_SOCKET_MAX };
static struct linux_ioctl_handler sound_handler =
{ linux_ioctl_sound, LINUX_IOCTL_SOUND_MIN, LINUX_IOCTL_SOUND_MAX };
static struct linux_ioctl_handler termio_handler =
{ linux_ioctl_termio, LINUX_IOCTL_TERMIO_MIN, LINUX_IOCTL_TERMIO_MAX };
static struct linux_ioctl_handler private_handler =
{ linux_ioctl_private, LINUX_IOCTL_PRIVATE_MIN, LINUX_IOCTL_PRIVATE_MAX };
static struct linux_ioctl_handler drm_handler =
{ linux_ioctl_drm, LINUX_IOCTL_DRM_MIN, LINUX_IOCTL_DRM_MAX };
static struct linux_ioctl_handler sg_handler =
{ linux_ioctl_sg, LINUX_IOCTL_SG_MIN, LINUX_IOCTL_SG_MAX };
static struct linux_ioctl_handler video_handler =
{ linux_ioctl_v4l, LINUX_IOCTL_VIDEO_MIN, LINUX_IOCTL_VIDEO_MAX };
static struct linux_ioctl_handler video2_handler =
{ linux_ioctl_v4l2, LINUX_IOCTL_VIDEO2_MIN, LINUX_IOCTL_VIDEO2_MAX };
static struct linux_ioctl_handler fbsd_usb =
{ linux_ioctl_fbsd_usb, FBSD_LUSB_MIN, FBSD_LUSB_MAX };
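/*
 * Each handler is registered in the linux_ioctl_handler_set linker set
 * together with the [min, max] range of Linux ioctl commands it
 * serves; incoming ioctls are dispatched to a handler whose range
 * contains the command.
 */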
DATA_SET(linux_ioctl_handler_set, cdrom_handler);
DATA_SET(linux_ioctl_handler_set, vfat_handler);
DATA_SET(linux_ioctl_handler_set, console_handler);
DATA_SET(linux_ioctl_handler_set, hdio_handler);
DATA_SET(linux_ioctl_handler_set, disk_handler);
DATA_SET(linux_ioctl_handler_set, socket_handler);
DATA_SET(linux_ioctl_handler_set, sound_handler);
DATA_SET(linux_ioctl_handler_set, termio_handler);
DATA_SET(linux_ioctl_handler_set, private_handler);
DATA_SET(linux_ioctl_handler_set, drm_handler);
DATA_SET(linux_ioctl_handler_set, sg_handler);
DATA_SET(linux_ioctl_handler_set, video_handler);
DATA_SET(linux_ioctl_handler_set, video2_handler);
DATA_SET(linux_ioctl_handler_set, fbsd_usb);
struct handler_element
{
TAILQ_ENTRY(handler_element) list;
int (*func)(struct thread *, struct linux_ioctl_args *);
int low, high, span;
};
static TAILQ_HEAD(, handler_element) handlers =
TAILQ_HEAD_INITIALIZER(handlers);
static struct sx linux_ioctl_sx;
SX_SYSINIT(linux_ioctl, &linux_ioctl_sx, "linux ioctl handlers");
/*
* hdio related ioctls for VMWare support
*/
struct linux_hd_geometry {
u_int8_t heads;
u_int8_t sectors;
u_int16_t cylinders;
u_int32_t start;
};
struct linux_hd_big_geometry {
u_int8_t heads;
u_int8_t sectors;
u_int32_t cylinders;
u_int32_t start;
};
static int
linux_ioctl_hdio(struct thread *td, struct linux_ioctl_args *args)
{
struct file *fp;
int error;
u_int sectorsize, fwcylinders, fwheads, fwsectors;
off_t mediasize, bytespercyl;
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0)
return (error);
switch (args->cmd & 0xffff) {
case LINUX_HDIO_GET_GEO:
case LINUX_HDIO_GET_GEO_BIG:
error = fo_ioctl(fp, DIOCGMEDIASIZE,
(caddr_t)&mediasize, td->td_ucred, td);
if (!error)
error = fo_ioctl(fp, DIOCGSECTORSIZE,
(caddr_t)&sectorsize, td->td_ucred, td);
if (!error)
error = fo_ioctl(fp, DIOCGFWHEADS,
(caddr_t)&fwheads, td->td_ucred, td);
if (!error)
error = fo_ioctl(fp, DIOCGFWSECTORS,
(caddr_t)&fwsectors, td->td_ucred, td);
/*
* XXX: DIOCGFIRSTOFFSET is not yet implemented, so
* pretend that GEOM always says 0. This is NOT VALID
* for slices or partitions, only the per-disk raw devices.
*/
fdrop(fp, td);
if (error)
return (error);
/*
* 1. Calculate the number of bytes in a cylinder,
* given the firmware's notion of heads and sectors
* per cylinder.
* 2. Calculate the number of cylinders, given the total
* size of the media.
* All internal calculations should have 64-bit precision.
*/
bytespercyl = (off_t) sectorsize * fwheads * fwsectors;
fwcylinders = mediasize / bytespercyl;
#if defined(DEBUG)
linux_msg(td, "HDIO_GET_GEO: mediasize %jd, c/h/s %d/%d/%d, "
"bpc %jd",
(intmax_t)mediasize, fwcylinders, fwheads, fwsectors,
(intmax_t)bytespercyl);
#endif
if ((args->cmd & 0xffff) == LINUX_HDIO_GET_GEO) {
struct linux_hd_geometry hdg;
hdg.cylinders = fwcylinders;
hdg.heads = fwheads;
hdg.sectors = fwsectors;
hdg.start = 0;
error = copyout(&hdg, (void *)args->arg, sizeof(hdg));
} else if ((args->cmd & 0xffff) == LINUX_HDIO_GET_GEO_BIG) {
struct linux_hd_big_geometry hdbg;
hdbg.cylinders = fwcylinders;
hdbg.heads = fwheads;
hdbg.sectors = fwsectors;
hdbg.start = 0;
error = copyout(&hdbg, (void *)args->arg, sizeof(hdbg));
}
return (error);
break;
default:
/* XXX */
linux_msg(td,
"ioctl fd=%d, cmd=0x%x ('%c',%d) is not implemented",
args->fd, (int)(args->cmd & 0xffff),
(int)(args->cmd & 0xff00) >> 8,
(int)(args->cmd & 0xff));
break;
}
fdrop(fp, td);
return (ENOIOCTL);
}
static int
linux_ioctl_disk(struct thread *td, struct linux_ioctl_args *args)
{
struct file *fp;
int error;
u_int sectorsize;
off_t mediasize;
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0)
return (error);
switch (args->cmd & 0xffff) {
case LINUX_BLKGETSIZE:
error = fo_ioctl(fp, DIOCGSECTORSIZE,
(caddr_t)&sectorsize, td->td_ucred, td);
if (!error)
error = fo_ioctl(fp, DIOCGMEDIASIZE,
(caddr_t)&mediasize, td->td_ucred, td);
fdrop(fp, td);
if (error)
return (error);
sectorsize = mediasize / sectorsize;
/*
* XXX: How do we know we return the right size of integer?
*/
return (copyout(&sectorsize, (void *)args->arg,
sizeof(sectorsize)));
break;
}
fdrop(fp, td);
return (ENOIOCTL);
}
/*
* termio related ioctls
*/
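/*
 * linux_termio is the legacy interface with 16-bit flag words and
 * LINUX_NCC control characters; linux_termios is the full interface
 * with 32-bit flags and LINUX_NCCS control characters.  Both are
 * converted by way of the native struct termios.
 */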
struct linux_termio {
unsigned short c_iflag;
unsigned short c_oflag;
unsigned short c_cflag;
unsigned short c_lflag;
unsigned char c_line;
unsigned char c_cc[LINUX_NCC];
};
struct linux_termios {
unsigned int c_iflag;
unsigned int c_oflag;
unsigned int c_cflag;
unsigned int c_lflag;
unsigned char c_line;
unsigned char c_cc[LINUX_NCCS];
};
struct linux_winsize {
unsigned short ws_row, ws_col;
unsigned short ws_xpixel, ws_ypixel;
};
struct speedtab {
int sp_speed; /* Speed. */
int sp_code; /* Code. */
};
static struct speedtab sptab[] = {
{ B0, LINUX_B0 }, { B50, LINUX_B50 },
{ B75, LINUX_B75 }, { B110, LINUX_B110 },
{ B134, LINUX_B134 }, { B150, LINUX_B150 },
{ B200, LINUX_B200 }, { B300, LINUX_B300 },
{ B600, LINUX_B600 }, { B1200, LINUX_B1200 },
{ B1800, LINUX_B1800 }, { B2400, LINUX_B2400 },
{ B4800, LINUX_B4800 }, { B9600, LINUX_B9600 },
{ B19200, LINUX_B19200 }, { B38400, LINUX_B38400 },
{ B57600, LINUX_B57600 }, { B115200, LINUX_B115200 },
{-1, -1 }
};
struct linux_serial_struct {
int type;
int line;
int port;
int irq;
int flags;
int xmit_fifo_size;
int custom_divisor;
int baud_base;
unsigned short close_delay;
char reserved_char[2];
int hub6;
unsigned short closing_wait;
unsigned short closing_wait2;
int reserved[4];
};
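/*
 * Translate between Linux baud-rate codes and native speed values via
 * the sptab table above; both lookups stop at the -1 sentinel and
 * return -1 when no match is found.
 */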
static int
linux_to_bsd_speed(int code, struct speedtab *table)
{
for ( ; table->sp_code != -1; table++)
if (table->sp_code == code)
return (table->sp_speed);
return -1;
}
static int
bsd_to_linux_speed(int speed, struct speedtab *table)
{
for ( ; table->sp_speed != -1; table++)
if (table->sp_speed == speed)
return (table->sp_code);
return -1;
}
static void
bsd_to_linux_termios(struct termios *bios, struct linux_termios *lios)
{
int i;
#ifdef DEBUG
if (ldebug(ioctl)) {
printf("LINUX: BSD termios structure (input):\n");
printf("i=%08x o=%08x c=%08x l=%08x ispeed=%d ospeed=%d\n",
bios->c_iflag, bios->c_oflag, bios->c_cflag, bios->c_lflag,
bios->c_ispeed, bios->c_ospeed);
printf("c_cc ");
for (i=0; i<NCCS; i++)
printf("%02x ", bios->c_cc[i]);
printf("\n");
}
#endif
lios->c_iflag = 0;
if (bios->c_iflag & IGNBRK)
lios->c_iflag |= LINUX_IGNBRK;
if (bios->c_iflag & BRKINT)
lios->c_iflag |= LINUX_BRKINT;
if (bios->c_iflag & IGNPAR)
lios->c_iflag |= LINUX_IGNPAR;
if (bios->c_iflag & PARMRK)
lios->c_iflag |= LINUX_PARMRK;
if (bios->c_iflag & INPCK)
lios->c_iflag |= LINUX_INPCK;
if (bios->c_iflag & ISTRIP)
lios->c_iflag |= LINUX_ISTRIP;
if (bios->c_iflag & INLCR)
lios->c_iflag |= LINUX_INLCR;
if (bios->c_iflag & IGNCR)
lios->c_iflag |= LINUX_IGNCR;
if (bios->c_iflag & ICRNL)
lios->c_iflag |= LINUX_ICRNL;
if (bios->c_iflag & IXON)
lios->c_iflag |= LINUX_IXON;
if (bios->c_iflag & IXANY)
lios->c_iflag |= LINUX_IXANY;
if (bios->c_iflag & IXOFF)
lios->c_iflag |= LINUX_IXOFF;
if (bios->c_iflag & IMAXBEL)
lios->c_iflag |= LINUX_IMAXBEL;
lios->c_oflag = 0;
if (bios->c_oflag & OPOST)
lios->c_oflag |= LINUX_OPOST;
if (bios->c_oflag & ONLCR)
lios->c_oflag |= LINUX_ONLCR;
if (bios->c_oflag & TAB3)
lios->c_oflag |= LINUX_XTABS;
lios->c_cflag = bsd_to_linux_speed(bios->c_ispeed, sptab);
lios->c_cflag |= (bios->c_cflag & CSIZE) >> 4;
if (bios->c_cflag & CSTOPB)
lios->c_cflag |= LINUX_CSTOPB;
if (bios->c_cflag & CREAD)
lios->c_cflag |= LINUX_CREAD;
if (bios->c_cflag & PARENB)
lios->c_cflag |= LINUX_PARENB;
if (bios->c_cflag & PARODD)
lios->c_cflag |= LINUX_PARODD;
if (bios->c_cflag & HUPCL)
lios->c_cflag |= LINUX_HUPCL;
if (bios->c_cflag & CLOCAL)
lios->c_cflag |= LINUX_CLOCAL;
if (bios->c_cflag & CRTSCTS)
lios->c_cflag |= LINUX_CRTSCTS;
lios->c_lflag = 0;
if (bios->c_lflag & ISIG)
lios->c_lflag |= LINUX_ISIG;
if (bios->c_lflag & ICANON)
lios->c_lflag |= LINUX_ICANON;
if (bios->c_lflag & ECHO)
lios->c_lflag |= LINUX_ECHO;
if (bios->c_lflag & ECHOE)
lios->c_lflag |= LINUX_ECHOE;
if (bios->c_lflag & ECHOK)
lios->c_lflag |= LINUX_ECHOK;
if (bios->c_lflag & ECHONL)
lios->c_lflag |= LINUX_ECHONL;
if (bios->c_lflag & NOFLSH)
lios->c_lflag |= LINUX_NOFLSH;
if (bios->c_lflag & TOSTOP)
lios->c_lflag |= LINUX_TOSTOP;
if (bios->c_lflag & ECHOCTL)
lios->c_lflag |= LINUX_ECHOCTL;
if (bios->c_lflag & ECHOPRT)
lios->c_lflag |= LINUX_ECHOPRT;
if (bios->c_lflag & ECHOKE)
lios->c_lflag |= LINUX_ECHOKE;
if (bios->c_lflag & FLUSHO)
lios->c_lflag |= LINUX_FLUSHO;
if (bios->c_lflag & PENDIN)
lios->c_lflag |= LINUX_PENDIN;
if (bios->c_lflag & IEXTEN)
lios->c_lflag |= LINUX_IEXTEN;
for (i=0; i<LINUX_NCCS; i++)
lios->c_cc[i] = LINUX_POSIX_VDISABLE;
lios->c_cc[LINUX_VINTR] = bios->c_cc[VINTR];
lios->c_cc[LINUX_VQUIT] = bios->c_cc[VQUIT];
lios->c_cc[LINUX_VERASE] = bios->c_cc[VERASE];
lios->c_cc[LINUX_VKILL] = bios->c_cc[VKILL];
lios->c_cc[LINUX_VEOF] = bios->c_cc[VEOF];
lios->c_cc[LINUX_VEOL] = bios->c_cc[VEOL];
lios->c_cc[LINUX_VMIN] = bios->c_cc[VMIN];
lios->c_cc[LINUX_VTIME] = bios->c_cc[VTIME];
lios->c_cc[LINUX_VEOL2] = bios->c_cc[VEOL2];
lios->c_cc[LINUX_VSUSP] = bios->c_cc[VSUSP];
lios->c_cc[LINUX_VSTART] = bios->c_cc[VSTART];
lios->c_cc[LINUX_VSTOP] = bios->c_cc[VSTOP];
lios->c_cc[LINUX_VREPRINT] = bios->c_cc[VREPRINT];
lios->c_cc[LINUX_VDISCARD] = bios->c_cc[VDISCARD];
lios->c_cc[LINUX_VWERASE] = bios->c_cc[VWERASE];
lios->c_cc[LINUX_VLNEXT] = bios->c_cc[VLNEXT];
for (i=0; i<LINUX_NCCS; i++) {
if (i != LINUX_VMIN && i != LINUX_VTIME &&
lios->c_cc[i] == _POSIX_VDISABLE)
lios->c_cc[i] = LINUX_POSIX_VDISABLE;
}
lios->c_line = 0;
#ifdef DEBUG
if (ldebug(ioctl)) {
printf("LINUX: LINUX termios structure (output):\n");
printf("i=%08x o=%08x c=%08x l=%08x line=%d\n",
lios->c_iflag, lios->c_oflag, lios->c_cflag,
lios->c_lflag, (int)lios->c_line);
printf("c_cc ");
for (i=0; i<LINUX_NCCS; i++)
printf("%02x ", lios->c_cc[i]);
printf("\n");
}
#endif
}
static void
linux_to_bsd_termios(struct linux_termios *lios, struct termios *bios)
{
int i;
#ifdef DEBUG
if (ldebug(ioctl)) {
printf("LINUX: LINUX termios structure (input):\n");
printf("i=%08x o=%08x c=%08x l=%08x line=%d\n",
lios->c_iflag, lios->c_oflag, lios->c_cflag,
lios->c_lflag, (int)lios->c_line);
printf("c_cc ");
for (i=0; i<LINUX_NCCS; i++)
printf("%02x ", lios->c_cc[i]);
printf("\n");
}
#endif
bios->c_iflag = 0;
if (lios->c_iflag & LINUX_IGNBRK)
bios->c_iflag |= IGNBRK;
if (lios->c_iflag & LINUX_BRKINT)
bios->c_iflag |= BRKINT;
if (lios->c_iflag & LINUX_IGNPAR)
bios->c_iflag |= IGNPAR;
if (lios->c_iflag & LINUX_PARMRK)
bios->c_iflag |= PARMRK;
if (lios->c_iflag & LINUX_INPCK)
bios->c_iflag |= INPCK;
if (lios->c_iflag & LINUX_ISTRIP)
bios->c_iflag |= ISTRIP;
if (lios->c_iflag & LINUX_INLCR)
bios->c_iflag |= INLCR;
if (lios->c_iflag & LINUX_IGNCR)
bios->c_iflag |= IGNCR;
if (lios->c_iflag & LINUX_ICRNL)
bios->c_iflag |= ICRNL;
if (lios->c_iflag & LINUX_IXON)
bios->c_iflag |= IXON;
if (lios->c_iflag & LINUX_IXANY)
bios->c_iflag |= IXANY;
if (lios->c_iflag & LINUX_IXOFF)
bios->c_iflag |= IXOFF;
if (lios->c_iflag & LINUX_IMAXBEL)
bios->c_iflag |= IMAXBEL;
bios->c_oflag = 0;
if (lios->c_oflag & LINUX_OPOST)
bios->c_oflag |= OPOST;
if (lios->c_oflag & LINUX_ONLCR)
bios->c_oflag |= ONLCR;
if (lios->c_oflag & LINUX_XTABS)
bios->c_oflag |= TAB3;
bios->c_cflag = (lios->c_cflag & LINUX_CSIZE) << 4;
if (lios->c_cflag & LINUX_CSTOPB)
bios->c_cflag |= CSTOPB;
if (lios->c_cflag & LINUX_CREAD)
bios->c_cflag |= CREAD;
if (lios->c_cflag & LINUX_PARENB)
bios->c_cflag |= PARENB;
if (lios->c_cflag & LINUX_PARODD)
bios->c_cflag |= PARODD;
if (lios->c_cflag & LINUX_HUPCL)
bios->c_cflag |= HUPCL;
if (lios->c_cflag & LINUX_CLOCAL)
bios->c_cflag |= CLOCAL;
if (lios->c_cflag & LINUX_CRTSCTS)
bios->c_cflag |= CRTSCTS;
bios->c_lflag = 0;
if (lios->c_lflag & LINUX_ISIG)
bios->c_lflag |= ISIG;
if (lios->c_lflag & LINUX_ICANON)
bios->c_lflag |= ICANON;
if (lios->c_lflag & LINUX_ECHO)
bios->c_lflag |= ECHO;
if (lios->c_lflag & LINUX_ECHOE)
bios->c_lflag |= ECHOE;
if (lios->c_lflag & LINUX_ECHOK)
bios->c_lflag |= ECHOK;
if (lios->c_lflag & LINUX_ECHONL)
bios->c_lflag |= ECHONL;
if (lios->c_lflag & LINUX_NOFLSH)
bios->c_lflag |= NOFLSH;
if (lios->c_lflag & LINUX_TOSTOP)
bios->c_lflag |= TOSTOP;
if (lios->c_lflag & LINUX_ECHOCTL)
bios->c_lflag |= ECHOCTL;
if (lios->c_lflag & LINUX_ECHOPRT)
bios->c_lflag |= ECHOPRT;
if (lios->c_lflag & LINUX_ECHOKE)
bios->c_lflag |= ECHOKE;
if (lios->c_lflag & LINUX_FLUSHO)
bios->c_lflag |= FLUSHO;
if (lios->c_lflag & LINUX_PENDIN)
bios->c_lflag |= PENDIN;
if (lios->c_lflag & LINUX_IEXTEN)
bios->c_lflag |= IEXTEN;
for (i=0; i<NCCS; i++)
bios->c_cc[i] = _POSIX_VDISABLE;
bios->c_cc[VINTR] = lios->c_cc[LINUX_VINTR];
bios->c_cc[VQUIT] = lios->c_cc[LINUX_VQUIT];
bios->c_cc[VERASE] = lios->c_cc[LINUX_VERASE];
bios->c_cc[VKILL] = lios->c_cc[LINUX_VKILL];
bios->c_cc[VEOF] = lios->c_cc[LINUX_VEOF];
bios->c_cc[VEOL] = lios->c_cc[LINUX_VEOL];
bios->c_cc[VMIN] = lios->c_cc[LINUX_VMIN];
bios->c_cc[VTIME] = lios->c_cc[LINUX_VTIME];
bios->c_cc[VEOL2] = lios->c_cc[LINUX_VEOL2];
bios->c_cc[VSUSP] = lios->c_cc[LINUX_VSUSP];
bios->c_cc[VSTART] = lios->c_cc[LINUX_VSTART];
bios->c_cc[VSTOP] = lios->c_cc[LINUX_VSTOP];
bios->c_cc[VREPRINT] = lios->c_cc[LINUX_VREPRINT];
bios->c_cc[VDISCARD] = lios->c_cc[LINUX_VDISCARD];
bios->c_cc[VWERASE] = lios->c_cc[LINUX_VWERASE];
bios->c_cc[VLNEXT] = lios->c_cc[LINUX_VLNEXT];
for (i=0; i<NCCS; i++) {
if (i != VMIN && i != VTIME &&
bios->c_cc[i] == LINUX_POSIX_VDISABLE)
bios->c_cc[i] = _POSIX_VDISABLE;
}
bios->c_ispeed = bios->c_ospeed =
linux_to_bsd_speed(lios->c_cflag & LINUX_CBAUD, sptab);
#ifdef DEBUG
if (ldebug(ioctl)) {
printf("LINUX: BSD termios structure (output):\n");
printf("i=%08x o=%08x c=%08x l=%08x ispeed=%d ospeed=%d\n",
bios->c_iflag, bios->c_oflag, bios->c_cflag, bios->c_lflag,
bios->c_ispeed, bios->c_ospeed);
printf("c_cc ");
for (i=0; i<NCCS; i++)
printf("%02x ", bios->c_cc[i]);
printf("\n");
}
#endif
}
static void
bsd_to_linux_termio(struct termios *bios, struct linux_termio *lio)
{
struct linux_termios lios;
bsd_to_linux_termios(bios, &lios);
lio->c_iflag = lios.c_iflag;
lio->c_oflag = lios.c_oflag;
lio->c_cflag = lios.c_cflag;
lio->c_lflag = lios.c_lflag;
lio->c_line = lios.c_line;
memcpy(lio->c_cc, lios.c_cc, LINUX_NCC);
}
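/*
 * Widen a legacy termio into a full linux_termios, disabling the
 * control characters that termio cannot express, before converting to
 * the native termios.
 */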
static void
linux_to_bsd_termio(struct linux_termio *lio, struct termios *bios)
{
struct linux_termios lios;
int i;
lios.c_iflag = lio->c_iflag;
lios.c_oflag = lio->c_oflag;
lios.c_cflag = lio->c_cflag;
lios.c_lflag = lio->c_lflag;
for (i=LINUX_NCC; i<LINUX_NCCS; i++)
lios.c_cc[i] = LINUX_POSIX_VDISABLE;
memcpy(lios.c_cc, lio->c_cc, LINUX_NCC);
linux_to_bsd_termios(&lios, bios);
}
static int
linux_ioctl_termio(struct thread *td, struct linux_ioctl_args *args)
{
struct termios bios;
struct linux_termios lios;
struct linux_termio lio;
struct file *fp;
int error;
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0)
return (error);
switch (args->cmd & 0xffff) {
case LINUX_TCGETS:
error = fo_ioctl(fp, TIOCGETA, (caddr_t)&bios, td->td_ucred,
td);
if (error)
break;
bsd_to_linux_termios(&bios, &lios);
error = copyout(&lios, (void *)args->arg, sizeof(lios));
break;
case LINUX_TCSETS:
error = copyin((void *)args->arg, &lios, sizeof(lios));
if (error)
break;
linux_to_bsd_termios(&lios, &bios);
error = (fo_ioctl(fp, TIOCSETA, (caddr_t)&bios, td->td_ucred,
td));
break;
case LINUX_TCSETSW:
error = copyin((void *)args->arg, &lios, sizeof(lios));
if (error)
break;
linux_to_bsd_termios(&lios, &bios);
error = (fo_ioctl(fp, TIOCSETAW, (caddr_t)&bios, td->td_ucred,
td));
break;
case LINUX_TCSETSF:
error = copyin((void *)args->arg, &lios, sizeof(lios));
if (error)
break;
linux_to_bsd_termios(&lios, &bios);
error = (fo_ioctl(fp, TIOCSETAF, (caddr_t)&bios, td->td_ucred,
td));
break;
case LINUX_TCGETA:
error = fo_ioctl(fp, TIOCGETA, (caddr_t)&bios, td->td_ucred,
td);
if (error)
break;
bsd_to_linux_termio(&bios, &lio);
error = (copyout(&lio, (void *)args->arg, sizeof(lio)));
break;
case LINUX_TCSETA:
error = copyin((void *)args->arg, &lio, sizeof(lio));
if (error)
break;
linux_to_bsd_termio(&lio, &bios);
error = (fo_ioctl(fp, TIOCSETA, (caddr_t)&bios, td->td_ucred,
td));
break;
case LINUX_TCSETAW:
error = copyin((void *)args->arg, &lio, sizeof(lio));
if (error)
break;
linux_to_bsd_termio(&lio, &bios);
error = (fo_ioctl(fp, TIOCSETAW, (caddr_t)&bios, td->td_ucred,
td));
break;
case LINUX_TCSETAF:
error = copyin((void *)args->arg, &lio, sizeof(lio));
if (error)
break;
linux_to_bsd_termio(&lio, &bios);
error = (fo_ioctl(fp, TIOCSETAF, (caddr_t)&bios, td->td_ucred,
td));
break;
/* LINUX_TCSBRK */
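/*
 * TCXONC: TCOOFF/TCOON map directly to TIOCSTOP/TIOCSTART, while
 * TCIOFF/TCION are emulated by writing the tty's STOP or START
 * character to the descriptor when one is configured.
 */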
case LINUX_TCXONC: {
switch (args->arg) {
case LINUX_TCOOFF:
args->cmd = TIOCSTOP;
break;
case LINUX_TCOON:
args->cmd = TIOCSTART;
break;
case LINUX_TCIOFF:
case LINUX_TCION: {
int c;
struct write_args wr;
error = fo_ioctl(fp, TIOCGETA, (caddr_t)&bios,
td->td_ucred, td);
if (error)
break;
fdrop(fp, td);
c = (args->arg == LINUX_TCIOFF) ? VSTOP : VSTART;
c = bios.c_cc[c];
if (c != _POSIX_VDISABLE) {
wr.fd = args->fd;
wr.buf = &c;
wr.nbyte = sizeof(c);
- return (write(td, &wr));
+ return (sys_write(td, &wr));
} else
return (0);
}
default:
fdrop(fp, td);
return (EINVAL);
}
args->arg = 0;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
}
case LINUX_TCFLSH: {
int val;
switch (args->arg) {
case LINUX_TCIFLUSH:
val = FREAD;
break;
case LINUX_TCOFLUSH:
val = FWRITE;
break;
case LINUX_TCIOFLUSH:
val = FREAD | FWRITE;
break;
default:
fdrop(fp, td);
return (EINVAL);
}
error = (fo_ioctl(fp,TIOCFLUSH,(caddr_t)&val,td->td_ucred,td));
break;
}
case LINUX_TIOCEXCL:
args->cmd = TIOCEXCL;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCNXCL:
args->cmd = TIOCNXCL;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCSCTTY:
args->cmd = TIOCSCTTY;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCGPGRP:
args->cmd = TIOCGPGRP;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCSPGRP:
args->cmd = TIOCSPGRP;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
/* LINUX_TIOCOUTQ */
/* LINUX_TIOCSTI */
case LINUX_TIOCGWINSZ:
args->cmd = TIOCGWINSZ;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCSWINSZ:
args->cmd = TIOCSWINSZ;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCMGET:
args->cmd = TIOCMGET;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCMBIS:
args->cmd = TIOCMBIS;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCMBIC:
args->cmd = TIOCMBIC;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCMSET:
args->cmd = TIOCMSET;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
/* TIOCGSOFTCAR */
/* TIOCSSOFTCAR */
case LINUX_FIONREAD: /* LINUX_TIOCINQ */
args->cmd = FIONREAD;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
/* LINUX_TIOCLINUX */
case LINUX_TIOCCONS:
args->cmd = TIOCCONS;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCGSERIAL: {
struct linux_serial_struct lss;
lss.type = LINUX_PORT_16550A;
lss.flags = 0;
lss.close_delay = 0;
error = copyout(&lss, (void *)args->arg, sizeof(lss));
break;
}
case LINUX_TIOCSSERIAL: {
struct linux_serial_struct lss;
error = copyin((void *)args->arg, &lss, sizeof(lss));
if (error)
break;
/* XXX - It really helps to have an implementation that
* does nothing. NOT!
*/
error = 0;
break;
}
case LINUX_TIOCPKT:
args->cmd = TIOCPKT;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_FIONBIO:
args->cmd = FIONBIO;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCNOTTY:
args->cmd = TIOCNOTTY;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCSETD: {
int line;
switch (args->arg) {
case LINUX_N_TTY:
line = TTYDISC;
break;
case LINUX_N_SLIP:
line = SLIPDISC;
break;
case LINUX_N_PPP:
line = PPPDISC;
break;
default:
fdrop(fp, td);
return (EINVAL);
}
error = (fo_ioctl(fp, TIOCSETD, (caddr_t)&line, td->td_ucred,
td));
break;
}
case LINUX_TIOCGETD: {
int linux_line;
int bsd_line = TTYDISC;
error = fo_ioctl(fp, TIOCGETD, (caddr_t)&bsd_line,
td->td_ucred, td);
if (error)
return (error);
switch (bsd_line) {
case TTYDISC:
linux_line = LINUX_N_TTY;
break;
case SLIPDISC:
linux_line = LINUX_N_SLIP;
break;
case PPPDISC:
linux_line = LINUX_N_PPP;
break;
default:
fdrop(fp, td);
return (EINVAL);
}
error = (copyout(&linux_line, (void *)args->arg, sizeof(int)));
break;
}
/* LINUX_TCSBRKP */
/* LINUX_TIOCTTYGSTRUCT */
case LINUX_FIONCLEX:
args->cmd = FIONCLEX;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_FIOCLEX:
args->cmd = FIOCLEX;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_FIOASYNC:
args->cmd = FIOASYNC;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
/* LINUX_TIOCSERCONFIG */
/* LINUX_TIOCSERGWILD */
/* LINUX_TIOCSERSWILD */
/* LINUX_TIOCGLCKTRMIOS */
/* LINUX_TIOCSLCKTRMIOS */
case LINUX_TIOCSBRK:
args->cmd = TIOCSBRK;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCCBRK:
args->cmd = TIOCCBRK;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCGPTN: {
int nb;
error = fo_ioctl(fp, TIOCGPTN, (caddr_t)&nb, td->td_ucred, td);
if (!error)
error = copyout(&nb, (void *)args->arg,
sizeof(int));
break;
}
case LINUX_TIOCSPTLCK:
/* Our unlockpt() does nothing. */
error = 0;
break;
default:
error = ENOIOCTL;
break;
}
fdrop(fp, td);
return (error);
}
/*
* CDROM related ioctls
*/
struct linux_cdrom_msf
{
u_char cdmsf_min0;
u_char cdmsf_sec0;
u_char cdmsf_frame0;
u_char cdmsf_min1;
u_char cdmsf_sec1;
u_char cdmsf_frame1;
};
struct linux_cdrom_tochdr
{
u_char cdth_trk0;
u_char cdth_trk1;
};
union linux_cdrom_addr
{
struct {
u_char minute;
u_char second;
u_char frame;
} msf;
int lba;
};
struct linux_cdrom_tocentry
{
u_char cdte_track;
u_char cdte_adr:4;
u_char cdte_ctrl:4;
u_char cdte_format;
union linux_cdrom_addr cdte_addr;
u_char cdte_datamode;
};
struct linux_cdrom_subchnl
{
u_char cdsc_format;
u_char cdsc_audiostatus;
u_char cdsc_adr:4;
u_char cdsc_ctrl:4;
u_char cdsc_trk;
u_char cdsc_ind;
union linux_cdrom_addr cdsc_absaddr;
union linux_cdrom_addr cdsc_reladdr;
};
struct l_cdrom_read_audio {
union linux_cdrom_addr addr;
u_char addr_format;
l_int nframes;
u_char *buf;
};
struct l_dvd_layer {
u_char book_version:4;
u_char book_type:4;
u_char min_rate:4;
u_char disc_size:4;
u_char layer_type:4;
u_char track_path:1;
u_char nlayers:2;
u_char track_density:4;
u_char linear_density:4;
u_char bca:1;
u_int32_t start_sector;
u_int32_t end_sector;
u_int32_t end_sector_l0;
};
struct l_dvd_physical {
u_char type;
u_char layer_num;
struct l_dvd_layer layer[4];
};
struct l_dvd_copyright {
u_char type;
u_char layer_num;
u_char cpst;
u_char rmi;
};
struct l_dvd_disckey {
u_char type;
l_uint agid:2;
u_char value[2048];
};
struct l_dvd_bca {
u_char type;
l_int len;
u_char value[188];
};
struct l_dvd_manufact {
u_char type;
u_char layer_num;
l_int len;
u_char value[2048];
};
typedef union {
u_char type;
struct l_dvd_physical physical;
struct l_dvd_copyright copyright;
struct l_dvd_disckey disckey;
struct l_dvd_bca bca;
struct l_dvd_manufact manufact;
} l_dvd_struct;
typedef u_char l_dvd_key[5];
typedef u_char l_dvd_challenge[10];
struct l_dvd_lu_send_agid {
u_char type;
l_uint agid:2;
};
struct l_dvd_host_send_challenge {
u_char type;
l_uint agid:2;
l_dvd_challenge chal;
};
struct l_dvd_send_key {
u_char type;
l_uint agid:2;
l_dvd_key key;
};
struct l_dvd_lu_send_challenge {
u_char type;
l_uint agid:2;
l_dvd_challenge chal;
};
struct l_dvd_lu_send_title_key {
u_char type;
l_uint agid:2;
l_dvd_key title_key;
l_int lba;
l_uint cpm:1;
l_uint cp_sec:1;
l_uint cgms:2;
};
struct l_dvd_lu_send_asf {
u_char type;
l_uint agid:2;
l_uint asf:1;
};
struct l_dvd_host_send_rpcstate {
u_char type;
u_char pdrc;
};
struct l_dvd_lu_send_rpcstate {
u_char type:2;
u_char vra:3;
u_char ucca:3;
u_char region_mask;
u_char rpc_scheme;
};
typedef union {
u_char type;
struct l_dvd_lu_send_agid lsa;
struct l_dvd_host_send_challenge hsc;
struct l_dvd_send_key lsk;
struct l_dvd_lu_send_challenge lsc;
struct l_dvd_send_key hsk;
struct l_dvd_lu_send_title_key lstk;
struct l_dvd_lu_send_asf lsasf;
struct l_dvd_host_send_rpcstate hrpcs;
struct l_dvd_lu_send_rpcstate lrpcs;
} l_dvd_authinfo;
static void
bsd_to_linux_msf_lba(u_char af, union msf_lba *bp, union linux_cdrom_addr *lp)
{
if (af == CD_LBA_FORMAT)
lp->lba = bp->lba;
else {
lp->msf.minute = bp->msf.minute;
lp->msf.second = bp->msf.second;
lp->msf.frame = bp->msf.frame;
}
}
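/*
 * Convert a logical block address into the Linux CD-ROM address union.
 * MSF addressing counts 75 frames per second, and the conventional
 * 2-second lead-in offset is added before splitting into minutes and
 * seconds.
 */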
static void
set_linux_cdrom_addr(union linux_cdrom_addr *addr, int format, int lba)
{
if (format == LINUX_CDROM_MSF) {
addr->msf.frame = lba % 75;
lba /= 75;
lba += 2;
addr->msf.second = lba % 60;
addr->msf.minute = lba / 60;
} else
addr->lba = lba;
}
static int
linux_to_bsd_dvd_struct(l_dvd_struct *lp, struct dvd_struct *bp)
{
bp->format = lp->type;
switch (bp->format) {
case DVD_STRUCT_PHYSICAL:
if (bp->layer_num >= 4)
return (EINVAL);
bp->layer_num = lp->physical.layer_num;
break;
case DVD_STRUCT_COPYRIGHT:
bp->layer_num = lp->copyright.layer_num;
break;
case DVD_STRUCT_DISCKEY:
bp->agid = lp->disckey.agid;
break;
case DVD_STRUCT_BCA:
case DVD_STRUCT_MANUFACT:
break;
default:
return (EINVAL);
}
return (0);
}
static int
bsd_to_linux_dvd_struct(struct dvd_struct *bp, l_dvd_struct *lp)
{
switch (bp->format) {
case DVD_STRUCT_PHYSICAL: {
struct dvd_layer *blp = (struct dvd_layer *)bp->data;
struct l_dvd_layer *llp = &lp->physical.layer[bp->layer_num];
memset(llp, 0, sizeof(*llp));
llp->book_version = blp->book_version;
llp->book_type = blp->book_type;
llp->min_rate = blp->max_rate;
llp->disc_size = blp->disc_size;
llp->layer_type = blp->layer_type;
llp->track_path = blp->track_path;
llp->nlayers = blp->nlayers;
llp->track_density = blp->track_density;
llp->linear_density = blp->linear_density;
llp->bca = blp->bca;
llp->start_sector = blp->start_sector;
llp->end_sector = blp->end_sector;
llp->end_sector_l0 = blp->end_sector_l0;
break;
}
case DVD_STRUCT_COPYRIGHT:
lp->copyright.cpst = bp->cpst;
lp->copyright.rmi = bp->rmi;
break;
case DVD_STRUCT_DISCKEY:
memcpy(lp->disckey.value, bp->data, sizeof(lp->disckey.value));
break;
case DVD_STRUCT_BCA:
lp->bca.len = bp->length;
memcpy(lp->bca.value, bp->data, sizeof(lp->bca.value));
break;
case DVD_STRUCT_MANUFACT:
lp->manufact.len = bp->length;
memcpy(lp->manufact.value, bp->data,
sizeof(lp->manufact.value));
/* lp->manufact.layer_num is unused in linux (redhat 7.0) */
break;
default:
return (EINVAL);
}
return (0);
}
static int
linux_to_bsd_dvd_authinfo(l_dvd_authinfo *lp, int *bcode,
struct dvd_authinfo *bp)
{
switch (lp->type) {
case LINUX_DVD_LU_SEND_AGID:
*bcode = DVDIOCREPORTKEY;
bp->format = DVD_REPORT_AGID;
bp->agid = lp->lsa.agid;
break;
case LINUX_DVD_HOST_SEND_CHALLENGE:
*bcode = DVDIOCSENDKEY;
bp->format = DVD_SEND_CHALLENGE;
bp->agid = lp->hsc.agid;
memcpy(bp->keychal, lp->hsc.chal, 10);
break;
case LINUX_DVD_LU_SEND_KEY1:
*bcode = DVDIOCREPORTKEY;
bp->format = DVD_REPORT_KEY1;
bp->agid = lp->lsk.agid;
break;
case LINUX_DVD_LU_SEND_CHALLENGE:
*bcode = DVDIOCREPORTKEY;
bp->format = DVD_REPORT_CHALLENGE;
bp->agid = lp->lsc.agid;
break;
case LINUX_DVD_HOST_SEND_KEY2:
*bcode = DVDIOCSENDKEY;
bp->format = DVD_SEND_KEY2;
bp->agid = lp->hsk.agid;
memcpy(bp->keychal, lp->hsk.key, 5);
break;
case LINUX_DVD_LU_SEND_TITLE_KEY:
*bcode = DVDIOCREPORTKEY;
bp->format = DVD_REPORT_TITLE_KEY;
bp->agid = lp->lstk.agid;
bp->lba = lp->lstk.lba;
break;
case LINUX_DVD_LU_SEND_ASF:
*bcode = DVDIOCREPORTKEY;
bp->format = DVD_REPORT_ASF;
bp->agid = lp->lsasf.agid;
break;
case LINUX_DVD_INVALIDATE_AGID:
*bcode = DVDIOCREPORTKEY;
bp->format = DVD_INVALIDATE_AGID;
bp->agid = lp->lsa.agid;
break;
case LINUX_DVD_LU_SEND_RPC_STATE:
*bcode = DVDIOCREPORTKEY;
bp->format = DVD_REPORT_RPC;
break;
case LINUX_DVD_HOST_SEND_RPC_STATE:
*bcode = DVDIOCSENDKEY;
bp->format = DVD_SEND_RPC;
bp->region = lp->hrpcs.pdrc;
break;
default:
return (EINVAL);
}
return (0);
}
static int
bsd_to_linux_dvd_authinfo(struct dvd_authinfo *bp, l_dvd_authinfo *lp)
{
switch (lp->type) {
case LINUX_DVD_LU_SEND_AGID:
lp->lsa.agid = bp->agid;
break;
case LINUX_DVD_HOST_SEND_CHALLENGE:
lp->type = LINUX_DVD_LU_SEND_KEY1;
break;
case LINUX_DVD_LU_SEND_KEY1:
memcpy(lp->lsk.key, bp->keychal, sizeof(lp->lsk.key));
break;
case LINUX_DVD_LU_SEND_CHALLENGE:
memcpy(lp->lsc.chal, bp->keychal, sizeof(lp->lsc.chal));
break;
case LINUX_DVD_HOST_SEND_KEY2:
lp->type = LINUX_DVD_AUTH_ESTABLISHED;
break;
case LINUX_DVD_LU_SEND_TITLE_KEY:
memcpy(lp->lstk.title_key, bp->keychal,
sizeof(lp->lstk.title_key));
lp->lstk.cpm = bp->cpm;
lp->lstk.cp_sec = bp->cp_sec;
lp->lstk.cgms = bp->cgms;
break;
case LINUX_DVD_LU_SEND_ASF:
lp->lsasf.asf = bp->asf;
break;
case LINUX_DVD_INVALIDATE_AGID:
break;
case LINUX_DVD_LU_SEND_RPC_STATE:
lp->lrpcs.type = bp->reg_type;
lp->lrpcs.vra = bp->vend_rsts;
lp->lrpcs.ucca = bp->user_rsts;
lp->lrpcs.region_mask = bp->region;
lp->lrpcs.rpc_scheme = bp->rpc_scheme;
break;
case LINUX_DVD_HOST_SEND_RPC_STATE:
break;
default:
return (EINVAL);
}
return (0);
}
static int
linux_ioctl_cdrom(struct thread *td, struct linux_ioctl_args *args)
{
struct file *fp;
int error;
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0)
return (error);
switch (args->cmd & 0xffff) {
case LINUX_CDROMPAUSE:
args->cmd = CDIOCPAUSE;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_CDROMRESUME:
args->cmd = CDIOCRESUME;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_CDROMPLAYMSF:
args->cmd = CDIOCPLAYMSF;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_CDROMPLAYTRKIND:
args->cmd = CDIOCPLAYTRACKS;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_CDROMREADTOCHDR: {
struct ioc_toc_header th;
struct linux_cdrom_tochdr lth;
error = fo_ioctl(fp, CDIOREADTOCHEADER, (caddr_t)&th,
td->td_ucred, td);
if (!error) {
lth.cdth_trk0 = th.starting_track;
lth.cdth_trk1 = th.ending_track;
copyout(&lth, (void *)args->arg, sizeof(lth));
}
break;
}
case LINUX_CDROMREADTOCENTRY: {
struct linux_cdrom_tocentry lte;
struct ioc_read_toc_single_entry irtse;
error = copyin((void *)args->arg, &lte, sizeof(lte));
if (error)
break;
irtse.address_format = lte.cdte_format;
irtse.track = lte.cdte_track;
error = fo_ioctl(fp, CDIOREADTOCENTRY, (caddr_t)&irtse,
td->td_ucred, td);
if (!error) {
lte.cdte_ctrl = irtse.entry.control;
lte.cdte_adr = irtse.entry.addr_type;
bsd_to_linux_msf_lba(irtse.address_format,
&irtse.entry.addr, &lte.cdte_addr);
error = copyout(&lte, (void *)args->arg, sizeof(lte));
}
break;
}
case LINUX_CDROMSTOP:
args->cmd = CDIOCSTOP;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_CDROMSTART:
args->cmd = CDIOCSTART;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_CDROMEJECT:
args->cmd = CDIOCEJECT;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
/* LINUX_CDROMVOLCTRL */
case LINUX_CDROMSUBCHNL: {
struct linux_cdrom_subchnl sc;
struct ioc_read_subchannel bsdsc;
struct cd_sub_channel_info bsdinfo;
bsdsc.address_format = CD_LBA_FORMAT;
bsdsc.data_format = CD_CURRENT_POSITION;
bsdsc.track = 0;
bsdsc.data_len = sizeof(bsdinfo);
bsdsc.data = &bsdinfo;
error = fo_ioctl(fp, CDIOCREADSUBCHANNEL_SYSSPACE,
(caddr_t)&bsdsc, td->td_ucred, td);
if (error)
break;
error = copyin((void *)args->arg, &sc, sizeof(sc));
if (error)
break;
sc.cdsc_audiostatus = bsdinfo.header.audio_status;
sc.cdsc_adr = bsdinfo.what.position.addr_type;
sc.cdsc_ctrl = bsdinfo.what.position.control;
sc.cdsc_trk = bsdinfo.what.position.track_number;
sc.cdsc_ind = bsdinfo.what.position.index_number;
set_linux_cdrom_addr(&sc.cdsc_absaddr, sc.cdsc_format,
bsdinfo.what.position.absaddr.lba);
set_linux_cdrom_addr(&sc.cdsc_reladdr, sc.cdsc_format,
bsdinfo.what.position.reladdr.lba);
error = copyout(&sc, (void *)args->arg, sizeof(sc));
break;
}
/* LINUX_CDROMREADMODE2 */
/* LINUX_CDROMREADMODE1 */
/* LINUX_CDROMREADAUDIO */
/* LINUX_CDROMEJECT_SW */
/* LINUX_CDROMMULTISESSION */
/* LINUX_CDROM_GET_UPC */
case LINUX_CDROMRESET:
args->cmd = CDIOCRESET;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
/* LINUX_CDROMVOLREAD */
/* LINUX_CDROMREADRAW */
/* LINUX_CDROMREADCOOKED */
/* LINUX_CDROMSEEK */
/* LINUX_CDROMPLAYBLK */
/* LINUX_CDROMREADALL */
/* LINUX_CDROMCLOSETRAY */
/* LINUX_CDROMLOADFROMSLOT */
/* LINUX_CDROMGETSPINDOWN */
/* LINUX_CDROMSETSPINDOWN */
/* LINUX_CDROM_SET_OPTIONS */
/* LINUX_CDROM_CLEAR_OPTIONS */
/* LINUX_CDROM_SELECT_SPEED */
/* LINUX_CDROM_SELECT_DISC */
/* LINUX_CDROM_MEDIA_CHANGED */
/* LINUX_CDROM_DRIVE_STATUS */
/* LINUX_CDROM_DISC_STATUS */
/* LINUX_CDROM_CHANGER_NSLOTS */
/* LINUX_CDROM_LOCKDOOR */
/* LINUX_CDROM_DEBUG */
/* LINUX_CDROM_GET_CAPABILITY */
/* LINUX_CDROMAUDIOBUFSIZ */
case LINUX_DVD_READ_STRUCT: {
l_dvd_struct *lds;
struct dvd_struct *bds;
lds = malloc(sizeof(*lds), M_LINUX, M_WAITOK);
bds = malloc(sizeof(*bds), M_LINUX, M_WAITOK);
error = copyin((void *)args->arg, lds, sizeof(*lds));
if (error)
goto out;
error = linux_to_bsd_dvd_struct(lds, bds);
if (error)
goto out;
error = fo_ioctl(fp, DVDIOCREADSTRUCTURE, (caddr_t)bds,
td->td_ucred, td);
if (error)
goto out;
error = bsd_to_linux_dvd_struct(bds, lds);
if (error)
goto out;
error = copyout(lds, (void *)args->arg, sizeof(*lds));
out:
free(bds, M_LINUX);
free(lds, M_LINUX);
break;
}
/* LINUX_DVD_WRITE_STRUCT */
case LINUX_DVD_AUTH: {
l_dvd_authinfo lda;
struct dvd_authinfo bda;
int bcode;
error = copyin((void *)args->arg, &lda, sizeof(lda));
if (error)
break;
error = linux_to_bsd_dvd_authinfo(&lda, &bcode, &bda);
if (error)
break;
error = fo_ioctl(fp, bcode, (caddr_t)&bda, td->td_ucred,
td);
if (error) {
if (lda.type == LINUX_DVD_HOST_SEND_KEY2) {
lda.type = LINUX_DVD_AUTH_FAILURE;
copyout(&lda, (void *)args->arg, sizeof(lda));
}
break;
}
error = bsd_to_linux_dvd_authinfo(&bda, &lda);
if (error)
break;
error = copyout(&lda, (void *)args->arg, sizeof(lda));
break;
}
case LINUX_SCSI_GET_BUS_NUMBER:
case LINUX_SCSI_GET_IDLUN:
error = linux_ioctl_sg(td, args);
break;
/* LINUX_CDROM_SEND_PACKET */
/* LINUX_CDROM_NEXT_WRITABLE */
/* LINUX_CDROM_LAST_WRITTEN */
default:
error = ENOIOCTL;
break;
}
fdrop(fp, td);
return (error);
}
static int
linux_ioctl_vfat(struct thread *td, struct linux_ioctl_args *args)
{
return (ENOTTY);
}
/*
* Sound related ioctls
*/
struct linux_mixer_info {
char id[16];
char name[32];
int modify_counter;
int fillers[10];
};
struct linux_old_mixer_info {
char id[16];
char name[32];
};
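/*
* Linux encodes the ioctl transfer direction in the top two bits of the
* command word; SETDIR() replaces the direction bits of the native command
* with the ones the Linux caller supplied in args->cmd.
*/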
static u_int32_t dirbits[4] = { IOC_VOID, IOC_IN, IOC_OUT, IOC_INOUT };
#define SETDIR(c) (((c) & ~IOC_DIRMASK) | dirbits[args->cmd >> 30])
static int
linux_ioctl_sound(struct thread *td, struct linux_ioctl_args *args)
{
switch (args->cmd & 0xffff) {
case LINUX_SOUND_MIXER_WRITE_VOLUME:
args->cmd = SETDIR(SOUND_MIXER_WRITE_VOLUME);
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_BASS:
args->cmd = SETDIR(SOUND_MIXER_WRITE_BASS);
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_TREBLE:
args->cmd = SETDIR(SOUND_MIXER_WRITE_TREBLE);
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_SYNTH:
args->cmd = SETDIR(SOUND_MIXER_WRITE_SYNTH);
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_PCM:
args->cmd = SETDIR(SOUND_MIXER_WRITE_PCM);
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_SPEAKER:
args->cmd = SETDIR(SOUND_MIXER_WRITE_SPEAKER);
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_LINE:
args->cmd = SETDIR(SOUND_MIXER_WRITE_LINE);
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_MIC:
args->cmd = SETDIR(SOUND_MIXER_WRITE_MIC);
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_CD:
args->cmd = SETDIR(SOUND_MIXER_WRITE_CD);
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_IMIX:
args->cmd = SETDIR(SOUND_MIXER_WRITE_IMIX);
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_ALTPCM:
args->cmd = SETDIR(SOUND_MIXER_WRITE_ALTPCM);
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_RECLEV:
args->cmd = SETDIR(SOUND_MIXER_WRITE_RECLEV);
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_IGAIN:
args->cmd = SETDIR(SOUND_MIXER_WRITE_IGAIN);
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_OGAIN:
args->cmd = SETDIR(SOUND_MIXER_WRITE_OGAIN);
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_LINE1:
args->cmd = SETDIR(SOUND_MIXER_WRITE_LINE1);
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_LINE2:
args->cmd = SETDIR(SOUND_MIXER_WRITE_LINE2);
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_LINE3:
args->cmd = SETDIR(SOUND_MIXER_WRITE_LINE3);
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_INFO: {
/* Key on encoded length */
switch ((args->cmd >> 16) & 0x1fff) {
case 0x005c: { /* SOUND_MIXER_INFO */
struct linux_mixer_info info;
bzero(&info, sizeof(info));
strncpy(info.id, "OSS", sizeof(info.id) - 1);
strncpy(info.name, "FreeBSD OSS Mixer", sizeof(info.name) - 1);
copyout(&info, (void *)args->arg, sizeof(info));
return (0);
}
case 0x0030: { /* SOUND_OLD_MIXER_INFO */
struct linux_old_mixer_info info;
bzero(&info, sizeof(info));
strncpy(info.id, "OSS", sizeof(info.id) - 1);
strncpy(info.name, "FreeBSD OSS Mixer", sizeof(info.name) - 1);
copyout(&info, (void *)args->arg, sizeof(info));
return (0);
}
default:
return (ENOIOCTL);
}
break;
}
case LINUX_OSS_GETVERSION: {
int version = linux_get_oss_version(td);
return (copyout(&version, (void *)args->arg, sizeof(int)));
}
case LINUX_SOUND_MIXER_READ_STEREODEVS:
args->cmd = SOUND_MIXER_READ_STEREODEVS;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_READ_CAPS:
args->cmd = SOUND_MIXER_READ_CAPS;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_READ_RECMASK:
args->cmd = SOUND_MIXER_READ_RECMASK;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_READ_DEVMASK:
args->cmd = SOUND_MIXER_READ_DEVMASK;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_RECSRC:
args->cmd = SETDIR(SOUND_MIXER_WRITE_RECSRC);
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_RESET:
args->cmd = SNDCTL_DSP_RESET;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_SYNC:
args->cmd = SNDCTL_DSP_SYNC;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_SPEED:
args->cmd = SNDCTL_DSP_SPEED;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_STEREO:
args->cmd = SNDCTL_DSP_STEREO;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_GETBLKSIZE: /* LINUX_SNDCTL_DSP_SETBLKSIZE */
args->cmd = SNDCTL_DSP_GETBLKSIZE;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_SETFMT:
args->cmd = SNDCTL_DSP_SETFMT;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_PCM_WRITE_CHANNELS:
args->cmd = SOUND_PCM_WRITE_CHANNELS;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_PCM_WRITE_FILTER:
args->cmd = SOUND_PCM_WRITE_FILTER;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_POST:
args->cmd = SNDCTL_DSP_POST;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_SUBDIVIDE:
args->cmd = SNDCTL_DSP_SUBDIVIDE;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_SETFRAGMENT:
args->cmd = SNDCTL_DSP_SETFRAGMENT;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_GETFMTS:
args->cmd = SNDCTL_DSP_GETFMTS;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_GETOSPACE:
args->cmd = SNDCTL_DSP_GETOSPACE;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_GETISPACE:
args->cmd = SNDCTL_DSP_GETISPACE;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_NONBLOCK:
args->cmd = SNDCTL_DSP_NONBLOCK;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_GETCAPS:
args->cmd = SNDCTL_DSP_GETCAPS;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_SETTRIGGER: /* LINUX_SNDCTL_GETTRIGGER */
args->cmd = SNDCTL_DSP_SETTRIGGER;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_GETIPTR:
args->cmd = SNDCTL_DSP_GETIPTR;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_GETOPTR:
args->cmd = SNDCTL_DSP_GETOPTR;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_SETDUPLEX:
args->cmd = SNDCTL_DSP_SETDUPLEX;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_GETODELAY:
args->cmd = SNDCTL_DSP_GETODELAY;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_RESET:
args->cmd = SNDCTL_SEQ_RESET;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_SYNC:
args->cmd = SNDCTL_SEQ_SYNC;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SYNTH_INFO:
args->cmd = SNDCTL_SYNTH_INFO;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_CTRLRATE:
args->cmd = SNDCTL_SEQ_CTRLRATE;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_GETOUTCOUNT:
args->cmd = SNDCTL_SEQ_GETOUTCOUNT;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_GETINCOUNT:
args->cmd = SNDCTL_SEQ_GETINCOUNT;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_PERCMODE:
args->cmd = SNDCTL_SEQ_PERCMODE;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_FM_LOAD_INSTR:
args->cmd = SNDCTL_FM_LOAD_INSTR;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_TESTMIDI:
args->cmd = SNDCTL_SEQ_TESTMIDI;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_RESETSAMPLES:
args->cmd = SNDCTL_SEQ_RESETSAMPLES;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_NRSYNTHS:
args->cmd = SNDCTL_SEQ_NRSYNTHS;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_NRMIDIS:
args->cmd = SNDCTL_SEQ_NRMIDIS;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_MIDI_INFO:
args->cmd = SNDCTL_MIDI_INFO;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_TRESHOLD:
args->cmd = SNDCTL_SEQ_TRESHOLD;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SYNTH_MEMAVL:
args->cmd = SNDCTL_SYNTH_MEMAVL;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
}
return (ENOIOCTL);
}
/*
* Console related ioctls
*/
#define ISSIGVALID(sig) ((sig) > 0 && (sig) < NSIG)
static int
linux_ioctl_console(struct thread *td, struct linux_ioctl_args *args)
{
struct file *fp;
int error;
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0)
return (error);
switch (args->cmd & 0xffff) {
case LINUX_KIOCSOUND:
args->cmd = KIOCSOUND;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_KDMKTONE:
args->cmd = KDMKTONE;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_KDGETLED:
args->cmd = KDGETLED;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_KDSETLED:
args->cmd = KDSETLED;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_KDSETMODE:
args->cmd = KDSETMODE;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_KDGETMODE:
args->cmd = KDGETMODE;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_KDGKBMODE:
args->cmd = KDGKBMODE;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_KDSKBMODE: {
int kbdmode;
switch (args->arg) {
case LINUX_KBD_RAW:
kbdmode = K_RAW;
break;
case LINUX_KBD_XLATE:
kbdmode = K_XLATE;
break;
case LINUX_KBD_MEDIUMRAW:
kbdmode = K_RAW;
break;
default:
fdrop(fp, td);
return (EINVAL);
}
error = (fo_ioctl(fp, KDSKBMODE, (caddr_t)&kbdmode,
td->td_ucred, td));
break;
}
case LINUX_VT_OPENQRY:
args->cmd = VT_OPENQRY;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_VT_GETMODE:
args->cmd = VT_GETMODE;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_VT_SETMODE: {
struct vt_mode mode;
if ((error = copyin((void *)args->arg, &mode, sizeof(mode))))
break;
if (!ISSIGVALID(mode.frsig) && ISSIGVALID(mode.acqsig))
mode.frsig = mode.acqsig;
if ((error = copyout(&mode, (void *)args->arg, sizeof(mode))))
break;
args->cmd = VT_SETMODE;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
}
case LINUX_VT_GETSTATE:
args->cmd = VT_GETACTIVE;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_VT_RELDISP:
args->cmd = VT_RELDISP;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_VT_ACTIVATE:
args->cmd = VT_ACTIVATE;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_VT_WAITACTIVE:
args->cmd = VT_WAITACTIVE;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
default:
error = ENOIOCTL;
break;
}
fdrop(fp, td);
return (error);
}
/*
* Criteria for interface name translation
*/
#define IFP_IS_ETH(ifp) (ifp->if_type == IFT_ETHER)
/*
* Interface function used by linprocfs (at the time of writing). It's not
* used by the Linuxulator itself.
*/
int
linux_ifname(struct ifnet *ifp, char *buffer, size_t buflen)
{
struct ifnet *ifscan;
int ethno;
IFNET_RLOCK_ASSERT();
/* Short-circuit non ethernet interfaces */
if (!IFP_IS_ETH(ifp))
return (strlcpy(buffer, ifp->if_xname, buflen));
/* Determine the (relative) unit number for ethernet interfaces */
ethno = 0;
TAILQ_FOREACH(ifscan, &V_ifnet, if_link) {
if (ifscan == ifp)
return (snprintf(buffer, buflen, "eth%d", ethno));
if (IFP_IS_ETH(ifscan))
ethno++;
}
return (0);
}
/*
* Translate a Linux interface name to a FreeBSD interface name,
* and return the associated ifnet structure.
* bsdname and lxname need to be at least IFNAMSIZ bytes long, but
* can point to the same buffer.
*/
static struct ifnet *
ifname_linux_to_bsd(struct thread *td, const char *lxname, char *bsdname)
{
struct ifnet *ifp;
int len, unit;
char *ep;
int is_eth, index;
for (len = 0; len < LINUX_IFNAMSIZ; ++len)
if (!isalpha(lxname[len]))
break;
if (len == 0 || len == LINUX_IFNAMSIZ)
return (NULL);
unit = (int)strtoul(lxname + len, &ep, 10);
if (ep == NULL || ep == lxname + len || ep >= lxname + LINUX_IFNAMSIZ)
return (NULL);
index = 0;
is_eth = (len == 3 && !strncmp(lxname, "eth", len)) ? 1 : 0;
CURVNET_SET(TD_TO_VNET(td));
IFNET_RLOCK();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
/*
* Allow Linux programs to use FreeBSD names. Don't presume
* we never have an interface named "eth", so don't make
* the test optional based on is_eth.
*/
if (strncmp(ifp->if_xname, lxname, LINUX_IFNAMSIZ) == 0)
break;
if (is_eth && IFP_IS_ETH(ifp) && unit == index++)
break;
}
IFNET_RUNLOCK();
CURVNET_RESTORE();
if (ifp != NULL)
strlcpy(bsdname, ifp->if_xname, IFNAMSIZ);
return (ifp);
}
/*
* Implement the SIOCGIFCONF ioctl
*/
static int
linux_ifconf(struct thread *td, struct ifconf *uifc)
{
#ifdef COMPAT_LINUX32
struct l_ifconf ifc;
#else
struct ifconf ifc;
#endif
struct l_ifreq ifr;
struct ifnet *ifp;
struct ifaddr *ifa;
struct sbuf *sb;
int error, ethno, full = 0, valid_len, max_len;
error = copyin(uifc, &ifc, sizeof(ifc));
if (error != 0)
return (error);
max_len = MAXPHYS - 1;
CURVNET_SET(TD_TO_VNET(td));
/* handle the 'request buffer size' case */
if (ifc.ifc_buf == PTROUT(NULL)) {
ifc.ifc_len = 0;
IFNET_RLOCK();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
struct sockaddr *sa = ifa->ifa_addr;
if (sa->sa_family == AF_INET)
ifc.ifc_len += sizeof(ifr);
}
}
IFNET_RUNLOCK();
error = copyout(&ifc, uifc, sizeof(ifc));
CURVNET_RESTORE();
return (error);
}
if (ifc.ifc_len <= 0) {
CURVNET_RESTORE();
return (EINVAL);
}
again:
/* Keep track of eth interfaces */
ethno = 0;
if (ifc.ifc_len <= max_len) {
max_len = ifc.ifc_len;
full = 1;
}
sb = sbuf_new(NULL, NULL, max_len + 1, SBUF_FIXEDLEN);
max_len = 0;
valid_len = 0;
/* Return all AF_INET addresses of all interfaces */
IFNET_RLOCK();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
int addrs = 0;
bzero(&ifr, sizeof(ifr));
if (IFP_IS_ETH(ifp))
snprintf(ifr.ifr_name, LINUX_IFNAMSIZ, "eth%d",
ethno++);
else
strlcpy(ifr.ifr_name, ifp->if_xname, LINUX_IFNAMSIZ);
/* Walk the address list */
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
struct sockaddr *sa = ifa->ifa_addr;
if (sa->sa_family == AF_INET) {
ifr.ifr_addr.sa_family = LINUX_AF_INET;
memcpy(ifr.ifr_addr.sa_data, sa->sa_data,
sizeof(ifr.ifr_addr.sa_data));
sbuf_bcat(sb, &ifr, sizeof(ifr));
max_len += sizeof(ifr);
addrs++;
}
if (sbuf_error(sb) == 0)
valid_len = sbuf_len(sb);
}
if (addrs == 0) {
bzero((caddr_t)&ifr.ifr_addr, sizeof(ifr.ifr_addr));
sbuf_bcat(sb, &ifr, sizeof(ifr));
max_len += sizeof(ifr);
if (sbuf_error(sb) == 0)
valid_len = sbuf_len(sb);
}
}
IFNET_RUNLOCK();
if (valid_len != max_len && !full) {
sbuf_delete(sb);
goto again;
}
ifc.ifc_len = valid_len;
sbuf_finish(sb);
memcpy(PTRIN(ifc.ifc_buf), sbuf_data(sb), ifc.ifc_len);
error = copyout(&ifc, uifc, sizeof(ifc));
sbuf_delete(sb);
CURVNET_RESTORE();
return (error);
}
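/*
* Implement the SIOCGIFFLAGS ioctl: merge the interface and driver flags,
* drop the flags that have no Linux equivalent and move IFF_MULTICAST to
* bit 0x1000, where Linux keeps it.
*/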
static int
linux_gifflags(struct thread *td, struct ifnet *ifp, struct l_ifreq *ifr)
{
l_short flags;
flags = (ifp->if_flags | ifp->if_drv_flags) & 0xffff;
/* these flags have no Linux equivalent */
flags &= ~(IFF_SMART|IFF_DRV_OACTIVE|IFF_SIMPLEX|
IFF_LINK0|IFF_LINK1|IFF_LINK2);
/* Linux' multicast flag is in a different bit */
if (flags & IFF_MULTICAST) {
flags &= ~IFF_MULTICAST;
flags |= 0x1000;
}
return (copyout(&flags, &ifr->ifr_flags, sizeof(flags)));
}
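/*
* Linux ARP hardware types, reported in the sa_family field of the
* address returned for SIOCGIFHWADDR.
*/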
#define ARPHRD_ETHER 1
#define ARPHRD_LOOPBACK 772
static int
linux_gifhwaddr(struct ifnet *ifp, struct l_ifreq *ifr)
{
struct ifaddr *ifa;
struct sockaddr_dl *sdl;
struct l_sockaddr lsa;
if (ifp->if_type == IFT_LOOP) {
bzero(&lsa, sizeof(lsa));
lsa.sa_family = ARPHRD_LOOPBACK;
return (copyout(&lsa, &ifr->ifr_hwaddr, sizeof(lsa)));
}
if (ifp->if_type != IFT_ETHER)
return (ENOENT);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
sdl = (struct sockaddr_dl*)ifa->ifa_addr;
if (sdl != NULL && (sdl->sdl_family == AF_LINK) &&
(sdl->sdl_type == IFT_ETHER)) {
bzero(&lsa, sizeof(lsa));
lsa.sa_family = ARPHRD_ETHER;
bcopy(LLADDR(sdl), lsa.sa_data, LINUX_IFHWADDRLEN);
return (copyout(&lsa, &ifr->ifr_hwaddr, sizeof(lsa)));
}
}
return (ENOENT);
}
/*
* If we fault in bsd_to_linux_ifreq() then we will fault when we call
* the native ioctl(). Thus, we don't really need to check the return
* value of this function.
*/
static int
bsd_to_linux_ifreq(struct ifreq *arg)
{
struct ifreq ifr;
size_t ifr_len = sizeof(struct ifreq);
int error;
if ((error = copyin(arg, &ifr, ifr_len)))
return (error);
*(u_short *)&ifr.ifr_addr = ifr.ifr_addr.sa_family;
error = copyout(&ifr, arg, ifr_len);
return (error);
}
/*
* Socket related ioctls
*/
static int
linux_ioctl_socket(struct thread *td, struct linux_ioctl_args *args)
{
char lifname[LINUX_IFNAMSIZ], ifname[IFNAMSIZ];
struct ifnet *ifp;
struct file *fp;
int error, type;
ifp = NULL;
error = 0;
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0)
return (error);
type = fp->f_type;
fdrop(fp, td);
if (type != DTYPE_SOCKET) {
/* not a socket - probably a tap / vmnet device */
switch (args->cmd) {
case LINUX_SIOCGIFADDR:
case LINUX_SIOCSIFADDR:
case LINUX_SIOCGIFFLAGS:
return (linux_ioctl_special(td, args));
default:
return (ENOIOCTL);
}
}
switch (args->cmd & 0xffff) {
case LINUX_FIOGETOWN:
case LINUX_FIOSETOWN:
case LINUX_SIOCADDMULTI:
case LINUX_SIOCATMARK:
case LINUX_SIOCDELMULTI:
case LINUX_SIOCGIFCONF:
case LINUX_SIOCGPGRP:
case LINUX_SIOCSPGRP:
case LINUX_SIOCGIFCOUNT:
/* these ioctls don't take an interface name */
#ifdef DEBUG
printf("%s(): ioctl %d\n", __func__,
args->cmd & 0xffff);
#endif
break;
case LINUX_SIOCGIFFLAGS:
case LINUX_SIOCGIFADDR:
case LINUX_SIOCSIFADDR:
case LINUX_SIOCGIFDSTADDR:
case LINUX_SIOCGIFBRDADDR:
case LINUX_SIOCGIFNETMASK:
case LINUX_SIOCSIFNETMASK:
case LINUX_SIOCGIFMTU:
case LINUX_SIOCSIFMTU:
case LINUX_SIOCSIFNAME:
case LINUX_SIOCGIFHWADDR:
case LINUX_SIOCSIFHWADDR:
case LINUX_SIOCDEVPRIVATE:
case LINUX_SIOCDEVPRIVATE+1:
case LINUX_SIOCGIFINDEX:
/* copy in the interface name and translate it. */
error = copyin((void *)args->arg, lifname, LINUX_IFNAMSIZ);
if (error != 0)
return (error);
#ifdef DEBUG
printf("%s(): ioctl %d on %.*s\n", __func__,
args->cmd & 0xffff, LINUX_IFNAMSIZ, lifname);
#endif
ifp = ifname_linux_to_bsd(td, lifname, ifname);
if (ifp == NULL)
return (EINVAL);
/*
* We need to copy it back out in case we pass the
* request on to our native ioctl(), which will expect
* the ifreq to be in user space and have the correct
* interface name.
*/
error = copyout(ifname, (void *)args->arg, IFNAMSIZ);
if (error != 0)
return (error);
#ifdef DEBUG
printf("%s(): %s translated to %s\n", __func__,
lifname, ifname);
#endif
break;
default:
return (ENOIOCTL);
}
switch (args->cmd & 0xffff) {
case LINUX_FIOSETOWN:
args->cmd = FIOSETOWN;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCSPGRP:
args->cmd = SIOCSPGRP;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_FIOGETOWN:
args->cmd = FIOGETOWN;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCGPGRP:
args->cmd = SIOCGPGRP;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCATMARK:
args->cmd = SIOCATMARK;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
break;
/* LINUX_SIOCGSTAMP */
case LINUX_SIOCGIFCONF:
error = linux_ifconf(td, (struct ifconf *)args->arg);
break;
case LINUX_SIOCGIFFLAGS:
args->cmd = SIOCGIFFLAGS;
error = linux_gifflags(td, ifp, (struct l_ifreq *)args->arg);
break;
case LINUX_SIOCGIFADDR:
args->cmd = SIOCGIFADDR;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
bsd_to_linux_ifreq((struct ifreq *)args->arg);
break;
case LINUX_SIOCSIFADDR:
/* XXX probably doesn't work, included for completeness */
args->cmd = SIOCSIFADDR;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCGIFDSTADDR:
args->cmd = SIOCGIFDSTADDR;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
bsd_to_linux_ifreq((struct ifreq *)args->arg);
break;
case LINUX_SIOCGIFBRDADDR:
args->cmd = SIOCGIFBRDADDR;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
bsd_to_linux_ifreq((struct ifreq *)args->arg);
break;
case LINUX_SIOCGIFNETMASK:
args->cmd = SIOCGIFNETMASK;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
bsd_to_linux_ifreq((struct ifreq *)args->arg);
break;
case LINUX_SIOCSIFNETMASK:
error = ENOIOCTL;
break;
case LINUX_SIOCGIFMTU:
args->cmd = SIOCGIFMTU;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCSIFMTU:
args->cmd = SIOCSIFMTU;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCSIFNAME:
error = ENOIOCTL;
break;
case LINUX_SIOCGIFHWADDR:
error = linux_gifhwaddr(ifp, (struct l_ifreq *)args->arg);
break;
case LINUX_SIOCSIFHWADDR:
error = ENOIOCTL;
break;
case LINUX_SIOCADDMULTI:
args->cmd = SIOCADDMULTI;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCDELMULTI:
args->cmd = SIOCDELMULTI;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCGIFINDEX:
args->cmd = SIOCGIFINDEX;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCGIFCOUNT:
error = 0;
break;
/*
* XXX This is slightly bogus, but these ioctls are currently
* XXX only used by the aironet (if_an) network driver.
*/
case LINUX_SIOCDEVPRIVATE:
args->cmd = SIOCGPRIVATE_0;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCDEVPRIVATE+1:
args->cmd = SIOCGPRIVATE_1;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
break;
}
if (ifp != NULL)
/* restore the original interface name */
copyout(lifname, (void *)args->arg, LINUX_IFNAMSIZ);
#ifdef DEBUG
printf("%s(): returning %d\n", __func__, error);
#endif
return (error);
}
/*
* Device private ioctl handler
*/
static int
linux_ioctl_private(struct thread *td, struct linux_ioctl_args *args)
{
struct file *fp;
int error, type;
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0)
return (error);
type = fp->f_type;
fdrop(fp, td);
if (type == DTYPE_SOCKET)
return (linux_ioctl_socket(td, args));
return (ENOIOCTL);
}
/*
* DRM ioctl handler (sys/dev/drm)
*/
static int
linux_ioctl_drm(struct thread *td, struct linux_ioctl_args *args)
{
args->cmd = SETDIR(args->cmd);
- return ioctl(td, (struct ioctl_args *)args);
+ return sys_ioctl(td, (struct ioctl_args *)args);
}
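/*
* SCSI generic (sg) ioctl handler: the command and argument are passed
* through unchanged to the underlying file's ioctl routine.
*/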
static int
linux_ioctl_sg(struct thread *td, struct linux_ioctl_args *args)
{
struct file *fp;
u_long cmd;
int error;
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0) {
printf("sg_linux_ioctl: fget returned %d\n", error);
return (error);
}
cmd = args->cmd;
error = (fo_ioctl(fp, cmd, (caddr_t)args->arg, td->td_ucred, td));
fdrop(fp, td);
return (error);
}
/*
* Video4Linux (V4L) ioctl handler
*/
static int
linux_to_bsd_v4l_tuner(struct l_video_tuner *lvt, struct video_tuner *vt)
{
vt->tuner = lvt->tuner;
strlcpy(vt->name, lvt->name, LINUX_VIDEO_TUNER_NAME_SIZE);
vt->rangelow = lvt->rangelow; /* possible long size conversion */
vt->rangehigh = lvt->rangehigh; /* possible long size conversion */
vt->flags = lvt->flags;
vt->mode = lvt->mode;
vt->signal = lvt->signal;
return (0);
}
static int
bsd_to_linux_v4l_tuner(struct video_tuner *vt, struct l_video_tuner *lvt)
{
lvt->tuner = vt->tuner;
strlcpy(lvt->name, vt->name, LINUX_VIDEO_TUNER_NAME_SIZE);
lvt->rangelow = vt->rangelow; /* possible long size conversion */
lvt->rangehigh = vt->rangehigh; /* possible long size conversion */
lvt->flags = vt->flags;
lvt->mode = vt->mode;
lvt->signal = vt->signal;
return (0);
}
#ifdef COMPAT_LINUX_V4L_CLIPLIST
static int
linux_to_bsd_v4l_clip(struct l_video_clip *lvc, struct video_clip *vc)
{
vc->x = lvc->x;
vc->y = lvc->y;
vc->width = lvc->width;
vc->height = lvc->height;
vc->next = PTRIN(lvc->next); /* possible pointer size conversion */
return (0);
}
#endif
static int
linux_to_bsd_v4l_window(struct l_video_window *lvw, struct video_window *vw)
{
vw->x = lvw->x;
vw->y = lvw->y;
vw->width = lvw->width;
vw->height = lvw->height;
vw->chromakey = lvw->chromakey;
vw->flags = lvw->flags;
vw->clips = PTRIN(lvw->clips); /* possible pointer size conversion */
vw->clipcount = lvw->clipcount;
return (0);
}
static int
bsd_to_linux_v4l_window(struct video_window *vw, struct l_video_window *lvw)
{
lvw->x = vw->x;
lvw->y = vw->y;
lvw->width = vw->width;
lvw->height = vw->height;
lvw->chromakey = vw->chromakey;
lvw->flags = vw->flags;
lvw->clips = PTROUT(vw->clips); /* possible pointer size conversion */
lvw->clipcount = vw->clipcount;
return (0);
}
static int
linux_to_bsd_v4l_buffer(struct l_video_buffer *lvb, struct video_buffer *vb)
{
vb->base = PTRIN(lvb->base); /* possible pointer size conversion */
vb->height = lvb->height;
vb->width = lvb->width;
vb->depth = lvb->depth;
vb->bytesperline = lvb->bytesperline;
return (0);
}
static int
bsd_to_linux_v4l_buffer(struct video_buffer *vb, struct l_video_buffer *lvb)
{
lvb->base = PTROUT(vb->base); /* possible pointer size conversion */
lvb->height = vb->height;
lvb->width = vb->width;
lvb->depth = vb->depth;
lvb->bytesperline = vb->bytesperline;
return (0);
}
static int
linux_to_bsd_v4l_code(struct l_video_code *lvc, struct video_code *vc)
{
strlcpy(vc->loadwhat, lvc->loadwhat, LINUX_VIDEO_CODE_LOADWHAT_SIZE);
vc->datasize = lvc->datasize;
vc->data = PTRIN(lvc->data); /* possible pointer size conversion */
return (0);
}
#ifdef COMPAT_LINUX_V4L_CLIPLIST
static int
linux_v4l_clip_copy(void *lvc, struct video_clip **ppvc)
{
int error;
struct video_clip vclip;
struct l_video_clip l_vclip;
error = copyin(lvc, &l_vclip, sizeof(l_vclip));
if (error) return (error);
linux_to_bsd_v4l_clip(&l_vclip, &vclip);
/* XXX: If there can be no concurrency: s/M_NOWAIT/M_WAITOK/ */
if ((*ppvc = malloc(sizeof(**ppvc), M_LINUX, M_NOWAIT)) == NULL)
return (ENOMEM); /* XXX: linux has no ENOMEM here */
memcpy(*ppvc, &vclip, sizeof(vclip));
(*ppvc)->next = NULL;
return (0);
}
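/*
* Free a clip list previously built by linux_v4l_cliplist_copy().
*/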
static int
linux_v4l_cliplist_free(struct video_window *vw)
{
struct video_clip **ppvc;
struct video_clip **ppvc_next;
for (ppvc = &(vw->clips); *ppvc != NULL; ppvc = ppvc_next) {
ppvc_next = &((*ppvc)->next);
free(*ppvc, M_LINUX);
}
vw->clips = NULL;
return (0);
}
static int
linux_v4l_cliplist_copy(struct l_video_window *lvw, struct video_window *vw)
{
int error;
int clipcount;
void *plvc;
struct video_clip **ppvc;
/*
* XXX: The cliplist is used to pass in a list of clipping
* rectangles or, if clipcount == VIDEO_CLIP_BITMAP, a
* clipping bitmap. Some Linux apps, however, appear to
* leave cliplist and clips uninitialized. In any case,
* the cliplist is not used by pwc(4), at the time of
* writing, FreeBSD's only V4L driver. When a driver
* that uses the cliplist is developed, this code may
* need re-examination.
*/
error = 0;
clipcount = vw->clipcount;
if (clipcount == VIDEO_CLIP_BITMAP) {
/*
* In this case, the pointer (clips) is overloaded
* to be a "void *" to a bitmap, therefore there
* is no struct video_clip to copy now.
*/
} else if (clipcount > 0 && clipcount <= 16384) {
/*
* Clips points to list of clip rectangles, so
* copy the list.
*
* XXX: Upper limit of 16384 was used here to try to
* avoid cases when clipcount and clips pointer
* are uninitialized and therefore have high random
* values, as is the case in the Linux Skype
* application. The value 16384 was chosen as that
* is what is used in the Linux stradis(4) MPEG
* decoder driver, the only place we found an
* example of cliplist use.
*/
plvc = PTRIN(lvw->clips);
vw->clips = NULL;
ppvc = &(vw->clips);
while (clipcount-- > 0) {
if (plvc == 0) {
error = EFAULT;
break;
} else {
error = linux_v4l_clip_copy(plvc, ppvc);
if (error) {
linux_v4l_cliplist_free(vw);
break;
}
}
ppvc = &((*ppvc)->next);
plvc = PTRIN(((struct l_video_clip *) plvc)->next);
}
} else {
/*
* clipcount == 0 or negative (but not VIDEO_CLIP_BITMAP)
* Force cliplist to null.
*/
vw->clipcount = 0;
vw->clips = NULL;
}
return (error);
}
#endif
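/*
* Handler for the V4L (video4linux version 1) ioctl range. Requests whose
* argument layout matches the native one are renamed and passed through;
* tuner, window, frame buffer and microcode requests carry pointers or
* differ in layout and are translated explicitly.
*/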
static int
linux_ioctl_v4l(struct thread *td, struct linux_ioctl_args *args)
{
struct file *fp;
int error;
struct video_tuner vtun;
struct video_window vwin;
struct video_buffer vbuf;
struct video_code vcode;
struct l_video_tuner l_vtun;
struct l_video_window l_vwin;
struct l_video_buffer l_vbuf;
struct l_video_code l_vcode;
switch (args->cmd & 0xffff) {
case LINUX_VIDIOCGCAP: args->cmd = VIDIOCGCAP; break;
case LINUX_VIDIOCGCHAN: args->cmd = VIDIOCGCHAN; break;
case LINUX_VIDIOCSCHAN: args->cmd = VIDIOCSCHAN; break;
case LINUX_VIDIOCGTUNER:
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0)
return (error);
error = copyin((void *) args->arg, &l_vtun, sizeof(l_vtun));
if (error) {
fdrop(fp, td);
return (error);
}
linux_to_bsd_v4l_tuner(&l_vtun, &vtun);
error = fo_ioctl(fp, VIDIOCGTUNER, &vtun, td->td_ucred, td);
if (!error) {
bsd_to_linux_v4l_tuner(&vtun, &l_vtun);
error = copyout(&l_vtun, (void *) args->arg,
sizeof(l_vtun));
}
fdrop(fp, td);
return (error);
case LINUX_VIDIOCSTUNER:
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0)
return (error);
error = copyin((void *) args->arg, &l_vtun, sizeof(l_vtun));
if (error) {
fdrop(fp, td);
return (error);
}
linux_to_bsd_v4l_tuner(&l_vtun, &vtun);
error = fo_ioctl(fp, VIDIOCSTUNER, &vtun, td->td_ucred, td);
fdrop(fp, td);
return (error);
case LINUX_VIDIOCGPICT: args->cmd = VIDIOCGPICT; break;
case LINUX_VIDIOCSPICT: args->cmd = VIDIOCSPICT; break;
case LINUX_VIDIOCCAPTURE: args->cmd = VIDIOCCAPTURE; break;
case LINUX_VIDIOCGWIN:
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0)
return (error);
error = fo_ioctl(fp, VIDIOCGWIN, &vwin, td->td_ucred, td);
if (!error) {
bsd_to_linux_v4l_window(&vwin, &l_vwin);
error = copyout(&l_vwin, (void *) args->arg,
sizeof(l_vwin));
}
fdrop(fp, td);
return (error);
case LINUX_VIDIOCSWIN:
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0)
return (error);
error = copyin((void *) args->arg, &l_vwin, sizeof(l_vwin));
if (error) {
fdrop(fp, td);
return (error);
}
linux_to_bsd_v4l_window(&l_vwin, &vwin);
#ifdef COMPAT_LINUX_V4L_CLIPLIST
error = linux_v4l_cliplist_copy(&l_vwin, &vwin);
if (error) {
fdrop(fp, td);
return (error);
}
#endif
error = fo_ioctl(fp, VIDIOCSWIN, &vwin, td->td_ucred, td);
fdrop(fp, td);
#ifdef COMPAT_LINUX_V4L_CLIPLIST
linux_v4l_cliplist_free(&vwin);
#endif
return (error);
case LINUX_VIDIOCGFBUF:
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0)
return (error);
error = fo_ioctl(fp, VIDIOCGFBUF, &vbuf, td->td_ucred, td);
if (!error) {
bsd_to_linux_v4l_buffer(&vbuf, &l_vbuf);
error = copyout(&l_vbuf, (void *) args->arg,
sizeof(l_vbuf));
}
fdrop(fp, td);
return (error);
case LINUX_VIDIOCSFBUF:
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0)
return (error);
error = copyin((void *) args->arg, &l_vbuf, sizeof(l_vbuf));
if (error) {
fdrop(fp, td);
return (error);
}
linux_to_bsd_v4l_buffer(&l_vbuf, &vbuf);
error = fo_ioctl(fp, VIDIOCSFBUF, &vbuf, td->td_ucred, td);
fdrop(fp, td);
return (error);
case LINUX_VIDIOCKEY: args->cmd = VIDIOCKEY; break;
case LINUX_VIDIOCGFREQ: args->cmd = VIDIOCGFREQ; break;
case LINUX_VIDIOCSFREQ: args->cmd = VIDIOCSFREQ; break;
case LINUX_VIDIOCGAUDIO: args->cmd = VIDIOCGAUDIO; break;
case LINUX_VIDIOCSAUDIO: args->cmd = VIDIOCSAUDIO; break;
case LINUX_VIDIOCSYNC: args->cmd = VIDIOCSYNC; break;
case LINUX_VIDIOCMCAPTURE: args->cmd = VIDIOCMCAPTURE; break;
case LINUX_VIDIOCGMBUF: args->cmd = VIDIOCGMBUF; break;
case LINUX_VIDIOCGUNIT: args->cmd = VIDIOCGUNIT; break;
case LINUX_VIDIOCGCAPTURE: args->cmd = VIDIOCGCAPTURE; break;
case LINUX_VIDIOCSCAPTURE: args->cmd = VIDIOCSCAPTURE; break;
case LINUX_VIDIOCSPLAYMODE: args->cmd = VIDIOCSPLAYMODE; break;
case LINUX_VIDIOCSWRITEMODE: args->cmd = VIDIOCSWRITEMODE; break;
case LINUX_VIDIOCGPLAYINFO: args->cmd = VIDIOCGPLAYINFO; break;
case LINUX_VIDIOCSMICROCODE:
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0)
return (error);
error = copyin((void *) args->arg, &l_vcode, sizeof(l_vcode));
if (error) {
fdrop(fp, td);
return (error);
}
linux_to_bsd_v4l_code(&l_vcode, &vcode);
error = fo_ioctl(fp, VIDIOCSMICROCODE, &vcode, td->td_ucred, td);
fdrop(fp, td);
return (error);
case LINUX_VIDIOCGVBIFMT: args->cmd = VIDIOCGVBIFMT; break;
case LINUX_VIDIOCSVBIFMT: args->cmd = VIDIOCSVBIFMT; break;
default: return (ENOIOCTL);
}
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
return (error);
}
/*
* Special ioctl handler
*/
static int
linux_ioctl_special(struct thread *td, struct linux_ioctl_args *args)
{
int error;
switch (args->cmd) {
case LINUX_SIOCGIFADDR:
args->cmd = SIOCGIFADDR;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCSIFADDR:
args->cmd = SIOCSIFADDR;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCGIFFLAGS:
args->cmd = SIOCGIFFLAGS;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
break;
default:
error = ENOIOCTL;
}
return (error);
}
static int
linux_to_bsd_v4l2_standard(struct l_v4l2_standard *lvstd, struct v4l2_standard *vstd)
{
vstd->index = lvstd->index;
vstd->id = lvstd->id;
memcpy(&vstd->name, &lvstd->name, sizeof(*lvstd) - offsetof(struct l_v4l2_standard, name));
return (0);
}
static int
bsd_to_linux_v4l2_standard(struct v4l2_standard *vstd, struct l_v4l2_standard *lvstd)
{
lvstd->index = vstd->index;
lvstd->id = vstd->id;
memcpy(&lvstd->name, &vstd->name, sizeof(*lvstd) - offsetof(struct l_v4l2_standard, name));
return (0);
}
static int
linux_to_bsd_v4l2_buffer(struct l_v4l2_buffer *lvb, struct v4l2_buffer *vb)
{
vb->index = lvb->index;
vb->type = lvb->type;
vb->bytesused = lvb->bytesused;
vb->flags = lvb->flags;
vb->field = lvb->field;
vb->timestamp.tv_sec = lvb->timestamp.tv_sec;
vb->timestamp.tv_usec = lvb->timestamp.tv_usec;
memcpy(&vb->timecode, &lvb->timecode, sizeof (lvb->timecode));
vb->sequence = lvb->sequence;
vb->memory = lvb->memory;
if (lvb->memory == V4L2_MEMORY_USERPTR)
/* possible pointer size conversion */
vb->m.userptr = (unsigned long)PTRIN(lvb->m.userptr);
else
vb->m.offset = lvb->m.offset;
vb->length = lvb->length;
vb->input = lvb->input;
vb->reserved = lvb->reserved;
return (0);
}
static int
bsd_to_linux_v4l2_buffer(struct v4l2_buffer *vb, struct l_v4l2_buffer *lvb)
{
lvb->index = vb->index;
lvb->type = vb->type;
lvb->bytesused = vb->bytesused;
lvb->flags = vb->flags;
lvb->field = vb->field;
lvb->timestamp.tv_sec = vb->timestamp.tv_sec;
lvb->timestamp.tv_usec = vb->timestamp.tv_usec;
memcpy(&lvb->timecode, &vb->timecode, sizeof (vb->timecode));
lvb->sequence = vb->sequence;
lvb->memory = vb->memory;
if (vb->memory == V4L2_MEMORY_USERPTR)
/* possible pointer size conversion */
lvb->m.userptr = PTROUT(vb->m.userptr);
else
lvb->m.offset = vb->m.offset;
lvb->length = vb->length;
lvb->input = vb->input;
lvb->reserved = vb->reserved;
return (0);
}
static int
linux_to_bsd_v4l2_format(struct l_v4l2_format *lvf, struct v4l2_format *vf)
{
vf->type = lvf->type;
if (lvf->type == V4L2_BUF_TYPE_VIDEO_OVERLAY
#ifdef V4L2_BUF_TYPE_VIDEO_OUTPUT_OVERLAY
|| lvf->type == V4L2_BUF_TYPE_VIDEO_OUTPUT_OVERLAY
#endif
)
/*
* XXX TODO - needs 32 -> 64 bit conversion:
* (unused by webcams?)
*/
return EINVAL;
memcpy(&vf->fmt, &lvf->fmt, sizeof(vf->fmt));
return 0;
}
static int
bsd_to_linux_v4l2_format(struct v4l2_format *vf, struct l_v4l2_format *lvf)
{
lvf->type = vf->type;
if (vf->type == V4L2_BUF_TYPE_VIDEO_OVERLAY
#ifdef V4L2_BUF_TYPE_VIDEO_OUTPUT_OVERLAY
|| vf->type == V4L2_BUF_TYPE_VIDEO_OUTPUT_OVERLAY
#endif
)
/*
* XXX TODO - needs 32 -> 64 bit conversion:
* (unused by webcams?)
*/
return EINVAL;
memcpy(&lvf->fmt, &vf->fmt, sizeof(vf->fmt));
return 0;
}
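/*
* Handler for the V4L2 ioctl range. Most requests only need the direction
* bits of the command rewritten to the native encoding; formats, standards,
* inputs and buffers differ in layout and are converted explicitly.
*/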
static int
linux_ioctl_v4l2(struct thread *td, struct linux_ioctl_args *args)
{
struct file *fp;
int error;
struct v4l2_format vformat;
struct l_v4l2_format l_vformat;
struct v4l2_standard vstd;
struct l_v4l2_standard l_vstd;
struct l_v4l2_buffer l_vbuf;
struct v4l2_buffer vbuf;
struct v4l2_input vinp;
switch (args->cmd & 0xffff) {
case LINUX_VIDIOC_RESERVED:
case LINUX_VIDIOC_LOG_STATUS:
if ((args->cmd & IOC_DIRMASK) != LINUX_IOC_VOID)
return ENOIOCTL;
args->cmd = (args->cmd & 0xffff) | IOC_VOID;
break;
case LINUX_VIDIOC_OVERLAY:
case LINUX_VIDIOC_STREAMON:
case LINUX_VIDIOC_STREAMOFF:
case LINUX_VIDIOC_S_STD:
case LINUX_VIDIOC_S_TUNER:
case LINUX_VIDIOC_S_AUDIO:
case LINUX_VIDIOC_S_AUDOUT:
case LINUX_VIDIOC_S_MODULATOR:
case LINUX_VIDIOC_S_FREQUENCY:
case LINUX_VIDIOC_S_CROP:
case LINUX_VIDIOC_S_JPEGCOMP:
case LINUX_VIDIOC_S_PRIORITY:
case LINUX_VIDIOC_DBG_S_REGISTER:
case LINUX_VIDIOC_S_HW_FREQ_SEEK:
case LINUX_VIDIOC_SUBSCRIBE_EVENT:
case LINUX_VIDIOC_UNSUBSCRIBE_EVENT:
args->cmd = (args->cmd & ~IOC_DIRMASK) | IOC_IN;
break;
case LINUX_VIDIOC_QUERYCAP:
case LINUX_VIDIOC_G_STD:
case LINUX_VIDIOC_G_AUDIO:
case LINUX_VIDIOC_G_INPUT:
case LINUX_VIDIOC_G_OUTPUT:
case LINUX_VIDIOC_G_AUDOUT:
case LINUX_VIDIOC_G_JPEGCOMP:
case LINUX_VIDIOC_QUERYSTD:
case LINUX_VIDIOC_G_PRIORITY:
case LINUX_VIDIOC_QUERY_DV_PRESET:
args->cmd = (args->cmd & ~IOC_DIRMASK) | IOC_OUT;
break;
case LINUX_VIDIOC_ENUM_FMT:
case LINUX_VIDIOC_REQBUFS:
case LINUX_VIDIOC_G_PARM:
case LINUX_VIDIOC_S_PARM:
case LINUX_VIDIOC_G_CTRL:
case LINUX_VIDIOC_S_CTRL:
case LINUX_VIDIOC_G_TUNER:
case LINUX_VIDIOC_QUERYCTRL:
case LINUX_VIDIOC_QUERYMENU:
case LINUX_VIDIOC_S_INPUT:
case LINUX_VIDIOC_S_OUTPUT:
case LINUX_VIDIOC_ENUMOUTPUT:
case LINUX_VIDIOC_G_MODULATOR:
case LINUX_VIDIOC_G_FREQUENCY:
case LINUX_VIDIOC_CROPCAP:
case LINUX_VIDIOC_G_CROP:
case LINUX_VIDIOC_ENUMAUDIO:
case LINUX_VIDIOC_ENUMAUDOUT:
case LINUX_VIDIOC_G_SLICED_VBI_CAP:
#ifdef VIDIOC_ENUM_FRAMESIZES
case LINUX_VIDIOC_ENUM_FRAMESIZES:
case LINUX_VIDIOC_ENUM_FRAMEINTERVALS:
case LINUX_VIDIOC_ENCODER_CMD:
case LINUX_VIDIOC_TRY_ENCODER_CMD:
#endif
case LINUX_VIDIOC_DBG_G_REGISTER:
case LINUX_VIDIOC_DBG_G_CHIP_IDENT:
case LINUX_VIDIOC_ENUM_DV_PRESETS:
case LINUX_VIDIOC_S_DV_PRESET:
case LINUX_VIDIOC_G_DV_PRESET:
case LINUX_VIDIOC_S_DV_TIMINGS:
case LINUX_VIDIOC_G_DV_TIMINGS:
args->cmd = (args->cmd & ~IOC_DIRMASK) | IOC_INOUT;
break;
case LINUX_VIDIOC_G_FMT:
case LINUX_VIDIOC_S_FMT:
case LINUX_VIDIOC_TRY_FMT:
error = copyin((void *)args->arg, &l_vformat, sizeof(l_vformat));
if (error)
return (error);
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0)
return (error);
if (linux_to_bsd_v4l2_format(&l_vformat, &vformat) != 0)
error = EINVAL;
else if ((args->cmd & 0xffff) == LINUX_VIDIOC_G_FMT)
error = fo_ioctl(fp, VIDIOC_G_FMT, &vformat,
td->td_ucred, td);
else if ((args->cmd & 0xffff) == LINUX_VIDIOC_S_FMT)
error = fo_ioctl(fp, VIDIOC_S_FMT, &vformat,
td->td_ucred, td);
else
error = fo_ioctl(fp, VIDIOC_TRY_FMT, &vformat,
td->td_ucred, td);
bsd_to_linux_v4l2_format(&vformat, &l_vformat);
copyout(&l_vformat, (void *)args->arg, sizeof(l_vformat));
fdrop(fp, td);
return (error);
case LINUX_VIDIOC_ENUMSTD:
error = copyin((void *)args->arg, &l_vstd, sizeof(l_vstd));
if (error)
return (error);
linux_to_bsd_v4l2_standard(&l_vstd, &vstd);
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0)
return (error);
error = fo_ioctl(fp, VIDIOC_ENUMSTD, (caddr_t)&vstd,
td->td_ucred, td);
if (error) {
fdrop(fp, td);
return (error);
}
bsd_to_linux_v4l2_standard(&vstd, &l_vstd);
error = copyout(&l_vstd, (void *)args->arg, sizeof(l_vstd));
fdrop(fp, td);
return (error);
case LINUX_VIDIOC_ENUMINPUT:
/*
* The Linux struct l_v4l2_input differs only in size;
* it has no padding at the end.
*/
error = copyin((void *)args->arg, &vinp,
sizeof(struct l_v4l2_input));
if (error != 0)
return (error);
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0)
return (error);
error = fo_ioctl(fp, VIDIOC_ENUMINPUT, (caddr_t)&vinp,
td->td_ucred, td);
if (error) {
fdrop(fp, td);
return (error);
}
error = copyout(&vinp, (void *)args->arg,
sizeof(struct l_v4l2_input));
fdrop(fp, td);
return (error);
case LINUX_VIDIOC_QUERYBUF:
case LINUX_VIDIOC_QBUF:
case LINUX_VIDIOC_DQBUF:
error = copyin((void *)args->arg, &l_vbuf, sizeof(l_vbuf));
if (error)
return (error);
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0)
return (error);
linux_to_bsd_v4l2_buffer(&l_vbuf, &vbuf);
if ((args->cmd & 0xffff) == LINUX_VIDIOC_QUERYBUF)
error = fo_ioctl(fp, VIDIOC_QUERYBUF, &vbuf,
td->td_ucred, td);
else if ((args->cmd & 0xffff) == LINUX_VIDIOC_QBUF)
error = fo_ioctl(fp, VIDIOC_QBUF, &vbuf,
td->td_ucred, td);
else
error = fo_ioctl(fp, VIDIOC_DQBUF, &vbuf,
td->td_ucred, td);
bsd_to_linux_v4l2_buffer(&vbuf, &l_vbuf);
copyout(&l_vbuf, (void *)args->arg, sizeof(l_vbuf));
fdrop(fp, td);
return (error);
/*
* XXX TODO - these need 32 -> 64 bit conversion:
* (are any of them needed for webcams?)
*/
case LINUX_VIDIOC_G_FBUF:
case LINUX_VIDIOC_S_FBUF:
case LINUX_VIDIOC_G_EXT_CTRLS:
case LINUX_VIDIOC_S_EXT_CTRLS:
case LINUX_VIDIOC_TRY_EXT_CTRLS:
case LINUX_VIDIOC_DQEVENT:
default: return (ENOIOCTL);
}
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
return (error);
}
/*
* Support for emulators/linux-libusb. This port uses FBSD_LUSB* macros
* instead of USB* ones. This lets us provide correct values for cmd.
* 0xffffffe0 -- 0xffffffff range seemed to be the least collision-prone.
*/
static int
linux_ioctl_fbsd_usb(struct thread *td, struct linux_ioctl_args *args)
{
int error;
error = 0;
switch (args->cmd) {
case FBSD_LUSB_DEVICEENUMERATE:
args->cmd = USB_DEVICEENUMERATE;
break;
case FBSD_LUSB_DEV_QUIRK_ADD:
args->cmd = USB_DEV_QUIRK_ADD;
break;
case FBSD_LUSB_DEV_QUIRK_GET:
args->cmd = USB_DEV_QUIRK_GET;
break;
case FBSD_LUSB_DEV_QUIRK_REMOVE:
args->cmd = USB_DEV_QUIRK_REMOVE;
break;
case FBSD_LUSB_DO_REQUEST:
args->cmd = USB_DO_REQUEST;
break;
case FBSD_LUSB_FS_CLEAR_STALL_SYNC:
args->cmd = USB_FS_CLEAR_STALL_SYNC;
break;
case FBSD_LUSB_FS_CLOSE:
args->cmd = USB_FS_CLOSE;
break;
case FBSD_LUSB_FS_COMPLETE:
args->cmd = USB_FS_COMPLETE;
break;
case FBSD_LUSB_FS_INIT:
args->cmd = USB_FS_INIT;
break;
case FBSD_LUSB_FS_OPEN:
args->cmd = USB_FS_OPEN;
break;
case FBSD_LUSB_FS_START:
args->cmd = USB_FS_START;
break;
case FBSD_LUSB_FS_STOP:
args->cmd = USB_FS_STOP;
break;
case FBSD_LUSB_FS_UNINIT:
args->cmd = USB_FS_UNINIT;
break;
case FBSD_LUSB_GET_CONFIG:
args->cmd = USB_GET_CONFIG;
break;
case FBSD_LUSB_GET_DEVICEINFO:
args->cmd = USB_GET_DEVICEINFO;
break;
case FBSD_LUSB_GET_DEVICE_DESC:
args->cmd = USB_GET_DEVICE_DESC;
break;
case FBSD_LUSB_GET_FULL_DESC:
args->cmd = USB_GET_FULL_DESC;
break;
case FBSD_LUSB_GET_IFACE_DRIVER:
args->cmd = USB_GET_IFACE_DRIVER;
break;
case FBSD_LUSB_GET_PLUGTIME:
args->cmd = USB_GET_PLUGTIME;
break;
case FBSD_LUSB_GET_POWER_MODE:
args->cmd = USB_GET_POWER_MODE;
break;
case FBSD_LUSB_GET_REPORT_DESC:
args->cmd = USB_GET_REPORT_DESC;
break;
case FBSD_LUSB_GET_REPORT_ID:
args->cmd = USB_GET_REPORT_ID;
break;
case FBSD_LUSB_GET_TEMPLATE:
args->cmd = USB_GET_TEMPLATE;
break;
case FBSD_LUSB_IFACE_DRIVER_ACTIVE:
args->cmd = USB_IFACE_DRIVER_ACTIVE;
break;
case FBSD_LUSB_IFACE_DRIVER_DETACH:
args->cmd = USB_IFACE_DRIVER_DETACH;
break;
case FBSD_LUSB_QUIRK_NAME_GET:
args->cmd = USB_QUIRK_NAME_GET;
break;
case FBSD_LUSB_READ_DIR:
args->cmd = USB_READ_DIR;
break;
case FBSD_LUSB_SET_ALTINTERFACE:
args->cmd = USB_SET_ALTINTERFACE;
break;
case FBSD_LUSB_SET_CONFIG:
args->cmd = USB_SET_CONFIG;
break;
case FBSD_LUSB_SET_IMMED:
args->cmd = USB_SET_IMMED;
break;
case FBSD_LUSB_SET_POWER_MODE:
args->cmd = USB_SET_POWER_MODE;
break;
case FBSD_LUSB_SET_TEMPLATE:
args->cmd = USB_SET_TEMPLATE;
break;
default:
error = ENOIOCTL;
}
if (error != ENOIOCTL)
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
return (error);
}
/*
* main ioctl syscall function
*/
int
linux_ioctl(struct thread *td, struct linux_ioctl_args *args)
{
struct file *fp;
struct handler_element *he;
int error, cmd;
#ifdef DEBUG
if (ldebug(ioctl))
printf(ARGS(ioctl, "%d, %04lx, *"), args->fd,
(unsigned long)args->cmd);
#endif
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0)
return (error);
if ((fp->f_flag & (FREAD|FWRITE)) == 0) {
fdrop(fp, td);
return (EBADF);
}
/* Iterate over the ioctl handlers */
cmd = args->cmd & 0xffff;
sx_slock(&linux_ioctl_sx);
mtx_lock(&Giant);
TAILQ_FOREACH(he, &handlers, list) {
if (cmd >= he->low && cmd <= he->high) {
error = (*he->func)(td, args);
if (error != ENOIOCTL) {
mtx_unlock(&Giant);
sx_sunlock(&linux_ioctl_sx);
fdrop(fp, td);
return (error);
}
}
}
mtx_unlock(&Giant);
sx_sunlock(&linux_ioctl_sx);
fdrop(fp, td);
linux_msg(td, "ioctl fd=%d, cmd=0x%x ('%c',%d) is not implemented",
args->fd, (int)(args->cmd & 0xffff),
(int)(args->cmd & 0xff00) >> 8, (int)(args->cmd & 0xff));
return (EINVAL);
}
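/*
* Register an ioctl range handler. The handler list is kept sorted by
* ascending span, so linux_ioctl() consults the handler covering the
* narrowest command range first.
*/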
int
linux_ioctl_register_handler(struct linux_ioctl_handler *h)
{
struct handler_element *he, *cur;
if (h == NULL || h->func == NULL)
return (EINVAL);
/*
* Reuse the element if the handler is already on the list, otherwise
* create a new element.
*/
sx_xlock(&linux_ioctl_sx);
TAILQ_FOREACH(he, &handlers, list) {
if (he->func == h->func)
break;
}
if (he == NULL) {
he = malloc(sizeof(*he),
M_LINUX, M_WAITOK);
he->func = h->func;
} else
TAILQ_REMOVE(&handlers, he, list);
/* Initialize range information. */
he->low = h->low;
he->high = h->high;
he->span = h->high - h->low + 1;
/* Add the element to the list, sorted on span. */
TAILQ_FOREACH(cur, &handlers, list) {
if (cur->span > he->span) {
TAILQ_INSERT_BEFORE(cur, he, list);
sx_xunlock(&linux_ioctl_sx);
return (0);
}
}
TAILQ_INSERT_TAIL(&handlers, he, list);
sx_xunlock(&linux_ioctl_sx);
return (0);
}
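linux_ioctl_register_handler() keeps the handler list sorted by ascending span, so the handler covering the narrowest range that contains a command gets the first chance when linux_ioctl() walks the list. A standalone sketch of the same sorted insertion with a plain singly linked list (illustrative only; the kernel uses a TAILQ and struct handler_element):
#include <stdio.h>
#include <stdlib.h>

struct range {
	int low, high, span;
	struct range *next;
};

/* Insert before the first node with a strictly larger span. */
static struct range *
insert_sorted(struct range *head, int low, int high)
{
	struct range *r, **pp;

	if ((r = malloc(sizeof(*r))) == NULL)
		return (head);
	r->low = low;
	r->high = high;
	r->span = high - low + 1;
	for (pp = &head; *pp != NULL && (*pp)->span <= r->span;
	    pp = &(*pp)->next)
		;
	r->next = *pp;
	*pp = r;
	return (head);
}

int
main(void)
{
	struct range *head = NULL, *r;

	head = insert_sorted(head, 0x5400, 0x54ff);	/* wide range */
	head = insert_sorted(head, 0x5450, 0x5460);	/* narrow range, ends up first */
	for (r = head; r != NULL; r = r->next)
		printf("0x%x-0x%x span %d\n", r->low, r->high, r->span);
	return (0);
}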
int
linux_ioctl_unregister_handler(struct linux_ioctl_handler *h)
{
struct handler_element *he;
if (h == NULL || h->func == NULL)
return (EINVAL);
sx_xlock(&linux_ioctl_sx);
TAILQ_FOREACH(he, &handlers, list) {
if (he->func == h->func) {
TAILQ_REMOVE(&handlers, he, list);
sx_xunlock(&linux_ioctl_sx);
free(he, M_LINUX);
return (0);
}
}
sx_xunlock(&linux_ioctl_sx);
return (EINVAL);
}
Index: head/sys/compat/linux/linux_ipc.c
===================================================================
--- head/sys/compat/linux/linux_ipc.c (revision 225616)
+++ head/sys/compat/linux/linux_ipc.c (revision 225617)
@@ -1,899 +1,899 @@
/*-
* Copyright (c) 1994-1995 Søren Schmidt
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>
#include <sys/proc.h>
#include <sys/limits.h>
#include <sys/msg.h>
#include <sys/sem.h>
#include <sys/shm.h>
#include "opt_compat.h"
#ifdef COMPAT_LINUX32
#include <machine/../linux32/linux.h>
#include <machine/../linux32/linux32_proto.h>
#include <machine/../linux32/linux32_ipc64.h>
#else
#include <machine/../linux/linux.h>
#include <machine/../linux/linux_proto.h>
#include <machine/../linux/linux_ipc64.h>
#endif
#include <compat/linux/linux_ipc.h>
#include <compat/linux/linux_util.h>
struct l_seminfo {
l_int semmap;
l_int semmni;
l_int semmns;
l_int semmnu;
l_int semmsl;
l_int semopm;
l_int semume;
l_int semusz;
l_int semvmx;
l_int semaem;
};
struct l_shminfo {
l_int shmmax;
l_int shmmin;
l_int shmmni;
l_int shmseg;
l_int shmall;
};
struct l_shm_info {
l_int used_ids;
l_ulong shm_tot; /* total allocated shm */
l_ulong shm_rss; /* total resident shm */
l_ulong shm_swp; /* total swapped shm */
l_ulong swap_attempts;
l_ulong swap_successes;
};
struct l_msginfo {
l_int msgpool;
l_int msgmap;
l_int msgmax;
l_int msgmnb;
l_int msgmni;
l_int msgssz;
l_int msgtql;
l_ushort msgseg;
};
static void
bsd_to_linux_shminfo( struct shminfo *bpp, struct l_shminfo *lpp)
{
lpp->shmmax = bpp->shmmax;
lpp->shmmin = bpp->shmmin;
lpp->shmmni = bpp->shmmni;
lpp->shmseg = bpp->shmseg;
lpp->shmall = bpp->shmall;
}
static void
bsd_to_linux_shm_info( struct shm_info *bpp, struct l_shm_info *lpp)
{
lpp->used_ids = bpp->used_ids ;
lpp->shm_tot = bpp->shm_tot ;
lpp->shm_rss = bpp->shm_rss ;
lpp->shm_swp = bpp->shm_swp ;
lpp->swap_attempts = bpp->swap_attempts ;
lpp->swap_successes = bpp->swap_successes ;
}
struct l_ipc_perm {
l_key_t key;
l_uid16_t uid;
l_gid16_t gid;
l_uid16_t cuid;
l_gid16_t cgid;
l_ushort mode;
l_ushort seq;
};
static void
linux_to_bsd_ipc_perm(struct l_ipc_perm *lpp, struct ipc_perm *bpp)
{
bpp->key = lpp->key;
bpp->uid = lpp->uid;
bpp->gid = lpp->gid;
bpp->cuid = lpp->cuid;
bpp->cgid = lpp->cgid;
bpp->mode = lpp->mode;
bpp->seq = lpp->seq;
}
static void
bsd_to_linux_ipc_perm(struct ipc_perm *bpp, struct l_ipc_perm *lpp)
{
lpp->key = bpp->key;
lpp->uid = bpp->uid;
lpp->gid = bpp->gid;
lpp->cuid = bpp->cuid;
lpp->cgid = bpp->cgid;
lpp->mode = bpp->mode;
lpp->seq = bpp->seq;
}
struct l_msqid_ds {
struct l_ipc_perm msg_perm;
l_uintptr_t msg_first; /* first message on queue, unused */
l_uintptr_t msg_last; /* last message in queue, unused */
l_time_t msg_stime; /* last msgsnd time */
l_time_t msg_rtime; /* last msgrcv time */
l_time_t msg_ctime; /* last change time */
l_ulong msg_lcbytes; /* Reuse junk fields for 32 bit */
l_ulong msg_lqbytes; /* ditto */
l_ushort msg_cbytes; /* current number of bytes on queue */
l_ushort msg_qnum; /* number of messages in queue */
l_ushort msg_qbytes; /* max number of bytes on queue */
l_pid_t msg_lspid; /* pid of last msgsnd */
l_pid_t msg_lrpid; /* last receive pid */
}
#if defined(__amd64__) && defined(COMPAT_LINUX32)
__packed
#endif
;
struct l_semid_ds {
struct l_ipc_perm sem_perm;
l_time_t sem_otime;
l_time_t sem_ctime;
l_uintptr_t sem_base;
l_uintptr_t sem_pending;
l_uintptr_t sem_pending_last;
l_uintptr_t undo;
l_ushort sem_nsems;
}
#if defined(__amd64__) && defined(COMPAT_LINUX32)
__packed
#endif
;
struct l_shmid_ds {
struct l_ipc_perm shm_perm;
l_int shm_segsz;
l_time_t shm_atime;
l_time_t shm_dtime;
l_time_t shm_ctime;
l_ushort shm_cpid;
l_ushort shm_lpid;
l_short shm_nattch;
l_ushort private1;
l_uintptr_t private2;
l_uintptr_t private3;
};
static void
linux_to_bsd_semid_ds(struct l_semid_ds *lsp, struct semid_ds *bsp)
{
linux_to_bsd_ipc_perm(&lsp->sem_perm, &bsp->sem_perm);
bsp->sem_otime = lsp->sem_otime;
bsp->sem_ctime = lsp->sem_ctime;
bsp->sem_nsems = lsp->sem_nsems;
bsp->sem_base = PTRIN(lsp->sem_base);
}
static void
bsd_to_linux_semid_ds(struct semid_ds *bsp, struct l_semid_ds *lsp)
{
bsd_to_linux_ipc_perm(&bsp->sem_perm, &lsp->sem_perm);
lsp->sem_otime = bsp->sem_otime;
lsp->sem_ctime = bsp->sem_ctime;
lsp->sem_nsems = bsp->sem_nsems;
lsp->sem_base = PTROUT(bsp->sem_base);
}
static void
linux_to_bsd_shmid_ds(struct l_shmid_ds *lsp, struct shmid_ds *bsp)
{
linux_to_bsd_ipc_perm(&lsp->shm_perm, &bsp->shm_perm);
bsp->shm_segsz = lsp->shm_segsz;
bsp->shm_lpid = lsp->shm_lpid;
bsp->shm_cpid = lsp->shm_cpid;
bsp->shm_nattch = lsp->shm_nattch;
bsp->shm_atime = lsp->shm_atime;
bsp->shm_dtime = lsp->shm_dtime;
bsp->shm_ctime = lsp->shm_ctime;
}
static void
bsd_to_linux_shmid_ds(struct shmid_ds *bsp, struct l_shmid_ds *lsp)
{
bsd_to_linux_ipc_perm(&bsp->shm_perm, &lsp->shm_perm);
if (bsp->shm_segsz > INT_MAX)
lsp->shm_segsz = INT_MAX;
else
lsp->shm_segsz = bsp->shm_segsz;
lsp->shm_lpid = bsp->shm_lpid;
lsp->shm_cpid = bsp->shm_cpid;
if (bsp->shm_nattch > SHRT_MAX)
lsp->shm_nattch = SHRT_MAX;
else
lsp->shm_nattch = bsp->shm_nattch;
lsp->shm_atime = bsp->shm_atime;
lsp->shm_dtime = bsp->shm_dtime;
lsp->shm_ctime = bsp->shm_ctime;
lsp->private3 = 0;
}
static void
linux_to_bsd_msqid_ds(struct l_msqid_ds *lsp, struct msqid_ds *bsp)
{
linux_to_bsd_ipc_perm(&lsp->msg_perm, &bsp->msg_perm);
bsp->msg_cbytes = lsp->msg_cbytes;
bsp->msg_qnum = lsp->msg_qnum;
bsp->msg_qbytes = lsp->msg_qbytes;
bsp->msg_lspid = lsp->msg_lspid;
bsp->msg_lrpid = lsp->msg_lrpid;
bsp->msg_stime = lsp->msg_stime;
bsp->msg_rtime = lsp->msg_rtime;
bsp->msg_ctime = lsp->msg_ctime;
}
static void
bsd_to_linux_msqid_ds(struct msqid_ds *bsp, struct l_msqid_ds *lsp)
{
bsd_to_linux_ipc_perm(&bsp->msg_perm, &lsp->msg_perm);
lsp->msg_cbytes = bsp->msg_cbytes;
lsp->msg_qnum = bsp->msg_qnum;
lsp->msg_qbytes = bsp->msg_qbytes;
lsp->msg_lspid = bsp->msg_lspid;
lsp->msg_lrpid = bsp->msg_lrpid;
lsp->msg_stime = bsp->msg_stime;
lsp->msg_rtime = bsp->msg_rtime;
lsp->msg_ctime = bsp->msg_ctime;
}
static void
linux_ipc_perm_to_ipc64_perm(struct l_ipc_perm *in, struct l_ipc64_perm *out)
{
/* XXX: do we really need to do something here? */
out->key = in->key;
out->uid = in->uid;
out->gid = in->gid;
out->cuid = in->cuid;
out->cgid = in->cgid;
out->mode = in->mode;
out->seq = in->seq;
}
static int
linux_msqid_pullup(l_int ver, struct l_msqid_ds *linux_msqid, caddr_t uaddr)
{
struct l_msqid64_ds linux_msqid64;
int error;
if (ver == LINUX_IPC_64) {
error = copyin(uaddr, &linux_msqid64, sizeof(linux_msqid64));
if (error != 0)
return (error);
bzero(linux_msqid, sizeof(*linux_msqid));
linux_msqid->msg_perm.uid = linux_msqid64.msg_perm.uid;
linux_msqid->msg_perm.gid = linux_msqid64.msg_perm.gid;
linux_msqid->msg_perm.mode = linux_msqid64.msg_perm.mode;
if (linux_msqid64.msg_qbytes > USHRT_MAX)
linux_msqid->msg_lqbytes = linux_msqid64.msg_qbytes;
else
linux_msqid->msg_qbytes = linux_msqid64.msg_qbytes;
} else
error = copyin(uaddr, linux_msqid, sizeof(*linux_msqid));
return (error);
}
static int
linux_msqid_pushdown(l_int ver, struct l_msqid_ds *linux_msqid, caddr_t uaddr)
{
struct l_msqid64_ds linux_msqid64;
if (ver == LINUX_IPC_64) {
bzero(&linux_msqid64, sizeof(linux_msqid64));
linux_ipc_perm_to_ipc64_perm(&linux_msqid->msg_perm,
&linux_msqid64.msg_perm);
linux_msqid64.msg_stime = linux_msqid->msg_stime;
linux_msqid64.msg_rtime = linux_msqid->msg_rtime;
linux_msqid64.msg_ctime = linux_msqid->msg_ctime;
if (linux_msqid->msg_cbytes == 0)
linux_msqid64.msg_cbytes = linux_msqid->msg_lcbytes;
else
linux_msqid64.msg_cbytes = linux_msqid->msg_cbytes;
linux_msqid64.msg_qnum = linux_msqid->msg_qnum;
if (linux_msqid->msg_qbytes == 0)
linux_msqid64.msg_qbytes = linux_msqid->msg_lqbytes;
else
linux_msqid64.msg_qbytes = linux_msqid->msg_qbytes;
linux_msqid64.msg_lspid = linux_msqid->msg_lspid;
linux_msqid64.msg_lrpid = linux_msqid->msg_lrpid;
return (copyout(&linux_msqid64, uaddr, sizeof(linux_msqid64)));
} else
return (copyout(linux_msqid, uaddr, sizeof(*linux_msqid)));
}
static int
linux_semid_pullup(l_int ver, struct l_semid_ds *linux_semid, caddr_t uaddr)
{
struct l_semid64_ds linux_semid64;
int error;
if (ver == LINUX_IPC_64) {
error = copyin(uaddr, &linux_semid64, sizeof(linux_semid64));
if (error != 0)
return (error);
bzero(linux_semid, sizeof(*linux_semid));
linux_semid->sem_perm.uid = linux_semid64.sem_perm.uid;
linux_semid->sem_perm.gid = linux_semid64.sem_perm.gid;
linux_semid->sem_perm.mode = linux_semid64.sem_perm.mode;
} else
error = copyin(uaddr, linux_semid, sizeof(*linux_semid));
return (error);
}
static int
linux_semid_pushdown(l_int ver, struct l_semid_ds *linux_semid, caddr_t uaddr)
{
struct l_semid64_ds linux_semid64;
if (ver == LINUX_IPC_64) {
bzero(&linux_semid64, sizeof(linux_semid64));
linux_ipc_perm_to_ipc64_perm(&linux_semid->sem_perm,
&linux_semid64.sem_perm);
linux_semid64.sem_otime = linux_semid->sem_otime;
linux_semid64.sem_ctime = linux_semid->sem_ctime;
linux_semid64.sem_nsems = linux_semid->sem_nsems;
return (copyout(&linux_semid64, uaddr, sizeof(linux_semid64)));
} else
return (copyout(linux_semid, uaddr, sizeof(*linux_semid)));
}
static int
linux_shmid_pullup(l_int ver, struct l_shmid_ds *linux_shmid, caddr_t uaddr)
{
struct l_shmid64_ds linux_shmid64;
int error;
if (ver == LINUX_IPC_64) {
error = copyin(uaddr, &linux_shmid64, sizeof(linux_shmid64));
if (error != 0)
return (error);
bzero(linux_shmid, sizeof(*linux_shmid));
linux_shmid->shm_perm.uid = linux_shmid64.shm_perm.uid;
linux_shmid->shm_perm.gid = linux_shmid64.shm_perm.gid;
linux_shmid->shm_perm.mode = linux_shmid64.shm_perm.mode;
} else
error = copyin(uaddr, linux_shmid, sizeof(*linux_shmid));
return (error);
}
static int
linux_shmid_pushdown(l_int ver, struct l_shmid_ds *linux_shmid, caddr_t uaddr)
{
struct l_shmid64_ds linux_shmid64;
/*
* XXX: This is backwards and loses information in shm_nattch
* and shm_segsz. We should probably either expose the BSD
* shmid structure directly and convert it to either the
* non-64 or 64 variant directly or the code should always
* convert to the 64 variant and then truncate values into the
* non-64 variant if needed since the 64 variant has more
* precision.
*/
if (ver == LINUX_IPC_64) {
bzero(&linux_shmid64, sizeof(linux_shmid64));
linux_ipc_perm_to_ipc64_perm(&linux_shmid->shm_perm,
&linux_shmid64.shm_perm);
linux_shmid64.shm_segsz = linux_shmid->shm_segsz;
linux_shmid64.shm_atime = linux_shmid->shm_atime;
linux_shmid64.shm_dtime = linux_shmid->shm_dtime;
linux_shmid64.shm_ctime = linux_shmid->shm_ctime;
linux_shmid64.shm_cpid = linux_shmid->shm_cpid;
linux_shmid64.shm_lpid = linux_shmid->shm_lpid;
linux_shmid64.shm_nattch = linux_shmid->shm_nattch;
return (copyout(&linux_shmid64, uaddr, sizeof(linux_shmid64)));
} else
return (copyout(linux_shmid, uaddr, sizeof(*linux_shmid)));
}
static int
linux_shminfo_pushdown(l_int ver, struct l_shminfo *linux_shminfo,
caddr_t uaddr)
{
struct l_shminfo64 linux_shminfo64;
if (ver == LINUX_IPC_64) {
bzero(&linux_shminfo64, sizeof(linux_shminfo64));
linux_shminfo64.shmmax = linux_shminfo->shmmax;
linux_shminfo64.shmmin = linux_shminfo->shmmin;
linux_shminfo64.shmmni = linux_shminfo->shmmni;
linux_shminfo64.shmseg = linux_shminfo->shmseg;
linux_shminfo64.shmall = linux_shminfo->shmall;
return (copyout(&linux_shminfo64, uaddr,
sizeof(linux_shminfo64)));
} else
return (copyout(linux_shminfo, uaddr, sizeof(*linux_shminfo)));
}
int
linux_semop(struct thread *td, struct linux_semop_args *args)
{
struct semop_args /* {
int semid;
struct sembuf *sops;
int nsops;
} */ bsd_args;
bsd_args.semid = args->semid;
bsd_args.sops = PTRIN(args->tsops);
bsd_args.nsops = args->nsops;
- return (semop(td, &bsd_args));
+ return (sys_semop(td, &bsd_args));
}
int
linux_semget(struct thread *td, struct linux_semget_args *args)
{
struct semget_args /* {
key_t key;
int nsems;
int semflg;
} */ bsd_args;
if (args->nsems < 0)
return (EINVAL);
bsd_args.key = args->key;
bsd_args.nsems = args->nsems;
bsd_args.semflg = args->semflg;
- return (semget(td, &bsd_args));
+ return (sys_semget(td, &bsd_args));
}
int
linux_semctl(struct thread *td, struct linux_semctl_args *args)
{
struct l_semid_ds linux_semid;
struct l_seminfo linux_seminfo;
struct semid_ds semid;
union semun semun;
register_t rval;
int cmd, error;
switch (args->cmd & ~LINUX_IPC_64) {
case LINUX_IPC_RMID:
cmd = IPC_RMID;
break;
case LINUX_GETNCNT:
cmd = GETNCNT;
break;
case LINUX_GETPID:
cmd = GETPID;
break;
case LINUX_GETVAL:
cmd = GETVAL;
break;
case LINUX_GETZCNT:
cmd = GETZCNT;
break;
case LINUX_SETVAL:
cmd = SETVAL;
semun.val = args->arg.val;
break;
case LINUX_IPC_SET:
cmd = IPC_SET;
error = linux_semid_pullup(args->cmd & LINUX_IPC_64,
&linux_semid, PTRIN(args->arg.buf));
if (error)
return (error);
linux_to_bsd_semid_ds(&linux_semid, &semid);
semun.buf = &semid;
return (kern_semctl(td, args->semid, args->semnum, cmd, &semun,
td->td_retval));
case LINUX_IPC_STAT:
case LINUX_SEM_STAT:
if ((args->cmd & ~LINUX_IPC_64) == LINUX_IPC_STAT)
cmd = IPC_STAT;
else
cmd = SEM_STAT;
semun.buf = &semid;
error = kern_semctl(td, args->semid, args->semnum, cmd, &semun,
&rval);
if (error)
return (error);
bsd_to_linux_semid_ds(&semid, &linux_semid);
error = linux_semid_pushdown(args->cmd & LINUX_IPC_64,
&linux_semid, PTRIN(args->arg.buf));
if (error == 0)
td->td_retval[0] = (cmd == SEM_STAT) ? rval : 0;
return (error);
case LINUX_IPC_INFO:
case LINUX_SEM_INFO:
bcopy(&seminfo, &linux_seminfo.semmni, sizeof(linux_seminfo) -
sizeof(linux_seminfo.semmap) );
/*
* Linux does not use the semmap field but populates it with
* the defined value from SEMMAP, which really is redefined to
* SEMMNS, which they define as SEMMNI * SEMMSL. Try to
* simulate this by returning our dynamic semmns value.
*/
linux_seminfo.semmap = linux_seminfo.semmns;
/* XXX BSD equivalent?
#define used_semids 10
#define used_sems 10
linux_seminfo.semusz = used_semids;
linux_seminfo.semaem = used_sems;
*/
error = copyout(&linux_seminfo,
PTRIN(args->arg.buf), sizeof(linux_seminfo));
if (error)
return (error);
td->td_retval[0] = seminfo.semmni;
return (0); /* No need for __semctl call */
case LINUX_GETALL:
cmd = GETALL;
semun.val = args->arg.val;
break;
case LINUX_SETALL:
cmd = SETALL;
semun.val = args->arg.val;
break;
default:
linux_msg(td, "ipc type %d is not implemented",
args->cmd & ~LINUX_IPC_64);
return (EINVAL);
}
return (kern_semctl(td, args->semid, args->semnum, cmd, &semun,
td->td_retval));
}
int
linux_msgsnd(struct thread *td, struct linux_msgsnd_args *args)
{
const void *msgp;
long mtype;
l_long lmtype;
int error;
if ((l_long)args->msgsz < 0 || args->msgsz > (l_long)msginfo.msgmax)
return (EINVAL);
msgp = PTRIN(args->msgp);
if ((error = copyin(msgp, &lmtype, sizeof(lmtype))) != 0)
return (error);
mtype = (long)lmtype;
return (kern_msgsnd(td, args->msqid,
(const char *)msgp + sizeof(lmtype),
args->msgsz, args->msgflg, mtype));
}
int
linux_msgrcv(struct thread *td, struct linux_msgrcv_args *args)
{
void *msgp;
long mtype;
l_long lmtype;
int error;
if ((l_long)args->msgsz < 0 || args->msgsz > (l_long)msginfo.msgmax)
return (EINVAL);
msgp = PTRIN(args->msgp);
if ((error = kern_msgrcv(td, args->msqid,
(char *)msgp + sizeof(lmtype), args->msgsz,
args->msgtyp, args->msgflg, &mtype)) != 0)
return (error);
lmtype = (l_long)mtype;
return (copyout(&lmtype, msgp, sizeof(lmtype)));
}
int
linux_msgget(struct thread *td, struct linux_msgget_args *args)
{
struct msgget_args /* {
key_t key;
int msgflg;
} */ bsd_args;
bsd_args.key = args->key;
bsd_args.msgflg = args->msgflg;
- return (msgget(td, &bsd_args));
+ return (sys_msgget(td, &bsd_args));
}
int
linux_msgctl(struct thread *td, struct linux_msgctl_args *args)
{
int error, bsd_cmd;
struct l_msqid_ds linux_msqid;
struct msqid_ds bsd_msqid;
bsd_cmd = args->cmd & ~LINUX_IPC_64;
switch (bsd_cmd) {
case LINUX_IPC_INFO:
case LINUX_MSG_INFO: {
struct l_msginfo linux_msginfo;
/*
* XXX MSG_INFO uses the same data structure but returns different
* dynamic counters in msgpool, msgmap, and msgtql fields.
*/
linux_msginfo.msgpool = (long)msginfo.msgmni *
(long)msginfo.msgmnb / 1024L; /* XXX MSG_INFO. */
linux_msginfo.msgmap = msginfo.msgmnb; /* XXX MSG_INFO. */
linux_msginfo.msgmax = msginfo.msgmax;
linux_msginfo.msgmnb = msginfo.msgmnb;
linux_msginfo.msgmni = msginfo.msgmni;
linux_msginfo.msgssz = msginfo.msgssz;
linux_msginfo.msgtql = msginfo.msgtql; /* XXX MSG_INFO. */
linux_msginfo.msgseg = msginfo.msgseg;
error = copyout(&linux_msginfo, PTRIN(args->buf),
sizeof(linux_msginfo));
if (error == 0)
td->td_retval[0] = msginfo.msgmni; /* XXX */
return (error);
}
/*
* TODO: implement this
* case LINUX_MSG_STAT:
*/
case LINUX_IPC_STAT:
/* NOTHING */
break;
case LINUX_IPC_SET:
error = linux_msqid_pullup(args->cmd & LINUX_IPC_64,
&linux_msqid, PTRIN(args->buf));
if (error)
return (error);
linux_to_bsd_msqid_ds(&linux_msqid, &bsd_msqid);
break;
case LINUX_IPC_RMID:
/* NOTHING */
break;
default:
return (EINVAL);
break;
}
error = kern_msgctl(td, args->msqid, bsd_cmd, &bsd_msqid);
if (error != 0)
if (bsd_cmd != LINUX_IPC_RMID || error != EINVAL)
return (error);
if (bsd_cmd == LINUX_IPC_STAT) {
bsd_to_linux_msqid_ds(&bsd_msqid, &linux_msqid);
return (linux_msqid_pushdown(args->cmd & LINUX_IPC_64,
&linux_msqid, PTRIN(args->buf)));
}
return (0);
}
int
linux_shmat(struct thread *td, struct linux_shmat_args *args)
{
struct shmat_args /* {
int shmid;
void *shmaddr;
int shmflg;
} */ bsd_args;
int error;
#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
l_uintptr_t addr;
#endif
bsd_args.shmid = args->shmid;
bsd_args.shmaddr = PTRIN(args->shmaddr);
bsd_args.shmflg = args->shmflg;
- if ((error = shmat(td, &bsd_args)))
+ if ((error = sys_shmat(td, &bsd_args)))
return (error);
#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
addr = td->td_retval[0];
if ((error = copyout(&addr, PTRIN(args->raddr), sizeof(addr))))
return (error);
td->td_retval[0] = 0;
#endif
return (0);
}
int
linux_shmdt(struct thread *td, struct linux_shmdt_args *args)
{
struct shmdt_args /* {
void *shmaddr;
} */ bsd_args;
bsd_args.shmaddr = PTRIN(args->shmaddr);
- return (shmdt(td, &bsd_args));
+ return (sys_shmdt(td, &bsd_args));
}
int
linux_shmget(struct thread *td, struct linux_shmget_args *args)
{
struct shmget_args /* {
key_t key;
int size;
int shmflg;
} */ bsd_args;
bsd_args.key = args->key;
bsd_args.size = args->size;
bsd_args.shmflg = args->shmflg;
- return (shmget(td, &bsd_args));
+ return (sys_shmget(td, &bsd_args));
}
int
linux_shmctl(struct thread *td, struct linux_shmctl_args *args)
{
struct l_shmid_ds linux_shmid;
struct l_shminfo linux_shminfo;
struct l_shm_info linux_shm_info;
struct shmid_ds bsd_shmid;
int error;
switch (args->cmd & ~LINUX_IPC_64) {
case LINUX_IPC_INFO: {
struct shminfo bsd_shminfo;
/* Perform shmctl wanting removed segments lookup */
error = kern_shmctl(td, args->shmid, IPC_INFO,
(void *)&bsd_shminfo, NULL);
if (error)
return (error);
bsd_to_linux_shminfo(&bsd_shminfo, &linux_shminfo);
return (linux_shminfo_pushdown(args->cmd & LINUX_IPC_64,
&linux_shminfo, PTRIN(args->buf)));
}
case LINUX_SHM_INFO: {
struct shm_info bsd_shm_info;
/* Perform shmctl wanting removed segments lookup */
error = kern_shmctl(td, args->shmid, SHM_INFO,
(void *)&bsd_shm_info, NULL);
if (error)
return (error);
bsd_to_linux_shm_info(&bsd_shm_info, &linux_shm_info);
return (copyout(&linux_shm_info, PTRIN(args->buf),
sizeof(struct l_shm_info)));
}
case LINUX_IPC_STAT:
/* Perform shmctl wanting removed segments lookup */
error = kern_shmctl(td, args->shmid, IPC_STAT,
(void *)&bsd_shmid, NULL);
if (error)
return (error);
bsd_to_linux_shmid_ds(&bsd_shmid, &linux_shmid);
return (linux_shmid_pushdown(args->cmd & LINUX_IPC_64,
&linux_shmid, PTRIN(args->buf)));
case LINUX_SHM_STAT:
/* Perform shmctl wanting removed segments lookup */
error = kern_shmctl(td, args->shmid, IPC_STAT,
(void *)&bsd_shmid, NULL);
if (error)
return (error);
bsd_to_linux_shmid_ds(&bsd_shmid, &linux_shmid);
return (linux_shmid_pushdown(args->cmd & LINUX_IPC_64,
&linux_shmid, PTRIN(args->buf)));
case LINUX_IPC_SET:
error = linux_shmid_pullup(args->cmd & LINUX_IPC_64,
&linux_shmid, PTRIN(args->buf));
if (error)
return (error);
linux_to_bsd_shmid_ds(&linux_shmid, &bsd_shmid);
/* Perform shmctl wanting removed segments lookup */
return (kern_shmctl(td, args->shmid, IPC_SET,
(void *)&bsd_shmid, NULL));
case LINUX_IPC_RMID: {
void *buf;
if (args->buf == 0)
buf = NULL;
else {
error = linux_shmid_pullup(args->cmd & LINUX_IPC_64,
&linux_shmid, PTRIN(args->buf));
if (error)
return (error);
linux_to_bsd_shmid_ds(&linux_shmid, &bsd_shmid);
buf = (void *)&bsd_shmid;
}
return (kern_shmctl(td, args->shmid, IPC_RMID, buf, NULL));
}
case LINUX_SHM_LOCK:
/* FALLTHROUGH */
case LINUX_SHM_UNLOCK:
/* FALLTHROUGH */
default:
linux_msg(td, "ipc type %d not implemented",
args->cmd & ~LINUX_IPC_64);
return (EINVAL);
}
}
MODULE_DEPEND(linux, sysvmsg, 1, 1, 1);
MODULE_DEPEND(linux, sysvsem, 1, 1, 1);
MODULE_DEPEND(linux, sysvshm, 1, 1, 1);
Index: head/sys/compat/linux/linux_misc.c
===================================================================
--- head/sys/compat/linux/linux_misc.c (revision 225616)
+++ head/sys/compat/linux/linux_misc.c (revision 225617)
@@ -1,1926 +1,1926 @@
/*-
* Copyright (c) 2002 Doug Rabson
* Copyright (c) 1994-1995 Søren Schmidt
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include <sys/param.h>
#include <sys/blist.h>
#include <sys/fcntl.h>
#if defined(__i386__)
#include <sys/imgact_aout.h>
#endif
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/reboot.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <sys/wait.h>
#include <sys/cpuset.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/swap_pager.h>
#ifdef COMPAT_LINUX32
#include <machine/../linux32/linux.h>
#include <machine/../linux32/linux32_proto.h>
#else
#include <machine/../linux/linux.h>
#include <machine/../linux/linux_proto.h>
#endif
#include <compat/linux/linux_file.h>
#include <compat/linux/linux_mib.h>
#include <compat/linux/linux_signal.h>
#include <compat/linux/linux_util.h>
#include <compat/linux/linux_sysproto.h>
#include <compat/linux/linux_emul.h>
#include <compat/linux/linux_misc.h>
int stclohz; /* Statistics clock frequency */
static unsigned int linux_to_bsd_resource[LINUX_RLIM_NLIMITS] = {
RLIMIT_CPU, RLIMIT_FSIZE, RLIMIT_DATA, RLIMIT_STACK,
RLIMIT_CORE, RLIMIT_RSS, RLIMIT_NPROC, RLIMIT_NOFILE,
RLIMIT_MEMLOCK, RLIMIT_AS
};
struct l_sysinfo {
l_long uptime; /* Seconds since boot */
l_ulong loads[3]; /* 1, 5, and 15 minute load averages */
#define LINUX_SYSINFO_LOADS_SCALE 65536
l_ulong totalram; /* Total usable main memory size */
l_ulong freeram; /* Available memory size */
l_ulong sharedram; /* Amount of shared memory */
l_ulong bufferram; /* Memory used by buffers */
l_ulong totalswap; /* Total swap space size */
l_ulong freeswap; /* swap space still available */
l_ushort procs; /* Number of current processes */
l_ushort pads;
l_ulong totalbig;
l_ulong freebig;
l_uint mem_unit;
char _f[20-2*sizeof(l_long)-sizeof(l_int)]; /* padding */
};
int
linux_sysinfo(struct thread *td, struct linux_sysinfo_args *args)
{
struct l_sysinfo sysinfo;
vm_object_t object;
int i, j;
struct timespec ts;
getnanouptime(&ts);
if (ts.tv_nsec != 0)
ts.tv_sec++;
sysinfo.uptime = ts.tv_sec;
/* Use the information from the mib to get our load averages */
for (i = 0; i < 3; i++)
sysinfo.loads[i] = averunnable.ldavg[i] *
LINUX_SYSINFO_LOADS_SCALE / averunnable.fscale;
sysinfo.totalram = physmem * PAGE_SIZE;
sysinfo.freeram = sysinfo.totalram - cnt.v_wire_count * PAGE_SIZE;
sysinfo.sharedram = 0;
mtx_lock(&vm_object_list_mtx);
TAILQ_FOREACH(object, &vm_object_list, object_list)
if (object->shadow_count > 1)
sysinfo.sharedram += object->resident_page_count;
mtx_unlock(&vm_object_list_mtx);
sysinfo.sharedram *= PAGE_SIZE;
sysinfo.bufferram = 0;
swap_pager_status(&i, &j);
sysinfo.totalswap = i * PAGE_SIZE;
sysinfo.freeswap = (i - j) * PAGE_SIZE;
sysinfo.procs = nprocs;
/* The following are only present in newer Linux kernels. */
sysinfo.totalbig = 0;
sysinfo.freebig = 0;
sysinfo.mem_unit = 1;
return (copyout(&sysinfo, args->info, sizeof(sysinfo)));
}
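The loads[] conversion above rescales FreeBSD's fixed-point load averages (scaled by averunnable.fscale) into the 1/65536 fixed-point format Linux sysinfo(2) expects. A standalone arithmetic sketch, assuming an fscale of 2048 purely for illustration:
#include <stdio.h>

#define LINUX_SYSINFO_LOADS_SCALE 65536UL

int
main(void)
{
	unsigned long fscale = 2048;	/* assumed FreeBSD FSCALE */
	unsigned long ldavg = 1536;	/* a load of 0.75 in units of fscale */
	unsigned long linux_load;

	/* Same rescaling as the loads[] loop in linux_sysinfo(). */
	linux_load = ldavg * LINUX_SYSINFO_LOADS_SCALE / fscale;
	printf("load %.2f -> sysinfo value %lu\n",
	    (double)ldavg / fscale, linux_load);
	return (0);
}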
int
linux_alarm(struct thread *td, struct linux_alarm_args *args)
{
struct itimerval it, old_it;
u_int secs;
int error;
#ifdef DEBUG
if (ldebug(alarm))
printf(ARGS(alarm, "%u"), args->secs);
#endif
secs = args->secs;
if (secs > INT_MAX)
secs = INT_MAX;
it.it_value.tv_sec = (long) secs;
it.it_value.tv_usec = 0;
it.it_interval.tv_sec = 0;
it.it_interval.tv_usec = 0;
error = kern_setitimer(td, ITIMER_REAL, &it, &old_it);
if (error)
return (error);
if (timevalisset(&old_it.it_value)) {
if (old_it.it_value.tv_usec != 0)
old_it.it_value.tv_sec++;
td->td_retval[0] = old_it.it_value.tv_sec;
}
return (0);
}
int
linux_brk(struct thread *td, struct linux_brk_args *args)
{
struct vmspace *vm = td->td_proc->p_vmspace;
vm_offset_t new, old;
struct obreak_args /* {
char * nsize;
} */ tmp;
#ifdef DEBUG
if (ldebug(brk))
printf(ARGS(brk, "%p"), (void *)(uintptr_t)args->dsend);
#endif
old = (vm_offset_t)vm->vm_daddr + ctob(vm->vm_dsize);
new = (vm_offset_t)args->dsend;
tmp.nsize = (char *)new;
- if (((caddr_t)new > vm->vm_daddr) && !obreak(td, &tmp))
+ if (((caddr_t)new > vm->vm_daddr) && !sys_obreak(td, &tmp))
td->td_retval[0] = (long)new;
else
td->td_retval[0] = (long)old;
return (0);
}
#if defined(__i386__)
/* XXX: what about amd64/linux32? */
int
linux_uselib(struct thread *td, struct linux_uselib_args *args)
{
struct nameidata ni;
struct vnode *vp;
struct exec *a_out;
struct vattr attr;
vm_offset_t vmaddr;
unsigned long file_offset;
vm_offset_t buffer;
unsigned long bss_size;
char *library;
int error;
int locked, vfslocked;
LCONVPATHEXIST(td, args->library, &library);
#ifdef DEBUG
if (ldebug(uselib))
printf(ARGS(uselib, "%s"), library);
#endif
a_out = NULL;
vfslocked = 0;
locked = 0;
vp = NULL;
NDINIT(&ni, LOOKUP, ISOPEN | FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
UIO_SYSSPACE, library, td);
error = namei(&ni);
LFREEPATH(library);
if (error)
goto cleanup;
vp = ni.ni_vp;
vfslocked = NDHASGIANT(&ni);
NDFREE(&ni, NDF_ONLY_PNBUF);
/*
* From here on down, we have a locked vnode that must be unlocked.
* XXX: The code below largely duplicates exec_check_permissions().
*/
locked = 1;
/* Writable? */
if (vp->v_writecount) {
error = ETXTBSY;
goto cleanup;
}
/* Executable? */
error = VOP_GETATTR(vp, &attr, td->td_ucred);
if (error)
goto cleanup;
if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
((attr.va_mode & 0111) == 0) || (attr.va_type != VREG)) {
/* EACCES is what exec(2) returns. */
error = ENOEXEC;
goto cleanup;
}
/* Sensible size? */
if (attr.va_size == 0) {
error = ENOEXEC;
goto cleanup;
}
/* Can we access it? */
error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
if (error)
goto cleanup;
/*
* XXX: This should use vn_open() so that it is properly authorized,
* and to reduce code redundancy all over the place here.
* XXX: Not really, it duplicates far more of exec_check_permissions()
* than vn_open().
*/
#ifdef MAC
error = mac_vnode_check_open(td->td_ucred, vp, VREAD);
if (error)
goto cleanup;
#endif
error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL);
if (error)
goto cleanup;
/* Pull in executable header into kernel_map */
error = vm_mmap(kernel_map, (vm_offset_t *)&a_out, PAGE_SIZE,
VM_PROT_READ, VM_PROT_READ, 0, OBJT_VNODE, vp, 0);
if (error)
goto cleanup;
/* Is it a Linux binary? */
if (((a_out->a_magic >> 16) & 0xff) != 0x64) {
error = ENOEXEC;
goto cleanup;
}
/*
* While we are here, we should REALLY do some more checks
*/
/* Set file/virtual offset based on a.out variant. */
switch ((int)(a_out->a_magic & 0xffff)) {
case 0413: /* ZMAGIC */
file_offset = 1024;
break;
case 0314: /* QMAGIC */
file_offset = 0;
break;
default:
error = ENOEXEC;
goto cleanup;
}
bss_size = round_page(a_out->a_bss);
/* Check various fields in header for validity/bounds. */
if (a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK) {
error = ENOEXEC;
goto cleanup;
}
/* text + data can't exceed file size */
if (a_out->a_data + a_out->a_text > attr.va_size) {
error = EFAULT;
goto cleanup;
}
/*
* text/data/bss must not exceed limits
* XXX - this is not complete. It should check current usage PLUS
* the resources needed by this library.
*/
PROC_LOCK(td->td_proc);
if (a_out->a_text > maxtsiz ||
a_out->a_data + bss_size > lim_cur(td->td_proc, RLIMIT_DATA) ||
racct_set(td->td_proc, RACCT_DATA, a_out->a_data +
bss_size) != 0) {
PROC_UNLOCK(td->td_proc);
error = ENOMEM;
goto cleanup;
}
PROC_UNLOCK(td->td_proc);
/*
* Prevent more writers.
* XXX: Note that if any of the VM operations fail below we don't
* clear this flag.
*/
vp->v_vflag |= VV_TEXT;
/*
* Lock no longer needed
*/
locked = 0;
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
/*
* Check if file_offset is page aligned. Currently we cannot handle
* misaligned file offsets, and so we read in the entire image
* (what a waste).
*/
if (file_offset & PAGE_MASK) {
#ifdef DEBUG
printf("uselib: Non page aligned binary %lu\n", file_offset);
#endif
/* Map text+data read/write/execute */
/* a_entry is the load address and is page aligned */
vmaddr = trunc_page(a_out->a_entry);
/* get anon user mapping, read+write+execute */
error = vm_map_find(&td->td_proc->p_vmspace->vm_map, NULL, 0,
&vmaddr, a_out->a_text + a_out->a_data, FALSE, VM_PROT_ALL,
VM_PROT_ALL, 0);
if (error)
goto cleanup;
/* map file into kernel_map */
error = vm_mmap(kernel_map, &buffer,
round_page(a_out->a_text + a_out->a_data + file_offset),
VM_PROT_READ, VM_PROT_READ, 0, OBJT_VNODE, vp,
trunc_page(file_offset));
if (error)
goto cleanup;
/* copy from kernel VM space to user space */
error = copyout(PTRIN(buffer + file_offset),
(void *)vmaddr, a_out->a_text + a_out->a_data);
/* release temporary kernel space */
vm_map_remove(kernel_map, buffer, buffer +
round_page(a_out->a_text + a_out->a_data + file_offset));
if (error)
goto cleanup;
} else {
#ifdef DEBUG
printf("uselib: Page aligned binary %lu\n", file_offset);
#endif
/*
* for QMAGIC, a_entry is 20 bytes beyond the load address
* to skip the executable header
*/
vmaddr = trunc_page(a_out->a_entry);
/*
* Map it all into the process's space as a single
* copy-on-write "data" segment.
*/
error = vm_mmap(&td->td_proc->p_vmspace->vm_map, &vmaddr,
a_out->a_text + a_out->a_data, VM_PROT_ALL, VM_PROT_ALL,
MAP_PRIVATE | MAP_FIXED, OBJT_VNODE, vp, file_offset);
if (error)
goto cleanup;
}
#ifdef DEBUG
printf("mem=%08lx = %08lx %08lx\n", (long)vmaddr, ((long *)vmaddr)[0],
((long *)vmaddr)[1]);
#endif
if (bss_size != 0) {
/* Calculate BSS start address */
vmaddr = trunc_page(a_out->a_entry) + a_out->a_text +
a_out->a_data;
/* allocate some 'anon' space */
error = vm_map_find(&td->td_proc->p_vmspace->vm_map, NULL, 0,
&vmaddr, bss_size, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0);
if (error)
goto cleanup;
}
cleanup:
/* Unlock vnode if needed */
if (locked) {
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
}
/* Release the kernel mapping. */
if (a_out)
vm_map_remove(kernel_map, (vm_offset_t)a_out,
(vm_offset_t)a_out + PAGE_SIZE);
return (error);
}
#endif /* __i386__ */
int
linux_select(struct thread *td, struct linux_select_args *args)
{
l_timeval ltv;
struct timeval tv0, tv1, utv, *tvp;
int error;
#ifdef DEBUG
if (ldebug(select))
printf(ARGS(select, "%d, %p, %p, %p, %p"), args->nfds,
(void *)args->readfds, (void *)args->writefds,
(void *)args->exceptfds, (void *)args->timeout);
#endif
/*
* Store current time for computation of the amount of
* time left.
*/
if (args->timeout) {
if ((error = copyin(args->timeout, &ltv, sizeof(ltv))))
goto select_out;
utv.tv_sec = ltv.tv_sec;
utv.tv_usec = ltv.tv_usec;
#ifdef DEBUG
if (ldebug(select))
printf(LMSG("incoming timeout (%jd/%ld)"),
(intmax_t)utv.tv_sec, utv.tv_usec);
#endif
if (itimerfix(&utv)) {
/*
* The timeval was invalid. Convert it to something
* valid that will act as it does under Linux.
*/
utv.tv_sec += utv.tv_usec / 1000000;
utv.tv_usec %= 1000000;
if (utv.tv_usec < 0) {
utv.tv_sec -= 1;
utv.tv_usec += 1000000;
}
if (utv.tv_sec < 0)
timevalclear(&utv);
}
microtime(&tv0);
tvp = &utv;
} else
tvp = NULL;
error = kern_select(td, args->nfds, args->readfds, args->writefds,
args->exceptfds, tvp, sizeof(l_int) * 8);
#ifdef DEBUG
if (ldebug(select))
printf(LMSG("real select returns %d"), error);
#endif
if (error)
goto select_out;
if (args->timeout) {
if (td->td_retval[0]) {
/*
* Compute how much time was left of the timeout,
* by subtracting the current time and the time
* before we started the call, and subtracting
* that result from the user-supplied value.
*/
microtime(&tv1);
timevalsub(&tv1, &tv0);
timevalsub(&utv, &tv1);
if (utv.tv_sec < 0)
timevalclear(&utv);
} else
timevalclear(&utv);
#ifdef DEBUG
if (ldebug(select))
printf(LMSG("outgoing timeout (%jd/%ld)"),
(intmax_t)utv.tv_sec, utv.tv_usec);
#endif
ltv.tv_sec = utv.tv_sec;
ltv.tv_usec = utv.tv_usec;
if ((error = copyout(&ltv, args->timeout, sizeof(ltv))))
goto select_out;
}
select_out:
#ifdef DEBUG
if (ldebug(select))
printf(LMSG("select_out -> %d"), error);
#endif
return (error);
}
int
linux_mremap(struct thread *td, struct linux_mremap_args *args)
{
struct munmap_args /* {
void *addr;
size_t len;
} */ bsd_args;
int error = 0;
#ifdef DEBUG
if (ldebug(mremap))
printf(ARGS(mremap, "%p, %08lx, %08lx, %08lx"),
(void *)(uintptr_t)args->addr,
(unsigned long)args->old_len,
(unsigned long)args->new_len,
(unsigned long)args->flags);
#endif
if (args->flags & ~(LINUX_MREMAP_FIXED | LINUX_MREMAP_MAYMOVE)) {
td->td_retval[0] = 0;
return (EINVAL);
}
/*
* Check for the page alignment.
* Linux defines PAGE_MASK to be FreeBSD ~PAGE_MASK.
*/
if (args->addr & PAGE_MASK) {
td->td_retval[0] = 0;
return (EINVAL);
}
args->new_len = round_page(args->new_len);
args->old_len = round_page(args->old_len);
if (args->new_len > args->old_len) {
td->td_retval[0] = 0;
return (ENOMEM);
}
if (args->new_len < args->old_len) {
bsd_args.addr =
(caddr_t)((uintptr_t)args->addr + args->new_len);
bsd_args.len = args->old_len - args->new_len;
- error = munmap(td, &bsd_args);
+ error = sys_munmap(td, &bsd_args);
}
td->td_retval[0] = error ? 0 : (uintptr_t)args->addr;
return (error);
}
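linux_mremap() only rejects unaligned addresses and rounds both lengths up to a page boundary before deciding whether a trailing munmap() is needed. A standalone sketch of that alignment check and rounding, assuming a 4 KB page size purely for illustration (the kernel takes PAGE_MASK/round_page from the machine headers):
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (PAGE_SIZE - 1)	/* FreeBSD-style mask (Linux uses the complement) */
#define round_page(x) (((x) + PAGE_MASK) & ~PAGE_MASK)

int
main(void)
{
	unsigned long addr = 0x20001000UL;
	unsigned long old_len = 5000, new_len = 3000;

	if (addr & PAGE_MASK) {
		printf("EINVAL: address not page aligned\n");
		return (1);
	}
	old_len = round_page(old_len);	/* 5000 -> 8192 */
	new_len = round_page(new_len);	/* 3000 -> 4096 */
	if (new_len < old_len)
		printf("munmap %lu bytes at 0x%lx\n",
		    old_len - new_len, addr + new_len);
	return (0);
}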
#define LINUX_MS_ASYNC 0x0001
#define LINUX_MS_INVALIDATE 0x0002
#define LINUX_MS_SYNC 0x0004
int
linux_msync(struct thread *td, struct linux_msync_args *args)
{
struct msync_args bsd_args;
bsd_args.addr = (caddr_t)(uintptr_t)args->addr;
bsd_args.len = (uintptr_t)args->len;
bsd_args.flags = args->fl & ~LINUX_MS_SYNC;
- return (msync(td, &bsd_args));
+ return (sys_msync(td, &bsd_args));
}
int
linux_time(struct thread *td, struct linux_time_args *args)
{
struct timeval tv;
l_time_t tm;
int error;
#ifdef DEBUG
if (ldebug(time))
printf(ARGS(time, "*"));
#endif
microtime(&tv);
tm = tv.tv_sec;
if (args->tm && (error = copyout(&tm, args->tm, sizeof(tm))))
return (error);
td->td_retval[0] = tm;
return (0);
}
struct l_times_argv {
l_clock_t tms_utime;
l_clock_t tms_stime;
l_clock_t tms_cutime;
l_clock_t tms_cstime;
};
/*
* Glibc versions prior to 2.2.1 always use a hard-coded CLK_TCK value.
* Since 2.2.1 Glibc uses the value exported from the kernel via AT_CLKTCK
* auxiliary vector entry.
*/
#define CLK_TCK 100
#define CONVOTCK(r) (r.tv_sec * CLK_TCK + r.tv_usec / (1000000 / CLK_TCK))
#define CONVNTCK(r) (r.tv_sec * stclohz + r.tv_usec / (1000000 / stclohz))
#define CONVTCK(r) (linux_kernver(td) >= LINUX_KERNVER_2004000 ? \
CONVNTCK(r) : CONVOTCK(r))
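The CONVOTCK()/CONVNTCK() macros above turn a struct timeval into clock ticks, using the hard-coded CLK_TCK of 100 for pre-2.4 Linux ABIs and the exported statistics clock frequency (stclohz) for newer ones. A standalone sketch of the same conversion, using 128 as a purely illustrative stclohz value:
#include <stdio.h>
#include <sys/time.h>

static long
conv_tck(struct timeval tv, long hz)
{
	/* Same arithmetic as CONVOTCK()/CONVNTCK() above. */
	return (tv.tv_sec * hz + tv.tv_usec / (1000000 / hz));
}

int
main(void)
{
	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };

	printf("old ABI (CLK_TCK=100): %ld ticks\n", conv_tck(tv, 100));
	printf("new ABI (stclohz=128): %ld ticks\n", conv_tck(tv, 128));
	return (0);
}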
int
linux_times(struct thread *td, struct linux_times_args *args)
{
struct timeval tv, utime, stime, cutime, cstime;
struct l_times_argv tms;
struct proc *p;
int error;
#ifdef DEBUG
if (ldebug(times))
printf(ARGS(times, "*"));
#endif
if (args->buf != NULL) {
p = td->td_proc;
PROC_LOCK(p);
PROC_SLOCK(p);
calcru(p, &utime, &stime);
PROC_SUNLOCK(p);
calccru(p, &cutime, &cstime);
PROC_UNLOCK(p);
tms.tms_utime = CONVTCK(utime);
tms.tms_stime = CONVTCK(stime);
tms.tms_cutime = CONVTCK(cutime);
tms.tms_cstime = CONVTCK(cstime);
if ((error = copyout(&tms, args->buf, sizeof(tms))))
return (error);
}
microuptime(&tv);
td->td_retval[0] = (int)CONVTCK(tv);
return (0);
}
int
linux_newuname(struct thread *td, struct linux_newuname_args *args)
{
struct l_new_utsname utsname;
char osname[LINUX_MAX_UTSNAME];
char osrelease[LINUX_MAX_UTSNAME];
char *p;
#ifdef DEBUG
if (ldebug(newuname))
printf(ARGS(newuname, "*"));
#endif
linux_get_osname(td, osname);
linux_get_osrelease(td, osrelease);
bzero(&utsname, sizeof(utsname));
strlcpy(utsname.sysname, osname, LINUX_MAX_UTSNAME);
getcredhostname(td->td_ucred, utsname.nodename, LINUX_MAX_UTSNAME);
getcreddomainname(td->td_ucred, utsname.domainname, LINUX_MAX_UTSNAME);
strlcpy(utsname.release, osrelease, LINUX_MAX_UTSNAME);
strlcpy(utsname.version, version, LINUX_MAX_UTSNAME);
for (p = utsname.version; *p != '\0'; ++p)
if (*p == '\n') {
*p = '\0';
break;
}
strlcpy(utsname.machine, linux_platform, LINUX_MAX_UTSNAME);
return (copyout(&utsname, args->buf, sizeof(utsname)));
}
#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
struct l_utimbuf {
l_time_t l_actime;
l_time_t l_modtime;
};
int
linux_utime(struct thread *td, struct linux_utime_args *args)
{
struct timeval tv[2], *tvp;
struct l_utimbuf lut;
char *fname;
int error;
LCONVPATHEXIST(td, args->fname, &fname);
#ifdef DEBUG
if (ldebug(utime))
printf(ARGS(utime, "%s, *"), fname);
#endif
if (args->times) {
if ((error = copyin(args->times, &lut, sizeof lut))) {
LFREEPATH(fname);
return (error);
}
tv[0].tv_sec = lut.l_actime;
tv[0].tv_usec = 0;
tv[1].tv_sec = lut.l_modtime;
tv[1].tv_usec = 0;
tvp = tv;
} else
tvp = NULL;
error = kern_utimes(td, fname, UIO_SYSSPACE, tvp, UIO_SYSSPACE);
LFREEPATH(fname);
return (error);
}
int
linux_utimes(struct thread *td, struct linux_utimes_args *args)
{
l_timeval ltv[2];
struct timeval tv[2], *tvp = NULL;
char *fname;
int error;
LCONVPATHEXIST(td, args->fname, &fname);
#ifdef DEBUG
if (ldebug(utimes))
printf(ARGS(utimes, "%s, *"), fname);
#endif
if (args->tptr != NULL) {
if ((error = copyin(args->tptr, ltv, sizeof ltv))) {
LFREEPATH(fname);
return (error);
}
tv[0].tv_sec = ltv[0].tv_sec;
tv[0].tv_usec = ltv[0].tv_usec;
tv[1].tv_sec = ltv[1].tv_sec;
tv[1].tv_usec = ltv[1].tv_usec;
tvp = tv;
}
error = kern_utimes(td, fname, UIO_SYSSPACE, tvp, UIO_SYSSPACE);
LFREEPATH(fname);
return (error);
}
int
linux_futimesat(struct thread *td, struct linux_futimesat_args *args)
{
l_timeval ltv[2];
struct timeval tv[2], *tvp = NULL;
char *fname;
int error, dfd;
dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
LCONVPATHEXIST_AT(td, args->filename, &fname, dfd);
#ifdef DEBUG
if (ldebug(futimesat))
printf(ARGS(futimesat, "%s, *"), fname);
#endif
if (args->utimes != NULL) {
if ((error = copyin(args->utimes, ltv, sizeof ltv))) {
LFREEPATH(fname);
return (error);
}
tv[0].tv_sec = ltv[0].tv_sec;
tv[0].tv_usec = ltv[0].tv_usec;
tv[1].tv_sec = ltv[1].tv_sec;
tv[1].tv_usec = ltv[1].tv_usec;
tvp = tv;
}
error = kern_utimesat(td, dfd, fname, UIO_SYSSPACE, tvp, UIO_SYSSPACE);
LFREEPATH(fname);
return (error);
}
#endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
int
linux_common_wait(struct thread *td, int pid, int *status,
int options, struct rusage *ru)
{
int error, tmpstat;
error = kern_wait(td, pid, &tmpstat, options, ru);
if (error)
return (error);
if (status) {
tmpstat &= 0xffff;
if (WIFSIGNALED(tmpstat))
tmpstat = (tmpstat & 0xffffff80) |
BSD_TO_LINUX_SIGNAL(WTERMSIG(tmpstat));
else if (WIFSTOPPED(tmpstat))
tmpstat = (tmpstat & 0xffff00ff) |
(BSD_TO_LINUX_SIGNAL(WSTOPSIG(tmpstat)) << 8);
error = copyout(&tmpstat, status, sizeof(int));
}
return (error);
}
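linux_common_wait() keeps the layout of the wait(2) status word but overwrites the embedded signal number with its Linux equivalent: the low 7 bits for a terminated child, bits 8-15 for a stopped one. A standalone sketch of that masking (the identity signal mapping here is an illustrative stand-in for BSD_TO_LINUX_SIGNAL()):
#include <stdio.h>
#include <sys/wait.h>

int
main(void)
{
	int status = 0x0086;	/* killed by signal 6 with a core dump */
	int lsig = 6;		/* assume an identity mapping for the demo */

	if (WIFSIGNALED(status))
		status = (status & 0xffffff80) | lsig;
	else if (WIFSTOPPED(status))
		status = (status & 0xffff00ff) | (lsig << 8);
	printf("converted status 0x%04x\n", status);
	return (0);
}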
int
linux_waitpid(struct thread *td, struct linux_waitpid_args *args)
{
int options;
#ifdef DEBUG
if (ldebug(waitpid))
printf(ARGS(waitpid, "%d, %p, %d"),
args->pid, (void *)args->status, args->options);
#endif
/*
* This is necessary because the test in kern_wait() doesn't work,
* since we mess with the options here.
*/
if (args->options & ~(WUNTRACED | WNOHANG | WCONTINUED | __WCLONE))
return (EINVAL);
options = (args->options & (WNOHANG | WUNTRACED));
/* WLINUXCLONE should be equal to __WCLONE, but we make sure */
if (args->options & __WCLONE)
options |= WLINUXCLONE;
return (linux_common_wait(td, args->pid, args->status, options, NULL));
}
int
linux_mknod(struct thread *td, struct linux_mknod_args *args)
{
char *path;
int error;
LCONVPATHCREAT(td, args->path, &path);
#ifdef DEBUG
if (ldebug(mknod))
printf(ARGS(mknod, "%s, %d, %d"), path, args->mode, args->dev);
#endif
switch (args->mode & S_IFMT) {
case S_IFIFO:
case S_IFSOCK:
error = kern_mkfifo(td, path, UIO_SYSSPACE, args->mode);
break;
case S_IFCHR:
case S_IFBLK:
error = kern_mknod(td, path, UIO_SYSSPACE, args->mode,
args->dev);
break;
case S_IFDIR:
error = EPERM;
break;
case 0:
args->mode |= S_IFREG;
/* FALLTHROUGH */
case S_IFREG:
error = kern_open(td, path, UIO_SYSSPACE,
O_WRONLY | O_CREAT | O_TRUNC, args->mode);
if (error == 0)
kern_close(td, td->td_retval[0]);
break;
default:
error = EINVAL;
break;
}
LFREEPATH(path);
return (error);
}
int
linux_mknodat(struct thread *td, struct linux_mknodat_args *args)
{
char *path;
int error, dfd;
dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
LCONVPATHCREAT_AT(td, args->filename, &path, dfd);
#ifdef DEBUG
if (ldebug(mknodat))
printf(ARGS(mknodat, "%s, %d, %d"), path, args->mode, args->dev);
#endif
switch (args->mode & S_IFMT) {
case S_IFIFO:
case S_IFSOCK:
error = kern_mkfifoat(td, dfd, path, UIO_SYSSPACE, args->mode);
break;
case S_IFCHR:
case S_IFBLK:
error = kern_mknodat(td, dfd, path, UIO_SYSSPACE, args->mode,
args->dev);
break;
case S_IFDIR:
error = EPERM;
break;
case 0:
args->mode |= S_IFREG;
/* FALLTHROUGH */
case S_IFREG:
error = kern_openat(td, dfd, path, UIO_SYSSPACE,
O_WRONLY | O_CREAT | O_TRUNC, args->mode);
if (error == 0)
kern_close(td, td->td_retval[0]);
break;
default:
error = EINVAL;
break;
}
LFREEPATH(path);
return (error);
}
/*
* UGH! This is just about the dumbest idea I've ever heard!!
*/
int
linux_personality(struct thread *td, struct linux_personality_args *args)
{
#ifdef DEBUG
if (ldebug(personality))
printf(ARGS(personality, "%lu"), (unsigned long)args->per);
#endif
if (args->per != 0)
return (EINVAL);
/* Yes Jim, it's still a Linux... */
td->td_retval[0] = 0;
return (0);
}
struct l_itimerval {
l_timeval it_interval;
l_timeval it_value;
};
#define B2L_ITIMERVAL(bip, lip) \
(bip)->it_interval.tv_sec = (lip)->it_interval.tv_sec; \
(bip)->it_interval.tv_usec = (lip)->it_interval.tv_usec; \
(bip)->it_value.tv_sec = (lip)->it_value.tv_sec; \
(bip)->it_value.tv_usec = (lip)->it_value.tv_usec;
int
linux_setitimer(struct thread *td, struct linux_setitimer_args *uap)
{
int error;
struct l_itimerval ls;
struct itimerval aitv, oitv;
#ifdef DEBUG
if (ldebug(setitimer))
printf(ARGS(setitimer, "%p, %p"),
(void *)uap->itv, (void *)uap->oitv);
#endif
if (uap->itv == NULL) {
uap->itv = uap->oitv;
return (linux_getitimer(td, (struct linux_getitimer_args *)uap));
}
error = copyin(uap->itv, &ls, sizeof(ls));
if (error != 0)
return (error);
B2L_ITIMERVAL(&aitv, &ls);
#ifdef DEBUG
if (ldebug(setitimer)) {
printf("setitimer: value: sec: %jd, usec: %ld\n",
(intmax_t)aitv.it_value.tv_sec, aitv.it_value.tv_usec);
printf("setitimer: interval: sec: %jd, usec: %ld\n",
(intmax_t)aitv.it_interval.tv_sec, aitv.it_interval.tv_usec);
}
#endif
error = kern_setitimer(td, uap->which, &aitv, &oitv);
if (error != 0 || uap->oitv == NULL)
return (error);
B2L_ITIMERVAL(&ls, &oitv);
return (copyout(&ls, uap->oitv, sizeof(ls)));
}
int
linux_getitimer(struct thread *td, struct linux_getitimer_args *uap)
{
int error;
struct l_itimerval ls;
struct itimerval aitv;
#ifdef DEBUG
if (ldebug(getitimer))
printf(ARGS(getitimer, "%p"), (void *)uap->itv);
#endif
error = kern_getitimer(td, uap->which, &aitv);
if (error != 0)
return (error);
B2L_ITIMERVAL(&ls, &aitv);
return (copyout(&ls, uap->itv, sizeof(ls)));
}
int
linux_nice(struct thread *td, struct linux_nice_args *args)
{
struct setpriority_args bsd_args;
bsd_args.which = PRIO_PROCESS;
bsd_args.who = 0; /* current process */
bsd_args.prio = args->inc;
- return (setpriority(td, &bsd_args));
+ return (sys_setpriority(td, &bsd_args));
}
int
linux_setgroups(struct thread *td, struct linux_setgroups_args *args)
{
struct ucred *newcred, *oldcred;
l_gid_t *linux_gidset;
gid_t *bsd_gidset;
int ngrp, error;
struct proc *p;
ngrp = args->gidsetsize;
if (ngrp < 0 || ngrp >= ngroups_max + 1)
return (EINVAL);
linux_gidset = malloc(ngrp * sizeof(*linux_gidset), M_TEMP, M_WAITOK);
error = copyin(args->grouplist, linux_gidset, ngrp * sizeof(l_gid_t));
if (error)
goto out;
newcred = crget();
p = td->td_proc;
PROC_LOCK(p);
oldcred = crcopysafe(p, newcred);
/*
* cr_groups[0] holds egid. Setting the whole set from
* the supplied set will cause egid to be changed too.
* Keep cr_groups[0] unchanged to prevent that.
*/
if ((error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS, 0)) != 0) {
PROC_UNLOCK(p);
crfree(newcred);
goto out;
}
if (ngrp > 0) {
newcred->cr_ngroups = ngrp + 1;
bsd_gidset = newcred->cr_groups;
ngrp--;
while (ngrp >= 0) {
bsd_gidset[ngrp + 1] = linux_gidset[ngrp];
ngrp--;
}
} else
newcred->cr_ngroups = 1;
setsugid(p);
p->p_ucred = newcred;
PROC_UNLOCK(p);
crfree(oldcred);
error = 0;
out:
free(linux_gidset, M_TEMP);
return (error);
}
int
linux_getgroups(struct thread *td, struct linux_getgroups_args *args)
{
struct ucred *cred;
l_gid_t *linux_gidset;
gid_t *bsd_gidset;
int bsd_gidsetsz, ngrp, error;
cred = td->td_ucred;
bsd_gidset = cred->cr_groups;
bsd_gidsetsz = cred->cr_ngroups - 1;
/*
* cr_groups[0] holds egid. Returning the whole set
* here will cause a duplicate. Exclude cr_groups[0]
* to prevent that.
*/
if ((ngrp = args->gidsetsize) == 0) {
td->td_retval[0] = bsd_gidsetsz;
return (0);
}
if (ngrp < bsd_gidsetsz)
return (EINVAL);
ngrp = 0;
linux_gidset = malloc(bsd_gidsetsz * sizeof(*linux_gidset),
M_TEMP, M_WAITOK);
while (ngrp < bsd_gidsetsz) {
linux_gidset[ngrp] = bsd_gidset[ngrp + 1];
ngrp++;
}
error = copyout(linux_gidset, args->grouplist, ngrp * sizeof(l_gid_t));
free(linux_gidset, M_TEMP);
if (error)
return (error);
td->td_retval[0] = ngrp;
return (0);
}
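Both group-list handlers above work around cr_groups[0] holding the effective gid: setgroups copies the Linux list into slots 1..n and getgroups reports from slot 1 onward. A standalone sketch of that off-by-one copy (the gid values are arbitrary illustrations):
#include <stdio.h>

int
main(void)
{
	unsigned int cr_groups[8] = { 1000 };	/* slot 0: effective gid */
	unsigned int linux_gidset[3] = { 10, 20, 30 };
	int ngrp = 3, i;

	/* setgroups direction: keep cr_groups[0], fill slots 1..ngrp. */
	for (i = 0; i < ngrp; i++)
		cr_groups[i + 1] = linux_gidset[i];

	/* getgroups direction: skip cr_groups[0] when reporting. */
	for (i = 0; i < ngrp; i++)
		printf("supplementary group %u\n", cr_groups[i + 1]);
	return (0);
}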
int
linux_setrlimit(struct thread *td, struct linux_setrlimit_args *args)
{
struct rlimit bsd_rlim;
struct l_rlimit rlim;
u_int which;
int error;
#ifdef DEBUG
if (ldebug(setrlimit))
printf(ARGS(setrlimit, "%d, %p"),
args->resource, (void *)args->rlim);
#endif
if (args->resource >= LINUX_RLIM_NLIMITS)
return (EINVAL);
which = linux_to_bsd_resource[args->resource];
if (which == -1)
return (EINVAL);
error = copyin(args->rlim, &rlim, sizeof(rlim));
if (error)
return (error);
bsd_rlim.rlim_cur = (rlim_t)rlim.rlim_cur;
bsd_rlim.rlim_max = (rlim_t)rlim.rlim_max;
return (kern_setrlimit(td, which, &bsd_rlim));
}
int
linux_old_getrlimit(struct thread *td, struct linux_old_getrlimit_args *args)
{
struct l_rlimit rlim;
struct proc *p = td->td_proc;
struct rlimit bsd_rlim;
u_int which;
#ifdef DEBUG
if (ldebug(old_getrlimit))
printf(ARGS(old_getrlimit, "%d, %p"),
args->resource, (void *)args->rlim);
#endif
if (args->resource >= LINUX_RLIM_NLIMITS)
return (EINVAL);
which = linux_to_bsd_resource[args->resource];
if (which == -1)
return (EINVAL);
PROC_LOCK(p);
lim_rlimit(p, which, &bsd_rlim);
PROC_UNLOCK(p);
#ifdef COMPAT_LINUX32
rlim.rlim_cur = (unsigned int)bsd_rlim.rlim_cur;
if (rlim.rlim_cur == UINT_MAX)
rlim.rlim_cur = INT_MAX;
rlim.rlim_max = (unsigned int)bsd_rlim.rlim_max;
if (rlim.rlim_max == UINT_MAX)
rlim.rlim_max = INT_MAX;
#else
rlim.rlim_cur = (unsigned long)bsd_rlim.rlim_cur;
if (rlim.rlim_cur == ULONG_MAX)
rlim.rlim_cur = LONG_MAX;
rlim.rlim_max = (unsigned long)bsd_rlim.rlim_max;
if (rlim.rlim_max == ULONG_MAX)
rlim.rlim_max = LONG_MAX;
#endif
return (copyout(&rlim, args->rlim, sizeof(rlim)));
}
int
linux_getrlimit(struct thread *td, struct linux_getrlimit_args *args)
{
struct l_rlimit rlim;
struct proc *p = td->td_proc;
struct rlimit bsd_rlim;
u_int which;
#ifdef DEBUG
if (ldebug(getrlimit))
printf(ARGS(getrlimit, "%d, %p"),
args->resource, (void *)args->rlim);
#endif
if (args->resource >= LINUX_RLIM_NLIMITS)
return (EINVAL);
which = linux_to_bsd_resource[args->resource];
if (which == -1)
return (EINVAL);
PROC_LOCK(p);
lim_rlimit(p, which, &bsd_rlim);
PROC_UNLOCK(p);
rlim.rlim_cur = (l_ulong)bsd_rlim.rlim_cur;
rlim.rlim_max = (l_ulong)bsd_rlim.rlim_max;
return (copyout(&rlim, args->rlim, sizeof(rlim)));
}
int
linux_sched_setscheduler(struct thread *td,
struct linux_sched_setscheduler_args *args)
{
struct sched_setscheduler_args bsd;
#ifdef DEBUG
if (ldebug(sched_setscheduler))
printf(ARGS(sched_setscheduler, "%d, %d, %p"),
args->pid, args->policy, (const void *)args->param);
#endif
switch (args->policy) {
case LINUX_SCHED_OTHER:
bsd.policy = SCHED_OTHER;
break;
case LINUX_SCHED_FIFO:
bsd.policy = SCHED_FIFO;
break;
case LINUX_SCHED_RR:
bsd.policy = SCHED_RR;
break;
default:
return (EINVAL);
}
bsd.pid = args->pid;
bsd.param = (struct sched_param *)args->param;
- return (sched_setscheduler(td, &bsd));
+ return (sys_sched_setscheduler(td, &bsd));
}
int
linux_sched_getscheduler(struct thread *td,
struct linux_sched_getscheduler_args *args)
{
struct sched_getscheduler_args bsd;
int error;
#ifdef DEBUG
if (ldebug(sched_getscheduler))
printf(ARGS(sched_getscheduler, "%d"), args->pid);
#endif
bsd.pid = args->pid;
- error = sched_getscheduler(td, &bsd);
+ error = sys_sched_getscheduler(td, &bsd);
switch (td->td_retval[0]) {
case SCHED_OTHER:
td->td_retval[0] = LINUX_SCHED_OTHER;
break;
case SCHED_FIFO:
td->td_retval[0] = LINUX_SCHED_FIFO;
break;
case SCHED_RR:
td->td_retval[0] = LINUX_SCHED_RR;
break;
}
return (error);
}
int
linux_sched_get_priority_max(struct thread *td,
struct linux_sched_get_priority_max_args *args)
{
struct sched_get_priority_max_args bsd;
#ifdef DEBUG
if (ldebug(sched_get_priority_max))
printf(ARGS(sched_get_priority_max, "%d"), args->policy);
#endif
switch (args->policy) {
case LINUX_SCHED_OTHER:
bsd.policy = SCHED_OTHER;
break;
case LINUX_SCHED_FIFO:
bsd.policy = SCHED_FIFO;
break;
case LINUX_SCHED_RR:
bsd.policy = SCHED_RR;
break;
default:
return (EINVAL);
}
- return (sched_get_priority_max(td, &bsd));
+ return (sys_sched_get_priority_max(td, &bsd));
}
int
linux_sched_get_priority_min(struct thread *td,
struct linux_sched_get_priority_min_args *args)
{
struct sched_get_priority_min_args bsd;
#ifdef DEBUG
if (ldebug(sched_get_priority_min))
printf(ARGS(sched_get_priority_min, "%d"), args->policy);
#endif
switch (args->policy) {
case LINUX_SCHED_OTHER:
bsd.policy = SCHED_OTHER;
break;
case LINUX_SCHED_FIFO:
bsd.policy = SCHED_FIFO;
break;
case LINUX_SCHED_RR:
bsd.policy = SCHED_RR;
break;
default:
return (EINVAL);
}
- return (sched_get_priority_min(td, &bsd));
+ return (sys_sched_get_priority_min(td, &bsd));
}
#define REBOOT_CAD_ON 0x89abcdef
#define REBOOT_CAD_OFF 0
#define REBOOT_HALT 0xcdef0123
#define REBOOT_RESTART 0x01234567
#define REBOOT_RESTART2 0xA1B2C3D4
#define REBOOT_POWEROFF 0x4321FEDC
#define REBOOT_MAGIC1 0xfee1dead
#define REBOOT_MAGIC2 0x28121969
#define REBOOT_MAGIC2A 0x05121996
#define REBOOT_MAGIC2B 0x16041998
int
linux_reboot(struct thread *td, struct linux_reboot_args *args)
{
struct reboot_args bsd_args;
#ifdef DEBUG
if (ldebug(reboot))
printf(ARGS(reboot, "0x%x"), args->cmd);
#endif
if (args->magic1 != REBOOT_MAGIC1)
return (EINVAL);
switch (args->magic2) {
case REBOOT_MAGIC2:
case REBOOT_MAGIC2A:
case REBOOT_MAGIC2B:
break;
default:
return (EINVAL);
}
switch (args->cmd) {
case REBOOT_CAD_ON:
case REBOOT_CAD_OFF:
return (priv_check(td, PRIV_REBOOT));
case REBOOT_HALT:
bsd_args.opt = RB_HALT;
break;
case REBOOT_RESTART:
case REBOOT_RESTART2:
bsd_args.opt = 0;
break;
case REBOOT_POWEROFF:
bsd_args.opt = RB_POWEROFF;
break;
default:
return (EINVAL);
}
- return (reboot(td, &bsd_args));
+ return (sys_reboot(td, &bsd_args));
}
/*
* The FreeBSD native getpid(2), getgid(2) and getuid(2) also modify
* td->td_retval[1] when COMPAT_43 is defined. This clobbers registers that
* are assumed to be preserved. The following lightweight syscalls fix
* this. See also linux_getgid16() and linux_getuid16() in linux_uid16.c
*
* linux_getpid() - MP SAFE
* linux_getgid() - MP SAFE
* linux_getuid() - MP SAFE
*/
int
linux_getpid(struct thread *td, struct linux_getpid_args *args)
{
struct linux_emuldata *em;
#ifdef DEBUG
if (ldebug(getpid))
printf(ARGS(getpid, ""));
#endif
if (linux_use26(td)) {
em = em_find(td->td_proc, EMUL_DONTLOCK);
KASSERT(em != NULL, ("getpid: emuldata not found.\n"));
td->td_retval[0] = em->shared->group_pid;
} else {
td->td_retval[0] = td->td_proc->p_pid;
}
return (0);
}
int
linux_gettid(struct thread *td, struct linux_gettid_args *args)
{
#ifdef DEBUG
if (ldebug(gettid))
printf(ARGS(gettid, ""));
#endif
td->td_retval[0] = td->td_proc->p_pid;
return (0);
}
int
linux_getppid(struct thread *td, struct linux_getppid_args *args)
{
struct linux_emuldata *em;
struct proc *p, *pp;
#ifdef DEBUG
if (ldebug(getppid))
printf(ARGS(getppid, ""));
#endif
if (!linux_use26(td)) {
PROC_LOCK(td->td_proc);
td->td_retval[0] = td->td_proc->p_pptr->p_pid;
PROC_UNLOCK(td->td_proc);
return (0);
}
em = em_find(td->td_proc, EMUL_DONTLOCK);
KASSERT(em != NULL, ("getppid: process emuldata not found.\n"));
/* find the group leader */
p = pfind(em->shared->group_pid);
if (p == NULL) {
#ifdef DEBUG
printf(LMSG("parent process not found.\n"));
#endif
return (0);
}
pp = p->p_pptr; /* switch to parent */
PROC_LOCK(pp);
PROC_UNLOCK(p);
/* if it's also a Linux process */
if (pp->p_sysent == &elf_linux_sysvec) {
em = em_find(pp, EMUL_DONTLOCK);
KASSERT(em != NULL, ("getppid: parent emuldata not found.\n"));
td->td_retval[0] = em->shared->group_pid;
} else
td->td_retval[0] = pp->p_pid;
PROC_UNLOCK(pp);
return (0);
}
int
linux_getgid(struct thread *td, struct linux_getgid_args *args)
{
#ifdef DEBUG
if (ldebug(getgid))
printf(ARGS(getgid, ""));
#endif
td->td_retval[0] = td->td_ucred->cr_rgid;
return (0);
}
int
linux_getuid(struct thread *td, struct linux_getuid_args *args)
{
#ifdef DEBUG
if (ldebug(getuid))
printf(ARGS(getuid, ""));
#endif
td->td_retval[0] = td->td_ucred->cr_ruid;
return (0);
}
int
linux_getsid(struct thread *td, struct linux_getsid_args *args)
{
struct getsid_args bsd;
#ifdef DEBUG
if (ldebug(getsid))
printf(ARGS(getsid, "%i"), args->pid);
#endif
bsd.pid = args->pid;
- return (getsid(td, &bsd));
+ return (sys_getsid(td, &bsd));
}
int
linux_nosys(struct thread *td, struct nosys_args *ignore)
{
return (ENOSYS);
}
int
linux_getpriority(struct thread *td, struct linux_getpriority_args *args)
{
struct getpriority_args bsd_args;
int error;
#ifdef DEBUG
if (ldebug(getpriority))
printf(ARGS(getpriority, "%i, %i"), args->which, args->who);
#endif
bsd_args.which = args->which;
bsd_args.who = args->who;
- error = getpriority(td, &bsd_args);
+ error = sys_getpriority(td, &bsd_args);
td->td_retval[0] = 20 - td->td_retval[0];
return (error);
}
int
linux_sethostname(struct thread *td, struct linux_sethostname_args *args)
{
int name[2];
#ifdef DEBUG
if (ldebug(sethostname))
printf(ARGS(sethostname, "*, %i"), args->len);
#endif
name[0] = CTL_KERN;
name[1] = KERN_HOSTNAME;
return (userland_sysctl(td, name, 2, 0, 0, 0, args->hostname,
args->len, 0, 0));
}
int
linux_setdomainname(struct thread *td, struct linux_setdomainname_args *args)
{
int name[2];
#ifdef DEBUG
if (ldebug(setdomainname))
printf(ARGS(setdomainname, "*, %i"), args->len);
#endif
name[0] = CTL_KERN;
name[1] = KERN_NISDOMAINNAME;
return (userland_sysctl(td, name, 2, 0, 0, 0, args->name,
args->len, 0, 0));
}
int
linux_exit_group(struct thread *td, struct linux_exit_group_args *args)
{
struct linux_emuldata *em;
#ifdef DEBUG
if (ldebug(exit_group))
printf(ARGS(exit_group, "%i"), args->error_code);
#endif
em = em_find(td->td_proc, EMUL_DONTLOCK);
if (em->shared->refs > 1) {
EMUL_SHARED_WLOCK(&emul_shared_lock);
em->shared->flags |= EMUL_SHARED_HASXSTAT;
em->shared->xstat = W_EXITCODE(args->error_code, 0);
EMUL_SHARED_WUNLOCK(&emul_shared_lock);
if (linux_use26(td))
linux_kill_threads(td, SIGKILL);
}
/*
* XXX: we should send a signal to the parent if
* SIGNAL_EXIT_GROUP is set. We ignore that (temporarily?)
* as it doesn't occur often.
*/
exit1(td, W_EXITCODE(args->error_code, 0));
return (0);
}
#define _LINUX_CAPABILITY_VERSION 0x19980330
struct l_user_cap_header {
l_int version;
l_int pid;
};
struct l_user_cap_data {
l_int effective;
l_int permitted;
l_int inheritable;
};
int
linux_capget(struct thread *td, struct linux_capget_args *args)
{
struct l_user_cap_header luch;
struct l_user_cap_data lucd;
int error;
if (args->hdrp == NULL)
return (EFAULT);
error = copyin(args->hdrp, &luch, sizeof(luch));
if (error != 0)
return (error);
if (luch.version != _LINUX_CAPABILITY_VERSION) {
luch.version = _LINUX_CAPABILITY_VERSION;
error = copyout(&luch, args->hdrp, sizeof(luch));
if (error)
return (error);
return (EINVAL);
}
if (luch.pid)
return (EPERM);
if (args->datap) {
/*
* The current implementation doesn't support setting
* a capability (it's essentially a stub) so indicate
* that no capabilities are currently set or available
* to request.
*/
bzero (&lucd, sizeof(lucd));
error = copyout(&lucd, args->datap, sizeof(lucd));
}
return (error);
}
int
linux_capset(struct thread *td, struct linux_capset_args *args)
{
struct l_user_cap_header luch;
struct l_user_cap_data lucd;
int error;
if (args->hdrp == NULL || args->datap == NULL)
return (EFAULT);
error = copyin(args->hdrp, &luch, sizeof(luch));
if (error != 0)
return (error);
if (luch.version != _LINUX_CAPABILITY_VERSION) {
luch.version = _LINUX_CAPABILITY_VERSION;
error = copyout(&luch, args->hdrp, sizeof(luch));
if (error)
return (error);
return (EINVAL);
}
if (luch.pid)
return (EPERM);
error = copyin(args->datap, &lucd, sizeof(lucd));
if (error != 0)
return (error);
/* We currently don't support setting any capabilities. */
if (lucd.effective || lucd.permitted || lucd.inheritable) {
linux_msg(td,
"capset effective=0x%x, permitted=0x%x, "
"inheritable=0x%x is not implemented",
(int)lucd.effective, (int)lucd.permitted,
(int)lucd.inheritable);
return (EPERM);
}
return (0);
}
int
linux_prctl(struct thread *td, struct linux_prctl_args *args)
{
int error = 0, max_size;
struct proc *p = td->td_proc;
char comm[LINUX_MAX_COMM_LEN];
struct linux_emuldata *em;
int pdeath_signal;
#ifdef DEBUG
if (ldebug(prctl))
printf(ARGS(prctl, "%d, %d, %d, %d, %d"), args->option,
args->arg2, args->arg3, args->arg4, args->arg5);
#endif
switch (args->option) {
case LINUX_PR_SET_PDEATHSIG:
if (!LINUX_SIG_VALID(args->arg2))
return (EINVAL);
em = em_find(p, EMUL_DOLOCK);
KASSERT(em != NULL, ("prctl: emuldata not found.\n"));
em->pdeath_signal = args->arg2;
EMUL_UNLOCK(&emul_lock);
break;
case LINUX_PR_GET_PDEATHSIG:
em = em_find(p, EMUL_DOLOCK);
KASSERT(em != NULL, ("prctl: emuldata not found.\n"));
pdeath_signal = em->pdeath_signal;
EMUL_UNLOCK(&emul_lock);
error = copyout(&pdeath_signal,
(void *)(register_t)args->arg2,
sizeof(pdeath_signal));
break;
case LINUX_PR_GET_KEEPCAPS:
/*
* Indicate that we always clear the effective and
* permitted capability sets when the user id becomes
* non-zero (actually the capability sets are simply
* always zero in the current implementation).
*/
td->td_retval[0] = 0;
break;
case LINUX_PR_SET_KEEPCAPS:
/*
* Ignore requests to keep the effective and permitted
* capability sets when the user id becomes non-zero.
*/
break;
case LINUX_PR_SET_NAME:
/*
* To be on the safe side we need to make sure not to
* overflow the size a Linux program expects. We already
* do this here in the copyin, so that we don't need to
* check on copyout.
*/
max_size = MIN(sizeof(comm), sizeof(p->p_comm));
error = copyinstr((void *)(register_t)args->arg2, comm,
max_size, NULL);
/* Linux silently truncates the name if it is too long. */
if (error == ENAMETOOLONG) {
/*
* XXX: copyinstr() isn't documented to populate the
* array completely, so do a copyin() to be on the
* safe side. This should be changed in case
* copyinstr() is changed to guarantee this.
*/
error = copyin((void *)(register_t)args->arg2, comm,
max_size - 1);
comm[max_size - 1] = '\0';
}
if (error)
return (error);
PROC_LOCK(p);
strlcpy(p->p_comm, comm, sizeof(p->p_comm));
PROC_UNLOCK(p);
break;
case LINUX_PR_GET_NAME:
PROC_LOCK(p);
strlcpy(comm, p->p_comm, sizeof(comm));
PROC_UNLOCK(p);
error = copyout(comm, (void *)(register_t)args->arg2,
strlen(comm) + 1);
break;
default:
error = EINVAL;
break;
}
return (error);
}
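For context, here is a hedged Linux-side sketch of the PR_SET_NAME/PR_GET_NAME pair handled above. It assumes the usual 16-byte (including NUL) Linux comm limit; a longer name is accepted and silently truncated, which is exactly the case the copyinstr()/copyin() fallback above deals with. Illustrative only, not part of this change.

#include <sys/prctl.h>
#include <stdio.h>

int
main(void)
{
	char name[16];	/* Linux limits thread names to 16 bytes incl. NUL */

	/* A longer name is accepted and silently truncated by the kernel. */
	prctl(PR_SET_NAME, "a-very-long-thread-name");
	prctl(PR_GET_NAME, name);
	printf("%s\n", name);	/* prints the truncated name */
	return (0);
}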
/*
* Get affinity of a process.
*/
int
linux_sched_getaffinity(struct thread *td,
struct linux_sched_getaffinity_args *args)
{
int error;
struct cpuset_getaffinity_args cga;
#ifdef DEBUG
if (ldebug(sched_getaffinity))
printf(ARGS(sched_getaffinity, "%d, %d, *"), args->pid,
args->len);
#endif
if (args->len < sizeof(cpuset_t))
return (EINVAL);
cga.level = CPU_LEVEL_WHICH;
cga.which = CPU_WHICH_PID;
cga.id = args->pid;
cga.cpusetsize = sizeof(cpuset_t);
cga.mask = (cpuset_t *) args->user_mask_ptr;
- if ((error = cpuset_getaffinity(td, &cga)) == 0)
+ if ((error = sys_cpuset_getaffinity(td, &cga)) == 0)
td->td_retval[0] = sizeof(cpuset_t);
return (error);
}
/*
* Set affinity of a process.
*/
int
linux_sched_setaffinity(struct thread *td,
struct linux_sched_setaffinity_args *args)
{
struct cpuset_setaffinity_args csa;
#ifdef DEBUG
if (ldebug(sched_setaffinity))
printf(ARGS(sched_setaffinity, "%d, %d, *"), args->pid,
args->len);
#endif
if (args->len < sizeof(cpuset_t))
return (EINVAL);
csa.level = CPU_LEVEL_WHICH;
csa.which = CPU_WHICH_PID;
csa.id = args->pid;
csa.cpusetsize = sizeof(cpuset_t);
csa.mask = (cpuset_t *) args->user_mask_ptr;
- return (cpuset_setaffinity(td, &csa));
+ return (sys_cpuset_setaffinity(td, &csa));
}
Index: head/sys/compat/linux/linux_signal.c
===================================================================
--- head/sys/compat/linux/linux_signal.c (revision 225616)
+++ head/sys/compat/linux/linux_signal.c (revision 225617)
@@ -1,656 +1,656 @@
/*-
* Copyright (c) 1994-1995 Søren Schmidt
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sx.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>
#include <security/audit/audit.h>
#include "opt_compat.h"
#ifdef COMPAT_LINUX32
#include <machine/../linux32/linux.h>
#include <machine/../linux32/linux32_proto.h>
#else
#include <machine/../linux/linux.h>
#include <machine/../linux/linux_proto.h>
#endif
#include <compat/linux/linux_signal.h>
#include <compat/linux/linux_util.h>
#include <compat/linux/linux_emul.h>
void
linux_to_bsd_sigset(l_sigset_t *lss, sigset_t *bss)
{
int b, l;
SIGEMPTYSET(*bss);
bss->__bits[0] = lss->__bits[0] & ~((1U << LINUX_SIGTBLSZ) - 1);
bss->__bits[1] = lss->__bits[1];
for (l = 1; l <= LINUX_SIGTBLSZ; l++) {
if (LINUX_SIGISMEMBER(*lss, l)) {
b = linux_to_bsd_signal[_SIG_IDX(l)];
if (b)
SIGADDSET(*bss, b);
}
}
}
void
bsd_to_linux_sigset(sigset_t *bss, l_sigset_t *lss)
{
int b, l;
LINUX_SIGEMPTYSET(*lss);
lss->__bits[0] = bss->__bits[0] & ~((1U << LINUX_SIGTBLSZ) - 1);
lss->__bits[1] = bss->__bits[1];
for (b = 1; b <= LINUX_SIGTBLSZ; b++) {
if (SIGISMEMBER(*bss, b)) {
l = bsd_to_linux_signal[_SIG_IDX(b)];
if (l)
LINUX_SIGADDSET(*lss, l);
}
}
}
static void
linux_to_bsd_sigaction(l_sigaction_t *lsa, struct sigaction *bsa)
{
linux_to_bsd_sigset(&lsa->lsa_mask, &bsa->sa_mask);
bsa->sa_handler = PTRIN(lsa->lsa_handler);
bsa->sa_flags = 0;
if (lsa->lsa_flags & LINUX_SA_NOCLDSTOP)
bsa->sa_flags |= SA_NOCLDSTOP;
if (lsa->lsa_flags & LINUX_SA_NOCLDWAIT)
bsa->sa_flags |= SA_NOCLDWAIT;
if (lsa->lsa_flags & LINUX_SA_SIGINFO)
bsa->sa_flags |= SA_SIGINFO;
if (lsa->lsa_flags & LINUX_SA_ONSTACK)
bsa->sa_flags |= SA_ONSTACK;
if (lsa->lsa_flags & LINUX_SA_RESTART)
bsa->sa_flags |= SA_RESTART;
if (lsa->lsa_flags & LINUX_SA_ONESHOT)
bsa->sa_flags |= SA_RESETHAND;
if (lsa->lsa_flags & LINUX_SA_NOMASK)
bsa->sa_flags |= SA_NODEFER;
}
static void
bsd_to_linux_sigaction(struct sigaction *bsa, l_sigaction_t *lsa)
{
bsd_to_linux_sigset(&bsa->sa_mask, &lsa->lsa_mask);
#ifdef COMPAT_LINUX32
lsa->lsa_handler = (uintptr_t)bsa->sa_handler;
#else
lsa->lsa_handler = bsa->sa_handler;
#endif
lsa->lsa_restorer = 0; /* unsupported */
lsa->lsa_flags = 0;
if (bsa->sa_flags & SA_NOCLDSTOP)
lsa->lsa_flags |= LINUX_SA_NOCLDSTOP;
if (bsa->sa_flags & SA_NOCLDWAIT)
lsa->lsa_flags |= LINUX_SA_NOCLDWAIT;
if (bsa->sa_flags & SA_SIGINFO)
lsa->lsa_flags |= LINUX_SA_SIGINFO;
if (bsa->sa_flags & SA_ONSTACK)
lsa->lsa_flags |= LINUX_SA_ONSTACK;
if (bsa->sa_flags & SA_RESTART)
lsa->lsa_flags |= LINUX_SA_RESTART;
if (bsa->sa_flags & SA_RESETHAND)
lsa->lsa_flags |= LINUX_SA_ONESHOT;
if (bsa->sa_flags & SA_NODEFER)
lsa->lsa_flags |= LINUX_SA_NOMASK;
}
int
linux_do_sigaction(struct thread *td, int linux_sig, l_sigaction_t *linux_nsa,
l_sigaction_t *linux_osa)
{
struct sigaction act, oact, *nsa, *osa;
int error, sig;
if (!LINUX_SIG_VALID(linux_sig))
return (EINVAL);
osa = (linux_osa != NULL) ? &oact : NULL;
if (linux_nsa != NULL) {
nsa = &act;
linux_to_bsd_sigaction(linux_nsa, nsa);
} else
nsa = NULL;
if (linux_sig <= LINUX_SIGTBLSZ)
sig = linux_to_bsd_signal[_SIG_IDX(linux_sig)];
else
sig = linux_sig;
error = kern_sigaction(td, sig, nsa, osa, 0);
if (error)
return (error);
if (linux_osa != NULL)
bsd_to_linux_sigaction(osa, linux_osa);
return (0);
}
int
linux_signal(struct thread *td, struct linux_signal_args *args)
{
l_sigaction_t nsa, osa;
int error;
#ifdef DEBUG
if (ldebug(signal))
printf(ARGS(signal, "%d, %p"),
args->sig, (void *)(uintptr_t)args->handler);
#endif
nsa.lsa_handler = args->handler;
nsa.lsa_flags = LINUX_SA_ONESHOT | LINUX_SA_NOMASK;
LINUX_SIGEMPTYSET(nsa.lsa_mask);
error = linux_do_sigaction(td, args->sig, &nsa, &osa);
td->td_retval[0] = (int)(intptr_t)osa.lsa_handler;
return (error);
}
int
linux_rt_sigaction(struct thread *td, struct linux_rt_sigaction_args *args)
{
l_sigaction_t nsa, osa;
int error;
#ifdef DEBUG
if (ldebug(rt_sigaction))
printf(ARGS(rt_sigaction, "%ld, %p, %p, %ld"),
(long)args->sig, (void *)args->act,
(void *)args->oact, (long)args->sigsetsize);
#endif
if (args->sigsetsize != sizeof(l_sigset_t))
return (EINVAL);
if (args->act != NULL) {
error = copyin(args->act, &nsa, sizeof(l_sigaction_t));
if (error)
return (error);
}
error = linux_do_sigaction(td, args->sig,
args->act ? &nsa : NULL,
args->oact ? &osa : NULL);
if (args->oact != NULL && !error) {
error = copyout(&osa, args->oact, sizeof(l_sigaction_t));
}
return (error);
}
static int
linux_do_sigprocmask(struct thread *td, int how, l_sigset_t *new,
l_sigset_t *old)
{
sigset_t omask, nmask;
sigset_t *nmaskp;
int error;
td->td_retval[0] = 0;
switch (how) {
case LINUX_SIG_BLOCK:
how = SIG_BLOCK;
break;
case LINUX_SIG_UNBLOCK:
how = SIG_UNBLOCK;
break;
case LINUX_SIG_SETMASK:
how = SIG_SETMASK;
break;
default:
return (EINVAL);
}
if (new != NULL) {
linux_to_bsd_sigset(new, &nmask);
nmaskp = &nmask;
} else
nmaskp = NULL;
error = kern_sigprocmask(td, how, nmaskp, &omask, 0);
if (error == 0 && old != NULL)
bsd_to_linux_sigset(&omask, old);
return (error);
}
int
linux_sigprocmask(struct thread *td, struct linux_sigprocmask_args *args)
{
l_osigset_t mask;
l_sigset_t set, oset;
int error;
#ifdef DEBUG
if (ldebug(sigprocmask))
printf(ARGS(sigprocmask, "%d, *, *"), args->how);
#endif
if (args->mask != NULL) {
error = copyin(args->mask, &mask, sizeof(l_osigset_t));
if (error)
return (error);
LINUX_SIGEMPTYSET(set);
set.__bits[0] = mask;
}
error = linux_do_sigprocmask(td, args->how,
args->mask ? &set : NULL,
args->omask ? &oset : NULL);
if (args->omask != NULL && !error) {
mask = oset.__bits[0];
error = copyout(&mask, args->omask, sizeof(l_osigset_t));
}
return (error);
}
int
linux_rt_sigprocmask(struct thread *td, struct linux_rt_sigprocmask_args *args)
{
l_sigset_t set, oset;
int error;
#ifdef DEBUG
if (ldebug(rt_sigprocmask))
printf(ARGS(rt_sigprocmask, "%d, %p, %p, %ld"),
args->how, (void *)args->mask,
(void *)args->omask, (long)args->sigsetsize);
#endif
if (args->sigsetsize != sizeof(l_sigset_t))
return EINVAL;
if (args->mask != NULL) {
error = copyin(args->mask, &set, sizeof(l_sigset_t));
if (error)
return (error);
}
error = linux_do_sigprocmask(td, args->how,
args->mask ? &set : NULL,
args->omask ? &oset : NULL);
if (args->omask != NULL && !error) {
error = copyout(&oset, args->omask, sizeof(l_sigset_t));
}
return (error);
}
int
linux_sgetmask(struct thread *td, struct linux_sgetmask_args *args)
{
struct proc *p = td->td_proc;
l_sigset_t mask;
#ifdef DEBUG
if (ldebug(sgetmask))
printf(ARGS(sgetmask, ""));
#endif
PROC_LOCK(p);
bsd_to_linux_sigset(&td->td_sigmask, &mask);
PROC_UNLOCK(p);
td->td_retval[0] = mask.__bits[0];
return (0);
}
int
linux_ssetmask(struct thread *td, struct linux_ssetmask_args *args)
{
struct proc *p = td->td_proc;
l_sigset_t lset;
sigset_t bset;
#ifdef DEBUG
if (ldebug(ssetmask))
printf(ARGS(ssetmask, "%08lx"), (unsigned long)args->mask);
#endif
PROC_LOCK(p);
bsd_to_linux_sigset(&td->td_sigmask, &lset);
td->td_retval[0] = lset.__bits[0];
LINUX_SIGEMPTYSET(lset);
lset.__bits[0] = args->mask;
linux_to_bsd_sigset(&lset, &bset);
td->td_sigmask = bset;
SIG_CANTMASK(td->td_sigmask);
signotify(td);
PROC_UNLOCK(p);
return (0);
}
/*
* MPSAFE
*/
int
linux_sigpending(struct thread *td, struct linux_sigpending_args *args)
{
struct proc *p = td->td_proc;
sigset_t bset;
l_sigset_t lset;
l_osigset_t mask;
#ifdef DEBUG
if (ldebug(sigpending))
printf(ARGS(sigpending, "*"));
#endif
PROC_LOCK(p);
bset = p->p_siglist;
SIGSETOR(bset, td->td_siglist);
SIGSETAND(bset, td->td_sigmask);
PROC_UNLOCK(p);
bsd_to_linux_sigset(&bset, &lset);
mask = lset.__bits[0];
return (copyout(&mask, args->mask, sizeof(mask)));
}
/*
* MPSAFE
*/
int
linux_rt_sigpending(struct thread *td, struct linux_rt_sigpending_args *args)
{
struct proc *p = td->td_proc;
sigset_t bset;
l_sigset_t lset;
if (args->sigsetsize > sizeof(lset))
return EINVAL;
/* NOT REACHED */
#ifdef DEBUG
if (ldebug(rt_sigpending))
printf(ARGS(rt_sigpending, "*"));
#endif
PROC_LOCK(p);
bset = p->p_siglist;
SIGSETOR(bset, td->td_siglist);
SIGSETAND(bset, td->td_sigmask);
PROC_UNLOCK(p);
bsd_to_linux_sigset(&bset, &lset);
return (copyout(&lset, args->set, args->sigsetsize));
}
/*
* MPSAFE
*/
int
linux_rt_sigtimedwait(struct thread *td,
struct linux_rt_sigtimedwait_args *args)
{
int error, sig;
l_timeval ltv;
struct timeval tv;
struct timespec ts, *tsa;
l_sigset_t lset;
sigset_t bset;
l_siginfo_t linfo;
ksiginfo_t info;
#ifdef DEBUG
if (ldebug(rt_sigtimedwait))
printf(ARGS(rt_sigtimedwait, "*"));
#endif
if (args->sigsetsize != sizeof(l_sigset_t))
return (EINVAL);
if ((error = copyin(args->mask, &lset, sizeof(lset))))
return (error);
linux_to_bsd_sigset(&lset, &bset);
tsa = NULL;
if (args->timeout) {
if ((error = copyin(args->timeout, &ltv, sizeof(ltv))))
return (error);
#ifdef DEBUG
if (ldebug(rt_sigtimedwait))
printf(LMSG("linux_rt_sigtimedwait: "
"incoming timeout (%d/%d)\n"),
ltv.tv_sec, ltv.tv_usec);
#endif
tv.tv_sec = (long)ltv.tv_sec;
tv.tv_usec = (suseconds_t)ltv.tv_usec;
if (itimerfix(&tv)) {
/*
* The timeout was invalid. Convert it to something
* valid that will act as it does under Linux.
*/
tv.tv_sec += tv.tv_usec / 1000000;
tv.tv_usec %= 1000000;
if (tv.tv_usec < 0) {
tv.tv_sec -= 1;
tv.tv_usec += 1000000;
}
if (tv.tv_sec < 0)
timevalclear(&tv);
#ifdef DEBUG
if (ldebug(rt_sigtimedwait))
printf(LMSG("linux_rt_sigtimedwait: "
"converted timeout (%jd/%ld)\n"),
(intmax_t)tv.tv_sec, tv.tv_usec);
#endif
}
TIMEVAL_TO_TIMESPEC(&tv, &ts);
tsa = &ts;
}
error = kern_sigtimedwait(td, bset, &info, tsa);
#ifdef DEBUG
if (ldebug(rt_sigtimedwait))
printf(LMSG("linux_rt_sigtimedwait: "
"sigtimedwait returning (%d)\n"), error);
#endif
if (error)
return (error);
sig = BSD_TO_LINUX_SIGNAL(info.ksi_signo);
if (args->ptr) {
memset(&linfo, 0, sizeof(linfo));
ksiginfo_to_lsiginfo(&info, &linfo, sig);
error = copyout(&linfo, args->ptr, sizeof(linfo));
}
if (error == 0)
td->td_retval[0] = sig;
return (error);
}
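The invalid-timeout branch above is a plain normalization of the timeval; a small stand-alone sketch of the same arithmetic (illustrative only, not part of this change):

#include <stdio.h>

/* Same normalization as the invalid-timeout branch above: fold excess
 * microseconds into seconds and clamp negative results to zero. */
static void
normalize(long *sec, long *usec)
{
	*sec += *usec / 1000000;
	*usec %= 1000000;
	if (*usec < 0) {
		*sec -= 1;
		*usec += 1000000;
	}
	if (*sec < 0) {
		*sec = 0;
		*usec = 0;
	}
}

int
main(void)
{
	long sec = 1, usec = 2500000;	/* invalid: usec >= 1000000 */

	normalize(&sec, &usec);
	printf("%ld.%06ld\n", sec, usec);	/* prints 3.500000 */
	return (0);
}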
int
linux_kill(struct thread *td, struct linux_kill_args *args)
{
struct kill_args /* {
int pid;
int signum;
} */ tmp;
#ifdef DEBUG
if (ldebug(kill))
printf(ARGS(kill, "%d, %d"), args->pid, args->signum);
#endif
/*
* Allow signal 0 as a means to check for privileges
*/
if (!LINUX_SIG_VALID(args->signum) && args->signum != 0)
return (EINVAL);
if (args->signum > 0 && args->signum <= LINUX_SIGTBLSZ)
tmp.signum = linux_to_bsd_signal[_SIG_IDX(args->signum)];
else
tmp.signum = args->signum;
tmp.pid = args->pid;
- return (kill(td, &tmp));
+ return (sys_kill(td, &tmp));
}
static int
linux_do_tkill(struct thread *td, l_int tgid, l_int pid, l_int signum)
{
struct proc *proc = td->td_proc;
struct linux_emuldata *em;
struct proc *p;
ksiginfo_t ksi;
int error;
AUDIT_ARG_SIGNUM(signum);
AUDIT_ARG_PID(pid);
/*
* Allow signal 0 as a means to check for privileges
*/
if (!LINUX_SIG_VALID(signum) && signum != 0)
return (EINVAL);
if (signum > 0 && signum <= LINUX_SIGTBLSZ)
signum = linux_to_bsd_signal[_SIG_IDX(signum)];
if ((p = pfind(pid)) == NULL) {
if ((p = zpfind(pid)) == NULL)
return (ESRCH);
}
AUDIT_ARG_PROCESS(p);
error = p_cansignal(td, p, signum);
if (error != 0 || signum == 0)
goto out;
error = ESRCH;
em = em_find(p, EMUL_DONTLOCK);
if (em == NULL) {
#ifdef DEBUG
printf("emuldata not found in do_tkill.\n");
#endif
goto out;
}
if (tgid > 0 && em->shared->group_pid != tgid)
goto out;
ksiginfo_init(&ksi);
ksi.ksi_signo = signum;
ksi.ksi_code = LINUX_SI_TKILL;
ksi.ksi_errno = 0;
ksi.ksi_pid = proc->p_pid;
ksi.ksi_uid = proc->p_ucred->cr_ruid;
error = pksignal(p, ksi.ksi_signo, &ksi);
out:
PROC_UNLOCK(p);
return (error);
}
int
linux_tgkill(struct thread *td, struct linux_tgkill_args *args)
{
#ifdef DEBUG
if (ldebug(tgkill))
printf(ARGS(tgkill, "%d, %d, %d"), args->tgid, args->pid, args->sig);
#endif
if (args->pid <= 0 || args->tgid <= 0)
return (EINVAL);
return (linux_do_tkill(td, args->tgid, args->pid, args->sig));
}
int
linux_tkill(struct thread *td, struct linux_tkill_args *args)
{
#ifdef DEBUG
if (ldebug(tkill))
printf(ARGS(tkill, "%i, %i"), args->tid, args->sig);
#endif
if (args->tid <= 0)
return (EINVAL);
return (linux_do_tkill(td, 0, args->tid, args->sig));
}
void
ksiginfo_to_lsiginfo(ksiginfo_t *ksi, l_siginfo_t *lsi, l_int sig)
{
lsi->lsi_signo = sig;
lsi->lsi_code = ksi->ksi_code;
switch (sig) {
case LINUX_SIGPOLL:
/* XXX si_fd? */
lsi->lsi_band = ksi->ksi_band;
break;
case LINUX_SIGCHLD:
lsi->lsi_pid = ksi->ksi_pid;
lsi->lsi_uid = ksi->ksi_uid;
lsi->lsi_status = ksi->ksi_status;
break;
case LINUX_SIGBUS:
case LINUX_SIGILL:
case LINUX_SIGFPE:
case LINUX_SIGSEGV:
lsi->lsi_addr = PTROUT(ksi->ksi_addr);
break;
default:
/* XXX SI_TIMER etc... */
lsi->lsi_pid = ksi->ksi_pid;
lsi->lsi_uid = ksi->ksi_uid;
break;
}
if (sig >= LINUX_SIGRTMIN) {
lsi->lsi_int = ksi->ksi_info.si_value.sival_int;
lsi->lsi_ptr = PTROUT(ksi->ksi_info.si_value.sival_ptr);
}
}
Index: head/sys/compat/linux/linux_socket.c
===================================================================
--- head/sys/compat/linux/linux_socket.c (revision 225616)
+++ head/sys/compat/linux/linux_socket.c (revision 225617)
@@ -1,1685 +1,1685 @@
/*-
* Copyright (c) 1995 Søren Schmidt
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/* XXX we use functions that might not exist. */
#include "opt_compat.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/capability.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syscallsubr.h>
#include <sys/uio.h>
#include <sys/syslog.h>
#include <sys/un.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_var.h>
#endif
#ifdef COMPAT_LINUX32
#include <machine/../linux32/linux.h>
#include <machine/../linux32/linux32_proto.h>
#else
#include <machine/../linux/linux.h>
#include <machine/../linux/linux_proto.h>
#endif
#include <compat/linux/linux_socket.h>
#include <compat/linux/linux_util.h>
static int do_sa_get(struct sockaddr **, const struct osockaddr *, int *,
struct malloc_type *);
static int linux_to_bsd_domain(int);
/*
* Reads a linux sockaddr and does any necessary translation.
* Linux sockaddrs don't have a length field, only a family.
*/
static int
linux_getsockaddr(struct sockaddr **sap, const struct osockaddr *osa, int len)
{
int osalen = len;
return (do_sa_get(sap, osa, &osalen, M_SONAME));
}
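A hedged illustration of the layout difference do_sa_get() below has to bridge: a Linux sockaddr starts with a 16-bit family and carries no length byte, whereas the native sockaddr starts with an 8-bit sa_len followed by an 8-bit sa_family. The struct names in this sketch are made up for illustration and are not part of this change.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct example_linux_sockaddr {		/* hypothetical name, for illustration */
	uint16_t sa_family;		/* 16-bit family, no length field */
	char	 sa_data[14];
};

struct example_bsd_sockaddr {		/* hypothetical name, for illustration */
	uint8_t	 sa_len;		/* length of the whole structure */
	uint8_t	 sa_family;		/* 8-bit family */
	char	 sa_data[14];
};

int
main(void)
{
	struct example_linux_sockaddr losa;
	struct example_bsd_sockaddr bsa;

	memset(&losa, 0, sizeof(losa));
	losa.sa_family = 2;			/* AF_INET on Linux */

	/* Rewrite the first two bytes, record the length, keep sa_data. */
	memset(&bsa, 0, sizeof(bsa));
	memcpy(bsa.sa_data, losa.sa_data, sizeof(bsa.sa_data));
	bsa.sa_family = (uint8_t)losa.sa_family;
	bsa.sa_len = sizeof(bsa);

	printf("family %u, len %u\n", bsa.sa_family, bsa.sa_len);
	return (0);
}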
/*
* Copy the osockaddr structure pointed to by osa to kernel, adjust
* family and convert to sockaddr.
*/
static int
do_sa_get(struct sockaddr **sap, const struct osockaddr *osa, int *osalen,
struct malloc_type *mtype)
{
int error=0, bdom;
struct sockaddr *sa;
struct osockaddr *kosa;
int alloclen;
#ifdef INET6
int oldv6size;
struct sockaddr_in6 *sin6;
#endif
if (*osalen < 2 || *osalen > UCHAR_MAX || !osa)
return (EINVAL);
alloclen = *osalen;
#ifdef INET6
oldv6size = 0;
/*
* Check for old (pre-RFC2553) sockaddr_in6. We may accept it
* if it's a v4-mapped address, so reserve the proper space
* for it.
*/
if (alloclen == sizeof (struct sockaddr_in6) - sizeof (u_int32_t)) {
alloclen = sizeof (struct sockaddr_in6);
oldv6size = 1;
}
#endif
kosa = malloc(alloclen, mtype, M_WAITOK);
if ((error = copyin(osa, kosa, *osalen)))
goto out;
bdom = linux_to_bsd_domain(kosa->sa_family);
if (bdom == -1) {
error = EAFNOSUPPORT;
goto out;
}
#ifdef INET6
/*
* Older Linux IPv6 code uses obsolete RFC2133 struct sockaddr_in6,
* which lacks the scope id that the RFC2553 one has. If we detect
* the situation, reject the address and write a message to the system log.
*
* Still accept addresses for which the scope id is not used.
*/
if (oldv6size && bdom == AF_INET6) {
sin6 = (struct sockaddr_in6 *)kosa;
if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr) ||
(!IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr) &&
!IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr) &&
!IN6_IS_ADDR_V4COMPAT(&sin6->sin6_addr) &&
!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) &&
!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))) {
sin6->sin6_scope_id = 0;
} else {
log(LOG_DEBUG,
"obsolete pre-RFC2553 sockaddr_in6 rejected\n");
error = EINVAL;
goto out;
}
} else
#endif
if (bdom == AF_INET) {
alloclen = sizeof(struct sockaddr_in);
if (*osalen < alloclen) {
error = EINVAL;
goto out;
}
}
sa = (struct sockaddr *) kosa;
sa->sa_family = bdom;
sa->sa_len = alloclen;
*sap = sa;
*osalen = alloclen;
return (0);
out:
free(kosa, mtype);
return (error);
}
static int
linux_to_bsd_domain(int domain)
{
switch (domain) {
case LINUX_AF_UNSPEC:
return (AF_UNSPEC);
case LINUX_AF_UNIX:
return (AF_LOCAL);
case LINUX_AF_INET:
return (AF_INET);
case LINUX_AF_INET6:
return (AF_INET6);
case LINUX_AF_AX25:
return (AF_CCITT);
case LINUX_AF_IPX:
return (AF_IPX);
case LINUX_AF_APPLETALK:
return (AF_APPLETALK);
}
return (-1);
}
static int
bsd_to_linux_domain(int domain)
{
switch (domain) {
case AF_UNSPEC:
return (LINUX_AF_UNSPEC);
case AF_LOCAL:
return (LINUX_AF_UNIX);
case AF_INET:
return (LINUX_AF_INET);
case AF_INET6:
return (LINUX_AF_INET6);
case AF_CCITT:
return (LINUX_AF_AX25);
case AF_IPX:
return (LINUX_AF_IPX);
case AF_APPLETALK:
return (LINUX_AF_APPLETALK);
}
return (-1);
}
static int
linux_to_bsd_sockopt_level(int level)
{
switch (level) {
case LINUX_SOL_SOCKET:
return (SOL_SOCKET);
}
return (level);
}
static int
bsd_to_linux_sockopt_level(int level)
{
switch (level) {
case SOL_SOCKET:
return (LINUX_SOL_SOCKET);
}
return (level);
}
static int
linux_to_bsd_ip_sockopt(int opt)
{
switch (opt) {
case LINUX_IP_TOS:
return (IP_TOS);
case LINUX_IP_TTL:
return (IP_TTL);
case LINUX_IP_OPTIONS:
return (IP_OPTIONS);
case LINUX_IP_MULTICAST_IF:
return (IP_MULTICAST_IF);
case LINUX_IP_MULTICAST_TTL:
return (IP_MULTICAST_TTL);
case LINUX_IP_MULTICAST_LOOP:
return (IP_MULTICAST_LOOP);
case LINUX_IP_ADD_MEMBERSHIP:
return (IP_ADD_MEMBERSHIP);
case LINUX_IP_DROP_MEMBERSHIP:
return (IP_DROP_MEMBERSHIP);
case LINUX_IP_HDRINCL:
return (IP_HDRINCL);
}
return (-1);
}
static int
linux_to_bsd_so_sockopt(int opt)
{
switch (opt) {
case LINUX_SO_DEBUG:
return (SO_DEBUG);
case LINUX_SO_REUSEADDR:
return (SO_REUSEADDR);
case LINUX_SO_TYPE:
return (SO_TYPE);
case LINUX_SO_ERROR:
return (SO_ERROR);
case LINUX_SO_DONTROUTE:
return (SO_DONTROUTE);
case LINUX_SO_BROADCAST:
return (SO_BROADCAST);
case LINUX_SO_SNDBUF:
return (SO_SNDBUF);
case LINUX_SO_RCVBUF:
return (SO_RCVBUF);
case LINUX_SO_KEEPALIVE:
return (SO_KEEPALIVE);
case LINUX_SO_OOBINLINE:
return (SO_OOBINLINE);
case LINUX_SO_LINGER:
return (SO_LINGER);
case LINUX_SO_PEERCRED:
return (LOCAL_PEERCRED);
case LINUX_SO_RCVLOWAT:
return (SO_RCVLOWAT);
case LINUX_SO_SNDLOWAT:
return (SO_SNDLOWAT);
case LINUX_SO_RCVTIMEO:
return (SO_RCVTIMEO);
case LINUX_SO_SNDTIMEO:
return (SO_SNDTIMEO);
case LINUX_SO_TIMESTAMP:
return (SO_TIMESTAMP);
case LINUX_SO_ACCEPTCONN:
return (SO_ACCEPTCONN);
}
return (-1);
}
static int
linux_to_bsd_msg_flags(int flags)
{
int ret_flags = 0;
if (flags & LINUX_MSG_OOB)
ret_flags |= MSG_OOB;
if (flags & LINUX_MSG_PEEK)
ret_flags |= MSG_PEEK;
if (flags & LINUX_MSG_DONTROUTE)
ret_flags |= MSG_DONTROUTE;
if (flags & LINUX_MSG_CTRUNC)
ret_flags |= MSG_CTRUNC;
if (flags & LINUX_MSG_TRUNC)
ret_flags |= MSG_TRUNC;
if (flags & LINUX_MSG_DONTWAIT)
ret_flags |= MSG_DONTWAIT;
if (flags & LINUX_MSG_EOR)
ret_flags |= MSG_EOR;
if (flags & LINUX_MSG_WAITALL)
ret_flags |= MSG_WAITALL;
if (flags & LINUX_MSG_NOSIGNAL)
ret_flags |= MSG_NOSIGNAL;
#if 0 /* not handled */
if (flags & LINUX_MSG_PROXY)
;
if (flags & LINUX_MSG_FIN)
;
if (flags & LINUX_MSG_SYN)
;
if (flags & LINUX_MSG_CONFIRM)
;
if (flags & LINUX_MSG_RST)
;
if (flags & LINUX_MSG_ERRQUEUE)
;
#endif
return ret_flags;
}
/*
* If bsd_to_linux_sockaddr() or linux_to_bsd_sockaddr() faults, then the
* native syscall will fault. Thus, we don't really need to check the
* return values for these functions.
*/
static int
bsd_to_linux_sockaddr(struct sockaddr *arg)
{
struct sockaddr sa;
size_t sa_len = sizeof(struct sockaddr);
int error;
if ((error = copyin(arg, &sa, sa_len)))
return (error);
*(u_short *)&sa = sa.sa_family;
error = copyout(&sa, arg, sa_len);
return (error);
}
static int
linux_to_bsd_sockaddr(struct sockaddr *arg, int len)
{
struct sockaddr sa;
size_t sa_len = sizeof(struct sockaddr);
int error;
if ((error = copyin(arg, &sa, sa_len)))
return (error);
sa.sa_family = *(sa_family_t *)&sa;
sa.sa_len = len;
error = copyout(&sa, arg, sa_len);
return (error);
}
static int
linux_sa_put(struct osockaddr *osa)
{
struct osockaddr sa;
int error, bdom;
/*
* Only read/write the osockaddr family part, the rest is
* not changed.
*/
error = copyin(osa, &sa, sizeof(sa.sa_family));
if (error)
return (error);
bdom = bsd_to_linux_domain(sa.sa_family);
if (bdom == -1)
return (EINVAL);
sa.sa_family = bdom;
error = copyout(&sa, osa, sizeof(sa.sa_family));
if (error)
return (error);
return (0);
}
static int
linux_to_bsd_cmsg_type(int cmsg_type)
{
switch (cmsg_type) {
case LINUX_SCM_RIGHTS:
return (SCM_RIGHTS);
case LINUX_SCM_CREDENTIALS:
return (SCM_CREDS);
}
return (-1);
}
static int
bsd_to_linux_cmsg_type(int cmsg_type)
{
switch (cmsg_type) {
case SCM_RIGHTS:
return (LINUX_SCM_RIGHTS);
case SCM_CREDS:
return (LINUX_SCM_CREDENTIALS);
}
return (-1);
}
static int
linux_to_bsd_msghdr(struct msghdr *bhdr, const struct l_msghdr *lhdr)
{
if (lhdr->msg_controllen > INT_MAX)
return (ENOBUFS);
bhdr->msg_name = PTRIN(lhdr->msg_name);
bhdr->msg_namelen = lhdr->msg_namelen;
bhdr->msg_iov = PTRIN(lhdr->msg_iov);
bhdr->msg_iovlen = lhdr->msg_iovlen;
bhdr->msg_control = PTRIN(lhdr->msg_control);
/*
* msg_controllen is skipped since BSD and LINUX control messages
* are potentially different sizes (e.g. the cred structure used
* by SCM_CREDS differs between the two operating systems).
*
* The caller can set it (if necessary) after converting all the
* control messages.
*/
bhdr->msg_flags = linux_to_bsd_msg_flags(lhdr->msg_flags);
return (0);
}
static int
bsd_to_linux_msghdr(const struct msghdr *bhdr, struct l_msghdr *lhdr)
{
lhdr->msg_name = PTROUT(bhdr->msg_name);
lhdr->msg_namelen = bhdr->msg_namelen;
lhdr->msg_iov = PTROUT(bhdr->msg_iov);
lhdr->msg_iovlen = bhdr->msg_iovlen;
lhdr->msg_control = PTROUT(bhdr->msg_control);
/*
* msg_controllen is skipped since BSD and LINUX control messages
* are potentially different sizes (e.g. the cred structure used
* by SCM_CREDS differs between the two operating systems).
*
* The caller can set it (if necessary) after converting all the
* control messages.
*/
/* msg_flags skipped */
return (0);
}
static int
linux_set_socket_flags(struct thread *td, int s, int flags)
{
int error;
if (flags & LINUX_SOCK_NONBLOCK) {
error = kern_fcntl(td, s, F_SETFL, O_NONBLOCK);
if (error)
return (error);
}
if (flags & LINUX_SOCK_CLOEXEC) {
error = kern_fcntl(td, s, F_SETFD, FD_CLOEXEC);
if (error)
return (error);
}
return (0);
}
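For reference, the flags this helper emulates are the ones a Linux program ORs into the socket type; a hedged Linux-side sketch (illustrative only, not part of this change):

#include <sys/socket.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	/* On Linux the kernel applies both flags atomically at creation;
	 * the compat code above recreates the effect with F_SETFL/F_SETFD. */
	int s = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, 0);

	if (s == -1) {
		perror("socket");
		return (1);
	}
	close(s);
	return (0);
}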
static int
linux_sendit(struct thread *td, int s, struct msghdr *mp, int flags,
struct mbuf *control, enum uio_seg segflg)
{
struct sockaddr *to;
int error;
if (mp->msg_name != NULL) {
error = linux_getsockaddr(&to, mp->msg_name, mp->msg_namelen);
if (error)
return (error);
mp->msg_name = to;
} else
to = NULL;
error = kern_sendit(td, s, mp, linux_to_bsd_msg_flags(flags), control,
segflg);
if (to)
free(to, M_SONAME);
return (error);
}
/* Return 0 if IP_HDRINCL is set for the given socket. */
static int
linux_check_hdrincl(struct thread *td, int s)
{
int error, optval, size_val;
size_val = sizeof(optval);
error = kern_getsockopt(td, s, IPPROTO_IP, IP_HDRINCL,
&optval, UIO_SYSSPACE, &size_val);
if (error)
return (error);
return (optval == 0);
}
struct linux_sendto_args {
int s;
l_uintptr_t msg;
int len;
int flags;
l_uintptr_t to;
int tolen;
};
/*
* Updated sendto() when IP_HDRINCL is set:
* tweak endian-dependent fields in the IP packet.
*/
static int
linux_sendto_hdrincl(struct thread *td, struct linux_sendto_args *linux_args)
{
/*
* linux_ip_copysize defines how many bytes we should copy
* from the beginning of the IP packet before we customize it for BSD.
* It should include all the fields we modify (ip_len and ip_off).
*/
#define linux_ip_copysize 8
struct ip *packet;
struct msghdr msg;
struct iovec aiov[1];
int error;
/* Check that the packet isn't too big or too small. */
if (linux_args->len < linux_ip_copysize ||
linux_args->len > IP_MAXPACKET)
return (EINVAL);
packet = (struct ip *)malloc(linux_args->len, M_TEMP, M_WAITOK);
/* Make kernel copy of the packet to be sent */
if ((error = copyin(PTRIN(linux_args->msg), packet,
linux_args->len)))
goto goout;
/* Convert fields from Linux to BSD raw IP socket format */
packet->ip_len = linux_args->len;
packet->ip_off = ntohs(packet->ip_off);
/* Prepare the msghdr and iovec structures describing the new packet */
msg.msg_name = PTRIN(linux_args->to);
msg.msg_namelen = linux_args->tolen;
msg.msg_iov = aiov;
msg.msg_iovlen = 1;
msg.msg_control = NULL;
msg.msg_flags = 0;
aiov[0].iov_base = (char *)packet;
aiov[0].iov_len = linux_args->len;
error = linux_sendit(td, linux_args->s, &msg, linux_args->flags,
NULL, UIO_SYSSPACE);
goout:
free(packet, M_TEMP);
return (error);
}
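A hedged note on the two field fix-ups above: a Linux raw-IP sender fills ip_len and ip_off in network byte order, while the FreeBSD raw-IP output path of this era expects them in host byte order, hence the ntohs(). A minimal stand-alone sketch of that byte-order adjustment (illustrative only, not part of this change):

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint16_t linux_ip_off = htons(0x4000);		/* IP_DF as a Linux app stores it */
	uint16_t bsd_ip_off = ntohs(linux_ip_off);	/* what this kernel expects */

	printf("wire 0x%04x -> host 0x%04x\n", linux_ip_off, bsd_ip_off);
	return (0);
}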
struct linux_socket_args {
int domain;
int type;
int protocol;
};
static int
linux_socket(struct thread *td, struct linux_socket_args *args)
{
struct socket_args /* {
int domain;
int type;
int protocol;
} */ bsd_args;
int retval_socket, socket_flags;
bsd_args.protocol = args->protocol;
socket_flags = args->type & ~LINUX_SOCK_TYPE_MASK;
if (socket_flags & ~(LINUX_SOCK_CLOEXEC | LINUX_SOCK_NONBLOCK))
return (EINVAL);
bsd_args.type = args->type & LINUX_SOCK_TYPE_MASK;
if (bsd_args.type < 0 || bsd_args.type > LINUX_SOCK_MAX)
return (EINVAL);
bsd_args.domain = linux_to_bsd_domain(args->domain);
if (bsd_args.domain == -1)
return (EAFNOSUPPORT);
- retval_socket = socket(td, &bsd_args);
+ retval_socket = sys_socket(td, &bsd_args);
if (retval_socket)
return (retval_socket);
retval_socket = linux_set_socket_flags(td, td->td_retval[0],
socket_flags);
if (retval_socket) {
(void)kern_close(td, td->td_retval[0]);
goto out;
}
if (bsd_args.type == SOCK_RAW
&& (bsd_args.protocol == IPPROTO_RAW || bsd_args.protocol == 0)
&& bsd_args.domain == PF_INET) {
/* It's a raw IP socket: set the IP_HDRINCL option. */
int hdrincl;
hdrincl = 1;
/* We ignore any error returned by kern_setsockopt() */
kern_setsockopt(td, td->td_retval[0], IPPROTO_IP, IP_HDRINCL,
&hdrincl, UIO_SYSSPACE, sizeof(hdrincl));
}
#ifdef INET6
/*
* A Linux AF_INET6 socket has the IPV6_V6ONLY option set to 0 by default
* and some apps depend on this, so set V6ONLY to 0 for Linux apps.
* For simplicity we do this regardless of the net.inet6.ip6.v6only
* sysctl value.
*/
if (bsd_args.domain == PF_INET6) {
int v6only;
v6only = 0;
/* We ignore any error returned by setsockopt() */
kern_setsockopt(td, td->td_retval[0], IPPROTO_IPV6, IPV6_V6ONLY,
&v6only, UIO_SYSSPACE, sizeof(v6only));
}
#endif
out:
return (retval_socket);
}
struct linux_bind_args {
int s;
l_uintptr_t name;
int namelen;
};
static int
linux_bind(struct thread *td, struct linux_bind_args *args)
{
struct sockaddr *sa;
int error;
error = linux_getsockaddr(&sa, PTRIN(args->name),
args->namelen);
if (error)
return (error);
error = kern_bind(td, args->s, sa);
free(sa, M_SONAME);
if (error == EADDRNOTAVAIL && args->namelen != sizeof(struct sockaddr_in))
return (EINVAL);
return (error);
}
struct linux_connect_args {
int s;
l_uintptr_t name;
int namelen;
};
int linux_connect(struct thread *, struct linux_connect_args *);
int
linux_connect(struct thread *td, struct linux_connect_args *args)
{
struct socket *so;
struct sockaddr *sa;
u_int fflag;
int error;
error = linux_getsockaddr(&sa, (struct osockaddr *)PTRIN(args->name),
args->namelen);
if (error)
return (error);
error = kern_connect(td, args->s, sa);
free(sa, M_SONAME);
if (error != EISCONN)
return (error);
/*
* Linux doesn't return EISCONN the first time it occurs
* on a non-blocking socket. Instead it returns the
* error getsockopt(SOL_SOCKET, SO_ERROR) would return on BSD.
*
* XXXRW: Instead of using fgetsock(), check that it is a
* socket and use the file descriptor reference instead of
* creating a new one.
*/
error = fgetsock(td, args->s, CAP_CONNECT, &so, &fflag);
if (error == 0) {
error = EISCONN;
if (fflag & FNONBLOCK) {
SOCK_LOCK(so);
if (so->so_emuldata == 0)
error = so->so_error;
so->so_emuldata = (void *)1;
SOCK_UNLOCK(so);
}
fputsock(so);
}
return (error);
}
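The EISCONN handling above caters to the usual Linux non-blocking connect idiom: connect(2) fails with EINPROGRESS, the caller waits for writability, then reads the deferred status with SO_ERROR instead of seeing EISCONN. A hedged Linux-side sketch of that pattern (illustrative only; the address is a documentation example, not part of this change):

#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct sockaddr_in sin;
	struct pollfd pfd;
	int s, err;
	socklen_t len = sizeof(err);

	s = socket(AF_INET, SOCK_STREAM, 0);
	fcntl(s, F_SETFL, O_NONBLOCK);

	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_port = htons(80);
	inet_pton(AF_INET, "192.0.2.1", &sin.sin_addr);	/* example address */

	if (connect(s, (struct sockaddr *)&sin, sizeof(sin)) == -1 &&
	    errno == EINPROGRESS) {
		pfd.fd = s;
		pfd.events = POLLOUT;
		(void)poll(&pfd, 1, 5000);
		/* Linux reports the outcome here rather than via EISCONN. */
		getsockopt(s, SOL_SOCKET, SO_ERROR, &err, &len);
		printf("connect status: %s\n", strerror(err));
	}
	close(s);
	return (0);
}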
struct linux_listen_args {
int s;
int backlog;
};
static int
linux_listen(struct thread *td, struct linux_listen_args *args)
{
struct listen_args /* {
int s;
int backlog;
} */ bsd_args;
bsd_args.s = args->s;
bsd_args.backlog = args->backlog;
- return (listen(td, &bsd_args));
+ return (sys_listen(td, &bsd_args));
}
static int
linux_accept_common(struct thread *td, int s, l_uintptr_t addr,
l_uintptr_t namelen, int flags)
{
struct accept_args /* {
int s;
struct sockaddr * __restrict name;
socklen_t * __restrict anamelen;
} */ bsd_args;
int error;
if (flags & ~(LINUX_SOCK_CLOEXEC | LINUX_SOCK_NONBLOCK))
return (EINVAL);
bsd_args.s = s;
/* XXX: */
bsd_args.name = (struct sockaddr * __restrict)PTRIN(addr);
bsd_args.anamelen = PTRIN(namelen);/* XXX */
- error = accept(td, &bsd_args);
+ error = sys_accept(td, &bsd_args);
bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.name);
if (error) {
if (error == EFAULT && namelen != sizeof(struct sockaddr_in))
return (EINVAL);
return (error);
}
/*
* Linux appears not to copy flags from the parent socket to the
* accepted one, so we must clear the flags in the new descriptor
* and apply the requested flags.
*/
error = kern_fcntl(td, td->td_retval[0], F_SETFL, 0);
if (error)
goto out;
error = linux_set_socket_flags(td, td->td_retval[0], flags);
if (error)
goto out;
if (addr)
error = linux_sa_put(PTRIN(addr));
out:
if (error) {
(void)kern_close(td, td->td_retval[0]);
td->td_retval[0] = 0;
}
return (error);
}
struct linux_accept_args {
int s;
l_uintptr_t addr;
l_uintptr_t namelen;
};
static int
linux_accept(struct thread *td, struct linux_accept_args *args)
{
return (linux_accept_common(td, args->s, args->addr,
args->namelen, 0));
}
struct linux_accept4_args {
int s;
l_uintptr_t addr;
l_uintptr_t namelen;
int flags;
};
static int
linux_accept4(struct thread *td, struct linux_accept4_args *args)
{
return (linux_accept_common(td, args->s, args->addr,
args->namelen, args->flags));
}
struct linux_getsockname_args {
int s;
l_uintptr_t addr;
l_uintptr_t namelen;
};
static int
linux_getsockname(struct thread *td, struct linux_getsockname_args *args)
{
struct getsockname_args /* {
int fdes;
struct sockaddr * __restrict asa;
socklen_t * __restrict alen;
} */ bsd_args;
int error;
bsd_args.fdes = args->s;
/* XXX: */
bsd_args.asa = (struct sockaddr * __restrict)PTRIN(args->addr);
bsd_args.alen = PTRIN(args->namelen); /* XXX */
- error = getsockname(td, &bsd_args);
+ error = sys_getsockname(td, &bsd_args);
bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.asa);
if (error)
return (error);
error = linux_sa_put(PTRIN(args->addr));
if (error)
return (error);
return (0);
}
struct linux_getpeername_args {
int s;
l_uintptr_t addr;
l_uintptr_t namelen;
};
static int
linux_getpeername(struct thread *td, struct linux_getpeername_args *args)
{
struct getpeername_args /* {
int fdes;
caddr_t asa;
int *alen;
} */ bsd_args;
int error;
bsd_args.fdes = args->s;
bsd_args.asa = (struct sockaddr *)PTRIN(args->addr);
bsd_args.alen = (int *)PTRIN(args->namelen);
- error = getpeername(td, &bsd_args);
+ error = sys_getpeername(td, &bsd_args);
bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.asa);
if (error)
return (error);
error = linux_sa_put(PTRIN(args->addr));
if (error)
return (error);
return (0);
}
struct linux_socketpair_args {
int domain;
int type;
int protocol;
l_uintptr_t rsv;
};
static int
linux_socketpair(struct thread *td, struct linux_socketpair_args *args)
{
struct socketpair_args /* {
int domain;
int type;
int protocol;
int *rsv;
} */ bsd_args;
int error, socket_flags;
int sv[2];
bsd_args.domain = linux_to_bsd_domain(args->domain);
if (bsd_args.domain != PF_LOCAL)
return (EAFNOSUPPORT);
socket_flags = args->type & ~LINUX_SOCK_TYPE_MASK;
if (socket_flags & ~(LINUX_SOCK_CLOEXEC | LINUX_SOCK_NONBLOCK))
return (EINVAL);
bsd_args.type = args->type & LINUX_SOCK_TYPE_MASK;
if (bsd_args.type < 0 || bsd_args.type > LINUX_SOCK_MAX)
return (EINVAL);
if (args->protocol != 0 && args->protocol != PF_UNIX)
/*
* Using PF_UNIX as the protocol argument is not right,
* but Linux does it.
* Do not map PF_UNIX as its Linux value is identical
* to the FreeBSD one.
*/
return (EPROTONOSUPPORT);
else
bsd_args.protocol = 0;
bsd_args.rsv = (int *)PTRIN(args->rsv);
error = kern_socketpair(td, bsd_args.domain, bsd_args.type,
bsd_args.protocol, sv);
if (error)
return (error);
error = linux_set_socket_flags(td, sv[0], socket_flags);
if (error)
goto out;
error = linux_set_socket_flags(td, sv[1], socket_flags);
if (error)
goto out;
error = copyout(sv, bsd_args.rsv, 2 * sizeof(int));
out:
if (error) {
(void)kern_close(td, sv[0]);
(void)kern_close(td, sv[1]);
}
return (error);
}
struct linux_send_args {
int s;
l_uintptr_t msg;
int len;
int flags;
};
static int
linux_send(struct thread *td, struct linux_send_args *args)
{
struct sendto_args /* {
int s;
caddr_t buf;
int len;
int flags;
caddr_t to;
int tolen;
} */ bsd_args;
bsd_args.s = args->s;
bsd_args.buf = (caddr_t)PTRIN(args->msg);
bsd_args.len = args->len;
bsd_args.flags = args->flags;
bsd_args.to = NULL;
bsd_args.tolen = 0;
- return sendto(td, &bsd_args);
+ return sys_sendto(td, &bsd_args);
}
struct linux_recv_args {
int s;
l_uintptr_t msg;
int len;
int flags;
};
static int
linux_recv(struct thread *td, struct linux_recv_args *args)
{
struct recvfrom_args /* {
int s;
caddr_t buf;
int len;
int flags;
struct sockaddr *from;
socklen_t fromlenaddr;
} */ bsd_args;
bsd_args.s = args->s;
bsd_args.buf = (caddr_t)PTRIN(args->msg);
bsd_args.len = args->len;
bsd_args.flags = linux_to_bsd_msg_flags(args->flags);
bsd_args.from = NULL;
bsd_args.fromlenaddr = 0;
- return (recvfrom(td, &bsd_args));
+ return (sys_recvfrom(td, &bsd_args));
}
static int
linux_sendto(struct thread *td, struct linux_sendto_args *args)
{
struct msghdr msg;
struct iovec aiov;
int error;
if (linux_check_hdrincl(td, args->s) == 0)
/* IP_HDRINCL set, tweak the packet before sending */
return (linux_sendto_hdrincl(td, args));
msg.msg_name = PTRIN(args->to);
msg.msg_namelen = args->tolen;
msg.msg_iov = &aiov;
msg.msg_iovlen = 1;
msg.msg_control = NULL;
msg.msg_flags = 0;
aiov.iov_base = PTRIN(args->msg);
aiov.iov_len = args->len;
error = linux_sendit(td, args->s, &msg, args->flags, NULL,
UIO_USERSPACE);
return (error);
}
struct linux_recvfrom_args {
int s;
l_uintptr_t buf;
int len;
int flags;
l_uintptr_t from;
l_uintptr_t fromlen;
};
static int
linux_recvfrom(struct thread *td, struct linux_recvfrom_args *args)
{
struct recvfrom_args /* {
int s;
caddr_t buf;
size_t len;
int flags;
struct sockaddr * __restrict from;
socklen_t * __restrict fromlenaddr;
} */ bsd_args;
size_t len;
int error;
if ((error = copyin(PTRIN(args->fromlen), &len, sizeof(size_t))))
return (error);
bsd_args.s = args->s;
bsd_args.buf = PTRIN(args->buf);
bsd_args.len = args->len;
bsd_args.flags = linux_to_bsd_msg_flags(args->flags);
/* XXX: */
bsd_args.from = (struct sockaddr * __restrict)PTRIN(args->from);
bsd_args.fromlenaddr = PTRIN(args->fromlen);/* XXX */
linux_to_bsd_sockaddr((struct sockaddr *)bsd_args.from, len);
- error = recvfrom(td, &bsd_args);
+ error = sys_recvfrom(td, &bsd_args);
bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.from);
if (error)
return (error);
if (args->from) {
error = linux_sa_put((struct osockaddr *)
PTRIN(args->from));
if (error)
return (error);
}
return (0);
}
struct linux_sendmsg_args {
int s;
l_uintptr_t msg;
int flags;
};
static int
linux_sendmsg(struct thread *td, struct linux_sendmsg_args *args)
{
struct cmsghdr *cmsg;
struct cmsgcred cmcred;
struct mbuf *control;
struct msghdr msg;
struct l_cmsghdr linux_cmsg;
struct l_cmsghdr *ptr_cmsg;
struct l_msghdr linux_msg;
struct iovec *iov;
socklen_t datalen;
struct sockaddr *sa;
sa_family_t sa_family;
void *data;
int error;
error = copyin(PTRIN(args->msg), &linux_msg, sizeof(linux_msg));
if (error)
return (error);
/*
* Some Linux applications (ping) define a non-NULL control data
* pointer, but a msg_controllen of 0, which is not allowed in the
* FreeBSD system call interface. NULL the msg_control pointer in
* order to handle this case. This should be checked, but allows the
* Linux ping to work.
*/
if (PTRIN(linux_msg.msg_control) != NULL && linux_msg.msg_controllen == 0)
linux_msg.msg_control = PTROUT(NULL);
error = linux_to_bsd_msghdr(&msg, &linux_msg);
if (error)
return (error);
#ifdef COMPAT_LINUX32
error = linux32_copyiniov(PTRIN(msg.msg_iov), msg.msg_iovlen,
&iov, EMSGSIZE);
#else
error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
#endif
if (error)
return (error);
control = NULL;
cmsg = NULL;
if ((ptr_cmsg = LINUX_CMSG_FIRSTHDR(&linux_msg)) != NULL) {
error = kern_getsockname(td, args->s, &sa, &datalen);
if (error)
goto bad;
sa_family = sa->sa_family;
free(sa, M_SONAME);
error = ENOBUFS;
cmsg = malloc(CMSG_HDRSZ, M_TEMP, M_WAITOK | M_ZERO);
control = m_get(M_WAIT, MT_CONTROL);
if (control == NULL)
goto bad;
do {
error = copyin(ptr_cmsg, &linux_cmsg,
sizeof(struct l_cmsghdr));
if (error)
goto bad;
error = EINVAL;
if (linux_cmsg.cmsg_len < sizeof(struct l_cmsghdr))
goto bad;
/*
* For now we support only SCM_RIGHTS and SCM_CREDS,
* so return EINVAL for any other cmsg_type
*/
cmsg->cmsg_type =
linux_to_bsd_cmsg_type(linux_cmsg.cmsg_type);
cmsg->cmsg_level =
linux_to_bsd_sockopt_level(linux_cmsg.cmsg_level);
if (cmsg->cmsg_type == -1
|| cmsg->cmsg_level != SOL_SOCKET)
goto bad;
/*
* Some applications (e.g. pulseaudio) attempt to
* send ancillary data even if the underlying protocol
* doesn't support it, which is not allowed in the
* FreeBSD system call interface.
*/
if (sa_family != AF_UNIX)
continue;
data = LINUX_CMSG_DATA(ptr_cmsg);
datalen = linux_cmsg.cmsg_len - L_CMSG_HDRSZ;
switch (cmsg->cmsg_type)
{
case SCM_RIGHTS:
break;
case SCM_CREDS:
data = &cmcred;
datalen = sizeof(cmcred);
/*
* The lower levels will fill in the structure
*/
bzero(data, datalen);
break;
}
cmsg->cmsg_len = CMSG_LEN(datalen);
error = ENOBUFS;
if (!m_append(control, CMSG_HDRSZ, (c_caddr_t) cmsg))
goto bad;
if (!m_append(control, datalen, (c_caddr_t) data))
goto bad;
} while ((ptr_cmsg = LINUX_CMSG_NXTHDR(&linux_msg, ptr_cmsg)));
if (m_length(control, NULL) == 0) {
m_freem(control);
control = NULL;
}
}
msg.msg_iov = iov;
msg.msg_flags = 0;
error = linux_sendit(td, args->s, &msg, args->flags, control,
UIO_USERSPACE);
bad:
free(iov, M_IOV);
if (cmsg)
free(cmsg, M_TEMP);
return (error);
}
struct linux_recvmsg_args {
int s;
l_uintptr_t msg;
int flags;
};
static int
linux_recvmsg(struct thread *td, struct linux_recvmsg_args *args)
{
struct cmsghdr *cm;
struct cmsgcred *cmcred;
struct msghdr msg;
struct l_cmsghdr *linux_cmsg = NULL;
struct l_ucred linux_ucred;
socklen_t datalen, outlen;
struct l_msghdr linux_msg;
struct iovec *iov, *uiov;
struct mbuf *control = NULL;
struct mbuf **controlp;
caddr_t outbuf;
void *data;
int error, i, fd, fds, *fdp;
error = copyin(PTRIN(args->msg), &linux_msg, sizeof(linux_msg));
if (error)
return (error);
error = linux_to_bsd_msghdr(&msg, &linux_msg);
if (error)
return (error);
#ifdef COMPAT_LINUX32
error = linux32_copyiniov(PTRIN(msg.msg_iov), msg.msg_iovlen,
&iov, EMSGSIZE);
#else
error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
#endif
if (error)
return (error);
if (msg.msg_name) {
error = linux_to_bsd_sockaddr((struct sockaddr *)msg.msg_name,
msg.msg_namelen);
if (error)
goto bad;
}
uiov = msg.msg_iov;
msg.msg_iov = iov;
controlp = (msg.msg_control != NULL) ? &control : NULL;
error = kern_recvit(td, args->s, &msg, UIO_USERSPACE, controlp);
msg.msg_iov = uiov;
if (error)
goto bad;
error = bsd_to_linux_msghdr(&msg, &linux_msg);
if (error)
goto bad;
if (linux_msg.msg_name) {
error = bsd_to_linux_sockaddr((struct sockaddr *)
PTRIN(linux_msg.msg_name));
if (error)
goto bad;
}
if (linux_msg.msg_name && linux_msg.msg_namelen > 2) {
error = linux_sa_put(PTRIN(linux_msg.msg_name));
if (error)
goto bad;
}
outbuf = PTRIN(linux_msg.msg_control);
outlen = 0;
if (control) {
linux_cmsg = malloc(L_CMSG_HDRSZ, M_TEMP, M_WAITOK | M_ZERO);
msg.msg_control = mtod(control, struct cmsghdr *);
msg.msg_controllen = control->m_len;
cm = CMSG_FIRSTHDR(&msg);
while (cm != NULL) {
linux_cmsg->cmsg_type =
bsd_to_linux_cmsg_type(cm->cmsg_type);
linux_cmsg->cmsg_level =
bsd_to_linux_sockopt_level(cm->cmsg_level);
if (linux_cmsg->cmsg_type == -1
|| cm->cmsg_level != SOL_SOCKET)
{
error = EINVAL;
goto bad;
}
data = CMSG_DATA(cm);
datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
switch (cm->cmsg_type)
{
case SCM_RIGHTS:
if (args->flags & LINUX_MSG_CMSG_CLOEXEC) {
fds = datalen / sizeof(int);
fdp = data;
for (i = 0; i < fds; i++) {
fd = *fdp++;
(void)kern_fcntl(td, fd,
F_SETFD, FD_CLOEXEC);
}
}
break;
case SCM_CREDS:
/*
* Currently LOCAL_CREDS is never in
* effect for Linux, so there is no need to worry
* about sockcred
*/
if (datalen != sizeof (*cmcred)) {
error = EMSGSIZE;
goto bad;
}
cmcred = (struct cmsgcred *)data;
bzero(&linux_ucred, sizeof(linux_ucred));
linux_ucred.pid = cmcred->cmcred_pid;
linux_ucred.uid = cmcred->cmcred_uid;
linux_ucred.gid = cmcred->cmcred_gid;
data = &linux_ucred;
datalen = sizeof(linux_ucred);
break;
}
if (outlen + LINUX_CMSG_LEN(datalen) >
linux_msg.msg_controllen) {
if (outlen == 0) {
error = EMSGSIZE;
goto bad;
} else {
linux_msg.msg_flags |=
LINUX_MSG_CTRUNC;
goto out;
}
}
linux_cmsg->cmsg_len = LINUX_CMSG_LEN(datalen);
error = copyout(linux_cmsg, outbuf, L_CMSG_HDRSZ);
if (error)
goto bad;
outbuf += L_CMSG_HDRSZ;
error = copyout(data, outbuf, datalen);
if (error)
goto bad;
outbuf += LINUX_CMSG_ALIGN(datalen);
outlen += LINUX_CMSG_LEN(datalen);
cm = CMSG_NXTHDR(&msg, cm);
}
}
out:
linux_msg.msg_controllen = outlen;
error = copyout(&linux_msg, PTRIN(args->msg), sizeof(linux_msg));
bad:
free(iov, M_IOV);
if (control != NULL)
m_freem(control);
if (linux_cmsg != NULL)
free(linux_cmsg, M_TEMP);
return (error);
}
struct linux_shutdown_args {
int s;
int how;
};
static int
linux_shutdown(struct thread *td, struct linux_shutdown_args *args)
{
struct shutdown_args /* {
int s;
int how;
} */ bsd_args;
bsd_args.s = args->s;
bsd_args.how = args->how;
- return (shutdown(td, &bsd_args));
+ return (sys_shutdown(td, &bsd_args));
}
struct linux_setsockopt_args {
int s;
int level;
int optname;
l_uintptr_t optval;
int optlen;
};
static int
linux_setsockopt(struct thread *td, struct linux_setsockopt_args *args)
{
struct setsockopt_args /* {
int s;
int level;
int name;
caddr_t val;
int valsize;
} */ bsd_args;
l_timeval linux_tv;
struct timeval tv;
int error, name;
bsd_args.s = args->s;
bsd_args.level = linux_to_bsd_sockopt_level(args->level);
switch (bsd_args.level) {
case SOL_SOCKET:
name = linux_to_bsd_so_sockopt(args->optname);
switch (name) {
case SO_RCVTIMEO:
/* FALLTHROUGH */
case SO_SNDTIMEO:
error = copyin(PTRIN(args->optval), &linux_tv,
sizeof(linux_tv));
if (error)
return (error);
tv.tv_sec = linux_tv.tv_sec;
tv.tv_usec = linux_tv.tv_usec;
return (kern_setsockopt(td, args->s, bsd_args.level,
name, &tv, UIO_SYSSPACE, sizeof(tv)));
/* NOTREACHED */
break;
default:
break;
}
break;
case IPPROTO_IP:
name = linux_to_bsd_ip_sockopt(args->optname);
break;
case IPPROTO_TCP:
/* Linux TCP option values match BSD's */
name = args->optname;
break;
default:
name = -1;
break;
}
if (name == -1)
return (ENOPROTOOPT);
bsd_args.name = name;
bsd_args.val = PTRIN(args->optval);
bsd_args.valsize = args->optlen;
if (name == IPV6_NEXTHOP) {
linux_to_bsd_sockaddr((struct sockaddr *)bsd_args.val,
bsd_args.valsize);
- error = setsockopt(td, &bsd_args);
+ error = sys_setsockopt(td, &bsd_args);
bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.val);
} else
- error = setsockopt(td, &bsd_args);
+ error = sys_setsockopt(td, &bsd_args);
return (error);
}
struct linux_getsockopt_args {
int s;
int level;
int optname;
l_uintptr_t optval;
l_uintptr_t optlen;
};
static int
linux_getsockopt(struct thread *td, struct linux_getsockopt_args *args)
{
struct getsockopt_args /* {
int s;
int level;
int name;
caddr_t val;
int *avalsize;
} */ bsd_args;
l_timeval linux_tv;
struct timeval tv;
socklen_t tv_len, xulen;
struct xucred xu;
struct l_ucred lxu;
int error, name;
bsd_args.s = args->s;
bsd_args.level = linux_to_bsd_sockopt_level(args->level);
switch (bsd_args.level) {
case SOL_SOCKET:
name = linux_to_bsd_so_sockopt(args->optname);
switch (name) {
case SO_RCVTIMEO:
/* FALLTHROUGH */
case SO_SNDTIMEO:
tv_len = sizeof(tv);
error = kern_getsockopt(td, args->s, bsd_args.level,
name, &tv, UIO_SYSSPACE, &tv_len);
if (error)
return (error);
linux_tv.tv_sec = tv.tv_sec;
linux_tv.tv_usec = tv.tv_usec;
return (copyout(&linux_tv, PTRIN(args->optval),
sizeof(linux_tv)));
/* NOTREACHED */
break;
case LOCAL_PEERCRED:
if (args->optlen != sizeof(lxu))
return (EINVAL);
xulen = sizeof(xu);
error = kern_getsockopt(td, args->s, bsd_args.level,
name, &xu, UIO_SYSSPACE, &xulen);
if (error)
return (error);
/*
* XXX Use 0 for pid as FreeBSD does not cache the peer pid.
*/
lxu.pid = 0;
lxu.uid = xu.cr_uid;
lxu.gid = xu.cr_gid;
return (copyout(&lxu, PTRIN(args->optval), sizeof(lxu)));
/* NOTREACHED */
break;
default:
break;
}
break;
case IPPROTO_IP:
name = linux_to_bsd_ip_sockopt(args->optname);
break;
case IPPROTO_TCP:
/* Linux TCP option values match BSD's */
name = args->optname;
break;
default:
name = -1;
break;
}
if (name == -1)
return (EINVAL);
bsd_args.name = name;
bsd_args.val = PTRIN(args->optval);
bsd_args.avalsize = PTRIN(args->optlen);
if (name == IPV6_NEXTHOP) {
- error = getsockopt(td, &bsd_args);
+ error = sys_getsockopt(td, &bsd_args);
bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.val);
} else
- error = getsockopt(td, &bsd_args);
+ error = sys_getsockopt(td, &bsd_args);
return (error);
}
/* Argument list sizes for linux_socketcall */
#define LINUX_AL(x) ((x) * sizeof(l_ulong))
static const unsigned char lxs_args[] = {
LINUX_AL(0) /* unused */, LINUX_AL(3) /* socket */,
LINUX_AL(3) /* bind */, LINUX_AL(3) /* connect */,
LINUX_AL(2) /* listen */, LINUX_AL(3) /* accept */,
LINUX_AL(3) /* getsockname */, LINUX_AL(3) /* getpeername */,
LINUX_AL(4) /* socketpair */, LINUX_AL(4) /* send */,
LINUX_AL(4) /* recv */, LINUX_AL(6) /* sendto */,
LINUX_AL(6) /* recvfrom */, LINUX_AL(2) /* shutdown */,
LINUX_AL(5) /* setsockopt */, LINUX_AL(5) /* getsockopt */,
LINUX_AL(3) /* sendmsg */, LINUX_AL(3) /* recvmsg */,
LINUX_AL(4) /* accept4 */
};
#define LINUX_AL_SIZE (sizeof(lxs_args) / sizeof(lxs_args[0]) - 1)
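/*
 * Worked example of the table above: LINUX_SENDTO is entry 11, so
 * lxs_args[LINUX_SENDTO] == LINUX_AL(6) and six l_ulong slots are
 * copied in from the user-supplied argument block before dispatching.
 * (The LINUX_* selector values themselves come from the Linux ABI
 * headers; the indices here are simply read off the table.)
 */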
int
linux_socketcall(struct thread *td, struct linux_socketcall_args *args)
{
l_ulong a[6];
void *arg;
int error;
if (args->what < LINUX_SOCKET || args->what > LINUX_AL_SIZE)
return (EINVAL);
error = copyin(PTRIN(args->args), a, lxs_args[args->what]);
if (error)
return (error);
arg = a;
switch (args->what) {
case LINUX_SOCKET:
return (linux_socket(td, arg));
case LINUX_BIND:
return (linux_bind(td, arg));
case LINUX_CONNECT:
return (linux_connect(td, arg));
case LINUX_LISTEN:
return (linux_listen(td, arg));
case LINUX_ACCEPT:
return (linux_accept(td, arg));
case LINUX_GETSOCKNAME:
return (linux_getsockname(td, arg));
case LINUX_GETPEERNAME:
return (linux_getpeername(td, arg));
case LINUX_SOCKETPAIR:
return (linux_socketpair(td, arg));
case LINUX_SEND:
return (linux_send(td, arg));
case LINUX_RECV:
return (linux_recv(td, arg));
case LINUX_SENDTO:
return (linux_sendto(td, arg));
case LINUX_RECVFROM:
return (linux_recvfrom(td, arg));
case LINUX_SHUTDOWN:
return (linux_shutdown(td, arg));
case LINUX_SETSOCKOPT:
return (linux_setsockopt(td, arg));
case LINUX_GETSOCKOPT:
return (linux_getsockopt(td, arg));
case LINUX_SENDMSG:
return (linux_sendmsg(td, arg));
case LINUX_RECVMSG:
return (linux_recvmsg(td, arg));
case LINUX_ACCEPT4:
return (linux_accept4(td, arg));
}
uprintf("LINUX: 'socket' typ=%d not implemented\n", args->what);
return (ENOSYS);
}
Index: head/sys/compat/linux/linux_uid16.c
===================================================================
--- head/sys/compat/linux/linux_uid16.c (revision 225616)
+++ head/sys/compat/linux/linux_uid16.c (revision 225617)
@@ -1,306 +1,306 @@
/*-
* Copyright (c) 2001 The FreeBSD Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include <sys/fcntl.h>
#include <sys/param.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#ifdef COMPAT_LINUX32
#include <machine/../linux32/linux.h>
#include <machine/../linux32/linux32_proto.h>
#else
#include <machine/../linux/linux.h>
#include <machine/../linux/linux_proto.h>
#endif
#include <compat/linux/linux_util.h>
DUMMY(setfsuid16);
DUMMY(setfsgid16);
DUMMY(getresuid16);
DUMMY(getresgid16);
#define CAST_NOCHG(x) (((x) == 0xFFFF) ? -1 : (x))
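/*
 * Linux's 16-bit credential calls pass 0xFFFF (-1 in 16 bits) to mean
 * "leave this ID unchanged"; CAST_NOCHG() widens that back into the -1
 * the native set*uid()/set*gid() interfaces expect.  For example, a
 * Linux setreuid16(-1, 1001) reaches the native layer below as
 * setreuid(-1, 1001).
 */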
int
linux_chown16(struct thread *td, struct linux_chown16_args *args)
{
char *path;
int error;
LCONVPATHEXIST(td, args->path, &path);
#ifdef DEBUG
if (ldebug(chown16))
printf(ARGS(chown16, "%s, %d, %d"), path, args->uid, args->gid);
#endif
error = kern_chown(td, path, UIO_SYSSPACE, CAST_NOCHG(args->uid),
CAST_NOCHG(args->gid));
LFREEPATH(path);
return (error);
}
int
linux_lchown16(struct thread *td, struct linux_lchown16_args *args)
{
char *path;
int error;
LCONVPATHEXIST(td, args->path, &path);
#ifdef DEBUG
if (ldebug(lchown16))
printf(ARGS(lchown16, "%s, %d, %d"), path, args->uid,
args->gid);
#endif
error = kern_lchown(td, path, UIO_SYSSPACE, CAST_NOCHG(args->uid),
CAST_NOCHG(args->gid));
LFREEPATH(path);
return (error);
}
int
linux_setgroups16(struct thread *td, struct linux_setgroups16_args *args)
{
struct ucred *newcred, *oldcred;
l_gid16_t *linux_gidset;
gid_t *bsd_gidset;
int ngrp, error;
struct proc *p;
#ifdef DEBUG
if (ldebug(setgroups16))
printf(ARGS(setgroups16, "%d, *"), args->gidsetsize);
#endif
ngrp = args->gidsetsize;
if (ngrp < 0 || ngrp >= ngroups_max + 1)
return (EINVAL);
linux_gidset = malloc(ngrp * sizeof(*linux_gidset), M_TEMP, M_WAITOK);
error = copyin(args->gidset, linux_gidset, ngrp * sizeof(l_gid16_t));
if (error) {
free(linux_gidset, M_TEMP);
return (error);
}
newcred = crget();
p = td->td_proc;
PROC_LOCK(p);
oldcred = crcopysafe(p, newcred);
/*
* cr_groups[0] holds egid. Setting the whole set from
* the supplied set will cause egid to be changed too.
* Keep cr_groups[0] unchanged to prevent that.
*/
if ((error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS, 0)) != 0) {
PROC_UNLOCK(p);
crfree(newcred);
goto out;
}
if (ngrp > 0) {
newcred->cr_ngroups = ngrp + 1;
bsd_gidset = newcred->cr_groups;
ngrp--;
while (ngrp >= 0) {
bsd_gidset[ngrp + 1] = linux_gidset[ngrp];
ngrp--;
}
}
else
newcred->cr_ngroups = 1;
setsugid(td->td_proc);
p->p_ucred = newcred;
PROC_UNLOCK(p);
crfree(oldcred);
error = 0;
out:
free(linux_gidset, M_TEMP);
return (error);
}
int
linux_getgroups16(struct thread *td, struct linux_getgroups16_args *args)
{
struct ucred *cred;
l_gid16_t *linux_gidset;
gid_t *bsd_gidset;
int bsd_gidsetsz, ngrp, error;
#ifdef DEBUG
if (ldebug(getgroups16))
printf(ARGS(getgroups16, "%d, *"), args->gidsetsize);
#endif
cred = td->td_ucred;
bsd_gidset = cred->cr_groups;
bsd_gidsetsz = cred->cr_ngroups - 1;
/*
* cr_groups[0] holds egid. Returning the whole set
* here will cause a duplicate. Exclude cr_groups[0]
* to prevent that.
*/
if ((ngrp = args->gidsetsize) == 0) {
td->td_retval[0] = bsd_gidsetsz;
return (0);
}
if (ngrp < bsd_gidsetsz)
return (EINVAL);
ngrp = 0;
linux_gidset = malloc(bsd_gidsetsz * sizeof(*linux_gidset),
M_TEMP, M_WAITOK);
while (ngrp < bsd_gidsetsz) {
linux_gidset[ngrp] = bsd_gidset[ngrp + 1];
ngrp++;
}
error = copyout(linux_gidset, args->gidset, ngrp * sizeof(l_gid16_t));
free(linux_gidset, M_TEMP);
if (error)
return (error);
td->td_retval[0] = ngrp;
return (0);
}
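/*
 * Credential layout assumed by the two routines above, matching their
 * inline comments: cr_groups[0] holds the effective GID and
 * cr_groups[1..cr_ngroups-1] the supplementary groups, so a credential
 * with cr_ngroups == 4 presents exactly three groups through the Linux
 * getgroups16()/setgroups16() view.
 */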
/*
* The FreeBSD native getgid(2) and getuid(2) also modify td->td_retval[1]
* when COMPAT_43 is defined. This clobbers registers that are assumed to
* be preserved. The following lightweight syscalls fix this. See also
* linux_getpid(2), linux_getgid(2) and linux_getuid(2) in linux_misc.c
*
* linux_getgid16() - MP SAFE
* linux_getuid16() - MP SAFE
*/
int
linux_getgid16(struct thread *td, struct linux_getgid16_args *args)
{
td->td_retval[0] = td->td_ucred->cr_rgid;
return (0);
}
int
linux_getuid16(struct thread *td, struct linux_getuid16_args *args)
{
td->td_retval[0] = td->td_ucred->cr_ruid;
return (0);
}
int
linux_getegid16(struct thread *td, struct linux_getegid16_args *args)
{
struct getegid_args bsd;
- return (getegid(td, &bsd));
+ return (sys_getegid(td, &bsd));
}
int
linux_geteuid16(struct thread *td, struct linux_geteuid16_args *args)
{
struct geteuid_args bsd;
- return (geteuid(td, &bsd));
+ return (sys_geteuid(td, &bsd));
}
int
linux_setgid16(struct thread *td, struct linux_setgid16_args *args)
{
struct setgid_args bsd;
bsd.gid = args->gid;
- return (setgid(td, &bsd));
+ return (sys_setgid(td, &bsd));
}
int
linux_setuid16(struct thread *td, struct linux_setuid16_args *args)
{
struct setuid_args bsd;
bsd.uid = args->uid;
- return (setuid(td, &bsd));
+ return (sys_setuid(td, &bsd));
}
int
linux_setregid16(struct thread *td, struct linux_setregid16_args *args)
{
struct setregid_args bsd;
bsd.rgid = CAST_NOCHG(args->rgid);
bsd.egid = CAST_NOCHG(args->egid);
- return (setregid(td, &bsd));
+ return (sys_setregid(td, &bsd));
}
int
linux_setreuid16(struct thread *td, struct linux_setreuid16_args *args)
{
struct setreuid_args bsd;
bsd.ruid = CAST_NOCHG(args->ruid);
bsd.euid = CAST_NOCHG(args->euid);
- return (setreuid(td, &bsd));
+ return (sys_setreuid(td, &bsd));
}
int
linux_setresgid16(struct thread *td, struct linux_setresgid16_args *args)
{
struct setresgid_args bsd;
bsd.rgid = CAST_NOCHG(args->rgid);
bsd.egid = CAST_NOCHG(args->egid);
bsd.sgid = CAST_NOCHG(args->sgid);
- return (setresgid(td, &bsd));
+ return (sys_setresgid(td, &bsd));
}
int
linux_setresuid16(struct thread *td, struct linux_setresuid16_args *args)
{
struct setresuid_args bsd;
bsd.ruid = CAST_NOCHG(args->ruid);
bsd.euid = CAST_NOCHG(args->euid);
bsd.suid = CAST_NOCHG(args->suid);
- return (setresuid(td, &bsd));
+ return (sys_setresuid(td, &bsd));
}
Index: head/sys/compat/svr4/svr4_fcntl.c
===================================================================
--- head/sys/compat/svr4/svr4_fcntl.c (revision 225616)
+++ head/sys/compat/svr4/svr4_fcntl.c (revision 225617)
@@ -1,724 +1,724 @@
/*-
* Copyright (c) 1998 Mark Newton
* Copyright (c) 1994, 1997 Christos Zoulas.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Christos Zoulas.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/capability.h>
#include <sys/systm.h>
#include <sys/file.h>
#include <sys/filedesc.h>
/*#include <sys/ioctl.h>*/
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/unistd.h>
#include <sys/vnode.h>
#include <sys/sysproto.h>
#include <compat/svr4/svr4.h>
#include <compat/svr4/svr4_types.h>
#include <compat/svr4/svr4_signal.h>
#include <compat/svr4/svr4_proto.h>
#include <compat/svr4/svr4_util.h>
#include <compat/svr4/svr4_fcntl.h>
#include <security/mac/mac_framework.h>
static int svr4_to_bsd_flags(int);
static u_long svr4_to_bsd_cmd(u_long);
static int fd_revoke(struct thread *, int);
static int fd_truncate(struct thread *, int, struct flock *);
static int bsd_to_svr4_flags(int);
static void bsd_to_svr4_flock(struct flock *, struct svr4_flock *);
static void svr4_to_bsd_flock(struct svr4_flock *, struct flock *);
static void bsd_to_svr4_flock64(struct flock *, struct svr4_flock64 *);
static void svr4_to_bsd_flock64(struct svr4_flock64 *, struct flock *);
static u_long
svr4_to_bsd_cmd(cmd)
u_long cmd;
{
switch (cmd) {
case SVR4_F_DUPFD:
return F_DUPFD;
case SVR4_F_DUP2FD:
return F_DUP2FD;
case SVR4_F_GETFD:
return F_GETFD;
case SVR4_F_SETFD:
return F_SETFD;
case SVR4_F_GETFL:
return F_GETFL;
case SVR4_F_SETFL:
return F_SETFL;
case SVR4_F_GETLK:
return F_GETLK;
case SVR4_F_SETLK:
return F_SETLK;
case SVR4_F_SETLKW:
return F_SETLKW;
default:
return -1;
}
}
static int
svr4_to_bsd_flags(l)
int l;
{
int r = 0;
r |= (l & SVR4_O_RDONLY) ? O_RDONLY : 0;
r |= (l & SVR4_O_WRONLY) ? O_WRONLY : 0;
r |= (l & SVR4_O_RDWR) ? O_RDWR : 0;
r |= (l & SVR4_O_NDELAY) ? O_NONBLOCK : 0;
r |= (l & SVR4_O_APPEND) ? O_APPEND : 0;
r |= (l & SVR4_O_SYNC) ? O_FSYNC : 0;
r |= (l & SVR4_O_NONBLOCK) ? O_NONBLOCK : 0;
r |= (l & SVR4_O_PRIV) ? O_EXLOCK : 0;
r |= (l & SVR4_O_CREAT) ? O_CREAT : 0;
r |= (l & SVR4_O_TRUNC) ? O_TRUNC : 0;
r |= (l & SVR4_O_EXCL) ? O_EXCL : 0;
r |= (l & SVR4_O_NOCTTY) ? O_NOCTTY : 0;
return r;
}
static int
bsd_to_svr4_flags(l)
int l;
{
int r = 0;
r |= (l & O_RDONLY) ? SVR4_O_RDONLY : 0;
r |= (l & O_WRONLY) ? SVR4_O_WRONLY : 0;
r |= (l & O_RDWR) ? SVR4_O_RDWR : 0;
r |= (l & O_NDELAY) ? SVR4_O_NONBLOCK : 0;
r |= (l & O_APPEND) ? SVR4_O_APPEND : 0;
r |= (l & O_FSYNC) ? SVR4_O_SYNC : 0;
r |= (l & O_NONBLOCK) ? SVR4_O_NONBLOCK : 0;
r |= (l & O_EXLOCK) ? SVR4_O_PRIV : 0;
r |= (l & O_CREAT) ? SVR4_O_CREAT : 0;
r |= (l & O_TRUNC) ? SVR4_O_TRUNC : 0;
r |= (l & O_EXCL) ? SVR4_O_EXCL : 0;
r |= (l & O_NOCTTY) ? SVR4_O_NOCTTY : 0;
return r;
}
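/*
 * Example of the two flag translations above: an SVR4 open with
 * (SVR4_O_RDWR | SVR4_O_CREAT | SVR4_O_NDELAY) becomes the native
 * (O_RDWR | O_CREAT | O_NONBLOCK), and F_GETFL results are run through
 * bsd_to_svr4_flags() before being handed back to the emulated program.
 */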
static void
bsd_to_svr4_flock(iflp, oflp)
struct flock *iflp;
struct svr4_flock *oflp;
{
switch (iflp->l_type) {
case F_RDLCK:
oflp->l_type = SVR4_F_RDLCK;
break;
case F_WRLCK:
oflp->l_type = SVR4_F_WRLCK;
break;
case F_UNLCK:
oflp->l_type = SVR4_F_UNLCK;
break;
default:
oflp->l_type = -1;
break;
}
oflp->l_whence = (short) iflp->l_whence;
oflp->l_start = (svr4_off_t) iflp->l_start;
oflp->l_len = (svr4_off_t) iflp->l_len;
oflp->l_sysid = 0;
oflp->l_pid = (svr4_pid_t) iflp->l_pid;
}
static void
svr4_to_bsd_flock(iflp, oflp)
struct svr4_flock *iflp;
struct flock *oflp;
{
switch (iflp->l_type) {
case SVR4_F_RDLCK:
oflp->l_type = F_RDLCK;
break;
case SVR4_F_WRLCK:
oflp->l_type = F_WRLCK;
break;
case SVR4_F_UNLCK:
oflp->l_type = F_UNLCK;
break;
default:
oflp->l_type = -1;
break;
}
oflp->l_whence = iflp->l_whence;
oflp->l_start = (off_t) iflp->l_start;
oflp->l_len = (off_t) iflp->l_len;
oflp->l_pid = (pid_t) iflp->l_pid;
oflp->l_sysid = iflp->l_sysid;
}
static void
bsd_to_svr4_flock64(iflp, oflp)
struct flock *iflp;
struct svr4_flock64 *oflp;
{
switch (iflp->l_type) {
case F_RDLCK:
oflp->l_type = SVR4_F_RDLCK;
break;
case F_WRLCK:
oflp->l_type = SVR4_F_WRLCK;
break;
case F_UNLCK:
oflp->l_type = SVR4_F_UNLCK;
break;
default:
oflp->l_type = -1;
break;
}
oflp->l_whence = (short) iflp->l_whence;
oflp->l_start = (svr4_off64_t) iflp->l_start;
oflp->l_len = (svr4_off64_t) iflp->l_len;
oflp->l_sysid = iflp->l_sysid;
oflp->l_pid = (svr4_pid_t) iflp->l_pid;
}
static void
svr4_to_bsd_flock64(iflp, oflp)
struct svr4_flock64 *iflp;
struct flock *oflp;
{
switch (iflp->l_type) {
case SVR4_F_RDLCK:
oflp->l_type = F_RDLCK;
break;
case SVR4_F_WRLCK:
oflp->l_type = F_WRLCK;
break;
case SVR4_F_UNLCK:
oflp->l_type = F_UNLCK;
break;
default:
oflp->l_type = -1;
break;
}
oflp->l_whence = iflp->l_whence;
oflp->l_start = (off_t) iflp->l_start;
oflp->l_len = (off_t) iflp->l_len;
oflp->l_pid = (pid_t) iflp->l_pid;
}
static int
fd_revoke(td, fd)
struct thread *td;
int fd;
{
struct vnode *vp;
struct mount *mp;
struct vattr vattr;
int error, *retval;
retval = td->td_retval;
/*
* If we ever want to support Capsicum on SVR4 processes (unlikely)
* or FreeBSD grows a native frevoke() (more likely), we will need a
* CAP_REVOKE here.
*
* In the meantime, use CAP_MASK_VALID: if a SVR4 process wants to
* do an frevoke(), it needs to do it on either a regular file
* descriptor or a fully-privileged capability (which is effectively
* the same as a non-capability-restricted file descriptor).
*/
if ((error = fgetvp(td, fd, CAP_MASK_VALID, &vp)) != 0)
return (error);
if (vp->v_type != VCHR && vp->v_type != VBLK) {
error = EINVAL;
goto out;
}
#ifdef MAC
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
error = mac_vnode_check_revoke(td->td_ucred, vp);
VOP_UNLOCK(vp, 0);
if (error)
goto out;
#endif
if ((error = VOP_GETATTR(vp, &vattr, td->td_ucred)) != 0)
goto out;
if (td->td_ucred->cr_uid != vattr.va_uid &&
(error = priv_check(td, PRIV_VFS_ADMIN)) != 0)
goto out;
if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
goto out;
if (vcount(vp) > 1)
VOP_REVOKE(vp, REVOKEALL);
vn_finished_write(mp);
out:
vrele(vp);
return error;
}
static int
fd_truncate(td, fd, flp)
struct thread *td;
int fd;
struct flock *flp;
{
off_t start, length;
struct file *fp;
struct vnode *vp;
struct vattr vattr;
int error, *retval;
struct ftruncate_args ft;
retval = td->td_retval;
/*
* We only support truncating the file.
*/
if ((error = fget(td, fd, CAP_FTRUNCATE, &fp)) != 0)
return (error);
vp = fp->f_vnode;
if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
fdrop(fp, td);
return ESPIPE;
}
if ((error = VOP_GETATTR(vp, &vattr, td->td_ucred)) != 0) {
fdrop(fp, td);
return error;
}
length = vattr.va_size;
switch (flp->l_whence) {
case SEEK_CUR:
start = fp->f_offset + flp->l_start;
break;
case SEEK_END:
start = flp->l_start + length;
break;
case SEEK_SET:
start = flp->l_start;
break;
default:
fdrop(fp, td);
return EINVAL;
}
if (start + flp->l_len < length) {
/* We don't support freeing space in the middle of the file */
fdrop(fp, td);
return EINVAL;
}
ft.fd = fd;
ft.length = start;
- error = ftruncate(td, &ft);
+ error = sys_ftruncate(td, &ft);
fdrop(fp, td);
return (error);
}
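/*
 * Worked example of fd_truncate(), which backs the SVR4_F_FREESP
 * fcntls below: with l_whence == SEEK_SET, l_start == 4096 and
 * l_len == 6000 on a 10000-byte file, start + l_len (10096) reaches
 * end of file, so the request is accepted and implemented as
 * sys_ftruncate(fd, 4096).  A request that would leave data beyond the
 * freed range is rejected with EINVAL, as the comment above notes.
 */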
int
svr4_sys_open(td, uap)
struct thread *td;
struct svr4_sys_open_args *uap;
{
struct proc *p = td->td_proc;
char *newpath;
int bsd_flags, error, retval;
CHECKALTEXIST(td, uap->path, &newpath);
bsd_flags = svr4_to_bsd_flags(uap->flags);
error = kern_open(td, newpath, UIO_SYSSPACE, bsd_flags, uap->mode);
free(newpath, M_TEMP);
if (error) {
/* uprintf("svr4_open(%s, 0x%0x, 0%o): %d\n", uap->path,
uap->flags, uap->mode, error);*/
return error;
}
retval = td->td_retval[0];
PROC_LOCK(p);
if (!(bsd_flags & O_NOCTTY) && SESS_LEADER(p) &&
!(p->p_flag & P_CONTROLT)) {
#if defined(NOTYET)
struct file *fp;
error = fget(td, retval, CAP_IOCTL, &fp);
PROC_UNLOCK(p);
/*
* we may have lost a race between the above open() and
* another thread issuing a close()
*/
if (error)
return (EBADF); /* XXX: correct errno? */
/* ignore any error, just give it a try */
if (fp->f_type == DTYPE_VNODE)
fo_ioctl(fp, TIOCSCTTY, (caddr_t) 0, td->td_ucred,
td);
fdrop(fp, td);
} else {
PROC_UNLOCK(p);
}
#else
}
PROC_UNLOCK(p);
#endif
return error;
}
int
svr4_sys_open64(td, uap)
struct thread *td;
struct svr4_sys_open64_args *uap;
{
return svr4_sys_open(td, (struct svr4_sys_open_args *)uap);
}
int
svr4_sys_creat(td, uap)
struct thread *td;
struct svr4_sys_creat_args *uap;
{
char *newpath;
int error;
CHECKALTEXIST(td, uap->path, &newpath);
error = kern_open(td, newpath, UIO_SYSSPACE, O_WRONLY | O_CREAT |
O_TRUNC, uap->mode);
free(newpath, M_TEMP);
return (error);
}
int
svr4_sys_creat64(td, uap)
struct thread *td;
struct svr4_sys_creat64_args *uap;
{
return svr4_sys_creat(td, (struct svr4_sys_creat_args *)uap);
}
int
svr4_sys_llseek(td, uap)
struct thread *td;
struct svr4_sys_llseek_args *uap;
{
struct lseek_args ap;
ap.fd = uap->fd;
#if BYTE_ORDER == BIG_ENDIAN
ap.offset = (((u_int64_t) uap->offset1) << 32) |
uap->offset2;
#else
ap.offset = (((u_int64_t) uap->offset2) << 32) |
uap->offset1;
#endif
ap.whence = uap->whence;
- return lseek(td, &ap);
+ return sys_lseek(td, &ap);
}
int
svr4_sys_access(td, uap)
struct thread *td;
struct svr4_sys_access_args *uap;
{
char *newpath;
int error;
CHECKALTEXIST(td, uap->path, &newpath);
error = kern_access(td, newpath, UIO_SYSSPACE, uap->flags);
free(newpath, M_TEMP);
return (error);
}
#if defined(NOTYET)
int
svr4_sys_pread(td, uap)
struct thread *td;
struct svr4_sys_pread_args *uap;
{
struct pread_args pra;
/*
* Just translate the args structure and call the NetBSD
* pread(2) system call (offset type is 64-bit in NetBSD).
*/
pra.fd = uap->fd;
pra.buf = uap->buf;
pra.nbyte = uap->nbyte;
pra.offset = uap->off;
return pread(td, &pra);
}
#endif
#if defined(NOTYET)
int
svr4_sys_pread64(td, v, retval)
struct thread *td;
void *v;
register_t *retval;
{
struct svr4_sys_pread64_args *uap = v;
struct sys_pread_args pra;
/*
* Just translate the args structure and call the NetBSD
* pread(2) system call (offset type is 64-bit in NetBSD).
*/
pra.fd = uap->fd;
pra.buf = uap->buf;
pra.nbyte = uap->nbyte;
pra.offset = uap->off;
return (sys_pread(td, &pra, retval));
}
#endif /* NOTYET */
#if defined(NOTYET)
int
svr4_sys_pwrite(td, uap)
struct thread *td;
struct svr4_sys_pwrite_args *uap;
{
struct pwrite_args pwa;
/*
* Just translate the args structure and call the NetBSD
* pwrite(2) system call (offset type is 64-bit in NetBSD).
*/
pwa.fd = uap->fd;
pwa.buf = uap->buf;
pwa.nbyte = uap->nbyte;
pwa.offset = uap->off;
return pwrite(td, &pwa);
}
#endif
#if defined(NOTYET)
int
svr4_sys_pwrite64(td, v, retval)
struct thread *td;
void *v;
register_t *retval;
{
struct svr4_sys_pwrite64_args *uap = v;
struct sys_pwrite_args pwa;
/*
* Just translate the args structure and call the NetBSD
* pwrite(2) system call (offset type is 64-bit in NetBSD).
*/
pwa.fd = uap->fd;
pwa.buf = uap->buf;
pwa.nbyte = uap->nbyte;
pwa.offset = uap->off;
return (sys_pwrite(td, &pwa, retval));
}
#endif /* NOTYET */
int
svr4_sys_fcntl(td, uap)
struct thread *td;
struct svr4_sys_fcntl_args *uap;
{
int cmd, error, *retval;
retval = td->td_retval;
cmd = svr4_to_bsd_cmd(uap->cmd);
switch (cmd) {
case F_DUPFD:
case F_DUP2FD:
case F_GETFD:
case F_SETFD:
return (kern_fcntl(td, uap->fd, cmd, (intptr_t)uap->arg));
case F_GETFL:
error = kern_fcntl(td, uap->fd, cmd, (intptr_t)uap->arg);
if (error)
return (error);
*retval = bsd_to_svr4_flags(*retval);
return (error);
case F_SETFL:
{
/*
* we must save the O_ASYNC flag, as that is
* handled by ioctl(_, I_SETSIG, _) emulation.
*/
int flags;
DPRINTF(("Setting flags %p\n", uap->arg));
error = kern_fcntl(td, uap->fd, F_GETFL, 0);
if (error)
return (error);
flags = *retval;
flags &= O_ASYNC;
flags |= svr4_to_bsd_flags((u_long) uap->arg);
return (kern_fcntl(td, uap->fd, F_SETFL, flags));
}
case F_GETLK:
case F_SETLK:
case F_SETLKW:
{
struct svr4_flock ifl;
struct flock fl;
error = copyin(uap->arg, &ifl, sizeof (ifl));
if (error)
return (error);
svr4_to_bsd_flock(&ifl, &fl);
error = kern_fcntl(td, uap->fd, cmd, (intptr_t)&fl);
if (error || cmd != F_GETLK)
return (error);
bsd_to_svr4_flock(&fl, &ifl);
return (copyout(&ifl, uap->arg, sizeof (ifl)));
}
case -1:
switch (uap->cmd) {
case SVR4_F_FREESP:
{
struct svr4_flock ifl;
struct flock fl;
error = copyin(uap->arg, &ifl,
sizeof ifl);
if (error)
return error;
svr4_to_bsd_flock(&ifl, &fl);
return fd_truncate(td, uap->fd, &fl);
}
case SVR4_F_GETLK64:
case SVR4_F_SETLK64:
case SVR4_F_SETLKW64:
{
struct svr4_flock64 ifl;
struct flock fl;
switch (uap->cmd) {
case SVR4_F_GETLK64:
cmd = F_GETLK;
break;
case SVR4_F_SETLK64:
cmd = F_SETLK;
break;
case SVR4_F_SETLKW64:
cmd = F_SETLKW;
break;
}
error = copyin(uap->arg, &ifl,
sizeof (ifl));
if (error)
return (error);
svr4_to_bsd_flock64(&ifl, &fl);
error = kern_fcntl(td, uap->fd, cmd,
(intptr_t)&fl);
if (error || cmd != F_GETLK)
return (error);
bsd_to_svr4_flock64(&fl, &ifl);
return (copyout(&ifl, uap->arg,
sizeof (ifl)));
}
case SVR4_F_FREESP64:
{
struct svr4_flock64 ifl;
struct flock fl;
error = copyin(uap->arg, &ifl,
sizeof ifl);
if (error)
return error;
svr4_to_bsd_flock64(&ifl, &fl);
return fd_truncate(td, uap->fd, &fl);
}
case SVR4_F_REVOKE:
return fd_revoke(td, uap->fd);
default:
return ENOSYS;
}
default:
return ENOSYS;
}
}
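/*
 * Note on the F_SETFL case above: the emulated program drives O_ASYNC
 * through the ioctl(_, I_SETSIG, _) emulation rather than fcntl(), so
 * the current O_ASYNC state is read back with F_GETFL and OR-ed into
 * the translated flags before F_SETFL is applied, preserving it across
 * the call.
 */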
Index: head/sys/compat/svr4/svr4_filio.c
===================================================================
--- head/sys/compat/svr4/svr4_filio.c (revision 225616)
+++ head/sys/compat/svr4/svr4_filio.c (revision 225617)
@@ -1,249 +1,249 @@
/*-
* Copyright (c) 1998 Mark Newton
* Copyright (c) 1994 Christos Zoulas
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/capability.h>
#include <sys/file.h>
#include <sys/filio.h>
#include <sys/lock.h>
#include <sys/signal.h>
#include <sys/filedesc.h>
#include <sys/poll.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <compat/svr4/svr4.h>
#include <compat/svr4/svr4_types.h>
#include <compat/svr4/svr4_util.h>
#include <compat/svr4/svr4_signal.h>
#include <compat/svr4/svr4_proto.h>
#include <compat/svr4/svr4_ioctl.h>
#include <compat/svr4/svr4_filio.h>
/*#define GROTTY_READ_HACK*/
int
svr4_sys_poll(td, uap)
struct thread *td;
struct svr4_sys_poll_args *uap;
{
int error;
struct poll_args pa;
struct pollfd *pfd;
int idx = 0, cerr;
u_long siz;
if (uap->nfds > maxfilesperproc && uap->nfds > FD_SETSIZE)
return (EINVAL);
pa.fds = uap->fds;
pa.nfds = uap->nfds;
pa.timeout = uap->timeout;
siz = uap->nfds * sizeof(struct pollfd);
pfd = (struct pollfd *)malloc(siz, M_TEMP, M_WAITOK);
- error = poll(td, (struct poll_args *)uap);
+ error = sys_poll(td, (struct poll_args *)uap);
if ((cerr = copyin(uap->fds, pfd, siz)) != 0) {
error = cerr;
goto done;
}
for (idx = 0; idx < uap->nfds; idx++) {
/* POLLWRNORM already equals POLLOUT, so we don't worry about that */
if (pfd[idx].revents & (POLLOUT | POLLWRNORM | POLLWRBAND))
pfd[idx].revents |= (POLLOUT | POLLWRNORM | POLLWRBAND);
}
if ((cerr = copyout(pfd, uap->fds, siz)) != 0) {
error = cerr;
goto done; /* yeah, I know it's the next line, but this way I won't
forget to update it if I add more code */
}
done:
free(pfd, M_TEMP);
return error;
}
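/*
 * The revents fix-up loop above reflects the SVR4 expectation that
 * POLLOUT, POLLWRNORM and POLLWRBAND travel together: if the native
 * poll reported any of the three for a descriptor, all three are set
 * before the array is copied back out to the emulated program.
 */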
#if defined(READ_TEST)
int
svr4_sys_read(td, uap)
struct thread *td;
struct svr4_sys_read_args *uap;
{
struct read_args ra;
struct file *fp;
struct socket *so = NULL;
int so_state;
sigset_t sigmask;
int rv;
ra.fd = uap->fd;
ra.buf = uap->buf;
ra.nbyte = uap->nbyte;
if (fget(td, uap->fd, CAP_READ, &fp) != 0) {
DPRINTF(("Something fishy with the user-supplied file descriptor...\n"));
return EBADF;
}
if (fp->f_type == DTYPE_SOCKET) {
so = fp->f_data;
DPRINTF(("fd %d is a socket\n", uap->fd));
if (so->so_state & SS_ASYNC) {
DPRINTF(("fd %d is an ASYNC socket!\n", uap->fd));
}
DPRINTF(("Here are its flags: 0x%x\n", so->so_state));
#if defined(GROTTY_READ_HACK)
so_state = so->so_state;
so->so_state &= ~SS_NBIO;
#endif
}
rv = read(td, &ra);
DPRINTF(("svr4_read(%d, 0x%0x, %d) = %d\n",
uap->fd, uap->buf, uap->nbyte, rv));
if (rv == EAGAIN) {
#ifdef DEBUG_SVR4
struct sigacts *ps;
PROC_LOCK(td->td_proc);
ps = td->td_proc->p_sigacts;
mtx_lock(&ps->ps_mtx);
#endif
DPRINTF(("sigmask = 0x%x\n", td->td_sigmask));
DPRINTF(("sigignore = 0x%x\n", ps->ps_sigignore));
DPRINTF(("sigcaught = 0x%x\n", ps->ps_sigcatch));
DPRINTF(("siglist = 0x%x\n", td->td_siglist));
#ifdef DEBUG_SVR4
mtx_unlock(&ps->ps_mtx);
PROC_UNLOCK(td->td_proc);
#endif
}
#if defined(GROTTY_READ_HACK)
if (so) { /* We've already checked to see if this is a socket */
so->so_state = so_state;
}
#endif
fdrop(fp, td);
return(rv);
}
#endif /* READ_TEST */
#if defined(BOGUS)
int
svr4_sys_write(td, uap)
struct thread *td;
struct svr4_sys_write_args *uap;
{
struct write_args wa;
struct file *fp;
int rv;
wa.fd = uap->fd;
wa.buf = uap->buf;
wa.nbyte = uap->nbyte;
rv = write(td, &wa);
DPRINTF(("svr4_write(%d, 0x%0x, %d) = %d\n",
uap->fd, uap->buf, uap->nbyte, rv));
return(rv);
}
#endif /* BOGUS */
int
svr4_fil_ioctl(fp, td, retval, fd, cmd, data)
struct file *fp;
struct thread *td;
register_t *retval;
int fd;
u_long cmd;
caddr_t data;
{
int error;
int num;
struct filedesc *fdp = td->td_proc->p_fd;
*retval = 0;
switch (cmd) {
case SVR4_FIOCLEX:
FILEDESC_XLOCK(fdp);
fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
FILEDESC_XUNLOCK(fdp);
return 0;
case SVR4_FIONCLEX:
FILEDESC_XLOCK(fdp);
fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE;
FILEDESC_XUNLOCK(fdp);
return 0;
case SVR4_FIOGETOWN:
case SVR4_FIOSETOWN:
case SVR4_FIOASYNC:
case SVR4_FIONBIO:
case SVR4_FIONREAD:
if ((error = copyin(data, &num, sizeof(num))) != 0)
return error;
switch (cmd) {
case SVR4_FIOGETOWN: cmd = FIOGETOWN; break;
case SVR4_FIOSETOWN: cmd = FIOSETOWN; break;
case SVR4_FIOASYNC: cmd = FIOASYNC; break;
case SVR4_FIONBIO: cmd = FIONBIO; break;
case SVR4_FIONREAD: cmd = FIONREAD; break;
}
#ifdef SVR4_DEBUG
if (cmd == FIOASYNC) DPRINTF(("FIOASYNC\n"));
#endif
error = fo_ioctl(fp, cmd, (caddr_t) &num, td->td_ucred, td);
if (error)
return error;
return copyout(&num, data, sizeof(num));
default:
DPRINTF(("Unknown svr4 filio %lx\n", cmd));
return 0; /* ENOSYS really */
}
}
Index: head/sys/compat/svr4/svr4_ipc.c
===================================================================
--- head/sys/compat/svr4/svr4_ipc.c (revision 225616)
+++ head/sys/compat/svr4/svr4_ipc.c (revision 225617)
@@ -1,707 +1,707 @@
/*-
* Copyright (c) 1995 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Christos Zoulas.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the NetBSD
* Foundation, Inc. and its contributors.
* 4. Neither the name of The NetBSD Foundation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Portions of this code have been derived from software contributed
* to the FreeBSD Project by Mark Newton.
*
* Copyright (c) 1999 Mark Newton
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* XXX- This code is presently a no-op on FreeBSD (and isn't compiled due
* to preprocessor conditionals). A nice project for a kernel hacking
* novice might be to MakeItGo, but I have more important fish to fry
* at present.
*
* Derived from: $NetBSD: svr4_ipc.c,v 1.7 1998/10/19 22:43:00 tron Exp $
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_sysvipc.h"
#include <sys/param.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <sys/proc.h>
#include <sys/sem.h>
#include <sys/shm.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <compat/svr4/svr4.h>
#include <compat/svr4/svr4_types.h>
#include <compat/svr4/svr4_signal.h>
#include <compat/svr4/svr4_proto.h>
#include <compat/svr4/svr4_util.h>
#include <compat/svr4/svr4_ipc.h>
#if defined(SYSVMSG) || defined(SYSVSHM) || defined(SYSVSEM)
static void svr4_to_bsd_ipc_perm(const struct svr4_ipc_perm *,
struct ipc_perm *);
static void bsd_to_svr4_ipc_perm(const struct ipc_perm *,
struct svr4_ipc_perm *);
#endif
#ifdef SYSVSEM
static void bsd_to_svr4_semid_ds(const struct semid_ds *,
struct svr4_semid_ds *);
static void svr4_to_bsd_semid_ds(const struct svr4_semid_ds *,
struct semid_ds *);
static int svr4_semop(struct thread *, void *);
static int svr4_semget(struct thread *, void *);
static int svr4_semctl(struct thread *, void *);
#endif
#ifdef SYSVMSG
static void bsd_to_svr4_msqid_ds(const struct msqid_ds *,
struct svr4_msqid_ds *);
static void svr4_to_bsd_msqid_ds(const struct svr4_msqid_ds *,
struct msqid_ds *);
static int svr4_msgsnd(struct thread *, void *);
static int svr4_msgrcv(struct thread *, void *);
static int svr4_msgget(struct thread *, void *);
static int svr4_msgctl(struct thread *, void *);
#endif
#ifdef SYSVSHM
static void bsd_to_svr4_shmid_ds(const struct shmid_ds *,
struct svr4_shmid_ds *);
static void svr4_to_bsd_shmid_ds(const struct svr4_shmid_ds *,
struct shmid_ds *);
static int svr4_shmat(struct thread *, void *);
static int svr4_shmdt(struct thread *, void *);
static int svr4_shmget(struct thread *, void *);
static int svr4_shmctl(struct thread *, void *);
#endif
#if defined(SYSVMSG) || defined(SYSVSHM) || defined(SYSVSEM)
static void
svr4_to_bsd_ipc_perm(spp, bpp)
const struct svr4_ipc_perm *spp;
struct ipc_perm *bpp;
{
bpp->key = spp->key;
bpp->uid = spp->uid;
bpp->gid = spp->gid;
bpp->cuid = spp->cuid;
bpp->cgid = spp->cgid;
bpp->mode = spp->mode;
bpp->seq = spp->seq;
}
static void
bsd_to_svr4_ipc_perm(bpp, spp)
const struct ipc_perm *bpp;
struct svr4_ipc_perm *spp;
{
spp->key = bpp->key;
spp->uid = bpp->uid;
spp->gid = bpp->gid;
spp->cuid = bpp->cuid;
spp->cgid = bpp->cgid;
spp->mode = bpp->mode;
spp->seq = bpp->seq;
}
#endif
#ifdef SYSVSEM
static void
bsd_to_svr4_semid_ds(bds, sds)
const struct semid_ds *bds;
struct svr4_semid_ds *sds;
{
bzero(sds, sizeof(*sds));
bsd_to_svr4_ipc_perm(&bds->sem_perm, &sds->sem_perm);
sds->sem_base = (struct svr4_sem *) bds->sem_base;
sds->sem_nsems = bds->sem_nsems;
sds->sem_otime = bds->sem_otime;
sds->sem_ctime = bds->sem_ctime;
}
static void
svr4_to_bsd_semid_ds(sds, bds)
const struct svr4_semid_ds *sds;
struct semid_ds *bds;
{
svr4_to_bsd_ipc_perm(&sds->sem_perm, &bds->sem_perm);
bds->sem_base = (struct sem *) sds->sem_base;
bds->sem_nsems = sds->sem_nsems;
bds->sem_otime = sds->sem_otime;
bds->sem_ctime = sds->sem_ctime;
}
struct svr4_sys_semctl_args {
int what;
int semid;
int semnum;
int cmd;
union semun arg;
};
static int
svr4_semctl(td, v)
struct thread *td;
void *v;
{
struct svr4_sys_semctl_args *uap = v;
struct svr4_semid_ds ss;
struct semid_ds bs;
union semun semun;
register_t rval;
int cmd, error;
switch (uap->cmd) {
case SVR4_SEM_GETZCNT:
cmd = GETZCNT;
break;
case SVR4_SEM_GETNCNT:
cmd = GETNCNT;
break;
case SVR4_SEM_GETPID:
cmd = GETPID;
break;
case SVR4_SEM_GETVAL:
cmd = GETVAL;
break;
case SVR4_SEM_SETVAL:
cmd = SETVAL;
break;
case SVR4_SEM_GETALL:
cmd = GETVAL;
break;
case SVR4_SEM_SETALL:
cmd = SETVAL;
break;
case SVR4_IPC_STAT:
cmd = IPC_STAT;
semun.buf = &bs;
error = kern_semctl(td, uap->semid, uap->semnum, cmd, &semun,
&rval);
if (error)
return (error);
bsd_to_svr4_semid_ds(&bs, &ss);
error = copyout(&ss, uap->arg.buf, sizeof(ss));
if (error == 0)
td->td_retval[0] = rval;
return (error);
case SVR4_IPC_SET:
cmd = IPC_SET;
error = copyin(uap->arg.buf, (caddr_t) &ss, sizeof ss);
if (error)
return (error);
svr4_to_bsd_semid_ds(&ss, &bs);
semun.buf = &bs;
return (kern_semctl(td, uap->semid, uap->semnum, cmd, &semun,
td->td_retval));
case SVR4_IPC_RMID:
cmd = IPC_RMID;
break;
default:
return EINVAL;
}
return (kern_semctl(td, uap->semid, uap->semnum, cmd, &uap->arg,
td->td_retval));
}
struct svr4_sys_semget_args {
int what;
svr4_key_t key;
int nsems;
int semflg;
};
static int
svr4_semget(td, v)
struct thread *td;
void *v;
{
struct svr4_sys_semget_args *uap = v;
struct semget_args ap;
ap.key = uap->key;
ap.nsems = uap->nsems;
ap.semflg = uap->semflg;
- return semget(td, &ap);
+ return sys_semget(td, &ap);
}
struct svr4_sys_semop_args {
int what;
int semid;
struct svr4_sembuf * sops;
u_int nsops;
};
static int
svr4_semop(td, v)
struct thread *td;
void *v;
{
struct svr4_sys_semop_args *uap = v;
struct semop_args ap;
ap.semid = uap->semid;
/* These are the same */
ap.sops = (struct sembuf *) uap->sops;
ap.nsops = uap->nsops;
- return semop(td, &ap);
+ return sys_semop(td, &ap);
}
int
svr4_sys_semsys(td, uap)
struct thread *td;
struct svr4_sys_semsys_args *uap;
{
DPRINTF(("svr4_semsys(%d)\n", uap->what));
switch (uap->what) {
case SVR4_semctl:
return svr4_semctl(td, uap);
case SVR4_semget:
return svr4_semget(td, uap);
case SVR4_semop:
return svr4_semop(td, uap);
default:
return EINVAL;
}
}
MODULE_DEPEND(svr4elf, sysvsem, 1, 1, 1);
#endif
#ifdef SYSVMSG
static void
bsd_to_svr4_msqid_ds(bds, sds)
const struct msqid_ds *bds;
struct svr4_msqid_ds *sds;
{
bzero(sds, sizeof(*sds));
bsd_to_svr4_ipc_perm(&bds->msg_perm, &sds->msg_perm);
sds->msg_first = (struct svr4_msg *) bds->msg_first;
sds->msg_last = (struct svr4_msg *) bds->msg_last;
sds->msg_cbytes = bds->msg_cbytes;
sds->msg_qnum = bds->msg_qnum;
sds->msg_qbytes = bds->msg_qbytes;
sds->msg_lspid = bds->msg_lspid;
sds->msg_lrpid = bds->msg_lrpid;
sds->msg_stime = bds->msg_stime;
sds->msg_rtime = bds->msg_rtime;
sds->msg_ctime = bds->msg_ctime;
}
static void
svr4_to_bsd_msqid_ds(sds, bds)
const struct svr4_msqid_ds *sds;
struct msqid_ds *bds;
{
svr4_to_bsd_ipc_perm(&sds->msg_perm, &bds->msg_perm);
bds->msg_first = (struct msg *) sds->msg_first;
bds->msg_last = (struct msg *) sds->msg_last;
bds->msg_cbytes = sds->msg_cbytes;
bds->msg_qnum = sds->msg_qnum;
bds->msg_qbytes = sds->msg_qbytes;
bds->msg_lspid = sds->msg_lspid;
bds->msg_lrpid = sds->msg_lrpid;
bds->msg_stime = sds->msg_stime;
bds->msg_rtime = sds->msg_rtime;
bds->msg_ctime = sds->msg_ctime;
}
struct svr4_sys_msgsnd_args {
int what;
int msqid;
void * msgp;
size_t msgsz;
int msgflg;
};
static int
svr4_msgsnd(td, v)
struct thread *td;
void *v;
{
struct svr4_sys_msgsnd_args *uap = v;
struct msgsnd_args ap;
ap.msqid = uap->msqid;
ap.msgp = uap->msgp;
ap.msgsz = uap->msgsz;
ap.msgflg = uap->msgflg;
- return msgsnd(td, &ap);
+ return sys_msgsnd(td, &ap);
}
struct svr4_sys_msgrcv_args {
int what;
int msqid;
void * msgp;
size_t msgsz;
long msgtyp;
int msgflg;
};
static int
svr4_msgrcv(td, v)
struct thread *td;
void *v;
{
struct svr4_sys_msgrcv_args *uap = v;
struct msgrcv_args ap;
ap.msqid = uap->msqid;
ap.msgp = uap->msgp;
ap.msgsz = uap->msgsz;
ap.msgtyp = uap->msgtyp;
ap.msgflg = uap->msgflg;
- return msgrcv(td, &ap);
+ return sys_msgrcv(td, &ap);
}
struct svr4_sys_msgget_args {
int what;
svr4_key_t key;
int msgflg;
};
static int
svr4_msgget(td, v)
struct thread *td;
void *v;
{
struct svr4_sys_msgget_args *uap = v;
struct msgget_args ap;
ap.key = uap->key;
ap.msgflg = uap->msgflg;
- return msgget(td, &ap);
+ return sys_msgget(td, &ap);
}
struct svr4_sys_msgctl_args {
int what;
int msqid;
int cmd;
struct svr4_msqid_ds * buf;
};
static int
svr4_msgctl(td, v)
struct thread *td;
void *v;
{
struct svr4_sys_msgctl_args *uap = v;
struct svr4_msqid_ds ss;
struct msqid_ds bs;
int error;
switch (uap->cmd) {
case SVR4_IPC_STAT:
error = kern_msgctl(td, uap->msqid, IPC_STAT, &bs);
if (error)
return error;
bsd_to_svr4_msqid_ds(&bs, &ss);
return copyout(&ss, uap->buf, sizeof ss);
case SVR4_IPC_SET:
error = copyin(uap->buf, &ss, sizeof ss);
if (error)
return error;
svr4_to_bsd_msqid_ds(&ss, &bs);
return (kern_msgctl(td, uap->msqid, IPC_SET, &bs));
case SVR4_IPC_RMID:
return (kern_msgctl(td, uap->msqid, IPC_RMID, NULL));
default:
return EINVAL;
}
}
int
svr4_sys_msgsys(td, uap)
struct thread *td;
struct svr4_sys_msgsys_args *uap;
{
DPRINTF(("svr4_msgsys(%d)\n", uap->what));
switch (uap->what) {
case SVR4_msgsnd:
return svr4_msgsnd(td, uap);
case SVR4_msgrcv:
return svr4_msgrcv(td, uap);
case SVR4_msgget:
return svr4_msgget(td, uap);
case SVR4_msgctl:
return svr4_msgctl(td, uap);
default:
return EINVAL;
}
}
MODULE_DEPEND(svr4elf, sysvmsg, 1, 1, 1);
#endif
#ifdef SYSVSHM
static void
bsd_to_svr4_shmid_ds(bds, sds)
const struct shmid_ds *bds;
struct svr4_shmid_ds *sds;
{
bzero(sds, sizeof(*sds));
bsd_to_svr4_ipc_perm(&bds->shm_perm, &sds->shm_perm);
sds->shm_segsz = bds->shm_segsz;
sds->shm_lkcnt = 0;
sds->shm_lpid = bds->shm_lpid;
sds->shm_cpid = bds->shm_cpid;
sds->shm_amp = 0;
sds->shm_nattch = bds->shm_nattch;
sds->shm_cnattch = 0;
sds->shm_atime = bds->shm_atime;
sds->shm_dtime = bds->shm_dtime;
sds->shm_ctime = bds->shm_ctime;
}
static void
svr4_to_bsd_shmid_ds(sds, bds)
const struct svr4_shmid_ds *sds;
struct shmid_ds *bds;
{
svr4_to_bsd_ipc_perm(&sds->shm_perm, &bds->shm_perm);
bds->shm_segsz = sds->shm_segsz;
bds->shm_lpid = sds->shm_lpid;
bds->shm_cpid = sds->shm_cpid;
bds->shm_nattch = sds->shm_nattch;
bds->shm_atime = sds->shm_atime;
bds->shm_dtime = sds->shm_dtime;
bds->shm_ctime = sds->shm_ctime;
}
struct svr4_sys_shmat_args {
int what;
int shmid;
void * shmaddr;
int shmflg;
};
static int
svr4_shmat(td, v)
struct thread *td;
void *v;
{
struct svr4_sys_shmat_args *uap = v;
struct shmat_args ap;
ap.shmid = uap->shmid;
ap.shmaddr = uap->shmaddr;
ap.shmflg = uap->shmflg;
- return shmat(td, &ap);
+ return sys_shmat(td, &ap);
}
struct svr4_sys_shmdt_args {
int what;
void * shmaddr;
};
static int
svr4_shmdt(td, v)
struct thread *td;
void *v;
{
struct svr4_sys_shmdt_args *uap = v;
struct shmdt_args ap;
ap.shmaddr = uap->shmaddr;
- return shmdt(td, &ap);
+ return sys_shmdt(td, &ap);
}
struct svr4_sys_shmget_args {
int what;
key_t key;
int size;
int shmflg;
};
static int
svr4_shmget(td, v)
struct thread *td;
void *v;
{
struct svr4_sys_shmget_args *uap = v;
struct shmget_args ap;
ap.key = uap->key;
ap.size = uap->size;
ap.shmflg = uap->shmflg;
- return shmget(td, &ap);
+ return sys_shmget(td, &ap);
}
struct svr4_sys_shmctl_args {
int what;
int shmid;
int cmd;
struct svr4_shmid_ds * buf;
};
int
svr4_shmctl(td, v)
struct thread *td;
void *v;
{
struct svr4_sys_shmctl_args *uap = v;
struct shmid_ds bs;
struct svr4_shmid_ds ss;
size_t bufsize;
int cmd, error;
if (uap->buf != NULL) {
switch (uap->cmd) {
case SVR4_IPC_SET:
case SVR4_SHM_LOCK:
case SVR4_SHM_UNLOCK:
error = copyin(uap->buf, &ss, sizeof(ss));
if (error)
return (error);
svr4_to_bsd_shmid_ds(&ss, &bs);
break;
default:
return (EINVAL);
}
}
switch (uap->cmd) {
case SVR4_IPC_STAT:
cmd = IPC_STAT;
break;
case SVR4_IPC_SET:
cmd = IPC_SET;
break;
case SVR4_IPC_RMID:
cmd = IPC_RMID;
break;
case SVR4_SHM_LOCK:
cmd = SHM_LOCK;
break;
case SVR4_SHM_UNLOCK:
cmd = SHM_UNLOCK;
break;
default:
return (EINVAL);
}
error = kern_shmctl(td, uap->shmid, cmd, &bs, &bufsize);
if (error)
return (error);
switch (uap->cmd) {
case SVR4_IPC_STAT:
if (uap->buf != NULL) {
bsd_to_svr4_shmid_ds(&bs, &ss);
error = copyout(&ss, uap->buf, sizeof(ss));
}
break;
}
return (error);
}
int
svr4_sys_shmsys(td, uap)
struct thread *td;
struct svr4_sys_shmsys_args *uap;
{
DPRINTF(("svr4_shmsys(%d)\n", uap->what));
switch (uap->what) {
case SVR4_shmat:
return svr4_shmat(td, uap);
case SVR4_shmdt:
return svr4_shmdt(td, uap);
case SVR4_shmget:
return svr4_shmget(td, uap);
case SVR4_shmctl:
return svr4_shmctl(td, uap);
default:
return ENOSYS;
}
}
MODULE_DEPEND(svr4elf, sysvshm, 1, 1, 1);
#endif /* SYSVSHM */
Index: head/sys/compat/svr4/svr4_misc.c
===================================================================
--- head/sys/compat/svr4/svr4_misc.c (revision 225616)
+++ head/sys/compat/svr4/svr4_misc.c (revision 225617)
@@ -1,1671 +1,1671 @@
/*-
* Copyright (c) 1998 Mark Newton
* Copyright (c) 1994 Christos Zoulas
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* SVR4 compatibility module.
*
* SVR4 system calls that are implemented differently in BSD are
* handled here.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capability.h>
#include <sys/dirent.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/imgact.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/file.h> /* Must come after sys/malloc.h */
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/msg.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/ptrace.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/sem.h>
#include <sys/signalvar.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>
#include <sys/time.h>
#include <sys/times.h>
#include <sys/uio.h>
#include <sys/vnode.h>
#include <sys/wait.h>
#include <compat/svr4/svr4.h>
#include <compat/svr4/svr4_types.h>
#include <compat/svr4/svr4_signal.h>
#include <compat/svr4/svr4_proto.h>
#include <compat/svr4/svr4_util.h>
#include <compat/svr4/svr4_sysconfig.h>
#include <compat/svr4/svr4_dirent.h>
#include <compat/svr4/svr4_acl.h>
#include <compat/svr4/svr4_ulimit.h>
#include <compat/svr4/svr4_statvfs.h>
#include <compat/svr4/svr4_hrt.h>
#include <compat/svr4/svr4_mman.h>
#include <compat/svr4/svr4_wait.h>
#include <security/mac/mac_framework.h>
#include <machine/vmparam.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_map.h>
#if defined(__FreeBSD__)
#include <vm/uma.h>
#include <vm/vm_extern.h>
#endif
#if defined(NetBSD)
# if defined(UVM)
# include <uvm/uvm_extern.h>
# endif
#endif
#define BSD_DIRENT(cp) ((struct dirent *)(cp))
static int svr4_mknod(struct thread *, register_t *, char *,
svr4_mode_t, svr4_dev_t);
static __inline clock_t timeval_to_clock_t(struct timeval *);
static int svr4_setinfo (pid_t , struct rusage *, int, svr4_siginfo_t *);
struct svr4_hrtcntl_args;
static int svr4_hrtcntl (struct thread *, struct svr4_hrtcntl_args *,
register_t *);
static void bsd_statfs_to_svr4_statvfs(const struct statfs *,
struct svr4_statvfs *);
static void bsd_statfs_to_svr4_statvfs64(const struct statfs *,
struct svr4_statvfs64 *);
static struct proc *svr4_pfind(pid_t pid);
/* BOGUS noop */
#if defined(BOGUS)
int
svr4_sys_setitimer(td, uap)
struct thread *td;
struct svr4_sys_setitimer_args *uap;
{
td->td_retval[0] = 0;
return 0;
}
#endif
int
svr4_sys_wait(td, uap)
struct thread *td;
struct svr4_sys_wait_args *uap;
{
int error, st, sig;
error = kern_wait(td, WAIT_ANY, &st, 0, NULL);
if (error)
return (error);
if (WIFSIGNALED(st)) {
sig = WTERMSIG(st);
if (sig >= 0 && sig < NSIG)
st = (st & ~0177) | SVR4_BSD2SVR4_SIG(sig);
} else if (WIFSTOPPED(st)) {
sig = WSTOPSIG(st);
if (sig >= 0 && sig < NSIG)
st = (st & ~0xff00) | (SVR4_BSD2SVR4_SIG(sig) << 8);
}
/*
* It looks like wait(2) on svr4/solaris/2.4 returns
* the status in retval[1], and the pid on retval[0].
*/
td->td_retval[1] = st;
if (uap->status)
error = copyout(&st, uap->status, sizeof(st));
return (error);
}
int
svr4_sys_execv(td, uap)
struct thread *td;
struct svr4_sys_execv_args *uap;
{
struct image_args eargs;
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = exec_copyin_args(&eargs, path, UIO_SYSSPACE, uap->argp, NULL);
free(path, M_TEMP);
if (error == 0)
error = kern_execve(td, &eargs, NULL);
return (error);
}
int
svr4_sys_execve(td, uap)
struct thread *td;
struct svr4_sys_execve_args *uap;
{
struct image_args eargs;
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = exec_copyin_args(&eargs, path, UIO_SYSSPACE, uap->argp,
uap->envp);
free(path, M_TEMP);
if (error == 0)
error = kern_execve(td, &eargs, NULL);
return (error);
}
int
svr4_sys_time(td, v)
struct thread *td;
struct svr4_sys_time_args *v;
{
struct svr4_sys_time_args *uap = v;
int error = 0;
struct timeval tv;
microtime(&tv);
if (uap->t)
error = copyout(&tv.tv_sec, uap->t,
sizeof(*(uap->t)));
td->td_retval[0] = (int) tv.tv_sec;
return error;
}
/*
* Read SVR4-style directory entries. We suck them into kernel space so
* that they can be massaged before being copied out to user code.
*
* This code is ported from the Linux emulator: Changes to the VFS interface
* between FreeBSD and NetBSD have made it simpler to port it from there than
* to adapt the NetBSD version.
*/
int
svr4_sys_getdents64(td, uap)
struct thread *td;
struct svr4_sys_getdents64_args *uap;
{
struct dirent *bdp;
struct vnode *vp;
caddr_t inp, buf; /* BSD-format */
int len, reclen; /* BSD-format */
caddr_t outp; /* SVR4-format */
int resid, svr4reclen=0; /* SVR4-format */
struct file *fp;
struct uio auio;
struct iovec aiov;
off_t off;
struct svr4_dirent64 svr4_dirent;
int buflen, error, eofflag, nbytes, justone, vfslocked;
u_long *cookies = NULL, *cookiep;
int ncookies;
DPRINTF(("svr4_sys_getdents64(%d, *, %d)\n",
uap->fd, uap->nbytes));
if ((error = getvnode(td->td_proc->p_fd, uap->fd,
CAP_READ | CAP_SEEK, &fp)) != 0) {
return (error);
}
if ((fp->f_flag & FREAD) == 0) {
fdrop(fp, td);
return (EBADF);
}
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
if (vp->v_type != VDIR) {
VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
return (EINVAL);
}
nbytes = uap->nbytes;
if (nbytes == 1) {
nbytes = sizeof (struct svr4_dirent64);
justone = 1;
}
else
justone = 0;
off = fp->f_offset;
#define DIRBLKSIZ 512 /* XXX we used to use ufs's DIRBLKSIZ */
buflen = max(DIRBLKSIZ, nbytes);
buflen = min(buflen, MAXBSIZE);
buf = malloc(buflen, M_TEMP, M_WAITOK);
vn_lock(vp, LK_SHARED | LK_RETRY);
again:
aiov.iov_base = buf;
aiov.iov_len = buflen;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_rw = UIO_READ;
auio.uio_segflg = UIO_SYSSPACE;
auio.uio_td = td;
auio.uio_resid = buflen;
auio.uio_offset = off;
if (cookies) {
free(cookies, M_TEMP);
cookies = NULL;
}
#ifdef MAC
error = mac_vnode_check_readdir(td->td_ucred, vp);
if (error)
goto out;
#endif
error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
&ncookies, &cookies);
if (error) {
goto out;
}
inp = buf;
outp = (caddr_t) uap->dp;
resid = nbytes;
if ((len = buflen - auio.uio_resid) <= 0) {
goto eof;
}
cookiep = cookies;
if (cookies) {
/*
* When using cookies, the vfs has the option of reading from
* a different offset than that supplied (UFS truncates the
* offset to a block boundary to make sure that it never reads
* partway through a directory entry, even if the directory
* has been compacted).
*/
while (len > 0 && ncookies > 0 && *cookiep <= off) {
bdp = (struct dirent *) inp;
len -= bdp->d_reclen;
inp += bdp->d_reclen;
cookiep++;
ncookies--;
}
}
while (len > 0) {
if (cookiep && ncookies == 0)
break;
bdp = (struct dirent *) inp;
reclen = bdp->d_reclen;
if (reclen & 3) {
DPRINTF(("svr4_readdir: reclen=%d\n", reclen));
error = EFAULT;
goto out;
}
if (bdp->d_fileno == 0) {
inp += reclen;
if (cookiep) {
off = *cookiep++;
ncookies--;
} else
off += reclen;
len -= reclen;
continue;
}
svr4reclen = SVR4_RECLEN(&svr4_dirent, bdp->d_namlen);
if (reclen > len || resid < svr4reclen) {
outp++;
break;
}
svr4_dirent.d_ino = (long) bdp->d_fileno;
if (justone) {
/*
* old svr4-style readdir usage.
*/
svr4_dirent.d_off = (svr4_off_t) svr4reclen;
svr4_dirent.d_reclen = (u_short) bdp->d_namlen;
} else {
svr4_dirent.d_off = (svr4_off_t)(off + reclen);
svr4_dirent.d_reclen = (u_short) svr4reclen;
}
strlcpy(svr4_dirent.d_name, bdp->d_name, sizeof(svr4_dirent.d_name));
if ((error = copyout((caddr_t)&svr4_dirent, outp, svr4reclen)))
goto out;
inp += reclen;
if (cookiep) {
off = *cookiep++;
ncookies--;
} else
off += reclen;
outp += svr4reclen;
resid -= svr4reclen;
len -= reclen;
if (justone)
break;
}
if (outp == (caddr_t) uap->dp)
goto again;
fp->f_offset = off;
if (justone)
nbytes = resid + svr4reclen;
eof:
td->td_retval[0] = nbytes - resid;
out:
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
if (cookies)
free(cookies, M_TEMP);
free(buf, M_TEMP);
return error;
}
int
svr4_sys_getdents(td, uap)
struct thread *td;
struct svr4_sys_getdents_args *uap;
{
struct dirent *bdp;
struct vnode *vp;
caddr_t inp, buf; /* BSD-format */
int len, reclen; /* BSD-format */
caddr_t outp; /* SVR4-format */
int resid, svr4_reclen; /* SVR4-format */
struct file *fp;
struct uio auio;
struct iovec aiov;
struct svr4_dirent idb;
off_t off; /* true file offset */
int buflen, error, eofflag, vfslocked;
u_long *cookiebuf = NULL, *cookie;
int ncookies = 0, *retval = td->td_retval;
if (uap->nbytes < 0)
return (EINVAL);
if ((error = getvnode(td->td_proc->p_fd, uap->fd,
CAP_READ | CAP_SEEK, &fp)) != 0)
return (error);
if ((fp->f_flag & FREAD) == 0) {
fdrop(fp, td);
return (EBADF);
}
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
if (vp->v_type != VDIR) {
VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
return (EINVAL);
}
buflen = min(MAXBSIZE, uap->nbytes);
buf = malloc(buflen, M_TEMP, M_WAITOK);
vn_lock(vp, LK_SHARED | LK_RETRY);
off = fp->f_offset;
again:
aiov.iov_base = buf;
aiov.iov_len = buflen;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_rw = UIO_READ;
auio.uio_segflg = UIO_SYSSPACE;
auio.uio_td = td;
auio.uio_resid = buflen;
auio.uio_offset = off;
#ifdef MAC
error = mac_vnode_check_readdir(td->td_ucred, vp);
if (error)
goto out;
#endif
/*
* First we read into the malloc'ed buffer, then
* we massage it into user space, one record at a time.
*/
error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, &ncookies,
&cookiebuf);
if (error) {
goto out;
}
inp = buf;
outp = uap->buf;
resid = uap->nbytes;
if ((len = buflen - auio.uio_resid) == 0)
goto eof;
for (cookie = cookiebuf; len > 0; len -= reclen) {
bdp = (struct dirent *)inp;
reclen = bdp->d_reclen;
if (reclen & 3)
panic("svr4_sys_getdents64: bad reclen");
if (cookie)
off = *cookie++; /* each entry points to the next */
else
off += reclen;
if ((off >> 32) != 0) {
uprintf("svr4_sys_getdents64: dir offset too large for emulated program");
error = EINVAL;
goto out;
}
if (bdp->d_fileno == 0) {
inp += reclen; /* it is a hole; squish it out */
continue;
}
svr4_reclen = SVR4_RECLEN(&idb, bdp->d_namlen);
if (reclen > len || resid < svr4_reclen) {
/* entry too big for buffer, so just stop */
outp++;
break;
}
/*
* Massage in place to make a SVR4-shaped dirent (otherwise
* we have to worry about touching user memory outside of
* the copyout() call).
*/
idb.d_ino = (svr4_ino_t)bdp->d_fileno;
idb.d_off = (svr4_off_t)off;
idb.d_reclen = (u_short)svr4_reclen;
strlcpy(idb.d_name, bdp->d_name, sizeof(idb.d_name));
if ((error = copyout((caddr_t)&idb, outp, svr4_reclen)))
goto out;
/* advance past this real entry */
inp += reclen;
/* advance output past SVR4-shaped entry */
outp += svr4_reclen;
resid -= svr4_reclen;
}
/* if we squished out the whole block, try again */
if (outp == uap->buf)
goto again;
fp->f_offset = off; /* update the vnode offset */
eof:
*retval = uap->nbytes - resid;
out:
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
if (cookiebuf)
free(cookiebuf, M_TEMP);
free(buf, M_TEMP);
return error;
}
int
svr4_sys_mmap(td, uap)
struct thread *td;
struct svr4_sys_mmap_args *uap;
{
struct mmap_args mm;
int *retval;
retval = td->td_retval;
#define _MAP_NEW 0x80000000
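/*
 * _MAP_NEW appears to be the old SunOS/SVR4 flag requesting the "new"
 * mmap() interface that returns the mapped address; FreeBSD always
 * behaves that way, so the flag is simply stripped below.
 */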
/*
* Verify the arguments.
*/
if (uap->prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC))
return EINVAL; /* XXX still needed? */
if (uap->len == 0)
return EINVAL;
mm.prot = uap->prot;
mm.len = uap->len;
mm.flags = uap->flags & ~_MAP_NEW;
mm.fd = uap->fd;
mm.addr = uap->addr;
mm.pos = uap->pos;
- return mmap(td, &mm);
+ return sys_mmap(td, &mm);
}
int
svr4_sys_mmap64(td, uap)
struct thread *td;
struct svr4_sys_mmap64_args *uap;
{
struct mmap_args mm;
void *rp;
#define _MAP_NEW 0x80000000
/*
* Verify the arguments.
*/
if (uap->prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC))
return EINVAL; /* XXX still needed? */
if (uap->len == 0)
return EINVAL;
mm.prot = uap->prot;
mm.len = uap->len;
mm.flags = uap->flags & ~_MAP_NEW;
mm.fd = uap->fd;
mm.addr = uap->addr;
mm.pos = uap->pos;
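/*
 * For non-MAP_FIXED requests whose hint falls below the top of the
 * data segment, move the hint above vm_daddr + maxdsiz so the mapping
 * cannot collide with future brk() growth.
 */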
rp = (void *) round_page((vm_offset_t)(td->td_proc->p_vmspace->vm_daddr + maxdsiz));
if ((mm.flags & MAP_FIXED) == 0 &&
mm.addr != 0 && (void *)mm.addr < rp)
mm.addr = rp;
- return mmap(td, &mm);
+ return sys_mmap(td, &mm);
}
int
svr4_sys_fchroot(td, uap)
struct thread *td;
struct svr4_sys_fchroot_args *uap;
{
struct filedesc *fdp = td->td_proc->p_fd;
struct vnode *vp;
struct file *fp;
int error, vfslocked;
if ((error = priv_check(td, PRIV_VFS_FCHROOT)) != 0)
return error;
/* XXX: we have the chroot priv... what cap might we need? all? */
if ((error = getvnode(fdp, uap->fd, 0, &fp)) != 0)
return error;
vp = fp->f_vnode;
VREF(vp);
fdrop(fp, td);
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
error = change_dir(vp, td);
if (error)
goto fail;
#ifdef MAC
error = mac_vnode_check_chroot(td->td_ucred, vp);
if (error)
goto fail;
#endif
VOP_UNLOCK(vp, 0);
error = change_root(vp, td);
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
fail:
vput(vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
static int
svr4_mknod(td, retval, path, mode, dev)
struct thread *td;
register_t *retval;
char *path;
svr4_mode_t mode;
svr4_dev_t dev;
{
char *newpath;
int error;
CHECKALTEXIST(td, path, &newpath);
if (S_ISFIFO(mode))
error = kern_mkfifo(td, newpath, UIO_SYSSPACE, mode);
else
error = kern_mknod(td, newpath, UIO_SYSSPACE, mode, dev);
free(newpath, M_TEMP);
return (error);
}
int
svr4_sys_mknod(td, uap)
struct thread *td;
struct svr4_sys_mknod_args *uap;
{
int *retval = td->td_retval;
return svr4_mknod(td, retval,
uap->path, uap->mode,
(svr4_dev_t)svr4_to_bsd_odev_t(uap->dev));
}
int
svr4_sys_xmknod(td, uap)
struct thread *td;
struct svr4_sys_xmknod_args *uap;
{
int *retval = td->td_retval;
return svr4_mknod(td, retval,
uap->path, uap->mode,
(svr4_dev_t)svr4_to_bsd_dev_t(uap->dev));
}
int
svr4_sys_vhangup(td, uap)
struct thread *td;
struct svr4_sys_vhangup_args *uap;
{
return 0;
}
int
svr4_sys_sysconfig(td, uap)
struct thread *td;
struct svr4_sys_sysconfig_args *uap;
{
int *retval;
retval = &(td->td_retval[0]);
switch (uap->name) {
case SVR4_CONFIG_NGROUPS:
*retval = ngroups_max;
break;
case SVR4_CONFIG_CHILD_MAX:
*retval = maxproc;
break;
case SVR4_CONFIG_OPEN_FILES:
*retval = maxfiles;
break;
case SVR4_CONFIG_POSIX_VER:
*retval = 198808;
break;
case SVR4_CONFIG_PAGESIZE:
*retval = PAGE_SIZE;
break;
case SVR4_CONFIG_CLK_TCK:
*retval = 60; /* should this be `hz', ie. 100? */
break;
case SVR4_CONFIG_XOPEN_VER:
*retval = 2; /* XXX: What should that be? */
break;
case SVR4_CONFIG_PROF_TCK:
*retval = 60; /* XXX: What should that be? */
break;
case SVR4_CONFIG_NPROC_CONF:
*retval = 1; /* Only one processor for now */
break;
case SVR4_CONFIG_NPROC_ONLN:
*retval = 1; /* And it better be online */
break;
case SVR4_CONFIG_AIO_LISTIO_MAX:
case SVR4_CONFIG_AIO_MAX:
case SVR4_CONFIG_AIO_PRIO_DELTA_MAX:
*retval = 0; /* No aio support */
break;
case SVR4_CONFIG_DELAYTIMER_MAX:
*retval = 0; /* No delaytimer support */
break;
case SVR4_CONFIG_MQ_OPEN_MAX:
*retval = msginfo.msgmni;
break;
case SVR4_CONFIG_MQ_PRIO_MAX:
*retval = 0; /* XXX: Don't know */
break;
case SVR4_CONFIG_RTSIG_MAX:
*retval = 0;
break;
case SVR4_CONFIG_SEM_NSEMS_MAX:
*retval = seminfo.semmni;
break;
case SVR4_CONFIG_SEM_VALUE_MAX:
*retval = seminfo.semvmx;
break;
case SVR4_CONFIG_SIGQUEUE_MAX:
*retval = 0; /* XXX: Don't know */
break;
case SVR4_CONFIG_SIGRT_MIN:
case SVR4_CONFIG_SIGRT_MAX:
*retval = 0; /* No real time signals */
break;
case SVR4_CONFIG_TIMER_MAX:
*retval = 3; /* XXX: real, virtual, profiling */
break;
#if defined(NOTYET)
case SVR4_CONFIG_PHYS_PAGES:
#if defined(UVM)
*retval = uvmexp.free; /* XXX: free instead of total */
#else
*retval = cnt.v_free_count; /* XXX: free instead of total */
#endif
break;
case SVR4_CONFIG_AVPHYS_PAGES:
#if defined(UVM)
*retval = uvmexp.active; /* XXX: active instead of avg */
#else
*retval = cnt.v_active_count; /* XXX: active instead of avg */
#endif
break;
#endif /* NOTYET */
case SVR4_CONFIG_COHERENCY:
*retval = 0; /* XXX */
break;
case SVR4_CONFIG_SPLIT_CACHE:
*retval = 0; /* XXX */
break;
case SVR4_CONFIG_ICACHESZ:
*retval = 256; /* XXX */
break;
case SVR4_CONFIG_DCACHESZ:
*retval = 256; /* XXX */
break;
case SVR4_CONFIG_ICACHELINESZ:
*retval = 64; /* XXX */
break;
case SVR4_CONFIG_DCACHELINESZ:
*retval = 64; /* XXX */
break;
case SVR4_CONFIG_ICACHEBLKSZ:
*retval = 64; /* XXX */
break;
case SVR4_CONFIG_DCACHEBLKSZ:
*retval = 64; /* XXX */
break;
case SVR4_CONFIG_DCACHETBLKSZ:
*retval = 64; /* XXX */
break;
case SVR4_CONFIG_ICACHE_ASSOC:
*retval = 1; /* XXX */
break;
case SVR4_CONFIG_DCACHE_ASSOC:
*retval = 1; /* XXX */
break;
case SVR4_CONFIG_MAXPID:
*retval = PID_MAX;
break;
case SVR4_CONFIG_STACK_PROT:
*retval = PROT_READ|PROT_WRITE|PROT_EXEC;
break;
default:
return EINVAL;
}
return 0;
}
/* ARGSUSED */
int
svr4_sys_break(td, uap)
struct thread *td;
struct svr4_sys_break_args *uap;
{
struct obreak_args ap;
ap.nsize = uap->nsize;
- return (obreak(td, &ap));
+ return (sys_obreak(td, &ap));
}
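/*
 * Convert a timeval to SVR4 clock ticks: whole seconds scale by hz and
 * the microseconds are divided by the tick length in microseconds.
 * For example, with hz = 100 a timeval of { 2, 500000 } becomes
 * 2 * 100 + 500000 / (1000000 / 100) = 250 ticks.
 */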
static __inline clock_t
timeval_to_clock_t(tv)
struct timeval *tv;
{
return tv->tv_sec * hz + tv->tv_usec / (1000000 / hz);
}
int
svr4_sys_times(td, uap)
struct thread *td;
struct svr4_sys_times_args *uap;
{
struct timeval tv, utime, stime, cutime, cstime;
struct tms tms;
struct proc *p;
int error;
p = td->td_proc;
PROC_LOCK(p);
PROC_SLOCK(p);
calcru(p, &utime, &stime);
PROC_SUNLOCK(p);
calccru(p, &cutime, &cstime);
PROC_UNLOCK(p);
tms.tms_utime = timeval_to_clock_t(&utime);
tms.tms_stime = timeval_to_clock_t(&stime);
tms.tms_cutime = timeval_to_clock_t(&cutime);
tms.tms_cstime = timeval_to_clock_t(&cstime);
error = copyout(&tms, uap->tp, sizeof(tms));
if (error)
return (error);
microtime(&tv);
td->td_retval[0] = (int)timeval_to_clock_t(&tv);
return (0);
}
int
svr4_sys_ulimit(td, uap)
struct thread *td;
struct svr4_sys_ulimit_args *uap;
{
int *retval = td->td_retval;
int error;
switch (uap->cmd) {
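/*
 * SVR4 ulimit(2) reports and sets the file size limit in 512-byte
 * blocks, hence the scaling of RLIMIT_FSIZE by 512 in the GFILLIM and
 * SFILLIM cases below.
 */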
case SVR4_GFILLIM:
PROC_LOCK(td->td_proc);
*retval = lim_cur(td->td_proc, RLIMIT_FSIZE) / 512;
PROC_UNLOCK(td->td_proc);
if (*retval == -1)
*retval = 0x7fffffff;
return 0;
case SVR4_SFILLIM:
{
struct rlimit krl;
krl.rlim_cur = uap->newlimit * 512;
PROC_LOCK(td->td_proc);
krl.rlim_max = lim_max(td->td_proc, RLIMIT_FSIZE);
PROC_UNLOCK(td->td_proc);
error = kern_setrlimit(td, RLIMIT_FSIZE, &krl);
if (error)
return error;
PROC_LOCK(td->td_proc);
*retval = lim_cur(td->td_proc, RLIMIT_FSIZE);
PROC_UNLOCK(td->td_proc);
if (*retval == -1)
*retval = 0x7fffffff;
return 0;
}
case SVR4_GMEMLIM:
{
struct vmspace *vm = td->td_proc->p_vmspace;
register_t r;
PROC_LOCK(td->td_proc);
r = lim_cur(td->td_proc, RLIMIT_DATA);
PROC_UNLOCK(td->td_proc);
if (r == -1)
r = 0x7fffffff;
r += (long) vm->vm_daddr;
if (r < 0)
r = 0x7fffffff;
*retval = r;
return 0;
}
case SVR4_GDESLIM:
PROC_LOCK(td->td_proc);
*retval = lim_cur(td->td_proc, RLIMIT_NOFILE);
PROC_UNLOCK(td->td_proc);
if (*retval == -1)
*retval = 0x7fffffff;
return 0;
default:
return EINVAL;
}
}
static struct proc *
svr4_pfind(pid)
pid_t pid;
{
struct proc *p;
/* look in the live processes */
if ((p = pfind(pid)) == NULL)
/* look in the zombies */
p = zpfind(pid);
return p;
}
int
svr4_sys_pgrpsys(td, uap)
struct thread *td;
struct svr4_sys_pgrpsys_args *uap;
{
int *retval = td->td_retval;
struct proc *p = td->td_proc;
switch (uap->cmd) {
case 1: /* setpgrp() */
/*
* SVR4 setpgrp() (which takes no arguments) has the
* semantics that the session ID is also created anew, so
* in almost every sense, setpgrp() is identical to
* setsid() for SVR4. (Under BSD, the difference is that
* a setpgid(0,0) will not create a new session.)
*/
- setsid(td, NULL);
+ sys_setsid(td, NULL);
/*FALLTHROUGH*/
case 0: /* getpgrp() */
PROC_LOCK(p);
*retval = p->p_pgrp->pg_id;
PROC_UNLOCK(p);
return 0;
case 2: /* getsid(pid) */
if (uap->pid == 0)
PROC_LOCK(p);
else if ((p = svr4_pfind(uap->pid)) == NULL)
return ESRCH;
/*
* This has already been initialized to the pid of
* the session leader.
*/
*retval = (register_t) p->p_session->s_sid;
PROC_UNLOCK(p);
return 0;
case 3: /* setsid() */
- return setsid(td, NULL);
+ return sys_setsid(td, NULL);
case 4: /* getpgid(pid) */
if (uap->pid == 0)
PROC_LOCK(p);
else if ((p = svr4_pfind(uap->pid)) == NULL)
return ESRCH;
*retval = (int) p->p_pgrp->pg_id;
PROC_UNLOCK(p);
return 0;
case 5: /* setpgid(pid, pgid); */
{
struct setpgid_args sa;
sa.pid = uap->pid;
sa.pgid = uap->pgid;
- return setpgid(td, &sa);
+ return sys_setpgid(td, &sa);
}
default:
return EINVAL;
}
}
struct svr4_hrtcntl_args {
int cmd;
int fun;
int clk;
svr4_hrt_interval_t * iv;
svr4_hrt_time_t * ti;
};
static int
svr4_hrtcntl(td, uap, retval)
struct thread *td;
struct svr4_hrtcntl_args *uap;
register_t *retval;
{
switch (uap->fun) {
case SVR4_HRT_CNTL_RES:
DPRINTF(("htrcntl(RES)\n"));
*retval = SVR4_HRT_USEC;
return 0;
case SVR4_HRT_CNTL_TOFD:
DPRINTF(("htrcntl(TOFD)\n"));
{
struct timeval tv;
svr4_hrt_time_t t;
if (uap->clk != SVR4_HRT_CLK_STD) {
DPRINTF(("clk == %d\n", uap->clk));
return EINVAL;
}
if (uap->ti == NULL) {
DPRINTF(("ti NULL\n"));
return EINVAL;
}
microtime(&tv);
t.h_sec = tv.tv_sec;
t.h_rem = tv.tv_usec;
t.h_res = SVR4_HRT_USEC;
return copyout(&t, uap->ti, sizeof(t));
}
case SVR4_HRT_CNTL_START:
DPRINTF(("htrcntl(START)\n"));
return ENOSYS;
case SVR4_HRT_CNTL_GET:
DPRINTF(("htrcntl(GET)\n"));
return ENOSYS;
default:
DPRINTF(("Bad htrcntl command %d\n", uap->fun));
return ENOSYS;
}
}
int
svr4_sys_hrtsys(td, uap)
struct thread *td;
struct svr4_sys_hrtsys_args *uap;
{
int *retval = td->td_retval;
switch (uap->cmd) {
case SVR4_HRT_CNTL:
return svr4_hrtcntl(td, (struct svr4_hrtcntl_args *) uap,
retval);
case SVR4_HRT_ALRM:
DPRINTF(("hrtalarm\n"));
return ENOSYS;
case SVR4_HRT_SLP:
DPRINTF(("hrtsleep\n"));
return ENOSYS;
case SVR4_HRT_CAN:
DPRINTF(("hrtcancel\n"));
return ENOSYS;
default:
DPRINTF(("Bad hrtsys command %d\n", uap->cmd));
return EINVAL;
}
}
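/*
 * Build an SVR4 SIGCHLD siginfo from a BSD wait status and optional
 * resource usage, then copy it out to the user-supplied buffer.
 */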
static int
svr4_setinfo(pid, ru, st, s)
pid_t pid;
struct rusage *ru;
int st;
svr4_siginfo_t *s;
{
svr4_siginfo_t i;
int sig;
memset(&i, 0, sizeof(i));
i.svr4_si_signo = SVR4_SIGCHLD;
i.svr4_si_errno = 0; /* XXX? */
i.svr4_si_pid = pid;
if (ru) {
i.svr4_si_stime = ru->ru_stime.tv_sec;
i.svr4_si_utime = ru->ru_utime.tv_sec;
}
if (WIFEXITED(st)) {
i.svr4_si_status = WEXITSTATUS(st);
i.svr4_si_code = SVR4_CLD_EXITED;
} else if (WIFSTOPPED(st)) {
sig = WSTOPSIG(st);
if (sig >= 0 && sig < NSIG)
i.svr4_si_status = SVR4_BSD2SVR4_SIG(sig);
if (i.svr4_si_status == SVR4_SIGCONT)
i.svr4_si_code = SVR4_CLD_CONTINUED;
else
i.svr4_si_code = SVR4_CLD_STOPPED;
} else {
sig = WTERMSIG(st);
if (sig >= 0 && sig < NSIG)
i.svr4_si_status = SVR4_BSD2SVR4_SIG(sig);
if (WCOREDUMP(st))
i.svr4_si_code = SVR4_CLD_DUMPED;
else
i.svr4_si_code = SVR4_CLD_KILLED;
}
DPRINTF(("siginfo [pid %ld signo %d code %d errno %d status %d]\n",
i.svr4_si_pid, i.svr4_si_signo, i.svr4_si_code, i.svr4_si_errno,
i.svr4_si_status));
return copyout(&i, s, sizeof(i));
}
int
svr4_sys_waitsys(td, uap)
struct thread *td;
struct svr4_sys_waitsys_args *uap;
{
struct rusage ru;
pid_t pid;
int nfound, status;
int error, *retval = td->td_retval;
struct proc *p, *q;
DPRINTF(("waitsys(%d, %d, %p, %x)\n",
uap->grp, uap->id,
uap->info, uap->options));
q = td->td_proc;
switch (uap->grp) {
case SVR4_P_PID:
pid = uap->id;
break;
case SVR4_P_PGID:
PROC_LOCK(q);
pid = -q->p_pgid;
PROC_UNLOCK(q);
break;
case SVR4_P_ALL:
pid = WAIT_ANY;
break;
default:
return EINVAL;
}
/* Hand off the easy cases to kern_wait(). */
if (!(uap->options & (SVR4_WNOWAIT)) &&
(uap->options & (SVR4_WEXITED | SVR4_WTRAPPED))) {
int options;
options = 0;
if (uap->options & SVR4_WSTOPPED)
options |= WUNTRACED;
if (uap->options & SVR4_WCONTINUED)
options |= WCONTINUED;
if (uap->options & SVR4_WNOHANG)
options |= WNOHANG;
error = kern_wait(td, pid, &status, options, &ru);
if (error)
return (error);
if (uap->options & SVR4_WNOHANG && *retval == 0)
error = svr4_setinfo(*retval, NULL, 0, uap->info);
else
error = svr4_setinfo(*retval, &ru, status, uap->info);
*retval = 0;
return (error);
}
/*
* Ok, handle the weird cases. Either WNOWAIT is set (meaning we
* just want to see if there is a process to harvest, we don't
* want to actually harvest it), or WEXITED and WTRAPPED are clear
* meaning we want to ignore zombies. Either way, we don't have
* to handle harvesting zombies here. We do have to duplicate the
* other portions of kern_wait() though, especially for WCONTINUED
* and WSTOPPED.
*/
loop:
nfound = 0;
sx_slock(&proctree_lock);
LIST_FOREACH(p, &q->p_children, p_sibling) {
PROC_LOCK(p);
if (pid != WAIT_ANY &&
p->p_pid != pid && p->p_pgid != -pid) {
PROC_UNLOCK(p);
DPRINTF(("pid %d pgid %d != %d\n", p->p_pid,
p->p_pgid, pid));
continue;
}
if (p_canwait(td, p)) {
PROC_UNLOCK(p);
continue;
}
nfound++;
PROC_SLOCK(p);
/*
* See if we have a zombie. If so, WNOWAIT should be set,
* as otherwise we should have called kern_wait() up above.
*/
if ((p->p_state == PRS_ZOMBIE) &&
((uap->options & (SVR4_WEXITED|SVR4_WTRAPPED)))) {
PROC_SUNLOCK(p);
KASSERT(uap->options & SVR4_WNOWAIT,
("WNOWAIT is clear"));
/* Found a zombie, so cache info in local variables. */
pid = p->p_pid;
status = p->p_xstat;
ru = p->p_ru;
PROC_SLOCK(p);
calcru(p, &ru.ru_utime, &ru.ru_stime);
PROC_SUNLOCK(p);
PROC_UNLOCK(p);
sx_sunlock(&proctree_lock);
/* Copy the info out to userland. */
*retval = 0;
DPRINTF(("found %d\n", pid));
return (svr4_setinfo(pid, &ru, status, uap->info));
}
/*
* See if we have a stopped or continued process.
* XXX: This duplicates the same code in kern_wait().
*/
if ((p->p_flag & P_STOPPED_SIG) &&
(p->p_suspcount == p->p_numthreads) &&
(p->p_flag & P_WAITED) == 0 &&
(p->p_flag & P_TRACED || uap->options & SVR4_WSTOPPED)) {
PROC_SUNLOCK(p);
if (((uap->options & SVR4_WNOWAIT)) == 0)
p->p_flag |= P_WAITED;
sx_sunlock(&proctree_lock);
pid = p->p_pid;
status = W_STOPCODE(p->p_xstat);
ru = p->p_ru;
PROC_SLOCK(p);
calcru(p, &ru.ru_utime, &ru.ru_stime);
PROC_SUNLOCK(p);
PROC_UNLOCK(p);
if (((uap->options & SVR4_WNOWAIT)) == 0) {
PROC_LOCK(q);
sigqueue_take(p->p_ksi);
PROC_UNLOCK(q);
}
*retval = 0;
DPRINTF(("jobcontrol %d\n", pid));
return (svr4_setinfo(pid, &ru, status, uap->info));
}
PROC_SUNLOCK(p);
if (uap->options & SVR4_WCONTINUED &&
(p->p_flag & P_CONTINUED)) {
sx_sunlock(&proctree_lock);
if (((uap->options & SVR4_WNOWAIT)) == 0)
p->p_flag &= ~P_CONTINUED;
pid = p->p_pid;
ru = p->p_ru;
status = SIGCONT;
PROC_SLOCK(p);
calcru(p, &ru.ru_utime, &ru.ru_stime);
PROC_SUNLOCK(p);
PROC_UNLOCK(p);
if (((uap->options & SVR4_WNOWAIT)) == 0) {
PROC_LOCK(q);
sigqueue_take(p->p_ksi);
PROC_UNLOCK(q);
}
*retval = 0;
DPRINTF(("jobcontrol %d\n", pid));
return (svr4_setinfo(pid, &ru, status, uap->info));
}
PROC_UNLOCK(p);
}
if (nfound == 0) {
sx_sunlock(&proctree_lock);
return (ECHILD);
}
if (uap->options & SVR4_WNOHANG) {
sx_sunlock(&proctree_lock);
*retval = 0;
return (svr4_setinfo(0, NULL, 0, uap->info));
}
PROC_LOCK(q);
sx_sunlock(&proctree_lock);
if (q->p_flag & P_STATCHILD) {
q->p_flag &= ~P_STATCHILD;
error = 0;
} else
error = msleep(q, &q->p_mtx, PWAIT | PCATCH, "svr4_wait", 0);
PROC_UNLOCK(q);
if (error)
return error;
goto loop;
}
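/*
 * Translate a BSD statfs into an SVR4 statvfs.  The block sizes are
 * approximate: statvfs f_bsize (preferred I/O size) is taken from the
 * BSD f_iosize and f_frsize (allocation unit) from the BSD f_bsize,
 * and f_favail is faked with f_ffree since BSD keeps no separate
 * count of inodes available to unprivileged users.
 */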
static void
bsd_statfs_to_svr4_statvfs(bfs, sfs)
const struct statfs *bfs;
struct svr4_statvfs *sfs;
{
sfs->f_bsize = bfs->f_iosize; /* XXX */
sfs->f_frsize = bfs->f_bsize;
sfs->f_blocks = bfs->f_blocks;
sfs->f_bfree = bfs->f_bfree;
sfs->f_bavail = bfs->f_bavail;
sfs->f_files = bfs->f_files;
sfs->f_ffree = bfs->f_ffree;
sfs->f_favail = bfs->f_ffree;
sfs->f_fsid = bfs->f_fsid.val[0];
memcpy(sfs->f_basetype, bfs->f_fstypename, sizeof(sfs->f_basetype));
sfs->f_flag = 0;
if (bfs->f_flags & MNT_RDONLY)
sfs->f_flag |= SVR4_ST_RDONLY;
if (bfs->f_flags & MNT_NOSUID)
sfs->f_flag |= SVR4_ST_NOSUID;
sfs->f_namemax = MAXNAMLEN;
memcpy(sfs->f_fstr, bfs->f_fstypename, sizeof(sfs->f_fstr)); /* XXX */
memset(sfs->f_filler, 0, sizeof(sfs->f_filler));
}
static void
bsd_statfs_to_svr4_statvfs64(bfs, sfs)
const struct statfs *bfs;
struct svr4_statvfs64 *sfs;
{
sfs->f_bsize = bfs->f_iosize; /* XXX */
sfs->f_frsize = bfs->f_bsize;
sfs->f_blocks = bfs->f_blocks;
sfs->f_bfree = bfs->f_bfree;
sfs->f_bavail = bfs->f_bavail;
sfs->f_files = bfs->f_files;
sfs->f_ffree = bfs->f_ffree;
sfs->f_favail = bfs->f_ffree;
sfs->f_fsid = bfs->f_fsid.val[0];
memcpy(sfs->f_basetype, bfs->f_fstypename, sizeof(sfs->f_basetype));
sfs->f_flag = 0;
if (bfs->f_flags & MNT_RDONLY)
sfs->f_flag |= SVR4_ST_RDONLY;
if (bfs->f_flags & MNT_NOSUID)
sfs->f_flag |= SVR4_ST_NOSUID;
sfs->f_namemax = MAXNAMLEN;
memcpy(sfs->f_fstr, bfs->f_fstypename, sizeof(sfs->f_fstr)); /* XXX */
memset(sfs->f_filler, 0, sizeof(sfs->f_filler));
}
int
svr4_sys_statvfs(td, uap)
struct thread *td;
struct svr4_sys_statvfs_args *uap;
{
struct svr4_statvfs sfs;
struct statfs bfs;
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = kern_statfs(td, path, UIO_SYSSPACE, &bfs);
free(path, M_TEMP);
if (error)
return (error);
bsd_statfs_to_svr4_statvfs(&bfs, &sfs);
return copyout(&sfs, uap->fs, sizeof(sfs));
}
int
svr4_sys_fstatvfs(td, uap)
struct thread *td;
struct svr4_sys_fstatvfs_args *uap;
{
struct svr4_statvfs sfs;
struct statfs bfs;
int error;
error = kern_fstatfs(td, uap->fd, &bfs);
if (error)
return (error);
bsd_statfs_to_svr4_statvfs(&bfs, &sfs);
return copyout(&sfs, uap->fs, sizeof(sfs));
}
int
svr4_sys_statvfs64(td, uap)
struct thread *td;
struct svr4_sys_statvfs64_args *uap;
{
struct svr4_statvfs64 sfs;
struct statfs bfs;
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = kern_statfs(td, path, UIO_SYSSPACE, &bfs);
free(path, M_TEMP);
if (error)
return (error);
bsd_statfs_to_svr4_statvfs64(&bfs, &sfs);
return copyout(&sfs, uap->fs, sizeof(sfs));
}
int
svr4_sys_fstatvfs64(td, uap)
struct thread *td;
struct svr4_sys_fstatvfs64_args *uap;
{
struct svr4_statvfs64 sfs;
struct statfs bfs;
int error;
error = kern_fstatfs(td, uap->fd, &bfs);
if (error)
return (error);
bsd_statfs_to_svr4_statvfs64(&bfs, &sfs);
return copyout(&sfs, uap->fs, sizeof(sfs));
}
int
svr4_sys_alarm(td, uap)
struct thread *td;
struct svr4_sys_alarm_args *uap;
{
struct itimerval itv, oitv;
int error;
timevalclear(&itv.it_interval);
itv.it_value.tv_sec = uap->sec;
itv.it_value.tv_usec = 0;
error = kern_setitimer(td, ITIMER_REAL, &itv, &oitv);
if (error)
return (error);
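/*
 * Round a fractional second up so the time remaining on the previous
 * timer is never under-reported to the caller.
 */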
if (oitv.it_value.tv_usec != 0)
oitv.it_value.tv_sec++;
td->td_retval[0] = oitv.it_value.tv_sec;
return (0);
}
int
svr4_sys_gettimeofday(td, uap)
struct thread *td;
struct svr4_sys_gettimeofday_args *uap;
{
if (uap->tp) {
struct timeval atv;
microtime(&atv);
return copyout(&atv, uap->tp, sizeof (atv));
}
return 0;
}
int
svr4_sys_facl(td, uap)
struct thread *td;
struct svr4_sys_facl_args *uap;
{
int *retval;
retval = td->td_retval;
*retval = 0;
switch (uap->cmd) {
case SVR4_SYS_SETACL:
/* We don't support acls on any filesystem */
return ENOSYS;
case SVR4_SYS_GETACL:
return copyout(retval, &uap->num,
sizeof(uap->num));
case SVR4_SYS_GETACLCNT:
return 0;
default:
return EINVAL;
}
}
int
svr4_sys_acl(td, uap)
struct thread *td;
struct svr4_sys_acl_args *uap;
{
/* XXX: for now the same */
return svr4_sys_facl(td, (struct svr4_sys_facl_args *)uap);
}
int
svr4_sys_auditsys(td, uap)
struct thread *td;
struct svr4_sys_auditsys_args *uap;
{
/*
* XXX: Big brother is *not* watching.
*/
return 0;
}
int
svr4_sys_memcntl(td, uap)
struct thread *td;
struct svr4_sys_memcntl_args *uap;
{
switch (uap->cmd) {
case SVR4_MC_SYNC:
{
struct msync_args msa;
msa.addr = uap->addr;
msa.len = uap->len;
msa.flags = (int)uap->arg;
- return msync(td, &msa);
+ return sys_msync(td, &msa);
}
case SVR4_MC_ADVISE:
{
struct madvise_args maa;
maa.addr = uap->addr;
maa.len = uap->len;
maa.behav = (int)uap->arg;
- return madvise(td, &maa);
+ return sys_madvise(td, &maa);
}
case SVR4_MC_LOCK:
case SVR4_MC_UNLOCK:
case SVR4_MC_LOCKAS:
case SVR4_MC_UNLOCKAS:
return EOPNOTSUPP;
default:
return ENOSYS;
}
}
int
svr4_sys_nice(td, uap)
struct thread *td;
struct svr4_sys_nice_args *uap;
{
struct setpriority_args ap;
int error;
ap.which = PRIO_PROCESS;
ap.who = 0;
ap.prio = uap->prio;
- if ((error = setpriority(td, &ap)) != 0)
+ if ((error = sys_setpriority(td, &ap)) != 0)
return error;
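/*
 * SVR4 nice() returns the new nice value, so read it back with
 * getpriority(); the result is left in td_retval by the call below.
 */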
/* the cast is stupid, but the structures are the same */
- if ((error = getpriority(td, (struct getpriority_args *)&ap)) != 0)
+ if ((error = sys_getpriority(td, (struct getpriority_args *)&ap)) != 0)
return error;
return 0;
}
int
svr4_sys_resolvepath(td, uap)
struct thread *td;
struct svr4_sys_resolvepath_args *uap;
{
struct nameidata nd;
int error, *retval = td->td_retval;
unsigned int ncopy;
NDINIT(&nd, LOOKUP, NOFOLLOW | SAVENAME | MPSAFE, UIO_USERSPACE,
uap->path, td);
if ((error = namei(&nd)) != 0)
return (error);
NDFREE(&nd, NDF_NO_FREE_PNBUF);
VFS_UNLOCK_GIANT(NDHASGIANT(&nd));
ncopy = min(uap->bufsiz, strlen(nd.ni_cnd.cn_pnbuf) + 1);
if ((error = copyout(nd.ni_cnd.cn_pnbuf, uap->buf, ncopy)) != 0)
goto bad;
*retval = ncopy;
bad:
NDFREE(&nd, NDF_ONLY_PNBUF);
return error;
}
Index: head/sys/compat/svr4/svr4_signal.c
===================================================================
--- head/sys/compat/svr4/svr4_signal.c (revision 225616)
+++ head/sys/compat/svr4/svr4_signal.c (revision 225617)
@@ -1,577 +1,577 @@
/*-
* Copyright (c) 1998 Mark Newton
* Copyright (c) 1994 Christos Zoulas
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/signal.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>
#include <machine/cpu.h>
#include <compat/svr4/svr4.h>
#include <compat/svr4/svr4_types.h>
#include <compat/svr4/svr4_signal.h>
#include <compat/svr4/svr4_proto.h>
#include <compat/svr4/svr4_util.h>
#include <compat/svr4/svr4_ucontext.h>
#define svr4_sigmask(n) (1 << (((n) - 1) & 31))
#define svr4_sigword(n) (((n) - 1) >> 5)
#define svr4_sigemptyset(s) memset((s), 0, sizeof(*(s)))
#define svr4_sigismember(s, n) ((s)->bits[svr4_sigword(n)] & svr4_sigmask(n))
#define svr4_sigaddset(s, n) ((s)->bits[svr4_sigword(n)] |= svr4_sigmask(n))
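/*
 * The set is an array of 32-bit words indexed from signal 1: for
 * example svr4_sigword(1) == 0 with svr4_sigmask(1) == 0x1, and
 * svr4_sigword(32) == 0 with svr4_sigmask(32) == 1 << 31.
 */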
void svr4_to_bsd_sigaction(const struct svr4_sigaction *, struct sigaction *);
void bsd_to_svr4_sigaction(const struct sigaction *, struct svr4_sigaction *);
void svr4_sigfillset(svr4_sigset_t *);
int bsd_to_svr4_sig[SVR4_NSIG] = {
0,
SVR4_SIGHUP,
SVR4_SIGINT,
SVR4_SIGQUIT,
SVR4_SIGILL,
SVR4_SIGTRAP,
SVR4_SIGABRT,
SVR4_SIGEMT,
SVR4_SIGFPE,
SVR4_SIGKILL,
SVR4_SIGBUS,
SVR4_SIGSEGV,
SVR4_SIGSYS,
SVR4_SIGPIPE,
SVR4_SIGALRM,
SVR4_SIGTERM,
SVR4_SIGURG,
SVR4_SIGSTOP,
SVR4_SIGTSTP,
SVR4_SIGCONT,
SVR4_SIGCHLD,
SVR4_SIGTTIN,
SVR4_SIGTTOU,
SVR4_SIGIO,
SVR4_SIGXCPU,
SVR4_SIGXFSZ,
SVR4_SIGVTALRM,
SVR4_SIGPROF,
SVR4_SIGWINCH,
0, /* SIGINFO */
SVR4_SIGUSR1,
SVR4_SIGUSR2,
};
int svr4_to_bsd_sig[SVR4_NSIG] = {
0,
SIGHUP,
SIGINT,
SIGQUIT,
SIGILL,
SIGTRAP,
SIGABRT,
SIGEMT,
SIGFPE,
SIGKILL,
SIGBUS,
SIGSEGV,
SIGSYS,
SIGPIPE,
SIGALRM,
SIGTERM,
SIGUSR1,
SIGUSR2,
SIGCHLD,
0, /* XXX NetBSD uses SIGPWR here, but we don't seem to have one */
SIGWINCH,
SIGURG,
SIGIO,
SIGSTOP,
SIGTSTP,
SIGCONT,
SIGTTIN,
SIGTTOU,
SIGVTALRM,
SIGPROF,
SIGXCPU,
SIGXFSZ,
};
void
svr4_sigfillset(s)
svr4_sigset_t *s;
{
int i;
svr4_sigemptyset(s);
for (i = 1; i < SVR4_NSIG; i++)
if (svr4_to_bsd_sig[i] != 0)
svr4_sigaddset(s, i);
}
void
svr4_to_bsd_sigset(sss, bss)
const svr4_sigset_t *sss;
sigset_t *bss;
{
int i, newsig;
SIGEMPTYSET(*bss);
for (i = 1; i < SVR4_NSIG; i++)
if (svr4_sigismember(sss, i)) {
newsig = svr4_to_bsd_sig[i];
if (newsig)
SIGADDSET(*bss, newsig);
}
}
void
bsd_to_svr4_sigset(bss, sss)
const sigset_t *bss;
svr4_sigset_t *sss;
{
int i, newsig;
svr4_sigemptyset(sss);
for (i = 1; i < SVR4_NSIG; i++) {
if (SIGISMEMBER(*bss, i)) {
newsig = bsd_to_svr4_sig[i];
if (newsig)
svr4_sigaddset(sss, newsig);
}
}
}
/*
* XXX: Only a subset of the flags is currently implemented.
*/
void
svr4_to_bsd_sigaction(ssa, bsa)
const struct svr4_sigaction *ssa;
struct sigaction *bsa;
{
bsa->sa_handler = (sig_t) ssa->ssa_handler;
svr4_to_bsd_sigset(&ssa->ssa_mask, &bsa->sa_mask);
bsa->sa_flags = 0;
if ((ssa->ssa_flags & SVR4_SA_ONSTACK) != 0)
bsa->sa_flags |= SA_ONSTACK;
if ((ssa->ssa_flags & SVR4_SA_RESETHAND) != 0)
bsa->sa_flags |= SA_RESETHAND;
if ((ssa->ssa_flags & SVR4_SA_RESTART) != 0)
bsa->sa_flags |= SA_RESTART;
if ((ssa->ssa_flags & SVR4_SA_SIGINFO) != 0)
DPRINTF(("svr4_to_bsd_sigaction: SA_SIGINFO ignored\n"));
if ((ssa->ssa_flags & SVR4_SA_NOCLDSTOP) != 0)
bsa->sa_flags |= SA_NOCLDSTOP;
if ((ssa->ssa_flags & SVR4_SA_NODEFER) != 0)
bsa->sa_flags |= SA_NODEFER;
if ((ssa->ssa_flags & SVR4_SA_NOCLDWAIT) != 0)
bsa->sa_flags |= SA_NOCLDWAIT;
if ((ssa->ssa_flags & ~SVR4_SA_ALLBITS) != 0)
DPRINTF(("svr4_to_bsd_sigaction: extra bits ignored\n"));
}
void
bsd_to_svr4_sigaction(bsa, ssa)
const struct sigaction *bsa;
struct svr4_sigaction *ssa;
{
ssa->ssa_handler = (svr4_sig_t) bsa->sa_handler;
bsd_to_svr4_sigset(&bsa->sa_mask, &ssa->ssa_mask);
ssa->ssa_flags = 0;
if ((bsa->sa_flags & SA_ONSTACK) != 0)
ssa->ssa_flags |= SVR4_SA_ONSTACK;
if ((bsa->sa_flags & SA_RESETHAND) != 0)
ssa->ssa_flags |= SVR4_SA_RESETHAND;
if ((bsa->sa_flags & SA_RESTART) != 0)
ssa->ssa_flags |= SVR4_SA_RESTART;
if ((bsa->sa_flags & SA_NODEFER) != 0)
ssa->ssa_flags |= SVR4_SA_NODEFER;
if ((bsa->sa_flags & SA_NOCLDSTOP) != 0)
ssa->ssa_flags |= SVR4_SA_NOCLDSTOP;
}
void
svr4_to_bsd_sigaltstack(sss, bss)
const struct svr4_sigaltstack *sss;
struct sigaltstack *bss;
{
bss->ss_sp = sss->ss_sp;
bss->ss_size = sss->ss_size;
bss->ss_flags = 0;
if ((sss->ss_flags & SVR4_SS_DISABLE) != 0)
bss->ss_flags |= SS_DISABLE;
if ((sss->ss_flags & SVR4_SS_ONSTACK) != 0)
bss->ss_flags |= SS_ONSTACK;
if ((sss->ss_flags & ~SVR4_SS_ALLBITS) != 0)
/*XXX*/ uprintf("svr4_to_bsd_sigaltstack: extra bits ignored\n");
}
void
bsd_to_svr4_sigaltstack(bss, sss)
const struct sigaltstack *bss;
struct svr4_sigaltstack *sss;
{
sss->ss_sp = bss->ss_sp;
sss->ss_size = bss->ss_size;
sss->ss_flags = 0;
if ((bss->ss_flags & SS_DISABLE) != 0)
sss->ss_flags |= SVR4_SS_DISABLE;
if ((bss->ss_flags & SS_ONSTACK) != 0)
sss->ss_flags |= SVR4_SS_ONSTACK;
}
int
svr4_sys_sigaction(td, uap)
struct thread *td;
struct svr4_sys_sigaction_args *uap;
{
struct svr4_sigaction isa;
struct sigaction nbsa, obsa;
struct sigaction *nbsap;
int error;
if (uap->signum < 0 || uap->signum >= SVR4_NSIG)
return (EINVAL);
DPRINTF(("@@@ svr4_sys_sigaction(%d, %d, %d)\n", td->td_proc->p_pid,
uap->signum,
SVR4_SVR42BSD_SIG(uap->signum)));
if (uap->nsa != NULL) {
if ((error = copyin(uap->nsa, &isa, sizeof(isa))) != 0)
return (error);
svr4_to_bsd_sigaction(&isa, &nbsa);
nbsap = &nbsa;
} else
nbsap = NULL;
#if defined(DEBUG_SVR4)
{
int i;
for (i = 0; i < 4; i++)
DPRINTF(("\tssa_mask[%d] = %lx\n", i,
isa.ssa_mask.bits[i]));
DPRINTF(("\tssa_handler = %p\n", isa.ssa_handler));
}
#endif
error = kern_sigaction(td, SVR4_SVR42BSD_SIG(uap->signum), nbsap, &obsa,
0);
if (error == 0 && uap->osa != NULL) {
bsd_to_svr4_sigaction(&obsa, &isa);
error = copyout(&isa, uap->osa, sizeof(isa));
}
return (error);
}
int
svr4_sys_sigaltstack(td, uap)
struct thread *td;
struct svr4_sys_sigaltstack_args *uap;
{
struct svr4_sigaltstack sss;
struct sigaltstack nbss, obss, *nbssp;
int error;
if (uap->nss != NULL) {
if ((error = copyin(uap->nss, &sss, sizeof(sss))) != 0)
return (error);
svr4_to_bsd_sigaltstack(&sss, &nbss);
nbssp = &nbss;
} else
nbssp = NULL;
error = kern_sigaltstack(td, nbssp, &obss);
if (error == 0 && uap->oss != NULL) {
bsd_to_svr4_sigaltstack(&obss, &sss);
error = copyout(&sss, uap->oss, sizeof(sss));
}
return (error);
}
/*
* Stolen from the ibcs2 one
*/
int
svr4_sys_signal(td, uap)
struct thread *td;
struct svr4_sys_signal_args *uap;
{
struct proc *p;
int signum;
int error;
p = td->td_proc;
DPRINTF(("@@@ svr4_sys_signal(%d)\n", p->p_pid));
signum = SVR4_SIGNO(uap->signum);
if (signum < 0 || signum >= SVR4_NSIG) {
if (SVR4_SIGCALL(uap->signum) == SVR4_SIGNAL_MASK ||
SVR4_SIGCALL(uap->signum) == SVR4_SIGDEFER_MASK)
td->td_retval[0] = (int)SVR4_SIG_ERR;
return (EINVAL);
}
signum = SVR4_SVR42BSD_SIG(signum);
switch (SVR4_SIGCALL(uap->signum)) {
case SVR4_SIGDEFER_MASK:
if (uap->handler == SVR4_SIG_HOLD)
goto sighold;
/* FALLTHROUGH */
case SVR4_SIGNAL_MASK:
{
struct sigaction nbsa, obsa;
nbsa.sa_handler = (sig_t) uap->handler;
SIGEMPTYSET(nbsa.sa_mask);
nbsa.sa_flags = 0;
if (signum != SIGALRM)
nbsa.sa_flags = SA_RESTART;
error = kern_sigaction(td, signum, &nbsa, &obsa, 0);
if (error != 0) {
DPRINTF(("signal: sigaction failed: %d\n",
error));
td->td_retval[0] = (int)SVR4_SIG_ERR;
return (error);
}
td->td_retval[0] = (int)obsa.sa_handler;
return (0);
}
case SVR4_SIGHOLD_MASK:
sighold:
{
sigset_t set;
SIGEMPTYSET(set);
SIGADDSET(set, signum);
return (kern_sigprocmask(td, SIG_BLOCK, &set, NULL, 0));
}
case SVR4_SIGRELSE_MASK:
{
sigset_t set;
SIGEMPTYSET(set);
SIGADDSET(set, signum);
return (kern_sigprocmask(td, SIG_UNBLOCK, &set, NULL,
0));
}
case SVR4_SIGIGNORE_MASK:
{
struct sigaction sa;
sa.sa_handler = SIG_IGN;
SIGEMPTYSET(sa.sa_mask);
sa.sa_flags = 0;
error = kern_sigaction(td, signum, &sa, NULL, 0);
if (error != 0)
DPRINTF(("sigignore: sigaction failed\n"));
return (error);
}
case SVR4_SIGPAUSE_MASK:
{
sigset_t mask;
PROC_LOCK(p);
mask = td->td_sigmask;
PROC_UNLOCK(p);
SIGDELSET(mask, signum);
return kern_sigsuspend(td, mask);
}
default:
return (ENOSYS);
}
}
int
svr4_sys_sigprocmask(td, uap)
struct thread *td;
struct svr4_sys_sigprocmask_args *uap;
{
svr4_sigset_t sss;
sigset_t oss, nss;
sigset_t *nssp;
int error;
if (uap->set != NULL) {
if ((error = copyin(uap->set, &sss, sizeof(sss))) != 0)
return error;
svr4_to_bsd_sigset(&sss, &nss);
nssp = &nss;
} else
nssp = NULL;
/* SVR/4 sigprocmask flag values are the same as the FreeBSD values. */
error = kern_sigprocmask(td, uap->how, nssp, &oss, 0);
if (error == 0 && uap->oset != NULL) {
bsd_to_svr4_sigset(&oss, &sss);
error = copyout(&sss, uap->oset, sizeof(sss));
}
return (error);
}
int
svr4_sys_sigpending(td, uap)
struct thread *td;
struct svr4_sys_sigpending_args *uap;
{
struct proc *p;
sigset_t bss;
svr4_sigset_t sss;
p = td->td_proc;
DPRINTF(("@@@ svr4_sys_sigpending(%d)\n", p->p_pid));
switch (uap->what) {
case 1: /* sigpending */
if (uap->mask == NULL)
return 0;
PROC_LOCK(p);
bss = p->p_siglist;
SIGSETOR(bss, td->td_siglist);
SIGSETAND(bss, td->td_sigmask);
PROC_UNLOCK(p);
bsd_to_svr4_sigset(&bss, &sss);
break;
case 2: /* sigfillset */
svr4_sigfillset(&sss);
#if defined(DEBUG_SVR4)
{
int i;
for (i = 0; i < 4; i++)
DPRINTF(("new sigset[%d] = %lx\n", i, (long)sss.bits[i]));
}
#endif
break;
default:
return EINVAL;
}
return copyout(&sss, uap->mask, sizeof(sss));
}
int
svr4_sys_sigsuspend(td, uap)
struct thread *td;
struct svr4_sys_sigsuspend_args *uap;
{
svr4_sigset_t sss;
sigset_t bss;
int error;
if ((error = copyin(uap->ss, &sss, sizeof(sss))) != 0)
return error;
svr4_to_bsd_sigset(&sss, &bss);
return kern_sigsuspend(td, bss);
}
int
svr4_sys_kill(td, uap)
struct thread *td;
struct svr4_sys_kill_args *uap;
{
struct kill_args ka;
if (uap->signum < 0 || uap->signum >= SVR4_NSIG)
return (EINVAL);
ka.pid = uap->pid;
ka.signum = SVR4_SVR42BSD_SIG(uap->signum);
- return kill(td, &ka);
+ return sys_kill(td, &ka);
}
int
svr4_sys_context(td, uap)
struct thread *td;
struct svr4_sys_context_args *uap;
{
struct svr4_ucontext uc;
int error, onstack;
switch (uap->func) {
case 0:
DPRINTF(("getcontext(%p)\n", uap->uc));
PROC_LOCK(td->td_proc);
onstack = sigonstack(cpu_getstack(td));
PROC_UNLOCK(td->td_proc);
svr4_getcontext(td, &uc, &td->td_sigmask, onstack);
return copyout(&uc, uap->uc, sizeof(uc));
case 1:
DPRINTF(("setcontext(%p)\n", uap->uc));
if ((error = copyin(uap->uc, &uc, sizeof(uc))) != 0)
return error;
DPRINTF(("uc_flags = %lx\n", uc.uc_flags));
#if defined(DEBUG_SVR4)
{
int i;
for (i = 0; i < 4; i++)
DPRINTF(("uc_sigmask[%d] = %lx\n", i,
uc.uc_sigmask.bits[i]));
}
#endif
return svr4_setcontext(td, &uc);
default:
DPRINTF(("context(%d, %p)\n", uap->func,
uap->uc));
return ENOSYS;
}
return 0;
}
int
svr4_sys_pause(td, uap)
struct thread *td;
struct svr4_sys_pause_args *uap;
{
sigset_t mask;
PROC_LOCK(td->td_proc);
mask = td->td_sigmask;
PROC_UNLOCK(td->td_proc);
return kern_sigsuspend(td, mask);
}
Index: head/sys/compat/svr4/svr4_socket.c
===================================================================
--- head/sys/compat/svr4/svr4_socket.c (revision 225616)
+++ head/sys/compat/svr4/svr4_socket.c (revision 225617)
@@ -1,242 +1,242 @@
/*-
* Copyright (c) 1998 Mark Newton
* Copyright (c) 1996 Christos Zoulas.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Christos Zoulas.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* In SVR4 unix domain sockets are referenced sometimes
* (in putmsg(2) for example) as a [device, inode] pair instead of a pathname.
* Since there is no iname() routine in the kernel, and we need access to
* a mapping from inode to pathname, we keep our own table. This is a simple
* linked list that contains the pathname, the [device, inode] pair, the
* file corresponding to that socket and the process. When the
* socket gets closed we remove the item from the list. The list gets loaded
* every time a stat(2) call finds a socket.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/queue.h>
#include <sys/eventhandler.h>
#include <sys/file.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysproto.h>
#include <sys/un.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <compat/svr4/svr4.h>
#include <compat/svr4/svr4_types.h>
#include <compat/svr4/svr4_util.h>
#include <compat/svr4/svr4_socket.h>
#include <compat/svr4/svr4_signal.h>
#include <compat/svr4/svr4_sockmod.h>
#include <compat/svr4/svr4_proto.h>
struct svr4_sockcache_entry {
struct proc *p; /* Process for the socket */
void *cookie; /* Internal cookie used for matching */
struct sockaddr_un sock;/* Pathname for the socket */
dev_t dev; /* Device where the socket lives on */
ino_t ino; /* Inode where the socket lives on */
TAILQ_ENTRY(svr4_sockcache_entry) entries;
};
static TAILQ_HEAD(, svr4_sockcache_entry) svr4_head;
static struct mtx svr4_sockcache_lock;
static eventhandler_tag svr4_sockcache_exit_tag, svr4_sockcache_exec_tag;
static void svr4_purge_sockcache(void *arg, struct proc *p);
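/*
 * Look up the cached pathname for a [device, inode] pair belonging to
 * the calling process; on a hit the entry is (re)bound to the
 * socket's so_emuldata cookie and the cached sockaddr_un is returned
 * in *saun.
 */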
int
svr4_find_socket(td, fp, dev, ino, saun)
struct thread *td;
struct file *fp;
dev_t dev;
ino_t ino;
struct sockaddr_un *saun;
{
struct svr4_sockcache_entry *e;
void *cookie = ((struct socket *)fp->f_data)->so_emuldata;
DPRINTF(("svr4_find_socket: [%p,%d,%d]: ", td, dev, ino));
mtx_lock(&svr4_sockcache_lock);
TAILQ_FOREACH(e, &svr4_head, entries)
if (e->p == td->td_proc && e->dev == dev && e->ino == ino) {
#ifdef DIAGNOSTIC
if (e->cookie != NULL && e->cookie != cookie)
panic("svr4 socket cookie mismatch");
#endif
e->cookie = cookie;
DPRINTF(("%s\n", e->sock.sun_path));
*saun = e->sock;
mtx_unlock(&svr4_sockcache_lock);
return (0);
}
mtx_unlock(&svr4_sockcache_lock);
DPRINTF(("not found\n"));
return (ENOENT);
}
int
svr4_add_socket(td, path, st)
struct thread *td;
const char *path;
struct stat *st;
{
struct svr4_sockcache_entry *e;
size_t len;
int error;
e = malloc(sizeof(*e), M_TEMP, M_WAITOK);
e->cookie = NULL;
e->dev = st->st_dev;
e->ino = st->st_ino;
e->p = td->td_proc;
if ((error = copyinstr(path, e->sock.sun_path,
sizeof(e->sock.sun_path), &len)) != 0) {
DPRINTF(("svr4_add_socket: copyinstr failed %d\n", error));
free(e, M_TEMP);
return error;
}
e->sock.sun_family = AF_LOCAL;
e->sock.sun_len = len;
mtx_lock(&svr4_sockcache_lock);
TAILQ_INSERT_HEAD(&svr4_head, e, entries);
mtx_unlock(&svr4_sockcache_lock);
DPRINTF(("svr4_add_socket: %s [%p,%d,%d]\n", e->sock.sun_path,
td->td_proc, e->dev, e->ino));
return 0;
}
void
svr4_delete_socket(p, fp)
struct proc *p;
struct file *fp;
{
struct svr4_sockcache_entry *e;
void *cookie = ((struct socket *)fp->f_data)->so_emuldata;
mtx_lock(&svr4_sockcache_lock);
TAILQ_FOREACH(e, &svr4_head, entries)
if (e->p == p && e->cookie == cookie) {
TAILQ_REMOVE(&svr4_head, e, entries);
mtx_unlock(&svr4_sockcache_lock);
DPRINTF(("svr4_delete_socket: %s [%p,%d,%d]\n",
e->sock.sun_path, p, (int)e->dev, e->ino));
free(e, M_TEMP);
return;
}
mtx_unlock(&svr4_sockcache_lock);
}
void
svr4_purge_sockcache(arg, p)
void *arg;
struct proc *p;
{
struct svr4_sockcache_entry *e, *ne;
mtx_lock(&svr4_sockcache_lock);
TAILQ_FOREACH_SAFE(e, &svr4_head, entries, ne) {
if (e->p == p) {
TAILQ_REMOVE(&svr4_head, e, entries);
DPRINTF(("svr4_purge_sockcache: %s [%p,%d,%d]\n",
e->sock.sun_path, p, (int)e->dev, e->ino));
free(e, M_TEMP);
}
}
mtx_unlock(&svr4_sockcache_lock);
}
void
svr4_sockcache_init(void)
{
TAILQ_INIT(&svr4_head);
mtx_init(&svr4_sockcache_lock, "svr4 socket cache", NULL, MTX_DEF);
svr4_sockcache_exit_tag = EVENTHANDLER_REGISTER(process_exit,
svr4_purge_sockcache, NULL, EVENTHANDLER_PRI_ANY);
svr4_sockcache_exec_tag = EVENTHANDLER_REGISTER(process_exec,
svr4_purge_sockcache, NULL, EVENTHANDLER_PRI_ANY);
}
void
svr4_sockcache_destroy(void)
{
KASSERT(TAILQ_EMPTY(&svr4_head),
("%s: sockcache entries still around", __func__));
EVENTHANDLER_DEREGISTER(process_exec, svr4_sockcache_exec_tag);
EVENTHANDLER_DEREGISTER(process_exit, svr4_sockcache_exit_tag);
mtx_destroy(&svr4_sockcache_lock);
}
int
svr4_sys_socket(td, uap)
struct thread *td;
struct svr4_sys_socket_args *uap;
{
switch (uap->type) {
case SVR4_SOCK_DGRAM:
uap->type = SOCK_DGRAM;
break;
case SVR4_SOCK_STREAM:
uap->type = SOCK_STREAM;
break;
case SVR4_SOCK_RAW:
uap->type = SOCK_RAW;
break;
case SVR4_SOCK_RDM:
uap->type = SOCK_RDM;
break;
case SVR4_SOCK_SEQPACKET:
uap->type = SOCK_SEQPACKET;
break;
default:
return EINVAL;
}
- return socket(td, (struct socket_args *)uap);
+ return sys_socket(td, (struct socket_args *)uap);
}
Index: head/sys/compat/svr4/svr4_stat.c
===================================================================
--- head/sys/compat/svr4/svr4_stat.c (revision 225616)
+++ head/sys/compat/svr4/svr4_stat.c (revision 225617)
@@ -1,699 +1,699 @@
/*-
* Copyright (c) 1998 Mark Newton
* Copyright (c) 1994 Christos Zoulas
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/stat.h>
#include <sys/filedesc.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/namei.h>
#include <sys/unistd.h>
#include <sys/time.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/un.h>
#include <vm/vm.h>
#include <netinet/in.h>
#include <compat/svr4/svr4.h>
#include <compat/svr4/svr4_types.h>
#include <compat/svr4/svr4_signal.h>
#include <compat/svr4/svr4_proto.h>
#include <compat/svr4/svr4_util.h>
#include <compat/svr4/svr4_stat.h>
#include <compat/svr4/svr4_ustat.h>
#include <compat/svr4/svr4_utsname.h>
#include <compat/svr4/svr4_systeminfo.h>
#include <compat/svr4/svr4_socket.h>
#include <compat/svr4/svr4_time.h>
#if defined(NOTYET)
#include "svr4_fuser.h"
#endif
#ifdef sparc
/*
* Solaris-2.4 on the sparc has the old stat call using the new
* stat data structure...
*/
# define SVR4_NO_OSTAT
#endif
struct svr4_ustat_args {
svr4_dev_t dev;
struct svr4_ustat * name;
};
static void bsd_to_svr4_xstat(struct stat *, struct svr4_xstat *);
static void bsd_to_svr4_stat64(struct stat *, struct svr4_stat64 *);
int svr4_ustat(struct thread *, struct svr4_ustat_args *);
static int svr4_to_bsd_pathconf(int);
/*
* SVR4 uses named pipes as named sockets, so we tell programs
* that sockets are named pipes with mode 0
*/
#define BSD_TO_SVR4_MODE(mode) (S_ISSOCK(mode) ? S_IFIFO : (mode))
#ifndef SVR4_NO_OSTAT
static void bsd_to_svr4_stat(struct stat *, struct svr4_stat *);
static void
bsd_to_svr4_stat(st, st4)
struct stat *st;
struct svr4_stat *st4;
{
memset(st4, 0, sizeof(*st4));
st4->st_dev = bsd_to_svr4_odev_t(st->st_dev);
st4->st_ino = st->st_ino;
st4->st_mode = BSD_TO_SVR4_MODE(st->st_mode);
st4->st_nlink = st->st_nlink;
st4->st_uid = st->st_uid;
st4->st_gid = st->st_gid;
st4->st_rdev = bsd_to_svr4_odev_t(st->st_rdev);
st4->st_size = st->st_size;
st4->st_atim = st->st_atim.tv_sec;
st4->st_mtim = st->st_mtim.tv_sec;
st4->st_ctim = st->st_ctim.tv_sec;
}
#endif
static void
bsd_to_svr4_xstat(st, st4)
struct stat *st;
struct svr4_xstat *st4;
{
memset(st4, 0, sizeof(*st4));
st4->st_dev = bsd_to_svr4_dev_t(st->st_dev);
st4->st_ino = st->st_ino;
st4->st_mode = BSD_TO_SVR4_MODE(st->st_mode);
st4->st_nlink = st->st_nlink;
st4->st_uid = st->st_uid;
st4->st_gid = st->st_gid;
st4->st_rdev = bsd_to_svr4_dev_t(st->st_rdev);
st4->st_size = st->st_size;
st4->st_atim = st->st_atim;
st4->st_mtim = st->st_mtim;
st4->st_ctim = st->st_ctim;
st4->st_blksize = st->st_blksize;
st4->st_blocks = st->st_blocks;
strcpy(st4->st_fstype, "unknown");
}
static void
bsd_to_svr4_stat64(st, st4)
struct stat *st;
struct svr4_stat64 *st4;
{
memset(st4, 0, sizeof(*st4));
st4->st_dev = bsd_to_svr4_dev_t(st->st_dev);
st4->st_ino = st->st_ino;
st4->st_mode = BSD_TO_SVR4_MODE(st->st_mode);
st4->st_nlink = st->st_nlink;
st4->st_uid = st->st_uid;
st4->st_gid = st->st_gid;
st4->st_rdev = bsd_to_svr4_dev_t(st->st_rdev);
st4->st_size = st->st_size;
st4->st_atim = st->st_atim;
st4->st_mtim = st->st_mtim;
st4->st_ctim = st->st_ctim;
st4->st_blksize = st->st_blksize;
st4->st_blocks = st->st_blocks;
strcpy(st4->st_fstype, "unknown");
}
int
svr4_sys_stat(td, uap)
struct thread *td;
struct svr4_sys_stat_args *uap;
{
struct svr4_stat svr4_st;
struct stat st;
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = kern_stat(td, path, UIO_SYSSPACE, &st);
free(path, M_TEMP);
if (error)
return (error);
bsd_to_svr4_stat(&st, &svr4_st);
if (S_ISSOCK(st.st_mode))
(void) svr4_add_socket(td, uap->path, &st);
return (copyout(&svr4_st, uap->ub, sizeof svr4_st));
}
int
svr4_sys_lstat(td, uap)
struct thread *td;
struct svr4_sys_lstat_args *uap;
{
struct svr4_stat svr4_st;
struct stat st;
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = kern_lstat(td, path, UIO_SYSSPACE, &st);
free(path, M_TEMP);
if (error)
return (error);
bsd_to_svr4_stat(&st, &svr4_st);
if (S_ISSOCK(st.st_mode))
(void) svr4_add_socket(td, uap->path, &st);
return (copyout(&svr4_st, uap->ub, sizeof svr4_st));
}
int
svr4_sys_fstat(td, uap)
struct thread *td;
struct svr4_sys_fstat_args *uap;
{
struct svr4_stat svr4_st;
struct stat st;
int error;
error = kern_fstat(td, uap->fd, &st);
if (error)
return (error);
bsd_to_svr4_stat(&st, &svr4_st);
return (copyout(&svr4_st, uap->sb, sizeof svr4_st));
}
int
svr4_sys_xstat(td, uap)
struct thread *td;
struct svr4_sys_xstat_args *uap;
{
struct svr4_xstat svr4_st;
struct stat st;
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = kern_stat(td, path, UIO_SYSSPACE, &st);
free(path, M_TEMP);
if (error)
return (error);
bsd_to_svr4_xstat(&st, &svr4_st);
#if defined(SOCKET_NOTYET)
if (S_ISSOCK(st.st_mode))
(void) svr4_add_socket(td, uap->path, &st);
#endif
return (copyout(&svr4_st, uap->ub, sizeof svr4_st));
}
int
svr4_sys_lxstat(td, uap)
struct thread *td;
struct svr4_sys_lxstat_args *uap;
{
struct svr4_xstat svr4_st;
struct stat st;
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = kern_lstat(td, path, UIO_SYSSPACE, &st);
free(path, M_TEMP);
if (error)
return (error);
bsd_to_svr4_xstat(&st, &svr4_st);
#if defined(SOCKET_NOTYET)
if (S_ISSOCK(st.st_mode))
(void) svr4_add_socket(td, uap->path, &st);
#endif
return (copyout(&svr4_st, uap->ub, sizeof svr4_st));
}
int
svr4_sys_fxstat(td, uap)
struct thread *td;
struct svr4_sys_fxstat_args *uap;
{
struct svr4_xstat svr4_st;
struct stat st;
int error;
error = kern_fstat(td, uap->fd, &st);
if (error)
return (error);
bsd_to_svr4_xstat(&st, &svr4_st);
return (copyout(&svr4_st, uap->sb, sizeof svr4_st));
}
int
svr4_sys_stat64(td, uap)
struct thread *td;
struct svr4_sys_stat64_args *uap;
{
struct svr4_stat64 svr4_st;
struct stat st;
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = kern_stat(td, path, UIO_SYSSPACE, &st);
free(path, M_TEMP);
if (error)
return (error);
bsd_to_svr4_stat64(&st, &svr4_st);
if (S_ISSOCK(st.st_mode))
(void) svr4_add_socket(td, uap->path, &st);
return (copyout(&svr4_st, uap->sb, sizeof svr4_st));
}
int
svr4_sys_lstat64(td, uap)
struct thread *td;
struct svr4_sys_lstat64_args *uap;
{
struct svr4_stat64 svr4_st;
struct stat st;
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = kern_lstat(td, path, UIO_SYSSPACE, &st);
free(path, M_TEMP);
if (error)
return (error);
bsd_to_svr4_stat64(&st, &svr4_st);
if (S_ISSOCK(st.st_mode))
(void) svr4_add_socket(td, uap->path, &st);
return (copyout(&svr4_st, uap->sb, sizeof svr4_st));
}
int
svr4_sys_fstat64(td, uap)
struct thread *td;
struct svr4_sys_fstat64_args *uap;
{
struct svr4_stat64 svr4_st;
struct stat st;
int error;
error = kern_fstat(td, uap->fd, &st);
if (error)
return (error);
bsd_to_svr4_stat64(&st, &svr4_st);
return (copyout(&svr4_st, uap->sb, sizeof svr4_st));
}
int
svr4_ustat(td, uap)
struct thread *td;
struct svr4_ustat_args *uap;
{
struct svr4_ustat us;
int error;
memset(&us, 0, sizeof us);
/*
* XXX: should set f_tfree and f_tinode at least
* How do we translate dev -> fstat? (and then to svr4_ustat)
*/
if ((error = copyout(&us, uap->name, sizeof us)) != 0)
return (error);
return 0;
}
/*extern char ostype[], osrelease[], version[], machine[];*/
int
svr4_sys_uname(td, uap)
struct thread *td;
struct svr4_sys_uname_args *uap;
{
struct svr4_utsname sut;
memset(&sut, 0, sizeof(sut));
strlcpy(sut.sysname, ostype, sizeof(sut.sysname));
getcredhostname(td->td_ucred, sut.nodename, sizeof(sut.nodename));
strlcpy(sut.release, osrelease, sizeof(sut.release));
strlcpy(sut.version, version, sizeof(sut.version));
strlcpy(sut.machine, machine, sizeof(sut.machine));
return copyout((caddr_t) &sut, (caddr_t) uap->name,
sizeof(struct svr4_utsname));
}
int
svr4_sys_systeminfo(td, uap)
struct thread *td;
struct svr4_sys_systeminfo_args *uap;
{
char *str = NULL;
int error = 0;
register_t *retval = td->td_retval;
u_long hostid;
size_t len = 0;
char buf[MAXHOSTNAMELEN];
u_int rlen = uap->len;
switch (uap->what) {
case SVR4_SI_SYSNAME:
str = ostype;
break;
case SVR4_SI_HOSTNAME:
getcredhostname(td->td_ucred, buf, sizeof(buf));
str = buf;
break;
case SVR4_SI_RELEASE:
str = osrelease;
break;
case SVR4_SI_VERSION:
str = version;
break;
case SVR4_SI_MACHINE:
str = machine;
break;
case SVR4_SI_ARCHITECTURE:
str = machine;
break;
case SVR4_SI_ISALIST:
#if defined(__sparc__)
str = "sparcv9 sparcv9-fsmuld sparcv8 sparcv8-fsmuld sparcv7 sparc";
#elif defined(__i386__)
str = "i386";
#elif defined(__amd64__)
str = "amd64";
#else
str = "unknown";
#endif
break;
case SVR4_SI_HW_SERIAL:
getcredhostid(td->td_ucred, &hostid);
snprintf(buf, sizeof(buf), "%lu", hostid);
str = buf;
break;
case SVR4_SI_HW_PROVIDER:
str = ostype;
break;
case SVR4_SI_SRPC_DOMAIN:
getcreddomainname(td->td_ucred, buf, sizeof(buf));
str = buf;
break;
case SVR4_SI_PLATFORM:
#if defined(__i386__)
str = "i86pc";
#else
str = "unknown";
#endif
break;
case SVR4_SI_KERB_REALM:
str = "unsupported";
break;
#if defined(WHY_DOES_AN_EMULATOR_WANT_TO_SET_HOSTNAMES)
case SVR4_SI_SET_HOSTNAME:
name = KERN_HOSTNAME;
return kern_sysctl(&name, 1, 0, 0, uap->buf, rlen, td);
case SVR4_SI_SET_SRPC_DOMAIN:
name = KERN_NISDOMAINNAME;
return kern_sysctl(&name, 1, 0, 0, uap->buf, rlen, td);
#else
case SVR4_SI_SET_HOSTNAME:
case SVR4_SI_SET_SRPC_DOMAIN:
/* FALLTHROUGH */
#endif
case SVR4_SI_SET_KERB_REALM:
return 0;
default:
DPRINTF(("Bad systeminfo command %d\n", uap->what));
return ENOSYS;
}
if (str) {
len = strlen(str) + 1;
if (len > rlen)
len = rlen;
if (uap->buf) {
error = copyout(str, uap->buf, len);
if (error)
return error;
/* make sure we are NULL terminated */
buf[0] = '\0';
error = copyout(buf, &(uap->buf[len - 1]), 1);
}
else
error = 0;
}
/* XXX NetBSD has hostname setting stuff here. Why would an emulator
want to do that? */
*retval = len;
return error;
}
int
svr4_sys_utssys(td, uap)
struct thread *td;
struct svr4_sys_utssys_args *uap;
{
switch (uap->sel) {
case 0: /* uname(2) */
{
struct svr4_sys_uname_args ua;
ua.name = uap->a1;
return svr4_sys_uname(td, &ua);
}
case 2: /* ustat(2) */
{
struct svr4_ustat_args ua;
ua.dev = (svr4_dev_t) uap->a2;
ua.name = uap->a1;
return svr4_ustat(td, &ua);
}
case 3: /* fusers(2) */
return ENOSYS;
default:
return ENOSYS;
}
return ENOSYS;
}
int
svr4_sys_utime(td, uap)
struct thread *td;
struct svr4_sys_utime_args *uap;
{
struct svr4_utimbuf ub;
struct timeval tbuf[2], *tp;
char *path;
int error;
if (uap->ubuf != NULL) {
error = copyin(uap->ubuf, &ub, sizeof(ub));
if (error)
return (error);
tbuf[0].tv_sec = ub.actime;
tbuf[0].tv_usec = 0;
tbuf[1].tv_sec = ub.modtime;
tbuf[1].tv_usec = 0;
tp = tbuf;
} else
tp = NULL;
CHECKALTEXIST(td, uap->path, &path);
error = kern_utimes(td, path, UIO_SYSSPACE, tp, UIO_SYSSPACE);
free(path, M_TEMP);
return (error);
}
int
svr4_sys_utimes(td, uap)
struct thread *td;
struct svr4_sys_utimes_args *uap;
{
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = kern_utimes(td, path, UIO_SYSSPACE, uap->tptr, UIO_USERSPACE);
free(path, M_TEMP);
return (error);
}
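/*
 * Map an SVR4 pathconf(2) name onto the corresponding BSD _PC_* value.
 * Names we accept but do not support map to 0; invalid names map to -1.
 */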
static int
svr4_to_bsd_pathconf(name)
int name;
{
switch (name) {
case SVR4_PC_LINK_MAX:
return _PC_LINK_MAX;
case SVR4_PC_MAX_CANON:
return _PC_MAX_CANON;
case SVR4_PC_MAX_INPUT:
return _PC_MAX_INPUT;
case SVR4_PC_NAME_MAX:
return _PC_NAME_MAX;
case SVR4_PC_PATH_MAX:
return _PC_PATH_MAX;
case SVR4_PC_PIPE_BUF:
return _PC_PIPE_BUF;
case SVR4_PC_NO_TRUNC:
return _PC_NO_TRUNC;
case SVR4_PC_VDISABLE:
return _PC_VDISABLE;
case SVR4_PC_CHOWN_RESTRICTED:
return _PC_CHOWN_RESTRICTED;
case SVR4_PC_SYNC_IO:
#if defined(_PC_SYNC_IO)
return _PC_SYNC_IO;
#else
return 0;
#endif
case SVR4_PC_ASYNC_IO:
case SVR4_PC_PRIO_IO:
/* Not supported */
return 0;
default:
/* Invalid */
return -1;
}
}
int
svr4_sys_pathconf(td, uap)
struct thread *td;
struct svr4_sys_pathconf_args *uap;
{
char *path;
int error, name;
name = svr4_to_bsd_pathconf(uap->name);
switch (name) {
case -1:
td->td_retval[0] = -1;
return (EINVAL);
case 0:
td->td_retval[0] = 0;
return (0);
default:
CHECKALTEXIST(td, uap->path, &path);
error = kern_pathconf(td, path, UIO_SYSSPACE, name, FOLLOW);
free(path, M_TEMP);
return (error);
}
}
int
svr4_sys_fpathconf(td, uap)
struct thread *td;
struct svr4_sys_fpathconf_args *uap;
{
register_t *retval = td->td_retval;
uap->name = svr4_to_bsd_pathconf(uap->name);
switch (uap->name) {
case -1:
*retval = -1;
return EINVAL;
case 0:
*retval = 0;
return 0;
default:
- return fpathconf(td, (struct fpathconf_args *)uap);
+ return sys_fpathconf(td, (struct fpathconf_args *)uap);
}
}
Index: head/sys/compat/svr4/svr4_stream.c
===================================================================
--- head/sys/compat/svr4/svr4_stream.c (revision 225616)
+++ head/sys/compat/svr4/svr4_stream.c (revision 225617)
@@ -1,2038 +1,2038 @@
/*-
* Copyright (c) 1998 Mark Newton. All rights reserved.
* Copyright (c) 1994, 1996 Christos Zoulas. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Christos Zoulas.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Pretend that we have streams...
* Yes, this is gross.
*
* ToDo: The state machine for getmsg needs re-thinking
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_ktrace.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capability.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/file.h> /* Must come after sys/malloc.h */
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/signal.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>
#include <sys/uio.h>
#include <sys/ktrace.h> /* Must come after sys/uio.h */
#include <sys/un.h>
#include <netinet/in.h>
#include <compat/svr4/svr4.h>
#include <compat/svr4/svr4_types.h>
#include <compat/svr4/svr4_util.h>
#include <compat/svr4/svr4_signal.h>
#include <compat/svr4/svr4_proto.h>
#include <compat/svr4/svr4_stropts.h>
#include <compat/svr4/svr4_timod.h>
#include <compat/svr4/svr4_sockmod.h>
#include <compat/svr4/svr4_ioctl.h>
#include <compat/svr4/svr4_socket.h>
/* Utils */
static int clean_pipe(struct thread *, char *);
static void getparm(struct file *, struct svr4_si_sockparms *);
static int svr4_do_putmsg(struct thread *, struct svr4_sys_putmsg_args *,
struct file *);
static int svr4_do_getmsg(struct thread *, struct svr4_sys_getmsg_args *,
struct file *);
/* Address Conversions */
static void sockaddr_to_netaddr_in(struct svr4_strmcmd *,
const struct sockaddr_in *);
static void sockaddr_to_netaddr_un(struct svr4_strmcmd *,
const struct sockaddr_un *);
static void netaddr_to_sockaddr_in(struct sockaddr_in *,
const struct svr4_strmcmd *);
static void netaddr_to_sockaddr_un(struct sockaddr_un *,
const struct svr4_strmcmd *);
/* stream ioctls */
static int i_nread(struct file *, struct thread *, register_t *, int,
u_long, caddr_t);
static int i_fdinsert(struct file *, struct thread *, register_t *, int,
u_long, caddr_t);
static int i_str(struct file *, struct thread *, register_t *, int,
u_long, caddr_t);
static int i_setsig(struct file *, struct thread *, register_t *, int,
u_long, caddr_t);
static int i_getsig(struct file *, struct thread *, register_t *, int,
u_long, caddr_t);
static int _i_bind_rsvd(struct file *, struct thread *, register_t *, int,
u_long, caddr_t);
static int _i_rele_rsvd(struct file *, struct thread *, register_t *, int,
u_long, caddr_t);
/* i_str sockmod calls */
static int sockmod(struct file *, int, struct svr4_strioctl *,
struct thread *);
static int si_listen(struct file *, int, struct svr4_strioctl *,
struct thread *);
static int si_ogetudata(struct file *, int, struct svr4_strioctl *,
struct thread *);
static int si_sockparams(struct file *, int, struct svr4_strioctl *,
struct thread *);
static int si_shutdown (struct file *, int, struct svr4_strioctl *,
struct thread *);
static int si_getudata(struct file *, int, struct svr4_strioctl *,
struct thread *);
/* i_str timod calls */
static int timod(struct file *, int, struct svr4_strioctl *, struct thread *);
static int ti_getinfo(struct file *, int, struct svr4_strioctl *,
struct thread *);
static int ti_bind(struct file *, int, struct svr4_strioctl *, struct thread *);
#ifdef DEBUG_SVR4
static void bufprint(u_char *, size_t);
static int show_ioc(const char *, struct svr4_strioctl *);
static int show_strbuf(struct svr4_strbuf *);
static void show_msg(const char *, int, struct svr4_strbuf *,
struct svr4_strbuf *, int);
static void
bufprint(buf, len)
u_char *buf;
size_t len;
{
size_t i;
uprintf("\n\t");
for (i = 0; i < len; i++) {
uprintf("%x ", buf[i]);
if (i && (i % 16) == 0)
uprintf("\n\t");
}
}
static int
show_ioc(str, ioc)
const char *str;
struct svr4_strioctl *ioc;
{
u_char *ptr = NULL;
int len;
int error;
len = ioc->len;
if (len > 1024)
len = 1024;
if (len > 0) {
ptr = (u_char *) malloc(len, M_TEMP, M_WAITOK);
if ((error = copyin(ioc->buf, ptr, len)) != 0) {
free((char *) ptr, M_TEMP);
return error;
}
}
uprintf("%s cmd = %ld, timeout = %d, len = %d, buf = %p { ",
str, ioc->cmd, ioc->timeout, ioc->len, ioc->buf);
if (ptr != NULL)
bufprint(ptr, len);
uprintf("}\n");
if (ptr != NULL)
free((char *) ptr, M_TEMP);
return 0;
}
static int
show_strbuf(str)
struct svr4_strbuf *str;
{
int error;
u_char *ptr = NULL;
int maxlen = str->maxlen;
int len = str->len;
if (maxlen > 8192)
maxlen = 8192;
if (maxlen < 0)
maxlen = 0;
if (len >= maxlen)
len = maxlen;
if (len > 0) {
ptr = (u_char *) malloc(len, M_TEMP, M_WAITOK);
if ((error = copyin(str->buf, ptr, len)) != 0) {
free((char *) ptr, M_TEMP);
return error;
}
}
uprintf(", { %d, %d, %p=[ ", str->maxlen, str->len, str->buf);
if (ptr)
bufprint(ptr, len);
uprintf("]}");
if (ptr)
free((char *) ptr, M_TEMP);
return 0;
}
static void
show_msg(str, fd, ctl, dat, flags)
const char *str;
int fd;
struct svr4_strbuf *ctl;
struct svr4_strbuf *dat;
int flags;
{
struct svr4_strbuf buf;
int error;
uprintf("%s(%d", str, fd);
if (ctl != NULL) {
if ((error = copyin(ctl, &buf, sizeof(buf))) != 0)
return;
show_strbuf(&buf);
}
else
uprintf(", NULL");
if (dat != NULL) {
if ((error = copyin(dat, &buf, sizeof(buf))) != 0)
return;
show_strbuf(&buf);
}
else
uprintf(", NULL");
uprintf(", %x);\n", flags);
}
#endif /* DEBUG_SVR4 */
/*
* We are faced with an interesting situation. On svr4 unix sockets
* are really pipes. But we really have sockets, and we might as
* well use them. At the point where svr4 calls TI_BIND, it has
* already created a named pipe for the socket using mknod(2).
* We need to create a socket with the same name when we bind,
* so we need to remove the pipe beforehand, otherwise we'll get "address
* already in use". So we *carefully* remove the pipe, to avoid
* using this as a random file removal tool. We use system calls
* to avoid code duplication.
*/
static int
clean_pipe(td, path)
struct thread *td;
char *path;
{
struct stat st;
int error;
error = kern_lstat(td, path, UIO_SYSSPACE, &st);
/*
* Make sure we are dealing with a mode 0 named pipe.
*/
if ((st.st_mode & S_IFMT) != S_IFIFO)
return (0);
if ((st.st_mode & ALLPERMS) != 0)
return (0);
error = kern_unlink(td, path, UIO_SYSSPACE);
if (error)
DPRINTF(("clean_pipe: unlink failed %d\n", error));
return (error);
}
static void
sockaddr_to_netaddr_in(sc, sain)
struct svr4_strmcmd *sc;
const struct sockaddr_in *sain;
{
struct svr4_netaddr_in *na;
na = SVR4_ADDROF(sc);
na->family = sain->sin_family;
na->port = sain->sin_port;
na->addr = sain->sin_addr.s_addr;
DPRINTF(("sockaddr_in -> netaddr %d %d %lx\n", na->family, na->port,
na->addr));
}
static void
sockaddr_to_netaddr_un(sc, saun)
struct svr4_strmcmd *sc;
const struct sockaddr_un *saun;
{
struct svr4_netaddr_un *na;
char *dst, *edst = ((char *) sc) + sc->offs + sizeof(na->family) + 1 -
sizeof(*sc);
const char *src;
na = SVR4_ADDROF(sc);
na->family = saun->sun_family;
for (src = saun->sun_path, dst = na->path; (*dst++ = *src++) != '\0'; )
if (dst == edst)
break;
DPRINTF(("sockaddr_un -> netaddr %d %s\n", na->family, na->path));
}
static void
netaddr_to_sockaddr_in(sain, sc)
struct sockaddr_in *sain;
const struct svr4_strmcmd *sc;
{
const struct svr4_netaddr_in *na;
na = SVR4_C_ADDROF(sc);
memset(sain, 0, sizeof(*sain));
sain->sin_len = sizeof(*sain);
sain->sin_family = na->family;
sain->sin_port = na->port;
sain->sin_addr.s_addr = na->addr;
DPRINTF(("netaddr -> sockaddr_in %d %d %x\n", sain->sin_family,
sain->sin_port, sain->sin_addr.s_addr));
}
static void
netaddr_to_sockaddr_un(saun, sc)
struct sockaddr_un *saun;
const struct svr4_strmcmd *sc;
{
const struct svr4_netaddr_un *na;
char *dst, *edst = &saun->sun_path[sizeof(saun->sun_path) - 1];
const char *src;
na = SVR4_C_ADDROF(sc);
memset(saun, 0, sizeof(*saun));
saun->sun_family = na->family;
for (src = na->path, dst = saun->sun_path; (*dst++ = *src++) != '\0'; )
if (dst == edst)
break;
saun->sun_len = dst - saun->sun_path;
DPRINTF(("netaddr -> sockaddr_un %d %s\n", saun->sun_family,
saun->sun_path));
}
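/*
 * Fill in the SVR4 socket parameters (family, type, protocol) for the
 * socket behind fp; unknown socket types report type 0 and protocol 0.
 */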
static void
getparm(fp, pa)
struct file *fp;
struct svr4_si_sockparms *pa;
{
struct svr4_strm *st;
struct socket *so;
st = svr4_stream_get(fp);
if (st == NULL)
return;
so = fp->f_data;
pa->family = st->s_family;
switch (so->so_type) {
case SOCK_DGRAM:
pa->type = SVR4_T_CLTS;
pa->protocol = IPPROTO_UDP;
DPRINTF(("getparm(dgram)\n"));
return;
case SOCK_STREAM:
pa->type = SVR4_T_COTS; /* What about T_COTS_ORD? XXX */
pa->protocol = IPPROTO_IP;
DPRINTF(("getparm(stream)\n"));
return;
case SOCK_RAW:
pa->type = SVR4_T_CLTS;
pa->protocol = IPPROTO_RAW;
DPRINTF(("getparm(raw)\n"));
return;
default:
pa->type = 0;
pa->protocol = 0;
DPRINTF(("getparm(type %d?)\n", so->so_type));
return;
}
}
static int
si_ogetudata(fp, fd, ioc, td)
struct file *fp;
int fd;
struct svr4_strioctl *ioc;
struct thread *td;
{
int error;
struct svr4_si_oudata ud;
struct svr4_si_sockparms pa;
if (ioc->len != sizeof(ud) && ioc->len != sizeof(ud) - sizeof(int)) {
DPRINTF(("SI_OGETUDATA: Wrong size %d != %d\n",
sizeof(ud), ioc->len));
return EINVAL;
}
if ((error = copyin(ioc->buf, &ud, sizeof(ud))) != 0)
return error;
getparm(fp, &pa);
switch (pa.family) {
case AF_INET:
ud.tidusize = 16384;
ud.addrsize = sizeof(struct svr4_sockaddr_in);
if (pa.type == SVR4_SOCK_STREAM)
ud.etsdusize = 1;
else
ud.etsdusize = 0;
break;
case AF_LOCAL:
ud.tidusize = 65536;
ud.addrsize = 128;
ud.etsdusize = 128;
break;
default:
DPRINTF(("SI_OGETUDATA: Unsupported address family %d\n",
pa.family));
return ENOSYS;
}
/* I have no idea what these should be! */
ud.optsize = 128;
ud.tsdusize = 128;
ud.servtype = pa.type;
/* XXX: Fixme */
ud.so_state = 0;
ud.so_options = 0;
return copyout(&ud, ioc->buf, ioc->len);
}
static int
si_sockparams(fp, fd, ioc, td)
struct file *fp;
int fd;
struct svr4_strioctl *ioc;
struct thread *td;
{
struct svr4_si_sockparms pa;
getparm(fp, &pa);
return copyout(&pa, ioc->buf, sizeof(pa));
}
static int
si_listen(fp, fd, ioc, td)
struct file *fp;
int fd;
struct svr4_strioctl *ioc;
struct thread *td;
{
int error;
struct svr4_strm *st = svr4_stream_get(fp);
struct svr4_strmcmd lst;
struct listen_args la;
if (st == NULL)
return EINVAL;
if (ioc->len < 0 || ioc->len > sizeof(lst))
return EINVAL;
if ((error = copyin(ioc->buf, &lst, ioc->len)) != 0)
return error;
if (lst.cmd != SVR4_TI_OLD_BIND_REQUEST) {
DPRINTF(("si_listen: bad request %ld\n", lst.cmd));
return EINVAL;
}
/*
* We are making assumptions again...
*/
la.s = fd;
DPRINTF(("SI_LISTEN: fileno %d backlog = %d\n", fd, 5));
la.backlog = 5;
- if ((error = listen(td, &la)) != 0) {
+ if ((error = sys_listen(td, &la)) != 0) {
DPRINTF(("SI_LISTEN: listen failed %d\n", error));
return error;
}
st->s_cmd = SVR4_TI__ACCEPT_WAIT;
lst.cmd = SVR4_TI_BIND_REPLY;
switch (st->s_family) {
case AF_INET:
/* XXX: Fill the length here */
break;
case AF_LOCAL:
lst.len = 140;
lst.pad[28] = 0x00000000; /* magic again */
lst.pad[29] = 0x00000800; /* magic again */
lst.pad[30] = 0x80001400; /* magic again */
break;
default:
DPRINTF(("SI_LISTEN: Unsupported address family %d\n",
st->s_family));
return ENOSYS;
}
if ((error = copyout(&lst, ioc->buf, ioc->len)) != 0)
return error;
return 0;
}
static int
si_getudata(fp, fd, ioc, td)
struct file *fp;
int fd;
struct svr4_strioctl *ioc;
struct thread *td;
{
int error;
struct svr4_si_udata ud;
if (sizeof(ud) != ioc->len) {
DPRINTF(("SI_GETUDATA: Wrong size %d != %d\n",
sizeof(ud), ioc->len));
return EINVAL;
}
if ((error = copyin(ioc->buf, &ud, sizeof(ud))) != 0)
return error;
getparm(fp, &ud.sockparms);
switch (ud.sockparms.family) {
case AF_INET:
DPRINTF(("getudata_inet\n"));
ud.tidusize = 16384;
ud.tsdusize = 16384;
ud.addrsize = sizeof(struct svr4_sockaddr_in);
if (ud.sockparms.type == SVR4_SOCK_STREAM)
ud.etsdusize = 1;
else
ud.etsdusize = 0;
ud.optsize = 0;
break;
case AF_LOCAL:
DPRINTF(("getudata_local\n"));
ud.tidusize = 65536;
ud.tsdusize = 128;
ud.addrsize = 128;
ud.etsdusize = 128;
ud.optsize = 128;
break;
default:
DPRINTF(("SI_GETUDATA: Unsupported address family %d\n",
ud.sockparms.family));
return ENOSYS;
}
ud.servtype = ud.sockparms.type;
DPRINTF(("ud.servtype = %d\n", ud.servtype));
/* XXX: Fixme */
ud.so_state = 0;
ud.so_options = 0;
return copyout(&ud, ioc->buf, sizeof(ud));
}
static int
si_shutdown(fp, fd, ioc, td)
struct file *fp;
int fd;
struct svr4_strioctl *ioc;
struct thread *td;
{
int error;
struct shutdown_args ap;
if (ioc->len != sizeof(ap.how)) {
DPRINTF(("SI_SHUTDOWN: Wrong size %d != %d\n",
sizeof(ap.how), ioc->len));
return EINVAL;
}
if ((error = copyin(ioc->buf, &ap.how, ioc->len)) != 0)
return error;
ap.s = fd;
- return shutdown(td, &ap);
+ return sys_shutdown(td, &ap);
}
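/*
 * Dispatch "sockmod" I_STR subcommands.  Requests we only pretend to
 * support simply return success.
 */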
static int
sockmod(fp, fd, ioc, td)
struct file *fp;
int fd;
struct svr4_strioctl *ioc;
struct thread *td;
{
switch (ioc->cmd) {
case SVR4_SI_OGETUDATA:
DPRINTF(("SI_OGETUDATA\n"));
return si_ogetudata(fp, fd, ioc, td);
case SVR4_SI_SHUTDOWN:
DPRINTF(("SI_SHUTDOWN\n"));
return si_shutdown(fp, fd, ioc, td);
case SVR4_SI_LISTEN:
DPRINTF(("SI_LISTEN\n"));
return si_listen(fp, fd, ioc, td);
case SVR4_SI_SETMYNAME:
DPRINTF(("SI_SETMYNAME\n"));
return 0;
case SVR4_SI_SETPEERNAME:
DPRINTF(("SI_SETPEERNAME\n"));
return 0;
case SVR4_SI_GETINTRANSIT:
DPRINTF(("SI_GETINTRANSIT\n"));
return 0;
case SVR4_SI_TCL_LINK:
DPRINTF(("SI_TCL_LINK\n"));
return 0;
case SVR4_SI_TCL_UNLINK:
DPRINTF(("SI_TCL_UNLINK\n"));
return 0;
case SVR4_SI_SOCKPARAMS:
DPRINTF(("SI_SOCKPARAMS\n"));
return si_sockparams(fp, fd, ioc, td);
case SVR4_SI_GETUDATA:
DPRINTF(("SI_GETUDATA\n"));
return si_getudata(fp, fd, ioc, td);
default:
DPRINTF(("Unknown sockmod ioctl %lx\n", ioc->cmd));
return 0;
}
}
static int
ti_getinfo(fp, fd, ioc, td)
struct file *fp;
int fd;
struct svr4_strioctl *ioc;
struct thread *td;
{
int error;
struct svr4_infocmd info;
memset(&info, 0, sizeof(info));
if (ioc->len < 0 || ioc->len > sizeof(info))
return EINVAL;
if ((error = copyin(ioc->buf, &info, ioc->len)) != 0)
return error;
if (info.cmd != SVR4_TI_INFO_REQUEST)
return EINVAL;
info.cmd = SVR4_TI_INFO_REPLY;
info.tsdu = 0;
info.etsdu = 1;
info.cdata = -2;
info.ddata = -2;
info.addr = 16;
info.opt = -1;
info.tidu = 16384;
info.serv = 2;
info.current = 0;
info.provider = 2;
ioc->len = sizeof(info);
if ((error = copyout(&info, ioc->buf, ioc->len)) != 0)
return error;
return 0;
}
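/*
 * TI_BIND: convert the SVR4 netaddr in the request into a sockaddr and
 * bind the underlying socket.  For AF_LOCAL the leftover named pipe is
 * removed first (see clean_pipe()), then a TI_BIND_REPLY is returned.
 */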
static int
ti_bind(fp, fd, ioc, td)
struct file *fp;
int fd;
struct svr4_strioctl *ioc;
struct thread *td;
{
int error;
struct svr4_strm *st = svr4_stream_get(fp);
struct sockaddr_in sain;
struct sockaddr_un saun;
struct sockaddr *skp;
int sasize;
struct svr4_strmcmd bnd;
if (st == NULL) {
DPRINTF(("ti_bind: bad file descriptor\n"));
return EINVAL;
}
if (ioc->len < 0 || ioc->len > sizeof(bnd))
return EINVAL;
if ((error = copyin(ioc->buf, &bnd, ioc->len)) != 0)
return error;
if (bnd.cmd != SVR4_TI_OLD_BIND_REQUEST) {
DPRINTF(("ti_bind: bad request %ld\n", bnd.cmd));
return EINVAL;
}
switch (st->s_family) {
case AF_INET:
skp = (struct sockaddr *)&sain;
sasize = sizeof(sain);
if (bnd.offs == 0)
goto error;
netaddr_to_sockaddr_in(&sain, &bnd);
DPRINTF(("TI_BIND: fam %d, port %d, addr %x\n",
sain.sin_family, sain.sin_port,
sain.sin_addr.s_addr));
break;
case AF_LOCAL:
skp = (struct sockaddr *)&saun;
sasize = sizeof(saun);
if (bnd.offs == 0)
goto error;
netaddr_to_sockaddr_un(&saun, &bnd);
if (saun.sun_path[0] == '\0')
goto error;
DPRINTF(("TI_BIND: fam %d, path %s\n",
saun.sun_family, saun.sun_path));
if ((error = clean_pipe(td, saun.sun_path)) != 0)
return error;
bnd.pad[28] = 0x00001000; /* magic again */
break;
default:
DPRINTF(("TI_BIND: Unsupported address family %d\n",
st->s_family));
return ENOSYS;
}
DPRINTF(("TI_BIND: fileno %d\n", fd));
if ((error = kern_bind(td, fd, skp)) != 0) {
DPRINTF(("TI_BIND: bind failed %d\n", error));
return error;
}
goto reply;
error:
memset(&bnd, 0, sizeof(bnd));
bnd.len = sasize + 4;
bnd.offs = 0x10; /* XXX */
reply:
bnd.cmd = SVR4_TI_BIND_REPLY;
if ((error = copyout(&bnd, ioc->buf, ioc->len)) != 0)
return error;
return 0;
}
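/*
 * Dispatch "timod" I_STR subcommands; only TI_GETINFO and TI_BIND do
 * real work, the rest are accepted as no-ops.
 */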
static int
timod(fp, fd, ioc, td)
struct file *fp;
int fd;
struct svr4_strioctl *ioc;
struct thread *td;
{
switch (ioc->cmd) {
case SVR4_TI_GETINFO:
DPRINTF(("TI_GETINFO\n"));
return ti_getinfo(fp, fd, ioc, td);
case SVR4_TI_OPTMGMT:
DPRINTF(("TI_OPTMGMT\n"));
return 0;
case SVR4_TI_BIND:
DPRINTF(("TI_BIND\n"));
return ti_bind(fp, fd, ioc, td);
case SVR4_TI_UNBIND:
DPRINTF(("TI_UNBIND\n"));
return 0;
default:
DPRINTF(("Unknown timod ioctl %lx\n", ioc->cmd));
return 0;
}
}
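/*
 * Handle the TI_GETMYNAME/TI_GETPEERNAME stream ioctls by translating
 * the result of getsockname(2)/getpeername(2) into an SVR4 netaddr and
 * copying it out through the caller's strbuf.
 */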
int
svr4_stream_ti_ioctl(fp, td, retval, fd, cmd, dat)
struct file *fp;
struct thread *td;
register_t *retval;
int fd;
u_long cmd;
caddr_t dat;
{
struct svr4_strbuf skb, *sub = (struct svr4_strbuf *) dat;
struct svr4_strm *st = svr4_stream_get(fp);
int error;
struct sockaddr *sa;
socklen_t sasize, oldsasize;
struct svr4_strmcmd sc;
DPRINTF(("svr4_stream_ti_ioctl\n"));
if (st == NULL)
return EINVAL;
sc.offs = 0x10;
if ((error = copyin(sub, &skb, sizeof(skb))) != 0) {
DPRINTF(("ti_ioctl: error copying in strbuf\n"));
return error;
}
switch (st->s_family) {
case AF_INET:
sasize = sizeof(struct sockaddr_in);
break;
case AF_LOCAL:
sasize = sizeof(struct sockaddr_un);
break;
default:
DPRINTF(("ti_ioctl: Unsupported address family %d\n",
st->s_family));
return ENOSYS;
}
oldsasize = sasize;
switch (cmd) {
case SVR4_TI_GETMYNAME:
DPRINTF(("TI_GETMYNAME\n"));
{
error = kern_getsockname(td, fd, &sa, &sasize);
if (error) {
DPRINTF(("ti_ioctl: getsockname error\n"));
return error;
}
}
break;
case SVR4_TI_GETPEERNAME:
DPRINTF(("TI_GETPEERNAME\n"));
{
error = kern_getpeername(td, fd, &sa, &sasize);
if (error) {
DPRINTF(("ti_ioctl: getpeername error\n"));
return error;
}
}
break;
case SVR4_TI_SETMYNAME:
DPRINTF(("TI_SETMYNAME\n"));
return 0;
case SVR4_TI_SETPEERNAME:
DPRINTF(("TI_SETPEERNAME\n"));
return 0;
default:
DPRINTF(("ti_ioctl: Unknown ioctl %lx\n", cmd));
return ENOSYS;
}
if (sasize < 0 || sasize > oldsasize) {
free(sa, M_SONAME);
return EINVAL;
}
switch (st->s_family) {
case AF_INET:
sockaddr_to_netaddr_in(&sc, (struct sockaddr_in *)sa);
skb.len = sasize;
break;
case AF_LOCAL:
sockaddr_to_netaddr_un(&sc, (struct sockaddr_un *)sa);
skb.len = sasize + 4;
break;
default:
free(sa, M_SONAME);
return ENOSYS;
}
free(sa, M_SONAME);
if ((error = copyout(SVR4_ADDROF(&sc), skb.buf, sasize)) != 0) {
DPRINTF(("ti_ioctl: error copying out socket data\n"));
return error;
}
if ((error = copyout(&skb, sub, sizeof(skb))) != 0) {
DPRINTF(("ti_ioctl: error copying out strbuf\n"));
return error;
}
return error;
}
static int
i_nread(fp, td, retval, fd, cmd, dat)
struct file *fp;
struct thread *td;
register_t *retval;
int fd;
u_long cmd;
caddr_t dat;
{
int error;
int nread = 0;
/*
* We are supposed to return the message length in nread, and the
* number of messages in retval. We don't have the notion of number
* of stream messages, so we just find out if we have any bytes waiting
* for us, and if we do, then we assume that we have at least one
* message waiting for us.
*/
if ((error = fo_ioctl(fp, FIONREAD, (caddr_t) &nread, td->td_ucred,
td)) != 0)
return error;
if (nread != 0)
*retval = 1;
else
*retval = 0;
return copyout(&nread, dat, sizeof(nread));
}
static int
i_fdinsert(fp, td, retval, fd, cmd, dat)
struct file *fp;
struct thread *td;
register_t *retval;
int fd;
u_long cmd;
caddr_t dat;
{
/*
* Major hack again here. We assume that we are using this to
* implement accept(2). If that is the case, we have already
* called accept, and we have stored the file descriptor in
* afd. We find the file descriptor that the code wants to use
* in fdinsert, and then we dup2() our accepted file descriptor
* to it.
*/
int error;
struct svr4_strm *st = svr4_stream_get(fp);
struct svr4_strfdinsert fdi;
struct dup2_args d2p;
if (st == NULL) {
DPRINTF(("fdinsert: bad file type\n"));
return EINVAL;
}
mtx_lock(&Giant);
if (st->s_afd == -1) {
DPRINTF(("fdinsert: accept fd not found\n"));
mtx_unlock(&Giant);
return ENOENT;
}
if ((error = copyin(dat, &fdi, sizeof(fdi))) != 0) {
DPRINTF(("fdinsert: copyin failed %d\n", error));
mtx_unlock(&Giant);
return error;
}
d2p.from = st->s_afd;
d2p.to = fdi.fd;
- if ((error = dup2(td, &d2p)) != 0) {
+ if ((error = sys_dup2(td, &d2p)) != 0) {
DPRINTF(("fdinsert: dup2(%d, %d) failed %d\n",
st->s_afd, fdi.fd, error));
mtx_unlock(&Giant);
return error;
}
if ((error = kern_close(td, st->s_afd)) != 0) {
DPRINTF(("fdinsert: close(%d) failed %d\n",
st->s_afd, error));
mtx_unlock(&Giant);
return error;
}
st->s_afd = -1;
mtx_unlock(&Giant);
*retval = 0;
return 0;
}
static int
_i_bind_rsvd(fp, td, retval, fd, cmd, dat)
struct file *fp;
struct thread *td;
register_t *retval;
int fd;
u_long cmd;
caddr_t dat;
{
struct mkfifo_args ap;
/*
* This is supposed to be a kernel- and library-only ioctl.
* It gets called before ti_bind, when we have a unix
* socket, to physically create the socket transport and
* ``reserve'' it. I don't know how this gets reserved inside
* the kernel, but we are going to create it nevertheless.
*/
ap.path = dat;
ap.mode = S_IFIFO;
- return mkfifo(td, &ap);
+ return sys_mkfifo(td, &ap);
}
static int
_i_rele_rsvd(fp, td, retval, fd, cmd, dat)
struct file *fp;
struct thread *td;
register_t *retval;
int fd;
u_long cmd;
caddr_t dat;
{
struct unlink_args ap;
/*
* This is supposed to be a kernel- and library-only ioctl.
* I guess it is supposed to release the socket.
*/
ap.path = dat;
- return unlink(td, &ap);
+ return sys_unlink(td, &ap);
}
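/*
 * I_STR: copy in the svr4_strioctl, dispatch on the module encoded in
 * the high byte of the command (sockmod or timod) and copy the possibly
 * updated structure back out.
 */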
static int
i_str(fp, td, retval, fd, cmd, dat)
struct file *fp;
struct thread *td;
register_t *retval;
int fd;
u_long cmd;
caddr_t dat;
{
int error;
struct svr4_strioctl ioc;
if ((error = copyin(dat, &ioc, sizeof(ioc))) != 0)
return error;
#ifdef DEBUG_SVR4
if ((error = show_ioc(">", &ioc)) != 0)
return error;
#endif /* DEBUG_SVR4 */
switch (ioc.cmd & 0xff00) {
case SVR4_SIMOD:
if ((error = sockmod(fp, fd, &ioc, td)) != 0)
return error;
break;
case SVR4_TIMOD:
if ((error = timod(fp, fd, &ioc, td)) != 0)
return error;
break;
default:
DPRINTF(("Unimplemented module %c %ld\n",
(char) (cmd >> 8), cmd & 0xff));
return 0;
}
#ifdef DEBUG_SVR4
if ((error = show_ioc("<", &ioc)) != 0)
return error;
#endif /* DEBUG_SVR4 */
return copyout(&ioc, dat, sizeof(ioc));
}
static int
i_setsig(fp, td, retval, fd, cmd, dat)
struct file *fp;
struct thread *td;
register_t *retval;
int fd;
u_long cmd;
caddr_t dat;
{
/*
* This is the best we can do for now; we cannot generate
* signals only for specific events so the signal mask gets
* ignored; we save it just to pass it to a possible I_GETSIG...
*
* We also have to fix the O_ASYNC fcntl bit, so the
* process will get SIGPOLLs.
*/
int error;
register_t oflags, flags;
struct svr4_strm *st = svr4_stream_get(fp);
if (st == NULL) {
DPRINTF(("i_setsig: bad file descriptor\n"));
return EINVAL;
}
/* get old status flags */
error = kern_fcntl(td, fd, F_GETFL, 0);
if (error)
return (error);
oflags = td->td_retval[0];
/* update the flags */
mtx_lock(&Giant);
if (dat != NULL) {
int mask;
flags = oflags | O_ASYNC;
if ((error = copyin(dat, &mask, sizeof(mask))) != 0) {
DPRINTF(("i_setsig: bad eventmask pointer\n"));
return error;
}
if (mask & SVR4_S_ALLMASK) {
DPRINTF(("i_setsig: bad eventmask data %x\n", mask));
return EINVAL;
}
st->s_eventmask = mask;
}
else {
flags = oflags & ~O_ASYNC;
st->s_eventmask = 0;
}
mtx_unlock(&Giant);
/* set the new flags, if changed */
if (flags != oflags) {
error = kern_fcntl(td, fd, F_SETFL, flags);
if (error)
return (error);
flags = td->td_retval[0];
}
/* set up SIGIO receiver if needed */
if (dat != NULL)
return (kern_fcntl(td, fd, F_SETOWN, td->td_proc->p_pid));
return 0;
}
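/*
 * I_GETSIG: return the event mask saved by a previous I_SETSIG.
 */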
static int
i_getsig(fp, td, retval, fd, cmd, dat)
struct file *fp;
struct thread *td;
register_t *retval;
int fd;
u_long cmd;
caddr_t dat;
{
int error, eventmask;
if (dat != NULL) {
struct svr4_strm *st = svr4_stream_get(fp);
if (st == NULL) {
DPRINTF(("i_getsig: bad file descriptor\n"));
return EINVAL;
}
mtx_lock(&Giant);
eventmask = st->s_eventmask;
mtx_unlock(&Giant);
if ((error = copyout(&eventmask, dat,
sizeof(eventmask))) != 0) {
DPRINTF(("i_getsig: bad eventmask pointer\n"));
return error;
}
}
return 0;
}
int
svr4_stream_ioctl(fp, td, retval, fd, cmd, dat)
struct file *fp;
struct thread *td;
register_t *retval;
int fd;
u_long cmd;
caddr_t dat;
{
*retval = 0;
/*
* All the following stuff assumes "sockmod" is pushed...
*/
switch (cmd) {
case SVR4_I_NREAD:
DPRINTF(("I_NREAD\n"));
return i_nread(fp, td, retval, fd, cmd, dat);
case SVR4_I_PUSH:
DPRINTF(("I_PUSH %p\n", dat));
#if defined(DEBUG_SVR4)
show_strbuf((struct svr4_strbuf *)dat);
#endif
return 0;
case SVR4_I_POP:
DPRINTF(("I_POP\n"));
return 0;
case SVR4_I_LOOK:
DPRINTF(("I_LOOK\n"));
return 0;
case SVR4_I_FLUSH:
DPRINTF(("I_FLUSH\n"));
return 0;
case SVR4_I_SRDOPT:
DPRINTF(("I_SRDOPT\n"));
return 0;
case SVR4_I_GRDOPT:
DPRINTF(("I_GRDOPT\n"));
return 0;
case SVR4_I_STR:
DPRINTF(("I_STR\n"));
return i_str(fp, td, retval, fd, cmd, dat);
case SVR4_I_SETSIG:
DPRINTF(("I_SETSIG\n"));
return i_setsig(fp, td, retval, fd, cmd, dat);
case SVR4_I_GETSIG:
DPRINTF(("I_GETSIG\n"));
return i_getsig(fp, td, retval, fd, cmd, dat);
case SVR4_I_FIND:
DPRINTF(("I_FIND\n"));
/*
* Here we are not pushing modules really, we just
* pretend all are present
*/
*retval = 0;
return 0;
case SVR4_I_LINK:
DPRINTF(("I_LINK\n"));
return 0;
case SVR4_I_UNLINK:
DPRINTF(("I_UNLINK\n"));
return 0;
case SVR4_I_ERECVFD:
DPRINTF(("I_ERECVFD\n"));
return 0;
case SVR4_I_PEEK:
DPRINTF(("I_PEEK\n"));
return 0;
case SVR4_I_FDINSERT:
DPRINTF(("I_FDINSERT\n"));
return i_fdinsert(fp, td, retval, fd, cmd, dat);
case SVR4_I_SENDFD:
DPRINTF(("I_SENDFD\n"));
return 0;
case SVR4_I_RECVFD:
DPRINTF(("I_RECVFD\n"));
return 0;
case SVR4_I_SWROPT:
DPRINTF(("I_SWROPT\n"));
return 0;
case SVR4_I_GWROPT:
DPRINTF(("I_GWROPT\n"));
return 0;
case SVR4_I_LIST:
DPRINTF(("I_LIST\n"));
return 0;
case SVR4_I_PLINK:
DPRINTF(("I_PLINK\n"));
return 0;
case SVR4_I_PUNLINK:
DPRINTF(("I_PUNLINK\n"));
return 0;
case SVR4_I_SETEV:
DPRINTF(("I_SETEV\n"));
return 0;
case SVR4_I_GETEV:
DPRINTF(("I_GETEV\n"));
return 0;
case SVR4_I_STREV:
DPRINTF(("I_STREV\n"));
return 0;
case SVR4_I_UNSTREV:
DPRINTF(("I_UNSTREV\n"));
return 0;
case SVR4_I_FLUSHBAND:
DPRINTF(("I_FLUSHBAND\n"));
return 0;
case SVR4_I_CKBAND:
DPRINTF(("I_CKBAND\n"));
return 0;
case SVR4_I_GETBAND:
DPRINTF(("I_GETBANK\n"));
return 0;
case SVR4_I_ATMARK:
DPRINTF(("I_ATMARK\n"));
return 0;
case SVR4_I_SETCLTIME:
DPRINTF(("I_SETCLTIME\n"));
return 0;
case SVR4_I_GETCLTIME:
DPRINTF(("I_GETCLTIME\n"));
return 0;
case SVR4_I_CANPUT:
DPRINTF(("I_CANPUT\n"));
return 0;
case SVR4__I_BIND_RSVD:
DPRINTF(("_I_BIND_RSVD\n"));
return _i_bind_rsvd(fp, td, retval, fd, cmd, dat);
case SVR4__I_RELE_RSVD:
DPRINTF(("_I_RELE_RSVD\n"));
return _i_rele_rsvd(fp, td, retval, fd, cmd, dat);
default:
DPRINTF(("unimpl cmd = %lx\n", cmd));
break;
}
return 0;
}
int
svr4_sys_putmsg(td, uap)
struct thread *td;
struct svr4_sys_putmsg_args *uap;
{
struct file *fp;
int error;
if ((error = fget(td, uap->fd, CAP_WRITE, &fp)) != 0) {
#ifdef DEBUG_SVR4
uprintf("putmsg: bad fp\n");
#endif
return EBADF;
}
error = svr4_do_putmsg(td, uap, fp);
fdrop(fp, td);
return (error);
}
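/*
 * Back end for putmsg(2) on sockets: interpret the control message as a
 * TI request and turn TI_CONNECT_REQUEST/TI_SENDTO_REQUEST into
 * connect(2)/sendto(2) on the underlying socket.
 */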
static int
svr4_do_putmsg(td, uap, fp)
struct thread *td;
struct svr4_sys_putmsg_args *uap;
struct file *fp;
{
struct svr4_strbuf dat, ctl;
struct svr4_strmcmd sc;
struct sockaddr_in sain;
struct sockaddr_un saun;
struct sockaddr *sa;
int sasize, *retval;
struct svr4_strm *st;
int error;
retval = td->td_retval;
#ifdef DEBUG_SVR4
show_msg(">putmsg", uap->fd, uap->ctl,
uap->dat, uap->flags);
#endif /* DEBUG_SVR4 */
if (uap->ctl != NULL) {
if ((error = copyin(uap->ctl, &ctl, sizeof(ctl))) != 0) {
#ifdef DEBUG_SVR4
uprintf("putmsg: copyin(): %d\n", error);
#endif
return error;
}
}
else
ctl.len = -1;
if (uap->dat != NULL) {
if ((error = copyin(uap->dat, &dat, sizeof(dat))) != 0) {
#ifdef DEBUG_SVR4
uprintf("putmsg: copyin(): %d (2)\n", error);
#endif
return error;
}
}
else
dat.len = -1;
/*
* Only for sockets for now.
*/
if ((st = svr4_stream_get(fp)) == NULL) {
DPRINTF(("putmsg: bad file type\n"));
return EINVAL;
}
if (ctl.len < 0 || ctl.len > sizeof(sc)) {
DPRINTF(("putmsg: Bad control size %d != %d\n", ctl.len,
sizeof(struct svr4_strmcmd)));
return EINVAL;
}
if ((error = copyin(ctl.buf, &sc, ctl.len)) != 0)
return error;
switch (st->s_family) {
case AF_INET:
if (sc.len != sizeof(sain)) {
if (sc.cmd == SVR4_TI_DATA_REQUEST) {
struct write_args wa;
/* Solaris seems to use sc.cmd = 3 to
* send "expedited" data. telnet uses
* this for options processing, sending EOF,
* etc. I'm sure other things use it too.
* I don't have any documentation
* on it, so I'm making a guess that this
* is how it works. newton@atdot.dotat.org XXX
*/
DPRINTF(("sending expedited data ??\n"));
wa.fd = uap->fd;
wa.buf = dat.buf;
wa.nbyte = dat.len;
- return write(td, &wa);
+ return sys_write(td, &wa);
}
DPRINTF(("putmsg: Invalid inet length %ld\n", sc.len));
return EINVAL;
}
netaddr_to_sockaddr_in(&sain, &sc);
sa = (struct sockaddr *)&sain;
sasize = sizeof(sain);
if (sain.sin_family != st->s_family)
error = EINVAL;
break;
case AF_LOCAL:
if (ctl.len == 8) {
/* We are doing an accept; succeed */
DPRINTF(("putmsg: Do nothing\n"));
*retval = 0;
return 0;
}
else {
/* Maybe we've been given a device/inode pair */
dev_t *dev = SVR4_ADDROF(&sc);
ino_t *ino = (ino_t *) &dev[1];
if (svr4_find_socket(td, fp, *dev, *ino, &saun) != 0) {
/* I guess we have it by name */
netaddr_to_sockaddr_un(&saun, &sc);
}
sa = (struct sockaddr *)&saun;
sasize = sizeof(saun);
}
break;
default:
DPRINTF(("putmsg: Unsupported address family %d\n",
st->s_family));
return ENOSYS;
}
mtx_lock(&Giant);
st->s_cmd = sc.cmd;
mtx_unlock(&Giant);
switch (sc.cmd) {
case SVR4_TI_CONNECT_REQUEST: /* connect */
{
return (kern_connect(td, uap->fd, sa));
}
case SVR4_TI_SENDTO_REQUEST: /* sendto */
{
struct msghdr msg;
struct iovec aiov;
msg.msg_name = sa;
msg.msg_namelen = sasize;
msg.msg_iov = &aiov;
msg.msg_iovlen = 1;
msg.msg_control = 0;
msg.msg_flags = 0;
aiov.iov_base = dat.buf;
aiov.iov_len = dat.len;
error = kern_sendit(td, uap->fd, &msg, uap->flags,
NULL, UIO_USERSPACE);
DPRINTF(("sendto_request error: %d\n", error));
*retval = 0;
return error;
}
default:
DPRINTF(("putmsg: Unimplemented command %lx\n", sc.cmd));
return ENOSYS;
}
}
int
svr4_sys_getmsg(td, uap)
struct thread *td;
struct svr4_sys_getmsg_args *uap;
{
struct file *fp;
int error;
if ((error = fget(td, uap->fd, CAP_READ, &fp)) != 0) {
#ifdef DEBUG_SVR4
uprintf("getmsg: bad fp\n");
#endif
return EBADF;
}
error = svr4_do_getmsg(td, uap, fp);
fdrop(fp, td);
return (error);
}
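/*
 * Back end for getmsg(2) on sockets: drive the small putmsg/getmsg state
 * machine kept in st->s_cmd and synthesize the matching TI replies
 * (connect, accept, recvfrom) from the socket state.
 */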
int
svr4_do_getmsg(td, uap, fp)
struct thread *td;
struct svr4_sys_getmsg_args *uap;
struct file *fp;
{
struct svr4_strbuf dat, ctl;
struct svr4_strmcmd sc;
int error, *retval;
struct msghdr msg;
struct iovec aiov;
struct sockaddr_in sain;
struct sockaddr_un saun;
struct sockaddr *sa;
socklen_t sasize;
struct svr4_strm *st;
struct file *afp;
int fl;
retval = td->td_retval;
error = 0;
afp = NULL;
memset(&sc, 0, sizeof(sc));
#ifdef DEBUG_SVR4
show_msg(">getmsg", uap->fd, uap->ctl,
uap->dat, 0);
#endif /* DEBUG_SVR4 */
if (uap->ctl != NULL) {
if ((error = copyin(uap->ctl, &ctl, sizeof(ctl))) != 0)
return error;
if (ctl.len < 0)
return EINVAL;
}
else {
ctl.len = -1;
ctl.maxlen = 0;
}
if (uap->dat != NULL) {
if ((error = copyin(uap->dat, &dat, sizeof(dat))) != 0)
return error;
}
else {
dat.len = -1;
dat.maxlen = 0;
}
/*
* Only for sockets for now.
*/
if ((st = svr4_stream_get(fp)) == NULL) {
DPRINTF(("getmsg: bad file type\n"));
return EINVAL;
}
if (ctl.maxlen == -1 || dat.maxlen == -1) {
DPRINTF(("getmsg: Cannot handle -1 maxlen (yet)\n"));
return ENOSYS;
}
switch (st->s_family) {
case AF_INET:
sasize = sizeof(sain);
break;
case AF_LOCAL:
sasize = sizeof(saun);
break;
default:
DPRINTF(("getmsg: Unsupported address family %d\n",
st->s_family));
return ENOSYS;
}
mtx_lock(&Giant);
switch (st->s_cmd) {
case SVR4_TI_CONNECT_REQUEST:
DPRINTF(("getmsg: TI_CONNECT_REQUEST\n"));
/*
* We do the connect in one step, so the putmsg should
* have gotten the error.
*/
sc.cmd = SVR4_TI_OK_REPLY;
sc.len = 0;
ctl.len = 8;
dat.len = -1;
fl = 1;
st->s_cmd = sc.cmd;
break;
case SVR4_TI_OK_REPLY:
DPRINTF(("getmsg: TI_OK_REPLY\n"));
/*
* We are immediately after a connect reply, so we send
* a connect verification.
*/
error = kern_getpeername(td, uap->fd, &sa, &sasize);
if (error) {
mtx_unlock(&Giant);
DPRINTF(("getmsg: getpeername failed %d\n", error));
return error;
}
sc.cmd = SVR4_TI_CONNECT_REPLY;
sc.pad[0] = 0x4;
sc.offs = 0x18;
sc.pad[1] = 0x14;
sc.pad[2] = 0x04000402;
switch (st->s_family) {
case AF_INET:
sc.len = sasize;
sockaddr_to_netaddr_in(&sc, (struct sockaddr_in *)sa);
break;
case AF_LOCAL:
sc.len = sasize + 4;
sockaddr_to_netaddr_un(&sc, (struct sockaddr_un *)sa);
break;
default:
mtx_unlock(&Giant);
free(sa, M_SONAME);
return ENOSYS;
}
free(sa, M_SONAME);
ctl.len = 40;
dat.len = -1;
fl = 0;
st->s_cmd = sc.cmd;
break;
case SVR4_TI__ACCEPT_OK:
DPRINTF(("getmsg: TI__ACCEPT_OK\n"));
/*
* We do the connect in one step, so the putmsg should
* have gotten the error.
*/
sc.cmd = SVR4_TI_OK_REPLY;
sc.len = 1;
ctl.len = 8;
dat.len = -1;
fl = 1;
st->s_cmd = SVR4_TI__ACCEPT_WAIT;
break;
case SVR4_TI__ACCEPT_WAIT:
DPRINTF(("getmsg: TI__ACCEPT_WAIT\n"));
/*
* We are after a listen, so we try to accept...
*/
error = kern_accept(td, uap->fd, &sa, &sasize, &afp);
if (error) {
mtx_unlock(&Giant);
DPRINTF(("getmsg: accept failed %d\n", error));
return error;
}
st->s_afd = *retval;
DPRINTF(("getmsg: Accept fd = %d\n", st->s_afd));
sc.cmd = SVR4_TI_ACCEPT_REPLY;
sc.offs = 0x18;
sc.pad[0] = 0x0;
switch (st->s_family) {
case AF_INET:
sc.pad[1] = 0x28;
sockaddr_to_netaddr_in(&sc, (struct sockaddr_in *)&sa);
ctl.len = 40;
sc.len = sasize;
break;
case AF_LOCAL:
sc.pad[1] = 0x00010000;
sc.pad[2] = 0xf6bcdaa0; /* I don't know what that is */
sc.pad[3] = 0x00010000;
ctl.len = 134;
sc.len = sasize + 4;
break;
default:
fdclose(td->td_proc->p_fd, afp, st->s_afd, td);
fdrop(afp, td);
st->s_afd = -1;
mtx_unlock(&Giant);
free(sa, M_SONAME);
return ENOSYS;
}
free(sa, M_SONAME);
dat.len = -1;
fl = 0;
st->s_cmd = SVR4_TI__ACCEPT_OK;
break;
case SVR4_TI_SENDTO_REQUEST:
DPRINTF(("getmsg: TI_SENDTO_REQUEST\n"));
if (ctl.maxlen > 36 && ctl.len < 36)
ctl.len = 36;
if (ctl.len > sizeof(sc))
ctl.len = sizeof(sc);
if ((error = copyin(ctl.buf, &sc, ctl.len)) != 0) {
mtx_unlock(&Giant);
return error;
}
switch (st->s_family) {
case AF_INET:
sa = (struct sockaddr *)&sain;
sockaddr_to_netaddr_in(&sc, &sain);
break;
case AF_LOCAL:
sa = (struct sockaddr *)&saun;
sockaddr_to_netaddr_un(&sc, &saun);
break;
default:
mtx_unlock(&Giant);
return ENOSYS;
}
msg.msg_name = sa;
msg.msg_namelen = sasize;
msg.msg_iov = &aiov;
msg.msg_iovlen = 1;
msg.msg_control = 0;
aiov.iov_base = dat.buf;
aiov.iov_len = dat.maxlen;
msg.msg_flags = 0;
error = kern_recvit(td, uap->fd, &msg, UIO_SYSSPACE, NULL);
if (error) {
mtx_unlock(&Giant);
DPRINTF(("getmsg: recvit failed %d\n", error));
return error;
}
sc.cmd = SVR4_TI_RECVFROM_IND;
switch (st->s_family) {
case AF_INET:
sc.len = sasize;
sockaddr_to_netaddr_in(&sc, &sain);
break;
case AF_LOCAL:
sc.len = sasize + 4;
sockaddr_to_netaddr_un(&sc, &saun);
break;
default:
mtx_unlock(&Giant);
return ENOSYS;
}
dat.len = *retval;
fl = 0;
st->s_cmd = sc.cmd;
break;
default:
st->s_cmd = sc.cmd;
if (st->s_cmd == SVR4_TI_CONNECT_REQUEST) {
struct read_args ra;
/* More weirdness: Again, I can't find documentation
* to back this up, but when a process does a generic
* "getmsg()" call it seems that the command field is
* zero and the length of the data area is zero. I
* think processes expect getmsg() to fill in dat.len
* after reading at most dat.maxlen octets from the
* stream. Since we're using sockets I can let
* read() look after it and frob return values
* appropriately (or inappropriately :-)
* -- newton@atdot.dotat.org XXX
*/
ra.fd = uap->fd;
ra.buf = dat.buf;
ra.nbyte = dat.maxlen;
- if ((error = read(td, &ra)) != 0) {
+ if ((error = sys_read(td, &ra)) != 0) {
mtx_unlock(&Giant);
return error;
}
dat.len = *retval;
*retval = 0;
st->s_cmd = SVR4_TI_SENDTO_REQUEST;
break;
}
mtx_unlock(&Giant);
DPRINTF(("getmsg: Unknown state %x\n", st->s_cmd));
return EINVAL;
}
if (uap->ctl) {
if (ctl.len > sizeof(sc))
ctl.len = sizeof(sc);
if (ctl.len != -1)
error = copyout(&sc, ctl.buf, ctl.len);
if (error == 0)
error = copyout(&ctl, uap->ctl, sizeof(ctl));
}
if (uap->dat) {
if (error == 0)
error = copyout(&dat, uap->dat, sizeof(dat));
}
if (uap->flags) { /* XXX: Need translation */
if (error == 0)
error = copyout(&fl, uap->flags, sizeof(fl));
}
if (error) {
if (afp) {
fdclose(td->td_proc->p_fd, afp, st->s_afd, td);
fdrop(afp, td);
st->s_afd = -1;
}
mtx_unlock(&Giant);
return (error);
}
mtx_unlock(&Giant);
if (afp)
fdrop(afp, td);
*retval = 0;
#ifdef DEBUG_SVR4
show_msg("<getmsg", uap->fd, uap->ctl,
uap->dat, fl);
#endif /* DEBUG_SVR4 */
return error;
}
int svr4_sys_send(td, uap)
struct thread *td;
struct svr4_sys_send_args *uap;
{
struct sendto_args sta;
sta.s = uap->s;
sta.buf = uap->buf;
sta.len = uap->len;
sta.flags = uap->flags;
sta.to = NULL;
sta.tolen = 0;
- return (sendto(td, &sta));
+ return (sys_sendto(td, &sta));
}
int svr4_sys_recv(td, uap)
struct thread *td;
struct svr4_sys_recv_args *uap;
{
struct recvfrom_args rfa;
rfa.s = uap->s;
rfa.buf = uap->buf;
rfa.len = uap->len;
rfa.flags = uap->flags;
rfa.from = NULL;
rfa.fromlenaddr = NULL;
- return (recvfrom(td, &rfa));
+ return (sys_recvfrom(td, &rfa));
}
/*
* XXX This isn't necessary, but it's handy for inserting debug code into
* sendto(). Let's leave it here for now...
*/
int
svr4_sys_sendto(td, uap)
struct thread *td;
struct svr4_sys_sendto_args *uap;
{
struct sendto_args sa;
sa.s = uap->s;
sa.buf = uap->buf;
sa.len = uap->len;
sa.flags = uap->flags;
sa.to = (caddr_t)uap->to;
sa.tolen = uap->tolen;
DPRINTF(("calling sendto()\n"));
- return sendto(td, &sa);
+ return sys_sendto(td, &sa);
}
Index: head/sys/dev/bktr/bktr_core.c
===================================================================
--- head/sys/dev/bktr/bktr_core.c (revision 225616)
+++ head/sys/dev/bktr/bktr_core.c (revision 225617)
@@ -1,4315 +1,4315 @@
/*-
* 1. Redistributions of source code must retain the
* Copyright (c) 1997 Amancio Hasty, 1999 Roger Hardiman
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Amancio Hasty and
* Roger Hardiman
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* 1. Redistributions of source code must retain the
* Copyright (c) 1995 Mark Tinguely and Jim Lowe
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Mark Tinguely and Jim Lowe
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
* This is part of the Driver for Video Capture Cards (Frame grabbers)
* and TV Tuner cards using the Brooktree Bt848, Bt848A, Bt849A, Bt878, Bt879
* chipset.
* Copyright Roger Hardiman and Amancio Hasty.
*
* bktr_core : This deals with the Bt848/849/878/879 PCI Frame Grabber,
* Handles all the open, close, ioctl and read userland calls.
* Sets the Bt848 registers and generates RISC programs.
* Controls the i2c bus and GPIO interface.
* Contains the interface to the kernel.
* (eg probe/attach and open/close/ioctl)
*/
/*
The Brooktree BT848 driver is based upon Mark Tinguely and
Jim Lowe's driver for the Matrox Meteor PCI card. The
Philips SAA 7116 and SAA 7196 are very different chipsets than
the BT848.
The original copyright notice by Mark and Jim is included mostly
to honor their fantastic work in the Matrox Meteor driver!
*/
#include "opt_bktr.h" /* Include any kernel config options */
#if ( \
(defined(__FreeBSD__)) \
|| (defined(__bsdi__)) \
|| (defined(__OpenBSD__)) \
|| (defined(__NetBSD__)) \
)
/*******************/
/* *** FreeBSD *** */
/*******************/
#ifdef __FreeBSD__
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/selinfo.h>
#include <sys/uio.h>
#include <vm/vm.h>
#include <vm/vm_kern.h>
#include <vm/pmap.h>
#include <vm/vm_extern.h>
#include <sys/bus.h> /* used by smbus and newbus */
#if (__FreeBSD_version < 500000)
#include <machine/clock.h> /* for DELAY */
#define PROC_LOCK(p)
#define PROC_UNLOCK(p)
#include <pci/pcivar.h>
#else
#include <dev/pci/pcivar.h>
#endif
#include <machine/bus.h>
#include <sys/bus.h>
#include <dev/bktr/ioctl_meteor.h>
#include <dev/bktr/ioctl_bt848.h> /* extensions to ioctl_meteor.h */
#include <dev/bktr/bktr_reg.h>
#include <dev/bktr/bktr_tuner.h>
#include <dev/bktr/bktr_card.h>
#include <dev/bktr/bktr_audio.h>
#include <dev/bktr/bktr_os.h>
#include <dev/bktr/bktr_core.h>
#if defined(BKTR_FREEBSD_MODULE)
#include <dev/bktr/bktr_mem.h>
#endif
#if defined(BKTR_USE_FREEBSD_SMBUS)
#include <dev/bktr/bktr_i2c.h>
#include <dev/smbus/smbconf.h>
#include <dev/iicbus/iiconf.h>
#include "smbus_if.h"
#include "iicbus_if.h"
#endif
const char *
bktr_name(bktr_ptr_t bktr)
{
return bktr->bktr_xname;
}
#endif /* __FreeBSD__ */
/****************/
/* *** BSDI *** */
/****************/
#ifdef __bsdi__
#define PROC_LOCK(p)
#define PROC_UNLOCK(p)
#endif /* __bsdi__ */
/**************************/
/* *** OpenBSD/NetBSD *** */
/**************************/
#if defined(__NetBSD__) || defined(__OpenBSD__)
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#ifdef __NetBSD__
#include <uvm/uvm_extern.h>
#else
#include <vm/vm.h>
#include <vm/vm_kern.h>
#include <vm/pmap.h>
#include <vm/vm_extern.h>
#endif
#include <sys/inttypes.h> /* uintptr_t */
#include <dev/ic/bt8xx.h>
#include <dev/pci/bktr/bktr_reg.h>
#include <dev/pci/bktr/bktr_tuner.h>
#include <dev/pci/bktr/bktr_card.h>
#include <dev/pci/bktr/bktr_audio.h>
#include <dev/pci/bktr/bktr_core.h>
#include <dev/pci/bktr/bktr_os.h>
static int bt848_format = -1;
const char *
bktr_name(bktr_ptr_t bktr)
{
return (bktr->bktr_dev.dv_xname);
}
#define PROC_LOCK(p)
#define PROC_UNLOCK(p)
#endif /* __NetBSD__ || __OpenBSD__ */
typedef u_char bool_t;
#define BKTRPRI (PZERO+8)|PCATCH
#define VBIPRI (PZERO-4)|PCATCH
/*
* memory allocated for DMA programs
*/
#define DMA_PROG_ALLOC (8 * PAGE_SIZE)
/* When to split a dma transfer: the bt848 has timing as well as
dma transfer size limitations, so we have to split dma
transfers into two dma requests
*/
#define DMA_BT848_SPLIT 319*2
/*
* Allocate enough memory for:
* 768x576 RGB 16 or YUV (16 storage bits/pixel) = 884736 = 216 pages
*
* You may override this using the options "BROOKTREE_ALLOC_PAGES=value"
* in your kernel configuration file.
*/
#ifndef BROOKTREE_ALLOC_PAGES
#define BROOKTREE_ALLOC_PAGES 217*4
#endif
#define BROOKTREE_ALLOC (BROOKTREE_ALLOC_PAGES * PAGE_SIZE)
/* Definitions for VBI capture.
* There are 16 VBI lines in a PAL video field (32 in a frame),
* and we take 2044 samples from each line (placed in a 2048 byte buffer
* for alignment).
* VBI lines are held in a circular buffer before being read by a
* user program from /dev/vbi.
*/
#define MAX_VBI_LINES 16 /* Maximum for all video formats */
#define VBI_LINE_SIZE 2048 /* Store up to 2048 bytes per line */
#define VBI_BUFFER_ITEMS 20 /* Number of frames we buffer */
#define VBI_DATA_SIZE (VBI_LINE_SIZE * MAX_VBI_LINES * 2)
#define VBI_BUFFER_SIZE (VBI_DATA_SIZE * VBI_BUFFER_ITEMS)
/* Defines for fields */
#define ODD_F 0x01
#define EVEN_F 0x02
/*
* Parameters describing size of transmitted image.
*/
static struct format_params format_params[] = {
/* # define BT848_IFORM_F_AUTO (0x0) - don't matter. */
{ 525, 26, 480, 910, 135, 754, 640, 780, 30, 0x68, 0x5d, BT848_IFORM_X_AUTO,
12, 1600 },
/* # define BT848_IFORM_F_NTSCM (0x1) */
{ 525, 26, 480, 910, 135, 754, 640, 780, 30, 0x68, 0x5d, BT848_IFORM_X_XT0,
12, 1600 },
/* # define BT848_IFORM_F_NTSCJ (0x2) */
{ 525, 22, 480, 910, 135, 754, 640, 780, 30, 0x68, 0x5d, BT848_IFORM_X_XT0,
12, 1600 },
/* # define BT848_IFORM_F_PALBDGHI (0x3) */
{ 625, 32, 576, 1135, 186, 924, 768, 944, 25, 0x7f, 0x72, BT848_IFORM_X_XT1,
16, 2044 },
/* # define BT848_IFORM_F_PALM (0x4) */
{ 525, 22, 480, 910, 135, 754, 640, 780, 30, 0x68, 0x5d, BT848_IFORM_X_XT0,
12, 1600 },
/* # define BT848_IFORM_F_PALN (0x5) */
{ 625, 32, 576, 1135, 186, 924, 768, 944, 25, 0x7f, 0x72, BT848_IFORM_X_XT1,
16, 2044 },
/* # define BT848_IFORM_F_SECAM (0x6) */
{ 625, 32, 576, 1135, 186, 924, 768, 944, 25, 0x7f, 0xa0, BT848_IFORM_X_XT1,
16, 2044 },
/* # define BT848_IFORM_F_RSVD (0x7) - ???? */
{ 625, 32, 576, 1135, 186, 924, 768, 944, 25, 0x7f, 0x72, BT848_IFORM_X_XT0,
16, 2044 },
};
/*
* Table of supported Pixel Formats
*/
static struct meteor_pixfmt_internal {
struct meteor_pixfmt public;
u_int color_fmt;
} pixfmt_table[] = {
{ { 0, METEOR_PIXTYPE_RGB, 2, { 0x7c00, 0x03e0, 0x001f }, 0,0 }, 0x33 },
{ { 0, METEOR_PIXTYPE_RGB, 2, { 0x7c00, 0x03e0, 0x001f }, 1,0 }, 0x33 },
{ { 0, METEOR_PIXTYPE_RGB, 2, { 0xf800, 0x07e0, 0x001f }, 0,0 }, 0x22 },
{ { 0, METEOR_PIXTYPE_RGB, 2, { 0xf800, 0x07e0, 0x001f }, 1,0 }, 0x22 },
{ { 0, METEOR_PIXTYPE_RGB, 3, { 0xff0000,0x00ff00,0x0000ff }, 1,0 }, 0x11 },
{ { 0, METEOR_PIXTYPE_RGB, 4, { 0xff0000,0x00ff00,0x0000ff }, 0,0 }, 0x00 },
{ { 0, METEOR_PIXTYPE_RGB, 4, { 0xff0000,0x00ff00,0x0000ff }, 0,1 }, 0x00 },
{ { 0, METEOR_PIXTYPE_RGB, 4, { 0xff0000,0x00ff00,0x0000ff }, 1,0 }, 0x00 },
{ { 0, METEOR_PIXTYPE_RGB, 4, { 0xff0000,0x00ff00,0x0000ff }, 1,1 }, 0x00 },
{ { 0, METEOR_PIXTYPE_YUV, 2, { 0xff0000,0x00ff00,0x0000ff }, 1,1 }, 0x88 },
{ { 0, METEOR_PIXTYPE_YUV_PACKED, 2, { 0xff0000,0x00ff00,0x0000ff }, 0,1 }, 0x44 },
{ { 0, METEOR_PIXTYPE_YUV_12, 2, { 0xff0000,0x00ff00,0x0000ff }, 1,1 }, 0x88 },
};
#define PIXFMT_TABLE_SIZE ( sizeof(pixfmt_table) / sizeof(pixfmt_table[0]) )
/*
* Table of Meteor-supported Pixel Formats (for SETGEO compatibility)
*/
/* FIXME: Also add YUV_422 and YUV_PACKED as well */
static struct {
u_long meteor_format;
struct meteor_pixfmt public;
} meteor_pixfmt_table[] = {
{ METEOR_GEO_YUV_12,
{ 0, METEOR_PIXTYPE_YUV_12, 2, { 0xff0000,0x00ff00,0x0000ff }, 1,1 }
},
/* FIXME: Should byte swap flag be on for this one; negative in drvr? */
{ METEOR_GEO_YUV_422,
{ 0, METEOR_PIXTYPE_YUV, 2, { 0xff0000,0x00ff00,0x0000ff }, 1,1 }
},
{ METEOR_GEO_YUV_PACKED,
{ 0, METEOR_PIXTYPE_YUV_PACKED, 2, { 0xff0000,0x00ff00,0x0000ff }, 0,1 }
},
{ METEOR_GEO_RGB16,
{ 0, METEOR_PIXTYPE_RGB, 2, { 0x7c00, 0x03e0, 0x001f }, 0, 0 }
},
{ METEOR_GEO_RGB24,
{ 0, METEOR_PIXTYPE_RGB, 4, { 0xff0000, 0x00ff00, 0x0000ff }, 0, 0 }
},
};
#define METEOR_PIXFMT_TABLE_SIZE ( sizeof(meteor_pixfmt_table) / \
sizeof(meteor_pixfmt_table[0]) )
#define BSWAP (BT848_COLOR_CTL_BSWAP_ODD | BT848_COLOR_CTL_BSWAP_EVEN)
#define WSWAP (BT848_COLOR_CTL_WSWAP_ODD | BT848_COLOR_CTL_WSWAP_EVEN)
/* sync detect threshold */
#if 0
#define SYNC_LEVEL (BT848_ADC_RESERVED | \
BT848_ADC_CRUSH) /* threshold ~125 mV */
#else
#define SYNC_LEVEL (BT848_ADC_RESERVED | \
BT848_ADC_SYNC_T) /* threshold ~75 mV */
#endif
/* debug utility for holding previous INT_STAT contents */
#define STATUS_SUM
static u_long status_sum = 0;
/*
* defines to make certain bit-fiddles understandable
*/
#define FIFO_ENABLED BT848_DMA_CTL_FIFO_EN
#define RISC_ENABLED BT848_DMA_CTL_RISC_EN
#define FIFO_RISC_ENABLED (BT848_DMA_CTL_FIFO_EN | BT848_DMA_CTL_RISC_EN)
#define FIFO_RISC_DISABLED 0
#define ALL_INTS_DISABLED 0
#define ALL_INTS_CLEARED 0xffffffff
#define CAPTURE_OFF 0
#define BIT_SEVEN_HIGH (1<<7)
#define BIT_EIGHT_HIGH (1<<8)
#define I2C_BITS (BT848_INT_RACK | BT848_INT_I2CDONE)
#define TDEC_BITS (BT848_INT_FDSR | BT848_INT_FBUS)
static int oformat_meteor_to_bt( u_long format );
static u_int pixfmt_swap_flags( int pixfmt );
/*
* bt848 RISC programming routines.
*/
#ifdef BT848_DUMP
static int dump_bt848( bktr_ptr_t bktr );
#endif
static void yuvpack_prog( bktr_ptr_t bktr, char i_flag, int cols,
int rows, int interlace );
static void yuv422_prog( bktr_ptr_t bktr, char i_flag, int cols,
int rows, int interlace );
static void yuv12_prog( bktr_ptr_t bktr, char i_flag, int cols,
int rows, int interlace );
static void rgb_prog( bktr_ptr_t bktr, char i_flag, int cols,
int rows, int interlace );
static void rgb_vbi_prog( bktr_ptr_t bktr, char i_flag, int cols,
int rows, int interlace );
static void build_dma_prog( bktr_ptr_t bktr, char i_flag );
static bool_t getline(bktr_reg_t *, int);
static bool_t notclipped(bktr_reg_t * , int , int);
static bool_t split(bktr_reg_t *, volatile uint32_t **, int, u_long, int,
volatile u_char ** , int );
static void start_capture( bktr_ptr_t bktr, unsigned type );
static void set_fps( bktr_ptr_t bktr, u_short fps );
/*
* Remote Control Functions
*/
static void remote_read(bktr_ptr_t bktr, struct bktr_remote *remote);
/*
* ioctls common to both video & tuner.
*/
static int common_ioctl( bktr_ptr_t bktr, ioctl_cmd_t cmd, caddr_t arg );
#if !defined(BKTR_USE_FREEBSD_SMBUS)
/*
* i2c primitives for low level control of i2c bus. Added for MSP34xx control
*/
static void i2c_start( bktr_ptr_t bktr);
static void i2c_stop( bktr_ptr_t bktr);
static int i2c_write_byte( bktr_ptr_t bktr, unsigned char data);
static int i2c_read_byte( bktr_ptr_t bktr, unsigned char *data, int last );
#endif
/*
* the common attach code, used by all OS versions.
*/
void
common_bktr_attach( bktr_ptr_t bktr, int unit, u_long pci_id, u_int rev )
{
vm_offset_t buf = 0;
int need_to_allocate_memory = 1;
#ifdef BKTR_NEW_MSP34XX_DRIVER
int err;
#endif
/***************************************/
/* *** OS Specific memory routines *** */
/***************************************/
#if defined(__NetBSD__) || defined(__OpenBSD__)
/* allocate space for dma program */
bktr->dma_prog = get_bktr_mem(bktr, &bktr->dm_prog,
DMA_PROG_ALLOC);
bktr->odd_dma_prog = get_bktr_mem(bktr, &bktr->dm_oprog,
DMA_PROG_ALLOC);
/* allocate space for the VBI buffer */
bktr->vbidata = get_bktr_mem(bktr, &bktr->dm_vbidata,
VBI_DATA_SIZE);
bktr->vbibuffer = get_bktr_mem(bktr, &bktr->dm_vbibuffer,
VBI_BUFFER_SIZE);
/* allocate space for pixel buffer */
if ( BROOKTREE_ALLOC )
buf = get_bktr_mem(bktr, &bktr->dm_mem, BROOKTREE_ALLOC);
else
buf = 0;
#endif
#if defined(__FreeBSD__) || defined(__bsdi__)
/* If this is a module, check if there is any currently saved contiguous memory */
#if defined(BKTR_FREEBSD_MODULE)
if (bktr_has_stored_addresses(unit) == 1) {
/* recover the addresses */
bktr->dma_prog = bktr_retrieve_address(unit, BKTR_MEM_DMA_PROG);
bktr->odd_dma_prog = bktr_retrieve_address(unit, BKTR_MEM_ODD_DMA_PROG);
bktr->vbidata = bktr_retrieve_address(unit, BKTR_MEM_VBIDATA);
bktr->vbibuffer = bktr_retrieve_address(unit, BKTR_MEM_VBIBUFFER);
buf = bktr_retrieve_address(unit, BKTR_MEM_BUF);
need_to_allocate_memory = 0;
}
#endif
if (need_to_allocate_memory == 1) {
/* allocate space for dma program */
bktr->dma_prog = get_bktr_mem(unit, DMA_PROG_ALLOC);
bktr->odd_dma_prog = get_bktr_mem(unit, DMA_PROG_ALLOC);
/* allocate space for the VBI buffer */
bktr->vbidata = get_bktr_mem(unit, VBI_DATA_SIZE);
bktr->vbibuffer = get_bktr_mem(unit, VBI_BUFFER_SIZE);
/* allocate space for pixel buffer */
if ( BROOKTREE_ALLOC )
buf = get_bktr_mem(unit, BROOKTREE_ALLOC);
else
buf = 0;
}
#endif /* FreeBSD or BSDi */
#ifdef USE_VBIMUTEX
mtx_init(&bktr->vbimutex, "bktr vbi lock", NULL, MTX_DEF);
#endif
/* If this is a module, save the current contiguous memory */
#if defined(BKTR_FREEBSD_MODULE)
bktr_store_address(unit, BKTR_MEM_DMA_PROG, bktr->dma_prog);
bktr_store_address(unit, BKTR_MEM_ODD_DMA_PROG, bktr->odd_dma_prog);
bktr_store_address(unit, BKTR_MEM_VBIDATA, bktr->vbidata);
bktr_store_address(unit, BKTR_MEM_VBIBUFFER, bktr->vbibuffer);
bktr_store_address(unit, BKTR_MEM_BUF, buf);
#endif
if ( bootverbose ) {
printf("%s: buffer size %d, addr %p\n",
bktr_name(bktr), (int)BROOKTREE_ALLOC,
(void *)(uintptr_t)vtophys(buf));
}
if ( buf != 0 ) {
bktr->bigbuf = buf;
bktr->alloc_pages = BROOKTREE_ALLOC_PAGES;
bzero((caddr_t) bktr->bigbuf, BROOKTREE_ALLOC);
} else {
bktr->alloc_pages = 0;
}
bktr->flags = METEOR_INITALIZED | METEOR_AUTOMODE |
METEOR_DEV0 | METEOR_RGB16;
bktr->dma_prog_loaded = FALSE;
bktr->cols = 640;
bktr->rows = 480;
bktr->frames = 1; /* one frame */
bktr->format = METEOR_GEO_RGB16;
bktr->pixfmt = oformat_meteor_to_bt( bktr->format );
bktr->pixfmt_compat = TRUE;
bktr->vbiinsert = 0;
bktr->vbistart = 0;
bktr->vbisize = 0;
bktr->vbiflags = 0;
/* use the PCI device id and revision id */
/* to determine the card type */
if (PCI_VENDOR(pci_id) == PCI_VENDOR_BROOKTREE)
{
switch (PCI_PRODUCT(pci_id)) {
case PCI_PRODUCT_BROOKTREE_BT848:
if (rev == 0x12)
bktr->id = BROOKTREE_848A;
else
bktr->id = BROOKTREE_848;
break;
case PCI_PRODUCT_BROOKTREE_BT849:
bktr->id = BROOKTREE_849A;
break;
case PCI_PRODUCT_BROOKTREE_BT878:
bktr->id = BROOKTREE_878;
break;
case PCI_PRODUCT_BROOKTREE_BT879:
bktr->id = BROOKTREE_879;
break;
}
}
bktr->clr_on_start = FALSE;
/* defaults for the tuner section of the card */
bktr->tflags = TUNER_INITALIZED;
bktr->tuner.frequency = 0;
bktr->tuner.channel = 0;
bktr->tuner.chnlset = DEFAULT_CHNLSET;
bktr->tuner.afc = 0;
bktr->tuner.radio_mode = 0;
bktr->audio_mux_select = 0;
bktr->audio_mute_state = FALSE;
bktr->bt848_card = -1;
bktr->bt848_tuner = -1;
bktr->reverse_mute = -1;
bktr->slow_msp_audio = 0;
bktr->msp_use_mono_source = 0;
bktr->msp_source_selected = -1;
bktr->audio_mux_present = 1;
#if defined(__FreeBSD__)
#ifdef BKTR_NEW_MSP34XX_DRIVER
/* get the hint on simple programming of the msp34xx, so we know */
/* whether the decision of which thread to start should be overridden */
if ( (err = resource_int_value("bktr", unit, "mspsimple",
&(bktr->mspsimple)) ) != 0 )
bktr->mspsimple = -1; /* fall back to default */
#endif
#endif
probeCard( bktr, TRUE, unit );
/* Initialise any MSP34xx or TDA98xx audio chips */
init_audio_devices( bktr );
#ifdef BKTR_NEW_MSP34XX_DRIVER
/* set up the kernel thread */
err = msp_attach( bktr );
if ( err != 0 ) /* error doing kernel thread stuff, disable msp3400c */
bktr->card.msp3400c = 0;
#endif
}
/* Copy the vbi lines from 'vbidata' into the circular buffer, 'vbibuffer'.
* The circular buffer holds 'n' fixed size data blocks.
* vbisize is the number of bytes in the circular buffer
* vbiread is the point where we read data out of the circular buffer
* vbiinsert is the point where we insert data into the circular buffer
*/
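/*
 * Illustrative example (the sizes are made up, not the driver's values):
 * with VBI_DATA_SIZE = 0x1000 and VBI_BUFFER_SIZE = 0x8000 the buffer
 * holds 8 blocks; after the 8th insert, vbiinsert reaches 0x8000 and the
 * modulo below wraps it back to 0, ready to reuse the start of the buffer
 * once vbi_read() has drained some data (the insert is skipped while the
 * buffer is full).
 */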
static void vbidecode(bktr_ptr_t bktr) {
unsigned char *dest;
unsigned int *seq_dest;
/* Check if there is room in the buffer to insert the data. */
if (bktr->vbisize + VBI_DATA_SIZE > VBI_BUFFER_SIZE) return;
/* Copy the VBI data into the next free slot in the buffer. */
/* 'dest' is the point in vbibuffer where we want to insert new data */
dest = (unsigned char *)bktr->vbibuffer + bktr->vbiinsert;
memcpy(dest, (unsigned char*)bktr->vbidata, VBI_DATA_SIZE);
/* Write the VBI sequence number to the end of the vbi data */
/* This is used by the AleVT teletext program */
seq_dest = (unsigned int *)((unsigned char *)bktr->vbibuffer
+ bktr->vbiinsert
+ (VBI_DATA_SIZE - sizeof(bktr->vbi_sequence_number)));
*seq_dest = bktr->vbi_sequence_number;
/* And increase the VBI sequence number */
/* This can wrap around */
bktr->vbi_sequence_number++;
/* Increment the vbiinsert pointer */
/* This can wrap around */
bktr->vbiinsert += VBI_DATA_SIZE;
bktr->vbiinsert = (bktr->vbiinsert % VBI_BUFFER_SIZE);
/* And increase the amount of vbi data in the buffer */
bktr->vbisize = bktr->vbisize + VBI_DATA_SIZE;
}
/*
* the common interrupt handler.
* Returns 0 or 1 depending on whether the interrupt was handled.
* In the OS specific section, bktr_intr() is defined which calls this
* common interrupt handler.
*/
int
common_bktr_intr( void *arg )
{
bktr_ptr_t bktr;
u_long bktr_status;
u_char dstatus;
u_long field;
u_long w_field;
u_long req_field;
bktr = (bktr_ptr_t) arg;
/*
* check to see if any interrupts are unmasked on this device. If
* none are, then we likely got here by way of being on a PCI shared
* interrupt dispatch list.
*/
if (INL(bktr, BKTR_INT_MASK) == ALL_INTS_DISABLED)
return 0; /* bail out now, before we do something we
shouldn't */
if (!(bktr->flags & METEOR_OPEN)) {
OUTW(bktr, BKTR_GPIO_DMA_CTL, FIFO_RISC_DISABLED);
OUTL(bktr, BKTR_INT_MASK, ALL_INTS_DISABLED);
/* return; ?? */
}
/* record and clear the INTerrupt status bits */
bktr_status = INL(bktr, BKTR_INT_STAT);
OUTL(bktr, BKTR_INT_STAT, bktr_status & ~I2C_BITS); /* don't touch i2c */
/* record and clear the device status register */
dstatus = INB(bktr, BKTR_DSTATUS);
OUTB(bktr, BKTR_DSTATUS, 0x00);
#if defined( STATUS_SUM )
/* add any new device status or INTerrupt status bits */
status_sum |= (bktr_status & ~(BT848_INT_RSV0|BT848_INT_RSV1));
status_sum |= ((dstatus & (BT848_DSTATUS_COF|BT848_DSTATUS_LOF)) << 6);
#endif /* STATUS_SUM */
/* printf( "%s: STATUS %x %x %x \n", bktr_name(bktr),
dstatus, bktr_status, INL(bktr, BKTR_RISC_COUNT) );
*/
/* if RISC was disabled, re-start the process */
/* if one of the following errors occurred, re-start as well */
if ( !(bktr_status & BT848_INT_RISC_EN) ||
((bktr_status &(/* BT848_INT_FBUS | */
/* BT848_INT_FTRGT | */
/* BT848_INT_FDSR | */
BT848_INT_PPERR |
BT848_INT_RIPERR | BT848_INT_PABORT |
BT848_INT_OCERR | BT848_INT_SCERR) ) != 0)
|| ((INB(bktr, BKTR_TDEC) == 0) && (bktr_status & TDEC_BITS)) ) {
u_short tdec_save = INB(bktr, BKTR_TDEC);
OUTW(bktr, BKTR_GPIO_DMA_CTL, FIFO_RISC_DISABLED);
OUTB(bktr, BKTR_CAP_CTL, CAPTURE_OFF);
OUTL(bktr, BKTR_INT_MASK, ALL_INTS_DISABLED);
/* Reset temporal decimation counter */
OUTB(bktr, BKTR_TDEC, 0);
OUTB(bktr, BKTR_TDEC, tdec_save);
/* Reset to no-fields captured state */
if (bktr->flags & (METEOR_CONTIN | METEOR_SYNCAP)) {
switch(bktr->flags & METEOR_ONLY_FIELDS_MASK) {
case METEOR_ONLY_ODD_FIELDS:
bktr->flags |= METEOR_WANT_ODD;
break;
case METEOR_ONLY_EVEN_FIELDS:
bktr->flags |= METEOR_WANT_EVEN;
break;
default:
bktr->flags |= METEOR_WANT_MASK;
break;
}
}
OUTL(bktr, BKTR_RISC_STRT_ADD, vtophys(bktr->dma_prog));
OUTW(bktr, BKTR_GPIO_DMA_CTL, FIFO_ENABLED);
OUTW(bktr, BKTR_GPIO_DMA_CTL, bktr->capcontrol);
OUTL(bktr, BKTR_INT_MASK, BT848_INT_MYSTERYBIT |
BT848_INT_RISCI |
BT848_INT_VSYNC |
BT848_INT_FMTCHG);
OUTB(bktr, BKTR_CAP_CTL, bktr->bktr_cap_ctl);
return 1;
}
/* If this is not a RISC program interrupt, return */
if (!(bktr_status & BT848_INT_RISCI))
return 0;
/**
printf( "%s: intr status %x %x %x\n", bktr_name(bktr),
bktr_status, dstatus, INL(bktr, BKTR_RISC_COUNT) );
*/
/*
* Disable future interrupts if a capture mode is not selected.
* This can happen when we are in the process of closing or
* changing capture modes, otherwise it shouldn't happen.
*/
if (!(bktr->flags & METEOR_CAP_MASK))
OUTB(bktr, BKTR_CAP_CTL, CAPTURE_OFF);
/* Determine which field generated this interrupt */
field = ( bktr_status & BT848_INT_FIELD ) ? EVEN_F : ODD_F;
/*
* Process the VBI data if it is being captured. We do this once
* both Odd and Even VBI data is captured. Therefore we do this
* in the Even field interrupt handler.
*/
LOCK_VBI(bktr);
if ( (bktr->vbiflags & VBI_CAPTURE)
&&(bktr->vbiflags & VBI_OPEN)
&&(field==EVEN_F)) {
/* Put VBI data into circular buffer */
vbidecode(bktr);
/* If someone is blocked on reading from /dev/vbi, wake them */
if (bktr->vbi_read_blocked) {
bktr->vbi_read_blocked = FALSE;
wakeup(VBI_SLEEP);
}
/* If someone has a select() on /dev/vbi, inform them */
if (SEL_WAITING(&bktr->vbi_select)) {
selwakeuppri(&bktr->vbi_select, VBIPRI);
}
}
UNLOCK_VBI(bktr);
/*
* Register the completed field
* (For dual-field mode, require fields from the same frame)
*/
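/*
 * w_field   - field(s) we are still waiting for, from the WANT flags
 * req_field - field(s) requested by the current capture mode
 */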
switch ( bktr->flags & METEOR_WANT_MASK ) {
case METEOR_WANT_ODD : w_field = ODD_F ; break;
case METEOR_WANT_EVEN : w_field = EVEN_F ; break;
default : w_field = (ODD_F|EVEN_F); break;
}
switch ( bktr->flags & METEOR_ONLY_FIELDS_MASK ) {
case METEOR_ONLY_ODD_FIELDS : req_field = ODD_F ; break;
case METEOR_ONLY_EVEN_FIELDS : req_field = EVEN_F ; break;
default : req_field = (ODD_F|EVEN_F);
break;
}
if (( field == EVEN_F ) && ( w_field == EVEN_F ))
bktr->flags &= ~METEOR_WANT_EVEN;
else if (( field == ODD_F ) && ( req_field == ODD_F ) &&
( w_field == ODD_F ))
bktr->flags &= ~METEOR_WANT_ODD;
else if (( field == ODD_F ) && ( req_field == (ODD_F|EVEN_F) ) &&
( w_field == (ODD_F|EVEN_F) ))
bktr->flags &= ~METEOR_WANT_ODD;
else if (( field == ODD_F ) && ( req_field == (ODD_F|EVEN_F) ) &&
( w_field == ODD_F )) {
bktr->flags &= ~METEOR_WANT_ODD;
bktr->flags |= METEOR_WANT_EVEN;
}
else {
/* We're out of sync. Start over. */
if (bktr->flags & (METEOR_CONTIN | METEOR_SYNCAP)) {
switch(bktr->flags & METEOR_ONLY_FIELDS_MASK) {
case METEOR_ONLY_ODD_FIELDS:
bktr->flags |= METEOR_WANT_ODD;
break;
case METEOR_ONLY_EVEN_FIELDS:
bktr->flags |= METEOR_WANT_EVEN;
break;
default:
bktr->flags |= METEOR_WANT_MASK;
break;
}
}
return 1;
}
/*
* If we have a complete frame.
*/
if (!(bktr->flags & METEOR_WANT_MASK)) {
bktr->frames_captured++;
/*
* post the completion time.
*/
if (bktr->flags & METEOR_WANT_TS) {
struct timeval *ts;
if ((u_int) bktr->alloc_pages * PAGE_SIZE
<= (bktr->frame_size + sizeof(struct timeval))) {
ts =(struct timeval *)bktr->bigbuf +
bktr->frame_size;
/* doesn't work in synch mode except
* for first frame */
/* XXX */
microtime(ts);
}
}
/*
* Wake up the user in single capture mode.
*/
if (bktr->flags & METEOR_SINGLE) {
/* stop dma */
OUTL(bktr, BKTR_INT_MASK, ALL_INTS_DISABLED);
/* disable risc, leave fifo running */
OUTW(bktr, BKTR_GPIO_DMA_CTL, FIFO_ENABLED);
wakeup(BKTR_SLEEP);
}
/*
* If the user requested to be notified via signal,
* let them know the frame is complete.
*/
if (bktr->proc != NULL) {
PROC_LOCK(bktr->proc);
- psignal( bktr->proc, bktr->signal);
+ kern_psignal( bktr->proc, bktr->signal);
PROC_UNLOCK(bktr->proc);
}
/*
* Reset the want flags if in continuous or
* synchronous capture mode.
*/
/*
* XXX NOTE (Luigi):
* currently we only support 3 capture modes: odd only, even only,
* odd+even interlaced (odd field first). A fourth mode (non interlaced,
* either even OR odd) could provide 60 (50 for PAL) pictures per
* second, but it would require this routine to toggle the desired frame
* each time, and one more different DMA program for the Bt848.
* As a consequence, this fourth mode is currently unsupported.
*/
if (bktr->flags & (METEOR_CONTIN | METEOR_SYNCAP)) {
switch(bktr->flags & METEOR_ONLY_FIELDS_MASK) {
case METEOR_ONLY_ODD_FIELDS:
bktr->flags |= METEOR_WANT_ODD;
break;
case METEOR_ONLY_EVEN_FIELDS:
bktr->flags |= METEOR_WANT_EVEN;
break;
default:
bktr->flags |= METEOR_WANT_MASK;
break;
}
}
}
return 1;
}
/*
*
*/
extern int bt848_format; /* used to set the default format, PAL or NTSC */
int
video_open( bktr_ptr_t bktr )
{
int frame_rate, video_format=0;
if (bktr->flags & METEOR_OPEN) /* device is busy */
return( EBUSY );
bktr->flags |= METEOR_OPEN;
#ifdef BT848_DUMP
dump_bt848( bktr );
#endif
bktr->clr_on_start = FALSE;
OUTB(bktr, BKTR_DSTATUS, 0x00); /* clear device status reg. */
OUTB(bktr, BKTR_ADC, SYNC_LEVEL);
#if defined(BKTR_SYSTEM_DEFAULT) && BKTR_SYSTEM_DEFAULT == BROOKTREE_PAL
video_format = 0;
#else
video_format = 1;
#endif
if (bt848_format == 0 )
video_format = 0;
if (bt848_format == 1 )
video_format = 1;
if (video_format == 1 ) {
OUTB(bktr, BKTR_IFORM, BT848_IFORM_F_NTSCM);
bktr->format_params = BT848_IFORM_F_NTSCM;
} else {
OUTB(bktr, BKTR_IFORM, BT848_IFORM_F_PALBDGHI);
bktr->format_params = BT848_IFORM_F_PALBDGHI;
}
OUTB(bktr, BKTR_IFORM, INB(bktr, BKTR_IFORM) | format_params[bktr->format_params].iform_xtsel);
/* work around for new Hauppauge 878 cards */
if ((bktr->card.card_id == CARD_HAUPPAUGE) &&
(bktr->id==BROOKTREE_878 || bktr->id==BROOKTREE_879) )
OUTB(bktr, BKTR_IFORM, INB(bktr, BKTR_IFORM) | BT848_IFORM_M_MUX3);
else
OUTB(bktr, BKTR_IFORM, INB(bktr, BKTR_IFORM) | BT848_IFORM_M_MUX1);
OUTB(bktr, BKTR_ADELAY, format_params[bktr->format_params].adelay);
OUTB(bktr, BKTR_BDELAY, format_params[bktr->format_params].bdelay);
frame_rate = format_params[bktr->format_params].frame_rate;
/* enable PLL mode using a 28 MHz crystal for PAL/SECAM users */
if (bktr->xtal_pll_mode == BT848_USE_PLL) {
OUTB(bktr, BKTR_TGCTRL, 0);
OUTB(bktr, BKTR_PLL_F_LO, 0xf9);
OUTB(bktr, BKTR_PLL_F_HI, 0xdc);
OUTB(bktr, BKTR_PLL_F_XCI, 0x8e);
}
bktr->flags = (bktr->flags & ~METEOR_DEV_MASK) | METEOR_DEV0;
bktr->max_clip_node = 0;
OUTB(bktr, BKTR_COLOR_CTL, BT848_COLOR_CTL_GAMMA | BT848_COLOR_CTL_RGB_DED);
OUTB(bktr, BKTR_E_HSCALE_LO, 170);
OUTB(bktr, BKTR_O_HSCALE_LO, 170);
OUTB(bktr, BKTR_E_DELAY_LO, 0x72);
OUTB(bktr, BKTR_O_DELAY_LO, 0x72);
OUTB(bktr, BKTR_E_SCLOOP, 0);
OUTB(bktr, BKTR_O_SCLOOP, 0);
OUTB(bktr, BKTR_VBI_PACK_SIZE, 0);
OUTB(bktr, BKTR_VBI_PACK_DEL, 0);
bktr->fifo_errors = 0;
bktr->dma_errors = 0;
bktr->frames_captured = 0;
bktr->even_fields_captured = 0;
bktr->odd_fields_captured = 0;
bktr->proc = NULL;
set_fps(bktr, frame_rate);
bktr->video.addr = 0;
bktr->video.width = 0;
bktr->video.banksize = 0;
bktr->video.ramsize = 0;
bktr->pixfmt_compat = TRUE;
bktr->format = METEOR_GEO_RGB16;
bktr->pixfmt = oformat_meteor_to_bt( bktr->format );
bktr->capture_area_enabled = FALSE;
OUTL(bktr, BKTR_INT_MASK, BT848_INT_MYSTERYBIT); /* if you take this out, Triton-
based motherboards will
operate unreliably */
return( 0 );
}
int
vbi_open( bktr_ptr_t bktr )
{
LOCK_VBI(bktr);
if (bktr->vbiflags & VBI_OPEN) { /* device is busy */
UNLOCK_VBI(bktr);
return( EBUSY );
}
bktr->vbiflags |= VBI_OPEN;
/* reset the VBI circular buffer pointers and clear the buffers */
bktr->vbiinsert = 0;
bktr->vbistart = 0;
bktr->vbisize = 0;
bktr->vbi_sequence_number = 0;
bktr->vbi_read_blocked = FALSE;
bzero((caddr_t) bktr->vbibuffer, VBI_BUFFER_SIZE);
bzero((caddr_t) bktr->vbidata, VBI_DATA_SIZE);
UNLOCK_VBI(bktr);
return( 0 );
}
/*
*
*/
int
tuner_open( bktr_ptr_t bktr )
{
if ( !(bktr->tflags & TUNER_INITALIZED) ) /* device not found */
return( ENXIO );
if ( bktr->tflags & TUNER_OPEN ) /* already open */
return( 0 );
bktr->tflags |= TUNER_OPEN;
bktr->tuner.frequency = 0;
bktr->tuner.channel = 0;
bktr->tuner.chnlset = DEFAULT_CHNLSET;
bktr->tuner.afc = 0;
bktr->tuner.radio_mode = 0;
/* enable drivers on the GPIO port that control the MUXes */
OUTL(bktr, BKTR_GPIO_OUT_EN, INL(bktr, BKTR_GPIO_OUT_EN) | bktr->card.gpio_mux_bits);
/* unmute the audio stream */
set_audio( bktr, AUDIO_UNMUTE );
/* Initialise any audio chips, eg MSP34xx or TDA98xx */
init_audio_devices( bktr );
return( 0 );
}
/*
*
*/
int
video_close( bktr_ptr_t bktr )
{
bktr->flags &= ~(METEOR_OPEN |
METEOR_SINGLE |
METEOR_CAP_MASK |
METEOR_WANT_MASK);
OUTW(bktr, BKTR_GPIO_DMA_CTL, FIFO_RISC_DISABLED);
OUTB(bktr, BKTR_CAP_CTL, CAPTURE_OFF);
bktr->dma_prog_loaded = FALSE;
OUTB(bktr, BKTR_TDEC, 0);
OUTL(bktr, BKTR_INT_MASK, ALL_INTS_DISABLED);
/** FIXME: is 0xf magic, wouldn't 0x00 work ??? */
OUTL(bktr, BKTR_SRESET, 0xf);
OUTL(bktr, BKTR_INT_STAT, ALL_INTS_CLEARED);
return( 0 );
}
/*
* tuner close handle,
* place holder for tuner specific operations on a close.
*/
int
tuner_close( bktr_ptr_t bktr )
{
bktr->tflags &= ~TUNER_OPEN;
/* mute the audio by switching the mux */
set_audio( bktr, AUDIO_MUTE );
/* disable drivers on the GPIO port that control the MUXes */
OUTL(bktr, BKTR_GPIO_OUT_EN, INL(bktr, BKTR_GPIO_OUT_EN) & ~bktr->card.gpio_mux_bits);
return( 0 );
}
int
vbi_close( bktr_ptr_t bktr )
{
LOCK_VBI(bktr);
bktr->vbiflags &= ~VBI_OPEN;
UNLOCK_VBI(bktr);
return( 0 );
}
/*
*
*/
int
video_read(bktr_ptr_t bktr, int unit, struct cdev *dev, struct uio *uio)
{
int status;
int count;
if (bktr->bigbuf == 0) /* no frame buffer allocated (ioctl failed) */
return( ENOMEM );
if (bktr->flags & METEOR_CAP_MASK)
return( EIO ); /* already capturing */
OUTB(bktr, BKTR_CAP_CTL, bktr->bktr_cap_ctl);
count = bktr->rows * bktr->cols *
pixfmt_table[ bktr->pixfmt ].public.Bpp;
if ((int) uio->uio_iov->iov_len < count)
return( EINVAL );
bktr->flags &= ~(METEOR_CAP_MASK | METEOR_WANT_MASK);
/* capture one frame */
start_capture(bktr, METEOR_SINGLE);
/* wait for capture to complete */
OUTL(bktr, BKTR_INT_STAT, ALL_INTS_CLEARED);
OUTW(bktr, BKTR_GPIO_DMA_CTL, FIFO_ENABLED);
OUTW(bktr, BKTR_GPIO_DMA_CTL, bktr->capcontrol);
OUTL(bktr, BKTR_INT_MASK, BT848_INT_MYSTERYBIT |
BT848_INT_RISCI |
BT848_INT_VSYNC |
BT848_INT_FMTCHG);
status = tsleep(BKTR_SLEEP, BKTRPRI, "captur", 0);
if (!status) /* successful capture */
status = uiomove((caddr_t)bktr->bigbuf, count, uio);
else
printf ("%s: read: tsleep error %d\n",
bktr_name(bktr), status);
bktr->flags &= ~(METEOR_SINGLE | METEOR_WANT_MASK);
return( status );
}
/*
* Read VBI data from the vbi circular buffer
* The buffer holds vbi data blocks which are the same size
* vbiinsert is the position we will insert the next item into the buffer
* vbistart is the actual position in the buffer we want to read from
* vbisize is the exact number of bytes in the buffer left to read
*/
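/*
 * Illustrative wrap-around example (values are made up): with
 * vbistart = 0x7000, readsize = 0x2000 and VBI_BUFFER_SIZE = 0x8000,
 * the first uiomove below copies 0x1000 bytes from the tail of the
 * buffer and the second copies the remaining 0x1000 bytes from the start.
 */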
int
vbi_read(bktr_ptr_t bktr, struct uio *uio, int ioflag)
{
int readsize, readsize2, start;
int status;
/*
* XXX - vbi_read() should be protected against being re-entered
* while it is unlocked for the uiomove.
*/
LOCK_VBI(bktr);
while(bktr->vbisize == 0) {
if (ioflag & FNDELAY) {
status = EWOULDBLOCK;
goto out;
}
bktr->vbi_read_blocked = TRUE;
#ifdef USE_VBIMUTEX
if ((status = msleep(VBI_SLEEP, &bktr->vbimutex, VBIPRI, "vbi",
0))) {
goto out;
}
#else
if ((status = tsleep(VBI_SLEEP, VBIPRI, "vbi", 0))) {
goto out;
}
#endif
}
/* Now we have some data to give to the user */
/* We cannot read more bytes than there are in
* the circular buffer
*/
readsize = (int)uio->uio_iov->iov_len;
if (readsize > bktr->vbisize) readsize = bktr->vbisize;
/* Check if we can read this number of bytes without having
* to wrap around the circular buffer */
if((bktr->vbistart + readsize) >= VBI_BUFFER_SIZE) {
/* We need to wrap around */
readsize2 = VBI_BUFFER_SIZE - bktr->vbistart;
start = bktr->vbistart;
UNLOCK_VBI(bktr);
status = uiomove((caddr_t)bktr->vbibuffer + start, readsize2, uio);
if (status == 0)
status = uiomove((caddr_t)bktr->vbibuffer, (readsize - readsize2), uio);
} else {
UNLOCK_VBI(bktr);
/* We do not need to wrap around */
status = uiomove((caddr_t)bktr->vbibuffer + bktr->vbistart, readsize, uio);
}
LOCK_VBI(bktr);
/* Update the number of bytes left to read */
bktr->vbisize -= readsize;
/* Update vbistart */
bktr->vbistart += readsize;
bktr->vbistart = bktr->vbistart % VBI_BUFFER_SIZE; /* wrap around if needed */
out:
UNLOCK_VBI(bktr);
return( status );
}
/*
* video ioctls
*/
int
video_ioctl( bktr_ptr_t bktr, int unit, ioctl_cmd_t cmd, caddr_t arg, struct thread* td )
{
volatile u_char c_temp;
unsigned int temp;
unsigned int temp_iform;
unsigned int error;
struct meteor_geomet *geo;
struct meteor_counts *counts;
struct meteor_video *video;
struct bktr_capture_area *cap_area;
vm_offset_t buf;
int i;
int sig;
char char_temp;
switch ( cmd ) {
case BT848SCLIP: /* set clip region */
bktr->max_clip_node = 0;
memcpy(&bktr->clip_list, arg, sizeof(bktr->clip_list));
for (i = 0; i < BT848_MAX_CLIP_NODE; i++) {
if (bktr->clip_list[i].y_min == 0 &&
bktr->clip_list[i].y_max == 0)
break;
}
bktr->max_clip_node = i;
/* make sure that the list contains a valid clip sequence */
/* the clip rectangles should be sorted by x, then by y as the
second order sort key */
/* the clip rectangle list is terminated by y_min and y_max set to 0 */
/* to disable clipping, set y_min and y_max to 0 in the first
clip rectangle. The first clip rectangle is clip_list[0].
*/
if (bktr->max_clip_node == 0 &&
(bktr->clip_list[0].y_min != 0 &&
bktr->clip_list[0].y_max != 0)) {
return EINVAL;
}
for (i = 0; i < BT848_MAX_CLIP_NODE - 1 ; i++) {
if (bktr->clip_list[i].y_min == 0 &&
bktr->clip_list[i].y_max == 0) {
break;
}
if ( bktr->clip_list[i+1].y_min != 0 &&
bktr->clip_list[i+1].y_max != 0 &&
bktr->clip_list[i].x_min > bktr->clip_list[i+1].x_min ) {
bktr->max_clip_node = 0;
return (EINVAL);
}
if (bktr->clip_list[i].x_min >= bktr->clip_list[i].x_max ||
bktr->clip_list[i].y_min >= bktr->clip_list[i].y_max ||
bktr->clip_list[i].x_min < 0 ||
bktr->clip_list[i].x_max < 0 ||
bktr->clip_list[i].y_min < 0 ||
bktr->clip_list[i].y_max < 0 ) {
bktr->max_clip_node = 0;
return (EINVAL);
}
}
bktr->dma_prog_loaded = FALSE;
break;
case METEORSTATUS: /* get Bt848 status */
c_temp = INB(bktr, BKTR_DSTATUS);
temp = 0;
if (!(c_temp & 0x40)) temp |= METEOR_STATUS_HCLK;
if (!(c_temp & 0x10)) temp |= METEOR_STATUS_FIDT;
*(u_short *)arg = temp;
break;
case BT848SFMT: /* set input format */
temp = *(unsigned long*)arg & BT848_IFORM_FORMAT;
temp_iform = INB(bktr, BKTR_IFORM);
temp_iform &= ~BT848_IFORM_FORMAT;
temp_iform &= ~BT848_IFORM_XTSEL;
OUTB(bktr, BKTR_IFORM, (temp_iform | temp | format_params[temp].iform_xtsel));
switch( temp ) {
case BT848_IFORM_F_AUTO:
bktr->flags = (bktr->flags & ~METEOR_FORM_MASK) |
METEOR_AUTOMODE;
break;
case BT848_IFORM_F_NTSCM:
case BT848_IFORM_F_NTSCJ:
bktr->flags = (bktr->flags & ~METEOR_FORM_MASK) |
METEOR_NTSC;
OUTB(bktr, BKTR_ADELAY, format_params[temp].adelay);
OUTB(bktr, BKTR_BDELAY, format_params[temp].bdelay);
bktr->format_params = temp;
break;
case BT848_IFORM_F_PALBDGHI:
case BT848_IFORM_F_PALN:
case BT848_IFORM_F_SECAM:
case BT848_IFORM_F_RSVD:
case BT848_IFORM_F_PALM:
bktr->flags = (bktr->flags & ~METEOR_FORM_MASK) |
METEOR_PAL;
OUTB(bktr, BKTR_ADELAY, format_params[temp].adelay);
OUTB(bktr, BKTR_BDELAY, format_params[temp].bdelay);
bktr->format_params = temp;
break;
}
bktr->dma_prog_loaded = FALSE;
break;
case METEORSFMT: /* set input format */
temp_iform = INB(bktr, BKTR_IFORM);
temp_iform &= ~BT848_IFORM_FORMAT;
temp_iform &= ~BT848_IFORM_XTSEL;
switch(*(unsigned long *)arg & METEOR_FORM_MASK ) {
case 0: /* default */
case METEOR_FMT_NTSC:
bktr->flags = (bktr->flags & ~METEOR_FORM_MASK) |
METEOR_NTSC;
OUTB(bktr, BKTR_IFORM, temp_iform | BT848_IFORM_F_NTSCM |
format_params[BT848_IFORM_F_NTSCM].iform_xtsel);
OUTB(bktr, BKTR_ADELAY, format_params[BT848_IFORM_F_NTSCM].adelay);
OUTB(bktr, BKTR_BDELAY, format_params[BT848_IFORM_F_NTSCM].bdelay);
bktr->format_params = BT848_IFORM_F_NTSCM;
break;
case METEOR_FMT_PAL:
bktr->flags = (bktr->flags & ~METEOR_FORM_MASK) |
METEOR_PAL;
OUTB(bktr, BKTR_IFORM, temp_iform | BT848_IFORM_F_PALBDGHI |
format_params[BT848_IFORM_F_PALBDGHI].iform_xtsel);
OUTB(bktr, BKTR_ADELAY, format_params[BT848_IFORM_F_PALBDGHI].adelay);
OUTB(bktr, BKTR_BDELAY, format_params[BT848_IFORM_F_PALBDGHI].bdelay);
bktr->format_params = BT848_IFORM_F_PALBDGHI;
break;
case METEOR_FMT_AUTOMODE:
bktr->flags = (bktr->flags & ~METEOR_FORM_MASK) |
METEOR_AUTOMODE;
OUTB(bktr, BKTR_IFORM, temp_iform | BT848_IFORM_F_AUTO |
format_params[BT848_IFORM_F_AUTO].iform_xtsel);
break;
default:
return( EINVAL );
}
bktr->dma_prog_loaded = FALSE;
break;
case METEORGFMT: /* get input format */
*(u_long *)arg = bktr->flags & METEOR_FORM_MASK;
break;
case BT848GFMT: /* get input format */
*(u_long *)arg = INB(bktr, BKTR_IFORM) & BT848_IFORM_FORMAT;
break;
case METEORSCOUNT: /* (re)set error counts */
counts = (struct meteor_counts *) arg;
bktr->fifo_errors = counts->fifo_errors;
bktr->dma_errors = counts->dma_errors;
bktr->frames_captured = counts->frames_captured;
bktr->even_fields_captured = counts->even_fields_captured;
bktr->odd_fields_captured = counts->odd_fields_captured;
break;
case METEORGCOUNT: /* get error counts */
counts = (struct meteor_counts *) arg;
counts->fifo_errors = bktr->fifo_errors;
counts->dma_errors = bktr->dma_errors;
counts->frames_captured = bktr->frames_captured;
counts->even_fields_captured = bktr->even_fields_captured;
counts->odd_fields_captured = bktr->odd_fields_captured;
break;
case METEORGVIDEO:
video = (struct meteor_video *)arg;
video->addr = bktr->video.addr;
video->width = bktr->video.width;
video->banksize = bktr->video.banksize;
video->ramsize = bktr->video.ramsize;
break;
case METEORSVIDEO:
video = (struct meteor_video *)arg;
bktr->video.addr = video->addr;
bktr->video.width = video->width;
bktr->video.banksize = video->banksize;
bktr->video.ramsize = video->ramsize;
break;
case METEORSFPS:
set_fps(bktr, *(u_short *)arg);
break;
case METEORGFPS:
*(u_short *)arg = bktr->fps;
break;
case METEORSHUE: /* set hue */
OUTB(bktr, BKTR_HUE, (*(u_char *) arg) & 0xff);
break;
case METEORGHUE: /* get hue */
*(u_char *)arg = INB(bktr, BKTR_HUE);
break;
case METEORSBRIG: /* set brightness */
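/* Meteor brightness is 0..255; the Bt848 BRIGHT register takes a
 * two's complement value, so re-centre the range around zero. */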
char_temp = ( *(u_char *)arg & 0xff) - 128;
OUTB(bktr, BKTR_BRIGHT, char_temp);
break;
case METEORGBRIG: /* get brightness */
*(u_char *)arg = INB(bktr, BKTR_BRIGHT);
break;
case METEORSCSAT: /* set chroma saturation */
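/* The 8-bit Meteor saturation value maps onto the Bt848's 9-bit
 * registers: it is shifted left one bit into SAT_U_LO/SAT_V_LO and
 * bit 7 of the argument supplies the MSB kept in E/O_CONTROL. */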
temp = (int)*(u_char *)arg;
OUTB(bktr, BKTR_SAT_U_LO, (temp << 1) & 0xff);
OUTB(bktr, BKTR_SAT_V_LO, (temp << 1) & 0xff);
OUTB(bktr, BKTR_E_CONTROL, INB(bktr, BKTR_E_CONTROL)
& ~(BT848_E_CONTROL_SAT_U_MSB
| BT848_E_CONTROL_SAT_V_MSB));
OUTB(bktr, BKTR_O_CONTROL, INB(bktr, BKTR_O_CONTROL)
& ~(BT848_O_CONTROL_SAT_U_MSB |
BT848_O_CONTROL_SAT_V_MSB));
if ( temp & BIT_SEVEN_HIGH ) {
OUTB(bktr, BKTR_E_CONTROL, INB(bktr, BKTR_E_CONTROL)
| (BT848_E_CONTROL_SAT_U_MSB
| BT848_E_CONTROL_SAT_V_MSB));
OUTB(bktr, BKTR_O_CONTROL, INB(bktr, BKTR_O_CONTROL)
| (BT848_O_CONTROL_SAT_U_MSB
| BT848_O_CONTROL_SAT_V_MSB));
}
break;
case METEORGCSAT: /* get chroma saturation */
temp = (INB(bktr, BKTR_SAT_V_LO) >> 1) & 0xff;
if ( INB(bktr, BKTR_E_CONTROL) & BT848_E_CONTROL_SAT_V_MSB )
temp |= BIT_SEVEN_HIGH;
*(u_char *)arg = (u_char)temp;
break;
case METEORSCONT: /* set contrast */
temp = (int)*(u_char *)arg & 0xff;
temp <<= 1;
OUTB(bktr, BKTR_CONTRAST_LO, temp & 0xff);
OUTB(bktr, BKTR_E_CONTROL, INB(bktr, BKTR_E_CONTROL) & ~BT848_E_CONTROL_CON_MSB);
OUTB(bktr, BKTR_O_CONTROL, INB(bktr, BKTR_O_CONTROL) & ~BT848_O_CONTROL_CON_MSB);
OUTB(bktr, BKTR_E_CONTROL, INB(bktr, BKTR_E_CONTROL) |
(((temp & 0x100) >> 6 ) & BT848_E_CONTROL_CON_MSB));
OUTB(bktr, BKTR_O_CONTROL, INB(bktr, BKTR_O_CONTROL) |
(((temp & 0x100) >> 6 ) & BT848_O_CONTROL_CON_MSB));
break;
case METEORGCONT: /* get contrast */
temp = (int)INB(bktr, BKTR_CONTRAST_LO) & 0xff;
temp |= ((int)INB(bktr, BKTR_O_CONTROL) & 0x04) << 6;
*(u_char *)arg = (u_char)((temp >> 1) & 0xff);
break;
case BT848SCBUF: /* set Clear-Buffer-on-start flag */
bktr->clr_on_start = (*(int *)arg != 0);
break;
case BT848GCBUF: /* get Clear-Buffer-on-start flag */
*(int *)arg = (int) bktr->clr_on_start;
break;
case METEORSSIGNAL:
sig = *(int *)arg;
/* Historically, applications used METEOR_SIG_MODE_MASK
* to reset signal delivery.
*/
if (sig == METEOR_SIG_MODE_MASK)
sig = 0;
if (sig < 0 || sig > _SIG_MAXSIG)
return (EINVAL);
bktr->signal = sig;
bktr->proc = sig ? td->td_proc : NULL;
break;
case METEORGSIGNAL:
*(int *)arg = bktr->signal;
break;
case METEORCAPTUR:
temp = bktr->flags;
switch (*(int *) arg) {
case METEOR_CAP_SINGLE:
if (bktr->bigbuf==0) /* no frame buffer allocated */
return( ENOMEM );
/* already capturing */
if (temp & METEOR_CAP_MASK)
return( EIO );
start_capture(bktr, METEOR_SINGLE);
/* wait for capture to complete */
OUTL(bktr, BKTR_INT_STAT, ALL_INTS_CLEARED);
OUTW(bktr, BKTR_GPIO_DMA_CTL, FIFO_ENABLED);
OUTW(bktr, BKTR_GPIO_DMA_CTL, bktr->capcontrol);
OUTL(bktr, BKTR_INT_MASK, BT848_INT_MYSTERYBIT |
BT848_INT_RISCI |
BT848_INT_VSYNC |
BT848_INT_FMTCHG);
OUTB(bktr, BKTR_CAP_CTL, bktr->bktr_cap_ctl);
error = tsleep(BKTR_SLEEP, BKTRPRI, "captur", hz);
if (error && (error != ERESTART)) {
/* Here if we didn't get a complete frame */
#ifdef DIAGNOSTIC
printf( "%s: ioctl: tsleep error %d %x\n",
bktr_name(bktr), error,
INL(bktr, BKTR_RISC_COUNT));
#endif
/* stop dma */
OUTL(bktr, BKTR_INT_MASK, ALL_INTS_DISABLED);
/* disable risc, leave fifo running */
OUTW(bktr, BKTR_GPIO_DMA_CTL, FIFO_ENABLED);
}
bktr->flags &= ~(METEOR_SINGLE|METEOR_WANT_MASK);
/* FIXME: should we set bt848->int_stat ??? */
break;
case METEOR_CAP_CONTINOUS:
if (bktr->bigbuf==0) /* no frame buffer allocated */
return( ENOMEM );
/* already capturing */
if (temp & METEOR_CAP_MASK)
return( EIO );
start_capture(bktr, METEOR_CONTIN);
/* Clear the interrupt status register */
OUTL(bktr, BKTR_INT_STAT, INL(bktr, BKTR_INT_STAT));
OUTW(bktr, BKTR_GPIO_DMA_CTL, FIFO_ENABLED);
OUTW(bktr, BKTR_GPIO_DMA_CTL, bktr->capcontrol);
OUTB(bktr, BKTR_CAP_CTL, bktr->bktr_cap_ctl);
OUTL(bktr, BKTR_INT_MASK, BT848_INT_MYSTERYBIT |
BT848_INT_RISCI |
BT848_INT_VSYNC |
BT848_INT_FMTCHG);
#ifdef BT848_DUMP
dump_bt848( bktr );
#endif
break;
case METEOR_CAP_STOP_CONT:
if (bktr->flags & METEOR_CONTIN) {
/* turn off capture */
OUTW(bktr, BKTR_GPIO_DMA_CTL, FIFO_RISC_DISABLED);
OUTB(bktr, BKTR_CAP_CTL, CAPTURE_OFF);
OUTL(bktr, BKTR_INT_MASK, ALL_INTS_DISABLED);
bktr->flags &=
~(METEOR_CONTIN | METEOR_WANT_MASK);
}
}
break;
case METEORSETGEO:
/* can't change parameters while capturing */
if (bktr->flags & METEOR_CAP_MASK)
return( EBUSY );
geo = (struct meteor_geomet *) arg;
error = 0;
/* Either even or odd; to capture both even & odd fields, both flags are zero */
if ((geo->oformat & METEOR_GEO_ODD_ONLY) &&
(geo->oformat & METEOR_GEO_EVEN_ONLY)) {
printf( "%s: ioctl: Geometry odd or even only.\n",
bktr_name(bktr));
return( EINVAL );
}
/* set/clear even/odd flags */
if (geo->oformat & METEOR_GEO_ODD_ONLY)
bktr->flags |= METEOR_ONLY_ODD_FIELDS;
else
bktr->flags &= ~METEOR_ONLY_ODD_FIELDS;
if (geo->oformat & METEOR_GEO_EVEN_ONLY)
bktr->flags |= METEOR_ONLY_EVEN_FIELDS;
else
bktr->flags &= ~METEOR_ONLY_EVEN_FIELDS;
if (geo->columns <= 0) {
printf(
"%s: ioctl: %d: columns must be greater than zero.\n",
bktr_name(bktr), geo->columns);
error = EINVAL;
}
else if ((geo->columns & 0x3fe) != geo->columns) {
printf(
"%s: ioctl: %d: columns too large or not even.\n",
bktr_name(bktr), geo->columns);
error = EINVAL;
}
if (geo->rows <= 0) {
printf(
"%s: ioctl: %d: rows must be greater than zero.\n",
bktr_name(bktr), geo->rows);
error = EINVAL;
}
else if (((geo->rows & 0x7fe) != geo->rows) ||
((geo->oformat & METEOR_GEO_FIELD_MASK) &&
((geo->rows & 0x3fe) != geo->rows)) ) {
printf(
"%s: ioctl: %d: rows too large or not even.\n",
bktr_name(bktr), geo->rows);
error = EINVAL;
}
if (geo->frames > 32) {
printf("%s: ioctl: too many frames.\n",
bktr_name(bktr));
error = EINVAL;
}
if (error)
return( error );
bktr->dma_prog_loaded = FALSE;
OUTW(bktr, BKTR_GPIO_DMA_CTL, FIFO_RISC_DISABLED);
OUTL(bktr, BKTR_INT_MASK, ALL_INTS_DISABLED);
if ((temp=(geo->rows * geo->columns * geo->frames * 2))) {
if (geo->oformat & METEOR_GEO_RGB24) temp = temp * 2;
/* meteor_mem structure for SYNC Capture */
if (geo->frames > 1) temp += PAGE_SIZE;
temp = btoc(temp);
if ((int) temp > bktr->alloc_pages
&& bktr->video.addr == 0) {
/*****************************/
/* *** OS Dependent code *** */
/*****************************/
#if defined(__NetBSD__) || defined(__OpenBSD__)
bus_dmamap_t dmamap;
buf = get_bktr_mem(bktr, &dmamap,
temp * PAGE_SIZE);
if (buf != 0) {
free_bktr_mem(bktr, bktr->dm_mem,
bktr->bigbuf);
bktr->dm_mem = dmamap;
#else
buf = get_bktr_mem(unit, temp*PAGE_SIZE);
if (buf != 0) {
kmem_free(kernel_map, bktr->bigbuf,
(bktr->alloc_pages * PAGE_SIZE));
#endif
bktr->bigbuf = buf;
bktr->alloc_pages = temp;
if (bootverbose)
printf("%s: ioctl: Allocating %d bytes\n",
bktr_name(bktr), (int)(temp*PAGE_SIZE));
}
else
error = ENOMEM;
}
}
if (error)
return error;
bktr->rows = geo->rows;
bktr->cols = geo->columns;
bktr->frames = geo->frames;
/* Pixel format (if in meteor pixfmt compatibility mode) */
if ( bktr->pixfmt_compat ) {
bktr->format = METEOR_GEO_YUV_422;
switch (geo->oformat & METEOR_GEO_OUTPUT_MASK) {
case 0: /* default */
case METEOR_GEO_RGB16:
bktr->format = METEOR_GEO_RGB16;
break;
case METEOR_GEO_RGB24:
bktr->format = METEOR_GEO_RGB24;
break;
case METEOR_GEO_YUV_422:
bktr->format = METEOR_GEO_YUV_422;
if (geo->oformat & METEOR_GEO_YUV_12)
bktr->format = METEOR_GEO_YUV_12;
break;
case METEOR_GEO_YUV_PACKED:
bktr->format = METEOR_GEO_YUV_PACKED;
break;
}
bktr->pixfmt = oformat_meteor_to_bt( bktr->format );
}
if (bktr->flags & METEOR_CAP_MASK) {
if (bktr->flags & (METEOR_CONTIN|METEOR_SYNCAP)) {
switch(bktr->flags & METEOR_ONLY_FIELDS_MASK) {
case METEOR_ONLY_ODD_FIELDS:
bktr->flags |= METEOR_WANT_ODD;
break;
case METEOR_ONLY_EVEN_FIELDS:
bktr->flags |= METEOR_WANT_EVEN;
break;
default:
bktr->flags |= METEOR_WANT_MASK;
break;
}
start_capture(bktr, METEOR_CONTIN);
OUTL(bktr, BKTR_INT_STAT, INL(bktr, BKTR_INT_STAT));
OUTW(bktr, BKTR_GPIO_DMA_CTL, FIFO_ENABLED);
OUTW(bktr, BKTR_GPIO_DMA_CTL, bktr->capcontrol);
OUTL(bktr, BKTR_INT_MASK, BT848_INT_MYSTERYBIT |
BT848_INT_VSYNC |
BT848_INT_FMTCHG);
}
}
break;
/* end of METEORSETGEO */
/* FIXME. The Capture Area currently has the following restrictions:
GENERAL
y_offset may need to be even in interlaced modes
RGB24 - Interlaced mode
x_size must be greater than or equal to 1.666*METEORSETGEO width (cols)
y_size must be greater than or equal to METEORSETGEO height (rows)
RGB24 - Even Only (or Odd Only) mode
x_size must be greater than or equal to 1.666*METEORSETGEO width (cols)
y_size must be greater than or equal to 2*METEORSETGEO height (rows)
YUV12 - Interlaced mode
x_size must be greater than or equal to METEORSETGEO width (cols)
y_size must be greater than or equal to METEORSETGEO height (rows)
YUV12 - Even Only (or Odd Only) mode
x_size must be greater than or equal to METEORSETGEO width (cols)
y_size must be greater than or equal to 2*METEORSETGEO height (rows)
*/
case BT848_SCAPAREA: /* set capture area of each video frame */
/* can't change parameters while capturing */
if (bktr->flags & METEOR_CAP_MASK)
return( EBUSY );
cap_area = (struct bktr_capture_area *) arg;
bktr->capture_area_x_offset = cap_area->x_offset;
bktr->capture_area_y_offset = cap_area->y_offset;
bktr->capture_area_x_size = cap_area->x_size;
bktr->capture_area_y_size = cap_area->y_size;
bktr->capture_area_enabled = TRUE;
bktr->dma_prog_loaded = FALSE;
break;
case BT848_GCAPAREA: /* get capture area of each video frame */
cap_area = (struct bktr_capture_area *) arg;
if (bktr->capture_area_enabled == FALSE) {
cap_area->x_offset = 0;
cap_area->y_offset = 0;
cap_area->x_size = format_params[
bktr->format_params].scaled_hactive;
cap_area->y_size = format_params[
bktr->format_params].vactive;
} else {
cap_area->x_offset = bktr->capture_area_x_offset;
cap_area->y_offset = bktr->capture_area_y_offset;
cap_area->x_size = bktr->capture_area_x_size;
cap_area->y_size = bktr->capture_area_y_size;
}
break;
default:
return common_ioctl( bktr, cmd, arg );
}
return( 0 );
}
/*
* tuner ioctls
*/
int
tuner_ioctl( bktr_ptr_t bktr, int unit, ioctl_cmd_t cmd, caddr_t arg, struct thread* td )
{
int tmp_int;
unsigned int temp, temp1;
int offset;
int count;
u_char *buf;
u_long par;
u_char write;
int i2c_addr;
int i2c_port;
u_long data;
switch ( cmd ) {
case REMOTE_GETKEY:
/* Read the last key pressed by the Remote Control */
if (bktr->remote_control == 0) return (EINVAL);
remote_read(bktr, (struct bktr_remote *)arg);
break;
#if defined( TUNER_AFC )
case TVTUNER_SETAFC:
bktr->tuner.afc = (*(int *)arg != 0);
break;
case TVTUNER_GETAFC:
*(int *)arg = bktr->tuner.afc;
/* XXX Perhaps use another bit to indicate AFC success? */
break;
#endif /* TUNER_AFC */
case TVTUNER_SETCHNL:
temp_mute( bktr, TRUE );
temp = tv_channel( bktr, (int)*(unsigned long *)arg );
if ( temp < 0 ) {
temp_mute( bktr, FALSE );
return( EINVAL );
}
*(unsigned long *)arg = temp;
/* after every channel change, we must restart the MSP34xx */
/* audio chip to reselect NICAM STEREO or MONO audio */
if ( bktr->card.msp3400c )
msp_autodetect( bktr );
/* after every channel change, we must restart the DPL35xx */
if ( bktr->card.dpl3518a )
dpl_autodetect( bktr );
temp_mute( bktr, FALSE );
break;
case TVTUNER_GETCHNL:
*(unsigned long *)arg = bktr->tuner.channel;
break;
case TVTUNER_SETTYPE:
temp = *(unsigned long *)arg;
if ( (temp < CHNLSET_MIN) || (temp > CHNLSET_MAX) )
return( EINVAL );
bktr->tuner.chnlset = temp;
break;
case TVTUNER_GETTYPE:
*(unsigned long *)arg = bktr->tuner.chnlset;
break;
case TVTUNER_GETSTATUS:
temp = get_tuner_status( bktr );
*(unsigned long *)arg = temp & 0xff;
break;
case TVTUNER_SETFREQ:
temp_mute( bktr, TRUE );
temp = tv_freq( bktr, (int)*(unsigned long *)arg, TV_FREQUENCY);
temp_mute( bktr, FALSE );
if ( temp < 0 ) {
temp_mute( bktr, FALSE );
return( EINVAL );
}
*(unsigned long *)arg = temp;
/* after every channel change, we must restart the MSP34xx */
/* audio chip to reselect NICAM STEREO or MONO audio */
if ( bktr->card.msp3400c )
msp_autodetect( bktr );
/* after every channel change, we must restart the DPL35xx */
if ( bktr->card.dpl3518a )
dpl_autodetect( bktr );
temp_mute( bktr, FALSE );
break;
case TVTUNER_GETFREQ:
*(unsigned long *)arg = bktr->tuner.frequency;
break;
case TVTUNER_GETCHNLSET:
return tuner_getchnlset((struct bktr_chnlset *)arg);
case BT848_SAUDIO: /* set audio channel */
if ( set_audio( bktr, *(int*)arg ) < 0 )
return( EIO );
break;
/* hue is a 2's complement number, -90' to +89.3' in 0.7' steps */
case BT848_SHUE: /* set hue */
OUTB(bktr, BKTR_HUE, (u_char)(*(int*)arg & 0xff));
break;
case BT848_GHUE: /* get hue */
*(int*)arg = (signed char)(INB(bktr, BKTR_HUE) & 0xff);
break;
/* brightness is a 2's complement #, -50% to +49.6% in 0.39% steps */
case BT848_SBRIG: /* set brightness */
OUTB(bktr, BKTR_BRIGHT, (u_char)(*(int *)arg & 0xff));
break;
case BT848_GBRIG: /* get brightness */
*(int *)arg = (signed char)(INB(bktr, BKTR_BRIGHT) & 0xff);
break;
/* */
case BT848_SCSAT: /* set chroma saturation */
tmp_int = *(int*)arg;
temp = INB(bktr, BKTR_E_CONTROL);
temp1 = INB(bktr, BKTR_O_CONTROL);
if ( tmp_int & BIT_EIGHT_HIGH ) {
temp |= (BT848_E_CONTROL_SAT_U_MSB |
BT848_E_CONTROL_SAT_V_MSB);
temp1 |= (BT848_O_CONTROL_SAT_U_MSB |
BT848_O_CONTROL_SAT_V_MSB);
}
else {
temp &= ~(BT848_E_CONTROL_SAT_U_MSB |
BT848_E_CONTROL_SAT_V_MSB);
temp1 &= ~(BT848_O_CONTROL_SAT_U_MSB |
BT848_O_CONTROL_SAT_V_MSB);
}
OUTB(bktr, BKTR_SAT_U_LO, (u_char)(tmp_int & 0xff));
OUTB(bktr, BKTR_SAT_V_LO, (u_char)(tmp_int & 0xff));
OUTB(bktr, BKTR_E_CONTROL, temp);
OUTB(bktr, BKTR_O_CONTROL, temp1);
break;
case BT848_GCSAT: /* get chroma saturation */
tmp_int = (int)(INB(bktr, BKTR_SAT_V_LO) & 0xff);
if ( INB(bktr, BKTR_E_CONTROL) & BT848_E_CONTROL_SAT_V_MSB )
tmp_int |= BIT_EIGHT_HIGH;
*(int*)arg = tmp_int;
break;
/* */
case BT848_SVSAT: /* set chroma V saturation */
tmp_int = *(int*)arg;
temp = INB(bktr, BKTR_E_CONTROL);
temp1 = INB(bktr, BKTR_O_CONTROL);
if ( tmp_int & BIT_EIGHT_HIGH) {
temp |= BT848_E_CONTROL_SAT_V_MSB;
temp1 |= BT848_O_CONTROL_SAT_V_MSB;
}
else {
temp &= ~BT848_E_CONTROL_SAT_V_MSB;
temp1 &= ~BT848_O_CONTROL_SAT_V_MSB;
}
OUTB(bktr, BKTR_SAT_V_LO, (u_char)(tmp_int & 0xff));
OUTB(bktr, BKTR_E_CONTROL, temp);
OUTB(bktr, BKTR_O_CONTROL, temp1);
break;
case BT848_GVSAT: /* get chroma V saturation */
tmp_int = (int)INB(bktr, BKTR_SAT_V_LO) & 0xff;
if ( INB(bktr, BKTR_E_CONTROL) & BT848_E_CONTROL_SAT_V_MSB )
tmp_int |= BIT_EIGHT_HIGH;
*(int*)arg = tmp_int;
break;
/* */
case BT848_SUSAT: /* set chroma U saturation */
tmp_int = *(int*)arg;
temp = INB(bktr, BKTR_E_CONTROL);
temp1 = INB(bktr, BKTR_O_CONTROL);
if ( tmp_int & BIT_EIGHT_HIGH ) {
temp |= BT848_E_CONTROL_SAT_U_MSB;
temp1 |= BT848_O_CONTROL_SAT_U_MSB;
}
else {
temp &= ~BT848_E_CONTROL_SAT_U_MSB;
temp1 &= ~BT848_O_CONTROL_SAT_U_MSB;
}
OUTB(bktr, BKTR_SAT_U_LO, (u_char)(tmp_int & 0xff));
OUTB(bktr, BKTR_E_CONTROL, temp);
OUTB(bktr, BKTR_O_CONTROL, temp1);
break;
case BT848_GUSAT: /* get chroma U saturation */
tmp_int = (int)INB(bktr, BKTR_SAT_U_LO) & 0xff;
if ( INB(bktr, BKTR_E_CONTROL) & BT848_E_CONTROL_SAT_U_MSB )
tmp_int |= BIT_EIGHT_HIGH;
*(int*)arg = tmp_int;
break;
/* lr 970528 luma notch etc - 3 high bits of e_control/o_control */
case BT848_SLNOTCH: /* set luma notch */
tmp_int = (*(int *)arg & 0x7) << 5 ;
OUTB(bktr, BKTR_E_CONTROL, INB(bktr, BKTR_E_CONTROL) & ~0xe0);
OUTB(bktr, BKTR_O_CONTROL, INB(bktr, BKTR_O_CONTROL) & ~0xe0);
OUTB(bktr, BKTR_E_CONTROL, INB(bktr, BKTR_E_CONTROL) | tmp_int);
OUTB(bktr, BKTR_O_CONTROL, INB(bktr, BKTR_O_CONTROL) | tmp_int);
break;
case BT848_GLNOTCH: /* get luma notch */
*(int *)arg = (int) ( (INB(bktr, BKTR_E_CONTROL) & 0xe0) >> 5) ;
break;
/* */
case BT848_SCONT: /* set contrast */
tmp_int = *(int*)arg;
temp = INB(bktr, BKTR_E_CONTROL);
temp1 = INB(bktr, BKTR_O_CONTROL);
if ( tmp_int & BIT_EIGHT_HIGH ) {
temp |= BT848_E_CONTROL_CON_MSB;
temp1 |= BT848_O_CONTROL_CON_MSB;
}
else {
temp &= ~BT848_E_CONTROL_CON_MSB;
temp1 &= ~BT848_O_CONTROL_CON_MSB;
}
OUTB(bktr, BKTR_CONTRAST_LO, (u_char)(tmp_int & 0xff));
OUTB(bktr, BKTR_E_CONTROL, temp);
OUTB(bktr, BKTR_O_CONTROL, temp1);
break;
case BT848_GCONT: /* get contrast */
tmp_int = (int)INB(bktr, BKTR_CONTRAST_LO) & 0xff;
if ( INB(bktr, BKTR_E_CONTROL) & BT848_E_CONTROL_CON_MSB )
tmp_int |= BIT_EIGHT_HIGH;
*(int*)arg = tmp_int;
break;
/* FIXME: SCBARS and CCBARS require a valid int * */
/* argument to succeed, but it's not used; consider */
/* using the arg to store the on/off state so */
/* there's only one ioctl() needed to turn cbars on/off */
case BT848_SCBARS: /* set colorbar output */
OUTB(bktr, BKTR_COLOR_CTL, INB(bktr, BKTR_COLOR_CTL) | BT848_COLOR_CTL_COLOR_BARS);
break;
case BT848_CCBARS: /* clear colorbar output */
OUTB(bktr, BKTR_COLOR_CTL, INB(bktr, BKTR_COLOR_CTL) & ~(BT848_COLOR_CTL_COLOR_BARS));
break;
case BT848_GAUDIO: /* get audio channel */
temp = bktr->audio_mux_select;
if ( bktr->audio_mute_state == TRUE )
temp |= AUDIO_MUTE;
*(int*)arg = temp;
break;
case BT848_SBTSC: /* set audio channel */
if ( set_BTSC( bktr, *(int*)arg ) < 0 )
return( EIO );
break;
case BT848_WEEPROM: /* write eeprom */
offset = (((struct eeProm *)arg)->offset);
count = (((struct eeProm *)arg)->count);
buf = &(((struct eeProm *)arg)->bytes[ 0 ]);
if ( writeEEProm( bktr, offset, count, buf ) < 0 )
return( EIO );
break;
case BT848_REEPROM: /* read eeprom */
offset = (((struct eeProm *)arg)->offset);
count = (((struct eeProm *)arg)->count);
buf = &(((struct eeProm *)arg)->bytes[ 0 ]);
if ( readEEProm( bktr, offset, count, buf ) < 0 )
return( EIO );
break;
case BT848_SIGNATURE:
offset = (((struct eeProm *)arg)->offset);
count = (((struct eeProm *)arg)->count);
buf = &(((struct eeProm *)arg)->bytes[ 0 ]);
if ( signCard( bktr, offset, count, buf ) < 0 )
return( EIO );
break;
/* Ioctl's for direct gpio access */
#ifdef BKTR_GPIO_ACCESS
case BT848_GPIO_GET_EN:
*(int*)arg = INL(bktr, BKTR_GPIO_OUT_EN);
break;
case BT848_GPIO_SET_EN:
OUTL(bktr, BKTR_GPIO_OUT_EN, *(int*)arg);
break;
case BT848_GPIO_GET_DATA:
*(int*)arg = INL(bktr, BKTR_GPIO_DATA);
break;
case BT848_GPIO_SET_DATA:
OUTL(bktr, BKTR_GPIO_DATA, *(int*)arg);
break;
#endif /* BKTR_GPIO_ACCESS */
/* Ioctl's for running the tuner device in radio mode */
case RADIO_GETMODE:
*(unsigned char *)arg = bktr->tuner.radio_mode;
break;
case RADIO_SETMODE:
bktr->tuner.radio_mode = *(unsigned char *)arg;
break;
case RADIO_GETFREQ:
*(unsigned long *)arg = bktr->tuner.frequency;
break;
case RADIO_SETFREQ:
/* The argument to this ioctl is NOT freq*16. It is
** freq*100.
*/
temp=(int)*(unsigned long *)arg;
#ifdef BKTR_RADIO_DEBUG
printf("%s: arg=%d temp=%d\n", bktr_name(bktr),
(int)*(unsigned long *)arg, temp);
#endif
#ifndef BKTR_RADIO_NOFREQCHECK
/* According to the spec sheet, the band 87.5 MHz-108 MHz */
/* is supported. */
if(temp<8750 || temp>10800) {
printf("%s: Radio frequency out of range\n", bktr_name(bktr));
return(EINVAL);
}
#endif
temp_mute( bktr, TRUE );
temp = tv_freq( bktr, temp, FM_RADIO_FREQUENCY );
temp_mute( bktr, FALSE );
#ifdef BKTR_RADIO_DEBUG
if(temp)
printf("%s: tv_freq returned: %d\n", bktr_name(bktr), temp);
#endif
if ( temp < 0 )
return( EINVAL );
*(unsigned long *)arg = temp;
break;
/* Luigi's I2CWR ioctl */
case BT848_I2CWR:
par = *(u_long *)arg;
write = (par >> 24) & 0xff ;
i2c_addr = (par >> 16) & 0xff ;
i2c_port = (par >> 8) & 0xff ;
data = (par) & 0xff ;
if (write) {
i2cWrite( bktr, i2c_addr, i2c_port, data);
} else {
data = i2cRead( bktr, i2c_addr);
}
*(u_long *)arg = (par & 0xffffff00) | ( data & 0xff );
break;
#ifdef BT848_MSP_READ
/* I2C ioctls to allow userland access to the MSP chip */
case BT848_MSP_READ:
{
struct bktr_msp_control *msp;
msp = (struct bktr_msp_control *) arg;
msp->data = msp_dpl_read(bktr, bktr->msp_addr,
msp->function, msp->address);
break;
}
case BT848_MSP_WRITE:
{
struct bktr_msp_control *msp;
msp = (struct bktr_msp_control *) arg;
msp_dpl_write(bktr, bktr->msp_addr, msp->function,
msp->address, msp->data );
break;
}
case BT848_MSP_RESET:
msp_dpl_reset(bktr, bktr->msp_addr);
break;
#endif
default:
return common_ioctl( bktr, cmd, arg );
}
return( 0 );
}
/*
* common ioctls
*/
static int
common_ioctl( bktr_ptr_t bktr, ioctl_cmd_t cmd, caddr_t arg )
{
int pixfmt;
unsigned int temp;
struct meteor_pixfmt *pf_pub;
switch (cmd) {
case METEORSINPUT: /* set input device */
/*Bt848 has 3 MUX Inputs. Bt848A/849A/878/879 has 4 MUX Inputs*/
/* On the original bt848 boards, */
/* Tuner is MUX0, RCA is MUX1, S-Video is MUX2 */
/* On the Hauppauge bt878 boards, */
/* Tuner is MUX0, RCA is MUX3 */
/* Unfortunately the Meteor driver codes DEV_RCA as DEV_0, so we */
/* stick with this scheme in our Meteor emulation */
switch(*(unsigned long *)arg & METEOR_DEV_MASK) {
/* this is the RCA video input */
case 0: /* default */
case METEOR_INPUT_DEV0:
/* METEOR_INPUT_DEV_RCA: */
bktr->flags = (bktr->flags & ~METEOR_DEV_MASK)
| METEOR_DEV0;
OUTB(bktr, BKTR_IFORM, INB(bktr, BKTR_IFORM)
& ~BT848_IFORM_MUXSEL);
/* work around for new Hauppauge 878 cards */
if ((bktr->card.card_id == CARD_HAUPPAUGE) &&
(bktr->id==BROOKTREE_878 ||
bktr->id==BROOKTREE_879) )
OUTB(bktr, BKTR_IFORM, INB(bktr, BKTR_IFORM) | BT848_IFORM_M_MUX3);
else
OUTB(bktr, BKTR_IFORM, INB(bktr, BKTR_IFORM) | BT848_IFORM_M_MUX1);
OUTB(bktr, BKTR_E_CONTROL, INB(bktr, BKTR_E_CONTROL) & ~BT848_E_CONTROL_COMP);
OUTB(bktr, BKTR_O_CONTROL, INB(bktr, BKTR_O_CONTROL) & ~BT848_O_CONTROL_COMP);
set_audio( bktr, AUDIO_EXTERN );
break;
/* this is the tuner input */
case METEOR_INPUT_DEV1:
bktr->flags = (bktr->flags & ~METEOR_DEV_MASK)
| METEOR_DEV1;
OUTB(bktr, BKTR_IFORM, INB(bktr, BKTR_IFORM) & ~BT848_IFORM_MUXSEL);
OUTB(bktr, BKTR_IFORM, INB(bktr, BKTR_IFORM) | BT848_IFORM_M_MUX0);
OUTB(bktr, BKTR_E_CONTROL, INB(bktr, BKTR_E_CONTROL) & ~BT848_E_CONTROL_COMP);
OUTB(bktr, BKTR_O_CONTROL, INB(bktr, BKTR_O_CONTROL) & ~BT848_O_CONTROL_COMP);
set_audio( bktr, AUDIO_TUNER );
break;
/* this is the S-VHS input, but with a composite camera */
case METEOR_INPUT_DEV2:
bktr->flags = (bktr->flags & ~METEOR_DEV_MASK)
| METEOR_DEV2;
OUTB(bktr, BKTR_IFORM, INB(bktr, BKTR_IFORM) & ~BT848_IFORM_MUXSEL);
OUTB(bktr, BKTR_IFORM, INB(bktr, BKTR_IFORM) | BT848_IFORM_M_MUX2);
OUTB(bktr, BKTR_E_CONTROL, INB(bktr, BKTR_E_CONTROL) & ~BT848_E_CONTROL_COMP);
OUTB(bktr, BKTR_O_CONTROL, INB(bktr, BKTR_O_CONTROL) & ~BT848_O_CONTROL_COMP);
set_audio( bktr, AUDIO_EXTERN );
break;
/* this is the S-VHS input */
case METEOR_INPUT_DEV_SVIDEO:
bktr->flags = (bktr->flags & ~METEOR_DEV_MASK)
| METEOR_DEV_SVIDEO;
OUTB(bktr, BKTR_IFORM, INB(bktr, BKTR_IFORM) & ~BT848_IFORM_MUXSEL);
OUTB(bktr, BKTR_IFORM, INB(bktr, BKTR_IFORM) | BT848_IFORM_M_MUX2);
OUTB(bktr, BKTR_E_CONTROL, INB(bktr, BKTR_E_CONTROL) | BT848_E_CONTROL_COMP);
OUTB(bktr, BKTR_O_CONTROL, INB(bktr, BKTR_O_CONTROL) | BT848_O_CONTROL_COMP);
set_audio( bktr, AUDIO_EXTERN );
break;
case METEOR_INPUT_DEV3:
if ((bktr->id == BROOKTREE_848A) ||
(bktr->id == BROOKTREE_849A) ||
(bktr->id == BROOKTREE_878) ||
(bktr->id == BROOKTREE_879) ) {
bktr->flags = (bktr->flags & ~METEOR_DEV_MASK)
| METEOR_DEV3;
OUTB(bktr, BKTR_IFORM, INB(bktr, BKTR_IFORM) & ~BT848_IFORM_MUXSEL);
/* work around for new Hauppauge 878 cards */
if ((bktr->card.card_id == CARD_HAUPPAUGE) &&
(bktr->id==BROOKTREE_878 ||
bktr->id==BROOKTREE_879) )
OUTB(bktr, BKTR_IFORM, INB(bktr, BKTR_IFORM) | BT848_IFORM_M_MUX1);
else
OUTB(bktr, BKTR_IFORM, INB(bktr, BKTR_IFORM) | BT848_IFORM_M_MUX3);
OUTB(bktr, BKTR_E_CONTROL, INB(bktr, BKTR_E_CONTROL) & ~BT848_E_CONTROL_COMP);
OUTB(bktr, BKTR_O_CONTROL, INB(bktr, BKTR_O_CONTROL) & ~BT848_O_CONTROL_COMP);
set_audio( bktr, AUDIO_EXTERN );
break;
}
default:
return( EINVAL );
}
break;
case METEORGINPUT: /* get input device */
*(u_long *)arg = bktr->flags & METEOR_DEV_MASK;
break;
case METEORSACTPIXFMT:
if (( *(int *)arg < 0 ) ||
( *(int *)arg >= PIXFMT_TABLE_SIZE ))
return( EINVAL );
bktr->pixfmt = *(int *)arg;
OUTB(bktr, BKTR_COLOR_CTL, (INB(bktr, BKTR_COLOR_CTL) & 0xf0)
| pixfmt_swap_flags( bktr->pixfmt ));
bktr->pixfmt_compat = FALSE;
break;
case METEORGACTPIXFMT:
*(int *)arg = bktr->pixfmt;
break;
case METEORGSUPPIXFMT :
pf_pub = (struct meteor_pixfmt *)arg;
pixfmt = pf_pub->index;
if (( pixfmt < 0 ) || ( pixfmt >= PIXFMT_TABLE_SIZE ))
return( EINVAL );
memcpy( pf_pub, &pixfmt_table[ pixfmt ].public,
sizeof( *pf_pub ) );
/* Patch in our format index */
pf_pub->index = pixfmt;
break;
#if defined( STATUS_SUM )
case BT848_GSTATUS: /* reap status */
{
DECLARE_INTR_MASK(s);
DISABLE_INTR(s);
temp = status_sum;
status_sum = 0;
ENABLE_INTR(s);
*(u_int*)arg = temp;
break;
}
#endif /* STATUS_SUM */
default:
return( ENOTTY );
}
return( 0 );
}
/******************************************************************************
* bt848 RISC programming routines:
*/
/*
*
*/
#ifdef BT848_DUMP
static int
dump_bt848( bktr_ptr_t bktr )
{
int r[60]={
4, 8, 0xc, 0x8c, 0x10, 0x90, 0x14, 0x94,
0x18, 0x98, 0x1c, 0x9c, 0x20, 0xa0, 0x24, 0xa4,
0x28, 0x2c, 0xac, 0x30, 0x34, 0x38, 0x3c, 0x40,
0xc0, 0x48, 0x4c, 0xcc, 0x50, 0xd0, 0xd4, 0x60,
0x64, 0x68, 0x6c, 0xec, 0xd8, 0xdc, 0xe0, 0xe4,
0, 0, 0, 0
};
int i;
for (i = 0; i < 40; i+=4) {
printf("%s: Reg:value : \t%x:%x \t%x:%x \t %x:%x \t %x:%x\n",
bktr_name(bktr),
r[i], INL(bktr, r[i]),
r[i+1], INL(bktr, r[i+1]),
r[i+2], INL(bktr, r[i+2]),
r[i+3], INL(bktr, r[i+3]));
}
printf("%s: INT STAT %x \n", bktr_name(bktr),
INL(bktr, BKTR_INT_STAT));
printf("%s: Reg INT_MASK %x \n", bktr_name(bktr),
INL(bktr, BKTR_INT_MASK));
printf("%s: Reg GPIO_DMA_CTL %x \n", bktr_name(bktr),
INW(bktr, BKTR_GPIO_DMA_CTL));
return( 0 );
}
#endif
/*
* build write instruction
*/
#define BKTR_FM1 0x6 /* packed data to follow */
#define BKTR_FM3 0xe /* planar data to follow */
#define BKTR_VRE 0x4 /* Marks the end of the even field */
#define BKTR_VRO 0xC /* Marks the end of the odd field */
#define BKTR_PXV 0x0 /* valid word (never used) */
#define BKTR_EOL 0x1 /* last dword, 4 bytes */
#define BKTR_SOL 0x2 /* first dword */
#define OP_WRITE (0x1 << 28)
#define OP_SKIP (0x2 << 28)
#define OP_WRITEC (0x5 << 28)
#define OP_JUMP (0x7 << 28)
#define OP_SYNC (0x8 << 28)
#define OP_WRITE123 (0x9 << 28)
#define OP_WRITES123 (0xb << 28)
#define OP_SOL (1 << 27) /* first instr for scanline */
#define OP_EOL (1 << 26)
#define BKTR_RESYNC (1 << 15)
#define BKTR_GEN_IRQ (1 << 24)
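/*
 * Sketch of how these opcodes combine (illustrative only): for a short,
 * unclipped scanline of 'cols' pixels at 'Bpp' bytes per pixel, split()
 * below emits a single RISC write of the form
 *	*dma_prog++ = OP_WRITE | OP_SOL | OP_EOL | (cols * Bpp);
 *	*dma_prog++ = physical address of the target buffer;
 */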
/*
* The RISC status bits can be set/cleared in the RISC programs
* and tested in the Interrupt Handler
*/
#define BKTR_SET_RISC_STATUS_BIT0 (1 << 16)
#define BKTR_SET_RISC_STATUS_BIT1 (1 << 17)
#define BKTR_SET_RISC_STATUS_BIT2 (1 << 18)
#define BKTR_SET_RISC_STATUS_BIT3 (1 << 19)
#define BKTR_CLEAR_RISC_STATUS_BIT0 (1 << 20)
#define BKTR_CLEAR_RISC_STATUS_BIT1 (1 << 21)
#define BKTR_CLEAR_RISC_STATUS_BIT2 (1 << 22)
#define BKTR_CLEAR_RISC_STATUS_BIT3 (1 << 23)
#define BKTR_TEST_RISC_STATUS_BIT0 (1 << 28)
#define BKTR_TEST_RISC_STATUS_BIT1 (1 << 29)
#define BKTR_TEST_RISC_STATUS_BIT2 (1 << 30)
#define BKTR_TEST_RISC_STATUS_BIT3 (1 << 31)
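/*
 * Clipping helpers used by the RISC program builders below:
 * notclipped() reports whether the given scanline is untouched by any
 * clip rectangle, getline() walks the clip list to find the next
 * visible/clipped span on that line, and split() emits the RISC
 * write/skip instructions (splitting long transfers in two) for a span.
 */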
static bool_t notclipped (bktr_reg_t * bktr, int x, int width) {
int i;
bktr_clip_t * clip_node;
bktr->clip_start = -1;
bktr->last_y = 0;
bktr->y = 0;
bktr->y2 = width;
bktr->line_length = width;
bktr->yclip = -1;
bktr->yclip2 = -1;
bktr->current_col = 0;
if (bktr->max_clip_node == 0 ) return TRUE;
clip_node = (bktr_clip_t *) &bktr->clip_list[0];
for (i = 0; i < bktr->max_clip_node; i++ ) {
clip_node = (bktr_clip_t *) &bktr->clip_list[i];
if (x >= clip_node->x_min && x <= clip_node->x_max ) {
bktr->clip_start = i;
return FALSE;
}
}
return TRUE;
}
static bool_t getline(bktr_reg_t *bktr, int x ) {
int i, j;
bktr_clip_t * clip_node ;
if (bktr->line_length == 0 ||
bktr->current_col >= bktr->line_length) return FALSE;
bktr->y = min(bktr->last_y, bktr->line_length);
bktr->y2 = bktr->line_length;
bktr->yclip = bktr->yclip2 = -1;
for (i = bktr->clip_start; i < bktr->max_clip_node; i++ ) {
clip_node = (bktr_clip_t *) &bktr->clip_list[i];
if (x >= clip_node->x_min && x <= clip_node->x_max) {
if (bktr->last_y <= clip_node->y_min) {
bktr->y = min(bktr->last_y, bktr->line_length);
bktr->y2 = min(clip_node->y_min, bktr->line_length);
bktr->yclip = min(clip_node->y_min, bktr->line_length);
bktr->yclip2 = min(clip_node->y_max, bktr->line_length);
bktr->last_y = bktr->yclip2;
bktr->clip_start = i;
for (j = i+1; j < bktr->max_clip_node; j++ ) {
clip_node = (bktr_clip_t *) &bktr->clip_list[j];
if (x >= clip_node->x_min && x <= clip_node->x_max) {
if (bktr->last_y >= clip_node->y_min) {
bktr->yclip2 = min(clip_node->y_max, bktr->line_length);
bktr->last_y = bktr->yclip2;
bktr->clip_start = j;
}
} else break ;
}
return TRUE;
}
}
}
if (bktr->current_col <= bktr->line_length) {
bktr->current_col = bktr->line_length;
return TRUE;
}
return FALSE;
}
static bool_t split(bktr_reg_t * bktr, volatile uint32_t **dma_prog, int width ,
u_long operation, int pixel_width,
volatile u_char ** target_buffer, int cols ) {
u_long flag, flag2;
struct meteor_pixfmt *pf = &pixfmt_table[ bktr->pixfmt ].public;
u_int skip, start_skip;
/* For RGB24, we need to align the component in FIFO Byte Lane 0 */
/* to the 1st byte in the mem dword containing our start addr. */
/* BTW, we know this pixfmt's 1st byte is Blue; thus the start addr */
/* must be Blue. */
start_skip = 0;
if (( pf->type == METEOR_PIXTYPE_RGB ) && ( pf->Bpp == 3 ))
switch ( ((uintptr_t) (volatile void *) *target_buffer) % 4 ) {
case 2 : start_skip = 4 ; break;
case 1 : start_skip = 8 ; break;
}
if ((width * pixel_width) < DMA_BT848_SPLIT ) {
if ( width == cols) {
flag = OP_SOL | OP_EOL;
} else if (bktr->current_col == 0 ) {
flag = OP_SOL;
} else if (bktr->current_col == cols) {
flag = OP_EOL;
} else flag = 0;
skip = 0;
if (( flag & OP_SOL ) && ( start_skip > 0 )) {
*(*dma_prog)++ = OP_SKIP | OP_SOL | start_skip;
flag &= ~OP_SOL;
skip = start_skip;
}
*(*dma_prog)++ = operation | flag | (width * pixel_width - skip);
if (operation != OP_SKIP )
*(*dma_prog)++ = (uintptr_t) (volatile void *) *target_buffer;
*target_buffer += width * pixel_width;
bktr->current_col += width;
} else {
if (bktr->current_col == 0 && width == cols) {
flag = OP_SOL ;
flag2 = OP_EOL;
} else if (bktr->current_col == 0 ) {
flag = OP_SOL;
flag2 = 0;
} else if (bktr->current_col >= cols) {
flag = 0;
flag2 = OP_EOL;
} else {
flag = 0;
flag2 = 0;
}
skip = 0;
if (( flag & OP_SOL ) && ( start_skip > 0 )) {
*(*dma_prog)++ = OP_SKIP | OP_SOL | start_skip;
flag &= ~OP_SOL;
skip = start_skip;
}
*(*dma_prog)++ = operation | flag |
(width * pixel_width / 2 - skip);
if (operation != OP_SKIP )
*(*dma_prog)++ = (uintptr_t) (volatile void *) *target_buffer ;
*target_buffer += (width * pixel_width / 2) ;
if ( operation == OP_WRITE )
operation = OP_WRITEC;
*(*dma_prog)++ = operation | flag2 |
(width * pixel_width / 2);
*target_buffer += (width * pixel_width / 2) ;
bktr->current_col += width;
}
return TRUE;
}
/*
* Generate the RISC instructions to capture both VBI and video images
*/
static void
rgb_vbi_prog( bktr_ptr_t bktr, char i_flag, int cols, int rows, int interlace )
{
int i;
volatile uint32_t target_buffer, buffer, target,width;
volatile uint32_t pitch;
volatile uint32_t *dma_prog; /* DMA prog is an array of
32 bit RISC instructions */
volatile uint32_t *loop_point;
struct meteor_pixfmt_internal *pf_int = &pixfmt_table[ bktr->pixfmt ];
u_int Bpp = pf_int->public.Bpp;
unsigned int vbisamples; /* VBI samples per line */
unsigned int vbilines; /* VBI lines per field */
unsigned int num_dwords; /* DWORDS per line */
vbisamples = format_params[bktr->format_params].vbi_num_samples;
vbilines = format_params[bktr->format_params].vbi_num_lines;
num_dwords = vbisamples/4;
OUTB(bktr, BKTR_COLOR_FMT, pf_int->color_fmt);
OUTB(bktr, BKTR_ADC, SYNC_LEVEL);
OUTB(bktr, BKTR_VBI_PACK_SIZE, ((num_dwords)) & 0xff);
OUTB(bktr, BKTR_VBI_PACK_DEL, ((num_dwords)>> 8) & 0x01); /* no hdelay */
/* no ext frame */
OUTB(bktr, BKTR_OFORM, 0x00);
OUTB(bktr, BKTR_E_VSCALE_HI, INB(bktr, BKTR_E_VSCALE_HI) | 0x40); /* set chroma comb */
OUTB(bktr, BKTR_O_VSCALE_HI, INB(bktr, BKTR_O_VSCALE_HI) | 0x40);
OUTB(bktr, BKTR_E_VSCALE_HI, INB(bktr, BKTR_E_VSCALE_HI) & ~0x80); /* clear Ycomb */
OUTB(bktr, BKTR_O_VSCALE_HI, INB(bktr, BKTR_O_VSCALE_HI) & ~0x80);
/* disable gamma correction removal */
OUTB(bktr, BKTR_COLOR_CTL, INB(bktr, BKTR_COLOR_CTL) | BT848_COLOR_CTL_GAMMA);
if (cols > 385 ) {
OUTB(bktr, BKTR_E_VTC, 0);
OUTB(bktr, BKTR_O_VTC, 0);
} else {
OUTB(bktr, BKTR_E_VTC, 1);
OUTB(bktr, BKTR_O_VTC, 1);
}
bktr->capcontrol = 3 << 2 | 3;
dma_prog = (uint32_t *) bktr->dma_prog;
/* Construct Write */
if (bktr->video.addr) {
target_buffer = (u_long) bktr->video.addr;
pitch = bktr->video.width;
}
else {
target_buffer = (u_long) vtophys(bktr->bigbuf);
pitch = cols*Bpp;
}
buffer = target_buffer;
/* Wait for the VRE sync marking the end of the Even and
* the start of the Odd field. Resync here.
*/
*dma_prog++ = OP_SYNC | BKTR_RESYNC |BKTR_VRE;
*dma_prog++ = 0;
loop_point = dma_prog;
/* store the VBI data */
/* look for sync with packed data */
*dma_prog++ = OP_SYNC | BKTR_FM1;
*dma_prog++ = 0;
for(i = 0; i < vbilines; i++) {
*dma_prog++ = OP_WRITE | OP_SOL | OP_EOL | vbisamples;
*dma_prog++ = (u_long) vtophys((caddr_t)bktr->vbidata +
(i * VBI_LINE_SIZE));
}
if ( (i_flag == 2/*Odd*/) || (i_flag==3) /*interlaced*/ ) {
/* store the Odd field video image */
/* look for sync with packed data */
*dma_prog++ = OP_SYNC | BKTR_FM1;
*dma_prog++ = 0; /* NULL WORD */
width = cols;
for (i = 0; i < (rows/interlace); i++) {
target = target_buffer;
if ( notclipped(bktr, i, width)) {
split(bktr, (volatile uint32_t **) &dma_prog,
bktr->y2 - bktr->y, OP_WRITE,
Bpp, (volatile u_char **)(uintptr_t)&target, cols);
} else {
while(getline(bktr, i)) {
if (bktr->y != bktr->y2 ) {
split(bktr, (volatile uint32_t **) &dma_prog,
bktr->y2 - bktr->y, OP_WRITE,
Bpp, (volatile u_char **)(uintptr_t)&target, cols);
}
if (bktr->yclip != bktr->yclip2 ) {
split(bktr,(volatile uint32_t **) &dma_prog,
bktr->yclip2 - bktr->yclip,
OP_SKIP,
Bpp, (volatile u_char **)(uintptr_t)&target, cols);
}
}
}
target_buffer += interlace * pitch;
}
} /* end if */
/* Grab the Even field */
/* Look for the VRO, end of Odd field, marker */
*dma_prog++ = OP_SYNC | BKTR_GEN_IRQ | BKTR_RESYNC | BKTR_VRO;
*dma_prog++ = 0; /* NULL WORD */
/* store the VBI data */
/* look for sync with packed data */
*dma_prog++ = OP_SYNC | BKTR_FM1;
*dma_prog++ = 0;
for(i = 0; i < vbilines; i++) {
*dma_prog++ = OP_WRITE | OP_SOL | OP_EOL | vbisamples;
*dma_prog++ = (u_long) vtophys((caddr_t)bktr->vbidata +
((i+MAX_VBI_LINES) * VBI_LINE_SIZE));
}
/* store the video image */
if (i_flag == 1) /*Even Only*/
target_buffer = buffer;
if (i_flag == 3) /*interlaced*/
target_buffer = buffer+pitch;
if ((i_flag == 1) /*Even Only*/ || (i_flag==3) /*interlaced*/) {
/* look for sync with packed data */
*dma_prog++ = OP_SYNC | BKTR_FM1;
*dma_prog++ = 0; /* NULL WORD */
width = cols;
for (i = 0; i < (rows/interlace); i++) {
target = target_buffer;
if ( notclipped(bktr, i, width)) {
split(bktr, (volatile uint32_t **) &dma_prog,
bktr->y2 - bktr->y, OP_WRITE,
Bpp, (volatile u_char **)(uintptr_t)&target, cols);
} else {
while(getline(bktr, i)) {
if (bktr->y != bktr->y2 ) {
split(bktr, (volatile uint32_t **) &dma_prog,
bktr->y2 - bktr->y, OP_WRITE,
Bpp, (volatile u_char **)(uintptr_t)&target,
cols);
}
if (bktr->yclip != bktr->yclip2 ) {
split(bktr, (volatile uint32_t **) &dma_prog,
bktr->yclip2 - bktr->yclip, OP_SKIP,
Bpp, (volatile u_char **)(uintptr_t) &target, cols);
}
}
}
target_buffer += interlace * pitch;
}
}
/* Look for end of 'Even Field' */
*dma_prog++ = OP_SYNC | BKTR_GEN_IRQ | BKTR_RESYNC | BKTR_VRE;
*dma_prog++ = 0; /* NULL WORD */
*dma_prog++ = OP_JUMP ;
*dma_prog++ = (u_long ) vtophys(loop_point) ;
*dma_prog++ = 0; /* NULL WORD */
}
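/*
 * Generate the RISC instructions to capture an RGB video image
 * (without VBI data), honouring any clipping rectangles via
 * notclipped() and getline().
 */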
static void
rgb_prog( bktr_ptr_t bktr, char i_flag, int cols, int rows, int interlace )
{
int i;
volatile uint32_t target_buffer, buffer, target,width;
volatile uint32_t pitch;
volatile uint32_t *dma_prog;
struct meteor_pixfmt_internal *pf_int = &pixfmt_table[ bktr->pixfmt ];
u_int Bpp = pf_int->public.Bpp;
OUTB(bktr, BKTR_COLOR_FMT, pf_int->color_fmt);
OUTB(bktr, BKTR_VBI_PACK_SIZE, 0);
OUTB(bktr, BKTR_VBI_PACK_DEL, 0);
OUTB(bktr, BKTR_ADC, SYNC_LEVEL);
OUTB(bktr, BKTR_OFORM, 0x00);
OUTB(bktr, BKTR_E_VSCALE_HI, INB(bktr, BKTR_E_VSCALE_HI) | 0x40); /* set chroma comb */
OUTB(bktr, BKTR_O_VSCALE_HI, INB(bktr, BKTR_O_VSCALE_HI) | 0x40);
OUTB(bktr, BKTR_E_VSCALE_HI, INB(bktr, BKTR_E_VSCALE_HI) & ~0x80); /* clear Ycomb */
OUTB(bktr, BKTR_O_VSCALE_HI, INB(bktr, BKTR_O_VSCALE_HI) & ~0x80);
/* disable gamma correction removal */
OUTB(bktr, BKTR_COLOR_CTL, INB(bktr, BKTR_COLOR_CTL) | BT848_COLOR_CTL_GAMMA);
if (cols > 385 ) {
OUTB(bktr, BKTR_E_VTC, 0);
OUTB(bktr, BKTR_O_VTC, 0);
} else {
OUTB(bktr, BKTR_E_VTC, 1);
OUTB(bktr, BKTR_O_VTC, 1);
}
bktr->capcontrol = 3 << 2 | 3;
dma_prog = (uint32_t *) bktr->dma_prog;
/* Construct Write */
if (bktr->video.addr) {
target_buffer = (uint32_t) bktr->video.addr;
pitch = bktr->video.width;
}
else {
target_buffer = (uint32_t) vtophys(bktr->bigbuf);
pitch = cols*Bpp;
}
buffer = target_buffer;
/* construct sync : for video packet format */
*dma_prog++ = OP_SYNC | BKTR_RESYNC | BKTR_FM1;
/* sync, mode indicator packed data */
*dma_prog++ = 0; /* NULL WORD */
width = cols;
for (i = 0; i < (rows/interlace); i++) {
target = target_buffer;
if ( notclipped(bktr, i, width)) {
split(bktr, (volatile uint32_t **) &dma_prog,
bktr->y2 - bktr->y, OP_WRITE,
Bpp, (volatile u_char **)(uintptr_t)&target, cols);
} else {
while(getline(bktr, i)) {
if (bktr->y != bktr->y2 ) {
split(bktr, (volatile uint32_t **) &dma_prog,
bktr->y2 - bktr->y, OP_WRITE,
Bpp, (volatile u_char **)(uintptr_t)&target, cols);
}
if (bktr->yclip != bktr->yclip2 ) {
split(bktr,(volatile uint32_t **) &dma_prog,
bktr->yclip2 - bktr->yclip,
OP_SKIP,
Bpp, (volatile u_char **)(uintptr_t)&target, cols);
}
}
}
target_buffer += interlace * pitch;
}
switch (i_flag) {
case 1:
/* sync vro */
*dma_prog++ = OP_SYNC | BKTR_GEN_IRQ | BKTR_VRO;
*dma_prog++ = 0; /* NULL WORD */
*dma_prog++ = OP_JUMP;
*dma_prog++ = (uint32_t ) vtophys(bktr->dma_prog);
return;
case 2:
/* sync vre */
*dma_prog++ = OP_SYNC | BKTR_GEN_IRQ | BKTR_VRE;
*dma_prog++ = 0; /* NULL WORD */
*dma_prog++ = OP_JUMP;
*dma_prog++ = (uint32_t ) vtophys(bktr->dma_prog);
return;
case 3:
/* sync vro */
*dma_prog++ = OP_SYNC | BKTR_GEN_IRQ | BKTR_RESYNC | BKTR_VRO;
*dma_prog++ = 0; /* NULL WORD */
*dma_prog++ = OP_JUMP;
*dma_prog = (uint32_t ) vtophys(bktr->odd_dma_prog);
break;
}
if (interlace == 2) {
target_buffer = buffer + pitch;
dma_prog = (uint32_t *) bktr->odd_dma_prog;
/* sync vre IRQ bit */
*dma_prog++ = OP_SYNC | BKTR_RESYNC | BKTR_FM1;
*dma_prog++ = 0; /* NULL WORD */
width = cols;
for (i = 0; i < (rows/interlace); i++) {
target = target_buffer;
if ( notclipped(bktr, i, width)) {
split(bktr, (volatile uint32_t **) &dma_prog,
bktr->y2 - bktr->y, OP_WRITE,
Bpp, (volatile u_char **)(uintptr_t)&target, cols);
} else {
while(getline(bktr, i)) {
if (bktr->y != bktr->y2 ) {
split(bktr, (volatile uint32_t **) &dma_prog,
bktr->y2 - bktr->y, OP_WRITE,
Bpp, (volatile u_char **)(uintptr_t)&target,
cols);
}
if (bktr->yclip != bktr->yclip2 ) {
split(bktr, (volatile uint32_t **) &dma_prog,
bktr->yclip2 - bktr->yclip, OP_SKIP,
Bpp, (volatile u_char **)(uintptr_t)&target, cols);
}
}
}
target_buffer += interlace * pitch;
}
}
/* sync vre IRQ bit */
*dma_prog++ = OP_SYNC | BKTR_GEN_IRQ | BKTR_RESYNC | BKTR_VRE;
*dma_prog++ = 0; /* NULL WORD */
*dma_prog++ = OP_JUMP ;
*dma_prog++ = (uint32_t ) vtophys(bktr->dma_prog) ;
*dma_prog++ = 0; /* NULL WORD */
}
/*
 * Generate the RISC instructions to capture packed YUV video; each
 * scan line is emitted as two WRITE instructions (a start-of-line and
 * an end-of-line half).
 */
static void
yuvpack_prog( bktr_ptr_t bktr, char i_flag,
int cols, int rows, int interlace )
{
int i;
volatile unsigned int inst;
volatile unsigned int inst3;
volatile uint32_t target_buffer, buffer;
volatile uint32_t *dma_prog;
struct meteor_pixfmt_internal *pf_int = &pixfmt_table[ bktr->pixfmt ];
int b;
OUTB(bktr, BKTR_COLOR_FMT, pf_int->color_fmt);
OUTB(bktr, BKTR_E_SCLOOP, INB(bktr, BKTR_E_SCLOOP) | BT848_E_SCLOOP_CAGC); /* enable chroma comb */
OUTB(bktr, BKTR_O_SCLOOP, INB(bktr, BKTR_O_SCLOOP) | BT848_O_SCLOOP_CAGC);
OUTB(bktr, BKTR_COLOR_CTL, INB(bktr, BKTR_COLOR_CTL) | BT848_COLOR_CTL_RGB_DED | BT848_COLOR_CTL_GAMMA);
OUTB(bktr, BKTR_ADC, SYNC_LEVEL);
bktr->capcontrol = 1 << 6 | 1 << 4 | 1 << 2 | 3;
bktr->capcontrol = 3 << 2 | 3;
dma_prog = (uint32_t *) bktr->dma_prog;
/* Construct Write */
/* write, start of line */
inst = OP_WRITE | OP_SOL | (cols);
/* write, end of line */
inst3 = OP_WRITE | OP_EOL | (cols);
if (bktr->video.addr)
target_buffer = (uint32_t) bktr->video.addr;
else
target_buffer = (uint32_t) vtophys(bktr->bigbuf);
buffer = target_buffer;
/* construct sync : for video packet format */
/* sync, mode indicator packed data */
*dma_prog++ = OP_SYNC | BKTR_RESYNC | BKTR_FM1;
*dma_prog++ = 0; /* NULL WORD */
b = cols;
for (i = 0; i < (rows/interlace); i++) {
*dma_prog++ = inst;
*dma_prog++ = target_buffer;
*dma_prog++ = inst3;
*dma_prog++ = target_buffer + b;
target_buffer += interlace*(cols * 2);
}
switch (i_flag) {
case 1:
/* sync vre */
*dma_prog++ = OP_SYNC | BKTR_GEN_IRQ | BKTR_VRE;
*dma_prog++ = 0; /* NULL WORD */
*dma_prog++ = OP_JUMP;
*dma_prog++ = (uint32_t) vtophys(bktr->dma_prog);
return;
case 2:
/* sync vro */
*dma_prog++ = OP_SYNC | BKTR_GEN_IRQ | BKTR_VRO;
*dma_prog++ = 0; /* NULL WORD */
*dma_prog++ = OP_JUMP;
*dma_prog++ = (uint32_t) vtophys(bktr->dma_prog);
return;
case 3:
/* sync vro */
*dma_prog++ = OP_SYNC | BKTR_GEN_IRQ | BKTR_RESYNC | BKTR_VRO;
*dma_prog++ = 0; /* NULL WORD */
*dma_prog++ = OP_JUMP ;
*dma_prog = (uint32_t) vtophys(bktr->odd_dma_prog);
break;
}
if (interlace == 2) {
target_buffer = (uint32_t) buffer + cols*2;
dma_prog = (uint32_t *) bktr->odd_dma_prog;
/* sync vre */
*dma_prog++ = OP_SYNC | BKTR_RESYNC | BKTR_FM1;
*dma_prog++ = 0; /* NULL WORD */
for (i = 0; i < (rows/interlace) ; i++) {
*dma_prog++ = inst;
*dma_prog++ = target_buffer;
*dma_prog++ = inst3;
*dma_prog++ = target_buffer + b;
target_buffer += interlace * ( cols*2);
}
}
/* sync vre IRQ bit */
*dma_prog++ = OP_SYNC | BKTR_GEN_IRQ | BKTR_RESYNC | BKTR_VRE;
*dma_prog++ = 0; /* NULL WORD */
*dma_prog++ = OP_JUMP ;
*dma_prog++ = (uint32_t) vtophys(bktr->dma_prog);
*dma_prog++ = OP_JUMP;
*dma_prog++ = (uint32_t) vtophys(bktr->dma_prog);
*dma_prog++ = 0; /* NULL WORD */
}
/*
 * Generate the RISC instructions to capture planar YUV 4:2:2 video;
 * each scan line is emitted as a single WRITE123 instruction that
 * scatters luma and the two chroma planes to separate addresses.
 */
static void
yuv422_prog( bktr_ptr_t bktr, char i_flag,
int cols, int rows, int interlace ){
int i;
volatile unsigned int inst;
volatile uint32_t target_buffer, t1, buffer;
volatile uint32_t *dma_prog;
struct meteor_pixfmt_internal *pf_int = &pixfmt_table[ bktr->pixfmt ];
OUTB(bktr, BKTR_COLOR_FMT, pf_int->color_fmt);
dma_prog = (uint32_t*) bktr->dma_prog;
bktr->capcontrol = 1 << 6 | 1 << 4 | 3;
OUTB(bktr, BKTR_ADC, SYNC_LEVEL);
OUTB(bktr, BKTR_OFORM, 0x00);
OUTB(bktr, BKTR_E_CONTROL, INB(bktr, BKTR_E_CONTROL) | BT848_E_CONTROL_LDEC); /* disable luma decimation */
OUTB(bktr, BKTR_O_CONTROL, INB(bktr, BKTR_O_CONTROL) | BT848_O_CONTROL_LDEC);
OUTB(bktr, BKTR_E_SCLOOP, INB(bktr, BKTR_E_SCLOOP) | BT848_E_SCLOOP_CAGC); /* chroma agc enable */
OUTB(bktr, BKTR_O_SCLOOP, INB(bktr, BKTR_O_SCLOOP) | BT848_O_SCLOOP_CAGC);
OUTB(bktr, BKTR_E_VSCALE_HI, INB(bktr, BKTR_E_VSCALE_HI) & ~0x80); /* clear Ycomb */
OUTB(bktr, BKTR_O_VSCALE_HI, INB(bktr, BKTR_O_VSCALE_HI) & ~0x80);
OUTB(bktr, BKTR_E_VSCALE_HI, INB(bktr, BKTR_E_VSCALE_HI) | 0x40); /* set chroma comb */
OUTB(bktr, BKTR_O_VSCALE_HI, INB(bktr, BKTR_O_VSCALE_HI) | 0x40);
/* disable gamma correction removal */
OUTB(bktr, BKTR_COLOR_CTL, INB(bktr, BKTR_COLOR_CTL) | BT848_COLOR_CTL_GAMMA);
/* Construct Write */
inst = OP_WRITE123 | OP_SOL | OP_EOL | (cols);
if (bktr->video.addr)
target_buffer = (uint32_t) bktr->video.addr;
else
target_buffer = (uint32_t) vtophys(bktr->bigbuf);
buffer = target_buffer;
t1 = buffer;
/* construct sync : for video packet format */
*dma_prog++ = OP_SYNC | 1 << 15 | BKTR_FM3; /*sync, mode indicator packed data*/
*dma_prog++ = 0; /* NULL WORD */
for (i = 0; i < (rows/interlace ) ; i++) {
*dma_prog++ = inst;
*dma_prog++ = cols/2 | cols/2 << 16;
*dma_prog++ = target_buffer;
*dma_prog++ = t1 + (cols*rows) + i*cols/2 * interlace;
*dma_prog++ = t1 + (cols*rows) + (cols*rows/2) + i*cols/2 * interlace;
target_buffer += interlace*cols;
}
switch (i_flag) {
case 1:
*dma_prog++ = OP_SYNC | 1 << 24 | BKTR_VRE; /*sync vre*/
*dma_prog++ = 0; /* NULL WORD */
*dma_prog++ = OP_JUMP ;
*dma_prog++ = (uint32_t) vtophys(bktr->dma_prog);
return;
case 2:
*dma_prog++ = OP_SYNC | 1 << 24 | BKTR_VRO; /*sync vro*/
*dma_prog++ = 0; /* NULL WORD */
*dma_prog++ = OP_JUMP;
*dma_prog++ = (uint32_t) vtophys(bktr->dma_prog);
return;
case 3:
*dma_prog++ = OP_SYNC | 1 << 24 | 1 << 15 | BKTR_VRO;
*dma_prog++ = 0; /* NULL WORD */
*dma_prog++ = OP_JUMP ;
*dma_prog = (uint32_t) vtophys(bktr->odd_dma_prog);
break;
}
if (interlace == 2) {
dma_prog = (uint32_t *) bktr->odd_dma_prog;
target_buffer = (uint32_t) buffer + cols;
t1 = buffer + cols/2;
*dma_prog++ = OP_SYNC | 1 << 15 | BKTR_FM3;
*dma_prog++ = 0; /* NULL WORD */
for (i = 0; i < (rows/interlace ) ; i++) {
*dma_prog++ = inst;
*dma_prog++ = cols/2 | cols/2 << 16;
*dma_prog++ = target_buffer;
*dma_prog++ = t1 + (cols*rows) + i*cols/2 * interlace;
*dma_prog++ = t1 + (cols*rows) + (cols*rows/2) + i*cols/2 * interlace;
target_buffer += interlace*cols;
}
}
*dma_prog++ = OP_SYNC | 1 << 24 | 1 << 15 | BKTR_VRE;
*dma_prog++ = 0; /* NULL WORD */
*dma_prog++ = OP_JUMP ;
*dma_prog++ = (uint32_t) vtophys(bktr->dma_prog) ;
*dma_prog++ = 0; /* NULL WORD */
}
/*
 * Generate the RISC instructions to capture planar YUV 4:2:0 (YUV12)
 * video; lines alternate between WRITE123 (luma plus both chroma
 * planes) and WRITES123 (luma only), giving vertically subsampled
 * chroma.
 */
static void
yuv12_prog( bktr_ptr_t bktr, char i_flag,
int cols, int rows, int interlace ){
int i;
volatile unsigned int inst;
volatile unsigned int inst1;
volatile uint32_t target_buffer, t1, buffer;
volatile uint32_t *dma_prog;
struct meteor_pixfmt_internal *pf_int = &pixfmt_table[ bktr->pixfmt ];
OUTB(bktr, BKTR_COLOR_FMT, pf_int->color_fmt);
dma_prog = (uint32_t *) bktr->dma_prog;
bktr->capcontrol = 1 << 6 | 1 << 4 | 3;
OUTB(bktr, BKTR_ADC, SYNC_LEVEL);
OUTB(bktr, BKTR_OFORM, 0x0);
/* Construct Write */
inst = OP_WRITE123 | OP_SOL | OP_EOL | (cols);
inst1 = OP_WRITES123 | OP_SOL | OP_EOL | (cols);
if (bktr->video.addr)
target_buffer = (uint32_t) bktr->video.addr;
else
target_buffer = (uint32_t) vtophys(bktr->bigbuf);
buffer = target_buffer;
t1 = buffer;
*dma_prog++ = OP_SYNC | 1 << 15 | BKTR_FM3; /*sync, mode indicator packed data*/
*dma_prog++ = 0; /* NULL WORD */
for (i = 0; i < (rows/interlace )/2 ; i++) {
*dma_prog++ = inst;
*dma_prog++ = cols/2 | (cols/2 << 16);
*dma_prog++ = target_buffer;
*dma_prog++ = t1 + (cols*rows) + i*cols/2 * interlace;
*dma_prog++ = t1 + (cols*rows) + (cols*rows/4) + i*cols/2 * interlace;
target_buffer += interlace*cols;
*dma_prog++ = inst1;
*dma_prog++ = cols/2 | (cols/2 << 16);
*dma_prog++ = target_buffer;
target_buffer += interlace*cols;
}
switch (i_flag) {
case 1:
*dma_prog++ = OP_SYNC | 1 << 24 | BKTR_VRE; /*sync vre*/
*dma_prog++ = 0; /* NULL WORD */
*dma_prog++ = OP_JUMP;
*dma_prog++ = (uint32_t) vtophys(bktr->dma_prog);
return;
case 2:
*dma_prog++ = OP_SYNC | 1 << 24 | BKTR_VRO; /*sync vro*/
*dma_prog++ = 0; /* NULL WORD */
*dma_prog++ = OP_JUMP;
*dma_prog++ = (uint32_t) vtophys(bktr->dma_prog);
return;
case 3:
*dma_prog++ = OP_SYNC | 1 << 24 | 1 << 15 | BKTR_VRO;
*dma_prog++ = 0; /* NULL WORD */
*dma_prog++ = OP_JUMP ;
*dma_prog = (uint32_t) vtophys(bktr->odd_dma_prog);
break;
}
if (interlace == 2) {
dma_prog = (uint32_t *) bktr->odd_dma_prog;
target_buffer = (uint32_t) buffer + cols;
t1 = buffer + cols/2;
*dma_prog++ = OP_SYNC | 1 << 15 | BKTR_FM3;
*dma_prog++ = 0; /* NULL WORD */
for (i = 0; i < ((rows/interlace )/2 ) ; i++) {
*dma_prog++ = inst;
*dma_prog++ = cols/2 | (cols/2 << 16);
*dma_prog++ = target_buffer;
*dma_prog++ = t1 + (cols*rows) + i*cols/2 * interlace;
*dma_prog++ = t1 + (cols*rows) + (cols*rows/4) + i*cols/2 * interlace;
target_buffer += interlace*cols;
*dma_prog++ = inst1;
*dma_prog++ = cols/2 | (cols/2 << 16);
*dma_prog++ = target_buffer;
target_buffer += interlace*cols;
}
}
*dma_prog++ = OP_SYNC | 1 << 24 | 1 << 15 | BKTR_VRE;
*dma_prog++ = 0; /* NULL WORD */
*dma_prog++ = OP_JUMP;
*dma_prog++ = (uint32_t) vtophys(bktr->dma_prog);
*dma_prog++ = 0; /* NULL WORD */
}
/*
 * Program the Bt848 scaling, cropping and delay registers from the
 * current format and capture-area settings, then build the RISC DMA
 * program appropriate for the selected pixel format.
 */
static void
build_dma_prog( bktr_ptr_t bktr, char i_flag )
{
int rows, cols, interlace;
int tmp_int;
unsigned int temp;
struct format_params *fp;
struct meteor_pixfmt_internal *pf_int = &pixfmt_table[ bktr->pixfmt ];
fp = &format_params[bktr->format_params];
OUTL(bktr, BKTR_INT_MASK, ALL_INTS_DISABLED);
/* disable FIFO & RISC, leave other bits alone */
OUTW(bktr, BKTR_GPIO_DMA_CTL, INW(bktr, BKTR_GPIO_DMA_CTL) & ~FIFO_RISC_ENABLED);
/* set video parameters */
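/* Horizontal scale: the expression below appears to compute
 * 4096 * (effective source width / bktr->cols - 1), i.e. the
 * downscaling ratio in 1/4096 units for the HSCALE registers,
 * using 64-bit (quad_t) intermediates to avoid overflow.
 */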
if (bktr->capture_area_enabled)
temp = ((quad_t ) fp->htotal* (quad_t) bktr->capture_area_x_size * 4096
/ fp->scaled_htotal / bktr->cols) - 4096;
else
temp = ((quad_t ) fp->htotal* (quad_t) fp->scaled_hactive * 4096
/ fp->scaled_htotal / bktr->cols) - 4096;
/* printf("%s: HSCALE value is %d\n", bktr_name(bktr), temp); */
OUTB(bktr, BKTR_E_HSCALE_LO, temp & 0xff);
OUTB(bktr, BKTR_O_HSCALE_LO, temp & 0xff);
OUTB(bktr, BKTR_E_HSCALE_HI, (temp >> 8) & 0xff);
OUTB(bktr, BKTR_O_HSCALE_HI, (temp >> 8) & 0xff);
/* horizontal active */
temp = bktr->cols;
/* printf("%s: HACTIVE value is %d\n", bktr_name(bktr), temp); */
OUTB(bktr, BKTR_E_HACTIVE_LO, temp & 0xff);
OUTB(bktr, BKTR_O_HACTIVE_LO, temp & 0xff);
OUTB(bktr, BKTR_E_CROP, INB(bktr, BKTR_E_CROP) & ~0x3);
OUTB(bktr, BKTR_O_CROP, INB(bktr, BKTR_O_CROP) & ~0x3);
OUTB(bktr, BKTR_E_CROP, INB(bktr, BKTR_E_CROP) | ((temp >> 8) & 0x3));
OUTB(bktr, BKTR_O_CROP, INB(bktr, BKTR_O_CROP) | ((temp >> 8) & 0x3));
/* horizontal delay */
if (bktr->capture_area_enabled)
temp = ( (fp->hdelay* fp->scaled_hactive + bktr->capture_area_x_offset* fp->scaled_htotal)
* bktr->cols) / (bktr->capture_area_x_size * fp->hactive);
else
temp = (fp->hdelay * bktr->cols) / fp->hactive;
temp = temp & 0x3fe;
/* printf("%s: HDELAY value is %d\n", bktr_name(bktr), temp); */
OUTB(bktr, BKTR_E_DELAY_LO, temp & 0xff);
OUTB(bktr, BKTR_O_DELAY_LO, temp & 0xff);
OUTB(bktr, BKTR_E_CROP, INB(bktr, BKTR_E_CROP) & ~0xc);
OUTB(bktr, BKTR_O_CROP, INB(bktr, BKTR_O_CROP) & ~0xc);
OUTB(bktr, BKTR_E_CROP, INB(bktr, BKTR_E_CROP) | ((temp >> 6) & 0xc));
OUTB(bktr, BKTR_O_CROP, INB(bktr, BKTR_O_CROP) | ((temp >> 6) & 0xc));
/* vertical scale */
if (bktr->capture_area_enabled) {
if (bktr->flags & METEOR_ONLY_ODD_FIELDS ||
bktr->flags & METEOR_ONLY_EVEN_FIELDS)
tmp_int = 65536 -
(((bktr->capture_area_y_size * 256 + (bktr->rows/2)) / bktr->rows) - 512);
else {
tmp_int = 65536 -
(((bktr->capture_area_y_size * 512 + (bktr->rows / 2)) / bktr->rows) - 512);
}
} else {
if (bktr->flags & METEOR_ONLY_ODD_FIELDS ||
bktr->flags & METEOR_ONLY_EVEN_FIELDS)
tmp_int = 65536 -
(((fp->vactive * 256 + (bktr->rows/2)) / bktr->rows) - 512);
else {
tmp_int = 65536 -
(((fp->vactive * 512 + (bktr->rows / 2)) / bktr->rows) - 512);
}
}
tmp_int &= 0x1fff;
/* printf("%s: VSCALE value is %d\n", bktr_name(bktr), tmp_int); */
OUTB(bktr, BKTR_E_VSCALE_LO, tmp_int & 0xff);
OUTB(bktr, BKTR_O_VSCALE_LO, tmp_int & 0xff);
OUTB(bktr, BKTR_E_VSCALE_HI, INB(bktr, BKTR_E_VSCALE_HI) & ~0x1f);
OUTB(bktr, BKTR_O_VSCALE_HI, INB(bktr, BKTR_O_VSCALE_HI) & ~0x1f);
OUTB(bktr, BKTR_E_VSCALE_HI, INB(bktr, BKTR_E_VSCALE_HI) | ((tmp_int >> 8) & 0x1f));
OUTB(bktr, BKTR_O_VSCALE_HI, INB(bktr, BKTR_O_VSCALE_HI) | ((tmp_int >> 8) & 0x1f));
/* vertical active */
if (bktr->capture_area_enabled)
temp = bktr->capture_area_y_size;
else
temp = fp->vactive;
/* printf("%s: VACTIVE is %d\n", bktr_name(bktr), temp); */
OUTB(bktr, BKTR_E_CROP, INB(bktr, BKTR_E_CROP) & ~0x30);
OUTB(bktr, BKTR_E_CROP, INB(bktr, BKTR_E_CROP) | ((temp >> 4) & 0x30));
OUTB(bktr, BKTR_E_VACTIVE_LO, temp & 0xff);
OUTB(bktr, BKTR_O_CROP, INB(bktr, BKTR_O_CROP) & ~0x30);
OUTB(bktr, BKTR_O_CROP, INB(bktr, BKTR_O_CROP) | ((temp >> 4) & 0x30));
OUTB(bktr, BKTR_O_VACTIVE_LO, temp & 0xff);
/* vertical delay */
if (bktr->capture_area_enabled)
temp = fp->vdelay + (bktr->capture_area_y_offset);
else
temp = fp->vdelay;
/* printf("%s: VDELAY is %d\n", bktr_name(bktr), temp); */
OUTB(bktr, BKTR_E_CROP, INB(bktr, BKTR_E_CROP) & ~0xC0);
OUTB(bktr, BKTR_E_CROP, INB(bktr, BKTR_E_CROP) | ((temp >> 2) & 0xC0));
OUTB(bktr, BKTR_E_VDELAY_LO, temp & 0xff);
OUTB(bktr, BKTR_O_CROP, INB(bktr, BKTR_O_CROP) & ~0xC0);
OUTB(bktr, BKTR_O_CROP, INB(bktr, BKTR_O_CROP) | ((temp >> 2) & 0xC0));
OUTB(bktr, BKTR_O_VDELAY_LO, temp & 0xff);
/* end of video params */
if ((bktr->xtal_pll_mode == BT848_USE_PLL)
&& (fp->iform_xtsel==BT848_IFORM_X_XT1)) {
OUTB(bktr, BKTR_TGCTRL, BT848_TGCTRL_TGCKI_PLL); /* Select PLL mode */
} else {
OUTB(bktr, BKTR_TGCTRL, BT848_TGCTRL_TGCKI_XTAL); /* Select Normal xtal 0/xtal 1 mode */
}
/* capture control */
switch (i_flag) {
case 1:
bktr->bktr_cap_ctl =
(BT848_CAP_CTL_DITH_FRAME | BT848_CAP_CTL_EVEN);
OUTB(bktr, BKTR_E_VSCALE_HI, INB(bktr, BKTR_E_VSCALE_HI) & ~0x20);
OUTB(bktr, BKTR_O_VSCALE_HI, INB(bktr, BKTR_O_VSCALE_HI) & ~0x20);
interlace = 1;
break;
case 2:
bktr->bktr_cap_ctl =
(BT848_CAP_CTL_DITH_FRAME | BT848_CAP_CTL_ODD);
OUTB(bktr, BKTR_E_VSCALE_HI, INB(bktr, BKTR_E_VSCALE_HI) & ~0x20);
OUTB(bktr, BKTR_O_VSCALE_HI, INB(bktr, BKTR_O_VSCALE_HI) & ~0x20);
interlace = 1;
break;
default:
bktr->bktr_cap_ctl =
(BT848_CAP_CTL_DITH_FRAME |
BT848_CAP_CTL_EVEN | BT848_CAP_CTL_ODD);
OUTB(bktr, BKTR_E_VSCALE_HI, INB(bktr, BKTR_E_VSCALE_HI) | 0x20);
OUTB(bktr, BKTR_O_VSCALE_HI, INB(bktr, BKTR_O_VSCALE_HI) | 0x20);
interlace = 2;
break;
}
OUTL(bktr, BKTR_RISC_STRT_ADD, vtophys(bktr->dma_prog));
rows = bktr->rows;
cols = bktr->cols;
bktr->vbiflags &= ~VBI_CAPTURE; /* default - no vbi capture */
/* RGB Grabs. If /dev/vbi is already open, or we are a PAL/SECAM */
/* user, then use the rgb_vbi RISC program. */
/* Otherwise, use the normal rgb RISC program */
if (pf_int->public.type == METEOR_PIXTYPE_RGB) {
if ( (bktr->vbiflags & VBI_OPEN)
||(bktr->format_params == BT848_IFORM_F_PALBDGHI)
||(bktr->format_params == BT848_IFORM_F_SECAM)
){
bktr->bktr_cap_ctl |=
BT848_CAP_CTL_VBI_EVEN | BT848_CAP_CTL_VBI_ODD;
bktr->vbiflags |= VBI_CAPTURE;
rgb_vbi_prog(bktr, i_flag, cols, rows, interlace);
return;
} else {
rgb_prog(bktr, i_flag, cols, rows, interlace);
return;
}
}
if ( pf_int->public.type == METEOR_PIXTYPE_YUV ) {
yuv422_prog(bktr, i_flag, cols, rows, interlace);
OUTB(bktr, BKTR_COLOR_CTL, (INB(bktr, BKTR_COLOR_CTL) & 0xf0)
| pixfmt_swap_flags( bktr->pixfmt ));
return;
}
if ( pf_int->public.type == METEOR_PIXTYPE_YUV_PACKED ) {
yuvpack_prog(bktr, i_flag, cols, rows, interlace);
OUTB(bktr, BKTR_COLOR_CTL, (INB(bktr, BKTR_COLOR_CTL) & 0xf0)
| pixfmt_swap_flags( bktr->pixfmt ));
return;
}
if ( pf_int->public.type == METEOR_PIXTYPE_YUV_12 ) {
yuv12_prog(bktr, i_flag, cols, rows, interlace);
OUTB(bktr, BKTR_COLOR_CTL, (INB(bktr, BKTR_COLOR_CTL) & 0xf0)
| pixfmt_swap_flags( bktr->pixfmt ));
return;
}
return;
}
/******************************************************************************
* video & video capture specific routines:
*/
/*
 * Start a single or continuous capture: optionally clear the capture
 * buffer, select the wanted field(s), program the frame rate and
 * (re)build the DMA program if it is not already loaded.
 */
static void
start_capture( bktr_ptr_t bktr, unsigned type )
{
u_char i_flag;
struct format_params *fp;
fp = &format_params[bktr->format_params];
/* If requested, clear out capture buf first */
if (bktr->clr_on_start && (bktr->video.addr == 0)) {
bzero((caddr_t)bktr->bigbuf,
(size_t)bktr->rows * bktr->cols * bktr->frames *
pixfmt_table[ bktr->pixfmt ].public.Bpp);
}
OUTB(bktr, BKTR_DSTATUS, 0);
OUTL(bktr, BKTR_INT_STAT, INL(bktr, BKTR_INT_STAT));
bktr->flags |= type;
bktr->flags &= ~METEOR_WANT_MASK;
switch(bktr->flags & METEOR_ONLY_FIELDS_MASK) {
case METEOR_ONLY_EVEN_FIELDS:
bktr->flags |= METEOR_WANT_EVEN;
i_flag = 1;
break;
case METEOR_ONLY_ODD_FIELDS:
bktr->flags |= METEOR_WANT_ODD;
i_flag = 2;
break;
default:
bktr->flags |= METEOR_WANT_MASK;
i_flag = 3;
break;
}
/* TDEC is only valid for continuous captures */
if ( type == METEOR_SINGLE ) {
u_short fps_save = bktr->fps;
set_fps(bktr, fp->frame_rate);
bktr->fps = fps_save;
}
else
set_fps(bktr, bktr->fps);
if (bktr->dma_prog_loaded == FALSE) {
build_dma_prog(bktr, i_flag);
bktr->dma_prog_loaded = TRUE;
}
OUTL(bktr, BKTR_RISC_STRT_ADD, vtophys(bktr->dma_prog));
}
/*
 * Set the capture frame rate by programming the temporal decimation
 * (TDEC) register to drop fields/frames when the requested rate is
 * below the format's nominal rate.
 */
static void
set_fps( bktr_ptr_t bktr, u_short fps )
{
struct format_params *fp;
int i_flag;
fp = &format_params[bktr->format_params];
switch(bktr->flags & METEOR_ONLY_FIELDS_MASK) {
case METEOR_ONLY_EVEN_FIELDS:
bktr->flags |= METEOR_WANT_EVEN;
i_flag = 1;
break;
case METEOR_ONLY_ODD_FIELDS:
bktr->flags |= METEOR_WANT_ODD;
i_flag = 1;
break;
default:
bktr->flags |= METEOR_WANT_MASK;
i_flag = 2;
break;
}
OUTW(bktr, BKTR_GPIO_DMA_CTL, FIFO_RISC_DISABLED);
OUTL(bktr, BKTR_INT_STAT, ALL_INTS_CLEARED);
bktr->fps = fps;
OUTB(bktr, BKTR_TDEC, 0);
if (fps < fp->frame_rate)
OUTB(bktr, BKTR_TDEC, i_flag*(fp->frame_rate - fps) & 0x3f);
else
OUTB(bktr, BKTR_TDEC, 0);
return;
}
/*
* Given a pixfmt index, compute the bt848 swap_flags necessary to
* achieve the specified swapping.
* Note that without bt swapping, 2Bpp and 3Bpp modes are written
* byte-swapped, and 4Bpp modes are byte and word swapped (see Table 6
* and read R->L).
* Note also that for 3Bpp, we may additionally need to do some creative
* SKIPing to align the FIFO bytelines with the target buffer (see split()).
* This is abstracted here: e.g. no swaps = RGBA; byte & short swap = ABGR
* as one would expect.
*/
static u_int pixfmt_swap_flags( int pixfmt )
{
struct meteor_pixfmt *pf = &pixfmt_table[ pixfmt ].public;
u_int swapf = 0;
switch ( pf->Bpp ) {
case 2 : swapf = ( pf->swap_bytes ? 0 : BSWAP );
break;
case 3 : /* no swaps supported for 3bpp - makes no sense w/ bt848 */
break;
case 4 : if ( pf->swap_bytes )
swapf = pf->swap_shorts ? 0 : WSWAP;
else
swapf = pf->swap_shorts ? BSWAP : (BSWAP | WSWAP);
break;
}
return swapf;
}
/*
* Converts meteor-defined pixel formats (e.g. METEOR_GEO_RGB16) into
* our pixfmt_table indices.
*/
static int oformat_meteor_to_bt( u_long format )
{
int i;
struct meteor_pixfmt *pf1, *pf2;
/* Find format in compatibility table */
for ( i = 0; i < METEOR_PIXFMT_TABLE_SIZE; i++ )
if ( meteor_pixfmt_table[i].meteor_format == format )
break;
if ( i >= METEOR_PIXFMT_TABLE_SIZE )
return -1;
pf1 = &meteor_pixfmt_table[i].public;
/* Match it with an entry in master pixel format table */
for ( i = 0; i < PIXFMT_TABLE_SIZE; i++ ) {
pf2 = &pixfmt_table[i].public;
if (( pf1->type == pf2->type ) &&
( pf1->Bpp == pf2->Bpp ) &&
!bcmp( pf1->masks, pf2->masks, sizeof( pf1->masks )) &&
( pf1->swap_bytes == pf2->swap_bytes ) &&
( pf1->swap_shorts == pf2->swap_shorts ))
break;
}
if ( i >= PIXFMT_TABLE_SIZE )
return -1;
return i;
}
/******************************************************************************
* i2c primitives:
*/
/* */
#define I2CBITTIME (0x5<<4) /* 5 * 0.48uS */
#define I2CBITTIME_878 (1 << 7)
#define I2C_READ 0x01
#define I2C_COMMAND (I2CBITTIME | \
BT848_DATA_CTL_I2CSCL | \
BT848_DATA_CTL_I2CSDA)
#define I2C_COMMAND_878 (I2CBITTIME_878 | \
BT848_DATA_CTL_I2CSCL | \
BT848_DATA_CTL_I2CSDA)
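/* These command templates combine the i2c bit-timing selection with
 * the I2CSCL/I2CSDA bits for the chip's automated i2c controller; the
 * Bt878-class parts use a different timing field than the
 * Bt848/848A/849A (see i2cWrite()/i2cRead() below).
 */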
/* Select between old i2c code and new iicbus / smbus code */
#if defined(BKTR_USE_FREEBSD_SMBUS)
/*
* The hardware interface is actually SMB commands
*/
int
i2cWrite( bktr_ptr_t bktr, int addr, int byte1, int byte2 )
{
char cmd;
if (bktr->id == BROOKTREE_848 ||
bktr->id == BROOKTREE_848A ||
bktr->id == BROOKTREE_849A)
cmd = I2C_COMMAND;
else
cmd = I2C_COMMAND_878;
if (byte2 != -1) {
if (smbus_writew(bktr->i2c_sc.smbus, addr, cmd,
(short)(((byte2 & 0xff) << 8) | (byte1 & 0xff))))
return (-1);
} else {
if (smbus_writeb(bktr->i2c_sc.smbus, addr, cmd,
(char)(byte1 & 0xff)))
return (-1);
}
/* return OK */
return( 0 );
}
int
i2cRead( bktr_ptr_t bktr, int addr )
{
char result;
char cmd;
if (bktr->id == BROOKTREE_848 ||
bktr->id == BROOKTREE_848A ||
bktr->id == BROOKTREE_849A)
cmd = I2C_COMMAND;
else
cmd = I2C_COMMAND_878;
if (smbus_readb(bktr->i2c_sc.smbus, addr, cmd, &result))
return (-1);
return ((int)((unsigned char)result));
}
#define IICBUS(bktr) ((bktr)->i2c_sc.iicbb)
/* The MSP34xx and DPL35xx audio chips require i2c bus writes of up */
/* to 5 bytes, which the bt848's automated i2c bus controller cannot handle. */
/* Therefore we need low-level control of the i2c bus hardware. */
/* Write to the MSP or DPL registers */
void
msp_dpl_write(bktr_ptr_t bktr, int i2c_addr, unsigned char dev, unsigned int addr, unsigned int data)
{
unsigned char addr_l, addr_h, data_h, data_l ;
addr_h = (addr >>8) & 0xff;
addr_l = addr & 0xff;
data_h = (data >>8) & 0xff;
data_l = data & 0xff;
iicbus_start(IICBUS(bktr), i2c_addr, 0 /* no timeout? */);
iicbus_write_byte(IICBUS(bktr), dev, 0);
iicbus_write_byte(IICBUS(bktr), addr_h, 0);
iicbus_write_byte(IICBUS(bktr), addr_l, 0);
iicbus_write_byte(IICBUS(bktr), data_h, 0);
iicbus_write_byte(IICBUS(bktr), data_l, 0);
iicbus_stop(IICBUS(bktr));
return;
}
/* Read from the MSP or DPL registers */
unsigned int
msp_dpl_read(bktr_ptr_t bktr, int i2c_addr, unsigned char dev, unsigned int addr)
{
unsigned int data;
unsigned char addr_l, addr_h, dev_r;
int read;
u_char data_read[2];
addr_h = (addr >>8) & 0xff;
addr_l = addr & 0xff;
dev_r = dev+1;
/* XXX errors ignored */
iicbus_start(IICBUS(bktr), i2c_addr, 0 /* no timeout? */);
iicbus_write_byte(IICBUS(bktr), dev_r, 0);
iicbus_write_byte(IICBUS(bktr), addr_h, 0);
iicbus_write_byte(IICBUS(bktr), addr_l, 0);
iicbus_repeated_start(IICBUS(bktr), i2c_addr +1, 0 /* no timeout? */);
iicbus_read(IICBUS(bktr), data_read, 2, &read, IIC_LAST_READ, 0);
iicbus_stop(IICBUS(bktr));
data = (data_read[0]<<8) | data_read[1];
return (data);
}
/* Reset the MSP or DPL chip */
/* The user can block the reset, which is handy if you initialise the
 * MSP and/or DPL audio in another operating system first (e.g. in Windows).
 */
void
msp_dpl_reset( bktr_ptr_t bktr, int i2c_addr )
{
#ifndef BKTR_NO_MSP_RESET
/* put into reset mode */
iicbus_start(IICBUS(bktr), i2c_addr, 0 /* no timeout? */);
iicbus_write_byte(IICBUS(bktr), 0x00, 0);
iicbus_write_byte(IICBUS(bktr), 0x80, 0);
iicbus_write_byte(IICBUS(bktr), 0x00, 0);
iicbus_stop(IICBUS(bktr));
/* put back to operational mode */
iicbus_start(IICBUS(bktr), i2c_addr, 0 /* no timeout? */);
iicbus_write_byte(IICBUS(bktr), 0x00, 0);
iicbus_write_byte(IICBUS(bktr), 0x00, 0);
iicbus_write_byte(IICBUS(bktr), 0x00, 0);
iicbus_stop(IICBUS(bktr));
#endif
return;
}
static void remote_read(bktr_ptr_t bktr, struct bktr_remote *remote) {
int read;
/* XXX errors ignored */
iicbus_start(IICBUS(bktr), bktr->remote_control_addr, 0 /* no timeout? */);
iicbus_read(IICBUS(bktr), remote->data, 3, &read, IIC_LAST_READ, 0);
iicbus_stop(IICBUS(bktr));
return;
}
#else /* defined(BKTR_USE_FREEBSD_SMBUS) */
/*
* Program the i2c bus directly
*/
int
i2cWrite( bktr_ptr_t bktr, int addr, int byte1, int byte2 )
{
u_long x;
u_long data;
/* clear status bits */
OUTL(bktr, BKTR_INT_STAT, BT848_INT_RACK | BT848_INT_I2CDONE);
/* build the command datum */
if (bktr->id == BROOKTREE_848 ||
bktr->id == BROOKTREE_848A ||
bktr->id == BROOKTREE_849A) {
data = ((addr & 0xff) << 24) | ((byte1 & 0xff) << 16) | I2C_COMMAND;
} else {
data = ((addr & 0xff) << 24) | ((byte1 & 0xff) << 16) | I2C_COMMAND_878;
}
if ( byte2 != -1 ) {
data |= ((byte2 & 0xff) << 8);
data |= BT848_DATA_CTL_I2CW3B;
}
/* write the address and data */
OUTL(bktr, BKTR_I2C_DATA_CTL, data);
/* wait for completion */
for ( x = 0x7fffffff; x; --x ) { /* safety valve */
if ( INL(bktr, BKTR_INT_STAT) & BT848_INT_I2CDONE )
break;
}
/* check for ACK */
if ( !x || !(INL(bktr, BKTR_INT_STAT) & BT848_INT_RACK) )
return( -1 );
/* return OK */
return( 0 );
}
/*
*
*/
int
i2cRead( bktr_ptr_t bktr, int addr )
{
u_long x;
/* clear status bits */
OUTL(bktr, BKTR_INT_STAT, BT848_INT_RACK | BT848_INT_I2CDONE);
/* write the READ address */
/* The Bt878 and Bt879 differed on the treatment of i2c commands */
if (bktr->id == BROOKTREE_848 ||
bktr->id == BROOKTREE_848A ||
bktr->id == BROOKTREE_849A) {
OUTL(bktr, BKTR_I2C_DATA_CTL, ((addr & 0xff) << 24) | I2C_COMMAND);
} else {
OUTL(bktr, BKTR_I2C_DATA_CTL, ((addr & 0xff) << 24) | I2C_COMMAND_878);
}
/* wait for completion */
for ( x = 0x7fffffff; x; --x ) { /* safety valve */
if ( INL(bktr, BKTR_INT_STAT) & BT848_INT_I2CDONE )
break;
}
/* check for ACK */
if ( !x || !(INL(bktr, BKTR_INT_STAT) & BT848_INT_RACK) )
return( -1 );
/* it was a read */
return( (INL(bktr, BKTR_I2C_DATA_CTL) >> 8) & 0xff );
}
/* The MSP34xx audio chip requires i2c bus writes of up to 5 bytes, which the */
/* bt848's automated i2c bus controller cannot handle. */
/* Therefore we need low-level control of the i2c bus hardware. */
/* The ideas for the following functions come from elsewhere in this driver and */
/* from the Linux BTTV i2c driver by Gerd Knorr <kraxel@cs.tu-berlin.de>. */
#define BITD 40
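/* In the bit-banged routines below, the value written to
 * BKTR_I2C_DATA_CTL drives the two bus lines directly: bit 0 is the
 * data line (SDA) and bit 1 the clock (SCL), so the values 0..3 cover
 * the four line states named in the per-write comments ("release
 * data", "strobe clock", ...).
 */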
static void i2c_start( bktr_ptr_t bktr) {
OUTL(bktr, BKTR_I2C_DATA_CTL, 1); DELAY( BITD ); /* release data */
OUTL(bktr, BKTR_I2C_DATA_CTL, 3); DELAY( BITD ); /* release clock */
OUTL(bktr, BKTR_I2C_DATA_CTL, 2); DELAY( BITD ); /* lower data */
OUTL(bktr, BKTR_I2C_DATA_CTL, 0); DELAY( BITD ); /* lower clock */
}
static void i2c_stop( bktr_ptr_t bktr) {
OUTL(bktr, BKTR_I2C_DATA_CTL, 0); DELAY( BITD ); /* lower clock & data */
OUTL(bktr, BKTR_I2C_DATA_CTL, 2); DELAY( BITD ); /* release clock */
OUTL(bktr, BKTR_I2C_DATA_CTL, 3); DELAY( BITD ); /* release data */
}
static int i2c_write_byte( bktr_ptr_t bktr, unsigned char data) {
int x;
int status;
/* write out the byte */
for ( x = 7; x >= 0; --x ) {
if ( data & (1<<x) ) {
OUTL(bktr, BKTR_I2C_DATA_CTL, 1);
DELAY( BITD ); /* assert HI data */
OUTL(bktr, BKTR_I2C_DATA_CTL, 3);
DELAY( BITD ); /* strobe clock */
OUTL(bktr, BKTR_I2C_DATA_CTL, 1);
DELAY( BITD ); /* release clock */
}
else {
OUTL(bktr, BKTR_I2C_DATA_CTL, 0);
DELAY( BITD ); /* assert LO data */
OUTL(bktr, BKTR_I2C_DATA_CTL, 2);
DELAY( BITD ); /* strobe clock */
OUTL(bktr, BKTR_I2C_DATA_CTL, 0);
DELAY( BITD ); /* release clock */
}
}
/* look for an ACK */
OUTL(bktr, BKTR_I2C_DATA_CTL, 1); DELAY( BITD ); /* float data */
OUTL(bktr, BKTR_I2C_DATA_CTL, 3); DELAY( BITD ); /* strobe clock */
status = INL(bktr, BKTR_I2C_DATA_CTL) & 1; /* read the ACK bit */
OUTL(bktr, BKTR_I2C_DATA_CTL, 1); DELAY( BITD ); /* release clock */
return( status );
}
static int i2c_read_byte( bktr_ptr_t bktr, unsigned char *data, int last ) {
int x;
int bit;
int byte = 0;
/* read in the byte */
OUTL(bktr, BKTR_I2C_DATA_CTL, 1);
DELAY( BITD ); /* float data */
for ( x = 7; x >= 0; --x ) {
OUTL(bktr, BKTR_I2C_DATA_CTL, 3);
DELAY( BITD ); /* strobe clock */
bit = INL(bktr, BKTR_I2C_DATA_CTL) & 1; /* read the data bit */
if ( bit ) byte |= (1<<x);
OUTL(bktr, BKTR_I2C_DATA_CTL, 1);
DELAY( BITD ); /* release clock */
}
/* After reading the byte, send an ACK */
/* (unless that was the last byte, for which we send a NAK). */
if (last) { /* send NAK - same as writing a 1 */
OUTL(bktr, BKTR_I2C_DATA_CTL, 1);
DELAY( BITD ); /* set data bit */
OUTL(bktr, BKTR_I2C_DATA_CTL, 3);
DELAY( BITD ); /* strobe clock */
OUTL(bktr, BKTR_I2C_DATA_CTL, 1);
DELAY( BITD ); /* release clock */
} else { /* send ACK - same as writing a 0 */
OUTL(bktr, BKTR_I2C_DATA_CTL, 0);
DELAY( BITD ); /* set data bit */
OUTL(bktr, BKTR_I2C_DATA_CTL, 2);
DELAY( BITD ); /* strobe clock */
OUTL(bktr, BKTR_I2C_DATA_CTL, 0);
DELAY( BITD ); /* release clock */
}
*data=byte;
return 0;
}
#undef BITD
/* Write to the MSP or DPL registers */
void msp_dpl_write( bktr_ptr_t bktr, int i2c_addr, unsigned char dev, unsigned int addr,
unsigned int data){
unsigned int msp_w_addr = i2c_addr;
unsigned char addr_l, addr_h, data_h, data_l ;
addr_h = (addr >>8) & 0xff;
addr_l = addr & 0xff;
data_h = (data >>8) & 0xff;
data_l = data & 0xff;
i2c_start(bktr);
i2c_write_byte(bktr, msp_w_addr);
i2c_write_byte(bktr, dev);
i2c_write_byte(bktr, addr_h);
i2c_write_byte(bktr, addr_l);
i2c_write_byte(bktr, data_h);
i2c_write_byte(bktr, data_l);
i2c_stop(bktr);
}
/* Read from the MSP or DPL registers */
unsigned int msp_dpl_read(bktr_ptr_t bktr, int i2c_addr, unsigned char dev, unsigned int addr){
unsigned int data;
unsigned char addr_l, addr_h, data_1, data_2, dev_r ;
addr_h = (addr >>8) & 0xff;
addr_l = addr & 0xff;
dev_r = dev+1;
i2c_start(bktr);
i2c_write_byte(bktr,i2c_addr);
i2c_write_byte(bktr,dev_r);
i2c_write_byte(bktr,addr_h);
i2c_write_byte(bktr,addr_l);
i2c_start(bktr);
i2c_write_byte(bktr,i2c_addr+1);
i2c_read_byte(bktr,&data_1, 0);
i2c_read_byte(bktr,&data_2, 1);
i2c_stop(bktr);
data = (data_1<<8) | data_2;
return data;
}
/* Reset the MSP or DPL chip */
/* The user can block the reset, which is handy if you initialise the
 * MSP audio in another operating system first (e.g. in Windows).
 */
void msp_dpl_reset( bktr_ptr_t bktr, int i2c_addr ) {
#ifndef BKTR_NO_MSP_RESET
/* put into reset mode */
i2c_start(bktr);
i2c_write_byte(bktr, i2c_addr);
i2c_write_byte(bktr, 0x00);
i2c_write_byte(bktr, 0x80);
i2c_write_byte(bktr, 0x00);
i2c_stop(bktr);
/* put back to operational mode */
i2c_start(bktr);
i2c_write_byte(bktr, i2c_addr);
i2c_write_byte(bktr, 0x00);
i2c_write_byte(bktr, 0x00);
i2c_write_byte(bktr, 0x00);
i2c_stop(bktr);
#endif
return;
}
static void remote_read(bktr_ptr_t bktr, struct bktr_remote *remote) {
/* XXX errors ignored */
i2c_start(bktr);
i2c_write_byte(bktr,bktr->remote_control_addr);
i2c_read_byte(bktr,&(remote->data[0]), 0);
i2c_read_byte(bktr,&(remote->data[1]), 0);
i2c_read_byte(bktr,&(remote->data[2]), 0);
i2c_stop(bktr);
return;
}
#endif /* defined(BKTR_USE_FREEBSD_SMBUS) */
#if defined( I2C_SOFTWARE_PROBE )
/*
* we are keeping this around for any parts that we need to probe
* but that CANNOT be probed via an i2c read.
* this is necessary because the hardware i2c mechanism
* cannot be programmed for 1 byte writes.
* currently there are no known i2c parts that we need to probe
* and that cannot be safely read.
*/
static int i2cProbe( bktr_ptr_t bktr, int addr );
#define BITD 40
#define EXTRA_START
/*
* probe for an I2C device at addr.
*/
static int
i2cProbe( bktr_ptr_t bktr, int addr )
{
int x, status;
/* the START */
#if defined( EXTRA_START )
OUTL(bktr, BKTR_I2C_DATA_CTL, 1); DELAY( BITD ); /* release data */
OUTL(bktr, BKTR_I2C_DATA_CTL, 3); DELAY( BITD ); /* release clock */
#endif /* EXTRA_START */
OUTL(bktr, BKTR_I2C_DATA_CTL, 2); DELAY( BITD ); /* lower data */
OUTL(bktr, BKTR_I2C_DATA_CTL, 0); DELAY( BITD ); /* lower clock */
/* write addr */
for ( x = 7; x >= 0; --x ) {
if ( addr & (1<<x) ) {
OUTL(bktr, BKTR_I2C_DATA_CTL, 1);
DELAY( BITD ); /* assert HI data */
OUTL(bktr, BKTR_I2C_DATA_CTL, 3);
DELAY( BITD ); /* strobe clock */
OUTL(bktr, BKTR_I2C_DATA_CTL, 1);
DELAY( BITD ); /* release clock */
}
else {
OUTL(bktr, BKTR_I2C_DATA_CTL, 0);
DELAY( BITD ); /* assert LO data */
OUTL(bktr, BKTR_I2C_DATA_CTL, 2);
DELAY( BITD ); /* strobe clock */
OUTL(bktr, BKTR_I2C_DATA_CTL, 0);
DELAY( BITD ); /* release clock */
}
}
/* look for an ACK */
OUTL(bktr, BKTR_I2C_DATA_CTL, 1); DELAY( BITD ); /* float data */
OUTL(bktr, BKTR_I2C_DATA_CTL, 3); DELAY( BITD ); /* strobe clock */
status = INL(bktr, BKTR_I2C_DATA_CTL) & 1; /* read the ACK bit */
OUTL(bktr, BKTR_I2C_DATA_CTL, 1); DELAY( BITD ); /* release clock */
/* the STOP */
OUTL(bktr, BKTR_I2C_DATA_CTL, 0); DELAY( BITD ); /* lower clock & data */
OUTL(bktr, BKTR_I2C_DATA_CTL, 2); DELAY( BITD ); /* release clock */
OUTL(bktr, BKTR_I2C_DATA_CTL, 3); DELAY( BITD ); /* release data */
return( status );
}
#undef EXTRA_START
#undef BITD
#endif /* I2C_SOFTWARE_PROBE */
#define ABSENT (-1)
#endif /* FreeBSD, BSDI, NetBSD, OpenBSD */
Index: head/sys/dev/hwpmc/hwpmc_logging.c
===================================================================
--- head/sys/dev/hwpmc/hwpmc_logging.c (revision 225616)
+++ head/sys/dev/hwpmc/hwpmc_logging.c (revision 225617)
@@ -1,1025 +1,1025 @@
/*-
* Copyright (c) 2005-2007 Joseph Koshy
* Copyright (c) 2007 The FreeBSD Foundation
* All rights reserved.
*
* Portions of this software were developed by A. Joseph Koshy under
* sponsorship from the FreeBSD Foundation and Google, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
/*
* Logging code for hwpmc(4)
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/capability.h>
#include <sys/file.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/pmc.h>
#include <sys/pmckern.h>
#include <sys/pmclog.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/uio.h>
#include <sys/unistd.h>
#include <sys/vnode.h>
/*
* Sysctl tunables
*/
SYSCTL_DECL(_kern_hwpmc);
/*
* kern.hwpmc.logbuffersize -- size of the per-cpu owner buffers.
*/
static int pmclog_buffer_size = PMC_LOG_BUFFER_SIZE;
TUNABLE_INT(PMC_SYSCTL_NAME_PREFIX "logbuffersize", &pmclog_buffer_size);
SYSCTL_INT(_kern_hwpmc, OID_AUTO, logbuffersize, CTLFLAG_TUN|CTLFLAG_RD,
&pmclog_buffer_size, 0, "size of log buffers in kilobytes");
/*
* kern.hwpmc.nbuffer -- number of global log buffers
*/
static int pmc_nlogbuffers = PMC_NLOGBUFFERS;
TUNABLE_INT(PMC_SYSCTL_NAME_PREFIX "nbuffers", &pmc_nlogbuffers);
SYSCTL_INT(_kern_hwpmc, OID_AUTO, nbuffers, CTLFLAG_TUN|CTLFLAG_RD,
&pmc_nlogbuffers, 0, "number of global log buffers");
/*
* Global log buffer list and associated spin lock.
*/
TAILQ_HEAD(, pmclog_buffer) pmc_bufferlist =
TAILQ_HEAD_INITIALIZER(pmc_bufferlist);
static struct mtx pmc_bufferlist_mtx; /* spin lock */
static struct mtx pmc_kthread_mtx; /* sleep lock */
#define PMCLOG_INIT_BUFFER_DESCRIPTOR(D) do { \
const int __roundup = roundup(sizeof(*D), \
sizeof(uint32_t)); \
(D)->plb_fence = ((char *) (D)) + \
1024*pmclog_buffer_size; \
(D)->plb_base = (D)->plb_ptr = ((char *) (D)) + \
__roundup; \
} while (0)
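/*
 * A buffer descriptor lives at the front of its own storage: plb_base
 * and plb_ptr start just past the (uint32_t-rounded) descriptor and
 * plb_fence marks the end of the pmclog_buffer_size KB region.
 */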
/*
* Log file record constructors.
*/
#define _PMCLOG_TO_HEADER(T,L) \
((PMCLOG_HEADER_MAGIC << 24) | \
(PMCLOG_TYPE_ ## T << 16) | \
((L) & 0xFFFF))
/* reserve LEN bytes of space and initialize the entry header */
#define _PMCLOG_RESERVE(PO,TYPE,LEN,ACTION) do { \
uint32_t *_le; \
int _len = roundup((LEN), sizeof(uint32_t)); \
if ((_le = pmclog_reserve((PO), _len)) == NULL) { \
ACTION; \
} \
*_le = _PMCLOG_TO_HEADER(TYPE,_len); \
_le += 3 /* skip over timestamp */
#define PMCLOG_RESERVE(P,T,L) _PMCLOG_RESERVE(P,T,L,return)
#define PMCLOG_RESERVE_WITH_ERROR(P,T,L) _PMCLOG_RESERVE(P,T,L, \
error=ENOMEM;goto error)
#define PMCLOG_EMIT32(V) do { *_le++ = (V); } while (0)
#define PMCLOG_EMIT64(V) do { \
*_le++ = (uint32_t) ((V) & 0xFFFFFFFF); \
*_le++ = (uint32_t) (((V) >> 32) & 0xFFFFFFFF); \
} while (0)
/* Emit a string. Caution: does NOT update _le, so needs to be last */
#define PMCLOG_EMITSTRING(S,L) do { bcopy((S), _le, (L)); } while (0)
#define PMCLOG_DESPATCH(PO) \
pmclog_release((PO)); \
} while (0)
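/*
 * Typical usage of the record constructors (see the pmclog_process_*()
 * functions below):
 *
 *	PMCLOG_RESERVE(po, CLOSELOG, sizeof(struct pmclog_closelog));
 *	PMCLOG_EMIT32(value);
 *	PMCLOG_DESPATCH(po);
 *
 * PMCLOG_RESERVE() opens the "do {" block that PMCLOG_DESPATCH()
 * closes with "} while (0)", so the two must always be paired within
 * the same scope.
 */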
/*
* Assertions about the log file format.
*/
CTASSERT(sizeof(struct pmclog_callchain) == 6*4 +
PMC_CALLCHAIN_DEPTH_MAX*sizeof(uintfptr_t));
CTASSERT(sizeof(struct pmclog_closelog) == 3*4);
CTASSERT(sizeof(struct pmclog_dropnotify) == 3*4);
CTASSERT(sizeof(struct pmclog_map_in) == PATH_MAX +
4*4 + sizeof(uintfptr_t));
CTASSERT(offsetof(struct pmclog_map_in,pl_pathname) ==
4*4 + sizeof(uintfptr_t));
CTASSERT(sizeof(struct pmclog_map_out) == 4*4 + 2*sizeof(uintfptr_t));
CTASSERT(sizeof(struct pmclog_pcsample) == 6*4 + sizeof(uintfptr_t));
CTASSERT(sizeof(struct pmclog_pmcallocate) == 6*4);
CTASSERT(sizeof(struct pmclog_pmcattach) == 5*4 + PATH_MAX);
CTASSERT(offsetof(struct pmclog_pmcattach,pl_pathname) == 5*4);
CTASSERT(sizeof(struct pmclog_pmcdetach) == 5*4);
CTASSERT(sizeof(struct pmclog_proccsw) == 5*4 + 8);
CTASSERT(sizeof(struct pmclog_procexec) == 5*4 + PATH_MAX +
sizeof(uintfptr_t));
CTASSERT(offsetof(struct pmclog_procexec,pl_pathname) == 5*4 +
sizeof(uintfptr_t));
CTASSERT(sizeof(struct pmclog_procexit) == 5*4 + 8);
CTASSERT(sizeof(struct pmclog_procfork) == 5*4);
CTASSERT(sizeof(struct pmclog_sysexit) == 4*4);
CTASSERT(sizeof(struct pmclog_userdata) == 4*4);
/*
* Log buffer structure
*/
struct pmclog_buffer {
TAILQ_ENTRY(pmclog_buffer) plb_next;
char *plb_base;
char *plb_ptr;
char *plb_fence;
};
/*
* Prototypes
*/
static int pmclog_get_buffer(struct pmc_owner *po);
static void pmclog_loop(void *arg);
static void pmclog_release(struct pmc_owner *po);
static uint32_t *pmclog_reserve(struct pmc_owner *po, int length);
static void pmclog_schedule_io(struct pmc_owner *po);
static void pmclog_stop_kthread(struct pmc_owner *po);
/*
* Helper functions
*/
/*
* Get a log buffer
*/
static int
pmclog_get_buffer(struct pmc_owner *po)
{
struct pmclog_buffer *plb;
mtx_assert(&po->po_mtx, MA_OWNED);
KASSERT(po->po_curbuf == NULL,
("[pmclog,%d] po=%p current buffer still valid", __LINE__, po));
mtx_lock_spin(&pmc_bufferlist_mtx);
if ((plb = TAILQ_FIRST(&pmc_bufferlist)) != NULL)
TAILQ_REMOVE(&pmc_bufferlist, plb, plb_next);
mtx_unlock_spin(&pmc_bufferlist_mtx);
PMCDBG(LOG,GTB,1, "po=%p plb=%p", po, plb);
#ifdef DEBUG
if (plb)
KASSERT(plb->plb_ptr == plb->plb_base &&
plb->plb_base < plb->plb_fence,
("[pmclog,%d] po=%p buffer invariants: ptr=%p "
"base=%p fence=%p", __LINE__, po, plb->plb_ptr,
plb->plb_base, plb->plb_fence));
#endif
po->po_curbuf = plb;
/* update stats */
atomic_add_int(&pmc_stats.pm_buffer_requests, 1);
if (plb == NULL)
atomic_add_int(&pmc_stats.pm_buffer_requests_failed, 1);
return (plb ? 0 : ENOMEM);
}
/*
* Log handler loop.
*
* This function is executed by each pmc owner's helper thread.
*/
static void
pmclog_loop(void *arg)
{
int error, last_buffer;
struct pmc_owner *po;
struct pmclog_buffer *lb;
struct proc *p;
struct ucred *ownercred;
struct ucred *mycred;
struct thread *td;
struct uio auio;
struct iovec aiov;
size_t nbytes;
po = (struct pmc_owner *) arg;
p = po->po_owner;
td = curthread;
mycred = td->td_ucred;
last_buffer = 0;
PROC_LOCK(p);
ownercred = crhold(p->p_ucred);
PROC_UNLOCK(p);
PMCDBG(LOG,INI,1, "po=%p kt=%p", po, po->po_kthread);
KASSERT(po->po_kthread == curthread->td_proc,
("[pmclog,%d] proc mismatch po=%p po/kt=%p curproc=%p", __LINE__,
po, po->po_kthread, curthread->td_proc));
lb = NULL;
/*
* Loop waiting for I/O requests to be added to the owner
* struct's queue. The loop is exited when the log file
* is deconfigured.
*/
mtx_lock(&pmc_kthread_mtx);
for (;;) {
/* check if we've been asked to exit */
if ((po->po_flags & PMC_PO_OWNS_LOGFILE) == 0)
break;
if (lb == NULL) { /* look for a fresh buffer to write */
mtx_lock_spin(&po->po_mtx);
if ((lb = TAILQ_FIRST(&po->po_logbuffers)) == NULL) {
mtx_unlock_spin(&po->po_mtx);
(void) msleep(po, &pmc_kthread_mtx, PWAIT,
"pmcloop", 0);
continue;
}
TAILQ_REMOVE(&po->po_logbuffers, lb, plb_next);
if (po->po_flags & PMC_PO_SHUTDOWN)
last_buffer = TAILQ_EMPTY(&po->po_logbuffers);
mtx_unlock_spin(&po->po_mtx);
}
mtx_unlock(&pmc_kthread_mtx);
/* process the request */
PMCDBG(LOG,WRI,2, "po=%p base=%p ptr=%p", po,
lb->plb_base, lb->plb_ptr);
/* change our thread's credentials before issuing the I/O */
aiov.iov_base = lb->plb_base;
aiov.iov_len = nbytes = lb->plb_ptr - lb->plb_base;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_offset = -1;
auio.uio_resid = nbytes;
auio.uio_rw = UIO_WRITE;
auio.uio_segflg = UIO_SYSSPACE;
auio.uio_td = td;
/* switch thread credentials -- see kern_ktrace.c */
td->td_ucred = ownercred;
error = fo_write(po->po_file, &auio, ownercred, 0, td);
td->td_ucred = mycred;
if (error) {
/* XXX some errors are recoverable */
/* send a SIGIO to the owner and exit */
PROC_LOCK(p);
- psignal(p, SIGIO);
+ kern_psignal(p, SIGIO);
PROC_UNLOCK(p);
mtx_lock(&pmc_kthread_mtx);
po->po_error = error; /* save for flush log */
PMCDBG(LOG,WRI,2, "po=%p error=%d", po, error);
break;
}
if (last_buffer) {
/*
* Close the file to get PMCLOG_EOF error
* in pmclog(3).
*/
fo_close(po->po_file, curthread);
}
mtx_lock(&pmc_kthread_mtx);
/* put the used buffer back into the global pool */
PMCLOG_INIT_BUFFER_DESCRIPTOR(lb);
mtx_lock_spin(&pmc_bufferlist_mtx);
TAILQ_INSERT_HEAD(&pmc_bufferlist, lb, plb_next);
mtx_unlock_spin(&pmc_bufferlist_mtx);
lb = NULL;
}
po->po_kthread = NULL;
mtx_unlock(&pmc_kthread_mtx);
/* return the current I/O buffer to the global pool */
if (lb) {
PMCLOG_INIT_BUFFER_DESCRIPTOR(lb);
mtx_lock_spin(&pmc_bufferlist_mtx);
TAILQ_INSERT_HEAD(&pmc_bufferlist, lb, plb_next);
mtx_unlock_spin(&pmc_bufferlist_mtx);
}
/*
* Exit this thread, signalling the waiter
*/
crfree(ownercred);
kproc_exit(0);
}
/*
* Release a log entry and schedule an I/O if needed.
*/
static void
pmclog_release(struct pmc_owner *po)
{
KASSERT(po->po_curbuf->plb_ptr >= po->po_curbuf->plb_base,
("[pmclog,%d] buffer invariants po=%p ptr=%p base=%p", __LINE__,
po, po->po_curbuf->plb_ptr, po->po_curbuf->plb_base));
KASSERT(po->po_curbuf->plb_ptr <= po->po_curbuf->plb_fence,
("[pmclog,%d] buffer invariants po=%p ptr=%p fenc=%p", __LINE__,
po, po->po_curbuf->plb_ptr, po->po_curbuf->plb_fence));
/* schedule an I/O if we've filled a buffer */
if (po->po_curbuf->plb_ptr >= po->po_curbuf->plb_fence)
pmclog_schedule_io(po);
mtx_unlock_spin(&po->po_mtx);
PMCDBG(LOG,REL,1, "po=%p", po);
}
/*
* Attempt to reserve 'length' bytes of space in an owner's log
* buffer. The function returns a pointer to 'length' bytes of space
* if there was enough space or returns NULL if no space was
* available. Non-null returns do so with the po mutex locked. The
* caller must invoke pmclog_release() on the pmc owner structure
* when done.
*/
static uint32_t *
pmclog_reserve(struct pmc_owner *po, int length)
{
uintptr_t newptr, oldptr;
uint32_t *lh;
struct timespec ts;
PMCDBG(LOG,ALL,1, "po=%p len=%d", po, length);
KASSERT(length % sizeof(uint32_t) == 0,
("[pmclog,%d] length not a multiple of word size", __LINE__));
mtx_lock_spin(&po->po_mtx);
/* No more data when shutdown in progress. */
if (po->po_flags & PMC_PO_SHUTDOWN) {
mtx_unlock_spin(&po->po_mtx);
return (NULL);
}
if (po->po_curbuf == NULL)
if (pmclog_get_buffer(po) != 0) {
mtx_unlock_spin(&po->po_mtx);
return (NULL);
}
KASSERT(po->po_curbuf != NULL,
("[pmclog,%d] po=%p no current buffer", __LINE__, po));
KASSERT(po->po_curbuf->plb_ptr >= po->po_curbuf->plb_base &&
po->po_curbuf->plb_ptr <= po->po_curbuf->plb_fence,
("[pmclog,%d] po=%p buffer invariants: ptr=%p base=%p fence=%p",
__LINE__, po, po->po_curbuf->plb_ptr, po->po_curbuf->plb_base,
po->po_curbuf->plb_fence));
oldptr = (uintptr_t) po->po_curbuf->plb_ptr;
newptr = oldptr + length;
KASSERT(oldptr != (uintptr_t) NULL,
("[pmclog,%d] po=%p Null log buffer pointer", __LINE__, po));
/*
* If we have space in the current buffer, return a pointer to
* available space with the PO structure locked.
*/
if (newptr <= (uintptr_t) po->po_curbuf->plb_fence) {
po->po_curbuf->plb_ptr = (char *) newptr;
goto done;
}
/*
* Otherwise, schedule the current buffer for output and get a
* fresh buffer.
*/
pmclog_schedule_io(po);
if (pmclog_get_buffer(po) != 0) {
mtx_unlock_spin(&po->po_mtx);
return (NULL);
}
KASSERT(po->po_curbuf != NULL,
("[pmclog,%d] po=%p no current buffer", __LINE__, po));
KASSERT(po->po_curbuf->plb_ptr != NULL,
("[pmclog,%d] null return from pmc_get_log_buffer", __LINE__));
KASSERT(po->po_curbuf->plb_ptr == po->po_curbuf->plb_base &&
po->po_curbuf->plb_ptr <= po->po_curbuf->plb_fence,
("[pmclog,%d] po=%p buffer invariants: ptr=%p base=%p fence=%p",
__LINE__, po, po->po_curbuf->plb_ptr, po->po_curbuf->plb_base,
po->po_curbuf->plb_fence));
oldptr = (uintptr_t) po->po_curbuf->plb_ptr;
done:
lh = (uint32_t *) oldptr;
lh++; /* skip header */
getnanotime(&ts); /* fill in the timestamp */
*lh++ = ts.tv_sec & 0xFFFFFFFF;
*lh++ = ts.tv_nsec & 0xFFFFFFF;
return ((uint32_t *) oldptr);
}
/*
* Schedule an I/O.
*
* Transfer the current buffer to the helper kthread.
*/
static void
pmclog_schedule_io(struct pmc_owner *po)
{
KASSERT(po->po_curbuf != NULL,
("[pmclog,%d] schedule_io with null buffer po=%p", __LINE__, po));
KASSERT(po->po_curbuf->plb_ptr >= po->po_curbuf->plb_base,
("[pmclog,%d] buffer invariants po=%p ptr=%p base=%p", __LINE__,
po, po->po_curbuf->plb_ptr, po->po_curbuf->plb_base));
KASSERT(po->po_curbuf->plb_ptr <= po->po_curbuf->plb_fence,
("[pmclog,%d] buffer invariants po=%p ptr=%p fenc=%p", __LINE__,
po, po->po_curbuf->plb_ptr, po->po_curbuf->plb_fence));
PMCDBG(LOG,SIO, 1, "po=%p", po);
mtx_assert(&po->po_mtx, MA_OWNED);
/*
* Add the current buffer to the tail of the buffer list and
* wakeup the helper.
*/
TAILQ_INSERT_TAIL(&po->po_logbuffers, po->po_curbuf, plb_next);
po->po_curbuf = NULL;
wakeup_one(po);
}
/*
* Stop the helper kthread.
*/
static void
pmclog_stop_kthread(struct pmc_owner *po)
{
/*
* Close the file to force the thread out of fo_write,
* unset flag, wakeup the helper thread,
* wait for it to exit
*/
if (po->po_file != NULL)
fo_close(po->po_file, curthread);
mtx_lock(&pmc_kthread_mtx);
po->po_flags &= ~PMC_PO_OWNS_LOGFILE;
wakeup_one(po);
if (po->po_kthread)
msleep(po->po_kthread, &pmc_kthread_mtx, PPAUSE, "pmckstp", 0);
mtx_unlock(&pmc_kthread_mtx);
}
/*
* Public functions
*/
/*
* Configure a log file for pmc owner 'po'.
*
* Parameter 'logfd' is a file handle referencing an open file in the
* owner process. This file needs to have been opened for writing.
*/
int
pmclog_configure_log(struct pmc_mdep *md, struct pmc_owner *po, int logfd)
{
int error;
struct proc *p;
/*
* As long as it is possible to get a LOR between pmc_sx lock and
* proctree/allproc sx locks used for adding a new process, assure
* the former is not held here.
*/
sx_assert(&pmc_sx, SA_UNLOCKED);
PMCDBG(LOG,CFG,1, "config po=%p logfd=%d", po, logfd);
p = po->po_owner;
/* return EBUSY if a log file was already present */
if (po->po_flags & PMC_PO_OWNS_LOGFILE)
return (EBUSY);
KASSERT(po->po_kthread == NULL,
("[pmclog,%d] po=%p kthread (%p) already present", __LINE__, po,
po->po_kthread));
KASSERT(po->po_file == NULL,
("[pmclog,%d] po=%p file (%p) already present", __LINE__, po,
po->po_file));
/* get a reference to the file state */
error = fget_write(curthread, logfd, CAP_WRITE, &po->po_file);
if (error)
goto error;
/* mark process as owning a log file */
po->po_flags |= PMC_PO_OWNS_LOGFILE;
error = kproc_create(pmclog_loop, po, &po->po_kthread,
RFHIGHPID, 0, "hwpmc: proc(%d)", p->p_pid);
if (error)
goto error;
/* mark process as using HWPMCs */
PROC_LOCK(p);
p->p_flag |= P_HWPMC;
PROC_UNLOCK(p);
/* create a log initialization entry */
PMCLOG_RESERVE_WITH_ERROR(po, INITIALIZE,
sizeof(struct pmclog_initialize));
PMCLOG_EMIT32(PMC_VERSION);
PMCLOG_EMIT32(md->pmd_cputype);
PMCLOG_DESPATCH(po);
return (0);
error:
/* shutdown the thread */
if (po->po_kthread)
pmclog_stop_kthread(po);
KASSERT(po->po_kthread == NULL, ("[pmclog,%d] po=%p kthread not "
"stopped", __LINE__, po));
if (po->po_file)
(void) fdrop(po->po_file, curthread);
po->po_file = NULL; /* clear file and error state */
po->po_error = 0;
return (error);
}
/*
* De-configure a log file. This will throw away any buffers queued
* for this owner process.
*/
int
pmclog_deconfigure_log(struct pmc_owner *po)
{
int error;
struct pmclog_buffer *lb;
PMCDBG(LOG,CFG,1, "de-config po=%p", po);
if ((po->po_flags & PMC_PO_OWNS_LOGFILE) == 0)
return (EINVAL);
KASSERT(po->po_sscount == 0,
("[pmclog,%d] po=%p still owning SS PMCs", __LINE__, po));
KASSERT(po->po_file != NULL,
("[pmclog,%d] po=%p no log file", __LINE__, po));
/* stop the kthread, this will reset the 'OWNS_LOGFILE' flag */
if (po->po_kthread)
pmclog_stop_kthread(po);
KASSERT(po->po_kthread == NULL,
("[pmclog,%d] po=%p kthread not stopped", __LINE__, po));
/* return all queued log buffers to the global pool */
while ((lb = TAILQ_FIRST(&po->po_logbuffers)) != NULL) {
TAILQ_REMOVE(&po->po_logbuffers, lb, plb_next);
PMCLOG_INIT_BUFFER_DESCRIPTOR(lb);
mtx_lock_spin(&pmc_bufferlist_mtx);
TAILQ_INSERT_HEAD(&pmc_bufferlist, lb, plb_next);
mtx_unlock_spin(&pmc_bufferlist_mtx);
}
/* return the 'current' buffer to the global pool */
if ((lb = po->po_curbuf) != NULL) {
PMCLOG_INIT_BUFFER_DESCRIPTOR(lb);
mtx_lock_spin(&pmc_bufferlist_mtx);
TAILQ_INSERT_HEAD(&pmc_bufferlist, lb, plb_next);
mtx_unlock_spin(&pmc_bufferlist_mtx);
}
/* drop a reference to the fd */
error = fdrop(po->po_file, curthread);
po->po_file = NULL;
po->po_error = 0;
return (error);
}
/*
* Flush a process' log buffer.
*/
int
pmclog_flush(struct pmc_owner *po)
{
int error;
PMCDBG(LOG,FLS,1, "po=%p", po);
/*
* If there is a pending error recorded by the logger thread,
* return that.
*/
if (po->po_error)
return (po->po_error);
error = 0;
/*
* Check that we do have an active log file.
*/
mtx_lock(&pmc_kthread_mtx);
if ((po->po_flags & PMC_PO_OWNS_LOGFILE) == 0) {
error = EINVAL;
goto error;
}
/*
* Schedule the current buffer if any.
*/
mtx_lock_spin(&po->po_mtx);
if (po->po_curbuf)
pmclog_schedule_io(po);
mtx_unlock_spin(&po->po_mtx);
/*
* Initiate shutdown: no new data queued,
* thread will close file on last block.
*/
po->po_flags |= PMC_PO_SHUTDOWN;
error:
mtx_unlock(&pmc_kthread_mtx);
return (error);
}
void
pmclog_process_callchain(struct pmc *pm, struct pmc_sample *ps)
{
int n, recordlen;
uint32_t flags;
struct pmc_owner *po;
PMCDBG(LOG,SAM,1,"pm=%p pid=%d n=%d", pm, ps->ps_pid,
ps->ps_nsamples);
recordlen = offsetof(struct pmclog_callchain, pl_pc) +
ps->ps_nsamples * sizeof(uintfptr_t);
po = pm->pm_owner;
flags = PMC_CALLCHAIN_TO_CPUFLAGS(ps->ps_cpu,ps->ps_flags);
PMCLOG_RESERVE(po, CALLCHAIN, recordlen);
PMCLOG_EMIT32(ps->ps_pid);
PMCLOG_EMIT32(pm->pm_id);
PMCLOG_EMIT32(flags);
for (n = 0; n < ps->ps_nsamples; n++)
PMCLOG_EMITADDR(ps->ps_pc[n]);
PMCLOG_DESPATCH(po);
}
void
pmclog_process_closelog(struct pmc_owner *po)
{
PMCLOG_RESERVE(po,CLOSELOG,sizeof(struct pmclog_closelog));
PMCLOG_DESPATCH(po);
}
void
pmclog_process_dropnotify(struct pmc_owner *po)
{
PMCLOG_RESERVE(po,DROPNOTIFY,sizeof(struct pmclog_dropnotify));
PMCLOG_DESPATCH(po);
}
void
pmclog_process_map_in(struct pmc_owner *po, pid_t pid, uintfptr_t start,
const char *path)
{
int pathlen, recordlen;
KASSERT(path != NULL, ("[pmclog,%d] map-in, null path", __LINE__));
pathlen = strlen(path) + 1; /* #bytes for path name */
recordlen = offsetof(struct pmclog_map_in, pl_pathname) +
pathlen;
PMCLOG_RESERVE(po, MAP_IN, recordlen);
PMCLOG_EMIT32(pid);
PMCLOG_EMITADDR(start);
PMCLOG_EMITSTRING(path,pathlen);
PMCLOG_DESPATCH(po);
}
void
pmclog_process_map_out(struct pmc_owner *po, pid_t pid, uintfptr_t start,
uintfptr_t end)
{
KASSERT(start <= end, ("[pmclog,%d] start > end", __LINE__));
PMCLOG_RESERVE(po, MAP_OUT, sizeof(struct pmclog_map_out));
PMCLOG_EMIT32(pid);
PMCLOG_EMITADDR(start);
PMCLOG_EMITADDR(end);
PMCLOG_DESPATCH(po);
}
void
pmclog_process_pmcallocate(struct pmc *pm)
{
struct pmc_owner *po;
po = pm->pm_owner;
PMCDBG(LOG,ALL,1, "pm=%p", pm);
PMCLOG_RESERVE(po, PMCALLOCATE, sizeof(struct pmclog_pmcallocate));
PMCLOG_EMIT32(pm->pm_id);
PMCLOG_EMIT32(pm->pm_event);
PMCLOG_EMIT32(pm->pm_flags);
PMCLOG_DESPATCH(po);
}
void
pmclog_process_pmcattach(struct pmc *pm, pid_t pid, char *path)
{
int pathlen, recordlen;
struct pmc_owner *po;
PMCDBG(LOG,ATT,1,"pm=%p pid=%d", pm, pid);
po = pm->pm_owner;
pathlen = strlen(path) + 1; /* #bytes for the string */
recordlen = offsetof(struct pmclog_pmcattach, pl_pathname) + pathlen;
PMCLOG_RESERVE(po, PMCATTACH, recordlen);
PMCLOG_EMIT32(pm->pm_id);
PMCLOG_EMIT32(pid);
PMCLOG_EMITSTRING(path, pathlen);
PMCLOG_DESPATCH(po);
}
void
pmclog_process_pmcdetach(struct pmc *pm, pid_t pid)
{
struct pmc_owner *po;
PMCDBG(LOG,ATT,1,"!pm=%p pid=%d", pm, pid);
po = pm->pm_owner;
PMCLOG_RESERVE(po, PMCDETACH, sizeof(struct pmclog_pmcdetach));
PMCLOG_EMIT32(pm->pm_id);
PMCLOG_EMIT32(pid);
PMCLOG_DESPATCH(po);
}
/*
* Log a context switch event to the log file.
*/
void
pmclog_process_proccsw(struct pmc *pm, struct pmc_process *pp, pmc_value_t v)
{
struct pmc_owner *po;
KASSERT(pm->pm_flags & PMC_F_LOG_PROCCSW,
("[pmclog,%d] log-process-csw called gratuitously", __LINE__));
PMCDBG(LOG,SWO,1,"pm=%p pid=%d v=%jx", pm, pp->pp_proc->p_pid,
v);
po = pm->pm_owner;
PMCLOG_RESERVE(po, PROCCSW, sizeof(struct pmclog_proccsw));
PMCLOG_EMIT32(pm->pm_id);
PMCLOG_EMIT64(v);
PMCLOG_EMIT32(pp->pp_proc->p_pid);
PMCLOG_DESPATCH(po);
}
void
pmclog_process_procexec(struct pmc_owner *po, pmc_id_t pmid, pid_t pid,
uintfptr_t startaddr, char *path)
{
int pathlen, recordlen;
PMCDBG(LOG,EXC,1,"po=%p pid=%d path=\"%s\"", po, pid, path);
pathlen = strlen(path) + 1; /* #bytes for the path */
recordlen = offsetof(struct pmclog_procexec, pl_pathname) + pathlen;
PMCLOG_RESERVE(po, PROCEXEC, recordlen);
PMCLOG_EMIT32(pid);
PMCLOG_EMITADDR(startaddr);
PMCLOG_EMIT32(pmid);
PMCLOG_EMITSTRING(path,pathlen);
PMCLOG_DESPATCH(po);
}
/*
* Log a process exit event (and accumulated pmc value) to the log file.
*/
void
pmclog_process_procexit(struct pmc *pm, struct pmc_process *pp)
{
int ri;
struct pmc_owner *po;
ri = PMC_TO_ROWINDEX(pm);
PMCDBG(LOG,EXT,1,"pm=%p pid=%d v=%jx", pm, pp->pp_proc->p_pid,
pp->pp_pmcs[ri].pp_pmcval);
po = pm->pm_owner;
PMCLOG_RESERVE(po, PROCEXIT, sizeof(struct pmclog_procexit));
PMCLOG_EMIT32(pm->pm_id);
PMCLOG_EMIT64(pp->pp_pmcs[ri].pp_pmcval);
PMCLOG_EMIT32(pp->pp_proc->p_pid);
PMCLOG_DESPATCH(po);
}
/*
* Log a fork event.
*/
void
pmclog_process_procfork(struct pmc_owner *po, pid_t oldpid, pid_t newpid)
{
PMCLOG_RESERVE(po, PROCFORK, sizeof(struct pmclog_procfork));
PMCLOG_EMIT32(oldpid);
PMCLOG_EMIT32(newpid);
PMCLOG_DESPATCH(po);
}
/*
* Log a process exit event of the form suitable for system-wide PMCs.
*/
void
pmclog_process_sysexit(struct pmc_owner *po, pid_t pid)
{
PMCLOG_RESERVE(po, SYSEXIT, sizeof(struct pmclog_sysexit));
PMCLOG_EMIT32(pid);
PMCLOG_DESPATCH(po);
}
/*
* Write a user log entry.
*/
int
pmclog_process_userlog(struct pmc_owner *po, struct pmc_op_writelog *wl)
{
int error;
PMCDBG(LOG,WRI,1, "writelog po=%p ud=0x%x", po, wl->pm_userdata);
error = 0;
PMCLOG_RESERVE_WITH_ERROR(po, USERDATA,
sizeof(struct pmclog_userdata));
PMCLOG_EMIT32(wl->pm_userdata);
PMCLOG_DESPATCH(po);
error:
return (error);
}
/*
* Initialization.
*
* Create a pool of log buffers and initialize mutexes.
*/
void
pmclog_initialize()
{
int n;
struct pmclog_buffer *plb;
if (pmclog_buffer_size <= 0) {
(void) printf("hwpmc: tunable logbuffersize=%d must be "
"greater than zero.\n", pmclog_buffer_size);
pmclog_buffer_size = PMC_LOG_BUFFER_SIZE;
}
if (pmc_nlogbuffers <= 0) {
(void) printf("hwpmc: tunable nlogbuffers=%d must be greater "
"than zero.\n", pmc_nlogbuffers);
pmc_nlogbuffers = PMC_NLOGBUFFERS;
}
/* create global pool of log buffers */
for (n = 0; n < pmc_nlogbuffers; n++) {
plb = malloc(1024 * pmclog_buffer_size, M_PMC,
M_WAITOK|M_ZERO);
PMCLOG_INIT_BUFFER_DESCRIPTOR(plb);
TAILQ_INSERT_HEAD(&pmc_bufferlist, plb, plb_next);
}
mtx_init(&pmc_bufferlist_mtx, "pmc-buffer-list", "pmc-leaf",
MTX_SPIN);
mtx_init(&pmc_kthread_mtx, "pmc-kthread", "pmc-sleep", MTX_DEF);
}
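The pool created here wires down roughly pmc_nlogbuffers * pmclog_buffer_size kilobytes of kernel memory, since each buffer is allocated as 1024 * pmclog_buffer_size bytes. As an illustrative example (the numbers are not the defaults, which come from PMC_NLOGBUFFERS and PMC_LOG_BUFFER_SIZE defined elsewhere), setting the tunables to 64 buffers of 16 KB each would reserve 1 MB for log records.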
/*
* Shutdown logging.
*
* Destroy mutexes and release memory back to the free pool.
*/
void
pmclog_shutdown()
{
struct pmclog_buffer *plb;
mtx_destroy(&pmc_kthread_mtx);
mtx_destroy(&pmc_bufferlist_mtx);
while ((plb = TAILQ_FIRST(&pmc_bufferlist)) != NULL) {
TAILQ_REMOVE(&pmc_bufferlist, plb, plb_next);
free(plb, M_PMC);
}
}
Index: head/sys/dev/hwpmc/hwpmc_mod.c
===================================================================
--- head/sys/dev/hwpmc/hwpmc_mod.c (revision 225616)
+++ head/sys/dev/hwpmc/hwpmc_mod.c (revision 225617)
@@ -1,4949 +1,4949 @@
/*-
* Copyright (c) 2003-2008 Joseph Koshy
* Copyright (c) 2007 The FreeBSD Foundation
* All rights reserved.
*
* Portions of this software were developed by A. Joseph Koshy under
* sponsorship from the FreeBSD Foundation and Google, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/eventhandler.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/pmc.h>
#include <sys/pmckern.h>
#include <sys/pmclog.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/linker.h> /* needs to be after <sys/malloc.h> */
#include <machine/atomic.h>
#include <machine/md_var.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
/*
* Types
*/
enum pmc_flags {
PMC_FLAG_NONE = 0x00, /* do nothing */
PMC_FLAG_REMOVE = 0x01, /* atomically remove entry from hash */
PMC_FLAG_ALLOCATE = 0x02, /* add entry to hash if not found */
};
/*
* The offset in sysent where the syscall is allocated.
*/
static int pmc_syscall_num = NO_SYSCALL;
struct pmc_cpu **pmc_pcpu; /* per-cpu state */
pmc_value_t *pmc_pcpu_saved; /* saved PMC values: CSW handling */
#define PMC_PCPU_SAVED(C,R) pmc_pcpu_saved[(R) + md->pmd_npmc*(C)]
struct mtx_pool *pmc_mtxpool;
static int *pmc_pmcdisp; /* PMC row dispositions */
#define PMC_ROW_DISP_IS_FREE(R) (pmc_pmcdisp[(R)] == 0)
#define PMC_ROW_DISP_IS_THREAD(R) (pmc_pmcdisp[(R)] > 0)
#define PMC_ROW_DISP_IS_STANDALONE(R) (pmc_pmcdisp[(R)] < 0)
#define PMC_MARK_ROW_FREE(R) do { \
pmc_pmcdisp[(R)] = 0; \
} while (0)
#define PMC_MARK_ROW_STANDALONE(R) do { \
KASSERT(pmc_pmcdisp[(R)] <= 0, ("[pmc,%d] row disposition error", \
__LINE__)); \
atomic_add_int(&pmc_pmcdisp[(R)], -1); \
KASSERT(pmc_pmcdisp[(R)] >= (-pmc_cpu_max_active()), \
("[pmc,%d] row disposition error", __LINE__)); \
} while (0)
#define PMC_UNMARK_ROW_STANDALONE(R) do { \
atomic_add_int(&pmc_pmcdisp[(R)], 1); \
KASSERT(pmc_pmcdisp[(R)] <= 0, ("[pmc,%d] row disposition error", \
__LINE__)); \
} while (0)
#define PMC_MARK_ROW_THREAD(R) do { \
KASSERT(pmc_pmcdisp[(R)] >= 0, ("[pmc,%d] row disposition error", \
__LINE__)); \
atomic_add_int(&pmc_pmcdisp[(R)], 1); \
} while (0)
#define PMC_UNMARK_ROW_THREAD(R) do { \
atomic_add_int(&pmc_pmcdisp[(R)], -1); \
KASSERT(pmc_pmcdisp[(R)] >= 0, ("[pmc,%d] row disposition error", \
__LINE__)); \
} while (0)
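The disposition array encodes three states in the sign of each counter: a value of zero means row R is free, a positive value counts the process-virtual (thread-mode) PMCs currently using the row, and a negative value counts system-wide (standalone) claims, bounded below by -pmc_cpu_max_active(). The KASSERTs in the macros above keep the two modes from ever mixing on one row; for example, a row with pmc_pmcdisp[R] == 3 has three thread-mode users and cannot be marked standalone until all three are released.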
/* various event handlers */
static eventhandler_tag pmc_exit_tag, pmc_fork_tag;
/* Module statistics */
struct pmc_op_getdriverstats pmc_stats;
/* Machine/processor dependent operations */
static struct pmc_mdep *md;
/*
* Hash tables mapping owner processes and target threads to PMCs.
*/
struct mtx pmc_processhash_mtx; /* spin mutex */
static u_long pmc_processhashmask;
static LIST_HEAD(pmc_processhash, pmc_process) *pmc_processhash;
/*
* Hash table of PMC owner descriptors. This table is protected by
* the shared PMC "sx" lock.
*/
static u_long pmc_ownerhashmask;
static LIST_HEAD(pmc_ownerhash, pmc_owner) *pmc_ownerhash;
/*
* List of PMC owners with system-wide sampling PMCs.
*/
static LIST_HEAD(, pmc_owner) pmc_ss_owners;
/*
* A map of row indices to classdep structures.
*/
static struct pmc_classdep **pmc_rowindex_to_classdep;
/*
* Prototypes
*/
#ifdef DEBUG
static int pmc_debugflags_sysctl_handler(SYSCTL_HANDLER_ARGS);
static int pmc_debugflags_parse(char *newstr, char *fence);
#endif
static int load(struct module *module, int cmd, void *arg);
static int pmc_attach_process(struct proc *p, struct pmc *pm);
static struct pmc *pmc_allocate_pmc_descriptor(void);
static struct pmc_owner *pmc_allocate_owner_descriptor(struct proc *p);
static int pmc_attach_one_process(struct proc *p, struct pmc *pm);
static int pmc_can_allocate_rowindex(struct proc *p, unsigned int ri,
int cpu);
static int pmc_can_attach(struct pmc *pm, struct proc *p);
static void pmc_capture_user_callchain(int cpu, struct trapframe *tf);
static void pmc_cleanup(void);
static int pmc_detach_process(struct proc *p, struct pmc *pm);
static int pmc_detach_one_process(struct proc *p, struct pmc *pm,
int flags);
static void pmc_destroy_owner_descriptor(struct pmc_owner *po);
static struct pmc_owner *pmc_find_owner_descriptor(struct proc *p);
static int pmc_find_pmc(pmc_id_t pmcid, struct pmc **pm);
static struct pmc *pmc_find_pmc_descriptor_in_process(struct pmc_owner *po,
pmc_id_t pmc);
static struct pmc_process *pmc_find_process_descriptor(struct proc *p,
uint32_t mode);
static void pmc_force_context_switch(void);
static void pmc_link_target_process(struct pmc *pm,
struct pmc_process *pp);
static void pmc_log_all_process_mappings(struct pmc_owner *po);
static void pmc_log_kernel_mappings(struct pmc *pm);
static void pmc_log_process_mappings(struct pmc_owner *po, struct proc *p);
static void pmc_maybe_remove_owner(struct pmc_owner *po);
static void pmc_process_csw_in(struct thread *td);
static void pmc_process_csw_out(struct thread *td);
static void pmc_process_exit(void *arg, struct proc *p);
static void pmc_process_fork(void *arg, struct proc *p1,
struct proc *p2, int n);
static void pmc_process_samples(int cpu);
static void pmc_release_pmc_descriptor(struct pmc *pmc);
static void pmc_remove_owner(struct pmc_owner *po);
static void pmc_remove_process_descriptor(struct pmc_process *pp);
static void pmc_restore_cpu_binding(struct pmc_binding *pb);
static void pmc_save_cpu_binding(struct pmc_binding *pb);
static void pmc_select_cpu(int cpu);
static int pmc_start(struct pmc *pm);
static int pmc_stop(struct pmc *pm);
static int pmc_syscall_handler(struct thread *td, void *syscall_args);
static void pmc_unlink_target_process(struct pmc *pmc,
struct pmc_process *pp);
/*
* Kernel tunables and sysctl(8) interface.
*/
SYSCTL_NODE(_kern, OID_AUTO, hwpmc, CTLFLAG_RW, 0, "HWPMC parameters");
static int pmc_callchaindepth = PMC_CALLCHAIN_DEPTH;
TUNABLE_INT(PMC_SYSCTL_NAME_PREFIX "callchaindepth", &pmc_callchaindepth);
SYSCTL_INT(_kern_hwpmc, OID_AUTO, callchaindepth, CTLFLAG_TUN|CTLFLAG_RD,
&pmc_callchaindepth, 0, "depth of call chain records");
#ifdef DEBUG
struct pmc_debugflags pmc_debugflags = PMC_DEBUG_DEFAULT_FLAGS;
char pmc_debugstr[PMC_DEBUG_STRSIZE];
TUNABLE_STR(PMC_SYSCTL_NAME_PREFIX "debugflags", pmc_debugstr,
sizeof(pmc_debugstr));
SYSCTL_PROC(_kern_hwpmc, OID_AUTO, debugflags,
CTLTYPE_STRING|CTLFLAG_RW|CTLFLAG_TUN,
0, 0, pmc_debugflags_sysctl_handler, "A", "debug flags");
#endif
/*
* kern.hwpmc.hashsize -- determines the number of rows in the hash
* tables used to look up target processes and owner processes
*/
static int pmc_hashsize = PMC_HASH_SIZE;
TUNABLE_INT(PMC_SYSCTL_NAME_PREFIX "hashsize", &pmc_hashsize);
SYSCTL_INT(_kern_hwpmc, OID_AUTO, hashsize, CTLFLAG_TUN|CTLFLAG_RD,
&pmc_hashsize, 0, "rows in hash tables");
/*
* kern.hwpmc.nsamples --- number of PC samples/callchain stacks per CPU
*/
static int pmc_nsamples = PMC_NSAMPLES;
TUNABLE_INT(PMC_SYSCTL_NAME_PREFIX "nsamples", &pmc_nsamples);
SYSCTL_INT(_kern_hwpmc, OID_AUTO, nsamples, CTLFLAG_TUN|CTLFLAG_RD,
&pmc_nsamples, 0, "number of PC samples per CPU");
/*
* kern.hwpmc.mtxpoolsize -- number of mutexes in the mutex pool.
*/
static int pmc_mtxpool_size = PMC_MTXPOOL_SIZE;
TUNABLE_INT(PMC_SYSCTL_NAME_PREFIX "mtxpoolsize", &pmc_mtxpool_size);
SYSCTL_INT(_kern_hwpmc, OID_AUTO, mtxpoolsize, CTLFLAG_TUN|CTLFLAG_RD,
&pmc_mtxpool_size, 0, "size of spin mutex pool");
/*
* security.bsd.unprivileged_syspmcs -- allow non-root processes to
* allocate system-wide PMCs.
*
* Allowing unprivileged processes to allocate system PMCs is convenient
* if system-wide measurements need to be taken concurrently with other
* per-process measurements. This feature is turned off by default.
*/
static int pmc_unprivileged_syspmcs = 0;
TUNABLE_INT("security.bsd.unprivileged_syspmcs", &pmc_unprivileged_syspmcs);
SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_syspmcs, CTLFLAG_RW,
&pmc_unprivileged_syspmcs, 0,
"allow unprivileged process to allocate system PMCs");
/*
* Hash function. Discard the lower 2 bits of the pointer since
* these are always zero for our uses. The hash multiplier is
* round((2^LONG_BIT) * ((sqrt(5)-1)/2)).
*/
#if LONG_BIT == 64
#define _PMC_HM 11400714819323198486u
#elif LONG_BIT == 32
#define _PMC_HM 2654435769u
#else
#error Must know the size of 'long' to compile
#endif
#define PMC_HASH_PTR(P,M) ((((unsigned long) (P) >> 2) * _PMC_HM) & (M))
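A small stand-alone illustration of the multiplicative (Fibonacci) hash above, assuming LONG_BIT == 64; the 256-row mask and the sample pointers are arbitrary, chosen only to show how nearby pointers scatter across rows.
#include <stdio.h>

#define _PMC_HM	11400714819323198486u	/* 64-bit golden-ratio multiplier */
#define PMC_HASH_PTR(P,M) ((((unsigned long) (P) >> 2) * _PMC_HM) & (M))

int
main(void)
{
	unsigned long mask = 255;	/* a 256-row table, mask = rows - 1 */
	int dummy[4];
	int i;

	/* Nearby addresses land on well-separated rows. */
	for (i = 0; i < 4; i++)
		printf("%p -> row %lu\n", (void *)&dummy[i],
		    PMC_HASH_PTR(&dummy[i], mask));
	return (0);
}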
/*
* Syscall structures
*/
/* The `sysent' for the new syscall */
static struct sysent pmc_sysent = {
2, /* sy_narg */
pmc_syscall_handler /* sy_call */
};
static struct syscall_module_data pmc_syscall_mod = {
load,
NULL,
&pmc_syscall_num,
&pmc_sysent,
{ 0, NULL }
};
static moduledata_t pmc_mod = {
PMC_MODULE_NAME,
syscall_module_handler,
&pmc_syscall_mod
};
DECLARE_MODULE(pmc, pmc_mod, SI_SUB_SMP, SI_ORDER_ANY);
MODULE_VERSION(pmc, PMC_VERSION);
#ifdef DEBUG
enum pmc_dbgparse_state {
PMCDS_WS, /* in whitespace */
PMCDS_MAJOR, /* seen a major keyword */
PMCDS_MINOR
};
static int
pmc_debugflags_parse(char *newstr, char *fence)
{
char c, *p, *q;
struct pmc_debugflags *tmpflags;
int error, found, *newbits, tmp;
size_t kwlen;
tmpflags = malloc(sizeof(*tmpflags), M_PMC, M_WAITOK|M_ZERO);
p = newstr;
error = 0;
for (; p < fence && (c = *p); p++) {
/* skip white space */
if (c == ' ' || c == '\t')
continue;
/* look for a keyword followed by "=" */
for (q = p; p < fence && (c = *p) && c != '='; p++)
;
if (c != '=') {
error = EINVAL;
goto done;
}
kwlen = p - q;
newbits = NULL;
/* lookup flag group name */
#define DBG_SET_FLAG_MAJ(S,F) \
if (kwlen == sizeof(S)-1 && strncmp(q, S, kwlen) == 0) \
newbits = &tmpflags->pdb_ ## F;
DBG_SET_FLAG_MAJ("cpu", CPU);
DBG_SET_FLAG_MAJ("csw", CSW);
DBG_SET_FLAG_MAJ("logging", LOG);
DBG_SET_FLAG_MAJ("module", MOD);
DBG_SET_FLAG_MAJ("md", MDP);
DBG_SET_FLAG_MAJ("owner", OWN);
DBG_SET_FLAG_MAJ("pmc", PMC);
DBG_SET_FLAG_MAJ("process", PRC);
DBG_SET_FLAG_MAJ("sampling", SAM);
if (newbits == NULL) {
error = EINVAL;
goto done;
}
p++; /* skip the '=' */
/* Now parse the individual flags */
tmp = 0;
newflag:
for (q = p; p < fence && (c = *p); p++)
if (c == ' ' || c == '\t' || c == ',')
break;
/* p == fence or c == ws or c == "," or c == 0 */
if ((kwlen = p - q) == 0) {
*newbits = tmp;
continue;
}
found = 0;
#define DBG_SET_FLAG_MIN(S,F) \
if (kwlen == sizeof(S)-1 && strncmp(q, S, kwlen) == 0) \
tmp |= found = (1 << PMC_DEBUG_MIN_ ## F)
/* a '*' denotes all possible flags in the group */
if (kwlen == 1 && *q == '*')
tmp = found = ~0;
/* look for individual flag names */
DBG_SET_FLAG_MIN("allocaterow", ALR);
DBG_SET_FLAG_MIN("allocate", ALL);
DBG_SET_FLAG_MIN("attach", ATT);
DBG_SET_FLAG_MIN("bind", BND);
DBG_SET_FLAG_MIN("config", CFG);
DBG_SET_FLAG_MIN("exec", EXC);
DBG_SET_FLAG_MIN("exit", EXT);
DBG_SET_FLAG_MIN("find", FND);
DBG_SET_FLAG_MIN("flush", FLS);
DBG_SET_FLAG_MIN("fork", FRK);
DBG_SET_FLAG_MIN("getbuf", GTB);
DBG_SET_FLAG_MIN("hook", PMH);
DBG_SET_FLAG_MIN("init", INI);
DBG_SET_FLAG_MIN("intr", INT);
DBG_SET_FLAG_MIN("linktarget", TLK);
DBG_SET_FLAG_MIN("mayberemove", OMR);
DBG_SET_FLAG_MIN("ops", OPS);
DBG_SET_FLAG_MIN("read", REA);
DBG_SET_FLAG_MIN("register", REG);
DBG_SET_FLAG_MIN("release", REL);
DBG_SET_FLAG_MIN("remove", ORM);
DBG_SET_FLAG_MIN("sample", SAM);
DBG_SET_FLAG_MIN("scheduleio", SIO);
DBG_SET_FLAG_MIN("select", SEL);
DBG_SET_FLAG_MIN("signal", SIG);
DBG_SET_FLAG_MIN("swi", SWI);
DBG_SET_FLAG_MIN("swo", SWO);
DBG_SET_FLAG_MIN("start", STA);
DBG_SET_FLAG_MIN("stop", STO);
DBG_SET_FLAG_MIN("syscall", PMS);
DBG_SET_FLAG_MIN("unlinktarget", TUL);
DBG_SET_FLAG_MIN("write", WRI);
if (found == 0) {
/* unrecognized flag name */
error = EINVAL;
goto done;
}
if (c == 0 || c == ' ' || c == '\t') { /* end of flag group */
*newbits = tmp;
continue;
}
p++;
goto newflag;
}
/* save the new flag set */
bcopy(tmpflags, &pmc_debugflags, sizeof(pmc_debugflags));
done:
free(tmpflags, M_PMC);
return error;
}
static int
pmc_debugflags_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
char *fence, *newstr;
int error;
unsigned int n;
(void) arg1; (void) arg2; /* unused parameters */
n = sizeof(pmc_debugstr);
newstr = malloc(n, M_PMC, M_WAITOK|M_ZERO);
(void) strlcpy(newstr, pmc_debugstr, n);
error = sysctl_handle_string(oidp, newstr, n, req);
/* if there is a new string, parse and copy it */
if (error == 0 && req->newptr != NULL) {
fence = newstr + (n < req->newlen ? n : req->newlen + 1);
if ((error = pmc_debugflags_parse(newstr, fence)) == 0)
(void) strlcpy(pmc_debugstr, newstr,
sizeof(pmc_debugstr));
}
free(newstr, M_PMC);
return error;
}
#endif
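Putting the two functions above together: on a DEBUG kernel, the flag string accepted via the kern.hwpmc.debugflags sysctl (or the matching loader tunable) is a space-separated list of <group>=<flag>[,<flag>...] clauses, where the group names are the DBG_SET_FLAG_MAJ keywords (cpu, csw, logging, module, md, owner, pmc, process, sampling) and '*' selects every flag in a group. For example, sysctl kern.hwpmc.debugflags="logging=scheduleio,flush process=attach,exit" would trace buffer scheduling, log flushes and process attach/exit handling; the particular combination is illustrative only.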
/*
* Map a row index to a classdep structure and return the adjusted row
* index for the PMC class index.
*/
static struct pmc_classdep *
pmc_ri_to_classdep(struct pmc_mdep *md, int ri, int *adjri)
{
struct pmc_classdep *pcd;
(void) md;
KASSERT(ri >= 0 && ri < md->pmd_npmc,
("[pmc,%d] illegal row-index %d", __LINE__, ri));
pcd = pmc_rowindex_to_classdep[ri];
KASSERT(pcd != NULL,
("[pmc,%d] ri %d null pcd", __LINE__, ri));
*adjri = ri - pcd->pcd_ri;
KASSERT(*adjri >= 0 && *adjri < pcd->pcd_num,
("[pmc,%d] adjusted row-index %d", __LINE__, *adjri));
return (pcd);
}
/*
* Concurrency Control
*
* The driver manages the following data structures:
*
* - target process descriptors, one per target process
* - owner process descriptors (and attached lists), one per owner process
* - lookup hash tables for owner and target processes
* - PMC descriptors (and attached lists)
* - per-cpu hardware state
* - the 'hook' variable through which the kernel calls into
* this module
* - the machine hardware state (managed by the MD layer)
*
* These data structures are accessed from:
*
* - thread context-switch code
* - interrupt handlers (possibly on multiple cpus)
* - kernel threads on multiple cpus running on behalf of user
* processes doing system calls
* - this driver's private kernel threads
*
* = Locks and Locking strategy =
*
* The driver uses four locking strategies for its operation:
*
* - The global SX lock "pmc_sx" is used to protect internal
* data structures.
*
* Calls into the module by syscall() start with this lock being
* held in exclusive mode. Depending on the requested operation,
* the lock may be downgraded to 'shared' mode to allow more
* concurrent readers into the module. Calls into the module from
* other parts of the kernel acquire the lock in shared mode.
*
* This SX lock is held in exclusive mode for any operations that
* modify the linkages between the driver's internal data structures.
*
* The 'pmc_hook' function pointer is also protected by this lock.
* It is only examined with the sx lock held in exclusive mode. The
* kernel module is allowed to be unloaded only with the sx lock held
* in exclusive mode. In normal syscall handling, after acquiring the
* pmc_sx lock we first check that 'pmc_hook' is non-null before
* proceeding. This prevents races between the thread unloading the module
* and other threads seeking to use the module.
*
* - Lookups of target process structures and owner process structures
* cannot use the global "pmc_sx" SX lock because these lookups need
* to happen during context switches and in other critical sections
* where sleeping is not allowed. We protect these lookup tables
* with their own private spin-mutexes, "pmc_processhash_mtx" and
* "pmc_ownerhash_mtx".
*
* - Interrupt handlers work in a lock free manner. At interrupt
* time, handlers look at the PMC pointer (phw->phw_pmc) configured
* when the PMC was started. If this pointer is NULL, the interrupt
* is ignored after updating driver statistics. We ensure that this
* pointer is set (using an atomic operation if necessary) before the
* PMC hardware is started. Conversely, this pointer is unset atomically
* only after the PMC hardware is stopped.
*
* We ensure that everything needed for the operation of an
* interrupt handler is available without it needing to acquire any
* locks. We also ensure that a PMC's software state is destroyed only
* after the PMC is taken off hardware (on all CPUs).
*
* - Context-switch handling with process-private PMCs needs more
* care.
*
* A given process may be the target of multiple PMCs. For example,
* PMCATTACH and PMCDETACH may be requested by a process on one CPU
* while the target process is running on another. A PMC could also
* be getting released because its owner is exiting. We tackle
* these situations in the following manner:
*
* - each target process structure 'pmc_process' has an array
* of 'struct pmc *' pointers, one for each hardware PMC.
*
* - At context switch IN time, each "target" PMC in RUNNING state
* gets started on hardware and a pointer to each PMC is copied into
* the per-cpu phw array. The 'runcount' for the PMC is
* incremented.
*
* - At context switch OUT time, all process-virtual PMCs are stopped
* on hardware. The saved value is added to the PMCs value field
* only if the PMC is in a non-deleted state (the PMCs state could
* have changed during the current time slice).
*
* Note that since in-between a switch IN on a processor and a switch
* OUT, the PMC could have been released on another CPU. Therefore
* context switch OUT always looks at the hardware state to turn
* OFF PMCs and will update a PMC's saved value only if reachable
* from the target process record.
*
* - OP PMCRELEASE could be called on a PMC at any time (the PMC could
* be attached to many processes at the time of the call and could
* be active on multiple CPUs).
*
* We prevent further scheduling of the PMC by marking it as in
* state 'DELETED'. If the runcount of the PMC is non-zero then
* this PMC is currently running on a CPU somewhere. The thread
* doing the PMCRELEASE operation waits by repeatedly doing a
* pause() till the runcount comes to zero.
*
* The contents of a PMC descriptor (struct pmc) are protected using
* a spin-mutex. In order to save space, we use a mutex pool.
*
* In terms of lock types used by witness(4), we use:
* - Type "pmc-sx", used by the global SX lock.
* - Type "pmc-sleep", for sleep mutexes used by logger threads.
* - Type "pmc-per-proc", for protecting PMC owner descriptors.
* - Type "pmc-leaf", used for all other spin mutexes.
*/
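The syscall-entry discipline described above can be summarized by the following schematic. It is a sketch of the locking pattern only, not the actual pmc_syscall_handler(); the is_read_op predicate is hypothetical and the operation dispatch is elided.
static int
pmc_syscall_locking_sketch(struct thread *td, void *uap)
{
	int error = 0, is_read_op = 0;	/* hypothetical op classification */

	(void)td; (void)uap;
	sx_xlock(&pmc_sx);
	if (pmc_hook == NULL) {		/* module is being unloaded */
		sx_xunlock(&pmc_sx);
		return (ENOSYS);
	}
	if (is_read_op) {
		sx_downgrade(&pmc_sx);	/* admit concurrent readers */
		/* ... perform the read-only operation ... */
		sx_sunlock(&pmc_sx);
	} else {
		/* ... modify the driver's internal linkages ... */
		sx_xunlock(&pmc_sx);
	}
	return (error);
}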
/*
* save the cpu binding of the current kthread
*/
static void
pmc_save_cpu_binding(struct pmc_binding *pb)
{
PMCDBG(CPU,BND,2, "%s", "save-cpu");
thread_lock(curthread);
pb->pb_bound = sched_is_bound(curthread);
pb->pb_cpu = curthread->td_oncpu;
thread_unlock(curthread);
PMCDBG(CPU,BND,2, "save-cpu cpu=%d", pb->pb_cpu);
}
/*
* restore the cpu binding of the current thread
*/
static void
pmc_restore_cpu_binding(struct pmc_binding *pb)
{
PMCDBG(CPU,BND,2, "restore-cpu curcpu=%d restore=%d",
curthread->td_oncpu, pb->pb_cpu);
thread_lock(curthread);
if (pb->pb_bound)
sched_bind(curthread, pb->pb_cpu);
else
sched_unbind(curthread);
thread_unlock(curthread);
PMCDBG(CPU,BND,2, "%s", "restore-cpu done");
}
/*
* move execution over to the specified cpu and bind it there.
*/
static void
pmc_select_cpu(int cpu)
{
KASSERT(cpu >= 0 && cpu < pmc_cpu_max(),
("[pmc,%d] bad cpu number %d", __LINE__, cpu));
/* Never move to an inactive CPU. */
KASSERT(pmc_cpu_is_active(cpu), ("[pmc,%d] selecting inactive "
"CPU %d", __LINE__, cpu));
PMCDBG(CPU,SEL,2, "select-cpu cpu=%d", cpu);
thread_lock(curthread);
sched_bind(curthread, cpu);
thread_unlock(curthread);
KASSERT(curthread->td_oncpu == cpu,
("[pmc,%d] CPU not bound [cpu=%d, curr=%d]", __LINE__,
cpu, curthread->td_oncpu));
PMCDBG(CPU,SEL,2, "select-cpu cpu=%d ok", cpu);
}
/*
* Force a context switch.
*
* We do this by pause'ing for 1 tick -- invoking mi_switch() is not
* guaranteed to force a context switch.
*/
static void
pmc_force_context_switch(void)
{
pause("pmcctx", 1);
}
/*
* Get the file name for an executable. This is a simple wrapper
* around vn_fullpath(9).
*/
static void
pmc_getfilename(struct vnode *v, char **fullpath, char **freepath)
{
*fullpath = "unknown";
*freepath = NULL;
vn_fullpath(curthread, v, fullpath, freepath);
}
/*
* remove a process owning PMCs
*/
void
pmc_remove_owner(struct pmc_owner *po)
{
struct pmc *pm, *tmp;
sx_assert(&pmc_sx, SX_XLOCKED);
PMCDBG(OWN,ORM,1, "remove-owner po=%p", po);
/* Remove descriptor from the owner hash table */
LIST_REMOVE(po, po_next);
/* release all owned PMC descriptors */
LIST_FOREACH_SAFE(pm, &po->po_pmcs, pm_next, tmp) {
PMCDBG(OWN,ORM,2, "pmc=%p", pm);
KASSERT(pm->pm_owner == po,
("[pmc,%d] owner %p != po %p", __LINE__, pm->pm_owner, po));
pmc_release_pmc_descriptor(pm); /* will unlink from the list */
}
KASSERT(po->po_sscount == 0,
("[pmc,%d] SS count not zero", __LINE__));
KASSERT(LIST_EMPTY(&po->po_pmcs),
("[pmc,%d] PMC list not empty", __LINE__));
/* de-configure the log file if present */
if (po->po_flags & PMC_PO_OWNS_LOGFILE)
pmclog_deconfigure_log(po);
}
/*
* remove an owner process record if all conditions are met.
*/
static void
pmc_maybe_remove_owner(struct pmc_owner *po)
{
PMCDBG(OWN,OMR,1, "maybe-remove-owner po=%p", po);
/*
* Remove owner record if
* - this process does not own any PMCs
* - this process does not own a log file (PMC_PO_OWNS_LOGFILE is clear)
*/
if (LIST_EMPTY(&po->po_pmcs) &&
((po->po_flags & PMC_PO_OWNS_LOGFILE) == 0)) {
pmc_remove_owner(po);
pmc_destroy_owner_descriptor(po);
}
}
/*
* Add an association between a target process and a PMC.
*/
static void
pmc_link_target_process(struct pmc *pm, struct pmc_process *pp)
{
int ri;
struct pmc_target *pt;
sx_assert(&pmc_sx, SX_XLOCKED);
KASSERT(pm != NULL && pp != NULL,
("[pmc,%d] Null pm %p or pp %p", __LINE__, pm, pp));
KASSERT(PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)),
("[pmc,%d] Attaching a non-process-virtual pmc=%p to pid=%d",
__LINE__, pm, pp->pp_proc->p_pid));
KASSERT(pp->pp_refcnt >= 0 && pp->pp_refcnt <= ((int) md->pmd_npmc - 1),
("[pmc,%d] Illegal reference count %d for process record %p",
__LINE__, pp->pp_refcnt, (void *) pp));
ri = PMC_TO_ROWINDEX(pm);
PMCDBG(PRC,TLK,1, "link-target pmc=%p ri=%d pmc-process=%p",
pm, ri, pp);
#ifdef DEBUG
LIST_FOREACH(pt, &pm->pm_targets, pt_next)
if (pt->pt_process == pp)
KASSERT(0, ("[pmc,%d] pp %p already in pmc %p targets",
__LINE__, pp, pm));
#endif
pt = malloc(sizeof(struct pmc_target), M_PMC, M_WAITOK|M_ZERO);
pt->pt_process = pp;
LIST_INSERT_HEAD(&pm->pm_targets, pt, pt_next);
atomic_store_rel_ptr((uintptr_t *)&pp->pp_pmcs[ri].pp_pmc,
(uintptr_t)pm);
if (pm->pm_owner->po_owner == pp->pp_proc)
pm->pm_flags |= PMC_F_ATTACHED_TO_OWNER;
/*
* Initialize the per-process values at this row index.
*/
pp->pp_pmcs[ri].pp_pmcval = PMC_TO_MODE(pm) == PMC_MODE_TS ?
pm->pm_sc.pm_reloadcount : 0;
pp->pp_refcnt++;
}
/*
* Removes the association between a target process and a PMC.
*/
static void
pmc_unlink_target_process(struct pmc *pm, struct pmc_process *pp)
{
int ri;
struct proc *p;
struct pmc_target *ptgt;
sx_assert(&pmc_sx, SX_XLOCKED);
KASSERT(pm != NULL && pp != NULL,
("[pmc,%d] Null pm %p or pp %p", __LINE__, pm, pp));
KASSERT(pp->pp_refcnt >= 1 && pp->pp_refcnt <= (int) md->pmd_npmc,
("[pmc,%d] Illegal ref count %d on process record %p",
__LINE__, pp->pp_refcnt, (void *) pp));
ri = PMC_TO_ROWINDEX(pm);
PMCDBG(PRC,TUL,1, "unlink-target pmc=%p ri=%d pmc-process=%p",
pm, ri, pp);
KASSERT(pp->pp_pmcs[ri].pp_pmc == pm,
("[pmc,%d] PMC ri %d mismatch pmc %p pp->[ri] %p", __LINE__,
ri, pm, pp->pp_pmcs[ri].pp_pmc));
pp->pp_pmcs[ri].pp_pmc = NULL;
pp->pp_pmcs[ri].pp_pmcval = (pmc_value_t) 0;
/* Remove owner-specific flags */
if (pm->pm_owner->po_owner == pp->pp_proc) {
pp->pp_flags &= ~PMC_PP_ENABLE_MSR_ACCESS;
pm->pm_flags &= ~PMC_F_ATTACHED_TO_OWNER;
}
pp->pp_refcnt--;
/* Remove the target process from the PMC structure */
LIST_FOREACH(ptgt, &pm->pm_targets, pt_next)
if (ptgt->pt_process == pp)
break;
KASSERT(ptgt != NULL, ("[pmc,%d] process %p (pp: %p) not found "
"in pmc %p", __LINE__, pp->pp_proc, pp, pm));
LIST_REMOVE(ptgt, pt_next);
free(ptgt, M_PMC);
/* if the PMC now lacks targets, send the owner a SIGIO */
if (LIST_EMPTY(&pm->pm_targets)) {
p = pm->pm_owner->po_owner;
PROC_LOCK(p);
- psignal(p, SIGIO);
+ kern_psignal(p, SIGIO);
PROC_UNLOCK(p);
PMCDBG(PRC,SIG,2, "signalling proc=%p signal=%d", p,
SIGIO);
}
}
/*
* Check if PMC 'pm' may be attached to target process 't'.
*/
static int
pmc_can_attach(struct pmc *pm, struct proc *t)
{
struct proc *o; /* pmc owner */
struct ucred *oc, *tc; /* owner, target credentials */
int decline_attach, i;
/*
* A PMC's owner can always attach that PMC to itself.
*/
if ((o = pm->pm_owner->po_owner) == t)
return 0;
PROC_LOCK(o);
oc = o->p_ucred;
crhold(oc);
PROC_UNLOCK(o);
PROC_LOCK(t);
tc = t->p_ucred;
crhold(tc);
PROC_UNLOCK(t);
/*
* The effective uid of the PMC owner should match at least one
* of the {effective,real,saved} uids of the target process.
*/
decline_attach = oc->cr_uid != tc->cr_uid &&
oc->cr_uid != tc->cr_svuid &&
oc->cr_uid != tc->cr_ruid;
/*
* Every one of the target's group ids must be in the owner's
* group list.
*/
for (i = 0; !decline_attach && i < tc->cr_ngroups; i++)
decline_attach = !groupmember(tc->cr_groups[i], oc);
/* check the real and saved gids too */
if (decline_attach == 0)
decline_attach = !groupmember(tc->cr_rgid, oc) ||
!groupmember(tc->cr_svgid, oc);
crfree(tc);
crfree(oc);
return !decline_attach;
}
/*
* Attach a process to a PMC.
*/
static int
pmc_attach_one_process(struct proc *p, struct pmc *pm)
{
int ri;
char *fullpath, *freepath;
struct pmc_process *pp;
sx_assert(&pmc_sx, SX_XLOCKED);
PMCDBG(PRC,ATT,2, "attach-one pm=%p ri=%d proc=%p (%d, %s)", pm,
PMC_TO_ROWINDEX(pm), p, p->p_pid, p->p_comm);
/*
* Locate the process descriptor corresponding to process 'p',
* allocating space as needed.
*
* Verify that rowindex 'pm_rowindex' is free in the process
* descriptor.
*
* If not, allocate space for a descriptor and link the
* process descriptor and PMC.
*/
ri = PMC_TO_ROWINDEX(pm);
if ((pp = pmc_find_process_descriptor(p, PMC_FLAG_ALLOCATE)) == NULL)
return ENOMEM;
if (pp->pp_pmcs[ri].pp_pmc == pm) /* already present at slot [ri] */
return EEXIST;
if (pp->pp_pmcs[ri].pp_pmc != NULL)
return EBUSY;
pmc_link_target_process(pm, pp);
if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)) &&
(pm->pm_flags & PMC_F_ATTACHED_TO_OWNER) == 0)
pm->pm_flags |= PMC_F_NEEDS_LOGFILE;
pm->pm_flags |= PMC_F_ATTACH_DONE; /* mark as attached */
/* issue an attach event to a configured log file */
if (pm->pm_owner->po_flags & PMC_PO_OWNS_LOGFILE) {
pmc_getfilename(p->p_textvp, &fullpath, &freepath);
if (p->p_flag & P_KTHREAD) {
fullpath = kernelname;
freepath = NULL;
} else
pmclog_process_pmcattach(pm, p->p_pid, fullpath);
if (freepath)
free(freepath, M_TEMP);
if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)))
pmc_log_process_mappings(pm->pm_owner, p);
}
/* mark process as using HWPMCs */
PROC_LOCK(p);
p->p_flag |= P_HWPMC;
PROC_UNLOCK(p);
return 0;
}
/*
* Attach a process and optionally its children
*/
static int
pmc_attach_process(struct proc *p, struct pmc *pm)
{
int error;
struct proc *top;
sx_assert(&pmc_sx, SX_XLOCKED);
PMCDBG(PRC,ATT,1, "attach pm=%p ri=%d proc=%p (%d, %s)", pm,
PMC_TO_ROWINDEX(pm), p, p->p_pid, p->p_comm);
/*
* If this PMC successfully allowed a GETMSR operation
* in the past, disallow further ATTACHes.
*/
if ((pm->pm_flags & PMC_PP_ENABLE_MSR_ACCESS) != 0)
return EPERM;
if ((pm->pm_flags & PMC_F_DESCENDANTS) == 0)
return pmc_attach_one_process(p, pm);
/*
* Traverse all child processes, attaching them to
* this PMC.
*/
sx_slock(&proctree_lock);
top = p;
for (;;) {
if ((error = pmc_attach_one_process(p, pm)) != 0)
break;
if (!LIST_EMPTY(&p->p_children))
p = LIST_FIRST(&p->p_children);
else for (;;) {
if (p == top)
goto done;
if (LIST_NEXT(p, p_sibling)) {
p = LIST_NEXT(p, p_sibling);
break;
}
p = p->p_pptr;
}
}
if (error)
(void) pmc_detach_process(top, pm);
done:
sx_sunlock(&proctree_lock);
return error;
}
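The loop above is an iterative pre-order walk of the process tree using the first-child (p_children), next-sibling (p_sibling) and parent (p_pptr) links, avoiding recursion in the kernel; the same idiom reappears in pmc_detach_process() and pmc_log_all_process_mappings() below. A self-contained illustration of the traversal over a small user-space tree follows; the node structure and names are hypothetical.
#include <stdio.h>

struct node {
	const char *name;
	struct node *child;	/* LIST_FIRST(&p->p_children) analogue */
	struct node *sibling;	/* LIST_NEXT(p, p_sibling) analogue */
	struct node *parent;	/* p->p_pptr analogue */
};

static void
visit_subtree(struct node *top)
{
	struct node *n = top;

	for (;;) {
		printf("visit %s\n", n->name);	/* attach/detach/log step */
		if (n->child != NULL)
			n = n->child;
		else for (;;) {
			if (n == top)
				return;
			if (n->sibling != NULL) {
				n = n->sibling;
				break;
			}
			n = n->parent;
		}
	}
}

int
main(void)
{
	struct node a = { "a" }, b = { "b" }, c = { "c" }, d = { "d" };

	a.child = &b; b.parent = &a; b.sibling = &c; c.parent = &a;
	b.child = &d; d.parent = &b;
	visit_subtree(&a);
	return (0);
}
The walk prints a, b, d, c: each node is visited before its children, and the inner loop climbs back toward 'top' until an unvisited sibling is found.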
/*
* Detach a process from a PMC. If there are no other PMCs tracking
* this process, remove the process structure from its hash table. If
* 'flags' contains PMC_FLAG_REMOVE, then free the process structure.
*/
static int
pmc_detach_one_process(struct proc *p, struct pmc *pm, int flags)
{
int ri;
struct pmc_process *pp;
sx_assert(&pmc_sx, SX_XLOCKED);
KASSERT(pm != NULL,
("[pmc,%d] null pm pointer", __LINE__));
ri = PMC_TO_ROWINDEX(pm);
PMCDBG(PRC,ATT,2, "detach-one pm=%p ri=%d proc=%p (%d, %s) flags=0x%x",
pm, ri, p, p->p_pid, p->p_comm, flags);
if ((pp = pmc_find_process_descriptor(p, 0)) == NULL)
return ESRCH;
if (pp->pp_pmcs[ri].pp_pmc != pm)
return EINVAL;
pmc_unlink_target_process(pm, pp);
/* Issue a detach entry if a log file is configured */
if (pm->pm_owner->po_flags & PMC_PO_OWNS_LOGFILE)
pmclog_process_pmcdetach(pm, p->p_pid);
/*
* If there are no PMCs targeting this process, we remove its
* descriptor from the target hash table and unset the P_HWPMC
* flag in the struct proc.
*/
KASSERT(pp->pp_refcnt >= 0 && pp->pp_refcnt <= (int) md->pmd_npmc,
("[pmc,%d] Illegal refcnt %d for process struct %p",
__LINE__, pp->pp_refcnt, pp));
if (pp->pp_refcnt != 0) /* still a target of some PMC */
return 0;
pmc_remove_process_descriptor(pp);
if (flags & PMC_FLAG_REMOVE)
free(pp, M_PMC);
PROC_LOCK(p);
p->p_flag &= ~P_HWPMC;
PROC_UNLOCK(p);
return 0;
}
/*
* Detach a process and optionally its descendants from a PMC.
*/
static int
pmc_detach_process(struct proc *p, struct pmc *pm)
{
struct proc *top;
sx_assert(&pmc_sx, SX_XLOCKED);
PMCDBG(PRC,ATT,1, "detach pm=%p ri=%d proc=%p (%d, %s)", pm,
PMC_TO_ROWINDEX(pm), p, p->p_pid, p->p_comm);
if ((pm->pm_flags & PMC_F_DESCENDANTS) == 0)
return pmc_detach_one_process(p, pm, PMC_FLAG_REMOVE);
/*
* Traverse all children, detaching them from this PMC. We
* ignore errors since we could be detaching a PMC from a
* partially attached proc tree.
*/
sx_slock(&proctree_lock);
top = p;
for (;;) {
(void) pmc_detach_one_process(p, pm, PMC_FLAG_REMOVE);
if (!LIST_EMPTY(&p->p_children))
p = LIST_FIRST(&p->p_children);
else for (;;) {
if (p == top)
goto done;
if (LIST_NEXT(p, p_sibling)) {
p = LIST_NEXT(p, p_sibling);
break;
}
p = p->p_pptr;
}
}
done:
sx_sunlock(&proctree_lock);
if (LIST_EMPTY(&pm->pm_targets))
pm->pm_flags &= ~PMC_F_ATTACH_DONE;
return 0;
}
/*
* Thread context switch IN
*/
static void
pmc_process_csw_in(struct thread *td)
{
int cpu;
unsigned int adjri, ri;
struct pmc *pm;
struct proc *p;
struct pmc_cpu *pc;
struct pmc_hw *phw;
pmc_value_t newvalue;
struct pmc_process *pp;
struct pmc_classdep *pcd;
p = td->td_proc;
if ((pp = pmc_find_process_descriptor(p, PMC_FLAG_NONE)) == NULL)
return;
KASSERT(pp->pp_proc == td->td_proc,
("[pmc,%d] not my thread state", __LINE__));
critical_enter(); /* no preemption from this point */
cpu = PCPU_GET(cpuid); /* td->td_oncpu is invalid */
PMCDBG(CSW,SWI,1, "cpu=%d proc=%p (%d, %s) pp=%p", cpu, p,
p->p_pid, p->p_comm, pp);
KASSERT(cpu >= 0 && cpu < pmc_cpu_max(),
("[pmc,%d] wierd CPU id %d", __LINE__, cpu));
pc = pmc_pcpu[cpu];
for (ri = 0; ri < md->pmd_npmc; ri++) {
if ((pm = pp->pp_pmcs[ri].pp_pmc) == NULL)
continue;
KASSERT(PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)),
("[pmc,%d] Target PMC in non-virtual mode (%d)",
__LINE__, PMC_TO_MODE(pm)));
KASSERT(PMC_TO_ROWINDEX(pm) == ri,
("[pmc,%d] Row index mismatch pmc %d != ri %d",
__LINE__, PMC_TO_ROWINDEX(pm), ri));
/*
* Only PMCs that are marked as 'RUNNING' need
* be placed on hardware.
*/
if (pm->pm_state != PMC_STATE_RUNNING)
continue;
/* increment PMC runcount */
atomic_add_rel_int(&pm->pm_runcount, 1);
/* configure the HWPMC we are going to use. */
pcd = pmc_ri_to_classdep(md, ri, &adjri);
pcd->pcd_config_pmc(cpu, adjri, pm);
phw = pc->pc_hwpmcs[ri];
KASSERT(phw != NULL,
("[pmc,%d] null hw pointer", __LINE__));
KASSERT(phw->phw_pmc == pm,
("[pmc,%d] hw->pmc %p != pmc %p", __LINE__,
phw->phw_pmc, pm));
/*
* Write out saved value and start the PMC.
*
* Sampling PMCs use a per-process value, while
* counting mode PMCs use a per-pmc value that is
* inherited across descendants.
*/
if (PMC_TO_MODE(pm) == PMC_MODE_TS) {
mtx_pool_lock_spin(pmc_mtxpool, pm);
newvalue = PMC_PCPU_SAVED(cpu,ri) =
pp->pp_pmcs[ri].pp_pmcval;
mtx_pool_unlock_spin(pmc_mtxpool, pm);
} else {
KASSERT(PMC_TO_MODE(pm) == PMC_MODE_TC,
("[pmc,%d] illegal mode=%d", __LINE__,
PMC_TO_MODE(pm)));
mtx_pool_lock_spin(pmc_mtxpool, pm);
newvalue = PMC_PCPU_SAVED(cpu, ri) =
pm->pm_gv.pm_savedvalue;
mtx_pool_unlock_spin(pmc_mtxpool, pm);
}
PMCDBG(CSW,SWI,1,"cpu=%d ri=%d new=%jd", cpu, ri, newvalue);
pcd->pcd_write_pmc(cpu, adjri, newvalue);
pcd->pcd_start_pmc(cpu, adjri);
}
/*
* perform any other architecture/cpu dependent thread
* switch-in actions.
*/
(void) (*md->pmd_switch_in)(pc, pp);
critical_exit();
}
/*
* Thread context switch OUT.
*/
static void
pmc_process_csw_out(struct thread *td)
{
int cpu;
int64_t tmp;
struct pmc *pm;
struct proc *p;
enum pmc_mode mode;
struct pmc_cpu *pc;
pmc_value_t newvalue;
unsigned int adjri, ri;
struct pmc_process *pp;
struct pmc_classdep *pcd;
/*
* Locate our process descriptor; this may be NULL if
* this process is exiting and we have already removed
* the process from the target process table.
*
* Note that due to kernel preemption, multiple
* context switches may happen while the process is
* exiting.
*
* Note also that if the target process cannot be
* found we still need to deconfigure any PMCs that
* are currently running on hardware.
*/
p = td->td_proc;
pp = pmc_find_process_descriptor(p, PMC_FLAG_NONE);
/*
* save PMCs
*/
critical_enter();
cpu = PCPU_GET(cpuid); /* td->td_oncpu is invalid */
PMCDBG(CSW,SWO,1, "cpu=%d proc=%p (%d, %s) pp=%p", cpu, p,
p->p_pid, p->p_comm, pp);
KASSERT(cpu >= 0 && cpu < pmc_cpu_max(),
("[pmc,%d wierd CPU id %d", __LINE__, cpu));
pc = pmc_pcpu[cpu];
/*
* When a PMC gets unlinked from a target PMC, it will
* be removed from the target's pp_pmc[] array.
*
* However, on a MP system, the target could have been
* executing on another CPU at the time of the unlink.
* So, at context switch OUT time, we need to look at
* the hardware to determine if a PMC is scheduled on
* it.
*/
for (ri = 0; ri < md->pmd_npmc; ri++) {
pcd = pmc_ri_to_classdep(md, ri, &adjri);
pm = NULL;
(void) (*pcd->pcd_get_config)(cpu, adjri, &pm);
if (pm == NULL) /* nothing at this row index */
continue;
mode = PMC_TO_MODE(pm);
if (!PMC_IS_VIRTUAL_MODE(mode))
continue; /* not a process virtual PMC */
KASSERT(PMC_TO_ROWINDEX(pm) == ri,
("[pmc,%d] ri mismatch pmc(%d) ri(%d)",
__LINE__, PMC_TO_ROWINDEX(pm), ri));
/* Stop hardware if not already stopped */
if (pm->pm_stalled == 0)
pcd->pcd_stop_pmc(cpu, adjri);
/* reduce this PMC's runcount */
atomic_subtract_rel_int(&pm->pm_runcount, 1);
/*
* If this PMC is associated with this process,
* save the reading.
*/
if (pp != NULL && pp->pp_pmcs[ri].pp_pmc != NULL) {
KASSERT(pm == pp->pp_pmcs[ri].pp_pmc,
("[pmc,%d] pm %p != pp_pmcs[%d] %p", __LINE__,
pm, ri, pp->pp_pmcs[ri].pp_pmc));
KASSERT(pp->pp_refcnt > 0,
("[pmc,%d] pp refcnt = %d", __LINE__,
pp->pp_refcnt));
pcd->pcd_read_pmc(cpu, adjri, &newvalue);
tmp = newvalue - PMC_PCPU_SAVED(cpu,ri);
PMCDBG(CSW,SWO,1,"cpu=%d ri=%d tmp=%jd", cpu, ri,
tmp);
if (mode == PMC_MODE_TS) {
/*
* For sampling process-virtual PMCs,
* we expect the count to be
* decreasing as the 'value'
* programmed into the PMC is the
* number of events to be seen till
* the next sampling interrupt.
*/
if (tmp < 0)
tmp += pm->pm_sc.pm_reloadcount;
mtx_pool_lock_spin(pmc_mtxpool, pm);
pp->pp_pmcs[ri].pp_pmcval -= tmp;
if ((int64_t) pp->pp_pmcs[ri].pp_pmcval < 0)
pp->pp_pmcs[ri].pp_pmcval +=
pm->pm_sc.pm_reloadcount;
mtx_pool_unlock_spin(pmc_mtxpool, pm);
} else {
/*
* For counting process-virtual PMCs,
* we expect the count to be
* increasing monotonically, modulo a 64
* bit wraparound.
*/
KASSERT((int64_t) tmp >= 0,
("[pmc,%d] negative increment cpu=%d "
"ri=%d newvalue=%jx saved=%jx "
"incr=%jx", __LINE__, cpu, ri,
newvalue, PMC_PCPU_SAVED(cpu,ri), tmp));
mtx_pool_lock_spin(pmc_mtxpool, pm);
pm->pm_gv.pm_savedvalue += tmp;
pp->pp_pmcs[ri].pp_pmcval += tmp;
mtx_pool_unlock_spin(pmc_mtxpool, pm);
if (pm->pm_flags & PMC_F_LOG_PROCCSW)
pmclog_process_proccsw(pm, pp, tmp);
}
}
/* mark hardware as free */
pcd->pcd_config_pmc(cpu, adjri, NULL);
}
/*
* perform any other architecture/cpu dependent thread
* switch out functions.
*/
(void) (*md->pmd_switch_out)(pc, pp);
critical_exit();
}
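A worked example may help (the numbers are illustrative only): for a counting-mode (TC) PMC whose saved per-CPU value was 5,000,000 at switch-in, a hardware reading of 5,042,000 at switch-out gives tmp = 42,000, which is added to both the PMC's global pm_savedvalue and the process's pp_pmcval (and, if PMC_F_LOG_PROCCSW is set, logged as a PROCCSW record). For a sampling-mode (TS) PMC the delta is first brought back into the range [0, pm_reloadcount) and then subtracted from pp_pmcval, which is likewise wrapped, so that pp_pmcval always records how many events remain before the next sampling interrupt when the process is switched back in.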
/*
* Log a KLD operation.
*/
static void
pmc_process_kld_load(struct pmckern_map_in *pkm)
{
struct pmc_owner *po;
sx_assert(&pmc_sx, SX_LOCKED);
/*
* Notify owners of system sampling PMCs about KLD operations.
*/
LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
if (po->po_flags & PMC_PO_OWNS_LOGFILE)
pmclog_process_map_in(po, (pid_t) -1, pkm->pm_address,
(char *) pkm->pm_file);
/*
* TODO: Notify owners of (all) process-sampling PMCs too.
*/
return;
}
static void
pmc_process_kld_unload(struct pmckern_map_out *pkm)
{
struct pmc_owner *po;
sx_assert(&pmc_sx, SX_LOCKED);
LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
if (po->po_flags & PMC_PO_OWNS_LOGFILE)
pmclog_process_map_out(po, (pid_t) -1,
pkm->pm_address, pkm->pm_address + pkm->pm_size);
/*
* TODO: Notify owners of process-sampling PMCs.
*/
}
/*
* A mapping change for a process.
*/
static void
pmc_process_mmap(struct thread *td, struct pmckern_map_in *pkm)
{
int ri;
pid_t pid;
char *fullpath, *freepath;
const struct pmc *pm;
struct pmc_owner *po;
const struct pmc_process *pp;
freepath = fullpath = NULL;
pmc_getfilename((struct vnode *) pkm->pm_file, &fullpath, &freepath);
pid = td->td_proc->p_pid;
/* Inform owners of all system-wide sampling PMCs. */
LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
if (po->po_flags & PMC_PO_OWNS_LOGFILE)
pmclog_process_map_in(po, pid, pkm->pm_address, fullpath);
if ((pp = pmc_find_process_descriptor(td->td_proc, 0)) == NULL)
goto done;
/*
* Inform sampling PMC owners tracking this process.
*/
for (ri = 0; ri < md->pmd_npmc; ri++)
if ((pm = pp->pp_pmcs[ri].pp_pmc) != NULL &&
PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)))
pmclog_process_map_in(pm->pm_owner,
pid, pkm->pm_address, fullpath);
done:
if (freepath)
free(freepath, M_TEMP);
}
/*
* Log an munmap request.
*/
static void
pmc_process_munmap(struct thread *td, struct pmckern_map_out *pkm)
{
int ri;
pid_t pid;
struct pmc_owner *po;
const struct pmc *pm;
const struct pmc_process *pp;
pid = td->td_proc->p_pid;
LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
if (po->po_flags & PMC_PO_OWNS_LOGFILE)
pmclog_process_map_out(po, pid, pkm->pm_address,
pkm->pm_address + pkm->pm_size);
if ((pp = pmc_find_process_descriptor(td->td_proc, 0)) == NULL)
return;
for (ri = 0; ri < md->pmd_npmc; ri++)
if ((pm = pp->pp_pmcs[ri].pp_pmc) != NULL &&
PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)))
pmclog_process_map_out(pm->pm_owner, pid,
pkm->pm_address, pkm->pm_address + pkm->pm_size);
}
/*
* Log mapping information about the kernel.
*/
static void
pmc_log_kernel_mappings(struct pmc *pm)
{
struct pmc_owner *po;
struct pmckern_map_in *km, *kmbase;
sx_assert(&pmc_sx, SX_LOCKED);
KASSERT(PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)),
("[pmc,%d] non-sampling PMC (%p) desires mapping information",
__LINE__, (void *) pm));
po = pm->pm_owner;
if (po->po_flags & PMC_PO_INITIAL_MAPPINGS_DONE)
return;
/*
* Log the current set of kernel modules.
*/
kmbase = linker_hwpmc_list_objects();
for (km = kmbase; km->pm_file != NULL; km++) {
PMCDBG(LOG,REG,1,"%s %p", (char *) km->pm_file,
(void *) km->pm_address);
pmclog_process_map_in(po, (pid_t) -1, km->pm_address,
km->pm_file);
}
free(kmbase, M_LINKER);
po->po_flags |= PMC_PO_INITIAL_MAPPINGS_DONE;
}
/*
* Log the mappings for a single process.
*/
static void
pmc_log_process_mappings(struct pmc_owner *po, struct proc *p)
{
int locked;
vm_map_t map;
struct vnode *vp;
struct vmspace *vm;
vm_map_entry_t entry;
vm_offset_t last_end;
u_int last_timestamp;
struct vnode *last_vp;
vm_offset_t start_addr;
vm_object_t obj, lobj, tobj;
char *fullpath, *freepath;
last_vp = NULL;
last_end = (vm_offset_t) 0;
fullpath = freepath = NULL;
if ((vm = vmspace_acquire_ref(p)) == NULL)
return;
map = &vm->vm_map;
vm_map_lock_read(map);
for (entry = map->header.next; entry != &map->header; entry = entry->next) {
if (entry == NULL) {
PMCDBG(LOG,OPS,2, "hwpmc: vm_map entry unexpectedly "
"NULL! pid=%d vm_map=%p\n", p->p_pid, map);
break;
}
/*
* We only care about executable map entries.
*/
if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) ||
!(entry->protection & VM_PROT_EXECUTE) ||
(entry->object.vm_object == NULL)) {
continue;
}
obj = entry->object.vm_object;
VM_OBJECT_LOCK(obj);
/*
* Walk the backing_object list to find the base
* (non-shadowed) vm_object.
*/
for (lobj = tobj = obj; tobj != NULL; tobj = tobj->backing_object) {
if (tobj != obj)
VM_OBJECT_LOCK(tobj);
if (lobj != obj)
VM_OBJECT_UNLOCK(lobj);
lobj = tobj;
}
/*
* At this point lobj is the base vm_object and it is locked.
*/
if (lobj == NULL) {
PMCDBG(LOG,OPS,2, "hwpmc: lobj unexpectedly NULL! pid=%d "
"vm_map=%p vm_obj=%p\n", p->p_pid, map, obj);
VM_OBJECT_UNLOCK(obj);
continue;
}
if (lobj->type != OBJT_VNODE || lobj->handle == NULL) {
if (lobj != obj)
VM_OBJECT_UNLOCK(lobj);
VM_OBJECT_UNLOCK(obj);
continue;
}
/*
* Skip contiguous regions that point to the same
* vnode, so we don't emit redundant MAP-IN
* directives.
*/
if (entry->start == last_end && lobj->handle == last_vp) {
last_end = entry->end;
if (lobj != obj)
VM_OBJECT_UNLOCK(lobj);
VM_OBJECT_UNLOCK(obj);
continue;
}
/*
* We don't want to keep the proc's vm_map or this
* vm_object locked while we walk the pathname, since
* vn_fullpath() can sleep. However, if we drop the
* lock, it's possible for concurrent activity to
* modify the vm_map list. To protect against this,
* we save the vm_map timestamp before we release the
* lock, and check it after we reacquire the lock
* below.
*/
start_addr = entry->start;
last_end = entry->end;
last_timestamp = map->timestamp;
vm_map_unlock_read(map);
vp = lobj->handle;
vref(vp);
if (lobj != obj)
VM_OBJECT_UNLOCK(lobj);
VM_OBJECT_UNLOCK(obj);
freepath = NULL;
pmc_getfilename(vp, &fullpath, &freepath);
last_vp = vp;
locked = VFS_LOCK_GIANT(vp->v_mount);
vrele(vp);
VFS_UNLOCK_GIANT(locked);
vp = NULL;
pmclog_process_map_in(po, p->p_pid, start_addr, fullpath);
if (freepath)
free(freepath, M_TEMP);
vm_map_lock_read(map);
/*
* If our saved timestamp doesn't match, this means
* that the vm_map was modified out from under us and
* we can't trust our current "entry" pointer. Do a
* new lookup for this entry. If there is no entry
* for this address range, vm_map_lookup_entry() will
* return the previous one, so we always want to go to
* entry->next on the next loop iteration.
*
* There is an edge condition here that can occur if
* there is no entry at or before this address. In
* this situation, vm_map_lookup_entry returns
* &map->header, which would cause our loop to abort
* without processing the rest of the map. However,
* in practice this will never happen for process
* vm_map. This is because the executable's text
* segment is the first mapping in the proc's address
* space, and this mapping is never removed until the
* process exits, so there will always be a non-header
* entry at or before the requested address for
* vm_map_lookup_entry to return.
*/
if (map->timestamp != last_timestamp)
vm_map_lookup_entry(map, last_end - 1, &entry);
}
vm_map_unlock_read(map);
vmspace_free(vm);
return;
}
/*
* Log mappings for all processes in the system.
*/
static void
pmc_log_all_process_mappings(struct pmc_owner *po)
{
struct proc *p, *top;
sx_assert(&pmc_sx, SX_XLOCKED);
if ((p = pfind(1)) == NULL)
panic("[pmc,%d] Cannot find init", __LINE__);
PROC_UNLOCK(p);
sx_slock(&proctree_lock);
top = p;
for (;;) {
pmc_log_process_mappings(po, p);
if (!LIST_EMPTY(&p->p_children))
p = LIST_FIRST(&p->p_children);
else for (;;) {
if (p == top)
goto done;
if (LIST_NEXT(p, p_sibling)) {
p = LIST_NEXT(p, p_sibling);
break;
}
p = p->p_pptr;
}
}
done:
sx_sunlock(&proctree_lock);
}
/*
* The 'hook' invoked from the kernel proper
*/
#ifdef DEBUG
const char *pmc_hooknames[] = {
/* these strings correspond to PMC_FN_* in <sys/pmckern.h> */
"",
"EXEC",
"CSW-IN",
"CSW-OUT",
"SAMPLE",
"KLDLOAD",
"KLDUNLOAD",
"MMAP",
"MUNMAP",
"CALLCHAIN"
};
#endif
static int
pmc_hook_handler(struct thread *td, int function, void *arg)
{
PMCDBG(MOD,PMH,1, "hook td=%p func=%d \"%s\" arg=%p", td, function,
pmc_hooknames[function], arg);
switch (function)
{
/*
* Process exec()
*/
case PMC_FN_PROCESS_EXEC:
{
char *fullpath, *freepath;
unsigned int ri;
int is_using_hwpmcs;
struct pmc *pm;
struct proc *p;
struct pmc_owner *po;
struct pmc_process *pp;
struct pmckern_procexec *pk;
sx_assert(&pmc_sx, SX_XLOCKED);
p = td->td_proc;
pmc_getfilename(p->p_textvp, &fullpath, &freepath);
pk = (struct pmckern_procexec *) arg;
/* Inform owners of SS mode PMCs of the exec event. */
LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
if (po->po_flags & PMC_PO_OWNS_LOGFILE)
pmclog_process_procexec(po, PMC_ID_INVALID,
p->p_pid, pk->pm_entryaddr, fullpath);
PROC_LOCK(p);
is_using_hwpmcs = p->p_flag & P_HWPMC;
PROC_UNLOCK(p);
if (!is_using_hwpmcs) {
if (freepath)
free(freepath, M_TEMP);
break;
}
/*
* PMCs are not inherited across an exec(): remove any
* PMCs that this process is the owner of.
*/
if ((po = pmc_find_owner_descriptor(p)) != NULL) {
pmc_remove_owner(po);
pmc_destroy_owner_descriptor(po);
}
/*
* If the process being exec'ed is not the target of any
* PMC, we are done.
*/
if ((pp = pmc_find_process_descriptor(p, 0)) == NULL) {
if (freepath)
free(freepath, M_TEMP);
break;
}
/*
* Log the exec event to all monitoring owners. Skip
* owners who have already received the event because
* they had system sampling PMCs active.
*/
for (ri = 0; ri < md->pmd_npmc; ri++)
if ((pm = pp->pp_pmcs[ri].pp_pmc) != NULL) {
po = pm->pm_owner;
if (po->po_sscount == 0 &&
po->po_flags & PMC_PO_OWNS_LOGFILE)
pmclog_process_procexec(po, pm->pm_id,
p->p_pid, pk->pm_entryaddr,
fullpath);
}
if (freepath)
free(freepath, M_TEMP);
PMCDBG(PRC,EXC,1, "exec proc=%p (%d, %s) cred-changed=%d",
p, p->p_pid, p->p_comm, pk->pm_credentialschanged);
if (pk->pm_credentialschanged == 0) /* no change */
break;
/*
* If the newly exec()'ed process has a different credential
* than before, allow it to be the target of a PMC only if
* the PMC's owner has sufficient privilege.
*/
for (ri = 0; ri < md->pmd_npmc; ri++)
if ((pm = pp->pp_pmcs[ri].pp_pmc) != NULL)
if (pmc_can_attach(pm, td->td_proc) != 0)
pmc_detach_one_process(td->td_proc,
pm, PMC_FLAG_NONE);
KASSERT(pp->pp_refcnt >= 0 && pp->pp_refcnt <= (int) md->pmd_npmc,
("[pmc,%d] Illegal ref count %d on pp %p", __LINE__,
pp->pp_refcnt, pp));
/*
* If this process is no longer the target of any
* PMCs, we can remove the process entry and free
* up space.
*/
if (pp->pp_refcnt == 0) {
pmc_remove_process_descriptor(pp);
free(pp, M_PMC);
break;
}
}
break;
case PMC_FN_CSW_IN:
pmc_process_csw_in(td);
break;
case PMC_FN_CSW_OUT:
pmc_process_csw_out(td);
break;
/*
* Process accumulated PC samples.
*
* This function is expected to be called by hardclock() for
* each CPU that has accumulated PC samples.
*
* This function is to be executed on the CPU whose samples
* are being processed.
*/
case PMC_FN_DO_SAMPLES:
/*
* Clear the cpu specific bit in the CPU mask before
* doing the rest of the processing. If the NMI handler
* gets invoked after the "atomic_clear_int()" call
* below but before "pmc_process_samples()" gets
* around to processing the interrupt, then we will
* come back here at the next hardclock() tick (and
* may find nothing to do if "pmc_process_samples()"
* had already processed the interrupt). We don't
* lose the interrupt sample.
*/
CPU_CLR_ATOMIC(PCPU_GET(cpuid), &pmc_cpumask);
pmc_process_samples(PCPU_GET(cpuid));
break;
case PMC_FN_KLD_LOAD:
sx_assert(&pmc_sx, SX_LOCKED);
pmc_process_kld_load((struct pmckern_map_in *) arg);
break;
case PMC_FN_KLD_UNLOAD:
sx_assert(&pmc_sx, SX_LOCKED);
pmc_process_kld_unload((struct pmckern_map_out *) arg);
break;
case PMC_FN_MMAP:
sx_assert(&pmc_sx, SX_LOCKED);
pmc_process_mmap(td, (struct pmckern_map_in *) arg);
break;
case PMC_FN_MUNMAP:
sx_assert(&pmc_sx, SX_LOCKED);
pmc_process_munmap(td, (struct pmckern_map_out *) arg);
break;
case PMC_FN_USER_CALLCHAIN:
/*
* Record a call chain.
*/
KASSERT(td == curthread, ("[pmc,%d] td != curthread",
__LINE__));
pmc_capture_user_callchain(PCPU_GET(cpuid),
(struct trapframe *) arg);
td->td_pflags &= ~TDP_CALLCHAIN;
break;
default:
#ifdef DEBUG
KASSERT(0, ("[pmc,%d] unknown hook %d\n", __LINE__, function));
#endif
break;
}
return 0;
}
/*
* allocate a 'struct pmc_owner' descriptor in the owner hash table.
*/
static struct pmc_owner *
pmc_allocate_owner_descriptor(struct proc *p)
{
uint32_t hindex;
struct pmc_owner *po;
struct pmc_ownerhash *poh;
hindex = PMC_HASH_PTR(p, pmc_ownerhashmask);
poh = &pmc_ownerhash[hindex];
/* allocate space for the owner descriptor */
po = malloc(sizeof(struct pmc_owner), M_PMC, M_WAITOK|M_ZERO);
po->po_sscount = po->po_error = po->po_flags = po->po_logprocmaps = 0;
po->po_file = NULL;
po->po_owner = p;
po->po_kthread = NULL;
LIST_INIT(&po->po_pmcs);
LIST_INSERT_HEAD(poh, po, po_next); /* insert into hash table */
TAILQ_INIT(&po->po_logbuffers);
mtx_init(&po->po_mtx, "pmc-owner-mtx", "pmc-per-proc", MTX_SPIN);
PMCDBG(OWN,ALL,1, "allocate-owner proc=%p (%d, %s) pmc-owner=%p",
p, p->p_pid, p->p_comm, po);
return po;
}
static void
pmc_destroy_owner_descriptor(struct pmc_owner *po)
{
PMCDBG(OWN,REL,1, "destroy-owner po=%p proc=%p (%d, %s)",
po, po->po_owner, po->po_owner->p_pid, po->po_owner->p_comm);
mtx_destroy(&po->po_mtx);
free(po, M_PMC);
}
/*
* find the descriptor corresponding to process 'p', adding or removing it
* as specified by 'mode'.
*/
static struct pmc_process *
pmc_find_process_descriptor(struct proc *p, uint32_t mode)
{
uint32_t hindex;
struct pmc_process *pp, *ppnew;
struct pmc_processhash *pph;
hindex = PMC_HASH_PTR(p, pmc_processhashmask);
pph = &pmc_processhash[hindex];
ppnew = NULL;
/*
* Pre-allocate memory in the FIND_ALLOCATE case since we
* cannot call malloc(9) once we hold a spin lock.
*/
if (mode & PMC_FLAG_ALLOCATE)
ppnew = malloc(sizeof(struct pmc_process) + md->pmd_npmc *
sizeof(struct pmc_targetstate), M_PMC, M_WAITOK|M_ZERO);
mtx_lock_spin(&pmc_processhash_mtx);
LIST_FOREACH(pp, pph, pp_next)
if (pp->pp_proc == p)
break;
if ((mode & PMC_FLAG_REMOVE) && pp != NULL)
LIST_REMOVE(pp, pp_next);
if ((mode & PMC_FLAG_ALLOCATE) && pp == NULL &&
ppnew != NULL) {
ppnew->pp_proc = p;
LIST_INSERT_HEAD(pph, ppnew, pp_next);
pp = ppnew;
ppnew = NULL;
}
mtx_unlock_spin(&pmc_processhash_mtx);
if (pp != NULL && ppnew != NULL)
free(ppnew, M_PMC);
return pp;
}
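/*
* The lookup above illustrates the usual pattern for spin-lock
* protected hash tables: pre-allocate with M_WAITOK before taking
* the lock, insert only if no entry was found, and free the unused
* allocation after dropping the lock.  A minimal sketch of the
* pattern (hypothetical names, illustration only):
*
*	new = malloc(sizeof(*new), M_TEMP, M_WAITOK | M_ZERO);
*	mtx_lock_spin(&hash_mtx);
*	if ((found = lookup(key)) == NULL) {
*		insert_into_hash(new, key);
*		found = new;
*		new = NULL;
*	}
*	mtx_unlock_spin(&hash_mtx);
*	if (new != NULL)
*		free(new, M_TEMP);
*/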
/*
* remove a process descriptor from the process hash table.
*/
static void
pmc_remove_process_descriptor(struct pmc_process *pp)
{
KASSERT(pp->pp_refcnt == 0,
("[pmc,%d] Removing process descriptor %p with count %d",
__LINE__, pp, pp->pp_refcnt));
mtx_lock_spin(&pmc_processhash_mtx);
LIST_REMOVE(pp, pp_next);
mtx_unlock_spin(&pmc_processhash_mtx);
}
/*
* find an owner descriptor corresponding to proc 'p'
*/
static struct pmc_owner *
pmc_find_owner_descriptor(struct proc *p)
{
uint32_t hindex;
struct pmc_owner *po;
struct pmc_ownerhash *poh;
hindex = PMC_HASH_PTR(p, pmc_ownerhashmask);
poh = &pmc_ownerhash[hindex];
po = NULL;
LIST_FOREACH(po, poh, po_next)
if (po->po_owner == p)
break;
PMCDBG(OWN,FND,1, "find-owner proc=%p (%d, %s) hindex=0x%x -> "
"pmc-owner=%p", p, p->p_pid, p->p_comm, hindex, po);
return po;
}
/*
* pmc_allocate_pmc_descriptor
*
* Allocate a pmc descriptor and initialize its
* fields.
*/
static struct pmc *
pmc_allocate_pmc_descriptor(void)
{
struct pmc *pmc;
pmc = malloc(sizeof(struct pmc), M_PMC, M_WAITOK|M_ZERO);
if (pmc != NULL) {
pmc->pm_owner = NULL;
LIST_INIT(&pmc->pm_targets);
}
PMCDBG(PMC,ALL,1, "allocate-pmc -> pmc=%p", pmc);
return pmc;
}
/*
* Destroy a pmc descriptor.
*/
static void
pmc_destroy_pmc_descriptor(struct pmc *pm)
{
(void) pm;
#ifdef DEBUG
KASSERT(pm->pm_state == PMC_STATE_DELETED ||
pm->pm_state == PMC_STATE_FREE,
("[pmc,%d] destroying non-deleted PMC", __LINE__));
KASSERT(LIST_EMPTY(&pm->pm_targets),
("[pmc,%d] destroying pmc with targets", __LINE__));
KASSERT(pm->pm_owner == NULL,
("[pmc,%d] destroying pmc attached to an owner", __LINE__));
KASSERT(pm->pm_runcount == 0,
("[pmc,%d] pmc has non-zero run count %d", __LINE__,
pm->pm_runcount));
#endif
}
static void
pmc_wait_for_pmc_idle(struct pmc *pm)
{
#ifdef DEBUG
volatile int maxloop;
maxloop = 100 * pmc_cpu_max();
#endif
/*
* Loop (with a forced context switch) till the PMC's runcount
* comes down to zero.
*/
while (atomic_load_acq_32(&pm->pm_runcount) > 0) {
#ifdef DEBUG
maxloop--;
KASSERT(maxloop > 0,
("[pmc,%d] (ri%d, rc%d) waiting too long for "
"pmc to be free", __LINE__,
PMC_TO_ROWINDEX(pm), pm->pm_runcount));
#endif
pmc_force_context_switch();
}
}
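/*
* The acquire load of 'pm_runcount' above pairs with the
* release-ordered decrements (atomic_subtract_rel_int()) performed
* elsewhere in this file, for example when a sample is retired in
* pmc_process_samples() or when an exiting process is switched off
* the hardware, so that once the count is observed to be zero, all
* prior uses of the PMC by other CPUs are visible to this thread.
*/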
/*
* This function does the following things:
*
* - detaches the PMC from hardware
* - unlinks all target threads that were attached to it
* - removes the PMC from its owner's list
* - destroys the PMC's private mutex
*
* Once this function completes, the given pmc pointer can be safely
* FREE'd by the caller.
*/
static void
pmc_release_pmc_descriptor(struct pmc *pm)
{
enum pmc_mode mode;
struct pmc_hw *phw;
u_int adjri, ri, cpu;
struct pmc_owner *po;
struct pmc_binding pb;
struct pmc_process *pp;
struct pmc_classdep *pcd;
struct pmc_target *ptgt, *tmp;
sx_assert(&pmc_sx, SX_XLOCKED);
KASSERT(pm, ("[pmc,%d] null pmc", __LINE__));
ri = PMC_TO_ROWINDEX(pm);
pcd = pmc_ri_to_classdep(md, ri, &adjri);
mode = PMC_TO_MODE(pm);
PMCDBG(PMC,REL,1, "release-pmc pmc=%p ri=%d mode=%d", pm, ri,
mode);
/*
* First, we take the PMC off hardware.
*/
cpu = 0;
if (PMC_IS_SYSTEM_MODE(mode)) {
/*
* A system mode PMC runs on a specific CPU. Switch
* to this CPU and turn hardware off.
*/
pmc_save_cpu_binding(&pb);
cpu = PMC_TO_CPU(pm);
pmc_select_cpu(cpu);
/* switch off non-stalled CPUs */
if (pm->pm_state == PMC_STATE_RUNNING &&
pm->pm_stalled == 0) {
phw = pmc_pcpu[cpu]->pc_hwpmcs[ri];
KASSERT(phw->phw_pmc == pm,
("[pmc, %d] pmc ptr ri(%d) hw(%p) pm(%p)",
__LINE__, ri, phw->phw_pmc, pm));
PMCDBG(PMC,REL,2, "stopping cpu=%d ri=%d", cpu, ri);
critical_enter();
pcd->pcd_stop_pmc(cpu, adjri);
critical_exit();
}
PMCDBG(PMC,REL,2, "decfg cpu=%d ri=%d", cpu, ri);
critical_enter();
pcd->pcd_config_pmc(cpu, adjri, NULL);
critical_exit();
/* adjust the global and process count of SS mode PMCs */
if (mode == PMC_MODE_SS && pm->pm_state == PMC_STATE_RUNNING) {
po = pm->pm_owner;
po->po_sscount--;
if (po->po_sscount == 0) {
atomic_subtract_rel_int(&pmc_ss_count, 1);
LIST_REMOVE(po, po_ssnext);
}
}
pm->pm_state = PMC_STATE_DELETED;
pmc_restore_cpu_binding(&pb);
/*
* We could have references to this PMC structure in
* the per-cpu sample queues. Wait for the queue to
* drain.
*/
pmc_wait_for_pmc_idle(pm);
} else if (PMC_IS_VIRTUAL_MODE(mode)) {
/*
* A virtual PMC could be running on multiple CPUs at
* a given instant.
*
* By marking its state as DELETED, we ensure that
* this PMC is never further scheduled on hardware.
*
* Then we wait till all CPUs are done with this PMC.
*/
pm->pm_state = PMC_STATE_DELETED;
/* Wait for the PMC's runcount to come to zero. */
pmc_wait_for_pmc_idle(pm);
/*
* At this point the PMC is off all CPUs and cannot be
* freshly scheduled onto a CPU. It is now safe to
* unlink all targets from this PMC. If a
* process-record's refcount falls to zero, we remove
* it from the hash table. The module-wide SX lock
* protects us from races.
*/
LIST_FOREACH_SAFE(ptgt, &pm->pm_targets, pt_next, tmp) {
pp = ptgt->pt_process;
pmc_unlink_target_process(pm, pp); /* frees 'ptgt' */
PMCDBG(PMC,REL,3, "pp->refcnt=%d", pp->pp_refcnt);
/*
* If the target process record shows that no
* PMCs are attached to it, reclaim its space.
*/
if (pp->pp_refcnt == 0) {
pmc_remove_process_descriptor(pp);
free(pp, M_PMC);
}
}
cpu = curthread->td_oncpu; /* setup cpu for pmd_release() */
}
/*
* Release any MD resources
*/
(void) pcd->pcd_release_pmc(cpu, adjri, pm);
/*
* Update row disposition
*/
if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm)))
PMC_UNMARK_ROW_STANDALONE(ri);
else
PMC_UNMARK_ROW_THREAD(ri);
/* unlink from the owner's list */
if (pm->pm_owner) {
LIST_REMOVE(pm, pm_next);
pm->pm_owner = NULL;
}
pmc_destroy_pmc_descriptor(pm);
}
/*
* Register an owner and a pmc.
*/
static int
pmc_register_owner(struct proc *p, struct pmc *pmc)
{
struct pmc_owner *po;
sx_assert(&pmc_sx, SX_XLOCKED);
if ((po = pmc_find_owner_descriptor(p)) == NULL)
if ((po = pmc_allocate_owner_descriptor(p)) == NULL)
return ENOMEM;
KASSERT(pmc->pm_owner == NULL,
("[pmc,%d] attempting to own an initialized PMC", __LINE__));
pmc->pm_owner = po;
LIST_INSERT_HEAD(&po->po_pmcs, pmc, pm_next);
PROC_LOCK(p);
p->p_flag |= P_HWPMC;
PROC_UNLOCK(p);
if (po->po_flags & PMC_PO_OWNS_LOGFILE)
pmclog_process_pmcallocate(pmc);
PMCDBG(PMC,REG,1, "register-owner pmc-owner=%p pmc=%p",
po, pmc);
return 0;
}
/*
* Return the current row disposition:
* == 0 => FREE
* > 0 => PROCESS MODE
* < 0 => SYSTEM MODE
*/
int
pmc_getrowdisp(int ri)
{
return pmc_pmcdisp[ri];
}
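/*
* A sketch of how a caller might interpret the returned value; the
* sign encodes the row's current use (illustrative only):
*
*	int disp = pmc_getrowdisp(ri);
*	if (disp == 0)
*		printf("row %d is free\n", ri);
*	else if (disp > 0)
*		printf("row %d is in process (thread) mode\n", ri);
*	else
*		printf("row %d is in system (standalone) mode\n", ri);
*/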
/*
* Check if a PMC at row index 'ri' can be allocated to the current
* process.
*
* Allocation can fail if:
* - the current process is already being profiled by a PMC at index 'ri',
* attached to it via OP_PMCATTACH.
* - the current process has already allocated a PMC at index 'ri'
* via OP_ALLOCATE.
*/
static int
pmc_can_allocate_rowindex(struct proc *p, unsigned int ri, int cpu)
{
enum pmc_mode mode;
struct pmc *pm;
struct pmc_owner *po;
struct pmc_process *pp;
PMCDBG(PMC,ALR,1, "can-allocate-rowindex proc=%p (%d, %s) ri=%d "
"cpu=%d", p, p->p_pid, p->p_comm, ri, cpu);
/*
* We shouldn't have already allocated a process-mode PMC at
* row index 'ri'.
*
* We shouldn't have allocated a system-wide PMC on the same
* CPU and same RI.
*/
if ((po = pmc_find_owner_descriptor(p)) != NULL)
LIST_FOREACH(pm, &po->po_pmcs, pm_next) {
if (PMC_TO_ROWINDEX(pm) == ri) {
mode = PMC_TO_MODE(pm);
if (PMC_IS_VIRTUAL_MODE(mode))
return EEXIST;
if (PMC_IS_SYSTEM_MODE(mode) &&
(int) PMC_TO_CPU(pm) == cpu)
return EEXIST;
}
}
/*
* We also shouldn't be the target of any PMC at this index
* since otherwise a PMC_ATTACH to ourselves will fail.
*/
if ((pp = pmc_find_process_descriptor(p, 0)) != NULL)
if (pp->pp_pmcs[ri].pp_pmc)
return EEXIST;
PMCDBG(PMC,ALR,2, "can-allocate-rowindex proc=%p (%d, %s) ri=%d ok",
p, p->p_pid, p->p_comm, ri);
return 0;
}
/*
* Check if a given PMC at row index 'ri' can be currently used in
* mode 'mode'.
*/
static int
pmc_can_allocate_row(int ri, enum pmc_mode mode)
{
enum pmc_disp disp;
sx_assert(&pmc_sx, SX_XLOCKED);
PMCDBG(PMC,ALR,1, "can-allocate-row ri=%d mode=%d", ri, mode);
if (PMC_IS_SYSTEM_MODE(mode))
disp = PMC_DISP_STANDALONE;
else
disp = PMC_DISP_THREAD;
/*
* check disposition for PMC row 'ri':
*
* Expected disposition Row-disposition Result
*
* STANDALONE STANDALONE or FREE proceed
* STANDALONE THREAD fail
* THREAD THREAD or FREE proceed
* THREAD STANDALONE fail
*/
if (!PMC_ROW_DISP_IS_FREE(ri) &&
!(disp == PMC_DISP_THREAD && PMC_ROW_DISP_IS_THREAD(ri)) &&
!(disp == PMC_DISP_STANDALONE && PMC_ROW_DISP_IS_STANDALONE(ri)))
return EBUSY;
/*
* All OK
*/
PMCDBG(PMC,ALR,2, "can-allocate-row ri=%d mode=%d ok", ri, mode);
return 0;
}
/*
* Find a PMC descriptor with user handle 'pmcid' belonging to owner 'po'.
*/
static struct pmc *
pmc_find_pmc_descriptor_in_process(struct pmc_owner *po, pmc_id_t pmcid)
{
struct pmc *pm;
KASSERT(PMC_ID_TO_ROWINDEX(pmcid) < md->pmd_npmc,
("[pmc,%d] Illegal pmc index %d (max %d)", __LINE__,
PMC_ID_TO_ROWINDEX(pmcid), md->pmd_npmc));
LIST_FOREACH(pm, &po->po_pmcs, pm_next)
if (pm->pm_id == pmcid)
return pm;
return NULL;
}
static int
pmc_find_pmc(pmc_id_t pmcid, struct pmc **pmc)
{
struct pmc *pm;
struct pmc_owner *po;
PMCDBG(PMC,FND,1, "find-pmc id=%d", pmcid);
if ((po = pmc_find_owner_descriptor(curthread->td_proc)) == NULL)
return ESRCH;
if ((pm = pmc_find_pmc_descriptor_in_process(po, pmcid)) == NULL)
return EINVAL;
PMCDBG(PMC,FND,2, "find-pmc id=%d -> pmc=%p", pmcid, pm);
*pmc = pm;
return 0;
}
/*
* Start a PMC.
*/
static int
pmc_start(struct pmc *pm)
{
enum pmc_mode mode;
struct pmc_owner *po;
struct pmc_binding pb;
struct pmc_classdep *pcd;
int adjri, error, cpu, ri;
KASSERT(pm != NULL,
("[pmc,%d] null pm", __LINE__));
mode = PMC_TO_MODE(pm);
ri = PMC_TO_ROWINDEX(pm);
pcd = pmc_ri_to_classdep(md, ri, &adjri);
error = 0;
PMCDBG(PMC,OPS,1, "start pmc=%p mode=%d ri=%d", pm, mode, ri);
po = pm->pm_owner;
/*
* Disallow PMCSTART if a logfile is required but has not been
* configured yet.
*/
if ((pm->pm_flags & PMC_F_NEEDS_LOGFILE) &&
(po->po_flags & PMC_PO_OWNS_LOGFILE) == 0)
return (EDOOFUS); /* programming error */
/*
* If this is a sampling mode PMC, log mapping information for
* the kernel modules that are currently loaded.
*/
if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)))
pmc_log_kernel_mappings(pm);
if (PMC_IS_VIRTUAL_MODE(mode)) {
/*
* If a PMCATTACH has never been done on this PMC,
* attach it to its owner process.
*/
if (LIST_EMPTY(&pm->pm_targets))
error = (pm->pm_flags & PMC_F_ATTACH_DONE) ? ESRCH :
pmc_attach_process(po->po_owner, pm);
/*
* If the PMC is attached to its owner, then force a context
* switch to ensure that the MD state gets set correctly.
*/
if (error == 0) {
pm->pm_state = PMC_STATE_RUNNING;
if (pm->pm_flags & PMC_F_ATTACHED_TO_OWNER)
pmc_force_context_switch();
}
return (error);
}
/*
* A system-wide PMC.
*
* Add the owner to the global list if this is a system-wide
* sampling PMC.
*/
if (mode == PMC_MODE_SS) {
if (po->po_sscount == 0) {
LIST_INSERT_HEAD(&pmc_ss_owners, po, po_ssnext);
atomic_add_rel_int(&pmc_ss_count, 1);
PMCDBG(PMC,OPS,1, "po=%p in global list", po);
}
po->po_sscount++;
/*
* Log mapping information for all existing processes in the
* system. Subsequent mappings are logged as they happen;
* see pmc_process_mmap().
*/
if (po->po_logprocmaps == 0) {
pmc_log_all_process_mappings(po);
po->po_logprocmaps = 1;
}
}
/*
* Move to the CPU associated with this
* PMC, and start the hardware.
*/
pmc_save_cpu_binding(&pb);
cpu = PMC_TO_CPU(pm);
if (!pmc_cpu_is_active(cpu))
return (ENXIO);
pmc_select_cpu(cpu);
/*
* global PMCs are configured at allocation time
* so write out the initial value and start the PMC.
*/
pm->pm_state = PMC_STATE_RUNNING;
critical_enter();
if ((error = pcd->pcd_write_pmc(cpu, adjri,
PMC_IS_SAMPLING_MODE(mode) ?
pm->pm_sc.pm_reloadcount :
pm->pm_sc.pm_initial)) == 0)
error = pcd->pcd_start_pmc(cpu, adjri);
critical_exit();
pmc_restore_cpu_binding(&pb);
return (error);
}
/*
* Stop a PMC.
*/
static int
pmc_stop(struct pmc *pm)
{
struct pmc_owner *po;
struct pmc_binding pb;
struct pmc_classdep *pcd;
int adjri, cpu, error, ri;
KASSERT(pm != NULL, ("[pmc,%d] null pmc", __LINE__));
PMCDBG(PMC,OPS,1, "stop pmc=%p mode=%d ri=%d", pm,
PMC_TO_MODE(pm), PMC_TO_ROWINDEX(pm));
pm->pm_state = PMC_STATE_STOPPED;
/*
* If the PMC is a virtual mode one, changing the state to
* non-RUNNING is enough to ensure that the PMC never gets
* scheduled.
*
* If this PMC is currently running on a CPU, then it will be
* handled correctly at the time its target process is context
* switched out.
*/
if (PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)))
return 0;
/*
* A system-mode PMC. Move to the CPU associated with
* this PMC, and stop the hardware. We update the
* 'initial count' so that a subsequent PMCSTART will
* resume counting from the current hardware count.
*/
pmc_save_cpu_binding(&pb);
cpu = PMC_TO_CPU(pm);
KASSERT(cpu >= 0 && cpu < pmc_cpu_max(),
("[pmc,%d] illegal cpu=%d", __LINE__, cpu));
if (!pmc_cpu_is_active(cpu))
return ENXIO;
pmc_select_cpu(cpu);
ri = PMC_TO_ROWINDEX(pm);
pcd = pmc_ri_to_classdep(md, ri, &adjri);
critical_enter();
if ((error = pcd->pcd_stop_pmc(cpu, adjri)) == 0)
error = pcd->pcd_read_pmc(cpu, adjri, &pm->pm_sc.pm_initial);
critical_exit();
pmc_restore_cpu_binding(&pb);
po = pm->pm_owner;
/* remove this owner from the global list of SS PMC owners */
if (PMC_TO_MODE(pm) == PMC_MODE_SS) {
po->po_sscount--;
if (po->po_sscount == 0) {
atomic_subtract_rel_int(&pmc_ss_count, 1);
LIST_REMOVE(po, po_ssnext);
PMCDBG(PMC,OPS,2,"po=%p removed from global list", po);
}
}
return (error);
}
#ifdef DEBUG
static const char *pmc_op_to_name[] = {
#undef __PMC_OP
#define __PMC_OP(N, D) #N ,
__PMC_OPS()
NULL
};
#endif
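/*
* The table above uses the "X-macro" idiom: the pmc headers define
* __PMC_OPS() as a list of __PMC_OP(NAME, DESCRIPTION) entries, so
* redefining __PMC_OP() before expanding the list stamps out one
* string per operation.  A generic sketch of the idiom (hypothetical
* names, illustration only):
*
*	#define	COLORS()	COLOR(RED) COLOR(GREEN) COLOR(BLUE)
*	#define	COLOR(N)	#N ,
*	static const char *color_names[] = { COLORS() };
*
* which expands to { "RED" , "GREEN" , "BLUE" , }.
*/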
/*
* The syscall interface
*/
#define PMC_GET_SX_XLOCK(...) do { \
sx_xlock(&pmc_sx); \
if (pmc_hook == NULL) { \
sx_xunlock(&pmc_sx); \
return __VA_ARGS__; \
} \
} while (0)
#define PMC_DOWNGRADE_SX() do { \
sx_downgrade(&pmc_sx); \
is_sx_downgraded = 1; \
} while (0)
static int
pmc_syscall_handler(struct thread *td, void *syscall_args)
{
int error, is_sx_downgraded, is_sx_locked, op;
struct pmc_syscall_args *c;
void *arg;
PMC_GET_SX_XLOCK(ENOSYS);
DROP_GIANT();
is_sx_downgraded = 0;
is_sx_locked = 1;
c = (struct pmc_syscall_args *) syscall_args;
op = c->pmop_code;
arg = c->pmop_data;
PMCDBG(MOD,PMS,1, "syscall op=%d \"%s\" arg=%p", op,
pmc_op_to_name[op], arg);
error = 0;
atomic_add_int(&pmc_stats.pm_syscalls, 1);
switch(op)
{
/*
* Configure a log file.
*
* XXX This OP will be reworked.
*/
case PMC_OP_CONFIGURELOG:
{
struct proc *p;
struct pmc *pm;
struct pmc_owner *po;
struct pmc_op_configurelog cl;
sx_assert(&pmc_sx, SX_XLOCKED);
if ((error = copyin(arg, &cl, sizeof(cl))) != 0)
break;
/* mark this process as owning a log file */
p = td->td_proc;
if ((po = pmc_find_owner_descriptor(p)) == NULL)
if ((po = pmc_allocate_owner_descriptor(p)) == NULL) {
error = ENOMEM;
break;
}
/*
* If a valid fd was passed in, try to configure it.
* Otherwise, if 'fd' was less than zero and a log file
* was already configured, flush its buffers and
* de-configure it.
*/
if (cl.pm_logfd >= 0) {
sx_xunlock(&pmc_sx);
is_sx_locked = 0;
error = pmclog_configure_log(md, po, cl.pm_logfd);
} else if (po->po_flags & PMC_PO_OWNS_LOGFILE) {
pmclog_process_closelog(po);
error = pmclog_flush(po);
if (error == 0) {
LIST_FOREACH(pm, &po->po_pmcs, pm_next)
if (pm->pm_flags & PMC_F_NEEDS_LOGFILE &&
pm->pm_state == PMC_STATE_RUNNING)
pmc_stop(pm);
error = pmclog_deconfigure_log(po);
}
} else
error = EINVAL;
if (error)
break;
}
break;
/*
* Flush a log file.
*/
case PMC_OP_FLUSHLOG:
{
struct pmc_owner *po;
sx_assert(&pmc_sx, SX_XLOCKED);
if ((po = pmc_find_owner_descriptor(td->td_proc)) == NULL) {
error = EINVAL;
break;
}
error = pmclog_flush(po);
}
break;
/*
* Retrieve hardware configuration.
*/
case PMC_OP_GETCPUINFO: /* CPU information */
{
struct pmc_op_getcpuinfo gci;
struct pmc_classinfo *pci;
struct pmc_classdep *pcd;
int cl;
gci.pm_cputype = md->pmd_cputype;
gci.pm_ncpu = pmc_cpu_max();
gci.pm_npmc = md->pmd_npmc;
gci.pm_nclass = md->pmd_nclass;
pci = gci.pm_classes;
pcd = md->pmd_classdep;
for (cl = 0; cl < md->pmd_nclass; cl++, pci++, pcd++) {
pci->pm_caps = pcd->pcd_caps;
pci->pm_class = pcd->pcd_class;
pci->pm_width = pcd->pcd_width;
pci->pm_num = pcd->pcd_num;
}
error = copyout(&gci, arg, sizeof(gci));
}
break;
/*
* Get module statistics
*/
case PMC_OP_GETDRIVERSTATS:
{
struct pmc_op_getdriverstats gms;
bcopy(&pmc_stats, &gms, sizeof(gms));
error = copyout(&gms, arg, sizeof(gms));
}
break;
/*
* Retrieve module version number
*/
case PMC_OP_GETMODULEVERSION:
{
uint32_t cv, modv;
/* retrieve the client's idea of the ABI version */
if ((error = copyin(arg, &cv, sizeof(uint32_t))) != 0)
break;
/* don't service clients newer than our driver */
modv = PMC_VERSION;
if ((cv & 0xFFFF0000) > (modv & 0xFFFF0000)) {
error = EPROGMISMATCH;
break;
}
error = copyout(&modv, arg, sizeof(int));
}
break;
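/*
* Only the major number (the upper 16 bits) is compared above: a
* client whose compiled-in major version is newer than the module's
* gets EPROGMISMATCH, while an older or equal major version is
* serviced and handed back the module's full version word.  For
* example (values illustrative only), cv = 0x04000000 against
* modv = 0x03000000 fails, whereas cv = 0x03000005 succeeds.
*/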
/*
* Retrieve the state of all the PMCs on a given
* CPU.
*/
case PMC_OP_GETPMCINFO:
{
int ari;
struct pmc *pm;
size_t pmcinfo_size;
uint32_t cpu, n, npmc;
struct pmc_owner *po;
struct pmc_binding pb;
struct pmc_classdep *pcd;
struct pmc_info *p, *pmcinfo;
struct pmc_op_getpmcinfo *gpi;
PMC_DOWNGRADE_SX();
gpi = (struct pmc_op_getpmcinfo *) arg;
if ((error = copyin(&gpi->pm_cpu, &cpu, sizeof(cpu))) != 0)
break;
if (cpu >= pmc_cpu_max()) {
error = EINVAL;
break;
}
if (!pmc_cpu_is_active(cpu)) {
error = ENXIO;
break;
}
/* switch to CPU 'cpu' */
pmc_save_cpu_binding(&pb);
pmc_select_cpu(cpu);
npmc = md->pmd_npmc;
pmcinfo_size = npmc * sizeof(struct pmc_info);
pmcinfo = malloc(pmcinfo_size, M_PMC, M_WAITOK);
p = pmcinfo;
for (n = 0; n < md->pmd_npmc; n++, p++) {
pcd = pmc_ri_to_classdep(md, n, &ari);
KASSERT(pcd != NULL,
("[pmc,%d] null pcd ri=%d", __LINE__, n));
if ((error = pcd->pcd_describe(cpu, ari, p, &pm)) != 0)
break;
if (PMC_ROW_DISP_IS_STANDALONE(n))
p->pm_rowdisp = PMC_DISP_STANDALONE;
else if (PMC_ROW_DISP_IS_THREAD(n))
p->pm_rowdisp = PMC_DISP_THREAD;
else
p->pm_rowdisp = PMC_DISP_FREE;
p->pm_ownerpid = -1;
if (pm == NULL) /* no PMC associated */
continue;
po = pm->pm_owner;
KASSERT(po->po_owner != NULL,
("[pmc,%d] pmc_owner had a null proc pointer",
__LINE__));
p->pm_ownerpid = po->po_owner->p_pid;
p->pm_mode = PMC_TO_MODE(pm);
p->pm_event = pm->pm_event;
p->pm_flags = pm->pm_flags;
if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)))
p->pm_reloadcount =
pm->pm_sc.pm_reloadcount;
}
pmc_restore_cpu_binding(&pb);
/* now copy out the PMC info collected */
if (error == 0)
error = copyout(pmcinfo, &gpi->pm_pmcs, pmcinfo_size);
free(pmcinfo, M_PMC);
}
break;
/*
* Set the administrative state of a PMC, i.e., whether
* the PMC is to be used or not.
*/
case PMC_OP_PMCADMIN:
{
int cpu, ri;
enum pmc_state request;
struct pmc_cpu *pc;
struct pmc_hw *phw;
struct pmc_op_pmcadmin pma;
struct pmc_binding pb;
sx_assert(&pmc_sx, SX_XLOCKED);
KASSERT(td == curthread,
("[pmc,%d] td != curthread", __LINE__));
error = priv_check(td, PRIV_PMC_MANAGE);
if (error)
break;
if ((error = copyin(arg, &pma, sizeof(pma))) != 0)
break;
cpu = pma.pm_cpu;
if (cpu < 0 || cpu >= (int) pmc_cpu_max()) {
error = EINVAL;
break;
}
if (!pmc_cpu_is_active(cpu)) {
error = ENXIO;
break;
}
request = pma.pm_state;
if (request != PMC_STATE_DISABLED &&
request != PMC_STATE_FREE) {
error = EINVAL;
break;
}
ri = pma.pm_pmc; /* pmc id == row index */
if (ri < 0 || ri >= (int) md->pmd_npmc) {
error = EINVAL;
break;
}
/*
* We can't disable a PMC with a row-index allocated
* for process virtual PMCs.
*/
if (PMC_ROW_DISP_IS_THREAD(ri) &&
request == PMC_STATE_DISABLED) {
error = EBUSY;
break;
}
/*
* otherwise, this PMC on this CPU is either free or
* in system-wide mode.
*/
pmc_save_cpu_binding(&pb);
pmc_select_cpu(cpu);
pc = pmc_pcpu[cpu];
phw = pc->pc_hwpmcs[ri];
/*
* XXX do we need some kind of 'forced' disable?
*/
if (phw->phw_pmc == NULL) {
if (request == PMC_STATE_DISABLED &&
(phw->phw_state & PMC_PHW_FLAG_IS_ENABLED)) {
phw->phw_state &= ~PMC_PHW_FLAG_IS_ENABLED;
PMC_MARK_ROW_STANDALONE(ri);
} else if (request == PMC_STATE_FREE &&
(phw->phw_state & PMC_PHW_FLAG_IS_ENABLED) == 0) {
phw->phw_state |= PMC_PHW_FLAG_IS_ENABLED;
PMC_UNMARK_ROW_STANDALONE(ri);
}
/* other cases are a no-op */
} else
error = EBUSY;
pmc_restore_cpu_binding(&pb);
}
break;
/*
* Allocate a PMC.
*/
case PMC_OP_PMCALLOCATE:
{
int adjri, n;
u_int cpu;
uint32_t caps;
struct pmc *pmc;
enum pmc_mode mode;
struct pmc_hw *phw;
struct pmc_binding pb;
struct pmc_classdep *pcd;
struct pmc_op_pmcallocate pa;
if ((error = copyin(arg, &pa, sizeof(pa))) != 0)
break;
caps = pa.pm_caps;
mode = pa.pm_mode;
cpu = pa.pm_cpu;
if ((mode != PMC_MODE_SS && mode != PMC_MODE_SC &&
mode != PMC_MODE_TS && mode != PMC_MODE_TC) ||
(cpu != (u_int) PMC_CPU_ANY && cpu >= pmc_cpu_max())) {
error = EINVAL;
break;
}
/*
* Virtual PMCs should only ask for a default CPU.
* System mode PMCs need to specify a non-default CPU.
*/
if ((PMC_IS_VIRTUAL_MODE(mode) && cpu != (u_int) PMC_CPU_ANY) ||
(PMC_IS_SYSTEM_MODE(mode) && cpu == (u_int) PMC_CPU_ANY)) {
error = EINVAL;
break;
}
/*
* Check that an inactive CPU is not being asked for.
*/
if (PMC_IS_SYSTEM_MODE(mode) && !pmc_cpu_is_active(cpu)) {
error = ENXIO;
break;
}
/*
* Refuse an allocation for a system-wide PMC if this
* process has been jailed, or if this process lacks
* super-user credentials and the sysctl tunable
* 'security.bsd.unprivileged_syspmcs' is zero.
*/
if (PMC_IS_SYSTEM_MODE(mode)) {
if (jailed(curthread->td_ucred)) {
error = EPERM;
break;
}
if (!pmc_unprivileged_syspmcs) {
error = priv_check(curthread,
PRIV_PMC_SYSTEM);
if (error)
break;
}
}
/*
* Look for valid values for 'pm_flags'
*/
if ((pa.pm_flags & ~(PMC_F_DESCENDANTS | PMC_F_LOG_PROCCSW |
PMC_F_LOG_PROCEXIT | PMC_F_CALLCHAIN)) != 0) {
error = EINVAL;
break;
}
/* process logging options are not allowed for system PMCs */
if (PMC_IS_SYSTEM_MODE(mode) && (pa.pm_flags &
(PMC_F_LOG_PROCCSW | PMC_F_LOG_PROCEXIT))) {
error = EINVAL;
break;
}
/*
* All sampling mode PMCs need to be able to interrupt the
* CPU.
*/
if (PMC_IS_SAMPLING_MODE(mode))
caps |= PMC_CAP_INTERRUPT;
/* A valid class specifier should have been passed in. */
for (n = 0; n < md->pmd_nclass; n++)
if (md->pmd_classdep[n].pcd_class == pa.pm_class)
break;
if (n == md->pmd_nclass) {
error = EINVAL;
break;
}
/* The requested PMC capabilities should be feasible. */
if ((md->pmd_classdep[n].pcd_caps & caps) != caps) {
error = EOPNOTSUPP;
break;
}
PMCDBG(PMC,ALL,2, "event=%d caps=0x%x mode=%d cpu=%d",
pa.pm_ev, caps, mode, cpu);
pmc = pmc_allocate_pmc_descriptor();
pmc->pm_id = PMC_ID_MAKE_ID(cpu,pa.pm_mode,pa.pm_class,
PMC_ID_INVALID);
pmc->pm_event = pa.pm_ev;
pmc->pm_state = PMC_STATE_FREE;
pmc->pm_caps = caps;
pmc->pm_flags = pa.pm_flags;
/* switch thread to CPU 'cpu' */
pmc_save_cpu_binding(&pb);
#define PMC_IS_SHAREABLE_PMC(cpu, n) \
(pmc_pcpu[(cpu)]->pc_hwpmcs[(n)]->phw_state & \
PMC_PHW_FLAG_IS_SHAREABLE)
#define PMC_IS_UNALLOCATED(cpu, n) \
(pmc_pcpu[(cpu)]->pc_hwpmcs[(n)]->phw_pmc == NULL)
if (PMC_IS_SYSTEM_MODE(mode)) {
pmc_select_cpu(cpu);
for (n = 0; n < (int) md->pmd_npmc; n++) {
pcd = pmc_ri_to_classdep(md, n, &adjri);
if (pmc_can_allocate_row(n, mode) == 0 &&
pmc_can_allocate_rowindex(
curthread->td_proc, n, cpu) == 0 &&
(PMC_IS_UNALLOCATED(cpu, n) ||
PMC_IS_SHAREABLE_PMC(cpu, n)) &&
pcd->pcd_allocate_pmc(cpu, adjri, pmc,
&pa) == 0)
break;
}
} else {
/* Process virtual mode */
for (n = 0; n < (int) md->pmd_npmc; n++) {
pcd = pmc_ri_to_classdep(md, n, &adjri);
if (pmc_can_allocate_row(n, mode) == 0 &&
pmc_can_allocate_rowindex(
curthread->td_proc, n,
PMC_CPU_ANY) == 0 &&
pcd->pcd_allocate_pmc(curthread->td_oncpu,
adjri, pmc, &pa) == 0)
break;
}
}
#undef PMC_IS_UNALLOCATED
#undef PMC_IS_SHAREABLE_PMC
pmc_restore_cpu_binding(&pb);
if (n == (int) md->pmd_npmc) {
pmc_destroy_pmc_descriptor(pmc);
free(pmc, M_PMC);
pmc = NULL;
error = EINVAL;
break;
}
/* Fill in the correct value in the ID field */
pmc->pm_id = PMC_ID_MAKE_ID(cpu,mode,pa.pm_class,n);
PMCDBG(PMC,ALL,2, "ev=%d class=%d mode=%d n=%d -> pmcid=%x",
pmc->pm_event, pa.pm_class, mode, n, pmc->pm_id);
/* Process mode PMCs with logging enabled need log files */
if (pmc->pm_flags & (PMC_F_LOG_PROCEXIT | PMC_F_LOG_PROCCSW))
pmc->pm_flags |= PMC_F_NEEDS_LOGFILE;
/* All system mode sampling PMCs require a log file */
if (PMC_IS_SAMPLING_MODE(mode) && PMC_IS_SYSTEM_MODE(mode))
pmc->pm_flags |= PMC_F_NEEDS_LOGFILE;
/*
* Configure global PMCs immediately
*/
if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pmc))) {
pmc_save_cpu_binding(&pb);
pmc_select_cpu(cpu);
phw = pmc_pcpu[cpu]->pc_hwpmcs[n];
pcd = pmc_ri_to_classdep(md, n, &adjri);
if ((phw->phw_state & PMC_PHW_FLAG_IS_ENABLED) == 0 ||
(error = pcd->pcd_config_pmc(cpu, adjri, pmc)) != 0) {
(void) pcd->pcd_release_pmc(cpu, adjri, pmc);
pmc_destroy_pmc_descriptor(pmc);
free(pmc, M_PMC);
pmc = NULL;
pmc_restore_cpu_binding(&pb);
error = EPERM;
break;
}
pmc_restore_cpu_binding(&pb);
}
pmc->pm_state = PMC_STATE_ALLOCATED;
/*
* mark row disposition
*/
if (PMC_IS_SYSTEM_MODE(mode))
PMC_MARK_ROW_STANDALONE(n);
else
PMC_MARK_ROW_THREAD(n);
/*
* Register this PMC with the current thread as its owner.
*/
if ((error =
pmc_register_owner(curthread->td_proc, pmc)) != 0) {
pmc_release_pmc_descriptor(pmc);
free(pmc, M_PMC);
pmc = NULL;
break;
}
/*
* Return the allocated index.
*/
pa.pm_pmcid = pmc->pm_id;
error = copyout(&pa, arg, sizeof(pa));
}
break;
/*
* Attach a PMC to a process.
*/
case PMC_OP_PMCATTACH:
{
struct pmc *pm;
struct proc *p;
struct pmc_op_pmcattach a;
sx_assert(&pmc_sx, SX_XLOCKED);
if ((error = copyin(arg, &a, sizeof(a))) != 0)
break;
if (a.pm_pid < 0) {
error = EINVAL;
break;
} else if (a.pm_pid == 0)
a.pm_pid = td->td_proc->p_pid;
if ((error = pmc_find_pmc(a.pm_pmc, &pm)) != 0)
break;
if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) {
error = EINVAL;
break;
}
/* PMCs may be (re)attached only when allocated or stopped */
if (pm->pm_state == PMC_STATE_RUNNING) {
error = EBUSY;
break;
} else if (pm->pm_state != PMC_STATE_ALLOCATED &&
pm->pm_state != PMC_STATE_STOPPED) {
error = EINVAL;
break;
}
/* lookup pid */
if ((p = pfind(a.pm_pid)) == NULL) {
error = ESRCH;
break;
}
/*
* Ignore processes that are in the process of exiting.
*/
if (p->p_flag & P_WEXIT) {
error = ESRCH;
PROC_UNLOCK(p); /* pfind() returns a locked process */
break;
}
/*
* we are allowed to attach a PMC to a process if
* we can debug it.
*/
error = p_candebug(curthread, p);
PROC_UNLOCK(p);
if (error == 0)
error = pmc_attach_process(p, pm);
}
break;
/*
* Detach an attached PMC from a process.
*/
case PMC_OP_PMCDETACH:
{
struct pmc *pm;
struct proc *p;
struct pmc_op_pmcattach a;
if ((error = copyin(arg, &a, sizeof(a))) != 0)
break;
if (a.pm_pid < 0) {
error = EINVAL;
break;
} else if (a.pm_pid == 0)
a.pm_pid = td->td_proc->p_pid;
if ((error = pmc_find_pmc(a.pm_pmc, &pm)) != 0)
break;
if ((p = pfind(a.pm_pid)) == NULL) {
error = ESRCH;
break;
}
/*
* Treat processes that are in the process of exiting
* as if they were not present.
*/
if (p->p_flag & P_WEXIT)
error = ESRCH;
PROC_UNLOCK(p); /* pfind() returns a locked process */
if (error == 0)
error = pmc_detach_process(p, pm);
}
break;
/*
* Retrieve the MSR number associated with the counter
* 'pmc_id'. This allows processes to directly use RDPMC
* instructions to read their PMCs, without the overhead of a
* system call.
*/
case PMC_OP_PMCGETMSR:
{
int adjri, ri;
struct pmc *pm;
struct pmc_target *pt;
struct pmc_op_getmsr gm;
struct pmc_classdep *pcd;
PMC_DOWNGRADE_SX();
if ((error = copyin(arg, &gm, sizeof(gm))) != 0)
break;
if ((error = pmc_find_pmc(gm.pm_pmcid, &pm)) != 0)
break;
/*
* The allocated PMC has to be a process virtual PMC,
* i.e., of type MODE_T[CS]. Global PMCs can only be
* read using the PMCREAD operation since they may be
* allocated on a different CPU than the one we could
* be running on at the time of the RDPMC instruction.
*
* The GETMSR operation is not allowed for PMCs that
* are inherited across processes.
*/
if (!PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)) ||
(pm->pm_flags & PMC_F_DESCENDANTS)) {
error = EINVAL;
break;
}
/*
* It only makes sense to use a RDPMC (or its
* equivalent instruction on non-x86 architectures) on
* a process that has allocated and attached a PMC to
* itself. Conversely the PMC is only allowed to have
* one process attached to it -- its owner.
*/
if ((pt = LIST_FIRST(&pm->pm_targets)) == NULL ||
LIST_NEXT(pt, pt_next) != NULL ||
pt->pt_process->pp_proc != pm->pm_owner->po_owner) {
error = EINVAL;
break;
}
ri = PMC_TO_ROWINDEX(pm);
pcd = pmc_ri_to_classdep(md, ri, &adjri);
/* PMC class has no 'GETMSR' support */
if (pcd->pcd_get_msr == NULL) {
error = ENOSYS;
break;
}
if ((error = (*pcd->pcd_get_msr)(adjri, &gm.pm_msr)) < 0)
break;
if ((error = copyout(&gm, arg, sizeof(gm))) < 0)
break;
/*
* Mark our process as using MSRs. Update machine
* state using a forced context switch.
*/
pt->pt_process->pp_flags |= PMC_PP_ENABLE_MSR_ACCESS;
pmc_force_context_switch();
}
break;
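/*
* A hypothetical userland sketch of what GETMSR enables on x86
* (illustration only; the names below are not part of this module
* and applications would normally use libpmc(3) rather than issuing
* the instruction themselves):
*
*	uint32_t lo, hi;
*	uint64_t count;
*
*	(issue PMC_OP_PMCGETMSR to obtain gm.pm_msr, then)
*	__asm __volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (gm.pm_msr));
*	count = ((uint64_t)hi << 32) | lo;
*/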
/*
* Release an allocated PMC
*/
case PMC_OP_PMCRELEASE:
{
pmc_id_t pmcid;
struct pmc *pm;
struct pmc_owner *po;
struct pmc_op_simple sp;
/*
* Find PMC pointer for the named PMC.
*
* Use pmc_release_pmc_descriptor() to switch off the
* PMC, remove all its target threads, and remove the
* PMC from its owner's list.
*
* Remove the owner record if this is the last PMC
* owned.
*
* Free up space.
*/
if ((error = copyin(arg, &sp, sizeof(sp))) != 0)
break;
pmcid = sp.pm_pmcid;
if ((error = pmc_find_pmc(pmcid, &pm)) != 0)
break;
po = pm->pm_owner;
pmc_release_pmc_descriptor(pm);
pmc_maybe_remove_owner(po);
free(pm, M_PMC);
}
break;
/*
* Read and/or write a PMC.
*/
case PMC_OP_PMCRW:
{
int adjri;
struct pmc *pm;
uint32_t cpu, ri;
pmc_value_t oldvalue;
struct pmc_binding pb;
struct pmc_op_pmcrw prw;
struct pmc_classdep *pcd;
struct pmc_op_pmcrw *pprw;
PMC_DOWNGRADE_SX();
if ((error = copyin(arg, &prw, sizeof(prw))) != 0)
break;
ri = 0;
PMCDBG(PMC,OPS,1, "rw id=%d flags=0x%x", prw.pm_pmcid,
prw.pm_flags);
/* must have at least one flag set */
if ((prw.pm_flags & (PMC_F_OLDVALUE|PMC_F_NEWVALUE)) == 0) {
error = EINVAL;
break;
}
/* locate pmc descriptor */
if ((error = pmc_find_pmc(prw.pm_pmcid, &pm)) != 0)
break;
/* Can't read a PMC that hasn't been started. */
if (pm->pm_state != PMC_STATE_ALLOCATED &&
pm->pm_state != PMC_STATE_STOPPED &&
pm->pm_state != PMC_STATE_RUNNING) {
error = EINVAL;
break;
}
/* writing a new value is allowed only for 'STOPPED' pmcs */
if (pm->pm_state == PMC_STATE_RUNNING &&
(prw.pm_flags & PMC_F_NEWVALUE)) {
error = EBUSY;
break;
}
if (PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm))) {
/*
* If this PMC is attached to its owner (i.e.,
* the process requesting this operation) and
* is running, then attempt to get an
* up-to-date reading from hardware for a READ.
* Writes are only allowed when the PMC is
* stopped, so only update the saved value
* field.
*
* If the PMC is not running, or is not
* attached to its owner, read/write to the
* savedvalue field.
*/
ri = PMC_TO_ROWINDEX(pm);
pcd = pmc_ri_to_classdep(md, ri, &adjri);
mtx_pool_lock_spin(pmc_mtxpool, pm);
cpu = curthread->td_oncpu;
if (prw.pm_flags & PMC_F_OLDVALUE) {
if ((pm->pm_flags & PMC_F_ATTACHED_TO_OWNER) &&
(pm->pm_state == PMC_STATE_RUNNING))
error = (*pcd->pcd_read_pmc)(cpu, adjri,
&oldvalue);
else
oldvalue = pm->pm_gv.pm_savedvalue;
}
if (prw.pm_flags & PMC_F_NEWVALUE)
pm->pm_gv.pm_savedvalue = prw.pm_value;
mtx_pool_unlock_spin(pmc_mtxpool, pm);
} else { /* System mode PMCs */
cpu = PMC_TO_CPU(pm);
ri = PMC_TO_ROWINDEX(pm);
pcd = pmc_ri_to_classdep(md, ri, &adjri);
if (!pmc_cpu_is_active(cpu)) {
error = ENXIO;
break;
}
/* move this thread to CPU 'cpu' */
pmc_save_cpu_binding(&pb);
pmc_select_cpu(cpu);
critical_enter();
/* save old value */
if (prw.pm_flags & PMC_F_OLDVALUE)
if ((error = (*pcd->pcd_read_pmc)(cpu, adjri,
&oldvalue)))
goto error;
/* write out new value */
if (prw.pm_flags & PMC_F_NEWVALUE)
error = (*pcd->pcd_write_pmc)(cpu, adjri,
prw.pm_value);
error:
critical_exit();
pmc_restore_cpu_binding(&pb);
if (error)
break;
}
pprw = (struct pmc_op_pmcrw *) arg;
#ifdef DEBUG
if (prw.pm_flags & PMC_F_NEWVALUE)
PMCDBG(PMC,OPS,2, "rw id=%d new %jx -> old %jx",
ri, prw.pm_value, oldvalue);
else if (prw.pm_flags & PMC_F_OLDVALUE)
PMCDBG(PMC,OPS,2, "rw id=%d -> old %jx", ri, oldvalue);
#endif
/* return old value if requested */
if (prw.pm_flags & PMC_F_OLDVALUE)
if ((error = copyout(&oldvalue, &pprw->pm_value,
sizeof(prw.pm_value))))
break;
}
break;
/*
* Set the sampling rate for a sampling mode PMC and the
* initial count for a counting mode PMC.
*/
case PMC_OP_PMCSETCOUNT:
{
struct pmc *pm;
struct pmc_op_pmcsetcount sc;
PMC_DOWNGRADE_SX();
if ((error = copyin(arg, &sc, sizeof(sc))) != 0)
break;
if ((error = pmc_find_pmc(sc.pm_pmcid, &pm)) != 0)
break;
if (pm->pm_state == PMC_STATE_RUNNING) {
error = EBUSY;
break;
}
if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)))
pm->pm_sc.pm_reloadcount = sc.pm_count;
else
pm->pm_sc.pm_initial = sc.pm_count;
}
break;
/*
* Start a PMC.
*/
case PMC_OP_PMCSTART:
{
pmc_id_t pmcid;
struct pmc *pm;
struct pmc_op_simple sp;
sx_assert(&pmc_sx, SX_XLOCKED);
if ((error = copyin(arg, &sp, sizeof(sp))) != 0)
break;
pmcid = sp.pm_pmcid;
if ((error = pmc_find_pmc(pmcid, &pm)) != 0)
break;
KASSERT(pmcid == pm->pm_id,
("[pmc,%d] pmcid %x != id %x", __LINE__,
pm->pm_id, pmcid));
if (pm->pm_state == PMC_STATE_RUNNING) /* already running */
break;
else if (pm->pm_state != PMC_STATE_STOPPED &&
pm->pm_state != PMC_STATE_ALLOCATED) {
error = EINVAL;
break;
}
error = pmc_start(pm);
}
break;
/*
* Stop a PMC.
*/
case PMC_OP_PMCSTOP:
{
pmc_id_t pmcid;
struct pmc *pm;
struct pmc_op_simple sp;
PMC_DOWNGRADE_SX();
if ((error = copyin(arg, &sp, sizeof(sp))) != 0)
break;
pmcid = sp.pm_pmcid;
/*
* Mark the PMC as inactive and invoke the MD stop
* routines if needed.
*/
if ((error = pmc_find_pmc(pmcid, &pm)) != 0)
break;
KASSERT(pmcid == pm->pm_id,
("[pmc,%d] pmc id %x != pmcid %x", __LINE__,
pm->pm_id, pmcid));
if (pm->pm_state == PMC_STATE_STOPPED) /* already stopped */
break;
else if (pm->pm_state != PMC_STATE_RUNNING) {
error = EINVAL;
break;
}
error = pmc_stop(pm);
}
break;
/*
* Write a user supplied value to the log file.
*/
case PMC_OP_WRITELOG:
{
struct pmc_op_writelog wl;
struct pmc_owner *po;
PMC_DOWNGRADE_SX();
if ((error = copyin(arg, &wl, sizeof(wl))) != 0)
break;
if ((po = pmc_find_owner_descriptor(td->td_proc)) == NULL) {
error = EINVAL;
break;
}
if ((po->po_flags & PMC_PO_OWNS_LOGFILE) == 0) {
error = EINVAL;
break;
}
error = pmclog_process_userlog(po, &wl);
}
break;
default:
error = EINVAL;
break;
}
if (is_sx_locked != 0) {
if (is_sx_downgraded)
sx_sunlock(&pmc_sx);
else
sx_xunlock(&pmc_sx);
}
if (error)
atomic_add_int(&pmc_stats.pm_syscall_errors, 1);
PICKUP_GIANT();
return error;
}
/*
* Helper functions
*/
/*
* Mark the thread as needing callchain capture and post an AST. The
* actual callchain capture will be done in a context where it is safe
* to take page faults.
*/
static void
pmc_post_callchain_callback(void)
{
struct thread *td;
td = curthread;
/*
* If there are multiple PMCs for the same interrupt, ignore this new posting.
*/
if (td->td_pflags & TDP_CALLCHAIN)
return;
/*
* Mark this thread as needing callchain capture.
* `td->td_pflags' will be safe to touch because this thread
* was in user space when it was interrupted.
*/
td->td_pflags |= TDP_CALLCHAIN;
/*
* Don't let this thread migrate between CPUs until callchain
* capture completes.
*/
sched_pin();
return;
}
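/*
* The sched_pin() above is balanced by the sched_unpin() at the top
* of pmc_capture_user_callchain(), which runs from ast() on the same
* CPU once it is safe to take page faults.
*/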
/*
* Interrupt processing.
*
* Find a free slot in the per-cpu array of samples and capture the
* current callchain there. If a sample was successfully added, a bit
* is set in mask 'pmc_cpumask' denoting that the DO_SAMPLES hook
* needs to be invoked from the clock handler.
*
* This function is meant to be called from an NMI handler. It cannot
* use any of the locking primitives supplied by the OS.
*/
int
pmc_process_interrupt(int cpu, struct pmc *pm, struct trapframe *tf,
int inuserspace)
{
int error, callchaindepth;
struct thread *td;
struct pmc_sample *ps;
struct pmc_samplebuffer *psb;
error = 0;
/*
* Grab a free slot in this CPU's sample buffer.
*/
psb = pmc_pcpu[cpu]->pc_sb;
ps = psb->ps_write;
if (ps->ps_nsamples) { /* in use, reader hasn't caught up */
pm->pm_stalled = 1;
atomic_add_int(&pmc_stats.pm_intr_bufferfull, 1);
PMCDBG(SAM,INT,1,"(spc) cpu=%d pm=%p tf=%p um=%d wr=%d rd=%d",
cpu, pm, (void *) tf, inuserspace,
(int) (psb->ps_write - psb->ps_samples),
(int) (psb->ps_read - psb->ps_samples));
error = ENOMEM;
goto done;
}
/* Fill in entry. */
PMCDBG(SAM,INT,1,"cpu=%d pm=%p tf=%p um=%d wr=%d rd=%d", cpu, pm,
(void *) tf, inuserspace,
(int) (psb->ps_write - psb->ps_samples),
(int) (psb->ps_read - psb->ps_samples));
KASSERT(pm->pm_runcount >= 0,
("[pmc,%d] pm=%p runcount %d", __LINE__, (void *) pm,
pm->pm_runcount));
atomic_add_rel_int(&pm->pm_runcount, 1); /* hold onto PMC */
ps->ps_pmc = pm;
if ((td = curthread) && td->td_proc)
ps->ps_pid = td->td_proc->p_pid;
else
ps->ps_pid = -1;
ps->ps_cpu = cpu;
ps->ps_td = td;
ps->ps_flags = inuserspace ? PMC_CC_F_USERSPACE : 0;
callchaindepth = (pm->pm_flags & PMC_F_CALLCHAIN) ?
pmc_callchaindepth : 1;
if (callchaindepth == 1)
ps->ps_pc[0] = PMC_TRAPFRAME_TO_PC(tf);
else {
/*
* Kernel stack traversals can be done immediately,
* while we defer to an AST for user space traversals.
*/
if (!inuserspace)
callchaindepth =
pmc_save_kernel_callchain(ps->ps_pc,
callchaindepth, tf);
else {
pmc_post_callchain_callback();
callchaindepth = PMC_SAMPLE_INUSE;
}
}
ps->ps_nsamples = callchaindepth; /* mark entry as in use */
/* increment write pointer, modulo ring buffer size */
ps++;
if (ps == psb->ps_fence)
psb->ps_write = psb->ps_samples;
else
psb->ps_write = ps;
done:
/* mark CPU as needing processing */
CPU_SET_ATOMIC(cpu, &pmc_cpumask);
return (error);
}
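/*
* In effect the per-CPU sample buffer is a single-producer /
* single-consumer ring: 'ps_write' only advances here, 'ps_read'
* only advances in pmc_process_samples(), and both wrap at
* 'ps_fence'.  A slot is free when its 'ps_nsamples' is zero (the
* consumer resets it), so a non-zero value at the write pointer
* means the ring is full and the sample is dropped with the PMC
* marked stalled, as handled at the top of the function above.
*/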
/*
* Capture a user call chain. This function will be called from ast()
* before control returns to userland and before the process gets
* rescheduled.
*/
static void
pmc_capture_user_callchain(int cpu, struct trapframe *tf)
{
int i;
struct pmc *pm;
struct thread *td;
struct pmc_sample *ps;
struct pmc_samplebuffer *psb;
#ifdef INVARIANTS
int ncallchains;
#endif
sched_unpin(); /* Can migrate safely now. */
psb = pmc_pcpu[cpu]->pc_sb;
td = curthread;
KASSERT(td->td_pflags & TDP_CALLCHAIN,
("[pmc,%d] Retrieving callchain for thread that doesn't want it",
__LINE__));
#ifdef INVARIANTS
ncallchains = 0;
#endif
/*
* Iterate through all deferred callchain requests.
*/
ps = psb->ps_samples;
for (i = 0; i < pmc_nsamples; i++, ps++) {
if (ps->ps_nsamples != PMC_SAMPLE_INUSE)
continue;
if (ps->ps_td != td)
continue;
KASSERT(ps->ps_cpu == cpu,
("[pmc,%d] cpu mismatch ps_cpu=%d pcpu=%d", __LINE__,
ps->ps_cpu, PCPU_GET(cpuid)));
pm = ps->ps_pmc;
KASSERT(pm->pm_flags & PMC_F_CALLCHAIN,
("[pmc,%d] Retrieving callchain for PMC that doesn't "
"want it", __LINE__));
KASSERT(pm->pm_runcount > 0,
("[pmc,%d] runcount %d", __LINE__, pm->pm_runcount));
/*
* Retrieve the callchain and mark the sample buffer
* as 'processable' by the timer tick sweep code.
*/
ps->ps_nsamples = pmc_save_user_callchain(ps->ps_pc,
pmc_callchaindepth, tf);
#ifdef INVARIANTS
ncallchains++;
#endif
}
KASSERT(ncallchains > 0,
("[pmc,%d] cpu %d didn't find a sample to collect", __LINE__,
cpu));
return;
}
/*
* Process saved PC samples.
*/
static void
pmc_process_samples(int cpu)
{
struct pmc *pm;
int adjri, n;
struct thread *td;
struct pmc_owner *po;
struct pmc_sample *ps;
struct pmc_classdep *pcd;
struct pmc_samplebuffer *psb;
KASSERT(PCPU_GET(cpuid) == cpu,
("[pmc,%d] not on the correct CPU pcpu=%d cpu=%d", __LINE__,
PCPU_GET(cpuid), cpu));
psb = pmc_pcpu[cpu]->pc_sb;
for (n = 0; n < pmc_nsamples; n++) { /* bound on #iterations */
ps = psb->ps_read;
if (ps->ps_nsamples == PMC_SAMPLE_FREE)
break;
if (ps->ps_nsamples == PMC_SAMPLE_INUSE) {
/* Need a rescan at a later time. */
CPU_SET_ATOMIC(cpu, &pmc_cpumask);
break;
}
pm = ps->ps_pmc;
KASSERT(pm->pm_runcount > 0,
("[pmc,%d] pm=%p runcount %d", __LINE__, (void *) pm,
pm->pm_runcount));
po = pm->pm_owner;
KASSERT(PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)),
("[pmc,%d] pmc=%p non-sampling mode=%d", __LINE__,
pm, PMC_TO_MODE(pm)));
/* Ignore PMCs that have been switched off */
if (pm->pm_state != PMC_STATE_RUNNING)
goto entrydone;
PMCDBG(SAM,OPS,1,"cpu=%d pm=%p n=%d fl=%x wr=%d rd=%d", cpu,
pm, ps->ps_nsamples, ps->ps_flags,
(int) (psb->ps_write - psb->ps_samples),
(int) (psb->ps_read - psb->ps_samples));
/*
* If this is a process-mode PMC that is attached to
* its owner, and if the PC is in user mode, update
* profiling statistics like timer-based profiling
* would have done.
*/
if (pm->pm_flags & PMC_F_ATTACHED_TO_OWNER) {
if (ps->ps_flags & PMC_CC_F_USERSPACE) {
td = FIRST_THREAD_IN_PROC(po->po_owner);
addupc_intr(td, ps->ps_pc[0], 1);
}
goto entrydone;
}
/*
* Otherwise, this is either a sampling mode PMC that
* is attached to a different process than its owner,
* or a system-wide sampling PMC. Dispatch a log
* entry to the PMC's owner process.
*/
pmclog_process_callchain(pm, ps);
entrydone:
ps->ps_nsamples = 0; /* mark entry as free */
atomic_subtract_rel_int(&pm->pm_runcount, 1);
/* increment read pointer, modulo sample size */
if (++ps == psb->ps_fence)
psb->ps_read = psb->ps_samples;
else
psb->ps_read = ps;
}
atomic_add_int(&pmc_stats.pm_log_sweeps, 1);
/* Do not re-enable stalled PMCs if we failed to process any samples */
if (n == 0)
return;
/*
* Restart any stalled sampling PMCs on this CPU.
*
* If the NMI handler sets the pm_stalled field of a PMC after
* the check below, we'll end up processing the stalled PMC at
* the next hardclock tick.
*/
for (n = 0; n < md->pmd_npmc; n++) {
pcd = pmc_ri_to_classdep(md, n, &adjri);
KASSERT(pcd != NULL,
("[pmc,%d] null pcd ri=%d", __LINE__, n));
(void) (*pcd->pcd_get_config)(cpu,adjri,&pm);
if (pm == NULL || /* !cfg'ed */
pm->pm_state != PMC_STATE_RUNNING || /* !active */
!PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)) || /* !sampling */
pm->pm_stalled == 0) /* !stalled */
continue;
pm->pm_stalled = 0;
(*pcd->pcd_start_pmc)(cpu, adjri);
}
}
/*
* Event handlers.
*/
/*
* Handle a process exit.
*
* Remove this process from all hash tables. If this process
* owned any PMCs, turn off those PMCs and deallocate them,
* removing any associations with target processes.
*
* This function will be called by the last 'thread' of a
* process.
*
* XXX This eventhandler gets called early in the exit process.
* Consider using a 'hook' invocation from thread_exit() or equivalent
* spot. Another negative is that kse_exit doesn't seem to call
* exit1() [??].
*
*/
static void
pmc_process_exit(void *arg __unused, struct proc *p)
{
struct pmc *pm;
int adjri, cpu;
unsigned int ri;
int is_using_hwpmcs;
struct pmc_owner *po;
struct pmc_process *pp;
struct pmc_classdep *pcd;
pmc_value_t newvalue, tmp;
PROC_LOCK(p);
is_using_hwpmcs = p->p_flag & P_HWPMC;
PROC_UNLOCK(p);
/*
* Log a sysexit event to all SS PMC owners.
*/
LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
if (po->po_flags & PMC_PO_OWNS_LOGFILE)
pmclog_process_sysexit(po, p->p_pid);
if (!is_using_hwpmcs)
return;
PMC_GET_SX_XLOCK();
PMCDBG(PRC,EXT,1,"process-exit proc=%p (%d, %s)", p, p->p_pid,
p->p_comm);
/*
* Since this code is invoked by the last thread in an exiting
* process, we would have context switched IN at some prior
* point. However, with PREEMPTION, kernel mode context
* switches may happen any time, so we want to disable a
* context switch OUT till we get any PMCs targeting this
* process off the hardware.
*
* We also need to atomically remove this process'
* entry from our target process hash table, using
* PMC_FLAG_REMOVE.
*/
PMCDBG(PRC,EXT,1, "process-exit proc=%p (%d, %s)", p, p->p_pid,
p->p_comm);
critical_enter(); /* no preemption */
cpu = curthread->td_oncpu;
if ((pp = pmc_find_process_descriptor(p,
PMC_FLAG_REMOVE)) != NULL) {
PMCDBG(PRC,EXT,2,
"process-exit proc=%p pmc-process=%p", p, pp);
/*
* The exiting process could be the target of
* some PMCs which will be running on the
* currently executing CPU.
*
* We need to turn these PMCs off like we
* would do at context switch OUT time.
*/
for (ri = 0; ri < md->pmd_npmc; ri++) {
/*
* Pick up the pmc pointer from hardware
* state similar to the CSW_OUT code.
*/
pm = NULL;
pcd = pmc_ri_to_classdep(md, ri, &adjri);
(void) (*pcd->pcd_get_config)(cpu, adjri, &pm);
PMCDBG(PRC,EXT,2, "ri=%d pm=%p", ri, pm);
if (pm == NULL ||
!PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)))
continue;
PMCDBG(PRC,EXT,2, "ppmcs[%d]=%p pm=%p "
"state=%d", ri, pp->pp_pmcs[ri].pp_pmc,
pm, pm->pm_state);
KASSERT(PMC_TO_ROWINDEX(pm) == ri,
("[pmc,%d] ri mismatch pmc(%d) ri(%d)",
__LINE__, PMC_TO_ROWINDEX(pm), ri));
KASSERT(pm == pp->pp_pmcs[ri].pp_pmc,
("[pmc,%d] pm %p != pp_pmcs[%d] %p",
__LINE__, pm, ri, pp->pp_pmcs[ri].pp_pmc));
(void) pcd->pcd_stop_pmc(cpu, adjri);
KASSERT(pm->pm_runcount > 0,
("[pmc,%d] bad runcount ri %d rc %d",
__LINE__, ri, pm->pm_runcount));
/* Stop hardware only if it is actually running */
if (pm->pm_state == PMC_STATE_RUNNING &&
pm->pm_stalled == 0) {
pcd->pcd_read_pmc(cpu, adjri, &newvalue);
tmp = newvalue -
PMC_PCPU_SAVED(cpu,ri);
mtx_pool_lock_spin(pmc_mtxpool, pm);
pm->pm_gv.pm_savedvalue += tmp;
pp->pp_pmcs[ri].pp_pmcval += tmp;
mtx_pool_unlock_spin(pmc_mtxpool, pm);
}
atomic_subtract_rel_int(&pm->pm_runcount,1);
KASSERT((int) pm->pm_runcount >= 0,
("[pmc,%d] runcount is %d", __LINE__, ri));
(void) pcd->pcd_config_pmc(cpu, adjri, NULL);
}
/*
* Inform the MD layer of this pseudo "context switch
* out"
*/
(void) md->pmd_switch_out(pmc_pcpu[cpu], pp);
critical_exit(); /* ok to be pre-empted now */
/*
* Unlink this process from the PMCs that are
* targeting it. This will send a signal to
* all PMC owners whose PMCs are orphaned.
*
* Log PMC value at exit time if requested.
*/
for (ri = 0; ri < md->pmd_npmc; ri++)
if ((pm = pp->pp_pmcs[ri].pp_pmc) != NULL) {
if (pm->pm_flags & PMC_F_NEEDS_LOGFILE &&
PMC_IS_COUNTING_MODE(PMC_TO_MODE(pm)))
pmclog_process_procexit(pm, pp);
pmc_unlink_target_process(pm, pp);
}
free(pp, M_PMC);
} else
critical_exit(); /* pp == NULL */
/*
* If the process owned PMCs, free them up and free up
* memory.
*/
if ((po = pmc_find_owner_descriptor(p)) != NULL) {
pmc_remove_owner(po);
pmc_destroy_owner_descriptor(po);
}
sx_xunlock(&pmc_sx);
}
/*
* Handle a process fork.
*
* If the parent process 'p1' is under HWPMC monitoring, then copy
* over any attached PMCs that have 'do_descendants' semantics.
*/
static void
pmc_process_fork(void *arg __unused, struct proc *p1, struct proc *newproc,
int flags)
{
int is_using_hwpmcs;
unsigned int ri;
uint32_t do_descendants;
struct pmc *pm;
struct pmc_owner *po;
struct pmc_process *ppnew, *ppold;
(void) flags; /* unused parameter */
PROC_LOCK(p1);
is_using_hwpmcs = p1->p_flag & P_HWPMC;
PROC_UNLOCK(p1);
/*
* If there are system-wide sampling PMCs active, we need to
* log all fork events to their owner's logs.
*/
LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
if (po->po_flags & PMC_PO_OWNS_LOGFILE)
pmclog_process_procfork(po, p1->p_pid, newproc->p_pid);
if (!is_using_hwpmcs)
return;
PMC_GET_SX_XLOCK();
PMCDBG(PMC,FRK,1, "process-fork proc=%p (%d, %s) -> %p", p1,
p1->p_pid, p1->p_comm, newproc);
/*
* If the parent process (curthread->td_proc) is a
* target of any PMCs, look for PMCs that are to be
* inherited, and link these into the new process
* descriptor.
*/
if ((ppold = pmc_find_process_descriptor(curthread->td_proc,
PMC_FLAG_NONE)) == NULL)
goto done; /* nothing to do */
do_descendants = 0;
for (ri = 0; ri < md->pmd_npmc; ri++)
if ((pm = ppold->pp_pmcs[ri].pp_pmc) != NULL)
do_descendants |= pm->pm_flags & PMC_F_DESCENDANTS;
if (do_descendants == 0) /* nothing to do */
goto done;
/* allocate a descriptor for the new process */
if ((ppnew = pmc_find_process_descriptor(newproc,
PMC_FLAG_ALLOCATE)) == NULL)
goto done;
/*
* Run through all PMCs that were targeting the old process
* and which specified F_DESCENDANTS and attach them to the
* new process.
*
* Log the fork event to all owners of PMCs attached to this
* process, if not already logged.
*/
for (ri = 0; ri < md->pmd_npmc; ri++)
if ((pm = ppold->pp_pmcs[ri].pp_pmc) != NULL &&
(pm->pm_flags & PMC_F_DESCENDANTS)) {
pmc_link_target_process(pm, ppnew);
po = pm->pm_owner;
if (po->po_sscount == 0 &&
po->po_flags & PMC_PO_OWNS_LOGFILE)
pmclog_process_procfork(po, p1->p_pid,
newproc->p_pid);
}
/*
* Now mark the new process as being tracked by this driver.
*/
PROC_LOCK(newproc);
newproc->p_flag |= P_HWPMC;
PROC_UNLOCK(newproc);
done:
sx_xunlock(&pmc_sx);
}
/*
* initialization
*/
static const char *pmc_name_of_pmcclass[] = {
#undef __PMC_CLASS
#define __PMC_CLASS(N) #N ,
__PMC_CLASSES()
};
static int
pmc_initialize(void)
{
int c, cpu, error, n, ri;
unsigned int maxcpu;
struct pmc_binding pb;
struct pmc_sample *ps;
struct pmc_classdep *pcd;
struct pmc_samplebuffer *sb;
md = NULL;
error = 0;
#ifdef DEBUG
/* parse debug flags first */
if (TUNABLE_STR_FETCH(PMC_SYSCTL_NAME_PREFIX "debugflags",
pmc_debugstr, sizeof(pmc_debugstr)))
pmc_debugflags_parse(pmc_debugstr,
pmc_debugstr+strlen(pmc_debugstr));
#endif
PMCDBG(MOD,INI,0, "PMC Initialize (version %x)", PMC_VERSION);
/* check kernel version */
if (pmc_kernel_version != PMC_VERSION) {
if (pmc_kernel_version == 0)
printf("hwpmc: this kernel has not been compiled with "
"'options HWPMC_HOOKS'.\n");
else
printf("hwpmc: kernel version (0x%x) does not match "
"module version (0x%x).\n", pmc_kernel_version,
PMC_VERSION);
return EPROGMISMATCH;
}
/*
* check sysctl parameters
*/
if (pmc_hashsize <= 0) {
(void) printf("hwpmc: tunable \"hashsize\"=%d must be "
"greater than zero.\n", pmc_hashsize);
pmc_hashsize = PMC_HASH_SIZE;
}
if (pmc_nsamples <= 0 || pmc_nsamples > 65535) {
(void) printf("hwpmc: tunable \"nsamples\"=%d out of "
"range.\n", pmc_nsamples);
pmc_nsamples = PMC_NSAMPLES;
}
if (pmc_callchaindepth <= 0 ||
pmc_callchaindepth > PMC_CALLCHAIN_DEPTH_MAX) {
(void) printf("hwpmc: tunable \"callchaindepth\"=%d out of "
"range.\n", pmc_callchaindepth);
pmc_callchaindepth = PMC_CALLCHAIN_DEPTH;
}
md = pmc_md_initialize();
if (md == NULL)
return (ENOSYS);
KASSERT(md->pmd_nclass >= 1 && md->pmd_npmc >= 1,
("[pmc,%d] no classes or pmcs", __LINE__));
/* Compute the map from row-indices to classdep pointers. */
pmc_rowindex_to_classdep = malloc(sizeof(struct pmc_classdep *) *
md->pmd_npmc, M_PMC, M_WAITOK|M_ZERO);
for (n = 0; n < md->pmd_npmc; n++)
pmc_rowindex_to_classdep[n] = NULL;
for (ri = c = 0; c < md->pmd_nclass; c++) {
pcd = &md->pmd_classdep[c];
for (n = 0; n < pcd->pcd_num; n++, ri++)
pmc_rowindex_to_classdep[ri] = pcd;
}
KASSERT(ri == md->pmd_npmc,
("[pmc,%d] npmc miscomputed: ri=%d, md->npmc=%d", __LINE__,
ri, md->pmd_npmc));
maxcpu = pmc_cpu_max();
/* allocate space for the per-cpu array */
pmc_pcpu = malloc(maxcpu * sizeof(struct pmc_cpu *), M_PMC,
M_WAITOK|M_ZERO);
/* per-cpu 'saved values' for managing process-mode PMCs */
pmc_pcpu_saved = malloc(sizeof(pmc_value_t) * maxcpu * md->pmd_npmc,
M_PMC, M_WAITOK);
/* Perform CPU-dependent initialization. */
pmc_save_cpu_binding(&pb);
error = 0;
for (cpu = 0; error == 0 && cpu < maxcpu; cpu++) {
if (!pmc_cpu_is_active(cpu))
continue;
pmc_select_cpu(cpu);
pmc_pcpu[cpu] = malloc(sizeof(struct pmc_cpu) +
md->pmd_npmc * sizeof(struct pmc_hw *), M_PMC,
M_WAITOK|M_ZERO);
if (md->pmd_pcpu_init)
error = md->pmd_pcpu_init(md, cpu);
for (n = 0; error == 0 && n < md->pmd_nclass; n++)
error = md->pmd_classdep[n].pcd_pcpu_init(md, cpu);
}
pmc_restore_cpu_binding(&pb);
if (error)
return (error);
/* allocate space for the sample array */
for (cpu = 0; cpu < maxcpu; cpu++) {
if (!pmc_cpu_is_active(cpu))
continue;
sb = malloc(sizeof(struct pmc_samplebuffer) +
pmc_nsamples * sizeof(struct pmc_sample), M_PMC,
M_WAITOK|M_ZERO);
sb->ps_read = sb->ps_write = sb->ps_samples;
sb->ps_fence = sb->ps_samples + pmc_nsamples;
KASSERT(pmc_pcpu[cpu] != NULL,
("[pmc,%d] cpu=%d Null per-cpu data", __LINE__, cpu));
sb->ps_callchains = malloc(pmc_callchaindepth * pmc_nsamples *
sizeof(uintptr_t), M_PMC, M_WAITOK|M_ZERO);
for (n = 0, ps = sb->ps_samples; n < pmc_nsamples; n++, ps++)
ps->ps_pc = sb->ps_callchains +
(n * pmc_callchaindepth);
pmc_pcpu[cpu]->pc_sb = sb;
}
/* allocate space for the row disposition array */
pmc_pmcdisp = malloc(sizeof(enum pmc_mode) * md->pmd_npmc,
M_PMC, M_WAITOK|M_ZERO);
KASSERT(pmc_pmcdisp != NULL,
("[pmc,%d] pmcdisp allocation returned NULL", __LINE__));
/* mark all PMCs as available */
for (n = 0; n < (int) md->pmd_npmc; n++)
PMC_MARK_ROW_FREE(n);
/* allocate thread hash tables */
pmc_ownerhash = hashinit(pmc_hashsize, M_PMC,
&pmc_ownerhashmask);
pmc_processhash = hashinit(pmc_hashsize, M_PMC,
&pmc_processhashmask);
mtx_init(&pmc_processhash_mtx, "pmc-process-hash", "pmc-leaf",
MTX_SPIN);
LIST_INIT(&pmc_ss_owners);
pmc_ss_count = 0;
/* allocate a pool of spin mutexes */
pmc_mtxpool = mtx_pool_create("pmc-leaf", pmc_mtxpool_size,
MTX_SPIN);
PMCDBG(MOD,INI,1, "pmc_ownerhash=%p, mask=0x%lx "
"targethash=%p mask=0x%lx", pmc_ownerhash, pmc_ownerhashmask,
pmc_processhash, pmc_processhashmask);
/* register process {exit,fork,exec} handlers */
pmc_exit_tag = EVENTHANDLER_REGISTER(process_exit,
pmc_process_exit, NULL, EVENTHANDLER_PRI_ANY);
pmc_fork_tag = EVENTHANDLER_REGISTER(process_fork,
pmc_process_fork, NULL, EVENTHANDLER_PRI_ANY);
/* initialize logging */
pmclog_initialize();
/* set hook functions */
pmc_intr = md->pmd_intr;
pmc_hook = pmc_hook_handler;
if (error == 0) {
printf(PMC_MODULE_NAME ":");
for (n = 0; n < (int) md->pmd_nclass; n++) {
pcd = &md->pmd_classdep[n];
printf(" %s/%d/%d/0x%b",
pmc_name_of_pmcclass[pcd->pcd_class],
pcd->pcd_num,
pcd->pcd_width,
pcd->pcd_caps,
"\20"
"\1INT\2USR\3SYS\4EDG\5THR"
"\6REA\7WRI\10INV\11QUA\12PRC"
"\13TAG\14CSC");
}
printf("\n");
}
return (error);
}
/* prepare to be unloaded */
static void
pmc_cleanup(void)
{
int c, cpu;
unsigned int maxcpu;
struct pmc_ownerhash *ph;
struct pmc_owner *po, *tmp;
struct pmc_binding pb;
#ifdef DEBUG
struct pmc_processhash *prh;
#endif
PMCDBG(MOD,INI,0, "%s", "cleanup");
/* switch off sampling */
CPU_ZERO(&pmc_cpumask);
pmc_intr = NULL;
sx_xlock(&pmc_sx);
if (pmc_hook == NULL) { /* being unloaded already */
sx_xunlock(&pmc_sx);
return;
}
pmc_hook = NULL; /* prevent new threads from entering module */
/* deregister event handlers */
EVENTHANDLER_DEREGISTER(process_fork, pmc_fork_tag);
EVENTHANDLER_DEREGISTER(process_exit, pmc_exit_tag);
/* send SIGBUS to all owner threads, free up allocations */
if (pmc_ownerhash)
for (ph = pmc_ownerhash;
ph <= &pmc_ownerhash[pmc_ownerhashmask];
ph++) {
LIST_FOREACH_SAFE(po, ph, po_next, tmp) {
pmc_remove_owner(po);
/* send SIGBUS to owner processes */
PMCDBG(MOD,INI,2, "cleanup signal proc=%p "
"(%d, %s)", po->po_owner,
po->po_owner->p_pid,
po->po_owner->p_comm);
PROC_LOCK(po->po_owner);
- psignal(po->po_owner, SIGBUS);
+ kern_psignal(po->po_owner, SIGBUS);
PROC_UNLOCK(po->po_owner);
pmc_destroy_owner_descriptor(po);
}
}
/* reclaim allocated data structures */
if (pmc_mtxpool)
mtx_pool_destroy(&pmc_mtxpool);
mtx_destroy(&pmc_processhash_mtx);
if (pmc_processhash) {
#ifdef DEBUG
struct pmc_process *pp;
PMCDBG(MOD,INI,3, "%s", "destroy process hash");
for (prh = pmc_processhash;
prh <= &pmc_processhash[pmc_processhashmask];
prh++)
LIST_FOREACH(pp, prh, pp_next)
PMCDBG(MOD,INI,3, "pid=%d", pp->pp_proc->p_pid);
#endif
hashdestroy(pmc_processhash, M_PMC, pmc_processhashmask);
pmc_processhash = NULL;
}
if (pmc_ownerhash) {
PMCDBG(MOD,INI,3, "%s", "destroy owner hash");
hashdestroy(pmc_ownerhash, M_PMC, pmc_ownerhashmask);
pmc_ownerhash = NULL;
}
KASSERT(LIST_EMPTY(&pmc_ss_owners),
("[pmc,%d] Global SS owner list not empty", __LINE__));
KASSERT(pmc_ss_count == 0,
("[pmc,%d] Global SS count not empty", __LINE__));
/* do processor and pmc-class dependent cleanup */
maxcpu = pmc_cpu_max();
PMCDBG(MOD,INI,3, "%s", "md cleanup");
if (md) {
pmc_save_cpu_binding(&pb);
for (cpu = 0; cpu < maxcpu; cpu++) {
PMCDBG(MOD,INI,1,"pmc-cleanup cpu=%d pcs=%p",
cpu, pmc_pcpu[cpu]);
if (!pmc_cpu_is_active(cpu) || pmc_pcpu[cpu] == NULL)
continue;
pmc_select_cpu(cpu);
for (c = 0; c < md->pmd_nclass; c++)
md->pmd_classdep[c].pcd_pcpu_fini(md, cpu);
if (md->pmd_pcpu_fini)
md->pmd_pcpu_fini(md, cpu);
}
pmc_md_finalize(md);
free(md, M_PMC);
md = NULL;
pmc_restore_cpu_binding(&pb);
}
/* Free per-cpu descriptors. */
for (cpu = 0; cpu < maxcpu; cpu++) {
if (!pmc_cpu_is_active(cpu))
continue;
KASSERT(pmc_pcpu[cpu]->pc_sb != NULL,
("[pmc,%d] Null cpu sample buffer cpu=%d", __LINE__,
cpu));
free(pmc_pcpu[cpu]->pc_sb->ps_callchains, M_PMC);
free(pmc_pcpu[cpu]->pc_sb, M_PMC);
free(pmc_pcpu[cpu], M_PMC);
}
free(pmc_pcpu, M_PMC);
pmc_pcpu = NULL;
free(pmc_pcpu_saved, M_PMC);
pmc_pcpu_saved = NULL;
if (pmc_pmcdisp) {
free(pmc_pmcdisp, M_PMC);
pmc_pmcdisp = NULL;
}
if (pmc_rowindex_to_classdep) {
free(pmc_rowindex_to_classdep, M_PMC);
pmc_rowindex_to_classdep = NULL;
}
pmclog_shutdown();
sx_xunlock(&pmc_sx); /* we are done */
}
/*
* The function called at load/unload.
*/
static int
load (struct module *module __unused, int cmd, void *arg __unused)
{
int error;
error = 0;
switch (cmd) {
case MOD_LOAD :
/* initialize the subsystem */
error = pmc_initialize();
if (error != 0)
break;
PMCDBG(MOD,INI,1, "syscall=%d maxcpu=%d",
pmc_syscall_num, pmc_cpu_max());
break;
case MOD_UNLOAD :
case MOD_SHUTDOWN:
pmc_cleanup();
PMCDBG(MOD,INI,1, "%s", "unloaded");
break;
default :
error = EINVAL; /* XXX should panic(9) */
break;
}
return error;
}
/* memory pool */
MALLOC_DEFINE(M_PMC, "pmc", "Memory space for the PMC module");
Index: head/sys/dev/iscsi/initiator/isc_soc.c
===================================================================
--- head/sys/dev/iscsi/initiator/isc_soc.c (revision 225616)
+++ head/sys/dev/iscsi/initiator/isc_soc.c (revision 225617)
@@ -1,701 +1,701 @@
/*-
* Copyright (c) 2005-2010 Daniel Braniss <danny@cs.huji.ac.il>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
/*
| $Id: isc_soc.c 998 2009-12-20 10:32:45Z danny $
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_iscsi_initiator.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/conf.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/ctype.h>
#include <sys/errno.h>
#include <sys/sysctl.h>
#include <sys/file.h>
#include <sys/uio.h>
#include <sys/socketvar.h>
#include <sys/socket.h>
#include <sys/protosw.h>
#include <sys/proc.h>
#include <sys/ioccom.h>
#include <sys/queue.h>
#include <sys/kthread.h>
#include <sys/syslog.h>
#include <sys/mbuf.h>
#include <sys/user.h>
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <dev/iscsi/initiator/iscsi.h>
#include <dev/iscsi/initiator/iscsivar.h>
#ifndef NO_USE_MBUF
#define USE_MBUF
#endif
#ifdef USE_MBUF
static int ou_refcnt = 0;
/*
| function for freeing external storage for mbuf
*/
static void
ext_free(void *a, void *b)
{
pduq_t *pq = b;
if(pq->buf != NULL) {
debug(3, "ou_refcnt=%d a=%p b=%p", ou_refcnt, a, pq->buf);
free(pq->buf, M_ISCSIBUF);
pq->buf = NULL;
}
}
int
isc_sendPDU(isc_session_t *sp, pduq_t *pq)
{
struct mbuf *mh, **mp;
pdu_t *pp = &pq->pdu;
int len, error;
debug_called(8);
/*
| mbuf for the iSCSI header
*/
MGETHDR(mh, M_TRYWAIT, MT_DATA);
mh->m_pkthdr.rcvif = NULL;
mh->m_next = NULL;
mh->m_len = sizeof(union ipdu_u);
if(ISOK2DIG(sp->hdrDigest, pp)) {
pp->hdr_dig = sp->hdrDigest(&pp->ipdu, sizeof(union ipdu_u), 0);
mh->m_len += sizeof(pp->hdr_dig);
if(pp->ahs_len) {
debug(2, "ahs_len=%d", pp->ahs_len);
pp->hdr_dig = sp->hdrDigest(&pp->ahs_addr, pp->ahs_len, pp->hdr_dig);
}
debug(3, "pp->hdr_dig=%04x", htonl(pp->hdr_dig));
}
if(pp->ahs_len) {
/*
| Add any AHS to the iSCSI hdr mbuf
*/
if((mh->m_len + pp->ahs_len) < MHLEN) {
MH_ALIGN(mh, mh->m_len + pp->ahs_len);
bcopy(&pp->ipdu, mh->m_data, mh->m_len);
bcopy(pp->ahs_addr, mh->m_data + mh->m_len, pp->ahs_len);
mh->m_len += pp->ahs_len;
}
else
panic("len AHS=%d too big, not impleneted yet", pp->ahs_len);
}
else {
MH_ALIGN(mh, mh->m_len);
bcopy(&pp->ipdu, mh->m_data, mh->m_len);
}
mh->m_pkthdr.len = mh->m_len;
mp = &mh->m_next;
if(pp->ds_len && pq->pdu.ds_addr) {
struct mbuf *md;
int off = 0;
len = pp->ds_len;
while(len > 0) {
int l;
MGET(md, M_TRYWAIT, MT_DATA);
md->m_ext.ref_cnt = &ou_refcnt;
l = min(MCLBYTES, len);
debug(4, "setting ext_free(arg=%p len/l=%d/%d)", pq->buf, len, l);
MEXTADD(md, pp->ds_addr + off, l, ext_free,
#if __FreeBSD_version >= 800000
pp->ds_addr + off,
#endif
pq, 0, EXT_EXTREF);
md->m_len = l;
md->m_next = NULL;
mh->m_pkthdr.len += l;
*mp = md;
mp = &md->m_next;
len -= l;
off += l;
}
if(((pp->ds_len & 03) != 0) || ISOK2DIG(sp->dataDigest, pp)) {
MGET(md, M_TRYWAIT, MT_DATA);
if(pp->ds_len & 03)
len = 4 - (pp->ds_len & 03);
else
len = 0;
md->m_len = len;
if(ISOK2DIG(sp->dataDigest, pp))
md->m_len += sizeof(pp->ds_dig);
M_ALIGN(md, md->m_len);
if(ISOK2DIG(sp->dataDigest, pp)) {
pp->ds_dig = sp->dataDigest(pp->ds_addr, pp->ds_len, 0);
if(len) {
bzero(md->m_data, len); // RFC says SHOULD be 0
pp->ds_dig = sp->dataDigest(md->m_data, len, pp->ds_dig);
}
bcopy(&pp->ds_dig, md->m_data+len, sizeof(pp->ds_dig));
}
md->m_next = NULL;
mh->m_pkthdr.len += md->m_len;
*mp = md;
}
}
if((error = sosend(sp->soc, NULL, NULL, mh, 0, 0, sp->td)) != 0) {
sdebug(2, "error=%d", error);
return error;
}
sp->stats.nsent++;
getbintime(&sp->stats.t_sent);
return 0;
}
#else /* NO_USE_MBUF */
int
isc_sendPDU(isc_session_t *sp, pduq_t *pq)
{
struct uio *uio = &pq->uio;
struct iovec *iv;
pdu_t *pp = &pq->pdu;
int len, error;
debug_called(8);
bzero(uio, sizeof(struct uio));
uio->uio_rw = UIO_WRITE;
uio->uio_segflg = UIO_SYSSPACE;
uio->uio_td = sp->td;
uio->uio_iov = iv = pq->iov;
iv->iov_base = &pp->ipdu;
iv->iov_len = sizeof(union ipdu_u);
uio->uio_resid = iv->iov_len;
iv++;
if(ISOK2DIG(sp->hdrDigest, pp))
pq->pdu.hdr_dig = sp->hdrDigest(&pp->ipdu, sizeof(union ipdu_u), 0);
if(pp->ahs_len) {
iv->iov_base = pp->ahs_addr;
iv->iov_len = pp->ahs_len;
uio->uio_resid += iv->iov_len;
iv++;
if(ISOK2DIG(sp->hdrDigest, pp))
pp->hdr_dig = sp->hdrDigest(&pp->ahs_addr, pp->ahs_len, pp->hdr_dig);
}
if(ISOK2DIG(sp->hdrDigest, pp)) {
debug(3, "hdr_dig=%04x", htonl(pp->hdr_dig));
iv->iov_base = &pp->hdr_dig;
iv->iov_len = sizeof(int);
uio->uio_resid += iv->iov_len ;
iv++;
}
if(pq->pdu.ds_addr && pp->ds_len) {
iv->iov_base = pp->ds_addr;
iv->iov_len = pp->ds_len;
while(iv->iov_len & 03) // the specs say it must be int aligned
iv->iov_len++;
uio->uio_resid += iv->iov_len ;
iv++;
if(ISOK2DIG(sp->dataDigest, pp)) {
pp->ds_dig = sp->dataDigest(pp->ds, pp->ds_len, 0);
iv->iov_base = &pp->ds_dig;
iv->iov_len = sizeof(pp->ds_dig);
uio->uio_resid += iv->iov_len ;
iv++;
}
}
uio->uio_iovcnt = iv - pq->iov;
sdebug(4, "pq->len=%d uio->uio_resid=%d uio->uio_iovcnt=%d", pq->len,
uio->uio_resid,
uio->uio_iovcnt);
sdebug(4, "opcode=%x iovcnt=%d uio_resid=%d itt=%x",
pp->ipdu.bhs.opcode, uio->uio_iovcnt, uio->uio_resid,
ntohl(pp->ipdu.bhs.itt));
sdebug(5, "sp=%p sp->soc=%p uio=%p sp->td=%p",
sp, sp->soc, uio, sp->td);
do {
len = uio->uio_resid;
error = sosend(sp->soc, NULL, uio, 0, 0, 0, sp->td);
if(uio->uio_resid == 0 || error || len == uio->uio_resid) {
if(uio->uio_resid) {
sdebug(2, "uio->uio_resid=%d uio->uio_iovcnt=%d error=%d len=%d",
uio->uio_resid, uio->uio_iovcnt, error, len);
if(error == 0)
error = EAGAIN; // 35
}
break;
}
/*
| XXX: untested code
*/
sdebug(1, "uio->uio_resid=%d uio->uio_iovcnt=%d",
uio->uio_resid, uio->uio_iovcnt);
iv = uio->uio_iov;
len -= uio->uio_resid;
while(uio->uio_iovcnt > 0) {
if(iv->iov_len > len) {
caddr_t bp = (caddr_t)iv->iov_base;
iv->iov_len -= len;
iv->iov_base = (void *)&bp[len];
break;
}
len -= iv->iov_len;
uio->uio_iovcnt--;
uio->uio_iov++;
iv++;
}
} while(uio->uio_resid);
if(error == 0) {
sp->stats.nsent++;
getbintime(&sp->stats.t_sent);
}
return error;
}
#endif /* USE_MBUF */
/*
| wait till a PDU header is received
| from the socket.
*/
/*
 The format of the BHS is:

 Byte/      0      |       1       |       2       |       3       |
    /              |               |               |               |
   |0 1 2 3 4 5 6 7|0 1 2 3 4 5 6 7|0 1 2 3 4 5 6 7|0 1 2 3 4 5 6 7|
   +---------------+---------------+---------------+---------------+
  0|.|I| Opcode    |F| Opcode-specific fields                      |
   +---------------+---------------+---------------+---------------+
  4|TotalAHSLength | DataSegmentLength                             |
   +---------------+---------------+---------------+---------------+
  8| LUN or Opcode-specific fields                                 |
   +                                                               +
 12|                                                               |
   +---------------+---------------+---------------+---------------+
 16| Initiator Task Tag                                            |
   +---------------+---------------+---------------+---------------+
 20/ Opcode-specific fields                                        /
  +/                                                               /
   +---------------+---------------+---------------+---------------+
 48
*/
static __inline int
so_getbhs(isc_session_t *sp)
{
bhs_t *bhs = &sp->bhs;
struct uio *uio = &sp->uio;
struct iovec *iov = &sp->iov;
int error, flags;
debug_called(8);
iov->iov_base = bhs;
iov->iov_len = sizeof(bhs_t);
uio->uio_iov = iov;
uio->uio_iovcnt = 1;
uio->uio_rw = UIO_READ;
uio->uio_segflg = UIO_SYSSPACE;
uio->uio_td = curthread; // why ...
uio->uio_resid = sizeof(bhs_t);
flags = MSG_WAITALL;
error = soreceive(sp->soc, NULL, uio, 0, 0, &flags);
if(error)
debug(2,
#if __FreeBSD_version > 800000
"error=%d so_error=%d uio->uio_resid=%zd iov.iov_len=%zd",
#else
"error=%d so_error=%d uio->uio_resid=%d iov.iov_len=%zd",
#endif
error,
sp->soc->so_error, uio->uio_resid, iov->iov_len);
if(!error && (uio->uio_resid > 0)) {
error = EPIPE; // was EAGAIN
debug(2,
#if __FreeBSD_version > 800000
"error=%d so_error=%d uio->uio_resid=%zd iov.iov_len=%zd so_state=%x",
#else
"error=%d so_error=%d uio->uio_resid=%d iov.iov_len=%zd so_state=%x",
#endif
error,
sp->soc->so_error, uio->uio_resid, iov->iov_len, sp->soc->so_state);
}
return error;
}
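/*
 * A minimal illustrative sketch (hypothetical helper, not part of the
 * driver): per the BHS layout pictured above, DataSegmentLength is a
 * 24-bit big-endian field occupying header bytes 5-7.  Reading it
 * straight from the raw header bytes yields the same value that
 * so_recv() below reconstructs by byte-swapping bhs->DSLength on
 * little-endian hosts.
 */
static __inline u_int
bhs_ds_length(const uint8_t *raw_bhs)
{

        return (((u_int)raw_bhs[5] << 16) |
            ((u_int)raw_bhs[6] << 8) |
            (u_int)raw_bhs[7]);
}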
/*
| so_recv gets called when
| an iSCSI header has been received.
| Note: the designers had no intention
| of making the programmer's life easy.
*/
static int
so_recv(isc_session_t *sp, pduq_t *pq)
{
sn_t *sn = &sp->sn;
struct uio *uio = &pq->uio;
pdu_t *pp = &pq->pdu;
bhs_t *bhs = &pp->ipdu.bhs;
struct iovec *iov = pq->iov;
int error;
u_int len;
u_int max, exp;
int flags = MSG_WAITALL;
debug_called(8);
/*
| now calculate how much data should be in the buffer
*/
uio->uio_iov = iov;
uio->uio_iovcnt = 0;
len = 0;
if(bhs->AHSLength) {
debug(2, "bhs->AHSLength=%d", bhs->AHSLength);
pp->ahs_len = bhs->AHSLength * 4;
len += pp->ahs_len;
pp->ahs_addr = malloc(pp->ahs_len, M_TEMP, M_WAITOK); // XXX: could get stuck here
iov->iov_base = pp->ahs_addr;
iov->iov_len = pp->ahs_len;
uio->uio_iovcnt++;
iov++;
}
if(ISOK2DIG(sp->hdrDigest, pp)) {
len += sizeof(pp->hdr_dig);
iov->iov_base = &pp->hdr_dig;
iov->iov_len = sizeof(pp->hdr_dig);
uio->uio_iovcnt++;
}
if(len) {
uio->uio_rw = UIO_READ;
uio->uio_segflg = UIO_SYSSPACE;
uio->uio_resid = len;
uio->uio_td = sp->td; // why ...
error = soreceive(sp->soc, NULL, uio, NULL, NULL, &flags);
//if(error == EAGAIN)
// XXX: this needs work! it hangs iscontrol
if(error || uio->uio_resid) {
debug(2,
#if __FreeBSD_version > 800000
"len=%d error=%d uio->uio_resid=%zd",
#else
"len=%d error=%d uio->uio_resid=%d",
#endif
len, error, uio->uio_resid);
goto out;
}
if(ISOK2DIG(sp->hdrDigest, pp)) {
bhs_t *bhs;
u_int digest;
bhs = (bhs_t *)&pp->ipdu;
digest = sp->hdrDigest(bhs, sizeof(bhs_t), 0);
if(pp->ahs_len)
digest = sp->hdrDigest(pp->ahs_addr, pp->ahs_len, digest);
if(pp->hdr_dig != digest) {
debug(2, "bad header digest: received=%x calculated=%x", pp->hdr_dig, digest);
// XXX: now what?
error = EIO;
goto out;
}
}
if(pp->ahs_len) {
debug(2, "ahs len=%x type=%x spec=%x",
pp->ahs_addr->len, pp->ahs_addr->type, pp->ahs_addr->spec);
// XXX: till I figure out what to do with this
free(pp->ahs_addr, M_TEMP);
}
pq->len += len; // XXX: who needs this?
bzero(uio, sizeof(struct uio));
len = 0;
}
if(bhs->DSLength) {
len = bhs->DSLength;
#if BYTE_ORDER == LITTLE_ENDIAN
len = ((len & 0x00ff0000) >> 16)
| (len & 0x0000ff00)
| ((len & 0x000000ff) << 16);
#endif
pp->ds_len = len;
if((sp->opt.maxRecvDataSegmentLength > 0) && (len > sp->opt.maxRecvDataSegmentLength)) {
xdebug("impossible PDU length(%d) opt.maxRecvDataSegmentLength=%d",
len, sp->opt.maxRecvDataSegmentLength);
log(LOG_ERR,
"so_recv: impossible PDU length(%d) from iSCSI %s/%s\n",
len, sp->opt.targetAddress, sp->opt.targetName);
/*
| XXX: this will really screw up the stream.
| should clear up the buffer till a valid header
| is found, or just close connection ...
| should read the RFC.
*/
error = E2BIG;
goto out;
}
while(len & 03)
len++;
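/*
 * len is now the data segment length rounded up to a 4-byte boundary
 * (equivalently, len = (len + 3) & ~3), since iSCSI pads data
 * segments to a 4-byte boundary on the wire.
 */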
if(ISOK2DIG(sp->dataDigest, pp))
len += 4;
uio->uio_resid = len;
uio->uio_td = sp->td; // why ...
pq->len += len; // XXX: do we need this?
error = soreceive(sp->soc, NULL, uio, &pq->mp, NULL, &flags);
//if(error == EAGAIN)
// XXX: this needs work! it hangs iscontrol
if(error || uio->uio_resid)
goto out;
if(ISOK2DIG(sp->dataDigest, pp)) {
struct mbuf *m;
u_int digest, ds_len, cnt;
// get the received digest
m_copydata(pq->mp,
len - sizeof(pp->ds_dig),
sizeof(pp->ds_dig),
(caddr_t)&pp->ds_dig);
// calculate all mbufs
digest = 0;
ds_len = len - sizeof(pp->ds_dig);
for(m = pq->mp; m != NULL; m = m->m_next) {
cnt = MIN(ds_len, m->m_len);
digest = sp->dataDigest(mtod(m, char *), cnt, digest);
ds_len -= cnt;
if(ds_len == 0)
break;
}
if(digest != pp->ds_dig) {
sdebug(1, "bad data digest: received=%x calculated=%x", pp->ds_dig, digest);
error = EIO; // XXX: find a better error
goto out;
}
KASSERT(ds_len == 0, ("ds_len not zero"));
}
}
sdebug(6, "len=%d] opcode=0x%x ahs_len=0x%x ds_len=0x%x",
pq->len, bhs->opcode, pp->ahs_len, pp->ds_len);
max = ntohl(bhs->MaxCmdSN);
exp = ntohl(bhs->ExpStSN);
if(max < exp - 1 &&
max > exp - _MAXINCR) {
sdebug(2, "bad cmd window size");
error = EIO; // XXX: for now;
goto out; // error
}
if(SNA_GT(max, sn->maxCmd))
sn->maxCmd = max;
if(SNA_GT(exp, sn->expCmd))
sn->expCmd = exp;
/*
| remove from the holding queue packets
| that have been acked and don't need
| further processing.
*/
i_acked_hld(sp, NULL);
sp->cws = sn->maxCmd - sn->expCmd + 1;
return 0;
out:
// XXX: need some work here
if(pp->ahs_len) {
// XXX: till I figure out what to do with this
free(pp->ahs_addr, M_TEMP);
}
xdebug("have a problem, error=%d", error);
pdu_free(sp->isc, pq);
if(!error && uio->uio_resid > 0)
error = EPIPE;
return error;
}
/*
| wait for something to arrive.
| and if the pdu is without errors, process it.
*/
static int
so_input(isc_session_t *sp)
{
pduq_t *pq;
int error;
debug_called(8);
/*
| first read in the iSCSI header
*/
error = so_getbhs(sp);
if(error == 0) {
/*
| now read the rest.
*/
pq = pdu_alloc(sp->isc, M_NOWAIT);
if(pq == NULL) { // XXX: might cause a deadlock ...
debug(2, "out of pdus, wait");
pq = pdu_alloc(sp->isc, M_WAITOK); // OK to WAIT
}
pq->pdu.ipdu.bhs = sp->bhs;
pq->len = sizeof(bhs_t); // so far only the header was read
error = so_recv(sp, pq);
if(error != 0) {
error += 0x800; // XXX: just to see the error.
// terminal error
// XXX: close connection and exit
}
else {
sp->stats.nrecv++;
getbintime(&sp->stats.t_recv);
ism_recv(sp, pq);
}
}
return error;
}
/*
| one per active (connected) session.
| this thread is responsible for reading
| in packets from the target.
*/
static void
isc_in(void *vp)
{
isc_session_t *sp = (isc_session_t *)vp;
struct socket *so = sp->soc;
int error;
debug_called(8);
sp->flags |= ISC_CON_RUNNING;
error = 0;
while((sp->flags & (ISC_CON_RUN | ISC_LINK_UP)) == (ISC_CON_RUN | ISC_LINK_UP)) {
// XXX: hunting ...
if(sp->soc == NULL || !(so->so_state & SS_ISCONNECTED)) {
debug(2, "sp->soc=%p", sp->soc);
break;
}
error = so_input(sp);
if(error == 0) {
mtx_lock(&sp->io_mtx);
if(sp->flags & ISC_OWAITING) {
wakeup(&sp->flags);
}
mtx_unlock(&sp->io_mtx);
} else if(error == EPIPE) {
break;
}
else if(error == EAGAIN) {
if(so->so_state & SS_ISCONNECTED)
// there seems to be a problem in 6.0 ...
tsleep(sp, PRIBIO, "isc_soc", 2*hz);
}
}
sdebug(2, "terminated, flags=%x so_count=%d so_state=%x error=%d proc=%p",
sp->flags, so->so_count, so->so_state, error, sp->proc);
if((sp->proc != NULL) && sp->signal) {
PROC_LOCK(sp->proc);
- psignal(sp->proc, sp->signal);
+ kern_psignal(sp->proc, sp->signal);
PROC_UNLOCK(sp->proc);
sp->flags |= ISC_SIGNALED;
sdebug(2, "pid=%d signaled(%d)", sp->proc->p_pid, sp->signal);
}
else {
// we have to do something ourselves
// like closing this session ...
}
/*
| we've been terminated
*/
// do we need this mutex ...?
mtx_lock(&sp->io_mtx);
sp->flags &= ~(ISC_CON_RUNNING | ISC_LINK_UP);
wakeup(&sp->soc);
mtx_unlock(&sp->io_mtx);
sdebug(2, "dropped ISC_CON_RUNNING");
#if __FreeBSD_version >= 800000
kproc_exit(0);
#else
kthread_exit(0);
#endif
}
void
isc_stop_receiver(isc_session_t *sp)
{
int n;
debug_called(8);
sdebug(3, "sp=%p sp->soc=%p", sp, sp? sp->soc: 0);
mtx_lock(&sp->io_mtx);
sp->flags &= ~ISC_LINK_UP;
msleep(&sp->soc, &sp->io_mtx, PRIBIO|PDROP, "isc_stpc", 5*hz);
soshutdown(sp->soc, SHUT_RD);
mtx_lock(&sp->io_mtx);
sdebug(3, "soshutdown");
sp->flags &= ~ISC_CON_RUN;
n = 2;
while(n-- && (sp->flags & ISC_CON_RUNNING)) {
sdebug(3, "waiting n=%d... flags=%x", n, sp->flags);
msleep(&sp->soc, &sp->io_mtx, PRIBIO, "isc_stpc", 5*hz);
}
mtx_unlock(&sp->io_mtx);
if(sp->fp != NULL)
fdrop(sp->fp, sp->td);
fputsock(sp->soc);
sp->soc = NULL;
sp->fp = NULL;
sdebug(3, "done");
}
void
isc_start_receiver(isc_session_t *sp)
{
debug_called(8);
sp->flags |= ISC_CON_RUN | ISC_LINK_UP;
#if __FreeBSD_version >= 800000
kproc_create
#else
kthread_create
#endif
(isc_in, sp, &sp->soc_proc, 0, 0, "isc_in %d", sp->sid);
}
Index: head/sys/dev/mfi/mfi.c
===================================================================
--- head/sys/dev/mfi/mfi.c (revision 225616)
+++ head/sys/dev/mfi/mfi.c (revision 225617)
@@ -1,2549 +1,2549 @@
/*-
* Copyright (c) 2006 IronPort Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*-
* Copyright (c) 2007 LSI Corp.
* Copyright (c) 2007 Rajesh Prabhakaran.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_mfi.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/poll.h>
#include <sys/selinfo.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/eventhandler.h>
#include <sys/rman.h>
#include <sys/bus_dma.h>
#include <sys/bio.h>
#include <sys/ioccom.h>
#include <sys/uio.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <machine/bus.h>
#include <machine/resource.h>
#include <dev/mfi/mfireg.h>
#include <dev/mfi/mfi_ioctl.h>
#include <dev/mfi/mfivar.h>
static int mfi_alloc_commands(struct mfi_softc *);
static int mfi_comms_init(struct mfi_softc *);
static int mfi_wait_command(struct mfi_softc *, struct mfi_command *);
static int mfi_get_controller_info(struct mfi_softc *);
static int mfi_get_log_state(struct mfi_softc *,
struct mfi_evt_log_state **);
static int mfi_parse_entries(struct mfi_softc *, int, int);
static int mfi_dcmd_command(struct mfi_softc *, struct mfi_command **,
uint32_t, void **, size_t);
static void mfi_data_cb(void *, bus_dma_segment_t *, int, int);
static void mfi_startup(void *arg);
static void mfi_intr(void *arg);
static void mfi_ldprobe(struct mfi_softc *sc);
static int mfi_aen_register(struct mfi_softc *sc, int seq, int locale);
static void mfi_aen_complete(struct mfi_command *);
static int mfi_aen_setup(struct mfi_softc *, uint32_t);
static int mfi_add_ld(struct mfi_softc *sc, int);
static void mfi_add_ld_complete(struct mfi_command *);
static struct mfi_command * mfi_bio_command(struct mfi_softc *);
static void mfi_bio_complete(struct mfi_command *);
static int mfi_mapcmd(struct mfi_softc *, struct mfi_command *);
static int mfi_send_frame(struct mfi_softc *, struct mfi_command *);
static void mfi_complete(struct mfi_softc *, struct mfi_command *);
static int mfi_abort(struct mfi_softc *, struct mfi_command *);
static int mfi_linux_ioctl_int(struct cdev *, u_long, caddr_t, int, struct thread *);
static void mfi_timeout(void *);
static int mfi_user_command(struct mfi_softc *,
struct mfi_ioc_passthru *);
static void mfi_enable_intr_xscale(struct mfi_softc *sc);
static void mfi_enable_intr_ppc(struct mfi_softc *sc);
static int32_t mfi_read_fw_status_xscale(struct mfi_softc *sc);
static int32_t mfi_read_fw_status_ppc(struct mfi_softc *sc);
static int mfi_check_clear_intr_xscale(struct mfi_softc *sc);
static int mfi_check_clear_intr_ppc(struct mfi_softc *sc);
static void mfi_issue_cmd_xscale(struct mfi_softc *sc,uint32_t bus_add,uint32_t frame_cnt);
static void mfi_issue_cmd_ppc(struct mfi_softc *sc,uint32_t bus_add,uint32_t frame_cnt);
SYSCTL_NODE(_hw, OID_AUTO, mfi, CTLFLAG_RD, 0, "MFI driver parameters");
static int mfi_event_locale = MFI_EVT_LOCALE_ALL;
TUNABLE_INT("hw.mfi.event_locale", &mfi_event_locale);
SYSCTL_INT(_hw_mfi, OID_AUTO, event_locale, CTLFLAG_RW, &mfi_event_locale,
0, "event message locale");
static int mfi_event_class = MFI_EVT_CLASS_INFO;
TUNABLE_INT("hw.mfi.event_class", &mfi_event_class);
SYSCTL_INT(_hw_mfi, OID_AUTO, event_class, CTLFLAG_RW, &mfi_event_class,
0, "event message class");
static int mfi_max_cmds = 128;
TUNABLE_INT("hw.mfi.max_cmds", &mfi_max_cmds);
SYSCTL_INT(_hw_mfi, OID_AUTO, max_cmds, CTLFLAG_RD, &mfi_max_cmds,
0, "Max commands");
/* Management interface */
static d_open_t mfi_open;
static d_close_t mfi_close;
static d_ioctl_t mfi_ioctl;
static d_poll_t mfi_poll;
static struct cdevsw mfi_cdevsw = {
.d_version = D_VERSION,
.d_flags = 0,
.d_open = mfi_open,
.d_close = mfi_close,
.d_ioctl = mfi_ioctl,
.d_poll = mfi_poll,
.d_name = "mfi",
};
MALLOC_DEFINE(M_MFIBUF, "mfibuf", "Buffers for the MFI driver");
#define MFI_INQ_LENGTH SHORT_INQUIRY_LENGTH
static void
mfi_enable_intr_xscale(struct mfi_softc *sc)
{
MFI_WRITE4(sc, MFI_OMSK, 0x01);
}
static void
mfi_enable_intr_ppc(struct mfi_softc *sc)
{
MFI_WRITE4(sc, MFI_ODCR0, 0xFFFFFFFF);
if (sc->mfi_flags & MFI_FLAGS_1078) {
MFI_WRITE4(sc, MFI_OMSK, ~MFI_1078_EIM);
} else if (sc->mfi_flags & MFI_FLAGS_GEN2) {
MFI_WRITE4(sc, MFI_OMSK, ~MFI_GEN2_EIM);
}
}
static int32_t
mfi_read_fw_status_xscale(struct mfi_softc *sc)
{
return MFI_READ4(sc, MFI_OMSG0);
}
static int32_t
mfi_read_fw_status_ppc(struct mfi_softc *sc)
{
return MFI_READ4(sc, MFI_OSP0);
}
static int
mfi_check_clear_intr_xscale(struct mfi_softc *sc)
{
int32_t status;
status = MFI_READ4(sc, MFI_OSTS);
if ((status & MFI_OSTS_INTR_VALID) == 0)
return 1;
MFI_WRITE4(sc, MFI_OSTS, status);
return 0;
}
static int
mfi_check_clear_intr_ppc(struct mfi_softc *sc)
{
int32_t status;
status = MFI_READ4(sc, MFI_OSTS);
if (sc->mfi_flags & MFI_FLAGS_1078) {
if (!(status & MFI_1078_RM)) {
return 1;
}
} else if (sc->mfi_flags & MFI_FLAGS_GEN2) {
if (!(status & MFI_GEN2_RM)) {
return 1;
}
}
MFI_WRITE4(sc, MFI_ODCR0, status);
return 0;
}
static void
mfi_issue_cmd_xscale(struct mfi_softc *sc,uint32_t bus_add,uint32_t frame_cnt)
{
MFI_WRITE4(sc, MFI_IQP,(bus_add >>3)|frame_cnt);
}
static void
mfi_issue_cmd_ppc(struct mfi_softc *sc,uint32_t bus_add,uint32_t frame_cnt)
{
MFI_WRITE4(sc, MFI_IQP, (bus_add |frame_cnt <<1)|1 );
}
static int
mfi_transition_firmware(struct mfi_softc *sc)
{
uint32_t fw_state, cur_state;
int max_wait, i;
fw_state = sc->mfi_read_fw_status(sc)& MFI_FWSTATE_MASK;
while (fw_state != MFI_FWSTATE_READY) {
if (bootverbose)
device_printf(sc->mfi_dev, "Waiting for firmware to "
"become ready\n");
cur_state = fw_state;
switch (fw_state) {
case MFI_FWSTATE_FAULT:
device_printf(sc->mfi_dev, "Firmware fault\n");
return (ENXIO);
case MFI_FWSTATE_WAIT_HANDSHAKE:
MFI_WRITE4(sc, MFI_IDB, MFI_FWINIT_CLEAR_HANDSHAKE);
max_wait = 2;
break;
case MFI_FWSTATE_OPERATIONAL:
MFI_WRITE4(sc, MFI_IDB, MFI_FWINIT_READY);
max_wait = 10;
break;
case MFI_FWSTATE_UNDEFINED:
case MFI_FWSTATE_BB_INIT:
max_wait = 2;
break;
case MFI_FWSTATE_FW_INIT:
case MFI_FWSTATE_DEVICE_SCAN:
case MFI_FWSTATE_FLUSH_CACHE:
max_wait = 20;
break;
case MFI_FWSTATE_BOOT_MESSAGE_PENDING:
MFI_WRITE4(sc, MFI_IDB, MFI_FWINIT_HOTPLUG);
max_wait = 10;
break;
default:
device_printf(sc->mfi_dev,"Unknown firmware state %#x\n",
fw_state);
return (ENXIO);
}
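/*
 * Poll the firmware state roughly every 100ms (DELAY(100000) below),
 * so max_wait is effectively expressed in seconds: max_wait * 10
 * iterations of 100ms each.
 */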
for (i = 0; i < (max_wait * 10); i++) {
fw_state = sc->mfi_read_fw_status(sc) & MFI_FWSTATE_MASK;
if (fw_state == cur_state)
DELAY(100000);
else
break;
}
if (fw_state == cur_state) {
device_printf(sc->mfi_dev, "Firmware stuck in state "
"%#x\n", fw_state);
return (ENXIO);
}
}
return (0);
}
static void
mfi_addr32_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
{
uint32_t *addr;
addr = arg;
*addr = segs[0].ds_addr;
}
int
mfi_attach(struct mfi_softc *sc)
{
uint32_t status;
int error, commsz, framessz, sensesz;
int frames, unit, max_fw_sge;
device_printf(sc->mfi_dev, "Megaraid SAS driver Ver 3.00 \n");
mtx_init(&sc->mfi_io_lock, "MFI I/O lock", NULL, MTX_DEF);
sx_init(&sc->mfi_config_lock, "MFI config");
TAILQ_INIT(&sc->mfi_ld_tqh);
TAILQ_INIT(&sc->mfi_aen_pids);
TAILQ_INIT(&sc->mfi_cam_ccbq);
mfi_initq_free(sc);
mfi_initq_ready(sc);
mfi_initq_busy(sc);
mfi_initq_bio(sc);
if (sc->mfi_flags & MFI_FLAGS_1064R) {
sc->mfi_enable_intr = mfi_enable_intr_xscale;
sc->mfi_read_fw_status = mfi_read_fw_status_xscale;
sc->mfi_check_clear_intr = mfi_check_clear_intr_xscale;
sc->mfi_issue_cmd = mfi_issue_cmd_xscale;
}
else {
sc->mfi_enable_intr = mfi_enable_intr_ppc;
sc->mfi_read_fw_status = mfi_read_fw_status_ppc;
sc->mfi_check_clear_intr = mfi_check_clear_intr_ppc;
sc->mfi_issue_cmd = mfi_issue_cmd_ppc;
}
/* Before we get too far, see if the firmware is working */
if ((error = mfi_transition_firmware(sc)) != 0) {
device_printf(sc->mfi_dev, "Firmware not in READY state, "
"error %d\n", error);
return (ENXIO);
}
/*
* Get information needed for sizing the contiguous memory for the
* frame pool. Size down the sgl parameter since we know that
* we will never need more than what's required for MAXPHYS.
* It would be nice if these constants were available at runtime
* instead of compile time.
*/
status = sc->mfi_read_fw_status(sc);
sc->mfi_max_fw_cmds = status & MFI_FWSTATE_MAXCMD_MASK;
max_fw_sge = (status & MFI_FWSTATE_MAXSGL_MASK) >> 16;
sc->mfi_max_sge = min(max_fw_sge, ((MFI_MAXPHYS / PAGE_SIZE) + 1));
/*
* Create the dma tag for data buffers. Used both for block I/O
* and for various internal data queries.
*/
if (bus_dma_tag_create( sc->mfi_parent_dmat, /* parent */
1, 0, /* algnmnt, boundary */
BUS_SPACE_MAXADDR, /* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
sc->mfi_max_sge, /* nsegments */
BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
BUS_DMA_ALLOCNOW, /* flags */
busdma_lock_mutex, /* lockfunc */
&sc->mfi_io_lock, /* lockfuncarg */
&sc->mfi_buffer_dmat)) {
device_printf(sc->mfi_dev, "Cannot allocate buffer DMA tag\n");
return (ENOMEM);
}
/*
* Allocate DMA memory for the comms queues. Keep it under 4GB for
* efficiency. The mfi_hwcomms struct includes space for 1 reply queue
* entry, so the calculated size here will be 1 more than
* mfi_max_fw_cmds. This is apparently a requirement of the hardware.
*/
commsz = (sizeof(uint32_t) * sc->mfi_max_fw_cmds) +
sizeof(struct mfi_hwcomms);
if (bus_dma_tag_create( sc->mfi_parent_dmat, /* parent */
1, 0, /* algnmnt, boundary */
BUS_SPACE_MAXADDR_32BIT,/* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
commsz, /* maxsize */
1, /* nsegments */
commsz, /* maxsegsize */
0, /* flags */
NULL, NULL, /* lockfunc, lockarg */
&sc->mfi_comms_dmat)) {
device_printf(sc->mfi_dev, "Cannot allocate comms DMA tag\n");
return (ENOMEM);
}
if (bus_dmamem_alloc(sc->mfi_comms_dmat, (void **)&sc->mfi_comms,
BUS_DMA_NOWAIT, &sc->mfi_comms_dmamap)) {
device_printf(sc->mfi_dev, "Cannot allocate comms memory\n");
return (ENOMEM);
}
bzero(sc->mfi_comms, commsz);
bus_dmamap_load(sc->mfi_comms_dmat, sc->mfi_comms_dmamap,
sc->mfi_comms, commsz, mfi_addr32_cb, &sc->mfi_comms_busaddr, 0);
/*
* Allocate DMA memory for the command frames. Keep them in the
* lower 4GB for efficiency. Calculate the size of the commands at
* the same time; each command is one 64 byte frame plus a set of
* additional frames for holding sg lists or other data.
* The assumption here is that the SG list will start at the second
* frame and not use the unused bytes in the first frame. While this
* isn't technically correct, it simplifies the calculation and allows
* for command frames that might be larger than an mfi_io_frame.
*/
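/*
 * Worked example with hypothetical numbers: if mfi_sge_size were 8
 * (32-bit SG entries) and mfi_max_sge were 17, then
 * frames = (8 * 17 - 1) / 64 + 2 = 4, so mfi_cmd_size would be 256
 * bytes per command: one 64-byte frame for the header plus three more
 * for the scatter/gather list.
 */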
if (sizeof(bus_addr_t) == 8) {
sc->mfi_sge_size = sizeof(struct mfi_sg64);
sc->mfi_flags |= MFI_FLAGS_SG64;
} else {
sc->mfi_sge_size = sizeof(struct mfi_sg32);
}
frames = (sc->mfi_sge_size * sc->mfi_max_sge - 1) / MFI_FRAME_SIZE + 2;
sc->mfi_cmd_size = frames * MFI_FRAME_SIZE;
framessz = sc->mfi_cmd_size * sc->mfi_max_fw_cmds;
if (bus_dma_tag_create( sc->mfi_parent_dmat, /* parent */
64, 0, /* algnmnt, boundary */
BUS_SPACE_MAXADDR_32BIT,/* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
framessz, /* maxsize */
1, /* nsegments */
framessz, /* maxsegsize */
0, /* flags */
NULL, NULL, /* lockfunc, lockarg */
&sc->mfi_frames_dmat)) {
device_printf(sc->mfi_dev, "Cannot allocate frame DMA tag\n");
return (ENOMEM);
}
if (bus_dmamem_alloc(sc->mfi_frames_dmat, (void **)&sc->mfi_frames,
BUS_DMA_NOWAIT, &sc->mfi_frames_dmamap)) {
device_printf(sc->mfi_dev, "Cannot allocate frames memory\n");
return (ENOMEM);
}
bzero(sc->mfi_frames, framessz);
bus_dmamap_load(sc->mfi_frames_dmat, sc->mfi_frames_dmamap,
sc->mfi_frames, framessz, mfi_addr32_cb, &sc->mfi_frames_busaddr,0);
/*
* Allocate DMA memory for the frame sense data. Keep them in the
* lower 4GB for efficiency
*/
sensesz = sc->mfi_max_fw_cmds * MFI_SENSE_LEN;
if (bus_dma_tag_create( sc->mfi_parent_dmat, /* parent */
4, 0, /* algnmnt, boundary */
BUS_SPACE_MAXADDR_32BIT,/* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
sensesz, /* maxsize */
1, /* nsegments */
sensesz, /* maxsegsize */
0, /* flags */
NULL, NULL, /* lockfunc, lockarg */
&sc->mfi_sense_dmat)) {
device_printf(sc->mfi_dev, "Cannot allocate sense DMA tag\n");
return (ENOMEM);
}
if (bus_dmamem_alloc(sc->mfi_sense_dmat, (void **)&sc->mfi_sense,
BUS_DMA_NOWAIT, &sc->mfi_sense_dmamap)) {
device_printf(sc->mfi_dev, "Cannot allocate sense memory\n");
return (ENOMEM);
}
bus_dmamap_load(sc->mfi_sense_dmat, sc->mfi_sense_dmamap,
sc->mfi_sense, sensesz, mfi_addr32_cb, &sc->mfi_sense_busaddr, 0);
if ((error = mfi_alloc_commands(sc)) != 0)
return (error);
if ((error = mfi_comms_init(sc)) != 0)
return (error);
if ((error = mfi_get_controller_info(sc)) != 0)
return (error);
mtx_lock(&sc->mfi_io_lock);
if ((error = mfi_aen_setup(sc, 0)) != 0) {
mtx_unlock(&sc->mfi_io_lock);
return (error);
}
mtx_unlock(&sc->mfi_io_lock);
/*
* Set up the interrupt handler. XXX This should happen in
* mfi_pci.c
*/
sc->mfi_irq_rid = 0;
if ((sc->mfi_irq = bus_alloc_resource_any(sc->mfi_dev, SYS_RES_IRQ,
&sc->mfi_irq_rid, RF_SHAREABLE | RF_ACTIVE)) == NULL) {
device_printf(sc->mfi_dev, "Cannot allocate interrupt\n");
return (EINVAL);
}
if (bus_setup_intr(sc->mfi_dev, sc->mfi_irq, INTR_MPSAFE|INTR_TYPE_BIO,
NULL, mfi_intr, sc, &sc->mfi_intr)) {
device_printf(sc->mfi_dev, "Cannot set up interrupt\n");
return (EINVAL);
}
/* Register a config hook to probe the bus for arrays */
sc->mfi_ich.ich_func = mfi_startup;
sc->mfi_ich.ich_arg = sc;
if (config_intrhook_establish(&sc->mfi_ich) != 0) {
device_printf(sc->mfi_dev, "Cannot establish configuration "
"hook\n");
return (EINVAL);
}
/*
* Register a shutdown handler.
*/
if ((sc->mfi_eh = EVENTHANDLER_REGISTER(shutdown_final, mfi_shutdown,
sc, SHUTDOWN_PRI_DEFAULT)) == NULL) {
device_printf(sc->mfi_dev, "Warning: shutdown event "
"registration failed\n");
}
/*
* Create the control device for doing management
*/
unit = device_get_unit(sc->mfi_dev);
sc->mfi_cdev = make_dev(&mfi_cdevsw, unit, UID_ROOT, GID_OPERATOR,
0640, "mfi%d", unit);
if (unit == 0)
make_dev_alias(sc->mfi_cdev, "megaraid_sas_ioctl_node");
if (sc->mfi_cdev != NULL)
sc->mfi_cdev->si_drv1 = sc;
SYSCTL_ADD_INT(device_get_sysctl_ctx(sc->mfi_dev),
SYSCTL_CHILDREN(device_get_sysctl_tree(sc->mfi_dev)),
OID_AUTO, "delete_busy_volumes", CTLFLAG_RW,
&sc->mfi_delete_busy_volumes, 0, "Allow removal of busy volumes");
SYSCTL_ADD_INT(device_get_sysctl_ctx(sc->mfi_dev),
SYSCTL_CHILDREN(device_get_sysctl_tree(sc->mfi_dev)),
OID_AUTO, "keep_deleted_volumes", CTLFLAG_RW,
&sc->mfi_keep_deleted_volumes, 0,
"Don't detach the mfid device for a busy volume that is deleted");
device_add_child(sc->mfi_dev, "mfip", -1);
bus_generic_attach(sc->mfi_dev);
/* Start the timeout watchdog */
callout_init(&sc->mfi_watchdog_callout, CALLOUT_MPSAFE);
callout_reset(&sc->mfi_watchdog_callout, MFI_CMD_TIMEOUT * hz,
mfi_timeout, sc);
return (0);
}
static int
mfi_alloc_commands(struct mfi_softc *sc)
{
struct mfi_command *cm;
int i, ncmds;
/*
* XXX Should we allocate all the commands up front, or allocate on
* demand later like 'aac' does?
*/
ncmds = MIN(mfi_max_cmds, sc->mfi_max_fw_cmds);
if (bootverbose)
device_printf(sc->mfi_dev, "Max fw cmds= %d, sizing driver "
"pool to %d\n", sc->mfi_max_fw_cmds, ncmds);
sc->mfi_commands = malloc(sizeof(struct mfi_command) * ncmds, M_MFIBUF,
M_WAITOK | M_ZERO);
for (i = 0; i < ncmds; i++) {
cm = &sc->mfi_commands[i];
cm->cm_frame = (union mfi_frame *)((uintptr_t)sc->mfi_frames +
sc->mfi_cmd_size * i);
cm->cm_frame_busaddr = sc->mfi_frames_busaddr +
sc->mfi_cmd_size * i;
cm->cm_frame->header.context = i;
cm->cm_sense = &sc->mfi_sense[i];
cm->cm_sense_busaddr= sc->mfi_sense_busaddr + MFI_SENSE_LEN * i;
cm->cm_sc = sc;
cm->cm_index = i;
if (bus_dmamap_create(sc->mfi_buffer_dmat, 0,
&cm->cm_dmamap) == 0)
mfi_release_command(cm);
else
break;
sc->mfi_total_cmds++;
}
return (0);
}
void
mfi_release_command(struct mfi_command *cm)
{
struct mfi_frame_header *hdr;
uint32_t *hdr_data;
/*
* Zero out the important fields of the frame, but make sure the
* context field is preserved. For efficiency, handle the fields
* as 32 bit words. Clear out the first S/G entry too for safety.
*/
hdr = &cm->cm_frame->header;
if (cm->cm_data != NULL && hdr->sg_count) {
cm->cm_sg->sg32[0].len = 0;
cm->cm_sg->sg32[0].addr = 0;
}
hdr_data = (uint32_t *)cm->cm_frame;
hdr_data[0] = 0; /* cmd, sense_len, cmd_status, scsi_status */
hdr_data[1] = 0; /* target_id, lun_id, cdb_len, sg_count */
hdr_data[4] = 0; /* flags, timeout */
hdr_data[5] = 0; /* data_len */
cm->cm_extra_frames = 0;
cm->cm_flags = 0;
cm->cm_complete = NULL;
cm->cm_private = NULL;
cm->cm_data = NULL;
cm->cm_sg = 0;
cm->cm_total_frame_size = 0;
mfi_enqueue_free(cm);
}
static int
mfi_dcmd_command(struct mfi_softc *sc, struct mfi_command **cmp, uint32_t opcode,
void **bufp, size_t bufsize)
{
struct mfi_command *cm;
struct mfi_dcmd_frame *dcmd;
void *buf = NULL;
mtx_assert(&sc->mfi_io_lock, MA_OWNED);
cm = mfi_dequeue_free(sc);
if (cm == NULL)
return (EBUSY);
if ((bufsize > 0) && (bufp != NULL)) {
if (*bufp == NULL) {
buf = malloc(bufsize, M_MFIBUF, M_NOWAIT|M_ZERO);
if (buf == NULL) {
mfi_release_command(cm);
return (ENOMEM);
}
*bufp = buf;
} else {
buf = *bufp;
}
}
dcmd = &cm->cm_frame->dcmd;
bzero(dcmd->mbox, MFI_MBOX_SIZE);
dcmd->header.cmd = MFI_CMD_DCMD;
dcmd->header.timeout = 0;
dcmd->header.flags = 0;
dcmd->header.data_len = bufsize;
dcmd->opcode = opcode;
cm->cm_sg = &dcmd->sgl;
cm->cm_total_frame_size = MFI_DCMD_FRAME_SIZE;
cm->cm_flags = 0;
cm->cm_data = buf;
cm->cm_private = buf;
cm->cm_len = bufsize;
*cmp = cm;
if ((bufp != NULL) && (*bufp == NULL) && (buf != NULL))
*bufp = buf;
return (0);
}
static int
mfi_comms_init(struct mfi_softc *sc)
{
struct mfi_command *cm;
struct mfi_init_frame *init;
struct mfi_init_qinfo *qinfo;
int error;
mtx_lock(&sc->mfi_io_lock);
if ((cm = mfi_dequeue_free(sc)) == NULL)
return (EBUSY);
/*
* Abuse the SG list area of the frame to hold the init_qinfo
* object;
*/
init = &cm->cm_frame->init;
qinfo = (struct mfi_init_qinfo *)((uintptr_t)init + MFI_FRAME_SIZE);
bzero(qinfo, sizeof(struct mfi_init_qinfo));
qinfo->rq_entries = sc->mfi_max_fw_cmds + 1;
qinfo->rq_addr_lo = sc->mfi_comms_busaddr +
offsetof(struct mfi_hwcomms, hw_reply_q);
qinfo->pi_addr_lo = sc->mfi_comms_busaddr +
offsetof(struct mfi_hwcomms, hw_pi);
qinfo->ci_addr_lo = sc->mfi_comms_busaddr +
offsetof(struct mfi_hwcomms, hw_ci);
init->header.cmd = MFI_CMD_INIT;
init->header.data_len = sizeof(struct mfi_init_qinfo);
init->qinfo_new_addr_lo = cm->cm_frame_busaddr + MFI_FRAME_SIZE;
cm->cm_data = NULL;
cm->cm_flags = MFI_CMD_POLLED;
if ((error = mfi_mapcmd(sc, cm)) != 0) {
device_printf(sc->mfi_dev, "failed to send init command\n");
mtx_unlock(&sc->mfi_io_lock);
return (error);
}
mfi_release_command(cm);
mtx_unlock(&sc->mfi_io_lock);
return (0);
}
static int
mfi_get_controller_info(struct mfi_softc *sc)
{
struct mfi_command *cm = NULL;
struct mfi_ctrl_info *ci = NULL;
uint32_t max_sectors_1, max_sectors_2;
int error;
mtx_lock(&sc->mfi_io_lock);
error = mfi_dcmd_command(sc, &cm, MFI_DCMD_CTRL_GETINFO,
(void **)&ci, sizeof(*ci));
if (error)
goto out;
cm->cm_flags = MFI_CMD_DATAIN | MFI_CMD_POLLED;
if ((error = mfi_mapcmd(sc, cm)) != 0) {
device_printf(sc->mfi_dev, "Failed to get controller info\n");
sc->mfi_max_io = (sc->mfi_max_sge - 1) * PAGE_SIZE /
MFI_SECTOR_LEN;
error = 0;
goto out;
}
bus_dmamap_sync(sc->mfi_buffer_dmat, cm->cm_dmamap,
BUS_DMASYNC_POSTREAD);
bus_dmamap_unload(sc->mfi_buffer_dmat, cm->cm_dmamap);
max_sectors_1 = (1 << ci->stripe_sz_ops.min) * ci->max_strips_per_io;
max_sectors_2 = ci->max_request_size;
sc->mfi_max_io = min(max_sectors_1, max_sectors_2);
out:
if (ci)
free(ci, M_MFIBUF);
if (cm)
mfi_release_command(cm);
mtx_unlock(&sc->mfi_io_lock);
return (error);
}
static int
mfi_get_log_state(struct mfi_softc *sc, struct mfi_evt_log_state **log_state)
{
struct mfi_command *cm = NULL;
int error;
error = mfi_dcmd_command(sc, &cm, MFI_DCMD_CTRL_EVENT_GETINFO,
(void **)log_state, sizeof(**log_state));
if (error)
goto out;
cm->cm_flags = MFI_CMD_DATAIN | MFI_CMD_POLLED;
if ((error = mfi_mapcmd(sc, cm)) != 0) {
device_printf(sc->mfi_dev, "Failed to get log state\n");
goto out;
}
bus_dmamap_sync(sc->mfi_buffer_dmat, cm->cm_dmamap,
BUS_DMASYNC_POSTREAD);
bus_dmamap_unload(sc->mfi_buffer_dmat, cm->cm_dmamap);
out:
if (cm)
mfi_release_command(cm);
return (error);
}
static int
mfi_aen_setup(struct mfi_softc *sc, uint32_t seq_start)
{
struct mfi_evt_log_state *log_state = NULL;
union mfi_evt class_locale;
int error = 0;
uint32_t seq;
class_locale.members.reserved = 0;
class_locale.members.locale = mfi_event_locale;
class_locale.members.evt_class = mfi_event_class;
if (seq_start == 0) {
error = mfi_get_log_state(sc, &log_state);
if (error) {
if (log_state)
free(log_state, M_MFIBUF);
return (error);
}
/*
* Walk through any events that fired since the last
* shutdown.
*/
mfi_parse_entries(sc, log_state->shutdown_seq_num,
log_state->newest_seq_num);
seq = log_state->newest_seq_num;
} else
seq = seq_start;
mfi_aen_register(sc, seq, class_locale.word);
free(log_state, M_MFIBUF);
return 0;
}
static int
mfi_wait_command(struct mfi_softc *sc, struct mfi_command *cm)
{
mtx_assert(&sc->mfi_io_lock, MA_OWNED);
cm->cm_complete = NULL;
/*
* MegaCli can issue a DCMD of 0. In this case do nothing
* and return 0 to it as status
*/
if (cm->cm_frame->dcmd.opcode == 0) {
cm->cm_frame->header.cmd_status = MFI_STAT_OK;
cm->cm_error = 0;
return (cm->cm_error);
}
mfi_enqueue_ready(cm);
mfi_startio(sc);
if ((cm->cm_flags & MFI_CMD_COMPLETED) == 0)
msleep(cm, &sc->mfi_io_lock, PRIBIO, "mfiwait", 0);
return (cm->cm_error);
}
void
mfi_free(struct mfi_softc *sc)
{
struct mfi_command *cm;
int i;
callout_drain(&sc->mfi_watchdog_callout);
if (sc->mfi_cdev != NULL)
destroy_dev(sc->mfi_cdev);
if (sc->mfi_total_cmds != 0) {
for (i = 0; i < sc->mfi_total_cmds; i++) {
cm = &sc->mfi_commands[i];
bus_dmamap_destroy(sc->mfi_buffer_dmat, cm->cm_dmamap);
}
free(sc->mfi_commands, M_MFIBUF);
}
if (sc->mfi_intr)
bus_teardown_intr(sc->mfi_dev, sc->mfi_irq, sc->mfi_intr);
if (sc->mfi_irq != NULL)
bus_release_resource(sc->mfi_dev, SYS_RES_IRQ, sc->mfi_irq_rid,
sc->mfi_irq);
if (sc->mfi_sense_busaddr != 0)
bus_dmamap_unload(sc->mfi_sense_dmat, sc->mfi_sense_dmamap);
if (sc->mfi_sense != NULL)
bus_dmamem_free(sc->mfi_sense_dmat, sc->mfi_sense,
sc->mfi_sense_dmamap);
if (sc->mfi_sense_dmat != NULL)
bus_dma_tag_destroy(sc->mfi_sense_dmat);
if (sc->mfi_frames_busaddr != 0)
bus_dmamap_unload(sc->mfi_frames_dmat, sc->mfi_frames_dmamap);
if (sc->mfi_frames != NULL)
bus_dmamem_free(sc->mfi_frames_dmat, sc->mfi_frames,
sc->mfi_frames_dmamap);
if (sc->mfi_frames_dmat != NULL)
bus_dma_tag_destroy(sc->mfi_frames_dmat);
if (sc->mfi_comms_busaddr != 0)
bus_dmamap_unload(sc->mfi_comms_dmat, sc->mfi_comms_dmamap);
if (sc->mfi_comms != NULL)
bus_dmamem_free(sc->mfi_comms_dmat, sc->mfi_comms,
sc->mfi_comms_dmamap);
if (sc->mfi_comms_dmat != NULL)
bus_dma_tag_destroy(sc->mfi_comms_dmat);
if (sc->mfi_buffer_dmat != NULL)
bus_dma_tag_destroy(sc->mfi_buffer_dmat);
if (sc->mfi_parent_dmat != NULL)
bus_dma_tag_destroy(sc->mfi_parent_dmat);
if (mtx_initialized(&sc->mfi_io_lock)) {
mtx_destroy(&sc->mfi_io_lock);
sx_destroy(&sc->mfi_config_lock);
}
return;
}
static void
mfi_startup(void *arg)
{
struct mfi_softc *sc;
sc = (struct mfi_softc *)arg;
config_intrhook_disestablish(&sc->mfi_ich);
sc->mfi_enable_intr(sc);
sx_xlock(&sc->mfi_config_lock);
mtx_lock(&sc->mfi_io_lock);
mfi_ldprobe(sc);
mtx_unlock(&sc->mfi_io_lock);
sx_xunlock(&sc->mfi_config_lock);
}
static void
mfi_intr(void *arg)
{
struct mfi_softc *sc;
struct mfi_command *cm;
uint32_t pi, ci, context;
sc = (struct mfi_softc *)arg;
if (sc->mfi_check_clear_intr(sc))
return;
pi = sc->mfi_comms->hw_pi;
ci = sc->mfi_comms->hw_ci;
mtx_lock(&sc->mfi_io_lock);
while (ci != pi) {
context = sc->mfi_comms->hw_reply_q[ci];
if (context < sc->mfi_max_fw_cmds) {
cm = &sc->mfi_commands[context];
mfi_remove_busy(cm);
cm->cm_error = 0;
mfi_complete(sc, cm);
}
if (++ci == (sc->mfi_max_fw_cmds + 1)) {
ci = 0;
}
}
sc->mfi_comms->hw_ci = ci;
/* Give deferred I/O a chance to run */
if (sc->mfi_flags & MFI_FLAGS_QFRZN)
sc->mfi_flags &= ~MFI_FLAGS_QFRZN;
mfi_startio(sc);
mtx_unlock(&sc->mfi_io_lock);
return;
}
int
mfi_shutdown(struct mfi_softc *sc)
{
struct mfi_dcmd_frame *dcmd;
struct mfi_command *cm;
int error;
mtx_lock(&sc->mfi_io_lock);
error = mfi_dcmd_command(sc, &cm, MFI_DCMD_CTRL_SHUTDOWN, NULL, 0);
if (error) {
mtx_unlock(&sc->mfi_io_lock);
return (error);
}
if (sc->mfi_aen_cm != NULL)
mfi_abort(sc, sc->mfi_aen_cm);
dcmd = &cm->cm_frame->dcmd;
dcmd->header.flags = MFI_FRAME_DIR_NONE;
cm->cm_flags = MFI_CMD_POLLED;
cm->cm_data = NULL;
if ((error = mfi_mapcmd(sc, cm)) != 0) {
device_printf(sc->mfi_dev, "Failed to shutdown controller\n");
}
mfi_release_command(cm);
mtx_unlock(&sc->mfi_io_lock);
return (error);
}
static void
mfi_ldprobe(struct mfi_softc *sc)
{
struct mfi_frame_header *hdr;
struct mfi_command *cm = NULL;
struct mfi_ld_list *list = NULL;
struct mfi_disk *ld;
int error, i;
sx_assert(&sc->mfi_config_lock, SA_XLOCKED);
mtx_assert(&sc->mfi_io_lock, MA_OWNED);
error = mfi_dcmd_command(sc, &cm, MFI_DCMD_LD_GET_LIST,
(void **)&list, sizeof(*list));
if (error)
goto out;
cm->cm_flags = MFI_CMD_DATAIN;
if (mfi_wait_command(sc, cm) != 0) {
device_printf(sc->mfi_dev, "Failed to get device listing\n");
goto out;
}
hdr = &cm->cm_frame->header;
if (hdr->cmd_status != MFI_STAT_OK) {
device_printf(sc->mfi_dev, "MFI_DCMD_LD_GET_LIST failed %x\n",
hdr->cmd_status);
goto out;
}
for (i = 0; i < list->ld_count; i++) {
TAILQ_FOREACH(ld, &sc->mfi_ld_tqh, ld_link) {
if (ld->ld_id == list->ld_list[i].ld.v.target_id)
goto skip_add;
}
mfi_add_ld(sc, list->ld_list[i].ld.v.target_id);
skip_add:;
}
out:
if (list)
free(list, M_MFIBUF);
if (cm)
mfi_release_command(cm);
return;
}
/*
* The timestamp is the number of seconds since 00:00 Jan 1, 2000. If
* bits 24-31 are all set, then it is the number of seconds since
* boot.
*/
static const char *
format_timestamp(uint32_t timestamp)
{
static char buffer[32];
if ((timestamp & 0xff000000) == 0xff000000)
snprintf(buffer, sizeof(buffer), "boot + %us", timestamp &
0x00ffffff);
else
snprintf(buffer, sizeof(buffer), "%us", timestamp);
return (buffer);
}
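/*
 * Illustrative examples: format_timestamp(0xff00003c) yields
 * "boot + 60s", while format_timestamp(3600) yields "3600s".
 */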
static const char *
format_class(int8_t class)
{
static char buffer[6];
switch (class) {
case MFI_EVT_CLASS_DEBUG:
return ("debug");
case MFI_EVT_CLASS_PROGRESS:
return ("progress");
case MFI_EVT_CLASS_INFO:
return ("info");
case MFI_EVT_CLASS_WARNING:
return ("WARN");
case MFI_EVT_CLASS_CRITICAL:
return ("CRIT");
case MFI_EVT_CLASS_FATAL:
return ("FATAL");
case MFI_EVT_CLASS_DEAD:
return ("DEAD");
default:
snprintf(buffer, sizeof(buffer), "%d", class);
return (buffer);
}
}
static void
mfi_decode_evt(struct mfi_softc *sc, struct mfi_evt_detail *detail)
{
device_printf(sc->mfi_dev, "%d (%s/0x%04x/%s) - %s\n", detail->seq,
format_timestamp(detail->time), detail->evt_class.members.locale,
format_class(detail->evt_class.members.evt_class), detail->description);
}
static int
mfi_aen_register(struct mfi_softc *sc, int seq, int locale)
{
struct mfi_command *cm;
struct mfi_dcmd_frame *dcmd;
union mfi_evt current_aen, prior_aen;
struct mfi_evt_detail *ed = NULL;
int error = 0;
current_aen.word = locale;
if (sc->mfi_aen_cm != NULL) {
prior_aen.word =
((uint32_t *)&sc->mfi_aen_cm->cm_frame->dcmd.mbox)[1];
if (prior_aen.members.evt_class <= current_aen.members.evt_class &&
!((prior_aen.members.locale & current_aen.members.locale)
^current_aen.members.locale)) {
return (0);
} else {
prior_aen.members.locale |= current_aen.members.locale;
if (prior_aen.members.evt_class
< current_aen.members.evt_class)
current_aen.members.evt_class =
prior_aen.members.evt_class;
mfi_abort(sc, sc->mfi_aen_cm);
}
}
error = mfi_dcmd_command(sc, &cm, MFI_DCMD_CTRL_EVENT_WAIT,
(void **)&ed, sizeof(*ed));
if (error) {
goto out;
}
dcmd = &cm->cm_frame->dcmd;
((uint32_t *)&dcmd->mbox)[0] = seq;
((uint32_t *)&dcmd->mbox)[1] = locale;
cm->cm_flags = MFI_CMD_DATAIN;
cm->cm_complete = mfi_aen_complete;
sc->mfi_aen_cm = cm;
mfi_enqueue_ready(cm);
mfi_startio(sc);
out:
return (error);
}
static void
mfi_aen_complete(struct mfi_command *cm)
{
struct mfi_frame_header *hdr;
struct mfi_softc *sc;
struct mfi_evt_detail *detail;
struct mfi_aen *mfi_aen_entry, *tmp;
int seq = 0, aborted = 0;
sc = cm->cm_sc;
hdr = &cm->cm_frame->header;
if (sc->mfi_aen_cm == NULL)
return;
if (sc->mfi_aen_cm->cm_aen_abort ||
hdr->cmd_status == MFI_STAT_INVALID_STATUS) {
sc->mfi_aen_cm->cm_aen_abort = 0;
aborted = 1;
} else {
sc->mfi_aen_triggered = 1;
if (sc->mfi_poll_waiting) {
sc->mfi_poll_waiting = 0;
selwakeup(&sc->mfi_select);
}
detail = cm->cm_data;
/*
* XXX If this function is too expensive or is recursive, then
* events should be put onto a queue and processed later.
*/
mfi_decode_evt(sc, detail);
seq = detail->seq + 1;
TAILQ_FOREACH_SAFE(mfi_aen_entry, &sc->mfi_aen_pids, aen_link, tmp) {
TAILQ_REMOVE(&sc->mfi_aen_pids, mfi_aen_entry,
aen_link);
PROC_LOCK(mfi_aen_entry->p);
- psignal(mfi_aen_entry->p, SIGIO);
+ kern_psignal(mfi_aen_entry->p, SIGIO);
PROC_UNLOCK(mfi_aen_entry->p);
free(mfi_aen_entry, M_MFIBUF);
}
}
free(cm->cm_data, M_MFIBUF);
sc->mfi_aen_cm = NULL;
wakeup(&sc->mfi_aen_cm);
mfi_release_command(cm);
/* set it up again so the driver can catch more events */
if (!aborted) {
mfi_aen_setup(sc, seq);
}
}
#define MAX_EVENTS 15
static int
mfi_parse_entries(struct mfi_softc *sc, int start_seq, int stop_seq)
{
struct mfi_command *cm;
struct mfi_dcmd_frame *dcmd;
struct mfi_evt_list *el;
union mfi_evt class_locale;
int error, i, seq, size;
class_locale.members.reserved = 0;
class_locale.members.locale = mfi_event_locale;
class_locale.members.evt_class = mfi_event_class;
size = sizeof(struct mfi_evt_list) + sizeof(struct mfi_evt_detail)
* (MAX_EVENTS - 1);
el = malloc(size, M_MFIBUF, M_NOWAIT | M_ZERO);
if (el == NULL)
return (ENOMEM);
for (seq = start_seq;;) {
if ((cm = mfi_dequeue_free(sc)) == NULL) {
free(el, M_MFIBUF);
return (EBUSY);
}
dcmd = &cm->cm_frame->dcmd;
bzero(dcmd->mbox, MFI_MBOX_SIZE);
dcmd->header.cmd = MFI_CMD_DCMD;
dcmd->header.timeout = 0;
dcmd->header.data_len = size;
dcmd->opcode = MFI_DCMD_CTRL_EVENT_GET;
((uint32_t *)&dcmd->mbox)[0] = seq;
((uint32_t *)&dcmd->mbox)[1] = class_locale.word;
cm->cm_sg = &dcmd->sgl;
cm->cm_total_frame_size = MFI_DCMD_FRAME_SIZE;
cm->cm_flags = MFI_CMD_DATAIN | MFI_CMD_POLLED;
cm->cm_data = el;
cm->cm_len = size;
if ((error = mfi_mapcmd(sc, cm)) != 0) {
device_printf(sc->mfi_dev,
"Failed to get controller entries\n");
mfi_release_command(cm);
break;
}
bus_dmamap_sync(sc->mfi_buffer_dmat, cm->cm_dmamap,
BUS_DMASYNC_POSTREAD);
bus_dmamap_unload(sc->mfi_buffer_dmat, cm->cm_dmamap);
if (dcmd->header.cmd_status == MFI_STAT_NOT_FOUND) {
mfi_release_command(cm);
break;
}
if (dcmd->header.cmd_status != MFI_STAT_OK) {
device_printf(sc->mfi_dev,
"Error %d fetching controller entries\n",
dcmd->header.cmd_status);
mfi_release_command(cm);
break;
}
mfi_release_command(cm);
for (i = 0; i < el->count; i++) {
/*
* If this event is newer than 'stop_seq' then
* break out of the loop. Note that the log
* is a circular buffer so we have to handle
* the case that our stop point is earlier in
* the buffer than our start point.
*/
if (el->event[i].seq >= stop_seq) {
if (start_seq <= stop_seq)
break;
else if (el->event[i].seq < start_seq)
break;
}
mfi_decode_evt(sc, &el->event[i]);
}
seq = el->event[el->count - 1].seq + 1;
}
free(el, M_MFIBUF);
return (0);
}
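/*
 * A minimal sketch (hypothetical helper, not driver code) of the stop
 * condition used in the loop above: the event log is circular, so when
 * start_seq > stop_seq the window wraps, and an event at or past
 * stop_seq only ends the scan once its sequence number has also fallen
 * back below start_seq.
 */
static __inline int
mfi_evt_past_stop(uint32_t seq, uint32_t start_seq, uint32_t stop_seq)
{

        if (seq < stop_seq)
                return (0);
        if (start_seq <= stop_seq)      /* window does not wrap */
                return (1);
        return (seq < start_seq);       /* wrapped: >= start_seq is still inside */
}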
static int
mfi_add_ld(struct mfi_softc *sc, int id)
{
struct mfi_command *cm;
struct mfi_dcmd_frame *dcmd = NULL;
struct mfi_ld_info *ld_info = NULL;
int error;
mtx_assert(&sc->mfi_io_lock, MA_OWNED);
error = mfi_dcmd_command(sc, &cm, MFI_DCMD_LD_GET_INFO,
(void **)&ld_info, sizeof(*ld_info));
if (error) {
device_printf(sc->mfi_dev,
"Failed to allocate for MFI_DCMD_LD_GET_INFO %d\n", error);
if (ld_info)
free(ld_info, M_MFIBUF);
return (error);
}
cm->cm_flags = MFI_CMD_DATAIN;
dcmd = &cm->cm_frame->dcmd;
dcmd->mbox[0] = id;
if (mfi_wait_command(sc, cm) != 0) {
device_printf(sc->mfi_dev,
"Failed to get logical drive: %d\n", id);
free(ld_info, M_MFIBUF);
return (0);
}
mfi_add_ld_complete(cm);
return (0);
}
static void
mfi_add_ld_complete(struct mfi_command *cm)
{
struct mfi_frame_header *hdr;
struct mfi_ld_info *ld_info;
struct mfi_softc *sc;
device_t child;
sc = cm->cm_sc;
hdr = &cm->cm_frame->header;
ld_info = cm->cm_private;
if (hdr->cmd_status != MFI_STAT_OK) {
free(ld_info, M_MFIBUF);
mfi_release_command(cm);
return;
}
mfi_release_command(cm);
mtx_unlock(&sc->mfi_io_lock);
mtx_lock(&Giant);
if ((child = device_add_child(sc->mfi_dev, "mfid", -1)) == NULL) {
device_printf(sc->mfi_dev, "Failed to add logical disk\n");
free(ld_info, M_MFIBUF);
mtx_unlock(&Giant);
mtx_lock(&sc->mfi_io_lock);
return;
}
device_set_ivars(child, ld_info);
device_set_desc(child, "MFI Logical Disk");
bus_generic_attach(sc->mfi_dev);
mtx_unlock(&Giant);
mtx_lock(&sc->mfi_io_lock);
}
static struct mfi_command *
mfi_bio_command(struct mfi_softc *sc)
{
struct mfi_io_frame *io;
struct mfi_command *cm;
struct bio *bio;
int flags, blkcount;
if ((cm = mfi_dequeue_free(sc)) == NULL)
return (NULL);
if ((bio = mfi_dequeue_bio(sc)) == NULL) {
mfi_release_command(cm);
return (NULL);
}
io = &cm->cm_frame->io;
switch (bio->bio_cmd & 0x03) {
case BIO_READ:
io->header.cmd = MFI_CMD_LD_READ;
flags = MFI_CMD_DATAIN;
break;
case BIO_WRITE:
io->header.cmd = MFI_CMD_LD_WRITE;
flags = MFI_CMD_DATAOUT;
break;
default:
panic("Invalid bio command");
}
/* Cheat with the sector length to avoid a non-constant division */
blkcount = (bio->bio_bcount + MFI_SECTOR_LEN - 1) / MFI_SECTOR_LEN;
io->header.target_id = (uintptr_t)bio->bio_driver1;
io->header.timeout = 0;
io->header.flags = 0;
io->header.sense_len = MFI_SENSE_LEN;
io->header.data_len = blkcount;
io->sense_addr_lo = cm->cm_sense_busaddr;
io->sense_addr_hi = 0;
io->lba_hi = (bio->bio_pblkno & 0xffffffff00000000) >> 32;
io->lba_lo = bio->bio_pblkno & 0xffffffff;
cm->cm_complete = mfi_bio_complete;
cm->cm_private = bio;
cm->cm_data = bio->bio_data;
cm->cm_len = bio->bio_bcount;
cm->cm_sg = &io->sgl;
cm->cm_total_frame_size = MFI_IO_FRAME_SIZE;
cm->cm_flags = flags;
return (cm);
}
static void
mfi_bio_complete(struct mfi_command *cm)
{
struct bio *bio;
struct mfi_frame_header *hdr;
struct mfi_softc *sc;
bio = cm->cm_private;
hdr = &cm->cm_frame->header;
sc = cm->cm_sc;
if ((hdr->cmd_status != MFI_STAT_OK) || (hdr->scsi_status != 0)) {
bio->bio_flags |= BIO_ERROR;
bio->bio_error = EIO;
device_printf(sc->mfi_dev, "I/O error, status= %d "
"scsi_status= %d\n", hdr->cmd_status, hdr->scsi_status);
mfi_print_sense(cm->cm_sc, cm->cm_sense);
} else if (cm->cm_error != 0) {
bio->bio_flags |= BIO_ERROR;
}
mfi_release_command(cm);
mfi_disk_complete(bio);
}
void
mfi_startio(struct mfi_softc *sc)
{
struct mfi_command *cm;
struct ccb_hdr *ccbh;
for (;;) {
/* Don't bother if we're short on resources */
if (sc->mfi_flags & MFI_FLAGS_QFRZN)
break;
/* Try a command that has already been prepared */
cm = mfi_dequeue_ready(sc);
if (cm == NULL) {
if ((ccbh = TAILQ_FIRST(&sc->mfi_cam_ccbq)) != NULL)
cm = sc->mfi_cam_start(ccbh);
}
/* Nope, so look for work on the bioq */
if (cm == NULL)
cm = mfi_bio_command(sc);
/* No work available, so exit */
if (cm == NULL)
break;
/* Send the command to the controller */
if (mfi_mapcmd(sc, cm) != 0) {
mfi_requeue_ready(cm);
break;
}
}
}
static int
mfi_mapcmd(struct mfi_softc *sc, struct mfi_command *cm)
{
int error, polled;
mtx_assert(&sc->mfi_io_lock, MA_OWNED);
if (cm->cm_data != NULL) {
polled = (cm->cm_flags & MFI_CMD_POLLED) ? BUS_DMA_NOWAIT : 0;
error = bus_dmamap_load(sc->mfi_buffer_dmat, cm->cm_dmamap,
cm->cm_data, cm->cm_len, mfi_data_cb, cm, polled);
if (error == EINPROGRESS) {
sc->mfi_flags |= MFI_FLAGS_QFRZN;
return (0);
}
} else {
error = mfi_send_frame(sc, cm);
}
return (error);
}
static void
mfi_data_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
{
struct mfi_frame_header *hdr;
struct mfi_command *cm;
union mfi_sgl *sgl;
struct mfi_softc *sc;
int i, dir;
cm = (struct mfi_command *)arg;
sc = cm->cm_sc;
hdr = &cm->cm_frame->header;
sgl = cm->cm_sg;
if (error) {
printf("error %d in callback\n", error);
cm->cm_error = error;
mfi_complete(sc, cm);
return;
}
if ((sc->mfi_flags & MFI_FLAGS_SG64) == 0) {
for (i = 0; i < nsegs; i++) {
sgl->sg32[i].addr = segs[i].ds_addr;
sgl->sg32[i].len = segs[i].ds_len;
}
} else {
for (i = 0; i < nsegs; i++) {
sgl->sg64[i].addr = segs[i].ds_addr;
sgl->sg64[i].len = segs[i].ds_len;
}
hdr->flags |= MFI_FRAME_SGL64;
}
hdr->sg_count = nsegs;
dir = 0;
if (cm->cm_flags & MFI_CMD_DATAIN) {
dir |= BUS_DMASYNC_PREREAD;
hdr->flags |= MFI_FRAME_DIR_READ;
}
if (cm->cm_flags & MFI_CMD_DATAOUT) {
dir |= BUS_DMASYNC_PREWRITE;
hdr->flags |= MFI_FRAME_DIR_WRITE;
}
bus_dmamap_sync(sc->mfi_buffer_dmat, cm->cm_dmamap, dir);
cm->cm_flags |= MFI_CMD_MAPPED;
/*
* Instead of calculating the total number of frames in the
* compound frame, it's already assumed that there will be at
* least 1 frame, so don't compensate for the modulo of the
* following division.
*/
cm->cm_total_frame_size += (sc->mfi_sge_size * nsegs);
cm->cm_extra_frames = (cm->cm_total_frame_size - 1) / MFI_FRAME_SIZE;
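/*
 * Example of the frame math above (illustrative; exact structure sizes
 * may differ): a 40-byte I/O frame plus four 8-byte 32-bit SG entries
 * gives cm_total_frame_size = 72, so cm_extra_frames = (72 - 1) / 64 =
 * 1 extra 64-byte frame beyond the first.
 */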
mfi_send_frame(sc, cm);
return;
}
static int
mfi_send_frame(struct mfi_softc *sc, struct mfi_command *cm)
{
struct mfi_frame_header *hdr;
int tm = MFI_POLL_TIMEOUT_SECS * 1000;
hdr = &cm->cm_frame->header;
if ((cm->cm_flags & MFI_CMD_POLLED) == 0) {
cm->cm_timestamp = time_uptime;
mfi_enqueue_busy(cm);
} else {
hdr->cmd_status = MFI_STAT_INVALID_STATUS;
hdr->flags |= MFI_FRAME_DONT_POST_IN_REPLY_QUEUE;
}
/*
* The bus address of the command is aligned on a 64 byte boundary,
* leaving the least 6 bits as zero. For whatever reason, the
* hardware wants the address shifted right by three, leaving just
* 3 zero bits. These three bits are then used as a prefetching
* hint for the hardware to predict how many frames need to be
* fetched across the bus. If a command has more than 8 frames
* then the 3 bits are set to 0x7 and the firmware uses other
* information in the command to determine the total amount to fetch.
* However, FreeBSD doesn't support I/O larger than 128K, so 8 frames
* is enough for both 32bit and 64bit systems.
*/
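/*
 * Sketch of the encoding described above (illustrative only): a frame
 * at bus address 0x1000 would be handed to the hardware as
 * 0x1000 >> 3 = 0x200, with the low 3 bits carrying
 * min(cm_extra_frames, 7) as the prefetch hint.
 */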
if (cm->cm_extra_frames > 7)
cm->cm_extra_frames = 7;
sc->mfi_issue_cmd(sc,cm->cm_frame_busaddr,cm->cm_extra_frames);
if ((cm->cm_flags & MFI_CMD_POLLED) == 0)
return (0);
/* This is a polled command, so busy-wait for it to complete. */
while (hdr->cmd_status == MFI_STAT_INVALID_STATUS) {
DELAY(1000);
tm -= 1;
if (tm <= 0)
break;
}
if (hdr->cmd_status == MFI_STAT_INVALID_STATUS) {
device_printf(sc->mfi_dev, "Frame %p timed out "
"command 0x%X\n", hdr, cm->cm_frame->dcmd.opcode);
return (ETIMEDOUT);
}
return (0);
}
static void
mfi_complete(struct mfi_softc *sc, struct mfi_command *cm)
{
int dir;
if ((cm->cm_flags & MFI_CMD_MAPPED) != 0) {
dir = 0;
if (cm->cm_flags & MFI_CMD_DATAIN)
dir |= BUS_DMASYNC_POSTREAD;
if (cm->cm_flags & MFI_CMD_DATAOUT)
dir |= BUS_DMASYNC_POSTWRITE;
bus_dmamap_sync(sc->mfi_buffer_dmat, cm->cm_dmamap, dir);
bus_dmamap_unload(sc->mfi_buffer_dmat, cm->cm_dmamap);
cm->cm_flags &= ~MFI_CMD_MAPPED;
}
cm->cm_flags |= MFI_CMD_COMPLETED;
if (cm->cm_complete != NULL)
cm->cm_complete(cm);
else
wakeup(cm);
}
static int
mfi_abort(struct mfi_softc *sc, struct mfi_command *cm_abort)
{
struct mfi_command *cm;
struct mfi_abort_frame *abort;
int i = 0;
mtx_assert(&sc->mfi_io_lock, MA_OWNED);
if ((cm = mfi_dequeue_free(sc)) == NULL) {
return (EBUSY);
}
abort = &cm->cm_frame->abort;
abort->header.cmd = MFI_CMD_ABORT;
abort->header.flags = 0;
abort->abort_context = cm_abort->cm_frame->header.context;
abort->abort_mfi_addr_lo = cm_abort->cm_frame_busaddr;
abort->abort_mfi_addr_hi = 0;
cm->cm_data = NULL;
cm->cm_flags = MFI_CMD_POLLED;
sc->mfi_aen_cm->cm_aen_abort = 1;
mfi_mapcmd(sc, cm);
mfi_release_command(cm);
while (i < 5 && sc->mfi_aen_cm != NULL) {
msleep(&sc->mfi_aen_cm, &sc->mfi_io_lock, 0, "mfiabort", 5 * hz);
i++;
}
return (0);
}
int
mfi_dump_blocks(struct mfi_softc *sc, int id, uint64_t lba, void *virt, int len)
{
struct mfi_command *cm;
struct mfi_io_frame *io;
int error;
if ((cm = mfi_dequeue_free(sc)) == NULL)
return (EBUSY);
io = &cm->cm_frame->io;
io->header.cmd = MFI_CMD_LD_WRITE;
io->header.target_id = id;
io->header.timeout = 0;
io->header.flags = 0;
io->header.sense_len = MFI_SENSE_LEN;
io->header.data_len = (len + MFI_SECTOR_LEN - 1) / MFI_SECTOR_LEN;
io->sense_addr_lo = cm->cm_sense_busaddr;
io->sense_addr_hi = 0;
io->lba_hi = (lba & 0xffffffff00000000) >> 32;
io->lba_lo = lba & 0xffffffff;
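/*
 * Illustrative split of the 64-bit LBA: for lba = 0x123456789,
 * lba_hi becomes 0x1 and lba_lo becomes 0x23456789.
 */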
cm->cm_data = virt;
cm->cm_len = len;
cm->cm_sg = &io->sgl;
cm->cm_total_frame_size = MFI_IO_FRAME_SIZE;
cm->cm_flags = MFI_CMD_POLLED | MFI_CMD_DATAOUT;
error = mfi_mapcmd(sc, cm);
bus_dmamap_sync(sc->mfi_buffer_dmat, cm->cm_dmamap,
BUS_DMASYNC_POSTWRITE);
bus_dmamap_unload(sc->mfi_buffer_dmat, cm->cm_dmamap);
mfi_release_command(cm);
return (error);
}
static int
mfi_open(struct cdev *dev, int flags, int fmt, struct thread *td)
{
struct mfi_softc *sc;
int error;
sc = dev->si_drv1;
mtx_lock(&sc->mfi_io_lock);
if (sc->mfi_detaching)
error = ENXIO;
else {
sc->mfi_flags |= MFI_FLAGS_OPEN;
error = 0;
}
mtx_unlock(&sc->mfi_io_lock);
return (error);
}
static int
mfi_close(struct cdev *dev, int flags, int fmt, struct thread *td)
{
struct mfi_softc *sc;
struct mfi_aen *mfi_aen_entry, *tmp;
sc = dev->si_drv1;
mtx_lock(&sc->mfi_io_lock);
sc->mfi_flags &= ~MFI_FLAGS_OPEN;
TAILQ_FOREACH_SAFE(mfi_aen_entry, &sc->mfi_aen_pids, aen_link, tmp) {
if (mfi_aen_entry->p == curproc) {
TAILQ_REMOVE(&sc->mfi_aen_pids, mfi_aen_entry,
aen_link);
free(mfi_aen_entry, M_MFIBUF);
}
}
mtx_unlock(&sc->mfi_io_lock);
return (0);
}
static int
mfi_config_lock(struct mfi_softc *sc, uint32_t opcode)
{
switch (opcode) {
case MFI_DCMD_LD_DELETE:
case MFI_DCMD_CFG_ADD:
case MFI_DCMD_CFG_CLEAR:
sx_xlock(&sc->mfi_config_lock);
return (1);
default:
return (0);
}
}
static void
mfi_config_unlock(struct mfi_softc *sc, int locked)
{
if (locked)
sx_xunlock(&sc->mfi_config_lock);
}
/* Perform pre-issue checks on commands from userland and possibly veto them. */
static int
mfi_check_command_pre(struct mfi_softc *sc, struct mfi_command *cm)
{
struct mfi_disk *ld, *ld2;
int error;
mtx_assert(&sc->mfi_io_lock, MA_OWNED);
error = 0;
switch (cm->cm_frame->dcmd.opcode) {
case MFI_DCMD_LD_DELETE:
TAILQ_FOREACH(ld, &sc->mfi_ld_tqh, ld_link) {
if (ld->ld_id == cm->cm_frame->dcmd.mbox[0])
break;
}
if (ld == NULL)
error = ENOENT;
else
error = mfi_disk_disable(ld);
break;
case MFI_DCMD_CFG_CLEAR:
TAILQ_FOREACH(ld, &sc->mfi_ld_tqh, ld_link) {
error = mfi_disk_disable(ld);
if (error)
break;
}
if (error) {
TAILQ_FOREACH(ld2, &sc->mfi_ld_tqh, ld_link) {
if (ld2 == ld)
break;
mfi_disk_enable(ld2);
}
}
break;
default:
break;
}
return (error);
}
/* Perform post-issue checks on commands from userland. */
static void
mfi_check_command_post(struct mfi_softc *sc, struct mfi_command *cm)
{
struct mfi_disk *ld, *ldn;
switch (cm->cm_frame->dcmd.opcode) {
case MFI_DCMD_LD_DELETE:
TAILQ_FOREACH(ld, &sc->mfi_ld_tqh, ld_link) {
if (ld->ld_id == cm->cm_frame->dcmd.mbox[0])
break;
}
KASSERT(ld != NULL, ("volume disappeared"));
if (cm->cm_frame->header.cmd_status == MFI_STAT_OK) {
mtx_unlock(&sc->mfi_io_lock);
mtx_lock(&Giant);
device_delete_child(sc->mfi_dev, ld->ld_dev);
mtx_unlock(&Giant);
mtx_lock(&sc->mfi_io_lock);
} else
mfi_disk_enable(ld);
break;
case MFI_DCMD_CFG_CLEAR:
if (cm->cm_frame->header.cmd_status == MFI_STAT_OK) {
mtx_unlock(&sc->mfi_io_lock);
mtx_lock(&Giant);
TAILQ_FOREACH_SAFE(ld, &sc->mfi_ld_tqh, ld_link, ldn) {
device_delete_child(sc->mfi_dev, ld->ld_dev);
}
mtx_unlock(&Giant);
mtx_lock(&sc->mfi_io_lock);
} else {
TAILQ_FOREACH(ld, &sc->mfi_ld_tqh, ld_link)
mfi_disk_enable(ld);
}
break;
case MFI_DCMD_CFG_ADD:
mfi_ldprobe(sc);
break;
case MFI_DCMD_CFG_FOREIGN_IMPORT:
mfi_ldprobe(sc);
break;
}
}
static int
mfi_user_command(struct mfi_softc *sc, struct mfi_ioc_passthru *ioc)
{
struct mfi_command *cm;
struct mfi_dcmd_frame *dcmd;
void *ioc_buf = NULL;
uint32_t context;
int error = 0, locked;
if (ioc->buf_size > 0) {
ioc_buf = malloc(ioc->buf_size, M_MFIBUF, M_WAITOK);
if (ioc_buf == NULL) {
return (ENOMEM);
}
error = copyin(ioc->buf, ioc_buf, ioc->buf_size);
if (error) {
device_printf(sc->mfi_dev, "failed to copyin\n");
free(ioc_buf, M_MFIBUF);
return (error);
}
}
locked = mfi_config_lock(sc, ioc->ioc_frame.opcode);
mtx_lock(&sc->mfi_io_lock);
while ((cm = mfi_dequeue_free(sc)) == NULL)
msleep(mfi_user_command, &sc->mfi_io_lock, 0, "mfiioc", hz);
/* Save context for later */
context = cm->cm_frame->header.context;
dcmd = &cm->cm_frame->dcmd;
bcopy(&ioc->ioc_frame, dcmd, sizeof(struct mfi_dcmd_frame));
cm->cm_sg = &dcmd->sgl;
cm->cm_total_frame_size = MFI_DCMD_FRAME_SIZE;
cm->cm_data = ioc_buf;
cm->cm_len = ioc->buf_size;
/* restore context */
cm->cm_frame->header.context = context;
/* Cheat since we don't know if we're writing or reading */
cm->cm_flags = MFI_CMD_DATAIN | MFI_CMD_DATAOUT;
error = mfi_check_command_pre(sc, cm);
if (error)
goto out;
error = mfi_wait_command(sc, cm);
if (error) {
device_printf(sc->mfi_dev, "ioctl failed %d\n", error);
goto out;
}
bcopy(dcmd, &ioc->ioc_frame, sizeof(struct mfi_dcmd_frame));
mfi_check_command_post(sc, cm);
out:
mfi_release_command(cm);
mtx_unlock(&sc->mfi_io_lock);
mfi_config_unlock(sc, locked);
if (ioc->buf_size > 0)
error = copyout(ioc_buf, ioc->buf, ioc->buf_size);
if (ioc_buf)
free(ioc_buf, M_MFIBUF);
return (error);
}
#ifdef __amd64__
#define PTRIN(p) ((void *)(uintptr_t)(p))
#else
#define PTRIN(p) (p)
#endif
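/*
 * Illustrative use of PTRIN (the address is hypothetical): on amd64 a
 * 32-bit iov_base such as 0x0804f000 from a 32-bit caller is widened
 * to a kernel void * via (void *)(uintptr_t)0x0804f000; on other
 * platforms the pointer is used as-is.
 */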
static int
mfi_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int flag, struct thread *td)
{
struct mfi_softc *sc;
union mfi_statrequest *ms;
struct mfi_ioc_packet *ioc;
#ifdef __amd64__
struct mfi_ioc_packet32 *ioc32;
#endif
struct mfi_ioc_aen *aen;
struct mfi_command *cm = NULL;
uint32_t context;
union mfi_sense_ptr sense_ptr;
uint8_t *data = NULL, *temp;
int i;
struct mfi_ioc_passthru *iop = (struct mfi_ioc_passthru *)arg;
#ifdef __amd64__
struct mfi_ioc_passthru32 *iop32 = (struct mfi_ioc_passthru32 *)arg;
struct mfi_ioc_passthru iop_swab;
#endif
int error, locked;
sc = dev->si_drv1;
error = 0;
switch (cmd) {
case MFIIO_STATS:
ms = (union mfi_statrequest *)arg;
switch (ms->ms_item) {
case MFIQ_FREE:
case MFIQ_BIO:
case MFIQ_READY:
case MFIQ_BUSY:
bcopy(&sc->mfi_qstat[ms->ms_item], &ms->ms_qstat,
sizeof(struct mfi_qstat));
break;
default:
error = ENOIOCTL;
break;
}
break;
case MFIIO_QUERY_DISK:
{
struct mfi_query_disk *qd;
struct mfi_disk *ld;
qd = (struct mfi_query_disk *)arg;
mtx_lock(&sc->mfi_io_lock);
TAILQ_FOREACH(ld, &sc->mfi_ld_tqh, ld_link) {
if (ld->ld_id == qd->array_id)
break;
}
if (ld == NULL) {
qd->present = 0;
mtx_unlock(&sc->mfi_io_lock);
return (0);
}
qd->present = 1;
if (ld->ld_flags & MFI_DISK_FLAGS_OPEN)
qd->open = 1;
bzero(qd->devname, SPECNAMELEN + 1);
snprintf(qd->devname, SPECNAMELEN, "mfid%d", ld->ld_unit);
mtx_unlock(&sc->mfi_io_lock);
break;
}
case MFI_CMD:
#ifdef __amd64__
case MFI_CMD32:
#endif
{
devclass_t devclass;
ioc = (struct mfi_ioc_packet *)arg;
int adapter;
adapter = ioc->mfi_adapter_no;
if (device_get_unit(sc->mfi_dev) == 0 && adapter != 0) {
devclass = devclass_find("mfi");
sc = devclass_get_softc(devclass, adapter);
}
mtx_lock(&sc->mfi_io_lock);
if ((cm = mfi_dequeue_free(sc)) == NULL) {
mtx_unlock(&sc->mfi_io_lock);
return (EBUSY);
}
mtx_unlock(&sc->mfi_io_lock);
locked = 0;
/*
* save off original context since copying from user
* will clobber some data
*/
context = cm->cm_frame->header.context;
bcopy(ioc->mfi_frame.raw, cm->cm_frame,
2 * MFI_DCMD_FRAME_SIZE); /* this isn't quite right */
cm->cm_total_frame_size = (sizeof(union mfi_sgl)
* ioc->mfi_sge_count) + ioc->mfi_sgl_off;
if (ioc->mfi_sge_count) {
cm->cm_sg =
(union mfi_sgl *)&cm->cm_frame->bytes[ioc->mfi_sgl_off];
}
cm->cm_flags = 0;
if (cm->cm_frame->header.flags & MFI_FRAME_DATAIN)
cm->cm_flags |= MFI_CMD_DATAIN;
if (cm->cm_frame->header.flags & MFI_FRAME_DATAOUT)
cm->cm_flags |= MFI_CMD_DATAOUT;
/* Legacy app shim */
if (cm->cm_flags == 0)
cm->cm_flags |= MFI_CMD_DATAIN | MFI_CMD_DATAOUT;
cm->cm_len = cm->cm_frame->header.data_len;
if (cm->cm_len &&
(cm->cm_flags & (MFI_CMD_DATAIN | MFI_CMD_DATAOUT))) {
cm->cm_data = data = malloc(cm->cm_len, M_MFIBUF,
M_WAITOK | M_ZERO);
if (cm->cm_data == NULL) {
device_printf(sc->mfi_dev, "Malloc failed\n");
goto out;
}
} else {
cm->cm_data = 0;
}
/* restore header context */
cm->cm_frame->header.context = context;
temp = data;
if (cm->cm_flags & MFI_CMD_DATAOUT) {
for (i = 0; i < ioc->mfi_sge_count; i++) {
#ifdef __amd64__
if (cmd == MFI_CMD) {
/* Native */
error = copyin(ioc->mfi_sgl[i].iov_base,
temp,
ioc->mfi_sgl[i].iov_len);
} else {
void *temp_convert;
/* 32bit */
ioc32 = (struct mfi_ioc_packet32 *)ioc;
temp_convert =
PTRIN(ioc32->mfi_sgl[i].iov_base);
error = copyin(temp_convert,
temp,
ioc32->mfi_sgl[i].iov_len);
}
#else
error = copyin(ioc->mfi_sgl[i].iov_base,
temp,
ioc->mfi_sgl[i].iov_len);
#endif
if (error != 0) {
device_printf(sc->mfi_dev,
"Copy in failed\n");
goto out;
}
temp = &temp[ioc->mfi_sgl[i].iov_len];
}
}
if (cm->cm_frame->header.cmd == MFI_CMD_DCMD)
locked = mfi_config_lock(sc, cm->cm_frame->dcmd.opcode);
if (cm->cm_frame->header.cmd == MFI_CMD_PD_SCSI_IO) {
cm->cm_frame->pass.sense_addr_lo = cm->cm_sense_busaddr;
cm->cm_frame->pass.sense_addr_hi = 0;
}
mtx_lock(&sc->mfi_io_lock);
error = mfi_check_command_pre(sc, cm);
if (error) {
mtx_unlock(&sc->mfi_io_lock);
goto out;
}
if ((error = mfi_wait_command(sc, cm)) != 0) {
device_printf(sc->mfi_dev,
"Controller polled failed\n");
mtx_unlock(&sc->mfi_io_lock);
goto out;
}
mfi_check_command_post(sc, cm);
mtx_unlock(&sc->mfi_io_lock);
temp = data;
if (cm->cm_flags & MFI_CMD_DATAIN) {
for (i = 0; i < ioc->mfi_sge_count; i++) {
#ifdef __amd64__
if (cmd == MFI_CMD) {
/* Native */
error = copyout(temp,
ioc->mfi_sgl[i].iov_base,
ioc->mfi_sgl[i].iov_len);
} else {
void *temp_convert;
/* 32bit */
ioc32 = (struct mfi_ioc_packet32 *)ioc;
temp_convert =
PTRIN(ioc32->mfi_sgl[i].iov_base);
error = copyout(temp,
temp_convert,
ioc32->mfi_sgl[i].iov_len);
}
#else
error = copyout(temp,
ioc->mfi_sgl[i].iov_base,
ioc->mfi_sgl[i].iov_len);
#endif
if (error != 0) {
device_printf(sc->mfi_dev,
"Copy out failed\n");
goto out;
}
temp = &temp[ioc->mfi_sgl[i].iov_len];
}
}
if (ioc->mfi_sense_len) {
/* get user-space sense ptr then copy out sense */
bcopy(&ioc->mfi_frame.raw[ioc->mfi_sense_off],
&sense_ptr.sense_ptr_data[0],
sizeof(sense_ptr.sense_ptr_data));
#ifdef __amd64__
if (cmd != MFI_CMD) {
/*
* not 64bit native so zero out any address
* over 32bit */
sense_ptr.addr.high = 0;
}
#endif
error = copyout(cm->cm_sense, sense_ptr.user_space,
ioc->mfi_sense_len);
if (error != 0) {
device_printf(sc->mfi_dev,
"Copy out failed\n");
goto out;
}
}
ioc->mfi_frame.hdr.cmd_status = cm->cm_frame->header.cmd_status;
out:
mfi_config_unlock(sc, locked);
if (data)
free(data, M_MFIBUF);
if (cm) {
mtx_lock(&sc->mfi_io_lock);
mfi_release_command(cm);
mtx_unlock(&sc->mfi_io_lock);
}
break;
}
case MFI_SET_AEN:
aen = (struct mfi_ioc_aen *)arg;
error = mfi_aen_register(sc, aen->aen_seq_num,
aen->aen_class_locale);
break;
case MFI_LINUX_CMD_2: /* Firmware Linux ioctl shim */
{
devclass_t devclass;
struct mfi_linux_ioc_packet l_ioc;
int adapter;
devclass = devclass_find("mfi");
if (devclass == NULL)
return (ENOENT);
error = copyin(arg, &l_ioc, sizeof(l_ioc));
if (error)
return (error);
adapter = l_ioc.lioc_adapter_no;
sc = devclass_get_softc(devclass, adapter);
if (sc == NULL)
return (ENOENT);
return (mfi_linux_ioctl_int(sc->mfi_cdev,
cmd, arg, flag, td));
break;
}
case MFI_LINUX_SET_AEN_2: /* AEN Linux ioctl shim */
{
devclass_t devclass;
struct mfi_linux_ioc_aen l_aen;
int adapter;
devclass = devclass_find("mfi");
if (devclass == NULL)
return (ENOENT);
error = copyin(arg, &l_aen, sizeof(l_aen));
if (error)
return (error);
adapter = l_aen.laen_adapter_no;
sc = devclass_get_softc(devclass, adapter);
if (sc == NULL)
return (ENOENT);
return (mfi_linux_ioctl_int(sc->mfi_cdev,
cmd, arg, flag, td));
break;
}
#ifdef __amd64__
case MFIIO_PASSTHRU32:
iop_swab.ioc_frame = iop32->ioc_frame;
iop_swab.buf_size = iop32->buf_size;
iop_swab.buf = PTRIN(iop32->buf);
iop = &iop_swab;
/* FALLTHROUGH */
#endif
case MFIIO_PASSTHRU:
error = mfi_user_command(sc, iop);
#ifdef __amd64__
if (cmd == MFIIO_PASSTHRU32)
iop32->ioc_frame = iop_swab.ioc_frame;
#endif
break;
default:
device_printf(sc->mfi_dev, "IOCTL 0x%lx not handled\n", cmd);
error = ENOENT;
break;
}
return (error);
}
static int
mfi_linux_ioctl_int(struct cdev *dev, u_long cmd, caddr_t arg, int flag, struct thread *td)
{
struct mfi_softc *sc;
struct mfi_linux_ioc_packet l_ioc;
struct mfi_linux_ioc_aen l_aen;
struct mfi_command *cm = NULL;
struct mfi_aen *mfi_aen_entry;
union mfi_sense_ptr sense_ptr;
uint32_t context;
uint8_t *data = NULL, *temp;
int i;
int error, locked;
sc = dev->si_drv1;
error = 0;
switch (cmd) {
case MFI_LINUX_CMD_2: /* Firmware Linux ioctl shim */
error = copyin(arg, &l_ioc, sizeof(l_ioc));
if (error != 0)
return (error);
if (l_ioc.lioc_sge_count > MAX_LINUX_IOCTL_SGE) {
return (EINVAL);
}
mtx_lock(&sc->mfi_io_lock);
if ((cm = mfi_dequeue_free(sc)) == NULL) {
mtx_unlock(&sc->mfi_io_lock);
return (EBUSY);
}
mtx_unlock(&sc->mfi_io_lock);
locked = 0;
/*
* save off original context since copying from user
* will clobber some data
*/
context = cm->cm_frame->header.context;
bcopy(l_ioc.lioc_frame.raw, cm->cm_frame,
2 * MFI_DCMD_FRAME_SIZE); /* this isn't quite right */
cm->cm_total_frame_size = (sizeof(union mfi_sgl)
* l_ioc.lioc_sge_count) + l_ioc.lioc_sgl_off;
if (l_ioc.lioc_sge_count)
cm->cm_sg =
(union mfi_sgl *)&cm->cm_frame->bytes[l_ioc.lioc_sgl_off];
cm->cm_flags = 0;
if (cm->cm_frame->header.flags & MFI_FRAME_DATAIN)
cm->cm_flags |= MFI_CMD_DATAIN;
if (cm->cm_frame->header.flags & MFI_FRAME_DATAOUT)
cm->cm_flags |= MFI_CMD_DATAOUT;
cm->cm_len = cm->cm_frame->header.data_len;
if (cm->cm_len &&
(cm->cm_flags & (MFI_CMD_DATAIN | MFI_CMD_DATAOUT))) {
cm->cm_data = data = malloc(cm->cm_len, M_MFIBUF,
M_WAITOK | M_ZERO);
if (cm->cm_data == NULL) {
device_printf(sc->mfi_dev, "Malloc failed\n");
goto out;
}
} else {
cm->cm_data = 0;
}
/* restore header context */
cm->cm_frame->header.context = context;
temp = data;
if (cm->cm_flags & MFI_CMD_DATAOUT) {
for (i = 0; i < l_ioc.lioc_sge_count; i++) {
error = copyin(PTRIN(l_ioc.lioc_sgl[i].iov_base),
temp,
l_ioc.lioc_sgl[i].iov_len);
if (error != 0) {
device_printf(sc->mfi_dev,
"Copy in failed\n");
goto out;
}
temp = &temp[l_ioc.lioc_sgl[i].iov_len];
}
}
if (cm->cm_frame->header.cmd == MFI_CMD_DCMD)
locked = mfi_config_lock(sc, cm->cm_frame->dcmd.opcode);
if (cm->cm_frame->header.cmd == MFI_CMD_PD_SCSI_IO) {
cm->cm_frame->pass.sense_addr_lo = cm->cm_sense_busaddr;
cm->cm_frame->pass.sense_addr_hi = 0;
}
mtx_lock(&sc->mfi_io_lock);
error = mfi_check_command_pre(sc, cm);
if (error) {
mtx_unlock(&sc->mfi_io_lock);
goto out;
}
if ((error = mfi_wait_command(sc, cm)) != 0) {
device_printf(sc->mfi_dev,
"Controller polled failed\n");
mtx_unlock(&sc->mfi_io_lock);
goto out;
}
mfi_check_command_post(sc, cm);
mtx_unlock(&sc->mfi_io_lock);
temp = data;
if (cm->cm_flags & MFI_CMD_DATAIN) {
for (i = 0; i < l_ioc.lioc_sge_count; i++) {
error = copyout(temp,
PTRIN(l_ioc.lioc_sgl[i].iov_base),
l_ioc.lioc_sgl[i].iov_len);
if (error != 0) {
device_printf(sc->mfi_dev,
"Copy out failed\n");
goto out;
}
temp = &temp[l_ioc.lioc_sgl[i].iov_len];
}
}
if (l_ioc.lioc_sense_len) {
/* get user-space sense ptr then copy out sense */
bcopy(&((struct mfi_linux_ioc_packet*)arg)
->lioc_frame.raw[l_ioc.lioc_sense_off],
&sense_ptr.sense_ptr_data[0],
sizeof(sense_ptr.sense_ptr_data));
#ifdef __amd64__
/*
* only 32bit Linux support so zero out any
* address over 32bit
*/
sense_ptr.addr.high = 0;
#endif
error = copyout(cm->cm_sense, sense_ptr.user_space,
l_ioc.lioc_sense_len);
if (error != 0) {
device_printf(sc->mfi_dev,
"Copy out failed\n");
goto out;
}
}
error = copyout(&cm->cm_frame->header.cmd_status,
&((struct mfi_linux_ioc_packet*)arg)
->lioc_frame.hdr.cmd_status,
1);
if (error != 0) {
device_printf(sc->mfi_dev,
"Copy out failed\n");
goto out;
}
out:
mfi_config_unlock(sc, locked);
if (data)
free(data, M_MFIBUF);
if (cm) {
mtx_lock(&sc->mfi_io_lock);
mfi_release_command(cm);
mtx_unlock(&sc->mfi_io_lock);
}
return (error);
case MFI_LINUX_SET_AEN_2: /* AEN Linux ioctl shim */
error = copyin(arg, &l_aen, sizeof(l_aen));
if (error != 0)
return (error);
printf("AEN IMPLEMENTED for pid %d\n", curproc->p_pid);
mfi_aen_entry = malloc(sizeof(struct mfi_aen), M_MFIBUF,
M_WAITOK);
mtx_lock(&sc->mfi_io_lock);
if (mfi_aen_entry != NULL) {
mfi_aen_entry->p = curproc;
TAILQ_INSERT_TAIL(&sc->mfi_aen_pids, mfi_aen_entry,
aen_link);
}
error = mfi_aen_register(sc, l_aen.laen_seq_num,
l_aen.laen_class_locale);
if (error != 0) {
TAILQ_REMOVE(&sc->mfi_aen_pids, mfi_aen_entry,
aen_link);
free(mfi_aen_entry, M_MFIBUF);
}
mtx_unlock(&sc->mfi_io_lock);
return (error);
default:
device_printf(sc->mfi_dev, "IOCTL 0x%lx not handled\n", cmd);
error = ENOENT;
break;
}
return (error);
}
static int
mfi_poll(struct cdev *dev, int poll_events, struct thread *td)
{
struct mfi_softc *sc;
int revents = 0;
sc = dev->si_drv1;
if (poll_events & (POLLIN | POLLRDNORM)) {
if (sc->mfi_aen_triggered != 0) {
revents |= poll_events & (POLLIN | POLLRDNORM);
sc->mfi_aen_triggered = 0;
}
if (sc->mfi_aen_triggered == 0 && sc->mfi_aen_cm == NULL) {
revents |= POLLERR;
}
}
if (revents == 0) {
if (poll_events & (POLLIN | POLLRDNORM)) {
sc->mfi_poll_waiting = 1;
selrecord(td, &sc->mfi_select);
}
}
return revents;
}
static void
mfi_dump_all(void)
{
struct mfi_softc *sc;
struct mfi_command *cm;
devclass_t dc;
time_t deadline;
int timedout;
int i;
dc = devclass_find("mfi");
if (dc == NULL) {
printf("No mfi dev class\n");
return;
}
for (i = 0; ; i++) {
sc = devclass_get_softc(dc, i);
if (sc == NULL)
break;
device_printf(sc->mfi_dev, "Dumping\n\n");
timedout = 0;
deadline = time_uptime - MFI_CMD_TIMEOUT;
mtx_lock(&sc->mfi_io_lock);
TAILQ_FOREACH(cm, &sc->mfi_busy, cm_link) {
if (cm->cm_timestamp < deadline) {
device_printf(sc->mfi_dev,
"COMMAND %p TIMEOUT AFTER %d SECONDS\n", cm,
(int)(time_uptime - cm->cm_timestamp));
MFI_PRINT_CMD(cm);
timedout++;
}
}
#if 0
if (timedout)
MFI_DUMP_CMDS(SC);
#endif
mtx_unlock(&sc->mfi_io_lock);
}
return;
}
static void
mfi_timeout(void *data)
{
struct mfi_softc *sc = (struct mfi_softc *)data;
struct mfi_command *cm;
time_t deadline;
int timedout = 0;
deadline = time_uptime - MFI_CMD_TIMEOUT;
mtx_lock(&sc->mfi_io_lock);
TAILQ_FOREACH(cm, &sc->mfi_busy, cm_link) {
if (sc->mfi_aen_cm == cm)
continue;
if ((sc->mfi_aen_cm != cm) && (cm->cm_timestamp < deadline)) {
device_printf(sc->mfi_dev,
"COMMAND %p TIMEOUT AFTER %d SECONDS\n", cm,
(int)(time_uptime - cm->cm_timestamp));
MFI_PRINT_CMD(cm);
MFI_VALIDATE_CMD(sc, cm);
timedout++;
}
}
#if 0
if (timedout)
MFI_DUMP_CMDS(SC);
#endif
mtx_unlock(&sc->mfi_io_lock);
callout_reset(&sc->mfi_watchdog_callout, MFI_CMD_TIMEOUT * hz,
mfi_timeout, sc);
if (0)
mfi_dump_all();
return;
}
Index: head/sys/dev/sound/midi/midi.c
===================================================================
--- head/sys/dev/sound/midi/midi.c (revision 225616)
+++ head/sys/dev/sound/midi/midi.c (revision 225617)
@@ -1,1531 +1,1531 @@
/*-
* Copyright (c) 2003 Mathew Kanner
* Copyright (c) 1998 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Lennart Augustsson (augustss@netbsd.org).
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Parts of this file started out as NetBSD: midi.c 1.31
* They are mostly gone. Still, the most obvious remnant is the state
* machine midi_in.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/conf.h>
#include <sys/selinfo.h>
#include <sys/sysctl.h>
#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/fcntl.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <sys/poll.h>
#include <sys/sbuf.h>
#include <sys/kobj.h>
#include <sys/module.h>
#ifdef HAVE_KERNEL_OPTION_HEADERS
#include "opt_snd.h"
#endif
#include <dev/sound/midi/midi.h>
#include "mpu_if.h"
#include <dev/sound/midi/midiq.h>
#include "synth_if.h"
MALLOC_DEFINE(M_MIDI, "midi buffers", "Midi data allocation area");
#ifndef KOBJMETHOD_END
#define KOBJMETHOD_END { NULL, NULL }
#endif
#define PCMMKMINOR(u, d, c) ((((c) & 0xff) << 16) | (((u) & 0x0f) << 4) | ((d) & 0x0f))
#define MIDIMKMINOR(u, d, c) PCMMKMINOR(u, d, c)
#define MIDI_DEV_RAW 2
#define MIDI_DEV_MIDICTL 12
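/*
 * Worked example of the minor-number packing above (illustrative):
 * MIDIMKMINOR(1, MIDI_DEV_RAW, 0) packs channel 0 into bits 16-23,
 * unit 1 into bits 4-7 and device type 2 into bits 0-3, giving 0x12.
 */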
enum midi_states {
MIDI_IN_START, MIDI_IN_SYSEX, MIDI_IN_DATA
};
/*
* The MPU interface currently has init(), uninit(), inqsize(), outqsize()
* and callback(): fiddle with the tx|rx status.
*/
#include "mpu_if.h"
/*
* /dev/rmidi Structure definitions
*/
#define MIDI_NAMELEN 16
struct snd_midi {
KOBJ_FIELDS;
struct mtx lock; /* Protects all but queues */
void *cookie;
int unit; /* Should only be used in midistat */
int channel; /* Should only be used in midistat */
int busy;
int flags; /* File flags */
char name[MIDI_NAMELEN];
struct mtx qlock; /* Protects inq, outq and flags */
MIDIQ_HEAD(, char) inq, outq;
int rchan, wchan;
struct selinfo rsel, wsel;
int hiwat; /* QLEN(outq)>High-water -> disable
* writes from userland */
enum midi_states inq_state;
int inq_status, inq_left; /* Variables for the state machine in
* midi_in; this ensures that signals
* only get issued on complete
* command packets. */
struct proc *async;
struct cdev *dev;
struct synth_midi *synth;
int synth_flags;
TAILQ_ENTRY(snd_midi) link;
};
struct synth_midi {
KOBJ_FIELDS;
struct snd_midi *m;
};
static synth_open_t midisynth_open;
static synth_close_t midisynth_close;
static synth_writeraw_t midisynth_writeraw;
static synth_killnote_t midisynth_killnote;
static synth_startnote_t midisynth_startnote;
static synth_setinstr_t midisynth_setinstr;
static synth_alloc_t midisynth_alloc;
static synth_controller_t midisynth_controller;
static synth_bender_t midisynth_bender;
static kobj_method_t midisynth_methods[] = {
KOBJMETHOD(synth_open, midisynth_open),
KOBJMETHOD(synth_close, midisynth_close),
KOBJMETHOD(synth_writeraw, midisynth_writeraw),
KOBJMETHOD(synth_setinstr, midisynth_setinstr),
KOBJMETHOD(synth_startnote, midisynth_startnote),
KOBJMETHOD(synth_killnote, midisynth_killnote),
KOBJMETHOD(synth_alloc, midisynth_alloc),
KOBJMETHOD(synth_controller, midisynth_controller),
KOBJMETHOD(synth_bender, midisynth_bender),
KOBJMETHOD_END
};
DEFINE_CLASS(midisynth, midisynth_methods, 0);
/*
* Module Exports & Interface
*
* struct midi_chan *midi_init(MPU_CLASS cls, int unit, int chan) int
* midi_uninit(struct snd_midi *) 0 == no error EBUSY or other error int
* Midi_in(struct midi_chan *, char *buf, int count) int Midi_out(struct
* midi_chan *, char *buf, int count)
*
* midi_{in,out} return the actual size transferred
*
*/
/*
* midi_devs tailq, holder of all rmidi instances protected by midistat_lock
*/
TAILQ_HEAD(, snd_midi) midi_devs;
/*
* /dev/midistat variables and declarations, protected by midistat_lock
*/
static struct mtx midistat_lock;
static int midistat_isopen = 0;
static struct sbuf midistat_sbuf;
static int midistat_bufptr;
static struct cdev *midistat_dev;
/*
* /dev/midistat dev_t declarations
*/
static d_open_t midistat_open;
static d_close_t midistat_close;
static d_read_t midistat_read;
static struct cdevsw midistat_cdevsw = {
.d_version = D_VERSION,
.d_open = midistat_open,
.d_close = midistat_close,
.d_read = midistat_read,
.d_name = "midistat",
};
/*
* /dev/rmidi dev_t declarations, struct variable access is protected by
* locks contained within the structure.
*/
static d_open_t midi_open;
static d_close_t midi_close;
static d_ioctl_t midi_ioctl;
static d_read_t midi_read;
static d_write_t midi_write;
static d_poll_t midi_poll;
static struct cdevsw midi_cdevsw = {
.d_version = D_VERSION,
.d_open = midi_open,
.d_close = midi_close,
.d_read = midi_read,
.d_write = midi_write,
.d_ioctl = midi_ioctl,
.d_poll = midi_poll,
.d_name = "rmidi",
};
/*
* Prototypes of library functions
*/
static int midi_destroy(struct snd_midi *, int);
static int midistat_prepare(struct sbuf * s);
static int midi_load(void);
static int midi_unload(void);
/*
* Misc declr.
*/
SYSCTL_NODE(_hw, OID_AUTO, midi, CTLFLAG_RD, 0, "Midi driver");
SYSCTL_NODE(_hw_midi, OID_AUTO, stat, CTLFLAG_RD, 0, "Status device");
int midi_debug;
/* XXX: should this be moved into debug.midi? */
SYSCTL_INT(_hw_midi, OID_AUTO, debug, CTLFLAG_RW, &midi_debug, 0, "");
int midi_dumpraw;
SYSCTL_INT(_hw_midi, OID_AUTO, dumpraw, CTLFLAG_RW, &midi_dumpraw, 0, "");
int midi_instroff;
SYSCTL_INT(_hw_midi, OID_AUTO, instroff, CTLFLAG_RW, &midi_instroff, 0, "");
int midistat_verbose;
SYSCTL_INT(_hw_midi_stat, OID_AUTO, verbose, CTLFLAG_RW,
&midistat_verbose, 0, "");
#define MIDI_DEBUG(l,a) if(midi_debug>=l) a
/*
* CODE START
*/
/*
* Register a new rmidi device. cls is the midi_if interface. unit == 0
* means auto-assign a new unit number; unit != 0 means a unit number has
* already been assigned, e.g. this is not the first channel provided by
* this device. channel is the sub-unit. cookie is passed back on MPU
* calls. Typical device drivers will call with unit=0,
* channel=1..(number of channels) and cookie=soft_c and won't care
* what unit number is used.
*
* It is an error to call midi_init with an already used unit/channel combo.
*
* Returns NULL on error
*
*/
struct snd_midi *
midi_init(kobj_class_t cls, int unit, int channel, void *cookie)
{
struct snd_midi *m;
int i;
int inqsize, outqsize;
MIDI_TYPE *buf;
MIDI_DEBUG(1, printf("midiinit: unit %d/%d.\n", unit, channel));
mtx_lock(&midistat_lock);
/*
* Protect against call with existing unit/channel or auto-allocate a
* new unit number.
*/
i = -1;
TAILQ_FOREACH(m, &midi_devs, link) {
mtx_lock(&m->lock);
if (unit != 0) {
if (m->unit == unit && m->channel == channel) {
mtx_unlock(&m->lock);
goto err0;
}
} else {
/*
* Find a better unit number
*/
if (m->unit > i)
i = m->unit;
}
mtx_unlock(&m->lock);
}
if (unit == 0)
unit = i + 1;
MIDI_DEBUG(1, printf("midiinit #2: unit %d/%d.\n", unit, channel));
m = malloc(sizeof(*m), M_MIDI, M_NOWAIT | M_ZERO);
if (m == NULL)
goto err0;
m->synth = malloc(sizeof(*m->synth), M_MIDI, M_NOWAIT | M_ZERO);
kobj_init((kobj_t)m->synth, &midisynth_class);
m->synth->m = m;
kobj_init((kobj_t)m, cls);
inqsize = MPU_INQSIZE(m, cookie);
outqsize = MPU_OUTQSIZE(m, cookie);
MIDI_DEBUG(1, printf("midiinit queues %d/%d.\n", inqsize, outqsize));
if (!inqsize && !outqsize)
goto err1;
mtx_init(&m->lock, "raw midi", NULL, 0);
mtx_init(&m->qlock, "q raw midi", NULL, 0);
mtx_lock(&m->lock);
mtx_lock(&m->qlock);
if (inqsize)
buf = malloc(sizeof(MIDI_TYPE) * inqsize, M_MIDI, M_NOWAIT);
else
buf = NULL;
MIDIQ_INIT(m->inq, buf, inqsize);
if (outqsize)
buf = malloc(sizeof(MIDI_TYPE) * outqsize, M_MIDI, M_NOWAIT);
else
buf = NULL;
m->hiwat = outqsize / 2;
MIDIQ_INIT(m->outq, buf, outqsize);
if ((inqsize && !MIDIQ_BUF(m->inq)) ||
(outqsize && !MIDIQ_BUF(m->outq)))
goto err2;
m->busy = 0;
m->flags = 0;
m->unit = unit;
m->channel = channel;
m->cookie = cookie;
if (MPU_INIT(m, cookie))
goto err2;
mtx_unlock(&m->lock);
mtx_unlock(&m->qlock);
TAILQ_INSERT_TAIL(&midi_devs, m, link);
mtx_unlock(&midistat_lock);
m->dev = make_dev(&midi_cdevsw,
MIDIMKMINOR(unit, MIDI_DEV_RAW, channel),
UID_ROOT, GID_WHEEL, 0666, "midi%d.%d", unit, channel);
m->dev->si_drv1 = m;
return m;
err2: mtx_destroy(&m->qlock);
mtx_destroy(&m->lock);
if (MIDIQ_BUF(m->inq))
free(MIDIQ_BUF(m->inq), M_MIDI);
if (MIDIQ_BUF(m->outq))
free(MIDIQ_BUF(m->outq), M_MIDI);
err1: free(m, M_MIDI);
err0: mtx_unlock(&midistat_lock);
MIDI_DEBUG(1, printf("midi_init ended in error\n"));
return NULL;
}
/*
* midi_uninit does not call MIDI_UNINIT, since this is the implementor's
* entry point. midi_uninit, in fact, does not send any methods. A call to
* midi_uninit is a de facto promise that you won't manipulate ch anymore.
*
*/
int
midi_uninit(struct snd_midi *m)
{
int err;
err = ENXIO;
mtx_lock(&midistat_lock);
mtx_lock(&m->lock);
if (m->busy) {
if (!(m->rchan || m->wchan))
goto err;
if (m->rchan) {
wakeup(&m->rchan);
m->rchan = 0;
}
if (m->wchan) {
wakeup(&m->wchan);
m->wchan = 0;
}
}
err = midi_destroy(m, 0);
if (!err)
goto exit;
err: mtx_unlock(&m->lock);
exit: mtx_unlock(&midistat_lock);
return err;
}
/*
* midi_in: process all data until the queue is full, then discard the rest.
* Since midi_in is a state machine, data discards can cause it to get out of
* whack. Process as much as possible. It calls wakeup, selnotify and
* psignal at most once.
*/
#ifdef notdef
static int midi_lengths[] = {2, 2, 2, 2, 1, 1, 2, 0};
#endif /* notdef */
/* Number of bytes in a MIDI command */
#define MIDI_LENGTH(d) (midi_lengths[((d) >> 4) & 7])
#define MIDI_ACK 0xfe
#define MIDI_IS_STATUS(d) ((d) >= 0x80)
#define MIDI_IS_COMMON(d) ((d) >= 0xf0)
#define MIDI_SYSEX_START 0xF0
#define MIDI_SYSEX_END 0xF7
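/*
 * Illustrative use of MIDI_LENGTH with the (disabled) midi_lengths
 * table {2, 2, 2, 2, 1, 1, 2, 0}: a note-on status byte 0x9n indexes
 * entry 1 and so carries 2 data bytes, while a program change 0xCn
 * indexes entry 4 and carries 1 data byte.
 */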
int
midi_in(struct snd_midi *m, MIDI_TYPE *buf, int size)
{
/* int i, sig, enq; */
int used;
/* MIDI_TYPE data; */
MIDI_DEBUG(5, printf("midi_in: m=%p size=%d\n", m, size));
/*
* XXX: locking flub
*/
if (!(m->flags & M_RX))
return size;
used = 0;
mtx_lock(&m->qlock);
#if 0
/*
* Don't bother queuing if not in read mode. Discard everything and
* return size so the caller doesn't freak out.
*/
if (!(m->flags & M_RX))
return size;
for (i = sig = 0; i < size; i++) {
data = buf[i];
enq = 0;
if (data == MIDI_ACK)
continue;
switch (m->inq_state) {
case MIDI_IN_START:
if (MIDI_IS_STATUS(data)) {
switch (data) {
case 0xf0: /* Sysex */
m->inq_state = MIDI_IN_SYSEX;
break;
case 0xf1: /* MTC quarter frame */
case 0xf3: /* Song select */
m->inq_state = MIDI_IN_DATA;
enq = 1;
m->inq_left = 1;
break;
case 0xf2: /* Song position pointer */
m->inq_state = MIDI_IN_DATA;
enq = 1;
m->inq_left = 2;
break;
default:
if (MIDI_IS_COMMON(data)) {
enq = 1;
sig = 1;
} else {
m->inq_state = MIDI_IN_DATA;
enq = 1;
m->inq_status = data;
m->inq_left = MIDI_LENGTH(data);
}
break;
}
} else if (MIDI_IS_STATUS(m->inq_status)) {
m->inq_state = MIDI_IN_DATA;
if (!MIDIQ_FULL(m->inq)) {
used++;
MIDIQ_ENQ(m->inq, &m->inq_status, 1);
}
enq = 1;
m->inq_left = MIDI_LENGTH(m->inq_status) - 1;
}
break;
/*
* End of case MIDI_IN_START:
*/
case MIDI_IN_DATA:
enq = 1;
if (--m->inq_left <= 0)
sig = 1;/* deliver data */
break;
case MIDI_IN_SYSEX:
if (data == MIDI_SYSEX_END)
m->inq_state = MIDI_IN_START;
break;
}
if (enq)
if (!MIDIQ_FULL(m->inq)) {
MIDIQ_ENQ(m->inq, &data, 1);
used++;
}
/*
* End of the state machine's main "for loop"
*/
}
if (sig) {
#endif
MIDI_DEBUG(6, printf("midi_in: len %jd avail %jd\n",
(intmax_t)MIDIQ_LEN(m->inq),
(intmax_t)MIDIQ_AVAIL(m->inq)));
if (MIDIQ_AVAIL(m->inq) > size) {
used = size;
MIDIQ_ENQ(m->inq, buf, size);
} else {
MIDI_DEBUG(4, printf("midi_in: Discarding data qu\n"));
mtx_unlock(&m->qlock);
return 0;
}
if (m->rchan) {
wakeup(&m->rchan);
m->rchan = 0;
}
selwakeup(&m->rsel);
if (m->async) {
PROC_LOCK(m->async);
- psignal(m->async, SIGIO);
+ kern_psignal(m->async, SIGIO);
PROC_UNLOCK(m->async);
}
#if 0
}
#endif
mtx_unlock(&m->qlock);
return used;
}
/*
* midi_out: The only clearer of the M_TXEN flag.
*/
int
midi_out(struct snd_midi *m, MIDI_TYPE *buf, int size)
{
int used;
/*
* XXX: locking flub
*/
if (!(m->flags & M_TXEN))
return 0;
MIDI_DEBUG(2, printf("midi_out: %p\n", m));
mtx_lock(&m->qlock);
used = MIN(size, MIDIQ_LEN(m->outq));
MIDI_DEBUG(3, printf("midi_out: used %d\n", used));
if (used)
MIDIQ_DEQ(m->outq, buf, used);
if (MIDIQ_EMPTY(m->outq)) {
m->flags &= ~M_TXEN;
MPU_CALLBACKP(m, m->cookie, m->flags);
}
if (used && MIDIQ_AVAIL(m->outq) > m->hiwat) {
if (m->wchan) {
wakeup(&m->wchan);
m->wchan = 0;
}
selwakeup(&m->wsel);
if (m->async) {
PROC_LOCK(m->async);
- psignal(m->async, SIGIO);
+ kern_psignal(m->async, SIGIO);
PROC_UNLOCK(m->async);
}
}
mtx_unlock(&m->qlock);
return used;
}
/*
* /dev/rmidi#.# device access functions
*/
int
midi_open(struct cdev *i_dev, int flags, int mode, struct thread *td)
{
struct snd_midi *m = i_dev->si_drv1;
int retval;
MIDI_DEBUG(1, printf("midiopen %p %s %s\n", td,
flags & FREAD ? "M_RX" : "", flags & FWRITE ? "M_TX" : ""));
if (m == NULL)
return ENXIO;
mtx_lock(&m->lock);
mtx_lock(&m->qlock);
retval = 0;
if (flags & FREAD) {
if (MIDIQ_SIZE(m->inq) == 0)
retval = ENXIO;
else if (m->flags & M_RX)
retval = EBUSY;
if (retval)
goto err;
}
if (flags & FWRITE) {
if (MIDIQ_SIZE(m->outq) == 0)
retval = ENXIO;
else if (m->flags & M_TX)
retval = EBUSY;
if (retval)
goto err;
}
m->busy++;
m->rchan = 0;
m->wchan = 0;
m->async = 0;
if (flags & FREAD) {
m->flags |= M_RX | M_RXEN;
/*
* Only clear the inq, the outq might still have data to drain
* from a previous session
*/
MIDIQ_CLEAR(m->inq);
};
if (flags & FWRITE)
m->flags |= M_TX;
MPU_CALLBACK(m, m->cookie, m->flags);
MIDI_DEBUG(2, printf("midi_open: opened.\n"));
err: mtx_unlock(&m->qlock);
mtx_unlock(&m->lock);
return retval;
}
int
midi_close(struct cdev *i_dev, int flags, int mode, struct thread *td)
{
struct snd_midi *m = i_dev->si_drv1;
int retval;
int oldflags;
MIDI_DEBUG(1, printf("midi_close %p %s %s\n", td,
flags & FREAD ? "M_RX" : "", flags & FWRITE ? "M_TX" : ""));
if (m == NULL)
return ENXIO;
mtx_lock(&m->lock);
mtx_lock(&m->qlock);
if ((flags & FREAD && !(m->flags & M_RX)) ||
(flags & FWRITE && !(m->flags & M_TX))) {
retval = ENXIO;
goto err;
}
m->busy--;
oldflags = m->flags;
if (flags & FREAD)
m->flags &= ~(M_RX | M_RXEN);
if (flags & FWRITE)
m->flags &= ~M_TX;
if ((m->flags & (M_TXEN | M_RXEN)) != (oldflags & (M_RXEN | M_TXEN)))
MPU_CALLBACK(m, m->cookie, m->flags);
MIDI_DEBUG(1, printf("midi_close: closed, busy = %d.\n", m->busy));
mtx_unlock(&m->qlock);
mtx_unlock(&m->lock);
retval = 0;
err: return retval;
}
/*
* TODO: midi_read, per oss programmer's guide pg. 42 should return as soon
* as data is available.
*/
int
midi_read(struct cdev *i_dev, struct uio *uio, int ioflag)
{
#define MIDI_RSIZE 32
struct snd_midi *m = i_dev->si_drv1;
int retval;
int used;
char buf[MIDI_RSIZE];
MIDI_DEBUG(5, printf("midiread: count=%lu\n",
(unsigned long)uio->uio_resid));
retval = EIO;
if (m == NULL)
goto err0;
mtx_lock(&m->lock);
mtx_lock(&m->qlock);
if (!(m->flags & M_RX))
goto err1;
while (uio->uio_resid > 0) {
while (MIDIQ_EMPTY(m->inq)) {
retval = EWOULDBLOCK;
if (ioflag & O_NONBLOCK)
goto err1;
mtx_unlock(&m->lock);
m->rchan = 1;
retval = msleep(&m->rchan, &m->qlock,
PCATCH | PDROP, "midi RX", 0);
/*
* We slept, maybe things have changed since last
* dying check
*/
if (retval == EINTR)
goto err0;
if (m != i_dev->si_drv1)
retval = ENXIO;
/* if (retval && retval != ERESTART) */
if (retval)
goto err0;
mtx_lock(&m->lock);
mtx_lock(&m->qlock);
m->rchan = 0;
if (!m->busy)
goto err1;
}
MIDI_DEBUG(6, printf("midi_read start\n"));
/*
* At this point, it is certain that m->inq has data
*/
used = MIN(MIDIQ_LEN(m->inq), uio->uio_resid);
used = MIN(used, MIDI_RSIZE);
MIDI_DEBUG(6, printf("midiread: uiomove cc=%d\n", used));
MIDIQ_DEQ(m->inq, buf, used);
retval = uiomove(buf, used, uio);
if (retval)
goto err1;
}
/*
* If we made it here, the transfer was good
*/
retval = 0;
err1: mtx_unlock(&m->qlock);
mtx_unlock(&m->lock);
err0: MIDI_DEBUG(4, printf("midi_read: ret %d\n", retval));
return retval;
}
/*
* midi_write: The only setter of M_TXEN
*/
int
midi_write(struct cdev *i_dev, struct uio *uio, int ioflag)
{
#define MIDI_WSIZE 32
struct snd_midi *m = i_dev->si_drv1;
int retval;
int used;
char buf[MIDI_WSIZE];
MIDI_DEBUG(4, printf("midi_write\n"));
retval = 0;
if (m == NULL)
goto err0;
mtx_lock(&m->lock);
mtx_lock(&m->qlock);
if (!(m->flags & M_TX))
goto err1;
while (uio->uio_resid > 0) {
while (MIDIQ_AVAIL(m->outq) == 0) {
retval = EWOULDBLOCK;
if (ioflag & O_NONBLOCK)
goto err1;
mtx_unlock(&m->lock);
m->wchan = 1;
MIDI_DEBUG(3, printf("midi_write msleep\n"));
retval = msleep(&m->wchan, &m->qlock,
PCATCH | PDROP, "midi TX", 0);
/*
* We slept, maybe things have changed since last
* dying check
*/
if (retval == EINTR)
goto err0;
if (m != i_dev->si_drv1)
retval = ENXIO;
if (retval)
goto err0;
mtx_lock(&m->lock);
mtx_lock(&m->qlock);
m->wchan = 0;
if (!m->busy)
goto err1;
}
/*
* We are certain that data can be placed on the queue
*/
used = MIN(MIDIQ_AVAIL(m->outq), uio->uio_resid);
used = MIN(used, MIDI_WSIZE);
MIDI_DEBUG(5, printf("midiout: resid %zd len %jd avail %jd\n",
uio->uio_resid, (intmax_t)MIDIQ_LEN(m->outq),
(intmax_t)MIDIQ_AVAIL(m->outq)));
MIDI_DEBUG(5, printf("midi_write: uiomove cc=%d\n", used));
retval = uiomove(buf, used, uio);
if (retval)
goto err1;
MIDIQ_ENQ(m->outq, buf, used);
/*
* Inform the bottom half that data can be written
*/
if (!(m->flags & M_TXEN)) {
m->flags |= M_TXEN;
MPU_CALLBACK(m, m->cookie, m->flags);
}
}
/*
* If we made it here, the transfer was good
*/
retval = 0;
err1: mtx_unlock(&m->qlock);
mtx_unlock(&m->lock);
err0: return retval;
}
int
midi_ioctl(struct cdev *i_dev, u_long cmd, caddr_t arg, int mode,
struct thread *td)
{
return ENXIO;
}
int
midi_poll(struct cdev *i_dev, int events, struct thread *td)
{
struct snd_midi *m = i_dev->si_drv1;
int revents;
if (m == NULL)
return 0;
revents = 0;
mtx_lock(&m->lock);
mtx_lock(&m->qlock);
if (events & (POLLIN | POLLRDNORM))
if (!MIDIQ_EMPTY(m->inq))
revents |= events & (POLLIN | POLLRDNORM);
if (events & (POLLOUT | POLLWRNORM))
if (MIDIQ_AVAIL(m->outq) < m->hiwat)
revents |= events & (POLLOUT | POLLWRNORM);
if (revents == 0) {
if (events & (POLLIN | POLLRDNORM))
selrecord(td, &m->rsel);
if (events & (POLLOUT | POLLWRNORM))
selrecord(td, &m->wsel);
}
mtx_unlock(&m->lock);
mtx_unlock(&m->qlock);
return (revents);
}
/*
* /dev/midistat device functions
*
*/
static int
midistat_open(struct cdev *i_dev, int flags, int mode, struct thread *td)
{
int error;
MIDI_DEBUG(1, printf("midistat_open\n"));
mtx_lock(&midistat_lock);
if (midistat_isopen) {
mtx_unlock(&midistat_lock);
return EBUSY;
}
midistat_isopen = 1;
mtx_unlock(&midistat_lock);
if (sbuf_new(&midistat_sbuf, NULL, 4096, SBUF_AUTOEXTEND) == NULL) {
error = ENXIO;
mtx_lock(&midistat_lock);
goto out;
}
mtx_lock(&midistat_lock);
midistat_bufptr = 0;
error = (midistat_prepare(&midistat_sbuf) > 0) ? 0 : ENOMEM;
out: if (error)
midistat_isopen = 0;
mtx_unlock(&midistat_lock);
return error;
}
static int
midistat_close(struct cdev *i_dev, int flags, int mode, struct thread *td)
{
MIDI_DEBUG(1, printf("midistat_close\n"));
mtx_lock(&midistat_lock);
if (!midistat_isopen) {
mtx_unlock(&midistat_lock);
return EBADF;
}
sbuf_delete(&midistat_sbuf);
midistat_isopen = 0;
mtx_unlock(&midistat_lock);
return 0;
}
static int
midistat_read(struct cdev *i_dev, struct uio *buf, int flag)
{
int l, err;
MIDI_DEBUG(4, printf("midistat_read\n"));
mtx_lock(&midistat_lock);
if (!midistat_isopen) {
mtx_unlock(&midistat_lock);
return EBADF;
}
l = min(buf->uio_resid, sbuf_len(&midistat_sbuf) - midistat_bufptr);
err = 0;
if (l > 0) {
mtx_unlock(&midistat_lock);
err = uiomove(sbuf_data(&midistat_sbuf) + midistat_bufptr, l,
buf);
mtx_lock(&midistat_lock);
} else
l = 0;
midistat_bufptr += l;
mtx_unlock(&midistat_lock);
return err;
}
/*
* Module library functions
*/
static int
midistat_prepare(struct sbuf *s)
{
struct snd_midi *m;
mtx_assert(&midistat_lock, MA_OWNED);
sbuf_printf(s, "FreeBSD Midi Driver (midi2)\n");
if (TAILQ_EMPTY(&midi_devs)) {
sbuf_printf(s, "No devices installed.\n");
sbuf_finish(s);
return sbuf_len(s);
}
sbuf_printf(s, "Installed devices:\n");
TAILQ_FOREACH(m, &midi_devs, link) {
mtx_lock(&m->lock);
sbuf_printf(s, "%s [%d/%d:%s]", m->name, m->unit, m->channel,
MPU_PROVIDER(m, m->cookie));
sbuf_printf(s, "%s", MPU_DESCR(m, m->cookie, midistat_verbose));
sbuf_printf(s, "\n");
mtx_unlock(&m->lock);
}
sbuf_finish(s);
return sbuf_len(s);
}
#ifdef notdef
/*
* Convert IOCTL command to string for debugging
*/
static char *
midi_cmdname(int cmd)
{
static struct {
int cmd;
char *name;
} *tab, cmdtab_midiioctl[] = {
#define A(x) {x, ## x}
/*
* Once we have some real IOCTLs defined, the following will
* be relevant.
*
* A(SNDCTL_MIDI_PRETIME), A(SNDCTL_MIDI_MPUMODE),
* A(SNDCTL_MIDI_MPUCMD), A(SNDCTL_SYNTH_INFO),
* A(SNDCTL_MIDI_INFO), A(SNDCTL_SYNTH_MEMAVL),
* A(SNDCTL_FM_LOAD_INSTR), A(SNDCTL_FM_4OP_ENABLE),
* A(MIOSPASSTHRU), A(MIOGPASSTHRU), A(AIONWRITE),
* A(AIOGSIZE), A(AIOSSIZE), A(AIOGFMT), A(AIOSFMT),
* A(AIOGMIX), A(AIOSMIX), A(AIOSTOP), A(AIOSYNC),
* A(AIOGCAP),
*/
#undef A
{
-1, "unknown"
},
};
for (tab = cmdtab_midiioctl; tab->cmd != cmd && tab->cmd != -1; tab++);
return tab->name;
}
#endif /* notdef */
/*
* midisynth
*/
int
midisynth_open(void *n, void *arg, int flags)
{
struct snd_midi *m = ((struct synth_midi *)n)->m;
int retval;
MIDI_DEBUG(1, printf("midisynth_open %s %s\n",
flags & FREAD ? "M_RX" : "", flags & FWRITE ? "M_TX" : ""));
if (m == NULL)
return ENXIO;
mtx_lock(&m->lock);
mtx_lock(&m->qlock);
retval = 0;
if (flags & FREAD) {
if (MIDIQ_SIZE(m->inq) == 0)
retval = ENXIO;
else if (m->flags & M_RX)
retval = EBUSY;
if (retval)
goto err;
}
if (flags & FWRITE) {
if (MIDIQ_SIZE(m->outq) == 0)
retval = ENXIO;
else if (m->flags & M_TX)
retval = EBUSY;
if (retval)
goto err;
}
m->busy++;
/*
* TODO: Consider m->async = 0;
*/
if (flags & FREAD) {
m->flags |= M_RX | M_RXEN;
/*
* Only clear the inq, the outq might still have data to drain
* from a previous session
*/
MIDIQ_CLEAR(m->inq);
m->rchan = 0;
};
if (flags & FWRITE) {
m->flags |= M_TX;
m->wchan = 0;
}
m->synth_flags = flags & (FREAD | FWRITE);
MPU_CALLBACK(m, m->cookie, m->flags);
err: mtx_unlock(&m->qlock);
mtx_unlock(&m->lock);
MIDI_DEBUG(2, printf("midisynth_open: return %d.\n", retval));
return retval;
}
int
midisynth_close(void *n)
{
struct snd_midi *m = ((struct synth_midi *)n)->m;
int retval;
int oldflags;
MIDI_DEBUG(1, printf("midisynth_close %s %s\n",
m->synth_flags & FREAD ? "M_RX" : "",
m->synth_flags & FWRITE ? "M_TX" : ""));
if (m == NULL)
return ENXIO;
mtx_lock(&m->lock);
mtx_lock(&m->qlock);
if ((m->synth_flags & FREAD && !(m->flags & M_RX)) ||
(m->synth_flags & FWRITE && !(m->flags & M_TX))) {
retval = ENXIO;
goto err;
}
m->busy--;
oldflags = m->flags;
if (m->synth_flags & FREAD)
m->flags &= ~(M_RX | M_RXEN);
if (m->synth_flags & FWRITE)
m->flags &= ~M_TX;
if ((m->flags & (M_TXEN | M_RXEN)) != (oldflags & (M_RXEN | M_TXEN)))
MPU_CALLBACK(m, m->cookie, m->flags);
MIDI_DEBUG(1, printf("midi_close: closed, busy = %d.\n", m->busy));
mtx_unlock(&m->qlock);
mtx_unlock(&m->lock);
retval = 0;
err: return retval;
}
/*
* Always blocking.
*/
int
midisynth_writeraw(void *n, uint8_t *buf, size_t len)
{
struct snd_midi *m = ((struct synth_midi *)n)->m;
int retval;
int used;
int i;
MIDI_DEBUG(4, printf("midisynth_writeraw\n"));
retval = 0;
if (m == NULL)
return ENXIO;
mtx_lock(&m->lock);
mtx_lock(&m->qlock);
if (!(m->flags & M_TX))
goto err1;
if (midi_dumpraw)
printf("midi dump: ");
while (len > 0) {
while (MIDIQ_AVAIL(m->outq) == 0) {
if (!(m->flags & M_TXEN)) {
m->flags |= M_TXEN;
MPU_CALLBACK(m, m->cookie, m->flags);
}
mtx_unlock(&m->lock);
m->wchan = 1;
MIDI_DEBUG(3, printf("midisynth_writeraw msleep\n"));
retval = msleep(&m->wchan, &m->qlock,
PCATCH | PDROP, "midi TX", 0);
/*
* We slept, maybe things have changed since last
* dying check
*/
if (retval == EINTR)
goto err0;
if (retval)
goto err0;
mtx_lock(&m->lock);
mtx_lock(&m->qlock);
m->wchan = 0;
if (!m->busy)
goto err1;
}
/*
* We are certain that data can be placed on the queue
*/
used = MIN(MIDIQ_AVAIL(m->outq), len);
used = MIN(used, MIDI_WSIZE);
MIDI_DEBUG(5,
printf("midi_synth: resid %zu len %jd avail %jd\n",
len, (intmax_t)MIDIQ_LEN(m->outq),
(intmax_t)MIDIQ_AVAIL(m->outq)));
if (midi_dumpraw)
for (i = 0; i < used; i++)
printf("%x ", buf[i]);
MIDIQ_ENQ(m->outq, buf, used);
len -= used;
/*
* Inform the bottom half that data can be written
*/
if (!(m->flags & M_TXEN)) {
m->flags |= M_TXEN;
MPU_CALLBACK(m, m->cookie, m->flags);
}
}
/*
* If we made it here, the transfer was good
*/
if (midi_dumpraw)
printf("\n");
retval = 0;
err1: mtx_unlock(&m->qlock);
mtx_unlock(&m->lock);
err0: return retval;
}
static int
midisynth_killnote(void *n, uint8_t chn, uint8_t note, uint8_t vel)
{
u_char c[3];
if (note > 127 || chn > 15)
return (EINVAL);
if (vel > 127)
vel = 127;
if (vel == 64) {
c[0] = 0x90 | (chn & 0x0f); /* Note on. */
c[1] = (u_char)note;
c[2] = 0;
} else {
c[0] = 0x80 | (chn & 0x0f); /* Note off. */
c[1] = (u_char)note;
c[2] = (u_char)vel;
}
return midisynth_writeraw(n, c, 3);
}
static int
midisynth_setinstr(void *n, uint8_t chn, uint16_t instr)
{
u_char c[2];
if (instr > 127 || chn > 15)
return EINVAL;
c[0] = 0xc0 | (chn & 0x0f); /* Program change. */
c[1] = instr + midi_instroff;
return midisynth_writeraw(n, c, 2);
}
static int
midisynth_startnote(void *n, uint8_t chn, uint8_t note, uint8_t vel)
{
u_char c[3];
if (note > 127 || chn > 15)
return EINVAL;
if (vel > 127)
vel = 127;
c[0] = 0x90 | (chn & 0x0f); /* Note on. */
c[1] = (u_char)note;
c[2] = (u_char)vel;
return midisynth_writeraw(n, c, 3);
}
static int
midisynth_alloc(void *n, uint8_t chan, uint8_t note)
{
return chan;
}
static int
midisynth_controller(void *n, uint8_t chn, uint8_t ctrlnum, uint16_t val)
{
u_char c[3];
if (ctrlnum > 127 || chn > 15)
return EINVAL;
c[0] = 0xb0 | (chn & 0x0f); /* Control Message. */
c[1] = ctrlnum;
c[2] = val;
return midisynth_writeraw(n, c, 3);
}
static int
midisynth_bender(void *n, uint8_t chn, uint16_t val)
{
u_char c[3];
if (val > 16383 || chn > 15)
return EINVAL;
c[0] = 0xe0 | (chn & 0x0f); /* Pitch bend. */
c[1] = (u_char)val & 0x7f;
c[2] = (u_char)(val >> 7) & 0x7f;
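/*
 * Example of the 14-bit split above (illustrative): the center bend
 * value 8192 (0x2000) yields c[1] = 0x00 (low 7 bits) and
 * c[2] = 0x40 (high 7 bits).
 */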
return midisynth_writeraw(n, c, 3);
}
/*
* Single point of midi destruction.
*/
static int
midi_destroy(struct snd_midi *m, int midiuninit)
{
mtx_assert(&midistat_lock, MA_OWNED);
mtx_assert(&m->lock, MA_OWNED);
MIDI_DEBUG(3, printf("midi_destroy\n"));
m->dev->si_drv1 = NULL;
mtx_unlock(&m->lock); /* XXX */
destroy_dev(m->dev);
TAILQ_REMOVE(&midi_devs, m, link);
if (midiuninit)
MPU_UNINIT(m, m->cookie);
free(MIDIQ_BUF(m->inq), M_MIDI);
free(MIDIQ_BUF(m->outq), M_MIDI);
mtx_destroy(&m->qlock);
mtx_destroy(&m->lock);
free(m, M_MIDI);
return 0;
}
/*
* Load and unload functions, creates the /dev/midistat device
*/
static int
midi_load()
{
mtx_init(&midistat_lock, "midistat lock", NULL, 0);
TAILQ_INIT(&midi_devs); /* Initialize the queue. */
midistat_dev = make_dev(&midistat_cdevsw,
MIDIMKMINOR(0, MIDI_DEV_MIDICTL, 0),
UID_ROOT, GID_WHEEL, 0666, "midistat");
return 0;
}
static int
midi_unload()
{
struct snd_midi *m;
int retval;
MIDI_DEBUG(1, printf("midi_unload()\n"));
retval = EBUSY;
mtx_lock(&midistat_lock);
if (midistat_isopen)
goto exit0;
TAILQ_FOREACH(m, &midi_devs, link) {
mtx_lock(&m->lock);
if (m->busy)
retval = EBUSY;
else
retval = midi_destroy(m, 1);
if (retval)
goto exit1;
}
mtx_unlock(&midistat_lock); /* XXX */
destroy_dev(midistat_dev);
/*
* If we made it here, the unload is complete
*/
mtx_destroy(&midistat_lock);
return 0;
exit1:
mtx_unlock(&m->lock);
exit0:
mtx_unlock(&midistat_lock);
if (retval)
MIDI_DEBUG(2, printf("midi_unload: failed\n"));
return retval;
}
extern int seq_modevent(module_t mod, int type, void *data);
static int
midi_modevent(module_t mod, int type, void *data)
{
int retval;
retval = 0;
switch (type) {
case MOD_LOAD:
retval = midi_load();
#if 0
if (retval == 0)
retval = seq_modevent(mod, type, data);
#endif
break;
case MOD_UNLOAD:
retval = midi_unload();
#if 0
if (retval == 0)
retval = seq_modevent(mod, type, data);
#endif
break;
default:
break;
}
return retval;
}
kobj_t
midimapper_addseq(void *arg1, int *unit, void **cookie)
{
unit = 0;
return (kobj_t)arg1;
}
int
midimapper_open(void *arg1, void **cookie)
{
int retval = 0;
struct snd_midi *m;
mtx_lock(&midistat_lock);
TAILQ_FOREACH(m, &midi_devs, link) {
retval++;
}
mtx_unlock(&midistat_lock);
return retval;
}
int
midimapper_close(void *arg1, void *cookie)
{
return 0;
}
kobj_t
midimapper_fetch_synth(void *arg, void *cookie, int unit)
{
struct snd_midi *m;
int retval = 0;
mtx_lock(&midistat_lock);
TAILQ_FOREACH(m, &midi_devs, link) {
if (unit == retval) {
mtx_unlock(&midistat_lock);
return (kobj_t)m->synth;
}
retval++;
}
mtx_unlock(&midistat_lock);
return NULL;
}
DEV_MODULE(midi, midi_modevent, NULL);
MODULE_VERSION(midi, 1);
Index: head/sys/dev/syscons/scmouse.c
===================================================================
--- head/sys/dev/syscons/scmouse.c (revision 225616)
+++ head/sys/dev/syscons/scmouse.c (revision 225617)
@@ -1,958 +1,958 @@
/*-
* Copyright (c) 1999 Kazutaka YOKOTA <yokota@zodiac.mech.utsunomiya-u.ac.jp>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer as
* the first lines of this file unmodified.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_syscons.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/consio.h>
#include <sys/fbio.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mouse.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/random.h>
#include <sys/signalvar.h>
#include <sys/tty.h>
#include <dev/syscons/syscons.h>
#ifdef SC_TWOBUTTON_MOUSE
#define SC_MOUSE_PASTEBUTTON MOUSE_BUTTON3DOWN /* right button */
#define SC_MOUSE_EXTENDBUTTON MOUSE_BUTTON2DOWN /* not really used */
#else
#define SC_MOUSE_PASTEBUTTON MOUSE_BUTTON2DOWN /* middle button */
#define SC_MOUSE_EXTENDBUTTON MOUSE_BUTTON3DOWN /* right button */
#endif /* SC_TWOBUTTON_MOUSE */
#define SC_WAKEUP_DELTA 20
/* for backward compatibility */
#define OLD_CONS_MOUSECTL _IOWR('c', 10, old_mouse_info_t)
typedef struct old_mouse_data {
int x;
int y;
int buttons;
} old_mouse_data_t;
typedef struct old_mouse_info {
int operation;
union {
struct old_mouse_data data;
struct mouse_mode mode;
} u;
} old_mouse_info_t;
#ifndef SC_NO_SYSMOUSE
/* local variables */
#ifndef SC_NO_CUTPASTE
static int cut_buffer_size;
static u_char *cut_buffer;
#endif
/* local functions */
static void set_mouse_pos(scr_stat *scp);
#ifndef SC_NO_CUTPASTE
static int skip_spc_right(scr_stat *scp, int p);
static int skip_spc_left(scr_stat *scp, int p);
static void mouse_cut(scr_stat *scp);
static void mouse_cut_start(scr_stat *scp);
static void mouse_cut_end(scr_stat *scp);
static void mouse_cut_word(scr_stat *scp);
static void mouse_cut_line(scr_stat *scp);
static void mouse_cut_extend(scr_stat *scp);
#endif /* SC_NO_CUTPASTE */
#ifndef SC_NO_CUTPASTE
/* allocate a cut buffer */
void
sc_alloc_cut_buffer(scr_stat *scp, int wait)
{
u_char *p;
if ((cut_buffer == NULL)
|| (cut_buffer_size < scp->xsize * scp->ysize + 1)) {
p = cut_buffer;
cut_buffer = NULL;
if (p != NULL)
free(p, M_DEVBUF);
cut_buffer_size = scp->xsize * scp->ysize + 1;
p = (u_char *)malloc(cut_buffer_size,
M_DEVBUF, (wait) ? M_WAITOK : M_NOWAIT);
if (p != NULL)
p[0] = '\0';
cut_buffer = p;
}
}
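/*
 * Sizing example for the buffer above, assuming a standard 80x25 text
 * screen: cut_buffer_size = 80 * 25 + 1 = 2001 bytes, one byte per
 * character cell plus a terminating '\0'.
 */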
#endif /* SC_NO_CUTPASTE */
static void
sc_mouse_input_button(scr_stat *scp, int button)
{
char mouseb[6] = "\x1B[M";
mouseb[3] = ' ' + button;
mouseb[4] = '!' + scp->mouse_pos % scp->xsize;
mouseb[5] = '!' + scp->mouse_pos / scp->xsize;
sc_respond(scp, mouseb, sizeof mouseb, 1);
}
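/*
 * Byte-level example of the report built above (the encoding resembles
 * the xterm "ESC [ M Cb Cx Cy" mouse sequence; positions here are
 * 0-based): pressing button 0 with the pointer at column 5, row 2 of
 * an 80-column screen (mouse_pos = 2 * 80 + 5 = 165) sends the bytes
 * 0x1b '[' 'M' ' ' '&' '#', since ' ' + 0 = ' ', '!' + 5 = '&' and
 * '!' + 2 = '#'.
 */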
static void
sc_mouse_input(scr_stat *scp, mouse_info_t *mouse)
{
switch (mouse->operation) {
case MOUSE_BUTTON_EVENT:
if (mouse->u.event.value > 0) {
/* Mouse button pressed. */
if (mouse->u.event.id & MOUSE_BUTTON1DOWN)
sc_mouse_input_button(scp, 0);
if (mouse->u.event.id & MOUSE_BUTTON2DOWN)
sc_mouse_input_button(scp, 1);
if (mouse->u.event.id & MOUSE_BUTTON3DOWN)
sc_mouse_input_button(scp, 2);
} else {
/* Mouse button released. */
sc_mouse_input_button(scp, 3);
}
break;
case MOUSE_MOTION_EVENT:
if (mouse->u.data.z < 0) {
/* Scroll up. */
sc_mouse_input_button(scp, 64);
} else if (mouse->u.data.z > 0) {
/* Scroll down. */
sc_mouse_input_button(scp, 65);
}
break;
}
}
/* move mouse */
void
sc_mouse_move(scr_stat *scp, int x, int y)
{
int s;
s = spltty();
scp->mouse_xpos = scp->mouse_oldxpos = x;
scp->mouse_ypos = scp->mouse_oldypos = y;
if (scp->font_size <= 0 || scp->font_width <= 0)
scp->mouse_pos = scp->mouse_oldpos = 0;
else
scp->mouse_pos = scp->mouse_oldpos =
(y/scp->font_size - scp->yoff)*scp->xsize + x/scp->font_width -
scp->xoff;
scp->status |= MOUSE_MOVED;
splx(s);
}
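/*
 * Worked example of the cell arithmetic above, assuming an 80x25 text
 * screen with an 8x16 font and xoff = yoff = 0: a pointer at pixel
 * (x, y) = (400, 160) lands in character column 400 / 8 = 50 and row
 * 160 / 16 = 10, giving mouse_pos = 10 * 80 + 50 = 850.
 */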
/* adjust mouse position */
static void
set_mouse_pos(scr_stat *scp)
{
if (scp->mouse_xpos < scp->xoff*scp->font_width)
scp->mouse_xpos = scp->xoff*scp->font_width;
if (scp->mouse_ypos < scp->yoff*scp->font_size)
scp->mouse_ypos = scp->yoff*scp->font_size;
if (ISGRAPHSC(scp)) {
if (scp->mouse_xpos > scp->xpixel-1)
scp->mouse_xpos = scp->xpixel-1;
if (scp->mouse_ypos > scp->ypixel-1)
scp->mouse_ypos = scp->ypixel-1;
return;
} else {
if (scp->mouse_xpos > (scp->xsize + scp->xoff)*scp->font_width - 1)
scp->mouse_xpos = (scp->xsize + scp->xoff)*scp->font_width - 1;
if (scp->mouse_ypos > (scp->ysize + scp->yoff)*scp->font_size - 1)
scp->mouse_ypos = (scp->ysize + scp->yoff)*scp->font_size - 1;
}
if ((scp->mouse_xpos != scp->mouse_oldxpos || scp->mouse_ypos != scp->mouse_oldypos)
&& (scp->font_size != 0 && scp->font_width != 0)) {
scp->status |= MOUSE_MOVED;
scp->mouse_pos =
(scp->mouse_ypos/scp->font_size - scp->yoff)*scp->xsize
+ scp->mouse_xpos/scp->font_width - scp->xoff;
#ifndef SC_NO_CUTPASTE
if ((scp->status & MOUSE_VISIBLE) && (scp->status & MOUSE_CUTTING))
mouse_cut(scp);
#endif
}
}
#ifndef SC_NO_CUTPASTE
void
sc_draw_mouse_image(scr_stat *scp)
{
if (ISGRAPHSC(scp))
return;
SC_VIDEO_LOCK(scp->sc);
(*scp->rndr->draw_mouse)(scp, scp->mouse_xpos, scp->mouse_ypos, TRUE);
scp->mouse_oldpos = scp->mouse_pos;
scp->mouse_oldxpos = scp->mouse_xpos;
scp->mouse_oldypos = scp->mouse_ypos;
scp->status |= MOUSE_VISIBLE;
SC_VIDEO_UNLOCK(scp->sc);
}
void
sc_remove_mouse_image(scr_stat *scp)
{
int size;
int i;
if (ISGRAPHSC(scp))
return;
SC_VIDEO_LOCK(scp->sc);
(*scp->rndr->draw_mouse)(scp,
(scp->mouse_oldpos%scp->xsize + scp->xoff)
* scp->font_width,
(scp->mouse_oldpos/scp->xsize + scp->yoff)
* scp->font_size,
FALSE);
size = scp->xsize*scp->ysize;
i = scp->mouse_oldpos;
mark_for_update(scp, i);
mark_for_update(scp, i);
#ifndef PC98
if (i + scp->xsize + 1 < size) {
mark_for_update(scp, i + scp->xsize + 1);
} else if (i + scp->xsize < size) {
mark_for_update(scp, i + scp->xsize);
} else if (i + 1 < size) {
mark_for_update(scp, i + 1);
}
#endif /* PC98 */
scp->status &= ~MOUSE_VISIBLE;
SC_VIDEO_UNLOCK(scp->sc);
}
int
sc_inside_cutmark(scr_stat *scp, int pos)
{
int start;
int end;
if (scp->mouse_cut_end < 0)
return FALSE;
if (scp->mouse_cut_start <= scp->mouse_cut_end) {
start = scp->mouse_cut_start;
end = scp->mouse_cut_end;
} else {
start = scp->mouse_cut_end;
end = scp->mouse_cut_start - 1;
}
return ((start <= pos) && (pos <= end));
}
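/*
 * Example of the normalization above: a selection dragged backwards so
 * that mouse_cut_start = 100 and mouse_cut_end = 80 is treated as the
 * range [80, 99]; position 90 is therefore reported as marked and
 * position 100 is not.
 */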
void
sc_remove_cutmarking(scr_stat *scp)
{
int s;
s = spltty();
if (scp->mouse_cut_end >= 0) {
mark_for_update(scp, scp->mouse_cut_start);
mark_for_update(scp, scp->mouse_cut_end);
}
scp->mouse_cut_start = scp->xsize*scp->ysize;
scp->mouse_cut_end = -1;
splx(s);
scp->status &= ~MOUSE_CUTTING;
}
void
sc_remove_all_cutmarkings(sc_softc_t *sc)
{
scr_stat *scp;
int i;
/* delete cut markings in all vtys */
for (i = 0; i < sc->vtys; ++i) {
scp = SC_STAT(sc->dev[i]);
if (scp == NULL)
continue;
sc_remove_cutmarking(scp);
}
}
void
sc_remove_all_mouse(sc_softc_t *sc)
{
scr_stat *scp;
int i;
for (i = 0; i < sc->vtys; ++i) {
scp = SC_STAT(sc->dev[i]);
if (scp == NULL)
continue;
if (scp->status & MOUSE_VISIBLE) {
scp->status &= ~MOUSE_VISIBLE;
mark_all(scp);
}
}
}
#define IS_SPACE_CHAR(c) (((c) & 0xff) == ' ')
#ifdef SC_CUT_SPACES2TABS
#define IS_BLANK_CHAR(c) (((c) & 0xff) == ' ' || ((c) & 0xff) == '\t')
#else
#define IS_BLANK_CHAR(c) IS_SPACE_CHAR(c)
#endif /* SC_CUT_SPACES2TABS */
#ifdef SC_CUT_SEPCHARS
#define IS_SEP_CHAR(c) (index(SC_CUT_SEPCHARS, (c) & 0xff) != NULL)
#else
#define IS_SEP_CHAR(c) IS_SPACE_CHAR(c)
#endif /* SC_CUT_SEPCHARS */
/* skip spaces to right */
static int
skip_spc_right(scr_stat *scp, int p)
{
int c;
int i;
for (i = p % scp->xsize; i < scp->xsize; ++i) {
c = sc_vtb_getc(&scp->vtb, p);
if (!IS_SPACE_CHAR(c))
break;
++p;
}
return i;
}
/* skip spaces to left */
static int
skip_spc_left(scr_stat *scp, int p)
{
int c;
int i;
for (i = p-- % scp->xsize - 1; i >= 0; --i) {
c = sc_vtb_getc(&scp->vtb, p);
if (!IS_SPACE_CHAR(c))
break;
--p;
}
return i;
}
static void
mouse_do_cut(scr_stat *scp, int from, int to)
{
int blank;
int i;
int leadspaces;
int p;
int s;
for (p = from, i = blank = leadspaces = 0; p <= to; ++p) {
cut_buffer[i] = sc_vtb_getc(&scp->vtb, p);
/* Be prepared that sc_vtb_getc() can return '\0' */
if (cut_buffer[i] == '\0')
cut_buffer[i] = ' ';
#ifdef SC_CUT_SPACES2TABS
if (leadspaces != -1) {
if (IS_SPACE_CHAR(cut_buffer[i])) {
leadspaces++;
/* Check that we are at tabstop position */
if ((p % scp->xsize) % 8 == 7) {
i -= leadspaces - 1;
cut_buffer[i] = '\t';
leadspaces = 0;
}
} else {
leadspaces = -1;
}
}
#endif /* SC_CUT_SPACES2TABS */
/* remember the position of the last non-space char */
if (!IS_BLANK_CHAR(cut_buffer[i]))
blank = i + 1; /* the first space after the last non-space */
++i;
/* trim trailing blank when crossing lines */
if ((p % scp->xsize) == (scp->xsize - 1)) {
cut_buffer[blank++] = '\r';
i = blank;
leadspaces = 0;
}
}
cut_buffer[i] = '\0';
/* remove the current marking */
s = spltty();
if (scp->mouse_cut_start <= scp->mouse_cut_end) {
mark_for_update(scp, scp->mouse_cut_start);
mark_for_update(scp, scp->mouse_cut_end);
} else if (scp->mouse_cut_end >= 0) {
mark_for_update(scp, scp->mouse_cut_end);
mark_for_update(scp, scp->mouse_cut_start);
}
/* mark the new region */
scp->mouse_cut_start = from;
scp->mouse_cut_end = to;
mark_for_update(scp, from);
mark_for_update(scp, to);
splx(s);
}
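/*
 * Example of the space-to-tab folding above (only compiled in with
 * "options SC_CUT_SPACES2TABS"): on an 80-column screen a cut line
 * beginning with eight blanks reaches a tab stop at column 7, where
 * (p % 80) % 8 == 7 and leadspaces == 8, so the run of eight ' '
 * characters already copied collapses to a single '\t' in cut_buffer.
 */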
/* copy marked region to the cut buffer */
static void
mouse_cut(scr_stat *scp)
{
int start;
int end;
int from;
int to;
int c;
int p;
int s;
int i;
start = scp->mouse_cut_start;
end = scp->mouse_cut_end;
if (scp->mouse_pos >= start) {
from = start;
to = end = scp->mouse_pos;
} else {
from = end = scp->mouse_pos;
to = start - 1;
}
p = to;
for (i = p % scp->xsize; i < scp->xsize; ++i) {
c = sc_vtb_getc(&scp->vtb, p);
if (!IS_SPACE_CHAR(c))
break;
++p;
}
/* if there is nothing but blank chars, trim them, but mark towards eol */
if (i == scp->xsize) {
if (end >= start)
to = end = p - 1;
else
to = start = p;
}
mouse_do_cut(scp, from, to);
s = spltty();
scp->mouse_cut_start = start;
scp->mouse_cut_end = end;
splx(s);
}
/* a mouse button is pressed, start cut operation */
static void
mouse_cut_start(scr_stat *scp)
{
int i;
int s;
if (scp->status & MOUSE_VISIBLE) {
sc_remove_all_cutmarkings(scp->sc);
if ((scp->mouse_pos == scp->mouse_cut_start) &&
(scp->mouse_pos == scp->mouse_cut_end)) {
cut_buffer[0] = '\0';
return;
} else if (skip_spc_right(scp, scp->mouse_pos) >= scp->xsize) {
/* if the pointer is on trailing blank chars, mark towards eol */
i = skip_spc_left(scp, scp->mouse_pos) + 1;
s = spltty();
scp->mouse_cut_start =
(scp->mouse_pos / scp->xsize) * scp->xsize + i;
scp->mouse_cut_end =
(scp->mouse_pos / scp->xsize + 1) * scp->xsize - 1;
splx(s);
cut_buffer[0] = '\r';
} else {
s = spltty();
scp->mouse_cut_start = scp->mouse_pos;
scp->mouse_cut_end = scp->mouse_cut_start;
splx(s);
cut_buffer[0] = sc_vtb_getc(&scp->vtb, scp->mouse_cut_start);
}
cut_buffer[1] = '\0';
scp->status |= MOUSE_CUTTING;
mark_all(scp); /* this is probably overkill XXX */
}
}
/* end of cut operation */
static void
mouse_cut_end(scr_stat *scp)
{
if (scp->status & MOUSE_VISIBLE)
scp->status &= ~MOUSE_CUTTING;
}
/* copy a word under the mouse pointer */
static void
mouse_cut_word(scr_stat *scp)
{
int start;
int end;
int sol;
int eol;
int c;
int j;
int len;
/*
* Because we don't have locale information in the kernel,
 * we only distinguish between space and non-space characters.
 * Punctuation, symbols and other regular characters are all treated
 * alike unless the user specified SC_CUT_SEPCHARS in the kernel
 * config file.
*/
if (scp->status & MOUSE_VISIBLE) {
sol = (scp->mouse_pos / scp->xsize) * scp->xsize;
eol = sol + scp->xsize;
c = sc_vtb_getc(&scp->vtb, scp->mouse_pos);
if (IS_SEP_CHAR(c)) {
/* blank space */
for (j = scp->mouse_pos; j >= sol; --j) {
c = sc_vtb_getc(&scp->vtb, j);
if (!IS_SEP_CHAR(c))
break;
}
start = ++j;
for (j = scp->mouse_pos; j < eol; ++j) {
c = sc_vtb_getc(&scp->vtb, j);
if (!IS_SEP_CHAR(c))
break;
}
end = j - 1;
} else {
/* non-space word */
for (j = scp->mouse_pos; j >= sol; --j) {
c = sc_vtb_getc(&scp->vtb, j);
if (IS_SEP_CHAR(c))
break;
}
start = ++j;
for (j = scp->mouse_pos; j < eol; ++j) {
c = sc_vtb_getc(&scp->vtb, j);
if (IS_SEP_CHAR(c))
break;
}
end = j - 1;
}
/* copy the found word */
mouse_do_cut(scp, start, end);
len = strlen(cut_buffer);
if (cut_buffer[len - 1] == '\r')
cut_buffer[len - 1] = '\0';
}
}
/* copy a line under the mouse pointer */
static void
mouse_cut_line(scr_stat *scp)
{
int len;
int from;
if (scp->status & MOUSE_VISIBLE) {
from = (scp->mouse_pos / scp->xsize) * scp->xsize;
mouse_do_cut(scp, from, from + scp->xsize - 1);
len = strlen(cut_buffer);
if (cut_buffer[len - 1] == '\r')
cut_buffer[len - 1] = '\0';
scp->status |= MOUSE_CUTTING;
}
}
/* extend the marked region to the mouse pointer position */
static void
mouse_cut_extend(scr_stat *scp)
{
int start;
int end;
int s;
if ((scp->status & MOUSE_VISIBLE) && !(scp->status & MOUSE_CUTTING)
&& (scp->mouse_cut_end >= 0)) {
if (scp->mouse_cut_start <= scp->mouse_cut_end) {
start = scp->mouse_cut_start;
end = scp->mouse_cut_end;
} else {
start = scp->mouse_cut_end;
end = scp->mouse_cut_start - 1;
}
s = spltty();
if (scp->mouse_pos > end) {
scp->mouse_cut_start = start;
scp->mouse_cut_end = end;
} else if (scp->mouse_pos < start) {
scp->mouse_cut_start = end + 1;
scp->mouse_cut_end = start;
} else {
if (scp->mouse_pos - start > end + 1 - scp->mouse_pos) {
scp->mouse_cut_start = start;
scp->mouse_cut_end = end;
} else {
scp->mouse_cut_start = end + 1;
scp->mouse_cut_end = start;
}
}
splx(s);
mouse_cut(scp);
scp->status |= MOUSE_CUTTING;
}
}
/* paste cut buffer contents into the current vty */
void
sc_mouse_paste(scr_stat *scp)
{
sc_paste(scp, cut_buffer, strlen(cut_buffer));
}
#endif /* SC_NO_CUTPASTE */
int
sc_mouse_ioctl(struct tty *tp, u_long cmd, caddr_t data, struct thread *td)
{
mouse_info_t *mouse;
mouse_info_t buf;
scr_stat *cur_scp;
scr_stat *scp;
struct proc *p1;
int s;
int f;
scp = SC_STAT(tp);
switch (cmd) {
case CONS_MOUSECTL: /* control mouse arrow */
case OLD_CONS_MOUSECTL:
mouse = (mouse_info_t*)data;
random_harvest(mouse, sizeof(mouse_info_t), 2, 0, RANDOM_MOUSE);
if (cmd == OLD_CONS_MOUSECTL) {
static u_char swapb[] = { 0, 4, 2, 6, 1, 5, 3, 7 };
old_mouse_info_t *old_mouse = (old_mouse_info_t *)data;
mouse = &buf;
mouse->operation = old_mouse->operation;
switch (mouse->operation) {
case MOUSE_MODE:
mouse->u.mode = old_mouse->u.mode;
break;
case MOUSE_SHOW:
case MOUSE_HIDE:
break;
case MOUSE_MOVEABS:
case MOUSE_MOVEREL:
case MOUSE_ACTION:
mouse->u.data.x = old_mouse->u.data.x;
mouse->u.data.y = old_mouse->u.data.y;
mouse->u.data.z = 0;
mouse->u.data.buttons = swapb[old_mouse->u.data.buttons & 0x7];
break;
case MOUSE_GETINFO:
old_mouse->u.data.x = scp->mouse_xpos;
old_mouse->u.data.y = scp->mouse_ypos;
old_mouse->u.data.buttons = swapb[scp->mouse_buttons & 0x7];
return 0;
default:
return EINVAL;
}
}
cur_scp = scp->sc->cur_scp;
switch (mouse->operation) {
case MOUSE_MODE:
if (ISSIGVALID(mouse->u.mode.signal)) {
scp->mouse_signal = mouse->u.mode.signal;
scp->mouse_proc = td->td_proc;
scp->mouse_pid = td->td_proc->p_pid;
}
else {
scp->mouse_signal = 0;
scp->mouse_proc = NULL;
scp->mouse_pid = 0;
}
return 0;
case MOUSE_SHOW:
s = spltty();
if (!(scp->sc->flags & SC_MOUSE_ENABLED)) {
scp->sc->flags |= SC_MOUSE_ENABLED;
cur_scp->status &= ~MOUSE_HIDDEN;
if (!ISGRAPHSC(cur_scp))
mark_all(cur_scp);
}
splx(s);
return 0;
/* NOTREACHED */
case MOUSE_HIDE:
s = spltty();
if (scp->sc->flags & SC_MOUSE_ENABLED) {
scp->sc->flags &= ~SC_MOUSE_ENABLED;
sc_remove_all_mouse(scp->sc);
}
splx(s);
return 0;
/* NOTREACHED */
case MOUSE_MOVEABS:
s = spltty();
scp->mouse_xpos = mouse->u.data.x;
scp->mouse_ypos = mouse->u.data.y;
set_mouse_pos(scp);
splx(s);
break;
case MOUSE_MOVEREL:
s = spltty();
scp->mouse_xpos += mouse->u.data.x;
scp->mouse_ypos += mouse->u.data.y;
set_mouse_pos(scp);
splx(s);
break;
case MOUSE_GETINFO:
mouse->u.data.x = scp->mouse_xpos;
mouse->u.data.y = scp->mouse_ypos;
mouse->u.data.z = 0;
mouse->u.data.buttons = scp->mouse_buttons;
return 0;
case MOUSE_ACTION:
case MOUSE_MOTION_EVENT:
/* send out mouse event on /dev/sysmouse */
#if 0
/* this should maybe only be settable from /dev/consolectl SOS */
if (SC_VTY(tp->t_dev) != SC_CONSOLECTL)
return ENOTTY;
#endif
s = spltty();
if (mouse->u.data.x != 0 || mouse->u.data.y != 0) {
cur_scp->mouse_xpos += mouse->u.data.x;
cur_scp->mouse_ypos += mouse->u.data.y;
set_mouse_pos(cur_scp);
}
f = 0;
if (mouse->operation == MOUSE_ACTION) {
f = cur_scp->mouse_buttons ^ mouse->u.data.buttons;
cur_scp->mouse_buttons = mouse->u.data.buttons;
}
splx(s);
if (sysmouse_event(mouse) == 0)
return 0;
/*
* If any buttons are down or the mouse has moved a lot,
* stop the screen saver.
*/
if (((mouse->operation == MOUSE_ACTION) && mouse->u.data.buttons)
|| (mouse->u.data.x*mouse->u.data.x
+ mouse->u.data.y*mouse->u.data.y
>= SC_WAKEUP_DELTA*SC_WAKEUP_DELTA)) {
sc_touch_scrn_saver();
}
cur_scp->status &= ~MOUSE_HIDDEN;
if (cur_scp->mouse_level > 0) {
sc_mouse_input(scp, mouse);
break;
}
if (cur_scp->mouse_signal && cur_scp->mouse_proc) {
/* has controlling process died? */
if (cur_scp->mouse_proc != (p1 = pfind(cur_scp->mouse_pid))) {
cur_scp->mouse_signal = 0;
cur_scp->mouse_proc = NULL;
cur_scp->mouse_pid = 0;
if (p1)
PROC_UNLOCK(p1);
} else {
- psignal(cur_scp->mouse_proc, cur_scp->mouse_signal);
+ kern_psignal(cur_scp->mouse_proc, cur_scp->mouse_signal);
PROC_UNLOCK(cur_scp->mouse_proc);
break;
}
}
#ifndef SC_NO_CUTPASTE
if (ISGRAPHSC(cur_scp) || (cut_buffer == NULL))
break;
if ((mouse->operation == MOUSE_ACTION) && f) {
/* process button presses */
if (cur_scp->mouse_buttons & MOUSE_BUTTON1DOWN)
mouse_cut_start(cur_scp);
else
mouse_cut_end(cur_scp);
if (cur_scp->mouse_buttons & MOUSE_BUTTON2DOWN ||
cur_scp->mouse_buttons & MOUSE_BUTTON3DOWN)
sc_mouse_paste(cur_scp);
}
#endif /* SC_NO_CUTPASTE */
break;
case MOUSE_BUTTON_EVENT:
if ((mouse->u.event.id & MOUSE_BUTTONS) == 0)
return EINVAL;
if (mouse->u.event.value < 0)
return EINVAL;
#if 0
/* this should maybe only be settable from /dev/consolectl SOS */
if (SC_VTY(tp->t_dev) != SC_CONSOLECTL)
return ENOTTY;
#endif
if (mouse->u.event.value > 0)
cur_scp->mouse_buttons |= mouse->u.event.id;
else
cur_scp->mouse_buttons &= ~mouse->u.event.id;
if (sysmouse_event(mouse) == 0)
return 0;
/* if a button is held down, stop the screen saver */
if (mouse->u.event.value > 0)
sc_touch_scrn_saver();
cur_scp->status &= ~MOUSE_HIDDEN;
if (cur_scp->mouse_level > 0) {
sc_mouse_input(scp, mouse);
break;
}
if (cur_scp->mouse_signal && cur_scp->mouse_proc) {
if (cur_scp->mouse_proc != (p1 = pfind(cur_scp->mouse_pid))){
cur_scp->mouse_signal = 0;
cur_scp->mouse_proc = NULL;
cur_scp->mouse_pid = 0;
if (p1)
PROC_UNLOCK(p1);
} else {
- psignal(cur_scp->mouse_proc, cur_scp->mouse_signal);
+ kern_psignal(cur_scp->mouse_proc, cur_scp->mouse_signal);
PROC_UNLOCK(cur_scp->mouse_proc);
break;
}
}
#ifndef SC_NO_CUTPASTE
if (ISGRAPHSC(cur_scp) || (cut_buffer == NULL))
break;
switch (mouse->u.event.id) {
case MOUSE_BUTTON1DOWN:
switch (mouse->u.event.value % 4) {
case 0: /* up */
mouse_cut_end(cur_scp);
break;
case 1: /* single click: start cut operation */
mouse_cut_start(cur_scp);
break;
case 2: /* double click: cut a word */
mouse_cut_word(cur_scp);
mouse_cut_end(cur_scp);
break;
case 3: /* triple click: cut a line */
mouse_cut_line(cur_scp);
mouse_cut_end(cur_scp);
break;
}
break;
case SC_MOUSE_PASTEBUTTON:
switch (mouse->u.event.value) {
case 0: /* up */
break;
default:
sc_mouse_paste(cur_scp);
break;
}
break;
case SC_MOUSE_EXTENDBUTTON:
switch (mouse->u.event.value) {
case 0: /* up */
if (!(cur_scp->mouse_buttons & MOUSE_BUTTON1DOWN))
mouse_cut_end(cur_scp);
break;
default:
mouse_cut_extend(cur_scp);
break;
}
break;
}
#endif /* SC_NO_CUTPASTE */
break;
case MOUSE_MOUSECHAR:
if (mouse->u.mouse_char < 0) {
mouse->u.mouse_char = scp->sc->mouse_char;
} else {
if (mouse->u.mouse_char > UCHAR_MAX - 3)
return EINVAL;
s = spltty();
sc_remove_all_mouse(scp->sc);
#ifndef SC_NO_FONT_LOADING
if (ISTEXTSC(cur_scp) && (cur_scp->font != NULL))
sc_load_font(cur_scp, 0, cur_scp->font_size,
cur_scp->font_width,
cur_scp->font + cur_scp->font_size
* cur_scp->sc->mouse_char,
cur_scp->sc->mouse_char, 4);
#endif
scp->sc->mouse_char = mouse->u.mouse_char;
splx(s);
}
break;
default:
return EINVAL;
}
return 0;
}
return ENOIOCTL;
}
#endif /* SC_NO_SYSMOUSE */
Index: head/sys/dev/syscons/syscons.c
===================================================================
--- head/sys/dev/syscons/syscons.c (revision 225616)
+++ head/sys/dev/syscons/syscons.c (revision 225617)
@@ -1,3862 +1,3862 @@
/*-
* Copyright (c) 1992-1998 Søren Schmidt
* All rights reserved.
*
* This code is derived from software contributed to The DragonFly Project
* by Sascha Wildner <saw@online.de>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer,
* without modification, immediately at the beginning of the file.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_syscons.h"
#include "opt_splash.h"
#include "opt_ddb.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/cons.h>
#include <sys/consio.h>
#include <sys/kdb.h>
#include <sys/eventhandler.h>
#include <sys/fbio.h>
#include <sys/kbio.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/random.h>
#include <sys/reboot.h>
#include <sys/serial.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/tty.h>
#include <sys/power.h>
#include <machine/clock.h>
#if defined(__sparc64__) || defined(__powerpc__)
#include <machine/sc_machdep.h>
#else
#include <machine/pc/display.h>
#endif
#if defined( __i386__) || defined(__amd64__)
#include <machine/psl.h>
#include <machine/frame.h>
#endif
#include <machine/stdarg.h>
#include <dev/kbd/kbdreg.h>
#include <dev/fb/fbreg.h>
#include <dev/fb/splashreg.h>
#include <dev/syscons/syscons.h>
#define COLD 0
#define WARM 1
#define DEFAULT_BLANKTIME (5*60) /* 5 minutes */
#define MAX_BLANKTIME (7*24*60*60) /* 7 days!? */
#define KEYCODE_BS 0x0e /* "<-- Backspace" key, XXX */
typedef struct default_attr {
int std_color; /* normal hardware color */
int rev_color; /* reverse hardware color */
} default_attr;
static default_attr user_default = {
SC_NORM_ATTR,
SC_NORM_REV_ATTR,
};
static int sc_console_unit = -1;
static int sc_saver_keyb_only = 1;
static scr_stat *sc_console;
static struct consdev *sc_consptr;
static scr_stat main_console;
static struct tty *main_devs[MAXCONS];
static char init_done = COLD;
static int shutdown_in_progress = FALSE;
static int suspend_in_progress = FALSE;
static char sc_malloc = FALSE;
static int saver_mode = CONS_NO_SAVER; /* LKM/user saver */
static int run_scrn_saver = FALSE; /* should run the saver? */
static int enable_bell = TRUE; /* enable beeper */
#ifndef SC_DISABLE_REBOOT
static int enable_reboot = TRUE; /* enable keyboard reboot */
#endif
#ifndef SC_DISABLE_KDBKEY
static int enable_kdbkey = TRUE; /* enable keyboard debug */
#endif
static long scrn_blank_time = 0; /* screen saver timeout value */
#ifdef DEV_SPLASH
static int scrn_blanked; /* # of blanked screen */
static int sticky_splash = FALSE;
static void none_saver(sc_softc_t *sc, int blank) { }
static void (*current_saver)(sc_softc_t *, int) = none_saver;
#endif
#ifdef SC_NO_SUSPEND_VTYSWITCH
static int sc_no_suspend_vtswitch = 1;
#else
static int sc_no_suspend_vtswitch = 0;
#endif
static int sc_susp_scr;
SYSCTL_NODE(_hw, OID_AUTO, syscons, CTLFLAG_RD, 0, "syscons");
SYSCTL_NODE(_hw_syscons, OID_AUTO, saver, CTLFLAG_RD, 0, "saver");
SYSCTL_INT(_hw_syscons_saver, OID_AUTO, keybonly, CTLFLAG_RW,
&sc_saver_keyb_only, 0, "screen saver interrupted by input only");
SYSCTL_INT(_hw_syscons, OID_AUTO, bell, CTLFLAG_RW, &enable_bell,
0, "enable bell");
#ifndef SC_DISABLE_REBOOT
SYSCTL_INT(_hw_syscons, OID_AUTO, kbd_reboot, CTLFLAG_RW|CTLFLAG_SECURE, &enable_reboot,
0, "enable keyboard reboot");
#endif
#ifndef SC_DISABLE_KDBKEY
SYSCTL_INT(_hw_syscons, OID_AUTO, kbd_debug, CTLFLAG_RW|CTLFLAG_SECURE, &enable_kdbkey,
0, "enable keyboard debug");
#endif
TUNABLE_INT("hw.syscons.sc_no_suspend_vtswitch", &sc_no_suspend_vtswitch);
SYSCTL_INT(_hw_syscons, OID_AUTO, sc_no_suspend_vtswitch, CTLFLAG_RW,
&sc_no_suspend_vtswitch, 0, "Disable VT switch before suspend.");
#if !defined(SC_NO_FONT_LOADING) && defined(SC_DFLT_FONT)
#include "font.h"
#endif
tsw_ioctl_t *sc_user_ioctl;
static bios_values_t bios_value;
static int enable_panic_key;
SYSCTL_INT(_machdep, OID_AUTO, enable_panic_key, CTLFLAG_RW, &enable_panic_key,
0, "Enable panic via keypress specified in kbdmap(5)");
#define SC_CONSOLECTL 255
#define VTY_WCHAN(sc, vty) (&SC_DEV(sc, vty))
static int debugger;
/* prototypes */
static int sc_allocate_keyboard(sc_softc_t *sc, int unit);
static int scvidprobe(int unit, int flags, int cons);
static int sckbdprobe(int unit, int flags, int cons);
static void scmeminit(void *arg);
static int scdevtounit(struct tty *tp);
static kbd_callback_func_t sckbdevent;
static void scinit(int unit, int flags);
static scr_stat *sc_get_stat(struct tty *tp);
static void scterm(int unit, int flags);
static void scshutdown(void *, int);
static void scsuspend(void *);
static void scresume(void *);
static u_int scgetc(sc_softc_t *sc, u_int flags);
#define SCGETC_CN 1
#define SCGETC_NONBLOCK 2
static void sccnupdate(scr_stat *scp);
static scr_stat *alloc_scp(sc_softc_t *sc, int vty);
static void init_scp(sc_softc_t *sc, int vty, scr_stat *scp);
static timeout_t scrn_timer;
static int and_region(int *s1, int *e1, int s2, int e2);
static void scrn_update(scr_stat *scp, int show_cursor);
#ifdef DEV_SPLASH
static int scsplash_callback(int event, void *arg);
static void scsplash_saver(sc_softc_t *sc, int show);
static int add_scrn_saver(void (*this_saver)(sc_softc_t *, int));
static int remove_scrn_saver(void (*this_saver)(sc_softc_t *, int));
static int set_scrn_saver_mode(scr_stat *scp, int mode, u_char *pal, int border);
static int restore_scrn_saver_mode(scr_stat *scp, int changemode);
static void stop_scrn_saver(sc_softc_t *sc, void (*saver)(sc_softc_t *, int));
static int wait_scrn_saver_stop(sc_softc_t *sc);
#define scsplash_stick(stick) (sticky_splash = (stick))
#else /* !DEV_SPLASH */
#define scsplash_stick(stick)
#endif /* DEV_SPLASH */
static int do_switch_scr(sc_softc_t *sc, int s);
static int vt_proc_alive(scr_stat *scp);
static int signal_vt_rel(scr_stat *scp);
static int signal_vt_acq(scr_stat *scp);
static int finish_vt_rel(scr_stat *scp, int release, int *s);
static int finish_vt_acq(scr_stat *scp);
static void exchange_scr(sc_softc_t *sc);
static void update_cursor_image(scr_stat *scp);
static void change_cursor_shape(scr_stat *scp, int flags, int base, int height);
static int save_kbd_state(scr_stat *scp);
static int update_kbd_state(scr_stat *scp, int state, int mask);
static int update_kbd_leds(scr_stat *scp, int which);
static timeout_t blink_screen;
static struct tty *sc_alloc_tty(int, int);
static cn_probe_t sc_cnprobe;
static cn_init_t sc_cninit;
static cn_term_t sc_cnterm;
static cn_getc_t sc_cngetc;
static cn_putc_t sc_cnputc;
CONSOLE_DRIVER(sc);
static tsw_open_t sctty_open;
static tsw_close_t sctty_close;
static tsw_outwakeup_t sctty_outwakeup;
static tsw_ioctl_t sctty_ioctl;
static tsw_mmap_t sctty_mmap;
static struct ttydevsw sc_ttydevsw = {
.tsw_open = sctty_open,
.tsw_close = sctty_close,
.tsw_outwakeup = sctty_outwakeup,
.tsw_ioctl = sctty_ioctl,
.tsw_mmap = sctty_mmap,
};
static d_ioctl_t consolectl_ioctl;
static struct cdevsw consolectl_devsw = {
.d_version = D_VERSION,
.d_flags = D_NEEDGIANT,
.d_ioctl = consolectl_ioctl,
.d_name = "consolectl",
};
int
sc_probe_unit(int unit, int flags)
{
if (!scvidprobe(unit, flags, FALSE)) {
if (bootverbose)
printf("%s%d: no video adapter found.\n", SC_DRIVER_NAME, unit);
return ENXIO;
}
/* syscons will be attached even when there is no keyboard */
sckbdprobe(unit, flags, FALSE);
return 0;
}
/* probe video adapters, return TRUE if found */
static int
scvidprobe(int unit, int flags, int cons)
{
/*
* Access the video adapter driver through the back door!
* Video adapter drivers need to be configured before syscons.
* However, when syscons is being probed as the low-level console,
* they have not been initialized yet. We force them to initialize
* themselves here. XXX
*/
vid_configure(cons ? VIO_PROBE_ONLY : 0);
return (vid_find_adapter("*", unit) >= 0);
}
/* probe the keyboard, return TRUE if found */
static int
sckbdprobe(int unit, int flags, int cons)
{
/* access the keyboard driver through the backdoor! */
kbd_configure(cons ? KB_CONF_PROBE_ONLY : 0);
return (kbd_find_keyboard("*", unit) >= 0);
}
static char
*adapter_name(video_adapter_t *adp)
{
static struct {
int type;
char *name[2];
} names[] = {
{ KD_MONO, { "MDA", "MDA" } },
{ KD_HERCULES, { "Hercules", "Hercules" } },
{ KD_CGA, { "CGA", "CGA" } },
{ KD_EGA, { "EGA", "EGA (mono)" } },
{ KD_VGA, { "VGA", "VGA (mono)" } },
{ KD_PC98, { "PC-98x1", "PC-98x1" } },
{ KD_TGA, { "TGA", "TGA" } },
{ -1, { "Unknown", "Unknown" } },
};
int i;
for (i = 0; names[i].type != -1; ++i)
if (names[i].type == adp->va_type)
break;
return names[i].name[(adp->va_flags & V_ADP_COLOR) ? 0 : 1];
}
static void
sctty_outwakeup(struct tty *tp)
{
size_t len;
u_char buf[PCBURST];
scr_stat *scp = sc_get_stat(tp);
if (scp->status & SLKED ||
(scp == scp->sc->cur_scp && scp->sc->blink_in_progress))
return;
for (;;) {
len = ttydisc_getc(tp, buf, sizeof buf);
if (len == 0)
break;
sc_puts(scp, buf, len, 0);
}
}
static struct tty *
sc_alloc_tty(int index, int devnum)
{
struct sc_ttysoftc *stc;
struct tty *tp;
/* Allocate TTY object and softc to store unit number. */
stc = malloc(sizeof(struct sc_ttysoftc), M_DEVBUF, M_WAITOK);
stc->st_index = index;
stc->st_stat = NULL;
tp = tty_alloc_mutex(&sc_ttydevsw, stc, &Giant);
/* Create device node. */
tty_makedev(tp, NULL, "v%r", devnum);
return (tp);
}
#ifdef SC_PIXEL_MODE
static void
sc_set_vesa_mode(scr_stat *scp, sc_softc_t *sc, int unit)
{
video_info_t info;
u_char *font;
int depth;
int fontsize;
int i;
int vmode;
vmode = 0;
(void)resource_int_value("sc", unit, "vesa_mode", &vmode);
if (vmode < M_VESA_BASE || vmode > M_VESA_MODE_MAX ||
vidd_get_info(sc->adp, vmode, &info) != 0 ||
!sc_support_pixel_mode(&info))
vmode = 0;
/*
* If the mode is unset or unsupported, search for an available
* 800x600 graphics mode with the highest color depth.
*/
if (vmode == 0) {
for (depth = 0, i = M_VESA_BASE; i <= M_VESA_MODE_MAX; i++)
if (vidd_get_info(sc->adp, i, &info) == 0 &&
info.vi_width == 800 && info.vi_height == 600 &&
sc_support_pixel_mode(&info) &&
info.vi_depth > depth) {
vmode = i;
depth = info.vi_depth;
}
if (vmode == 0)
return;
vidd_get_info(sc->adp, vmode, &info);
}
#if !defined(SC_NO_FONT_LOADING) && defined(SC_DFLT_FONT)
fontsize = info.vi_cheight;
#else
fontsize = scp->font_size;
#endif
if (fontsize < 14)
fontsize = 8;
else if (fontsize >= 16)
fontsize = 16;
else
fontsize = 14;
#ifndef SC_NO_FONT_LOADING
switch (fontsize) {
case 8:
if ((sc->fonts_loaded & FONT_8) == 0)
return;
font = sc->font_8;
break;
case 14:
if ((sc->fonts_loaded & FONT_14) == 0)
return;
font = sc->font_14;
break;
case 16:
if ((sc->fonts_loaded & FONT_16) == 0)
return;
font = sc->font_16;
break;
}
#else
font = NULL;
#endif
#ifdef DEV_SPLASH
if ((sc->flags & SC_SPLASH_SCRN) != 0)
splash_term(sc->adp);
#endif
#ifndef SC_NO_HISTORY
if (scp->history != NULL) {
sc_vtb_append(&scp->vtb, 0, scp->history,
scp->ypos * scp->xsize + scp->xpos);
scp->history_pos = sc_vtb_tail(scp->history);
}
#endif
vidd_set_mode(sc->adp, vmode);
scp->status |= (UNKNOWN_MODE | PIXEL_MODE | MOUSE_HIDDEN);
scp->status &= ~(GRAPHICS_MODE | MOUSE_VISIBLE);
scp->xpixel = info.vi_width;
scp->ypixel = info.vi_height;
scp->xsize = scp->xpixel / 8;
scp->ysize = scp->ypixel / fontsize;
scp->xpos = 0;
scp->ypos = scp->ysize - 1;
scp->xoff = scp->yoff = 0;
scp->font = font;
scp->font_size = fontsize;
scp->font_width = 8;
scp->start = scp->xsize * scp->ysize - 1;
scp->end = 0;
scp->cursor_pos = scp->cursor_oldpos = scp->xsize * scp->xsize;
scp->mode = sc->initial_mode = vmode;
#ifndef __sparc64__
sc_vtb_init(&scp->scr, VTB_FRAMEBUFFER, scp->xsize, scp->ysize,
(void *)sc->adp->va_window, FALSE);
#endif
sc_alloc_scr_buffer(scp, FALSE, FALSE);
sc_init_emulator(scp, NULL);
#ifndef SC_NO_CUTPASTE
sc_alloc_cut_buffer(scp, FALSE);
#endif
#ifndef SC_NO_HISTORY
sc_alloc_history_buffer(scp, 0, 0, FALSE);
#endif
sc_set_border(scp, scp->border);
sc_set_cursor_image(scp);
scp->status &= ~UNKNOWN_MODE;
#ifdef DEV_SPLASH
if ((sc->flags & SC_SPLASH_SCRN) != 0)
splash_init(sc->adp, scsplash_callback, sc);
#endif
}
#endif
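/*
 * Configuration sketch for the VESA mode selection above (the hint
 * name follows from the resource_int_value() call; the value is only
 * an example): a specific mode can be requested with a device hint
 * such as
 *     hint.sc.0.vesa_mode="0x118"
 * in device.hints(5) or loader.conf(5); if the hint is absent or the
 * mode is unusable, the search above falls back to the 800x600 mode
 * with the greatest color depth that the adapter reports.
 */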
int
sc_attach_unit(int unit, int flags)
{
sc_softc_t *sc;
scr_stat *scp;
struct cdev *dev;
int vc;
flags &= ~SC_KERNEL_CONSOLE;
if (sc_console_unit == unit) {
/*
* If this unit is being used as the system console, we need to
* adjust some variables and buffers before and after scinit().
*/
/* assert(sc_console != NULL) */
flags |= SC_KERNEL_CONSOLE;
scmeminit(NULL);
}
scinit(unit, flags);
sc = sc_get_softc(unit, flags & SC_KERNEL_CONSOLE);
sc->config = flags;
scp = sc_get_stat(sc->dev[0]);
if (sc_console == NULL) /* sc_console_unit < 0 */
sc_console = scp;
#ifdef SC_PIXEL_MODE
if ((sc->config & SC_VESAMODE) != 0)
sc_set_vesa_mode(scp, sc, unit);
#endif /* SC_PIXEL_MODE */
/* initialize cursor */
if (!ISGRAPHSC(scp))
update_cursor_image(scp);
/* get screen update going */
scrn_timer(sc);
/* set up the keyboard */
(void)kbdd_ioctl(sc->kbd, KDSKBMODE, (caddr_t)&scp->kbd_mode);
update_kbd_state(scp, scp->status, LOCK_MASK);
printf("%s%d: %s <%d virtual consoles, flags=0x%x>\n",
SC_DRIVER_NAME, unit, adapter_name(sc->adp), sc->vtys, sc->config);
if (bootverbose) {
printf("%s%d:", SC_DRIVER_NAME, unit);
if (sc->adapter >= 0)
printf(" fb%d", sc->adapter);
if (sc->keyboard >= 0)
printf(", kbd%d", sc->keyboard);
if (scp->tsw)
printf(", terminal emulator: %s (%s)",
scp->tsw->te_name, scp->tsw->te_desc);
printf("\n");
}
/* Register suspend/resume/shutdown callbacks for the kernel console. */
if (sc_console_unit == unit) {
EVENTHANDLER_REGISTER(power_suspend, scsuspend, NULL,
EVENTHANDLER_PRI_ANY);
EVENTHANDLER_REGISTER(power_resume, scresume, NULL,
EVENTHANDLER_PRI_ANY);
EVENTHANDLER_REGISTER(shutdown_pre_sync, scshutdown, NULL,
SHUTDOWN_PRI_DEFAULT);
}
for (vc = 0; vc < sc->vtys; vc++) {
if (sc->dev[vc] == NULL) {
sc->dev[vc] = sc_alloc_tty(vc, vc + unit * MAXCONS);
if (vc == 0 && sc->dev == main_devs)
SC_STAT(sc->dev[0]) = &main_console;
}
/*
* The first vty already has struct tty and scr_stat initialized
* in scinit(). The other vtys will have these structs when
* first opened.
*/
}
dev = make_dev(&consolectl_devsw, 0, UID_ROOT, GID_WHEEL, 0600,
"consolectl");
dev->si_drv1 = sc->dev[0];
return 0;
}
static void
scmeminit(void *arg)
{
if (sc_malloc)
return;
sc_malloc = TRUE;
/*
* As soon as malloc() becomes functional, we had better allocate
* various buffers for the kernel console.
*/
if (sc_console_unit < 0) /* sc_console == NULL */
return;
/* copy the temporary buffer to the final buffer */
sc_alloc_scr_buffer(sc_console, FALSE, FALSE);
#ifndef SC_NO_CUTPASTE
sc_alloc_cut_buffer(sc_console, FALSE);
#endif
#ifndef SC_NO_HISTORY
/* initialize history buffer & pointers */
sc_alloc_history_buffer(sc_console, 0, 0, FALSE);
#endif
}
/* XXX */
SYSINIT(sc_mem, SI_SUB_KMEM, SI_ORDER_ANY, scmeminit, NULL);
static int
scdevtounit(struct tty *tp)
{
int vty = SC_VTY(tp);
if (vty == SC_CONSOLECTL)
return ((sc_console != NULL) ? sc_console->sc->unit : -1);
else if ((vty < 0) || (vty >= MAXCONS*sc_max_unit()))
return -1;
else
return vty/MAXCONS;
}
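/*
 * Mapping example for the function above, assuming the default
 * MAXCONS of 16: vtys 0-15 belong to unit 0, vtys 16-31 to unit 1,
 * and so on, so vty 18 yields 18 / 16 = unit 1; the special
 * SC_CONSOLECTL vty is instead mapped to the kernel console's unit.
 */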
static int
sctty_open(struct tty *tp)
{
int unit = scdevtounit(tp);
sc_softc_t *sc;
scr_stat *scp;
#ifndef __sparc64__
keyarg_t key;
#endif
DPRINTF(5, ("scopen: dev:%s, unit:%d, vty:%d\n",
devtoname(tp->t_dev), unit, SC_VTY(tp)));
sc = sc_get_softc(unit, (sc_console_unit == unit) ? SC_KERNEL_CONSOLE : 0);
if (sc == NULL)
return ENXIO;
if (!tty_opened(tp)) {
/* Use the current setting of the <-- key as default VERASE. */
/* If the Delete key is preferable, an stty is necessary */
#ifndef __sparc64__
if (sc->kbd != NULL) {
key.keynum = KEYCODE_BS;
(void)kbdd_ioctl(sc->kbd, GIO_KEYMAPENT, (caddr_t)&key);
tp->t_termios.c_cc[VERASE] = key.key.map[0];
}
#endif
}
scp = sc_get_stat(tp);
if (scp == NULL) {
scp = SC_STAT(tp) = alloc_scp(sc, SC_VTY(tp));
if (ISGRAPHSC(scp))
sc_set_pixel_mode(scp, NULL, 0, 0, 16, 8);
}
if (!tp->t_winsize.ws_col && !tp->t_winsize.ws_row) {
tp->t_winsize.ws_col = scp->xsize;
tp->t_winsize.ws_row = scp->ysize;
}
return (0);
}
static void
sctty_close(struct tty *tp)
{
scr_stat *scp;
int s;
if (SC_VTY(tp) != SC_CONSOLECTL) {
scp = sc_get_stat(tp);
/* were we in the middle of the VT switching process? */
DPRINTF(5, ("sc%d: scclose(), ", scp->sc->unit));
s = spltty();
if ((scp == scp->sc->cur_scp) && (scp->sc->unit == sc_console_unit))
cnavailable(sc_consptr, TRUE);
if (finish_vt_rel(scp, TRUE, &s) == 0) /* force release */
DPRINTF(5, ("reset WAIT_REL, "));
if (finish_vt_acq(scp) == 0) /* force acknowledge */
DPRINTF(5, ("reset WAIT_ACQ, "));
#ifdef not_yet_done
if (scp == &main_console) {
scp->pid = 0;
scp->proc = NULL;
scp->smode.mode = VT_AUTO;
}
else {
sc_vtb_destroy(&scp->vtb);
#ifndef __sparc64__
sc_vtb_destroy(&scp->scr);
#endif
sc_free_history_buffer(scp, scp->ysize);
SC_STAT(tp) = NULL;
free(scp, M_DEVBUF);
}
#else
scp->pid = 0;
scp->proc = NULL;
scp->smode.mode = VT_AUTO;
#endif
scp->kbd_mode = K_XLATE;
if (scp == scp->sc->cur_scp)
(void)kbdd_ioctl(scp->sc->kbd, KDSKBMODE, (caddr_t)&scp->kbd_mode);
DPRINTF(5, ("done.\n"));
}
}
#if 0 /* XXX mpsafetty: fix screensaver. What about outwakeup? */
static int
scread(struct cdev *dev, struct uio *uio, int flag)
{
if (!sc_saver_keyb_only)
sc_touch_scrn_saver();
return ttyread(dev, uio, flag);
}
#endif
static int
sckbdevent(keyboard_t *thiskbd, int event, void *arg)
{
sc_softc_t *sc;
struct tty *cur_tty;
int c, error = 0;
size_t len;
const u_char *cp;
sc = (sc_softc_t *)arg;
/* assert(thiskbd == sc->kbd) */
mtx_lock(&Giant);
switch (event) {
case KBDIO_KEYINPUT:
break;
case KBDIO_UNLOADING:
sc->kbd = NULL;
sc->keyboard = -1;
kbd_release(thiskbd, (void *)&sc->keyboard);
goto done;
default:
error = EINVAL;
goto done;
}
/*
* Loop while there is still input to get from the keyboard.
 * I don't think this is necessary, and it doesn't fix
* the Xaccel-2.1 keyboard hang, but it can't hurt. XXX
*/
while ((c = scgetc(sc, SCGETC_NONBLOCK)) != NOKEY) {
cur_tty = SC_DEV(sc, sc->cur_scp->index);
if (!tty_opened(cur_tty))
continue;
if ((*sc->cur_scp->tsw->te_input)(sc->cur_scp, c, cur_tty))
continue;
switch (KEYFLAGS(c)) {
case 0x0000: /* normal key */
ttydisc_rint(cur_tty, KEYCHAR(c), 0);
break;
case FKEY: /* function key, return string */
cp = (*sc->cur_scp->tsw->te_fkeystr)(sc->cur_scp, c);
if (cp != NULL) {
ttydisc_rint_simple(cur_tty, cp, strlen(cp));
break;
}
cp = kbdd_get_fkeystr(thiskbd, KEYCHAR(c), &len);
if (cp != NULL)
ttydisc_rint_simple(cur_tty, cp, len);
break;
case MKEY: /* meta is active, prepend ESC */
ttydisc_rint(cur_tty, 0x1b, 0);
ttydisc_rint(cur_tty, KEYCHAR(c), 0);
break;
case BKEY: /* backtab fixed sequence (esc [ Z) */
ttydisc_rint_simple(cur_tty, "\x1B[Z", 3);
break;
}
ttydisc_rint_done(cur_tty);
}
sc->cur_scp->status |= MOUSE_HIDDEN;
done:
mtx_unlock(&Giant);
return (error);
}
static int
sctty_ioctl(struct tty *tp, u_long cmd, caddr_t data, struct thread *td)
{
int error;
int i;
sc_softc_t *sc;
scr_stat *scp;
int s;
#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
int ival;
#endif
/* If there is a user_ioctl function call that first */
if (sc_user_ioctl) {
error = (*sc_user_ioctl)(tp, cmd, data, td);
if (error != ENOIOCTL)
return error;
}
error = sc_vid_ioctl(tp, cmd, data, td);
if (error != ENOIOCTL)
return error;
#ifndef SC_NO_HISTORY
error = sc_hist_ioctl(tp, cmd, data, td);
if (error != ENOIOCTL)
return error;
#endif
#ifndef SC_NO_SYSMOUSE
error = sc_mouse_ioctl(tp, cmd, data, td);
if (error != ENOIOCTL)
return error;
#endif
scp = sc_get_stat(tp);
/* assert(scp != NULL) */
/* scp is sc_console, if SC_VTY(dev) == SC_CONSOLECTL. */
sc = scp->sc;
if (scp->tsw) {
error = (*scp->tsw->te_ioctl)(scp, tp, cmd, data, td);
if (error != ENOIOCTL)
return error;
}
switch (cmd) { /* process console hardware related ioctl's */
case GIO_ATTR: /* get current attributes */
/* this ioctl is not processed here, but in the terminal emulator */
return ENOTTY;
case GIO_COLOR: /* is this a color console ? */
*(int *)data = (sc->adp->va_flags & V_ADP_COLOR) ? 1 : 0;
return 0;
case CONS_BLANKTIME: /* set screen saver timeout (0 = no saver) */
if (*(int *)data < 0 || *(int *)data > MAX_BLANKTIME)
return EINVAL;
s = spltty();
scrn_blank_time = *(int *)data;
run_scrn_saver = (scrn_blank_time != 0);
splx(s);
return 0;
case CONS_CURSORTYPE: /* set cursor type (obsolete) */
s = spltty();
*(int *)data &= CONS_CURSOR_ATTRS;
sc_change_cursor_shape(scp, *(int *)data, -1, -1);
splx(s);
return 0;
case CONS_GETCURSORSHAPE: /* get cursor shape (new interface) */
if (((int *)data)[0] & CONS_LOCAL_CURSOR) {
((int *)data)[0] = scp->curr_curs_attr.flags;
((int *)data)[1] = scp->curr_curs_attr.base;
((int *)data)[2] = scp->curr_curs_attr.height;
} else {
((int *)data)[0] = sc->curs_attr.flags;
((int *)data)[1] = sc->curs_attr.base;
((int *)data)[2] = sc->curs_attr.height;
}
return 0;
case CONS_SETCURSORSHAPE: /* set cursor shape (new interface) */
s = spltty();
sc_change_cursor_shape(scp, ((int *)data)[0],
((int *)data)[1], ((int *)data)[2]);
splx(s);
return 0;
case CONS_BELLTYPE: /* set bell type sound/visual */
if ((*(int *)data) & CONS_VISUAL_BELL)
sc->flags |= SC_VISUAL_BELL;
else
sc->flags &= ~SC_VISUAL_BELL;
if ((*(int *)data) & CONS_QUIET_BELL)
sc->flags |= SC_QUIET_BELL;
else
sc->flags &= ~SC_QUIET_BELL;
return 0;
case CONS_GETINFO: /* get current (virtual) console info */
{
vid_info_t *ptr = (vid_info_t*)data;
if (ptr->size == sizeof(struct vid_info)) {
ptr->m_num = sc->cur_scp->index;
ptr->font_size = scp->font_size;
ptr->mv_col = scp->xpos;
ptr->mv_row = scp->ypos;
ptr->mv_csz = scp->xsize;
ptr->mv_rsz = scp->ysize;
ptr->mv_hsz = (scp->history != NULL) ? scp->history->vtb_rows : 0;
/*
* The following fields are filled by the terminal emulator. XXX
*
* ptr->mv_norm.fore
* ptr->mv_norm.back
* ptr->mv_rev.fore
* ptr->mv_rev.back
*/
ptr->mv_grfc.fore = 0; /* not supported */
ptr->mv_grfc.back = 0; /* not supported */
ptr->mv_ovscan = scp->border;
if (scp == sc->cur_scp)
save_kbd_state(scp);
ptr->mk_keylock = scp->status & LOCK_MASK;
return 0;
}
return EINVAL;
}
case CONS_GETVERS: /* get version number */
*(int*)data = 0x200; /* version 2.0 */
return 0;
case CONS_IDLE: /* see if the screen has been idle */
/*
* When the screen is in the GRAPHICS_MODE or UNKNOWN_MODE,
* the user process may have been writing something on the
 * screen and syscons is not aware of it. Declare that the screen
 * is NOT idle if it is in one of these modes, with one exception:
 * if a screen saver is running in graphics mode on the current
 * screen, report the screen as idle.
*/
*(int *)data = (sc->flags & SC_SCRN_IDLE)
&& (!ISGRAPHSC(sc->cur_scp)
|| (sc->cur_scp->status & SAVER_RUNNING));
return 0;
case CONS_SAVERMODE: /* set saver mode */
switch(*(int *)data) {
case CONS_NO_SAVER:
case CONS_USR_SAVER:
/* if a LKM screen saver is running, stop it first. */
scsplash_stick(FALSE);
saver_mode = *(int *)data;
s = spltty();
#ifdef DEV_SPLASH
if ((error = wait_scrn_saver_stop(NULL))) {
splx(s);
return error;
}
#endif
run_scrn_saver = TRUE;
if (saver_mode == CONS_USR_SAVER)
scp->status |= SAVER_RUNNING;
else
scp->status &= ~SAVER_RUNNING;
scsplash_stick(TRUE);
splx(s);
break;
case CONS_LKM_SAVER:
s = spltty();
if ((saver_mode == CONS_USR_SAVER) && (scp->status & SAVER_RUNNING))
scp->status &= ~SAVER_RUNNING;
saver_mode = *(int *)data;
splx(s);
break;
default:
return EINVAL;
}
return 0;
case CONS_SAVERSTART: /* immediately start/stop the screen saver */
/*
* Note that this ioctl does not guarantee the screen saver
* actually starts or stops. It merely attempts to do so...
*/
s = spltty();
run_scrn_saver = (*(int *)data != 0);
if (run_scrn_saver)
sc->scrn_time_stamp -= scrn_blank_time;
splx(s);
return 0;
case CONS_SCRSHOT: /* get a screen shot */
{
int retval, hist_rsz;
size_t lsize, csize;
vm_offset_t frbp, hstp;
unsigned lnum;
scrshot_t *ptr = (scrshot_t *)data;
void *outp = ptr->buf;
if (ptr->x < 0 || ptr->y < 0 || ptr->xsize < 0 || ptr->ysize < 0)
return EINVAL;
s = spltty();
if (ISGRAPHSC(scp)) {
splx(s);
return EOPNOTSUPP;
}
hist_rsz = (scp->history != NULL) ? scp->history->vtb_rows : 0;
if (((u_int)ptr->x + ptr->xsize) > scp->xsize ||
((u_int)ptr->y + ptr->ysize) > (scp->ysize + hist_rsz)) {
splx(s);
return EINVAL;
}
lsize = scp->xsize * sizeof(u_int16_t);
csize = ptr->xsize * sizeof(u_int16_t);
/* Pointer to the last line of framebuffer */
frbp = scp->vtb.vtb_buffer + scp->ysize * lsize + ptr->x *
sizeof(u_int16_t);
/* Pointer to the last line of target buffer */
outp = (char *)outp + ptr->ysize * csize;
/* Pointer to the last line of history buffer */
if (scp->history != NULL)
hstp = scp->history->vtb_buffer + sc_vtb_tail(scp->history) *
sizeof(u_int16_t) + ptr->x * sizeof(u_int16_t);
else
hstp = 0;
retval = 0;
for (lnum = 0; lnum < (ptr->y + ptr->ysize); lnum++) {
if (lnum < scp->ysize) {
frbp -= lsize;
} else {
hstp -= lsize;
if (hstp < scp->history->vtb_buffer)
hstp += scp->history->vtb_rows * lsize;
frbp = hstp;
}
if (lnum < ptr->y)
continue;
outp = (char *)outp - csize;
retval = copyout((void *)frbp, outp, csize);
if (retval != 0)
break;
}
splx(s);
return retval;
}
case VT_SETMODE: /* set screen switcher mode */
{
struct vt_mode *mode;
struct proc *p1;
mode = (struct vt_mode *)data;
DPRINTF(5, ("%s%d: VT_SETMODE ", SC_DRIVER_NAME, sc->unit));
if (scp->smode.mode == VT_PROCESS) {
p1 = pfind(scp->pid);
if (scp->proc == p1 && scp->proc != td->td_proc) {
if (p1)
PROC_UNLOCK(p1);
DPRINTF(5, ("error EPERM\n"));
return EPERM;
}
if (p1)
PROC_UNLOCK(p1);
}
s = spltty();
if (mode->mode == VT_AUTO) {
scp->smode.mode = VT_AUTO;
scp->proc = NULL;
scp->pid = 0;
DPRINTF(5, ("VT_AUTO, "));
if ((scp == sc->cur_scp) && (sc->unit == sc_console_unit))
cnavailable(sc_consptr, TRUE);
/* were we in the middle of the vty switching process? */
if (finish_vt_rel(scp, TRUE, &s) == 0)
DPRINTF(5, ("reset WAIT_REL, "));
if (finish_vt_acq(scp) == 0)
DPRINTF(5, ("reset WAIT_ACQ, "));
} else {
if (!ISSIGVALID(mode->relsig) || !ISSIGVALID(mode->acqsig)
|| !ISSIGVALID(mode->frsig)) {
splx(s);
DPRINTF(5, ("error EINVAL\n"));
return EINVAL;
}
DPRINTF(5, ("VT_PROCESS %d, ", td->td_proc->p_pid));
bcopy(data, &scp->smode, sizeof(struct vt_mode));
scp->proc = td->td_proc;
scp->pid = scp->proc->p_pid;
if ((scp == sc->cur_scp) && (sc->unit == sc_console_unit))
cnavailable(sc_consptr, FALSE);
}
splx(s);
DPRINTF(5, ("\n"));
return 0;
}
case VT_GETMODE: /* get screen switcher mode */
bcopy(&scp->smode, data, sizeof(struct vt_mode));
return 0;
#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
case _IO('v', 4):
ival = IOCPARM_IVAL(data);
data = (caddr_t)&ival;
/* FALLTHROUGH */
#endif
case VT_RELDISP: /* screen switcher ioctl */
s = spltty();
/*
* This must be the current vty which is in the VT_PROCESS
* switching mode...
*/
if ((scp != sc->cur_scp) || (scp->smode.mode != VT_PROCESS)) {
splx(s);
return EINVAL;
}
/* ...and this process is controlling it. */
if (scp->proc != td->td_proc) {
splx(s);
return EPERM;
}
error = EINVAL;
switch(*(int *)data) {
case VT_FALSE: /* user refuses to release screen, abort */
if ((error = finish_vt_rel(scp, FALSE, &s)) == 0)
DPRINTF(5, ("%s%d: VT_FALSE\n", SC_DRIVER_NAME, sc->unit));
break;
case VT_TRUE: /* user has released screen, go on */
if ((error = finish_vt_rel(scp, TRUE, &s)) == 0)
DPRINTF(5, ("%s%d: VT_TRUE\n", SC_DRIVER_NAME, sc->unit));
break;
case VT_ACKACQ: /* acquire acknowledged, switch completed */
if ((error = finish_vt_acq(scp)) == 0)
DPRINTF(5, ("%s%d: VT_ACKACQ\n", SC_DRIVER_NAME, sc->unit));
break;
default:
break;
}
splx(s);
return error;
case VT_OPENQRY: /* return free virtual console */
for (i = sc->first_vty; i < sc->first_vty + sc->vtys; i++) {
tp = SC_DEV(sc, i);
if (!tty_opened(tp)) {
*(int *)data = i + 1;
return 0;
}
}
return EINVAL;
#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
case _IO('v', 5):
ival = IOCPARM_IVAL(data);
data = (caddr_t)&ival;
/* FALLTHROUGH */
#endif
case VT_ACTIVATE: /* switch to screen *data */
i = (*(int *)data == 0) ? scp->index : (*(int *)data - 1);
s = spltty();
error = sc_clean_up(sc->cur_scp);
splx(s);
if (error)
return error;
error = sc_switch_scr(sc, i);
return (error);
#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
case _IO('v', 6):
ival = IOCPARM_IVAL(data);
data = (caddr_t)&ival;
/* FALLTHROUGH */
#endif
case VT_WAITACTIVE: /* wait for switch to occur */
i = (*(int *)data == 0) ? scp->index : (*(int *)data - 1);
if ((i < sc->first_vty) || (i >= sc->first_vty + sc->vtys))
return EINVAL;
if (i == sc->cur_scp->index)
return 0;
error = tsleep(VTY_WCHAN(sc, i), (PZERO + 1) | PCATCH, "waitvt", 0);
return error;
case VT_GETACTIVE: /* get active vty # */
*(int *)data = sc->cur_scp->index + 1;
return 0;
case VT_GETINDEX: /* get this vty # */
*(int *)data = scp->index + 1;
return 0;
case VT_LOCKSWITCH: /* prevent vty switching */
if ((*(int *)data) & 0x01)
sc->flags |= SC_SCRN_VTYLOCK;
else
sc->flags &= ~SC_SCRN_VTYLOCK;
return 0;
case KDENABIO: /* allow io operations */
error = priv_check(td, PRIV_IO);
if (error != 0)
return error;
error = securelevel_gt(td->td_ucred, 0);
if (error != 0)
return error;
#ifdef __i386__
td->td_frame->tf_eflags |= PSL_IOPL;
#elif defined(__amd64__)
td->td_frame->tf_rflags |= PSL_IOPL;
#endif
return 0;
case KDDISABIO: /* disallow io operations (default) */
#ifdef __i386__
td->td_frame->tf_eflags &= ~PSL_IOPL;
#elif defined(__amd64__)
td->td_frame->tf_rflags &= ~PSL_IOPL;
#endif
return 0;
#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
case _IO('K', 20):
ival = IOCPARM_IVAL(data);
data = (caddr_t)&ival;
/* FALLTHROUGH */
#endif
case KDSKBSTATE: /* set keyboard state (locks) */
if (*(int *)data & ~LOCK_MASK)
return EINVAL;
scp->status &= ~LOCK_MASK;
scp->status |= *(int *)data;
if (scp == sc->cur_scp)
update_kbd_state(scp, scp->status, LOCK_MASK);
return 0;
case KDGKBSTATE: /* get keyboard state (locks) */
if (scp == sc->cur_scp)
save_kbd_state(scp);
*(int *)data = scp->status & LOCK_MASK;
return 0;
case KDGETREPEAT: /* get keyboard repeat & delay rates */
case KDSETREPEAT: /* set keyboard repeat & delay rates (new) */
error = kbdd_ioctl(sc->kbd, cmd, data);
if (error == ENOIOCTL)
error = ENODEV;
return error;
#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
case _IO('K', 67):
ival = IOCPARM_IVAL(data);
data = (caddr_t)&ival;
/* FALLTHROUGH */
#endif
case KDSETRAD: /* set keyboard repeat & delay rates (old) */
if (*(int *)data & ~0x7f)
return EINVAL;
error = kbdd_ioctl(sc->kbd, KDSETRAD, data);
if (error == ENOIOCTL)
error = ENODEV;
return error;
#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
case _IO('K', 7):
ival = IOCPARM_IVAL(data);
data = (caddr_t)&ival;
/* FALLTHROUGH */
#endif
case KDSKBMODE: /* set keyboard mode */
switch (*(int *)data) {
case K_XLATE: /* switch to XLT ascii mode */
case K_RAW: /* switch to RAW scancode mode */
case K_CODE: /* switch to CODE mode */
scp->kbd_mode = *(int *)data;
if (scp == sc->cur_scp)
(void)kbdd_ioctl(sc->kbd, KDSKBMODE, data);
return 0;
default:
return EINVAL;
}
/* NOT REACHED */
case KDGKBMODE: /* get keyboard mode */
*(int *)data = scp->kbd_mode;
return 0;
case KDGKBINFO:
error = kbdd_ioctl(sc->kbd, cmd, data);
if (error == ENOIOCTL)
error = ENODEV;
return error;
#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
case _IO('K', 8):
ival = IOCPARM_IVAL(data);
data = (caddr_t)&ival;
/* FALLTHROUGH */
#endif
case KDMKTONE: /* sound the bell */
if (*(int*)data)
sc_bell(scp, (*(int*)data)&0xffff,
(((*(int*)data)>>16)&0xffff)*hz/1000);
else
sc_bell(scp, scp->bell_pitch, scp->bell_duration);
return 0;
#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
case _IO('K', 63):
ival = IOCPARM_IVAL(data);
data = (caddr_t)&ival;
/* FALLTHROUGH */
#endif
case KIOCSOUND: /* make tone (*data) hz */
if (scp == sc->cur_scp) {
if (*(int *)data)
return sc_tone(*(int *)data);
else
return sc_tone(0);
}
return 0;
case KDGKBTYPE: /* get keyboard type */
error = kbdd_ioctl(sc->kbd, cmd, data);
if (error == ENOIOCTL) {
/* always return something? XXX */
*(int *)data = 0;
}
return 0;
#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
case _IO('K', 66):
ival = IOCPARM_IVAL(data);
data = (caddr_t)&ival;
/* FALLTHROUGH */
#endif
case KDSETLED: /* set keyboard LED status */
if (*(int *)data & ~LED_MASK) /* FIXME: LOCK_MASK? */
return EINVAL;
scp->status &= ~LED_MASK;
scp->status |= *(int *)data;
if (scp == sc->cur_scp)
update_kbd_leds(scp, scp->status);
return 0;
case KDGETLED: /* get keyboard LED status */
if (scp == sc->cur_scp)
save_kbd_state(scp);
*(int *)data = scp->status & LED_MASK;
return 0;
case KBADDKBD: /* add/remove keyboard to/from mux */
case KBRELKBD:
error = kbdd_ioctl(sc->kbd, cmd, data);
if (error == ENOIOCTL)
error = ENODEV;
return error;
#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
case _IO('c', 110):
ival = IOCPARM_IVAL(data);
data = (caddr_t)&ival;
/* FALLTHROUGH */
#endif
case CONS_SETKBD: /* set the new keyboard */
{
keyboard_t *newkbd;
s = spltty();
newkbd = kbd_get_keyboard(*(int *)data);
if (newkbd == NULL) {
splx(s);
return EINVAL;
}
error = 0;
if (sc->kbd != newkbd) {
i = kbd_allocate(newkbd->kb_name, newkbd->kb_unit,
(void *)&sc->keyboard, sckbdevent, sc);
/* i == newkbd->kb_index */
if (i >= 0) {
if (sc->kbd != NULL) {
save_kbd_state(sc->cur_scp);
kbd_release(sc->kbd, (void *)&sc->keyboard);
}
sc->kbd = kbd_get_keyboard(i); /* sc->kbd == newkbd */
sc->keyboard = i;
(void)kbdd_ioctl(sc->kbd, KDSKBMODE,
(caddr_t)&sc->cur_scp->kbd_mode);
update_kbd_state(sc->cur_scp, sc->cur_scp->status,
LOCK_MASK);
} else {
error = EPERM; /* XXX */
}
}
splx(s);
return error;
}
case CONS_RELKBD: /* release the current keyboard */
s = spltty();
error = 0;
if (sc->kbd != NULL) {
save_kbd_state(sc->cur_scp);
error = kbd_release(sc->kbd, (void *)&sc->keyboard);
if (error == 0) {
sc->kbd = NULL;
sc->keyboard = -1;
}
}
splx(s);
return error;
case CONS_GETTERM: /* get the current terminal emulator info */
{
sc_term_sw_t *sw;
if (((term_info_t *)data)->ti_index == 0) {
sw = scp->tsw;
} else {
sw = sc_term_match_by_number(((term_info_t *)data)->ti_index);
}
if (sw != NULL) {
strncpy(((term_info_t *)data)->ti_name, sw->te_name,
sizeof(((term_info_t *)data)->ti_name));
strncpy(((term_info_t *)data)->ti_desc, sw->te_desc,
sizeof(((term_info_t *)data)->ti_desc));
((term_info_t *)data)->ti_flags = 0;
return 0;
} else {
((term_info_t *)data)->ti_name[0] = '\0';
((term_info_t *)data)->ti_desc[0] = '\0';
((term_info_t *)data)->ti_flags = 0;
return EINVAL;
}
}
case CONS_SETTERM: /* set the current terminal emulator */
s = spltty();
error = sc_init_emulator(scp, ((term_info_t *)data)->ti_name);
/* FIXME: what if scp == sc_console! XXX */
splx(s);
return error;
case GIO_SCRNMAP: /* get output translation table */
bcopy(&sc->scr_map, data, sizeof(sc->scr_map));
return 0;
case PIO_SCRNMAP: /* set output translation table */
bcopy(data, &sc->scr_map, sizeof(sc->scr_map));
for (i=0; i<sizeof(sc->scr_map); i++) {
sc->scr_rmap[sc->scr_map[i]] = i;
}
return 0;
case GIO_KEYMAP: /* get keyboard translation table */
case PIO_KEYMAP: /* set keyboard translation table */
case OGIO_KEYMAP: /* get keyboard translation table (compat) */
case OPIO_KEYMAP: /* set keyboard translation table (compat) */
case GIO_DEADKEYMAP: /* get accent key translation table */
case PIO_DEADKEYMAP: /* set accent key translation table */
case GETFKEY: /* get function key string */
case SETFKEY: /* set function key string */
error = kbdd_ioctl(sc->kbd, cmd, data);
if (error == ENOIOCTL)
error = ENODEV;
return error;
#ifndef SC_NO_FONT_LOADING
case PIO_FONT8x8: /* set 8x8 dot font */
if (!ISFONTAVAIL(sc->adp->va_flags))
return ENXIO;
bcopy(data, sc->font_8, 8*256);
sc->fonts_loaded |= FONT_8;
/*
* FONT KLUDGE
* Always use the font page #0. XXX
* Don't load if the current font size is not 8x8.
*/
if (ISTEXTSC(sc->cur_scp) && (sc->cur_scp->font_size < 14))
sc_load_font(sc->cur_scp, 0, 8, 8, sc->font_8, 0, 256);
return 0;
case GIO_FONT8x8: /* get 8x8 dot font */
if (!ISFONTAVAIL(sc->adp->va_flags))
return ENXIO;
if (sc->fonts_loaded & FONT_8) {
bcopy(sc->font_8, data, 8*256);
return 0;
}
else
return ENXIO;
case PIO_FONT8x14: /* set 8x14 dot font */
if (!ISFONTAVAIL(sc->adp->va_flags))
return ENXIO;
bcopy(data, sc->font_14, 14*256);
sc->fonts_loaded |= FONT_14;
/*
* FONT KLUDGE
* Always use the font page #0. XXX
* Don't load if the current font size is not 8x14.
*/
if (ISTEXTSC(sc->cur_scp)
&& (sc->cur_scp->font_size >= 14)
&& (sc->cur_scp->font_size < 16))
sc_load_font(sc->cur_scp, 0, 14, 8, sc->font_14, 0, 256);
return 0;
case GIO_FONT8x14: /* get 8x14 dot font */
if (!ISFONTAVAIL(sc->adp->va_flags))
return ENXIO;
if (sc->fonts_loaded & FONT_14) {
bcopy(sc->font_14, data, 14*256);
return 0;
}
else
return ENXIO;
case PIO_FONT8x16: /* set 8x16 dot font */
if (!ISFONTAVAIL(sc->adp->va_flags))
return ENXIO;
bcopy(data, sc->font_16, 16*256);
sc->fonts_loaded |= FONT_16;
/*
* FONT KLUDGE
* Always use the font page #0. XXX
* Don't load if the current font size is not 8x16.
*/
if (ISTEXTSC(sc->cur_scp) && (sc->cur_scp->font_size >= 16))
sc_load_font(sc->cur_scp, 0, 16, 8, sc->font_16, 0, 256);
return 0;
case GIO_FONT8x16: /* get 8x16 dot font */
if (!ISFONTAVAIL(sc->adp->va_flags))
return ENXIO;
if (sc->fonts_loaded & FONT_16) {
bcopy(sc->font_16, data, 16*256);
return 0;
}
else
return ENXIO;
#endif /* SC_NO_FONT_LOADING */
default:
break;
}
return (ENOIOCTL);
}
static int
consolectl_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag,
struct thread *td)
{
return sctty_ioctl(dev->si_drv1, cmd, data, td);
}
static void
sc_cnprobe(struct consdev *cp)
{
int unit;
int flags;
cp->cn_pri = sc_get_cons_priority(&unit, &flags);
/* a video card is always required */
if (!scvidprobe(unit, flags, TRUE))
cp->cn_pri = CN_DEAD;
/* syscons will become console even when there is no keyboard */
sckbdprobe(unit, flags, TRUE);
if (cp->cn_pri == CN_DEAD)
return;
/* initialize required fields */
strcpy(cp->cn_name, "ttyv0");
}
static void
sc_cninit(struct consdev *cp)
{
int unit;
int flags;
sc_get_cons_priority(&unit, &flags);
scinit(unit, flags | SC_KERNEL_CONSOLE);
sc_console_unit = unit;
sc_console = sc_get_stat(sc_get_softc(unit, SC_KERNEL_CONSOLE)->dev[0]);
sc_consptr = cp;
}
static void
sc_cnterm(struct consdev *cp)
{
/* we are not the kernel console any more, release everything */
if (sc_console_unit < 0)
return; /* shouldn't happen */
#if 0 /* XXX */
sc_clear_screen(sc_console);
sccnupdate(sc_console);
#endif
scterm(sc_console_unit, SC_KERNEL_CONSOLE);
sc_console_unit = -1;
sc_console = NULL;
}
static void
sc_cnputc(struct consdev *cd, int c)
{
u_char buf[1];
scr_stat *scp = sc_console;
#ifndef SC_NO_HISTORY
#if 0
struct tty *tp;
#endif
#endif /* !SC_NO_HISTORY */
int s;
/* assert(sc_console != NULL) */
#ifndef SC_NO_HISTORY
if (scp == scp->sc->cur_scp && scp->status & SLKED) {
scp->status &= ~SLKED;
update_kbd_state(scp, scp->status, SLKED);
if (scp->status & BUFFER_SAVED) {
if (!sc_hist_restore(scp))
sc_remove_cutmarking(scp);
scp->status &= ~BUFFER_SAVED;
scp->status |= CURSOR_ENABLED;
sc_draw_cursor_image(scp);
}
#if 0
/*
* XXX: Now that TTY's have their own locks, we cannot process
* any data after disabling scroll lock. cnputs already holds a
* spinlock.
*/
tp = SC_DEV(scp->sc, scp->index);
tty_lock(tp);
if (tty_opened(tp))
sctty_outwakeup(tp);
tty_unlock(tp);
#endif
}
#endif /* !SC_NO_HISTORY */
buf[0] = c;
sc_puts(scp, buf, 1, 1);
s = spltty(); /* block sckbdevent and scrn_timer */
sccnupdate(scp);
splx(s);
}
static int
sc_cngetc(struct consdev *cd)
{
static struct fkeytab fkey;
static int fkeycp;
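/*
 * The static fkey/fkeycp pair buffers a multi-byte function key string
 * across successive sc_cngetc() calls: the first byte is returned right
 * away and the remainder is drained one byte per call before the
 * keyboard is polled again.
 */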
scr_stat *scp;
const u_char *p;
int cur_mode;
int s = spltty(); /* block sckbdevent and scrn_timer while we poll */
int c;
/* assert(sc_console != NULL) */
/*
* Stop the screen saver and update the screen if necessary.
* What if we have been running in the screen saver code... XXX
*/
sc_touch_scrn_saver();
scp = sc_console->sc->cur_scp; /* XXX */
sccnupdate(scp);
if (fkeycp < fkey.len) {
splx(s);
return fkey.str[fkeycp++];
}
if (scp->sc->kbd == NULL) {
splx(s);
return -1;
}
/*
* Make sure the keyboard is accessible even when the kbd device
* driver is disabled.
*/
kbdd_enable(scp->sc->kbd);
/* we shall always use the keyboard in the XLATE mode here */
cur_mode = scp->kbd_mode;
scp->kbd_mode = K_XLATE;
(void)kbdd_ioctl(scp->sc->kbd, KDSKBMODE, (caddr_t)&scp->kbd_mode);
kbdd_poll(scp->sc->kbd, TRUE);
c = scgetc(scp->sc, SCGETC_CN | SCGETC_NONBLOCK);
kbdd_poll(scp->sc->kbd, FALSE);
scp->kbd_mode = cur_mode;
(void)kbdd_ioctl(scp->sc->kbd, KDSKBMODE, (caddr_t)&scp->kbd_mode);
kbdd_disable(scp->sc->kbd);
splx(s);
switch (KEYFLAGS(c)) {
case 0: /* normal char */
return KEYCHAR(c);
case FKEY: /* function key */
p = (*scp->tsw->te_fkeystr)(scp, c);
if (p != NULL) {
fkey.len = strlen(p);
bcopy(p, fkey.str, fkey.len);
fkeycp = 1;
return fkey.str[0];
}
p = kbdd_get_fkeystr(scp->sc->kbd, KEYCHAR(c), (size_t *)&fkeycp);
fkey.len = fkeycp;
if ((p != NULL) && (fkey.len > 0)) {
bcopy(p, fkey.str, fkey.len);
fkeycp = 1;
return fkey.str[0];
}
return c; /* XXX */
case NOKEY:
case ERRKEY:
default:
return -1;
}
/* NOT REACHED */
}
static void
sccnupdate(scr_stat *scp)
{
/* this is a cut-down version of scrn_timer()... */
if (suspend_in_progress || scp->sc->font_loading_in_progress)
return;
if (debugger > 0 || panicstr || shutdown_in_progress) {
sc_touch_scrn_saver();
} else if (scp != scp->sc->cur_scp) {
return;
}
if (!run_scrn_saver)
scp->sc->flags &= ~SC_SCRN_IDLE;
#ifdef DEV_SPLASH
if ((saver_mode != CONS_LKM_SAVER) || !(scp->sc->flags & SC_SCRN_IDLE))
if (scp->sc->flags & SC_SCRN_BLANKED)
stop_scrn_saver(scp->sc, current_saver);
#endif
if (scp != scp->sc->cur_scp || scp->sc->blink_in_progress
|| scp->sc->switch_in_progress)
return;
/*
* FIXME: unlike scrn_timer(), we call scrn_update() from here even
* when write_in_progress is non-zero. XXX
*/
if (!ISGRAPHSC(scp) && !(scp->sc->flags & SC_SCRN_BLANKED))
scrn_update(scp, TRUE);
}
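/*
 * scrn_timer() is the periodic housekeeping callout: it re-arms itself
 * at hz/25 (hz/10 while the console is busy), optionally auto-attaches
 * a keyboard, maintains the screen saver idle state and flushes the
 * current vty via scrn_update().
 */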
static void
scrn_timer(void *arg)
{
#ifndef PC98
static int kbd_interval = 0;
#endif
struct timeval tv;
sc_softc_t *sc;
scr_stat *scp;
int again;
int s;
again = (arg != NULL);
if (arg != NULL)
sc = (sc_softc_t *)arg;
else if (sc_console != NULL)
sc = sc_console->sc;
else
return;
/* don't do anything when we are performing some I/O operations */
if (suspend_in_progress || sc->font_loading_in_progress) {
if (again)
timeout(scrn_timer, sc, hz / 10);
return;
}
s = spltty();
#ifndef PC98
if ((sc->kbd == NULL) && (sc->config & SC_AUTODETECT_KBD)) {
/* try to allocate a keyboard automatically */
if (++kbd_interval >= 25) {
sc->keyboard = sc_allocate_keyboard(sc, -1);
if (sc->keyboard >= 0) {
sc->kbd = kbd_get_keyboard(sc->keyboard);
(void)kbdd_ioctl(sc->kbd, KDSKBMODE,
(caddr_t)&sc->cur_scp->kbd_mode);
update_kbd_state(sc->cur_scp, sc->cur_scp->status,
LOCK_MASK);
}
kbd_interval = 0;
}
}
#endif /* PC98 */
/* find the vty to update */
scp = sc->cur_scp;
/* should we stop the screen saver? */
getmicrouptime(&tv);
if (debugger > 0 || panicstr || shutdown_in_progress)
sc_touch_scrn_saver();
if (run_scrn_saver) {
if (tv.tv_sec > sc->scrn_time_stamp + scrn_blank_time)
sc->flags |= SC_SCRN_IDLE;
else
sc->flags &= ~SC_SCRN_IDLE;
} else {
sc->scrn_time_stamp = tv.tv_sec;
sc->flags &= ~SC_SCRN_IDLE;
if (scrn_blank_time > 0)
run_scrn_saver = TRUE;
}
#ifdef DEV_SPLASH
if ((saver_mode != CONS_LKM_SAVER) || !(sc->flags & SC_SCRN_IDLE))
if (sc->flags & SC_SCRN_BLANKED)
stop_scrn_saver(sc, current_saver);
#endif
/* should we just return ? */
if (sc->blink_in_progress || sc->switch_in_progress
|| sc->write_in_progress) {
if (again)
timeout(scrn_timer, sc, hz / 10);
splx(s);
return;
}
/* Update the screen */
scp = sc->cur_scp; /* cur_scp may have changed... */
if (!ISGRAPHSC(scp) && !(sc->flags & SC_SCRN_BLANKED))
scrn_update(scp, TRUE);
#ifdef DEV_SPLASH
/* should we activate the screen saver? */
if ((saver_mode == CONS_LKM_SAVER) && (sc->flags & SC_SCRN_IDLE))
if (!ISGRAPHSC(scp) || (sc->flags & SC_SCRN_BLANKED))
(*current_saver)(sc, TRUE);
#endif
if (again)
timeout(scrn_timer, sc, hz / 25);
splx(s);
}
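/*
 * and_region() intersects the region [*s1, *e1] with [s2, e2] in place:
 * it returns FALSE when the two do not overlap, otherwise it clips
 * *s1/*e1 to the overlap and returns TRUE.  For example, intersecting
 * [10, 80] with [50, 200] leaves [50, 80].
 */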
static int
and_region(int *s1, int *e1, int s2, int e2)
{
if (*e1 < s2 || e2 < *s1)
return FALSE;
*s1 = imax(*s1, s2);
*e1 = imin(*e1, e2);
return TRUE;
}
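/*
 * scrn_update() flushes the dirty region [scp->start, scp->end] of the
 * current vty to the display, redrawing the cut marking, the cursor and
 * the pseudo mouse pointer as needed.  On return the dirty region is
 * reset to the empty state (start > end), so later output only repaints
 * what has changed since.
 */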
static void
scrn_update(scr_stat *scp, int show_cursor)
{
int start;
int end;
int s;
int e;
/* assert(scp == scp->sc->cur_scp) */
SC_VIDEO_LOCK(scp->sc);
#ifndef SC_NO_CUTPASTE
/* remove the previous mouse pointer image if necessary */
if (scp->status & MOUSE_VISIBLE) {
s = scp->mouse_pos;
e = scp->mouse_pos + scp->xsize + 1;
if ((scp->status & (MOUSE_MOVED | MOUSE_HIDDEN))
|| and_region(&s, &e, scp->start, scp->end)
|| ((scp->status & CURSOR_ENABLED) &&
(scp->cursor_pos != scp->cursor_oldpos) &&
(and_region(&s, &e, scp->cursor_pos, scp->cursor_pos)
|| and_region(&s, &e, scp->cursor_oldpos, scp->cursor_oldpos)))) {
sc_remove_mouse_image(scp);
if (scp->end >= scp->xsize*scp->ysize)
scp->end = scp->xsize*scp->ysize - 1;
}
}
#endif /* !SC_NO_CUTPASTE */
#if 1
/* debug: XXX */
if (scp->end >= scp->xsize*scp->ysize) {
printf("scrn_update(): scp->end %d > size_of_screen!!\n", scp->end);
scp->end = scp->xsize*scp->ysize - 1;
}
if (scp->start < 0) {
printf("scrn_update(): scp->start %d < 0\n", scp->start);
scp->start = 0;
}
#endif
/* update screen image */
if (scp->start <= scp->end) {
if (scp->mouse_cut_end >= 0) {
/* there is a marked region for cut & paste */
if (scp->mouse_cut_start <= scp->mouse_cut_end) {
start = scp->mouse_cut_start;
end = scp->mouse_cut_end;
} else {
start = scp->mouse_cut_end;
end = scp->mouse_cut_start - 1;
}
s = start;
e = end;
/* does the cut-mark region overlap with the update region? */
if (and_region(&s, &e, scp->start, scp->end)) {
(*scp->rndr->draw)(scp, s, e - s + 1, TRUE);
s = 0;
e = start - 1;
if (and_region(&s, &e, scp->start, scp->end))
(*scp->rndr->draw)(scp, s, e - s + 1, FALSE);
s = end + 1;
e = scp->xsize*scp->ysize - 1;
if (and_region(&s, &e, scp->start, scp->end))
(*scp->rndr->draw)(scp, s, e - s + 1, FALSE);
} else {
(*scp->rndr->draw)(scp, scp->start,
scp->end - scp->start + 1, FALSE);
}
} else {
(*scp->rndr->draw)(scp, scp->start,
scp->end - scp->start + 1, FALSE);
}
}
/* we are not to show the cursor and the mouse pointer... */
if (!show_cursor) {
scp->end = 0;
scp->start = scp->xsize*scp->ysize - 1;
SC_VIDEO_UNLOCK(scp->sc);
return;
}
/* update cursor image */
if (scp->status & CURSOR_ENABLED) {
s = scp->start;
e = scp->end;
/* did cursor move since last time ? */
if (scp->cursor_pos != scp->cursor_oldpos) {
/* do we need to remove old cursor image ? */
if (!and_region(&s, &e, scp->cursor_oldpos, scp->cursor_oldpos))
sc_remove_cursor_image(scp);
sc_draw_cursor_image(scp);
} else {
if (and_region(&s, &e, scp->cursor_pos, scp->cursor_pos))
/* cursor didn't move, but has been overwritten */
sc_draw_cursor_image(scp);
else if (scp->curs_attr.flags & CONS_BLINK_CURSOR)
/* if it's a blinking cursor, update it */
(*scp->rndr->blink_cursor)(scp, scp->cursor_pos,
sc_inside_cutmark(scp,
scp->cursor_pos));
}
}
#ifndef SC_NO_CUTPASTE
/* update "pseudo" mouse pointer image */
if (scp->sc->flags & SC_MOUSE_ENABLED) {
if (!(scp->status & (MOUSE_VISIBLE | MOUSE_HIDDEN))) {
scp->status &= ~MOUSE_MOVED;
sc_draw_mouse_image(scp);
}
}
#endif /* SC_NO_CUTPASTE */
scp->end = 0;
scp->start = scp->xsize*scp->ysize - 1;
SC_VIDEO_UNLOCK(scp->sc);
}
#ifdef DEV_SPLASH
static int
scsplash_callback(int event, void *arg)
{
sc_softc_t *sc;
int error;
sc = (sc_softc_t *)arg;
switch (event) {
case SPLASH_INIT:
if (add_scrn_saver(scsplash_saver) == 0) {
sc->flags &= ~SC_SAVER_FAILED;
run_scrn_saver = TRUE;
if (cold && !(boothowto & RB_VERBOSE)) {
scsplash_stick(TRUE);
(*current_saver)(sc, TRUE);
}
}
return 0;
case SPLASH_TERM:
if (current_saver == scsplash_saver) {
scsplash_stick(FALSE);
error = remove_scrn_saver(scsplash_saver);
if (error)
return error;
}
return 0;
default:
return EINVAL;
}
}
static void
scsplash_saver(sc_softc_t *sc, int show)
{
static int busy = FALSE;
scr_stat *scp;
if (busy)
return;
busy = TRUE;
scp = sc->cur_scp;
if (show) {
if (!(sc->flags & SC_SAVER_FAILED)) {
if (!(sc->flags & SC_SCRN_BLANKED))
set_scrn_saver_mode(scp, -1, NULL, 0);
switch (splash(sc->adp, TRUE)) {
case 0: /* succeeded */
break;
case EAGAIN: /* try later */
restore_scrn_saver_mode(scp, FALSE);
sc_touch_scrn_saver(); /* XXX */
break;
default:
sc->flags |= SC_SAVER_FAILED;
scsplash_stick(FALSE);
restore_scrn_saver_mode(scp, TRUE);
printf("scsplash_saver(): failed to put up the image\n");
break;
}
}
} else if (!sticky_splash) {
if ((sc->flags & SC_SCRN_BLANKED) && (splash(sc->adp, FALSE) == 0))
restore_scrn_saver_mode(scp, TRUE);
}
busy = FALSE;
}
static int
add_scrn_saver(void (*this_saver)(sc_softc_t *, int))
{
#if 0
int error;
if (current_saver != none_saver) {
error = remove_scrn_saver(current_saver);
if (error)
return error;
}
#endif
if (current_saver != none_saver)
return EBUSY;
run_scrn_saver = FALSE;
saver_mode = CONS_LKM_SAVER;
current_saver = this_saver;
return 0;
}
static int
remove_scrn_saver(void (*this_saver)(sc_softc_t *, int))
{
if (current_saver != this_saver)
return EINVAL;
#if 0
/*
* In order to prevent `current_saver' from being called by
* the timeout routine `scrn_timer()' while we manipulate
* the saver list, we shall set `current_saver' to `none_saver'
* before stopping the current saver, rather than blocking by `splXX()'.
*/
current_saver = none_saver;
if (scrn_blanked)
stop_scrn_saver(this_saver);
#endif
/* unblank all blanked screens */
wait_scrn_saver_stop(NULL);
if (scrn_blanked)
return EBUSY;
current_saver = none_saver;
return 0;
}
static int
set_scrn_saver_mode(scr_stat *scp, int mode, u_char *pal, int border)
{
int s;
/* assert(scp == scp->sc->cur_scp) */
s = spltty();
if (!ISGRAPHSC(scp))
sc_remove_cursor_image(scp);
scp->splash_save_mode = scp->mode;
scp->splash_save_status = scp->status & (GRAPHICS_MODE | PIXEL_MODE);
scp->status &= ~(GRAPHICS_MODE | PIXEL_MODE);
scp->status |= (UNKNOWN_MODE | SAVER_RUNNING);
scp->sc->flags |= SC_SCRN_BLANKED;
++scrn_blanked;
splx(s);
if (mode < 0)
return 0;
scp->mode = mode;
if (set_mode(scp) == 0) {
if (scp->sc->adp->va_info.vi_flags & V_INFO_GRAPHICS)
scp->status |= GRAPHICS_MODE;
#ifndef SC_NO_PALETTE_LOADING
if (pal != NULL)
vidd_load_palette(scp->sc->adp, pal);
#endif
sc_set_border(scp, border);
return 0;
} else {
s = spltty();
scp->mode = scp->splash_save_mode;
scp->status &= ~(UNKNOWN_MODE | SAVER_RUNNING);
scp->status |= scp->splash_save_status;
splx(s);
return 1;
}
}
static int
restore_scrn_saver_mode(scr_stat *scp, int changemode)
{
int mode;
int status;
int s;
/* assert(scp == scp->sc->cur_scp) */
s = spltty();
mode = scp->mode;
status = scp->status;
scp->mode = scp->splash_save_mode;
scp->status &= ~(UNKNOWN_MODE | SAVER_RUNNING);
scp->status |= scp->splash_save_status;
scp->sc->flags &= ~SC_SCRN_BLANKED;
if (!changemode) {
if (!ISGRAPHSC(scp))
sc_draw_cursor_image(scp);
--scrn_blanked;
splx(s);
return 0;
}
if (set_mode(scp) == 0) {
#ifndef SC_NO_PALETTE_LOADING
#ifdef SC_PIXEL_MODE
if (scp->sc->adp->va_info.vi_mem_model == V_INFO_MM_DIRECT)
vidd_load_palette(scp->sc->adp, scp->sc->palette2);
else
#endif
vidd_load_palette(scp->sc->adp, scp->sc->palette);
#endif
--scrn_blanked;
splx(s);
return 0;
} else {
scp->mode = mode;
scp->status = status;
splx(s);
return 1;
}
}
static void
stop_scrn_saver(sc_softc_t *sc, void (*saver)(sc_softc_t *, int))
{
(*saver)(sc, FALSE);
run_scrn_saver = FALSE;
/* the screen saver may have chosen not to stop after all... */
if (sc->flags & SC_SCRN_BLANKED)
return;
mark_all(sc->cur_scp);
if (sc->delayed_next_scr)
sc_switch_scr(sc, sc->delayed_next_scr - 1);
if (debugger == 0)
wakeup(&scrn_blanked);
}
static int
wait_scrn_saver_stop(sc_softc_t *sc)
{
int error = 0;
while (scrn_blanked > 0) {
run_scrn_saver = FALSE;
if (sc && !(sc->flags & SC_SCRN_BLANKED)) {
error = 0;
break;
}
error = tsleep(&scrn_blanked, PZERO | PCATCH, "scrsav", 0);
if ((error != 0) && (error != ERESTART))
break;
}
run_scrn_saver = FALSE;
return error;
}
#endif /* DEV_SPLASH */
void
sc_touch_scrn_saver(void)
{
scsplash_stick(FALSE);
run_scrn_saver = FALSE;
}
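/*
 * vty switching overview: a vty in VT_PROCESS mode is switched through a
 * two-step handshake with its controlling process.  signal_vt_rel()
 * delivers the release signal and sets SWITCH_WAIT_REL; once the process
 * acknowledges (typically via the VT_RELDISP ioctl), finish_vt_rel() and
 * do_switch_scr() perform the actual exchange_scr().  signal_vt_acq()
 * then delivers the acquire signal and sets SWITCH_WAIT_ACQ, which
 * finish_vt_acq() clears when the new owner acknowledges.  Vtys in
 * VT_AUTO mode are switched directly.
 */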
int
sc_switch_scr(sc_softc_t *sc, u_int next_scr)
{
scr_stat *cur_scp;
struct tty *tp;
struct proc *p;
int s;
DPRINTF(5, ("sc0: sc_switch_scr() %d ", next_scr + 1));
if (sc->cur_scp == NULL)
return (0);
/* prevent switch if previously requested */
if (sc->flags & SC_SCRN_VTYLOCK) {
sc_bell(sc->cur_scp, sc->cur_scp->bell_pitch,
sc->cur_scp->bell_duration);
return EPERM;
}
/* delay switch if the screen is blanked or being updated */
if ((sc->flags & SC_SCRN_BLANKED) || sc->write_in_progress
|| sc->blink_in_progress) {
sc->delayed_next_scr = next_scr + 1;
sc_touch_scrn_saver();
DPRINTF(5, ("switch delayed\n"));
return 0;
}
sc->delayed_next_scr = 0;
s = spltty();
cur_scp = sc->cur_scp;
/* we are in the middle of the vty switching process... */
if (sc->switch_in_progress
&& (cur_scp->smode.mode == VT_PROCESS)
&& cur_scp->proc) {
p = pfind(cur_scp->pid);
if (cur_scp->proc != p) {
if (p)
PROC_UNLOCK(p);
/*
* The controlling process has died.  Do some cleanup.
* NOTE:`cur_scp->proc' and `cur_scp->smode.mode'
* are not reset here yet; they will be cleared later.
*/
DPRINTF(5, ("cur_scp controlling process %d died, ",
cur_scp->pid));
if (cur_scp->status & SWITCH_WAIT_REL) {
/*
* Force the previous switch to finish, but return now
* with error.
*/
DPRINTF(5, ("reset WAIT_REL, "));
finish_vt_rel(cur_scp, TRUE, &s);
splx(s);
DPRINTF(5, ("finishing previous switch\n"));
return EINVAL;
} else if (cur_scp->status & SWITCH_WAIT_ACQ) {
/* let's assume screen switch has been completed. */
DPRINTF(5, ("reset WAIT_ACQ, "));
finish_vt_acq(cur_scp);
} else {
/*
* We are in between screen release and acquisition, and
* reached here via scgetc() or scrn_timer() which has
* interrupted exchange_scr(). Don't do anything stupid.
*/
DPRINTF(5, ("waiting nothing, "));
}
} else {
if (p)
PROC_UNLOCK(p);
/*
* The controlling process is alive, but not responding...
* It is either buggy or it may be just taking time.
* The following code is a gross kludge to cope with this
* problem for which there is no clean solution. XXX
*/
if (cur_scp->status & SWITCH_WAIT_REL) {
switch (sc->switch_in_progress++) {
case 1:
break;
case 2:
DPRINTF(5, ("sending relsig again, "));
signal_vt_rel(cur_scp);
break;
case 3:
break;
case 4:
default:
/*
* Act as if the controlling program returned
* VT_FALSE.
*/
DPRINTF(5, ("force reset WAIT_REL, "));
finish_vt_rel(cur_scp, FALSE, &s);
splx(s);
DPRINTF(5, ("act as if VT_FALSE was seen\n"));
return EINVAL;
}
} else if (cur_scp->status & SWITCH_WAIT_ACQ) {
switch (sc->switch_in_progress++) {
case 1:
break;
case 2:
DPRINTF(5, ("sending acqsig again, "));
signal_vt_acq(cur_scp);
break;
case 3:
break;
case 4:
default:
/* clear the flag and finish the previous switch */
DPRINTF(5, ("force reset WAIT_ACQ, "));
finish_vt_acq(cur_scp);
break;
}
}
}
}
/*
* Return error if an invalid argument is given, or vty switch
* is still in progress.
*/
if ((next_scr < sc->first_vty) || (next_scr >= sc->first_vty + sc->vtys)
|| sc->switch_in_progress) {
splx(s);
sc_bell(cur_scp, bios_value.bell_pitch, BELL_DURATION);
DPRINTF(5, ("error 1\n"));
return EINVAL;
}
/*
* Don't allow switching away from the graphics mode vty
* if the switch mode is VT_AUTO, unless the next vty is the same
* as the current one, or the current vty has been closed (but is still showing).
*/
tp = SC_DEV(sc, cur_scp->index);
if ((cur_scp->index != next_scr)
&& tty_opened(tp)
&& (cur_scp->smode.mode == VT_AUTO)
&& ISGRAPHSC(cur_scp)) {
splx(s);
sc_bell(cur_scp, bios_value.bell_pitch, BELL_DURATION);
DPRINTF(5, ("error, graphics mode\n"));
return EINVAL;
}
/*
* Is the wanted vty open? Don't allow switching to a closed vty.
* If we are in DDB, don't switch to a vty in the VT_PROCESS mode.
* Note that we always allow the user to switch to the kernel
* console even if it is closed.
*/
if ((sc_console == NULL) || (next_scr != sc_console->index)) {
tp = SC_DEV(sc, next_scr);
if (!tty_opened(tp)) {
splx(s);
sc_bell(cur_scp, bios_value.bell_pitch, BELL_DURATION);
DPRINTF(5, ("error 2, requested vty isn't open!\n"));
return EINVAL;
}
if ((debugger > 0) && (SC_STAT(tp)->smode.mode == VT_PROCESS)) {
splx(s);
DPRINTF(5, ("error 3, requested vty is in the VT_PROCESS mode\n"));
return EINVAL;
}
}
/* this is the start of vty switching process... */
++sc->switch_in_progress;
sc->old_scp = cur_scp;
sc->new_scp = sc_get_stat(SC_DEV(sc, next_scr));
if (sc->new_scp == sc->old_scp) {
sc->switch_in_progress = 0;
/*
* XXX wakeup() locks the scheduler lock which will hang if
* the lock is in an in-between state, e.g., when we stop at
* a breakpoint at fork_exit. It has always been wrong to call
* wakeup() when the debugger is active. In RELENG_4, wakeup()
* is supposed to be locked by splhigh(), but the debugger may
* be invoked at splhigh().
*/
if (debugger == 0)
wakeup(VTY_WCHAN(sc,next_scr));
splx(s);
DPRINTF(5, ("switch done (new == old)\n"));
return 0;
}
/* has controlling process died? */
vt_proc_alive(sc->old_scp);
vt_proc_alive(sc->new_scp);
/* wait for the controlling process to release the screen, if necessary */
if (signal_vt_rel(sc->old_scp)) {
splx(s);
return 0;
}
/* go set up the new vty screen */
splx(s);
exchange_scr(sc);
s = spltty();
/* wake up processes waiting for this vty */
if (debugger == 0)
wakeup(VTY_WCHAN(sc,next_scr));
/* wait for the controlling process to acknowledge, if necessary */
if (signal_vt_acq(sc->cur_scp)) {
splx(s);
return 0;
}
sc->switch_in_progress = 0;
if (sc->unit == sc_console_unit)
cnavailable(sc_consptr, TRUE);
splx(s);
DPRINTF(5, ("switch done\n"));
return 0;
}
static int
do_switch_scr(sc_softc_t *sc, int s)
{
vt_proc_alive(sc->new_scp);
splx(s);
exchange_scr(sc);
s = spltty();
/* sc->cur_scp == sc->new_scp */
wakeup(VTY_WCHAN(sc,sc->cur_scp->index));
/* wait for the controlling process to acknowledge, if necessary */
if (!signal_vt_acq(sc->cur_scp)) {
sc->switch_in_progress = 0;
if (sc->unit == sc_console_unit)
cnavailable(sc_consptr, TRUE);
}
return s;
}
static int
vt_proc_alive(scr_stat *scp)
{
struct proc *p;
if (scp->proc) {
if ((p = pfind(scp->pid)) != NULL)
PROC_UNLOCK(p);
if (scp->proc == p)
return TRUE;
scp->proc = NULL;
scp->smode.mode = VT_AUTO;
DPRINTF(5, ("vt controlling process %d died\n", scp->pid));
}
return FALSE;
}
static int
signal_vt_rel(scr_stat *scp)
{
if (scp->smode.mode != VT_PROCESS)
return FALSE;
scp->status |= SWITCH_WAIT_REL;
PROC_LOCK(scp->proc);
- psignal(scp->proc, scp->smode.relsig);
+ kern_psignal(scp->proc, scp->smode.relsig);
PROC_UNLOCK(scp->proc);
DPRINTF(5, ("sending relsig to %d\n", scp->pid));
return TRUE;
}
static int
signal_vt_acq(scr_stat *scp)
{
if (scp->smode.mode != VT_PROCESS)
return FALSE;
if (scp->sc->unit == sc_console_unit)
cnavailable(sc_consptr, FALSE);
scp->status |= SWITCH_WAIT_ACQ;
PROC_LOCK(scp->proc);
- psignal(scp->proc, scp->smode.acqsig);
+ kern_psignal(scp->proc, scp->smode.acqsig);
PROC_UNLOCK(scp->proc);
DPRINTF(5, ("sending acqsig to %d\n", scp->pid));
return TRUE;
}
static int
finish_vt_rel(scr_stat *scp, int release, int *s)
{
if (scp == scp->sc->old_scp && scp->status & SWITCH_WAIT_REL) {
scp->status &= ~SWITCH_WAIT_REL;
if (release)
*s = do_switch_scr(scp->sc, *s);
else
scp->sc->switch_in_progress = 0;
return 0;
}
return EINVAL;
}
static int
finish_vt_acq(scr_stat *scp)
{
if (scp == scp->sc->new_scp && scp->status & SWITCH_WAIT_ACQ) {
scp->status &= ~SWITCH_WAIT_ACQ;
scp->sc->switch_in_progress = 0;
return 0;
}
return EINVAL;
}
static void
exchange_scr(sc_softc_t *sc)
{
scr_stat *scp;
/* save the current state of video and keyboard */
sc_move_cursor(sc->old_scp, sc->old_scp->xpos, sc->old_scp->ypos);
if (!ISGRAPHSC(sc->old_scp))
sc_remove_cursor_image(sc->old_scp);
if (sc->old_scp->kbd_mode == K_XLATE)
save_kbd_state(sc->old_scp);
/* set up the video for the new screen */
scp = sc->cur_scp = sc->new_scp;
#ifdef PC98
if (sc->old_scp->mode != scp->mode || ISUNKNOWNSC(sc->old_scp) || ISUNKNOWNSC(sc->new_scp))
#else
if (sc->old_scp->mode != scp->mode || ISUNKNOWNSC(sc->old_scp))
#endif
set_mode(scp);
#ifndef __sparc64__
else
sc_vtb_init(&scp->scr, VTB_FRAMEBUFFER, scp->xsize, scp->ysize,
(void *)sc->adp->va_window, FALSE);
#endif
scp->status |= MOUSE_HIDDEN;
sc_move_cursor(scp, scp->xpos, scp->ypos);
if (!ISGRAPHSC(scp))
sc_set_cursor_image(scp);
#ifndef SC_NO_PALETTE_LOADING
if (ISGRAPHSC(sc->old_scp)) {
#ifdef SC_PIXEL_MODE
if (sc->adp->va_info.vi_mem_model == V_INFO_MM_DIRECT)
vidd_load_palette(sc->adp, sc->palette2);
else
#endif
vidd_load_palette(sc->adp, sc->palette);
}
#endif
sc_set_border(scp, scp->border);
/* set up the keyboard for the new screen */
if (sc->old_scp->kbd_mode != scp->kbd_mode)
(void)kbdd_ioctl(sc->kbd, KDSKBMODE, (caddr_t)&scp->kbd_mode);
update_kbd_state(scp, scp->status, LOCK_MASK);
mark_all(scp);
}
void
sc_puts(scr_stat *scp, u_char *buf, int len, int kernel)
{
int need_unlock = 0;
#ifdef DEV_SPLASH
/* make screensaver happy */
if (!sticky_splash && scp == scp->sc->cur_scp && !sc_saver_keyb_only)
run_scrn_saver = FALSE;
#endif
if (scp->tsw) {
if (!kdb_active && !mtx_owned(&scp->scr_lock)) {
need_unlock = 1;
mtx_lock_spin(&scp->scr_lock);
}
(*scp->tsw->te_puts)(scp, buf, len, kernel);
if (need_unlock)
mtx_unlock_spin(&scp->scr_lock);
}
if (scp->sc->delayed_next_scr)
sc_switch_scr(scp->sc, scp->sc->delayed_next_scr - 1);
}
void
sc_draw_cursor_image(scr_stat *scp)
{
/* assert(scp == scp->sc->cur_scp); */
SC_VIDEO_LOCK(scp->sc);
(*scp->rndr->draw_cursor)(scp, scp->cursor_pos,
scp->curs_attr.flags & CONS_BLINK_CURSOR, TRUE,
sc_inside_cutmark(scp, scp->cursor_pos));
scp->cursor_oldpos = scp->cursor_pos;
SC_VIDEO_UNLOCK(scp->sc);
}
void
sc_remove_cursor_image(scr_stat *scp)
{
/* assert(scp == scp->sc->cur_scp); */
SC_VIDEO_LOCK(scp->sc);
(*scp->rndr->draw_cursor)(scp, scp->cursor_oldpos,
scp->curs_attr.flags & CONS_BLINK_CURSOR, FALSE,
sc_inside_cutmark(scp, scp->cursor_oldpos));
SC_VIDEO_UNLOCK(scp->sc);
}
static void
update_cursor_image(scr_stat *scp)
{
/* assert(scp == scp->sc->cur_scp); */
sc_remove_cursor_image(scp);
sc_set_cursor_image(scp);
sc_draw_cursor_image(scp);
}
void
sc_set_cursor_image(scr_stat *scp)
{
scp->curs_attr.flags = scp->curr_curs_attr.flags;
if (scp->curs_attr.flags & CONS_HIDDEN_CURSOR) {
/* hidden cursor is internally represented as zero-height underline */
scp->curs_attr.flags = CONS_CHAR_CURSOR;
scp->curs_attr.base = scp->curs_attr.height = 0;
} else if (scp->curs_attr.flags & CONS_CHAR_CURSOR) {
scp->curs_attr.base = imin(scp->curr_curs_attr.base,
scp->font_size - 1);
scp->curs_attr.height = imin(scp->curr_curs_attr.height,
scp->font_size - scp->curs_attr.base);
} else { /* block cursor */
scp->curs_attr.base = 0;
scp->curs_attr.height = scp->font_size;
}
/* assert(scp == scp->sc->cur_scp); */
SC_VIDEO_LOCK(scp->sc);
(*scp->rndr->set_cursor)(scp, scp->curs_attr.base, scp->curs_attr.height,
scp->curs_attr.flags & CONS_BLINK_CURSOR);
SC_VIDEO_UNLOCK(scp->sc);
}
static void
change_cursor_shape(scr_stat *scp, int flags, int base, int height)
{
if ((scp == scp->sc->cur_scp) && !ISGRAPHSC(scp))
sc_remove_cursor_image(scp);
if (base >= 0)
scp->curr_curs_attr.base = base;
if (height >= 0)
scp->curr_curs_attr.height = height;
if (flags & CONS_RESET_CURSOR)
scp->curr_curs_attr = scp->dflt_curs_attr;
else
scp->curr_curs_attr.flags = flags & CONS_CURSOR_ATTRS;
if ((scp == scp->sc->cur_scp) && !ISGRAPHSC(scp)) {
sc_set_cursor_image(scp);
sc_draw_cursor_image(scp);
}
}
void
sc_change_cursor_shape(scr_stat *scp, int flags, int base, int height)
{
sc_softc_t *sc;
struct tty *tp;
int s;
int i;
s = spltty();
if ((flags != -1) && (flags & CONS_LOCAL_CURSOR)) {
/* local (per vty) change */
change_cursor_shape(scp, flags, base, height);
splx(s);
return;
}
/* global change */
sc = scp->sc;
if (base >= 0)
sc->curs_attr.base = base;
if (height >= 0)
sc->curs_attr.height = height;
if (flags != -1) {
if (flags & CONS_RESET_CURSOR)
sc->curs_attr = sc->dflt_curs_attr;
else
sc->curs_attr.flags = flags & CONS_CURSOR_ATTRS;
}
for (i = sc->first_vty; i < sc->first_vty + sc->vtys; ++i) {
if ((tp = SC_DEV(sc, i)) == NULL)
continue;
if ((scp = sc_get_stat(tp)) == NULL)
continue;
scp->dflt_curs_attr = sc->curs_attr;
change_cursor_shape(scp, CONS_RESET_CURSOR, -1, -1);
}
splx(s);
}
static void
scinit(int unit, int flags)
{
/*
* When syscons is being initialized as the kernel console, malloc()
* is not yet functional, because various kernel structures have not been
* fully initialized yet. Therefore, we need to declare the following
* static buffers for the console. This is less than ideal,
* but is a necessary evil for the time being. XXX
*/
#ifdef PC98
static u_short sc_buffer[ROW*COL*2];/* XXX */
#else
static u_short sc_buffer[ROW*COL]; /* XXX */
#endif
#ifndef SC_NO_FONT_LOADING
static u_char font_8[256*8];
static u_char font_14[256*14];
static u_char font_16[256*16];
#endif
sc_softc_t *sc;
scr_stat *scp;
video_adapter_t *adp;
int col;
int row;
int i;
/* one time initialization */
if (init_done == COLD)
sc_get_bios_values(&bios_value);
init_done = WARM;
/*
* Allocate resources. Even if we are being called for the second
* time, we must allocate them again, because they might have
* disappeared...
*/
sc = sc_get_softc(unit, flags & SC_KERNEL_CONSOLE);
if ((sc->flags & SC_INIT_DONE) == 0)
SC_VIDEO_LOCKINIT(sc);
adp = NULL;
if (sc->adapter >= 0) {
vid_release(sc->adp, (void *)&sc->adapter);
adp = sc->adp;
sc->adp = NULL;
}
if (sc->keyboard >= 0) {
DPRINTF(5, ("sc%d: releasing kbd%d\n", unit, sc->keyboard));
i = kbd_release(sc->kbd, (void *)&sc->keyboard);
DPRINTF(5, ("sc%d: kbd_release returned %d\n", unit, i));
if (sc->kbd != NULL) {
DPRINTF(5, ("sc%d: kbd != NULL!, index:%d, unit:%d, flags:0x%x\n",
unit, sc->kbd->kb_index, sc->kbd->kb_unit, sc->kbd->kb_flags));
}
sc->kbd = NULL;
}
sc->adapter = vid_allocate("*", unit, (void *)&sc->adapter);
sc->adp = vid_get_adapter(sc->adapter);
/* assert((sc->adapter >= 0) && (sc->adp != NULL)) */
sc->keyboard = sc_allocate_keyboard(sc, unit);
DPRINTF(1, ("sc%d: keyboard %d\n", unit, sc->keyboard));
sc->kbd = kbd_get_keyboard(sc->keyboard);
if (sc->kbd != NULL) {
DPRINTF(1, ("sc%d: kbd index:%d, unit:%d, flags:0x%x\n",
unit, sc->kbd->kb_index, sc->kbd->kb_unit, sc->kbd->kb_flags));
}
if (!(sc->flags & SC_INIT_DONE) || (adp != sc->adp)) {
sc->initial_mode = sc->adp->va_initial_mode;
#ifndef SC_NO_FONT_LOADING
if (flags & SC_KERNEL_CONSOLE) {
sc->font_8 = font_8;
sc->font_14 = font_14;
sc->font_16 = font_16;
} else if (sc->font_8 == NULL) {
/* assert(sc_malloc) */
sc->font_8 = malloc(sizeof(font_8), M_DEVBUF, M_WAITOK);
sc->font_14 = malloc(sizeof(font_14), M_DEVBUF, M_WAITOK);
sc->font_16 = malloc(sizeof(font_16), M_DEVBUF, M_WAITOK);
}
#endif
/* extract the hardware cursor location and hide the cursor for now */
vidd_read_hw_cursor(sc->adp, &col, &row);
vidd_set_hw_cursor(sc->adp, -1, -1);
/* set up the first console */
sc->first_vty = unit*MAXCONS;
sc->vtys = MAXCONS; /* XXX: should be configurable */
if (flags & SC_KERNEL_CONSOLE) {
/*
* Set up the devs structure but don't use it yet; calling make_dev()
* might panic the kernel. Wait for sc_attach_unit() to actually
* create the devices.
*/
sc->dev = main_devs;
scp = &main_console;
init_scp(sc, sc->first_vty, scp);
sc_vtb_init(&scp->vtb, VTB_MEMORY, scp->xsize, scp->ysize,
(void *)sc_buffer, FALSE);
/* move cursors to the initial positions */
if (col >= scp->xsize)
col = 0;
if (row >= scp->ysize)
row = scp->ysize - 1;
scp->xpos = col;
scp->ypos = row;
scp->cursor_pos = scp->cursor_oldpos = row*scp->xsize + col;
if (sc_init_emulator(scp, SC_DFLT_TERM))
sc_init_emulator(scp, "*");
(*scp->tsw->te_default_attr)(scp,
user_default.std_color,
user_default.rev_color);
} else {
/* assert(sc_malloc) */
sc->dev = malloc(sizeof(struct tty *)*sc->vtys, M_DEVBUF,
M_WAITOK|M_ZERO);
sc->dev[0] = sc_alloc_tty(0, unit * MAXCONS);
scp = alloc_scp(sc, sc->first_vty);
SC_STAT(sc->dev[0]) = scp;
}
sc->cur_scp = scp;
#ifndef __sparc64__
/* copy screen to temporary buffer */
sc_vtb_init(&scp->scr, VTB_FRAMEBUFFER, scp->xsize, scp->ysize,
(void *)scp->sc->adp->va_window, FALSE);
if (ISTEXTSC(scp))
sc_vtb_copy(&scp->scr, 0, &scp->vtb, 0, scp->xsize*scp->ysize);
#endif
if (bios_value.cursor_end < scp->font_size)
sc->dflt_curs_attr.base = scp->font_size -
bios_value.cursor_end - 1;
else
sc->dflt_curs_attr.base = 0;
i = bios_value.cursor_end - bios_value.cursor_start + 1;
sc->dflt_curs_attr.height = imin(i, scp->font_size);
sc->dflt_curs_attr.flags = 0;
sc->curs_attr = sc->dflt_curs_attr;
scp->curr_curs_attr = scp->dflt_curs_attr = sc->curs_attr;
#ifndef SC_NO_SYSMOUSE
sc_mouse_move(scp, scp->xpixel/2, scp->ypixel/2);
#endif
if (!ISGRAPHSC(scp)) {
sc_set_cursor_image(scp);
sc_draw_cursor_image(scp);
}
/* save font and palette */
#ifndef SC_NO_FONT_LOADING
sc->fonts_loaded = 0;
if (ISFONTAVAIL(sc->adp->va_flags)) {
#ifdef SC_DFLT_FONT
bcopy(dflt_font_8, sc->font_8, sizeof(dflt_font_8));
bcopy(dflt_font_14, sc->font_14, sizeof(dflt_font_14));
bcopy(dflt_font_16, sc->font_16, sizeof(dflt_font_16));
sc->fonts_loaded = FONT_16 | FONT_14 | FONT_8;
if (scp->font_size < 14) {
sc_load_font(scp, 0, 8, 8, sc->font_8, 0, 256);
} else if (scp->font_size >= 16) {
sc_load_font(scp, 0, 16, 8, sc->font_16, 0, 256);
} else {
sc_load_font(scp, 0, 14, 8, sc->font_14, 0, 256);
}
#else /* !SC_DFLT_FONT */
if (scp->font_size < 14) {
sc_save_font(scp, 0, 8, 8, sc->font_8, 0, 256);
sc->fonts_loaded = FONT_8;
} else if (scp->font_size >= 16) {
sc_save_font(scp, 0, 16, 8, sc->font_16, 0, 256);
sc->fonts_loaded = FONT_16;
} else {
sc_save_font(scp, 0, 14, 8, sc->font_14, 0, 256);
sc->fonts_loaded = FONT_14;
}
#endif /* SC_DFLT_FONT */
/* FONT KLUDGE: always use the font page #0. XXX */
sc_show_font(scp, 0);
}
#endif /* !SC_NO_FONT_LOADING */
#ifndef SC_NO_PALETTE_LOADING
vidd_save_palette(sc->adp, sc->palette);
#ifdef SC_PIXEL_MODE
for (i = 0; i < sizeof(sc->palette2); i++)
sc->palette2[i] = i / 3;
#endif
#endif
#ifdef DEV_SPLASH
if (!(sc->flags & SC_SPLASH_SCRN)) {
/* we are ready to put up the splash image! */
splash_init(sc->adp, scsplash_callback, sc);
sc->flags |= SC_SPLASH_SCRN;
}
#endif
}
/* the rest is not necessary, if we have done it once */
if (sc->flags & SC_INIT_DONE)
return;
/* initialize mapscrn arrays to a one to one map */
for (i = 0; i < sizeof(sc->scr_map); i++)
sc->scr_map[i] = sc->scr_rmap[i] = i;
#ifdef PC98
sc->scr_map[0x5c] = (u_char)0xfc; /* for backslash */
#endif
sc->flags |= SC_INIT_DONE;
}
static void
scterm(int unit, int flags)
{
sc_softc_t *sc;
scr_stat *scp;
sc = sc_get_softc(unit, flags & SC_KERNEL_CONSOLE);
if (sc == NULL)
return; /* shouldn't happen */
#ifdef DEV_SPLASH
/* this console is no longer available for the splash screen */
if (sc->flags & SC_SPLASH_SCRN) {
splash_term(sc->adp);
sc->flags &= ~SC_SPLASH_SCRN;
}
#endif
#if 0 /* XXX */
/* move the hardware cursor to the upper-left corner */
vidd_set_hw_cursor(sc->adp, 0, 0);
#endif
/* release the keyboard and the video card */
if (sc->keyboard >= 0)
kbd_release(sc->kbd, &sc->keyboard);
if (sc->adapter >= 0)
vid_release(sc->adp, &sc->adapter);
/* stop the terminal emulator, if any */
scp = sc_get_stat(sc->dev[0]);
if (scp->tsw)
(*scp->tsw->te_term)(scp, &scp->ts);
if (scp->ts != NULL)
free(scp->ts, M_DEVBUF);
mtx_destroy(&scp->scr_lock);
/* clear the structure */
if (!(flags & SC_KERNEL_CONSOLE)) {
/* XXX: We need delete_dev() for this */
free(sc->dev, M_DEVBUF);
#if 0
/* XXX: We need a ttyunregister for this */
free(sc->tty, M_DEVBUF);
#endif
#ifndef SC_NO_FONT_LOADING
free(sc->font_8, M_DEVBUF);
free(sc->font_14, M_DEVBUF);
free(sc->font_16, M_DEVBUF);
#endif
/* XXX vtb, history */
}
bzero(sc, sizeof(*sc));
sc->keyboard = -1;
sc->adapter = -1;
}
static void
scshutdown(__unused void *arg, __unused int howto)
{
KASSERT(sc_console != NULL, ("sc_console != NULL"));
KASSERT(sc_console->sc != NULL, ("sc_console->sc != NULL"));
KASSERT(sc_console->sc->cur_scp != NULL,
("sc_console->sc->cur_scp != NULL"));
sc_touch_scrn_saver();
if (!cold &&
sc_console->sc->cur_scp->index != sc_console->index &&
sc_console->sc->cur_scp->smode.mode == VT_AUTO &&
sc_console->smode.mode == VT_AUTO)
sc_switch_scr(sc_console->sc, sc_console->index);
shutdown_in_progress = TRUE;
}
static void
scsuspend(__unused void *arg)
{
int retry;
KASSERT(sc_console != NULL, ("sc_console != NULL"));
KASSERT(sc_console->sc != NULL, ("sc_console->sc != NULL"));
KASSERT(sc_console->sc->cur_scp != NULL,
("sc_console->sc->cur_scp != NULL"));
sc_susp_scr = sc_console->sc->cur_scp->index;
if (sc_no_suspend_vtswitch ||
sc_susp_scr == sc_console->index) {
sc_touch_scrn_saver();
sc_susp_scr = -1;
return;
}
for (retry = 0; retry < 10; retry++) {
sc_switch_scr(sc_console->sc, sc_console->index);
if (!sc_console->sc->switch_in_progress)
break;
pause("scsuspend", hz);
}
suspend_in_progress = TRUE;
}
static void
scresume(__unused void *arg)
{
KASSERT(sc_console != NULL, ("sc_console != NULL"));
KASSERT(sc_console->sc != NULL, ("sc_console->sc != NULL"));
KASSERT(sc_console->sc->cur_scp != NULL,
("sc_console->sc->cur_scp != NULL"));
suspend_in_progress = FALSE;
if (sc_susp_scr < 0) {
mark_all(sc_console->sc->cur_scp);
return;
}
sc_switch_scr(sc_console->sc, sc_susp_scr);
}
int
sc_clean_up(scr_stat *scp)
{
#ifdef DEV_SPLASH
int error;
#endif
if (scp->sc->flags & SC_SCRN_BLANKED) {
sc_touch_scrn_saver();
#ifdef DEV_SPLASH
if ((error = wait_scrn_saver_stop(scp->sc)))
return error;
#endif
}
scp->status |= MOUSE_HIDDEN;
sc_remove_mouse_image(scp);
sc_remove_cutmarking(scp);
return 0;
}
void
sc_alloc_scr_buffer(scr_stat *scp, int wait, int discard)
{
sc_vtb_t new;
sc_vtb_t old;
old = scp->vtb;
sc_vtb_init(&new, VTB_MEMORY, scp->xsize, scp->ysize, NULL, wait);
if (!discard && (old.vtb_flags & VTB_VALID)) {
/* retain the current cursor position and buffer contents */
scp->cursor_oldpos = scp->cursor_pos;
/*
* This works only if the old buffer has the same size as or larger
* than the new one. XXX
*/
sc_vtb_copy(&old, 0, &new, 0, scp->xsize*scp->ysize);
scp->vtb = new;
} else {
scp->vtb = new;
sc_vtb_destroy(&old);
}
#ifndef SC_NO_SYSMOUSE
/* move the mouse cursor to the center of the screen */
sc_mouse_move(scp, scp->xpixel / 2, scp->ypixel / 2);
#endif
}
static scr_stat
*alloc_scp(sc_softc_t *sc, int vty)
{
scr_stat *scp;
/* assert(sc_malloc) */
scp = (scr_stat *)malloc(sizeof(scr_stat), M_DEVBUF, M_WAITOK);
init_scp(sc, vty, scp);
sc_alloc_scr_buffer(scp, TRUE, TRUE);
if (sc_init_emulator(scp, SC_DFLT_TERM))
sc_init_emulator(scp, "*");
#ifndef SC_NO_CUTPASTE
sc_alloc_cut_buffer(scp, TRUE);
#endif
#ifndef SC_NO_HISTORY
sc_alloc_history_buffer(scp, 0, 0, TRUE);
#endif
return scp;
}
static void
init_scp(sc_softc_t *sc, int vty, scr_stat *scp)
{
video_info_t info;
bzero(scp, sizeof(*scp));
scp->index = vty;
scp->sc = sc;
scp->status = 0;
scp->mode = sc->initial_mode;
vidd_get_info(sc->adp, scp->mode, &info);
if (info.vi_flags & V_INFO_GRAPHICS) {
scp->status |= GRAPHICS_MODE;
scp->xpixel = info.vi_width;
scp->ypixel = info.vi_height;
scp->xsize = info.vi_width/info.vi_cwidth;
scp->ysize = info.vi_height/info.vi_cheight;
scp->font_size = 0;
scp->font = NULL;
} else {
scp->xsize = info.vi_width;
scp->ysize = info.vi_height;
scp->xpixel = scp->xsize*info.vi_cwidth;
scp->ypixel = scp->ysize*info.vi_cheight;
}
scp->font_size = info.vi_cheight;
scp->font_width = info.vi_cwidth;
#ifndef SC_NO_FONT_LOADING
if (info.vi_cheight < 14)
scp->font = sc->font_8;
else if (info.vi_cheight >= 16)
scp->font = sc->font_16;
else
scp->font = sc->font_14;
#else
scp->font = NULL;
#endif
sc_vtb_init(&scp->vtb, VTB_MEMORY, 0, 0, NULL, FALSE);
#ifndef __sparc64__
sc_vtb_init(&scp->scr, VTB_FRAMEBUFFER, 0, 0, NULL, FALSE);
#endif
scp->xoff = scp->yoff = 0;
scp->xpos = scp->ypos = 0;
scp->start = scp->xsize * scp->ysize - 1;
scp->end = 0;
scp->tsw = NULL;
scp->ts = NULL;
scp->rndr = NULL;
scp->border = (SC_NORM_ATTR >> 4) & 0x0f;
scp->curr_curs_attr = scp->dflt_curs_attr = sc->curs_attr;
scp->mouse_cut_start = scp->xsize*scp->ysize;
scp->mouse_cut_end = -1;
scp->mouse_signal = 0;
scp->mouse_pid = 0;
scp->mouse_proc = NULL;
scp->kbd_mode = K_XLATE;
scp->bell_pitch = bios_value.bell_pitch;
scp->bell_duration = BELL_DURATION;
scp->status |= (bios_value.shift_state & NLKED);
scp->status |= CURSOR_ENABLED | MOUSE_HIDDEN;
scp->pid = 0;
scp->proc = NULL;
scp->smode.mode = VT_AUTO;
scp->history = NULL;
scp->history_pos = 0;
scp->history_size = 0;
mtx_init(&scp->scr_lock, "scrlock", NULL, MTX_SPIN);
}
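/*
 * sc_init_emulator() attaches a terminal emulator and a renderer to the
 * vty.  Re-selecting the emulator already in use takes the warm-init
 * path and keeps the existing emulator state; switching emulators does a
 * cold init into a freshly allocated state buffer and releases the old
 * one.
 */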
int
sc_init_emulator(scr_stat *scp, char *name)
{
sc_term_sw_t *sw;
sc_rndr_sw_t *rndr;
void *p;
int error;
if (name == NULL) /* if no name is given, use the current emulator */
sw = scp->tsw;
else /* ...otherwise find the named emulator */
sw = sc_term_match(name);
if (sw == NULL)
return EINVAL;
rndr = NULL;
if (strcmp(sw->te_renderer, "*") != 0) {
rndr = sc_render_match(scp, sw->te_renderer,
scp->status & (GRAPHICS_MODE | PIXEL_MODE));
}
if (rndr == NULL) {
rndr = sc_render_match(scp, scp->sc->adp->va_name,
scp->status & (GRAPHICS_MODE | PIXEL_MODE));
if (rndr == NULL)
return ENODEV;
}
if (sw == scp->tsw) {
error = (*sw->te_init)(scp, &scp->ts, SC_TE_WARM_INIT);
scp->rndr = rndr;
scp->rndr->init(scp);
sc_clear_screen(scp);
/* assert(error == 0); */
return error;
}
if (sc_malloc && (sw->te_size > 0))
p = malloc(sw->te_size, M_DEVBUF, M_NOWAIT);
else
p = NULL;
error = (*sw->te_init)(scp, &p, SC_TE_COLD_INIT);
if (error)
return error;
if (scp->tsw)
(*scp->tsw->te_term)(scp, &scp->ts);
if (scp->ts != NULL)
free(scp->ts, M_DEVBUF);
scp->tsw = sw;
scp->ts = p;
scp->rndr = rndr;
scp->rndr->init(scp);
/* XXX */
(*sw->te_default_attr)(scp, user_default.std_color, user_default.rev_color);
sc_clear_screen(scp);
return 0;
}
/*
* scgetc(flags) - get character from keyboard.
* If flags & SCGETC_CN, then avoid harmful side effects.
* If flags & SCGETC_NONBLOCK, return NOKEY immediately when no key is
* waiting; otherwise block until a key is pressed.
*/
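/*
 * The value returned by scgetc() is the keyboard layer's encoded code,
 * not a plain character: callers such as sc_cngetc() extract the
 * character with KEYCHAR() and inspect KEYFLAGS() for RELKEY, SPCLKEY
 * and FKEY, or compare against NOKEY/ERRKEY.
 */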
static u_int
scgetc(sc_softc_t *sc, u_int flags)
{
scr_stat *scp;
#ifndef SC_NO_HISTORY
struct tty *tp;
#endif
u_int c;
int this_scr;
int f;
int i;
if (sc->kbd == NULL)
return NOKEY;
next_code:
#if 1
/* I don't like this, but... XXX */
if (flags & SCGETC_CN)
sccnupdate(sc->cur_scp);
#endif
scp = sc->cur_scp;
/* first see if there is something in the keyboard port */
for (;;) {
c = kbdd_read_char(sc->kbd, !(flags & SCGETC_NONBLOCK));
if (c == ERRKEY) {
if (!(flags & SCGETC_CN))
sc_bell(scp, bios_value.bell_pitch, BELL_DURATION);
} else if (c == NOKEY)
return c;
else
break;
}
/* make screensaver happy */
if (!(c & RELKEY))
sc_touch_scrn_saver();
if (!(flags & SCGETC_CN))
random_harvest(&c, sizeof(c), 1, 0, RANDOM_KEYBOARD);
if (scp->kbd_mode != K_XLATE)
return KEYCHAR(c);
/* if scroll-lock pressed allow history browsing */
if (!ISGRAPHSC(scp) && scp->history && scp->status & SLKED) {
scp->status &= ~CURSOR_ENABLED;
sc_remove_cursor_image(scp);
#ifndef SC_NO_HISTORY
if (!(scp->status & BUFFER_SAVED)) {
scp->status |= BUFFER_SAVED;
sc_hist_save(scp);
}
switch (c) {
/* FIXME: key codes */
case SPCLKEY | FKEY | F(49): /* home key */
sc_remove_cutmarking(scp);
sc_hist_home(scp);
goto next_code;
case SPCLKEY | FKEY | F(57): /* end key */
sc_remove_cutmarking(scp);
sc_hist_end(scp);
goto next_code;
case SPCLKEY | FKEY | F(50): /* up arrow key */
sc_remove_cutmarking(scp);
if (sc_hist_up_line(scp))
if (!(flags & SCGETC_CN))
sc_bell(scp, bios_value.bell_pitch, BELL_DURATION);
goto next_code;
case SPCLKEY | FKEY | F(58): /* down arrow key */
sc_remove_cutmarking(scp);
if (sc_hist_down_line(scp))
if (!(flags & SCGETC_CN))
sc_bell(scp, bios_value.bell_pitch, BELL_DURATION);
goto next_code;
case SPCLKEY | FKEY | F(51): /* page up key */
sc_remove_cutmarking(scp);
for (i=0; i<scp->ysize; i++)
if (sc_hist_up_line(scp)) {
if (!(flags & SCGETC_CN))
sc_bell(scp, bios_value.bell_pitch, BELL_DURATION);
break;
}
goto next_code;
case SPCLKEY | FKEY | F(59): /* page down key */
sc_remove_cutmarking(scp);
for (i=0; i<scp->ysize; i++)
if (sc_hist_down_line(scp)) {
if (!(flags & SCGETC_CN))
sc_bell(scp, bios_value.bell_pitch, BELL_DURATION);
break;
}
goto next_code;
}
#endif /* SC_NO_HISTORY */
}
/*
* Process and consume special keys here. Return a plain char code
* or a char code with the META flag or a function key code.
*/
if (c & RELKEY) {
/* key released */
/* goto next_code */
} else {
/* key pressed */
if (c & SPCLKEY) {
c &= ~SPCLKEY;
switch (KEYCHAR(c)) {
/* LOCKING KEYS */
case NLK: case CLK: case ALK:
break;
case SLK:
(void)kbdd_ioctl(sc->kbd, KDGKBSTATE, (caddr_t)&f);
if (f & SLKED) {
scp->status |= SLKED;
} else {
if (scp->status & SLKED) {
scp->status &= ~SLKED;
#ifndef SC_NO_HISTORY
if (scp->status & BUFFER_SAVED) {
if (!sc_hist_restore(scp))
sc_remove_cutmarking(scp);
scp->status &= ~BUFFER_SAVED;
scp->status |= CURSOR_ENABLED;
sc_draw_cursor_image(scp);
}
tp = SC_DEV(sc, scp->index);
if (!kdb_active && tty_opened(tp))
sctty_outwakeup(tp);
#endif
}
}
break;
case PASTE:
#ifndef SC_NO_CUTPASTE
sc_mouse_paste(scp);
#endif
break;
/* NON-LOCKING KEYS */
case NOP:
case LSH: case RSH: case LCTR: case RCTR:
case LALT: case RALT: case ASH: case META:
break;
case BTAB:
if (!(sc->flags & SC_SCRN_BLANKED))
return c;
break;
case SPSC:
#ifdef DEV_SPLASH
/* force activation/deactivation of the screen saver */
if (!(sc->flags & SC_SCRN_BLANKED)) {
run_scrn_saver = TRUE;
sc->scrn_time_stamp -= scrn_blank_time;
}
if (cold) {
/*
* While devices are being probed, the screen saver needs
* to be invoked explicitly. XXX
*/
if (sc->flags & SC_SCRN_BLANKED) {
scsplash_stick(FALSE);
stop_scrn_saver(sc, current_saver);
} else {
if (!ISGRAPHSC(scp)) {
scsplash_stick(TRUE);
(*current_saver)(sc, TRUE);
}
}
}
#endif /* DEV_SPLASH */
break;
case RBT:
#ifndef SC_DISABLE_REBOOT
if (enable_reboot)
shutdown_nice(0);
#endif
break;
case HALT:
#ifndef SC_DISABLE_REBOOT
if (enable_reboot)
shutdown_nice(RB_HALT);
#endif
break;
case PDWN:
#ifndef SC_DISABLE_REBOOT
if (enable_reboot)
shutdown_nice(RB_HALT|RB_POWEROFF);
#endif
break;
case SUSP:
power_pm_suspend(POWER_SLEEP_STATE_SUSPEND);
break;
case STBY:
power_pm_suspend(POWER_SLEEP_STATE_STANDBY);
break;
case DBG:
#ifndef SC_DISABLE_KDBKEY
if (enable_kdbkey)
kdb_break();
#endif
break;
case PNC:
if (enable_panic_key)
panic("Forced by the panic key");
break;
case NEXT:
this_scr = scp->index;
for (i = (this_scr - sc->first_vty + 1)%sc->vtys;
sc->first_vty + i != this_scr;
i = (i + 1)%sc->vtys) {
struct tty *tp = SC_DEV(sc, sc->first_vty + i);
if (tty_opened(tp)) {
sc_switch_scr(scp->sc, sc->first_vty + i);
break;
}
}
break;
case PREV:
this_scr = scp->index;
for (i = (this_scr - sc->first_vty + sc->vtys - 1)%sc->vtys;
sc->first_vty + i != this_scr;
i = (i + sc->vtys - 1)%sc->vtys) {
struct tty *tp = SC_DEV(sc, sc->first_vty + i);
if (tty_opened(tp)) {
sc_switch_scr(scp->sc, sc->first_vty + i);
break;
}
}
break;
default:
if (KEYCHAR(c) >= F_SCR && KEYCHAR(c) <= L_SCR) {
sc_switch_scr(scp->sc, sc->first_vty + KEYCHAR(c) - F_SCR);
break;
}
/* assert(c & FKEY) */
if (!(sc->flags & SC_SCRN_BLANKED))
return c;
break;
}
/* goto next_code */
} else {
/* regular keys (maybe MKEY is set) */
#if !defined(SC_DISABLE_KDBKEY) && defined(KDB)
if (enable_kdbkey)
kdb_alt_break(c, &sc->sc_altbrk);
#endif
if (!(sc->flags & SC_SCRN_BLANKED))
return c;
}
}
goto next_code;
}
static int
sctty_mmap(struct tty *tp, vm_ooffset_t offset, vm_paddr_t *paddr,
int nprot, vm_memattr_t *memattr)
{
scr_stat *scp;
scp = sc_get_stat(tp);
if (scp != scp->sc->cur_scp)
return -1;
return vidd_mmap(scp->sc->adp, offset, paddr, nprot, memattr);
}
static int
save_kbd_state(scr_stat *scp)
{
int state;
int error;
error = kbdd_ioctl(scp->sc->kbd, KDGKBSTATE, (caddr_t)&state);
if (error == ENOIOCTL)
error = ENODEV;
if (error == 0) {
scp->status &= ~LOCK_MASK;
scp->status |= state;
}
return error;
}
static int
update_kbd_state(scr_stat *scp, int new_bits, int mask)
{
int state;
int error;
if (mask != LOCK_MASK) {
error = kbdd_ioctl(scp->sc->kbd, KDGKBSTATE, (caddr_t)&state);
if (error == ENOIOCTL)
error = ENODEV;
if (error)
return error;
state &= ~mask;
state |= new_bits & mask;
} else {
state = new_bits & LOCK_MASK;
}
error = kbdd_ioctl(scp->sc->kbd, KDSKBSTATE, (caddr_t)&state);
if (error == ENOIOCTL)
error = ENODEV;
return error;
}
static int
update_kbd_leds(scr_stat *scp, int which)
{
int error;
which &= LOCK_MASK;
error = kbdd_ioctl(scp->sc->kbd, KDSETLED, (caddr_t)&which);
if (error == ENOIOCTL)
error = ENODEV;
return error;
}
int
set_mode(scr_stat *scp)
{
video_info_t info;
/* reject unsupported mode */
if (vidd_get_info(scp->sc->adp, scp->mode, &info))
return 1;
/* if this vty is not currently showing, do nothing */
if (scp != scp->sc->cur_scp)
return 0;
/* setup video hardware for the given mode */
vidd_set_mode(scp->sc->adp, scp->mode);
scp->rndr->init(scp);
#ifndef __sparc64__
sc_vtb_init(&scp->scr, VTB_FRAMEBUFFER, scp->xsize, scp->ysize,
(void *)scp->sc->adp->va_window, FALSE);
#endif
#ifndef SC_NO_FONT_LOADING
/* load appropriate font */
if (!(scp->status & GRAPHICS_MODE)) {
if (!(scp->status & PIXEL_MODE) && ISFONTAVAIL(scp->sc->adp->va_flags)) {
if (scp->font_size < 14) {
if (scp->sc->fonts_loaded & FONT_8)
sc_load_font(scp, 0, 8, 8, scp->sc->font_8, 0, 256);
} else if (scp->font_size >= 16) {
if (scp->sc->fonts_loaded & FONT_16)
sc_load_font(scp, 0, 16, 8, scp->sc->font_16, 0, 256);
} else {
if (scp->sc->fonts_loaded & FONT_14)
sc_load_font(scp, 0, 14, 8, scp->sc->font_14, 0, 256);
}
/*
* FONT KLUDGE:
* This is an interim kludge to display the correct font.
* Always use the font page #0 on the video plane 2.
* Somehow we cannot show the font in other font pages on
* some video cards... XXX
*/
sc_show_font(scp, 0);
}
mark_all(scp);
}
#endif /* !SC_NO_FONT_LOADING */
sc_set_border(scp, scp->border);
sc_set_cursor_image(scp);
return 0;
}
void
sc_set_border(scr_stat *scp, int color)
{
SC_VIDEO_LOCK(scp->sc);
(*scp->rndr->draw_border)(scp, color);
SC_VIDEO_UNLOCK(scp->sc);
}
#ifndef SC_NO_FONT_LOADING
void
sc_load_font(scr_stat *scp, int page, int size, int width, u_char *buf,
int base, int count)
{
sc_softc_t *sc;
sc = scp->sc;
sc->font_loading_in_progress = TRUE;
vidd_load_font(sc->adp, page, size, width, buf, base, count);
sc->font_loading_in_progress = FALSE;
}
void
sc_save_font(scr_stat *scp, int page, int size, int width, u_char *buf,
int base, int count)
{
sc_softc_t *sc;
sc = scp->sc;
sc->font_loading_in_progress = TRUE;
vidd_save_font(sc->adp, page, size, width, buf, base, count);
sc->font_loading_in_progress = FALSE;
}
void
sc_show_font(scr_stat *scp, int page)
{
vidd_show_font(scp->sc->adp, page);
}
#endif /* !SC_NO_FONT_LOADING */
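/*
 * sc_paste() feeds a cut & paste buffer into the tty of the currently
 * visible vty as if it had been typed: each byte is mapped back through
 * the reverse screen map (scr_rmap) and handed to the line discipline.
 * sc_respond() similarly injects bytes (e.g. terminal query responses)
 * into the same tty.
 */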
void
sc_paste(scr_stat *scp, const u_char *p, int count)
{
struct tty *tp;
u_char *rmap;
tp = SC_DEV(scp->sc, scp->sc->cur_scp->index);
if (!tty_opened(tp))
return;
rmap = scp->sc->scr_rmap;
for (; count > 0; --count)
ttydisc_rint(tp, rmap[*p++], 0);
ttydisc_rint_done(tp);
}
void
sc_respond(scr_stat *scp, const u_char *p, int count, int wakeup)
{
struct tty *tp;
tp = SC_DEV(scp->sc, scp->sc->cur_scp->index);
if (!tty_opened(tp))
return;
ttydisc_rint_simple(tp, p, count);
if (wakeup) {
/* XXX: we can't always call ttydisc_rint_done() here! */
ttydisc_rint_done(tp);
}
}
void
sc_bell(scr_stat *scp, int pitch, int duration)
{
if (cold || shutdown_in_progress || !enable_bell)
return;
if (scp != scp->sc->cur_scp && (scp->sc->flags & SC_QUIET_BELL))
return;
if (scp->sc->flags & SC_VISUAL_BELL) {
if (scp->sc->blink_in_progress)
return;
scp->sc->blink_in_progress = 3;
if (scp != scp->sc->cur_scp)
scp->sc->blink_in_progress += 2;
blink_screen(scp->sc->cur_scp);
} else if (duration != 0 && pitch != 0) {
if (scp != scp->sc->cur_scp)
pitch *= 2;
sysbeep(1193182 / pitch, duration);
}
}
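/*
 * blink_screen() implements the visual bell: it redraws the whole vty
 * with alternating attributes on successive callouts (hz/10 apart)
 * while blink_in_progress counts down, then restores the screen and
 * completes any delayed vty switch.
 */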
static void
blink_screen(void *arg)
{
scr_stat *scp = arg;
struct tty *tp;
if (ISGRAPHSC(scp) || (scp->sc->blink_in_progress <= 1)) {
scp->sc->blink_in_progress = 0;
mark_all(scp);
tp = SC_DEV(scp->sc, scp->index);
if (tty_opened(tp))
sctty_outwakeup(tp);
if (scp->sc->delayed_next_scr)
sc_switch_scr(scp->sc, scp->sc->delayed_next_scr - 1);
}
else {
(*scp->rndr->draw)(scp, 0, scp->xsize*scp->ysize,
scp->sc->blink_in_progress & 1);
scp->sc->blink_in_progress--;
timeout(blink_screen, scp, hz / 10);
}
}
/*
* Until sc_attach_unit() gets called no dev structures will be available
* to store the per-screen current status. This is the case when the
* kernel is initially booting and needs access to its console. During
* this early phase of booting the console's current status is kept in
* one statically defined scr_stat structure, and any pointers to the
* dev structures will be NULL.
*/
static scr_stat *
sc_get_stat(struct tty *tp)
{
if (tp == NULL)
return (&main_console);
return (SC_STAT(tp));
}
/*
* Allocate active keyboard. Try to allocate "kbdmux" keyboard first, and,
* if found, add all non-busy keyboards to "kbdmux". Otherwise look for
* any keyboard.
*/
static int
sc_allocate_keyboard(sc_softc_t *sc, int unit)
{
int idx0, idx;
keyboard_t *k0, *k;
keyboard_info_t ki;
idx0 = kbd_allocate("kbdmux", -1, (void *)&sc->keyboard, sckbdevent, sc);
if (idx0 != -1) {
k0 = kbd_get_keyboard(idx0);
for (idx = kbd_find_keyboard2("*", -1, 0);
idx != -1;
idx = kbd_find_keyboard2("*", -1, idx + 1)) {
k = kbd_get_keyboard(idx);
if (idx == idx0 || KBD_IS_BUSY(k))
continue;
bzero(&ki, sizeof(ki));
strcpy(ki.kb_name, k->kb_name);
ki.kb_unit = k->kb_unit;
(void)kbdd_ioctl(k0, KBADDKBD, (caddr_t) &ki);
}
} else
idx0 = kbd_allocate("*", unit, (void *)&sc->keyboard, sckbdevent, sc);
return (idx0);
}
Index: head/sys/dev/usb/usb_dev.c
===================================================================
--- head/sys/dev/usb/usb_dev.c (revision 225616)
+++ head/sys/dev/usb/usb_dev.c (revision 225617)
@@ -1,2295 +1,2295 @@
/* $FreeBSD$ */
/*-
* Copyright (c) 2006-2008 Hans Petter Selasky. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*
* usb_dev.c - An abstraction layer for creating devices under /dev/...
*/
#include <sys/stdint.h>
#include <sys/stddef.h>
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bus.h>
#include <sys/module.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <sys/unistd.h>
#include <sys/callout.h>
#include <sys/malloc.h>
#include <sys/priv.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <dev/usb/usb.h>
#include <dev/usb/usb_ioctl.h>
#include <dev/usb/usbdi.h>
#include <dev/usb/usbdi_util.h>
#define USB_DEBUG_VAR usb_fifo_debug
#include <dev/usb/usb_core.h>
#include <dev/usb/usb_dev.h>
#include <dev/usb/usb_mbuf.h>
#include <dev/usb/usb_process.h>
#include <dev/usb/usb_device.h>
#include <dev/usb/usb_debug.h>
#include <dev/usb/usb_busdma.h>
#include <dev/usb/usb_generic.h>
#include <dev/usb/usb_dynamic.h>
#include <dev/usb/usb_util.h>
#include <dev/usb/usb_controller.h>
#include <dev/usb/usb_bus.h>
#include <sys/filio.h>
#include <sys/ttycom.h>
#include <sys/syscallsubr.h>
#include <machine/stdarg.h>
#if USB_HAVE_UGEN
#ifdef USB_DEBUG
static int usb_fifo_debug = 0;
SYSCTL_NODE(_hw_usb, OID_AUTO, dev, CTLFLAG_RW, 0, "USB device");
SYSCTL_INT(_hw_usb_dev, OID_AUTO, debug, CTLFLAG_RW,
&usb_fifo_debug, 0, "Debug Level");
TUNABLE_INT("hw.usb.dev.debug", &usb_fifo_debug);
#endif
#if ((__FreeBSD_version >= 700001) || (__FreeBSD_version == 0) || \
((__FreeBSD_version >= 600034) && (__FreeBSD_version < 700000)))
#define USB_UCRED struct ucred *ucred,
#else
#define USB_UCRED
#endif
/* prototypes */
static int usb_fifo_open(struct usb_cdev_privdata *,
struct usb_fifo *, int);
static void usb_fifo_close(struct usb_fifo *, int);
static void usb_dev_init(void *);
static void usb_dev_init_post(void *);
static void usb_dev_uninit(void *);
static int usb_fifo_uiomove(struct usb_fifo *, void *, int,
struct uio *);
static void usb_fifo_check_methods(struct usb_fifo_methods *);
static struct usb_fifo *usb_fifo_alloc(void);
static struct usb_endpoint *usb_dev_get_ep(struct usb_device *, uint8_t,
uint8_t);
static void usb_loc_fill(struct usb_fs_privdata *,
struct usb_cdev_privdata *);
static void usb_close(void *);
static usb_error_t usb_ref_device(struct usb_cdev_privdata *, struct usb_cdev_refdata *, int);
static usb_error_t usb_usb_ref_device(struct usb_cdev_privdata *, struct usb_cdev_refdata *);
static void usb_unref_device(struct usb_cdev_privdata *, struct usb_cdev_refdata *);
static d_open_t usb_open;
static d_ioctl_t usb_ioctl;
static d_read_t usb_read;
static d_write_t usb_write;
static d_poll_t usb_poll;
static d_ioctl_t usb_static_ioctl;
static usb_fifo_open_t usb_fifo_dummy_open;
static usb_fifo_close_t usb_fifo_dummy_close;
static usb_fifo_ioctl_t usb_fifo_dummy_ioctl;
static usb_fifo_cmd_t usb_fifo_dummy_cmd;
/* character device structure used for devices (/dev/ugenX.Y and /dev/uXXX) */
struct cdevsw usb_devsw = {
.d_version = D_VERSION,
.d_open = usb_open,
.d_ioctl = usb_ioctl,
.d_name = "usbdev",
.d_flags = D_TRACKCLOSE,
.d_read = usb_read,
.d_write = usb_write,
.d_poll = usb_poll
};
static struct cdev* usb_dev = NULL;
/* character device structure used for /dev/usb */
static struct cdevsw usb_static_devsw = {
.d_version = D_VERSION,
.d_ioctl = usb_static_ioctl,
.d_name = "usb"
};
static TAILQ_HEAD(, usb_symlink) usb_sym_head;
static struct sx usb_sym_lock;
struct mtx usb_ref_lock;
/*------------------------------------------------------------------------*
* usb_loc_fill
*
* This is used to fill out a usb_cdev_privdata structure based on the
* device's address as contained in usb_fs_privdata.
*------------------------------------------------------------------------*/
static void
usb_loc_fill(struct usb_fs_privdata* pd, struct usb_cdev_privdata *cpd)
{
cpd->bus_index = pd->bus_index;
cpd->dev_index = pd->dev_index;
cpd->ep_addr = pd->ep_addr;
cpd->fifo_index = pd->fifo_index;
}
/*------------------------------------------------------------------------*
* usb_ref_device
*
* This function is used to atomically reference a USB device by its
* device location. If this function returns success the USB device
* will not disappear until the USB device is unreferenced.
*
* Return values:
* 0: Success, refcount incremented on the given USB device.
* Else: Failure.
*------------------------------------------------------------------------*/
static usb_error_t
usb_ref_device(struct usb_cdev_privdata *cpd,
struct usb_cdev_refdata *crd, int need_uref)
{
struct usb_fifo **ppf;
struct usb_fifo *f;
DPRINTFN(2, "cpd=%p need uref=%d\n", cpd, need_uref);
/* clear all refs */
memset(crd, 0, sizeof(*crd));
mtx_lock(&usb_ref_lock);
cpd->bus = devclass_get_softc(usb_devclass_ptr, cpd->bus_index);
if (cpd->bus == NULL) {
DPRINTFN(2, "no bus at %u\n", cpd->bus_index);
goto error;
}
cpd->udev = cpd->bus->devices[cpd->dev_index];
if (cpd->udev == NULL) {
DPRINTFN(2, "no device at %u\n", cpd->dev_index);
goto error;
}
if (cpd->udev->refcount == USB_DEV_REF_MAX) {
DPRINTFN(2, "no dev ref\n");
goto error;
}
if (need_uref) {
DPRINTFN(2, "ref udev - needed\n");
cpd->udev->refcount++;
mtx_unlock(&usb_ref_lock);
/*
* We need to grab the sx-lock before grabbing the
* FIFO refs to avoid deadlock at detach!
*/
usbd_enum_lock(cpd->udev);
mtx_lock(&usb_ref_lock);
/*
* Set "is_uref" after grabbing the default SX lock
*/
crd->is_uref = 1;
}
/* check if we are doing an open */
if (cpd->fflags == 0) {
/* use zero defaults */
} else {
/* check for write */
if (cpd->fflags & FWRITE) {
ppf = cpd->udev->fifo;
f = ppf[cpd->fifo_index + USB_FIFO_TX];
crd->txfifo = f;
crd->is_write = 1; /* ref */
if (f == NULL || f->refcount == USB_FIFO_REF_MAX)
goto error;
if (f->curr_cpd != cpd)
goto error;
/* check if USB-FS is active */
if (f->fs_ep_max != 0) {
crd->is_usbfs = 1;
}
}
/* check for read */
if (cpd->fflags & FREAD) {
ppf = cpd->udev->fifo;
f = ppf[cpd->fifo_index + USB_FIFO_RX];
crd->rxfifo = f;
crd->is_read = 1; /* ref */
if (f == NULL || f->refcount == USB_FIFO_REF_MAX)
goto error;
if (f->curr_cpd != cpd)
goto error;
/* check if USB-FS is active */
if (f->fs_ep_max != 0) {
crd->is_usbfs = 1;
}
}
}
/* when everything is OK we increment the refcounts */
if (crd->is_write) {
DPRINTFN(2, "ref write\n");
crd->txfifo->refcount++;
}
if (crd->is_read) {
DPRINTFN(2, "ref read\n");
crd->rxfifo->refcount++;
}
mtx_unlock(&usb_ref_lock);
return (0);
error:
if (crd->is_uref) {
usbd_enum_unlock(cpd->udev);
if (--(cpd->udev->refcount) == 0) {
cv_signal(&cpd->udev->ref_cv);
}
}
mtx_unlock(&usb_ref_lock);
DPRINTFN(2, "fail\n");
return (USB_ERR_INVAL);
}
/*------------------------------------------------------------------------*
* usb_usb_ref_device
*
* This function is used to upgrade a USB reference to include the
* USB device reference on a USB location.
*
* Return values:
* 0: Success, refcount incremented on the given USB device.
* Else: Failure.
*------------------------------------------------------------------------*/
static usb_error_t
usb_usb_ref_device(struct usb_cdev_privdata *cpd,
struct usb_cdev_refdata *crd)
{
/*
* Check if we already have a USB reference on this location:
*/
if (crd->is_uref)
return (0); /* success */
/*
* To avoid deadlock at detach we need to drop the FIFO ref
* and re-acquire a new ref!
*/
usb_unref_device(cpd, crd);
return (usb_ref_device(cpd, crd, 1 /* need uref */));
}
/*------------------------------------------------------------------------*
* usb_unref_device
*
* This function will release the reference count by one unit for the
* given USB device.
*------------------------------------------------------------------------*/
static void
usb_unref_device(struct usb_cdev_privdata *cpd,
struct usb_cdev_refdata *crd)
{
DPRINTFN(2, "cpd=%p is_uref=%d\n", cpd, crd->is_uref);
if (crd->is_uref)
usbd_enum_unlock(cpd->udev);
mtx_lock(&usb_ref_lock);
if (crd->is_read) {
if (--(crd->rxfifo->refcount) == 0) {
cv_signal(&crd->rxfifo->cv_drain);
}
crd->is_read = 0;
}
if (crd->is_write) {
if (--(crd->txfifo->refcount) == 0) {
cv_signal(&crd->txfifo->cv_drain);
}
crd->is_write = 0;
}
if (crd->is_uref) {
if (--(cpd->udev->refcount) == 0) {
cv_signal(&cpd->udev->ref_cv);
}
crd->is_uref = 0;
}
mtx_unlock(&usb_ref_lock);
}
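/*
* A minimal sketch of the reference pattern built on the two functions
* above, as followed by the cdev callbacks later in this file:
*
*	struct usb_cdev_refdata refs;
*
*	if (usb_ref_device(cpd, &refs, 0) != 0)
*		return (ENXIO);
*	... access refs.rxfifo and refs.txfifo ...
*	usb_unref_device(cpd, &refs);
*
* Passing a non-zero "need_uref" additionally takes the enumeration
* lock; usb_usb_ref_device() upgrades an existing FIFO-only reference
* to include it.
*/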
static struct usb_fifo *
usb_fifo_alloc(void)
{
struct usb_fifo *f;
f = malloc(sizeof(*f), M_USBDEV, M_WAITOK | M_ZERO);
if (f) {
cv_init(&f->cv_io, "FIFO-IO");
cv_init(&f->cv_drain, "FIFO-DRAIN");
f->refcount = 1;
}
return (f);
}
/*------------------------------------------------------------------------*
* usb_fifo_create
*------------------------------------------------------------------------*/
static int
usb_fifo_create(struct usb_cdev_privdata *cpd,
struct usb_cdev_refdata *crd)
{
struct usb_device *udev = cpd->udev;
struct usb_fifo *f;
struct usb_endpoint *ep;
uint8_t n;
uint8_t is_tx;
uint8_t is_rx;
uint8_t no_null;
uint8_t is_busy;
int e = cpd->ep_addr;
is_tx = (cpd->fflags & FWRITE) ? 1 : 0;
is_rx = (cpd->fflags & FREAD) ? 1 : 0;
no_null = 1;
is_busy = 0;
/* Preallocated FIFO */
if (e < 0) {
DPRINTFN(5, "Preallocated FIFO\n");
if (is_tx) {
f = udev->fifo[cpd->fifo_index + USB_FIFO_TX];
if (f == NULL)
return (EINVAL);
crd->txfifo = f;
}
if (is_rx) {
f = udev->fifo[cpd->fifo_index + USB_FIFO_RX];
if (f == NULL)
return (EINVAL);
crd->rxfifo = f;
}
return (0);
}
KASSERT(e >= 0 && e <= 15, ("endpoint %d out of range", e));
/* search for a free FIFO slot */
DPRINTFN(5, "Endpoint device, searching for 0x%02x\n", e);
for (n = 0;; n += 2) {
if (n == USB_FIFO_MAX) {
if (no_null) {
no_null = 0;
n = 0;
} else {
/* end of FIFOs reached */
DPRINTFN(5, "out of FIFOs\n");
return (ENOMEM);
}
}
/* Check for TX FIFO */
if (is_tx) {
f = udev->fifo[n + USB_FIFO_TX];
if (f != NULL) {
if (f->dev_ep_index != e) {
/* wrong endpoint index */
continue;
}
if (f->curr_cpd != NULL) {
/* FIFO is opened */
is_busy = 1;
continue;
}
} else if (no_null) {
continue;
}
}
/* Check for RX FIFO */
if (is_rx) {
f = udev->fifo[n + USB_FIFO_RX];
if (f != NULL) {
if (f->dev_ep_index != e) {
/* wrong endpoint index */
continue;
}
if (f->curr_cpd != NULL) {
/* FIFO is opened */
is_busy = 1;
continue;
}
} else if (no_null) {
continue;
}
}
break;
}
if (no_null == 0) {
if (e >= (USB_EP_MAX / 2)) {
/* we don't create any endpoints in this range */
DPRINTFN(5, "ep out of range\n");
return (is_busy ? EBUSY : EINVAL);
}
}
if ((e != 0) && is_busy) {
/*
* Only the default control endpoint is allowed to be
* opened multiple times!
*/
DPRINTFN(5, "busy\n");
return (EBUSY);
}
/* Check TX FIFO */
if (is_tx &&
(udev->fifo[n + USB_FIFO_TX] == NULL)) {
ep = usb_dev_get_ep(udev, e, USB_FIFO_TX);
DPRINTFN(5, "dev_get_endpoint(%d, 0x%x)\n", e, USB_FIFO_TX);
if (ep == NULL) {
DPRINTFN(5, "dev_get_endpoint returned NULL\n");
return (EINVAL);
}
f = usb_fifo_alloc();
if (f == NULL) {
DPRINTFN(5, "could not alloc tx fifo\n");
return (ENOMEM);
}
/* update some fields */
f->fifo_index = n + USB_FIFO_TX;
f->dev_ep_index = e;
f->priv_mtx = &udev->device_mtx;
f->priv_sc0 = ep;
f->methods = &usb_ugen_methods;
f->iface_index = ep->iface_index;
f->udev = udev;
mtx_lock(&usb_ref_lock);
udev->fifo[n + USB_FIFO_TX] = f;
mtx_unlock(&usb_ref_lock);
}
/* Check RX FIFO */
if (is_rx &&
(udev->fifo[n + USB_FIFO_RX] == NULL)) {
ep = usb_dev_get_ep(udev, e, USB_FIFO_RX);
DPRINTFN(5, "dev_get_endpoint(%d, 0x%x)\n", e, USB_FIFO_RX);
if (ep == NULL) {
DPRINTFN(5, "dev_get_endpoint returned NULL\n");
return (EINVAL);
}
f = usb_fifo_alloc();
if (f == NULL) {
DPRINTFN(5, "could not alloc rx fifo\n");
return (ENOMEM);
}
/* update some fields */
f->fifo_index = n + USB_FIFO_RX;
f->dev_ep_index = e;
f->priv_mtx = &udev->device_mtx;
f->priv_sc0 = ep;
f->methods = &usb_ugen_methods;
f->iface_index = ep->iface_index;
f->udev = udev;
mtx_lock(&usb_ref_lock);
udev->fifo[n + USB_FIFO_RX] = f;
mtx_unlock(&usb_ref_lock);
}
if (is_tx) {
crd->txfifo = udev->fifo[n + USB_FIFO_TX];
}
if (is_rx) {
crd->rxfifo = udev->fifo[n + USB_FIFO_RX];
}
/* fill out fifo index */
DPRINTFN(5, "fifo index = %d\n", n);
cpd->fifo_index = n;
/* complete */
return (0);
}
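/*
* Sketch of the udev->fifo[] layout assumed by the function above: the
* array is organized as TX/RX pairs, so once a slot "n" has been
* selected, the two halves of a duplex FIFO are reached as
*
*	f_tx = udev->fifo[n + USB_FIFO_TX];
*	f_rx = udev->fifo[n + USB_FIFO_RX];
*
* For a preallocated FIFO (ep_addr < 0) the slot is taken directly from
* cpd->fifo_index; otherwise a free pair matching the endpoint is
* searched for, preferring slots that already carry a matching FIFO.
*/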
void
usb_fifo_free(struct usb_fifo *f)
{
uint8_t n;
if (f == NULL) {
/* be NULL safe */
return;
}
/* destroy symlink devices, if any */
for (n = 0; n != 2; n++) {
if (f->symlink[n]) {
usb_free_symlink(f->symlink[n]);
f->symlink[n] = NULL;
}
}
mtx_lock(&usb_ref_lock);
/* delink ourselves to stop calls from userland */
if ((f->fifo_index < USB_FIFO_MAX) &&
(f->udev != NULL) &&
(f->udev->fifo[f->fifo_index] == f)) {
f->udev->fifo[f->fifo_index] = NULL;
} else {
DPRINTFN(0, "USB FIFO %p has not been linked\n", f);
}
/* decrease refcount */
f->refcount--;
/* prevent any write flush */
f->flag_iserror = 1;
/* need to wait until all callers have exited */
while (f->refcount != 0) {
mtx_unlock(&usb_ref_lock); /* avoid LOR */
mtx_lock(f->priv_mtx);
/* get I/O thread out of any sleep state */
if (f->flag_sleeping) {
f->flag_sleeping = 0;
cv_broadcast(&f->cv_io);
}
mtx_unlock(f->priv_mtx);
mtx_lock(&usb_ref_lock);
/* wait for sync */
cv_wait(&f->cv_drain, &usb_ref_lock);
}
mtx_unlock(&usb_ref_lock);
/* take care of closing the device here, if any */
usb_fifo_close(f, 0);
cv_destroy(&f->cv_io);
cv_destroy(&f->cv_drain);
free(f, M_USBDEV);
}
static struct usb_endpoint *
usb_dev_get_ep(struct usb_device *udev, uint8_t ep_index, uint8_t dir)
{
struct usb_endpoint *ep;
uint8_t ep_dir;
if (ep_index == 0) {
ep = &udev->ctrl_ep;
} else {
if (dir == USB_FIFO_RX) {
if (udev->flags.usb_mode == USB_MODE_HOST) {
ep_dir = UE_DIR_IN;
} else {
ep_dir = UE_DIR_OUT;
}
} else {
if (udev->flags.usb_mode == USB_MODE_HOST) {
ep_dir = UE_DIR_OUT;
} else {
ep_dir = UE_DIR_IN;
}
}
ep = usbd_get_ep_by_addr(udev, ep_index | ep_dir);
}
if (ep == NULL) {
/* if the endpoint does not exist then return */
return (NULL);
}
if (ep->edesc == NULL) {
/* invalid endpoint */
return (NULL);
}
return (ep); /* success */
}
/*------------------------------------------------------------------------*
* usb_fifo_open
*
* Returns:
* 0: Success
* Else: Failure
*------------------------------------------------------------------------*/
static int
usb_fifo_open(struct usb_cdev_privdata *cpd,
struct usb_fifo *f, int fflags)
{
int err;
if (f == NULL) {
/* no FIFO there */
DPRINTFN(2, "no FIFO\n");
return (ENXIO);
}
/* remove FWRITE and FREAD flags */
fflags &= ~(FWRITE | FREAD);
/* set correct file flags */
if ((f->fifo_index & 1) == USB_FIFO_TX) {
fflags |= FWRITE;
} else {
fflags |= FREAD;
}
/* check if we are already opened */
/* we don't need any locks when checking this variable */
if (f->curr_cpd != NULL) {
err = EBUSY;
goto done;
}
/* reset short flag before open */
f->flag_short = 0;
/* call open method */
err = (f->methods->f_open) (f, fflags);
if (err) {
goto done;
}
mtx_lock(f->priv_mtx);
/* reset sleep flag */
f->flag_sleeping = 0;
/* reset error flag */
f->flag_iserror = 0;
/* reset complete flag */
f->flag_iscomplete = 0;
/* reset select flag */
f->flag_isselect = 0;
/* reset flushing flag */
f->flag_flushing = 0;
/* reset ASYNC proc flag */
f->async_p = NULL;
mtx_lock(&usb_ref_lock);
/* flag the fifo as opened to prevent others */
f->curr_cpd = cpd;
mtx_unlock(&usb_ref_lock);
/* reset queue */
usb_fifo_reset(f);
mtx_unlock(f->priv_mtx);
done:
return (err);
}
/*------------------------------------------------------------------------*
* usb_fifo_reset
*------------------------------------------------------------------------*/
void
usb_fifo_reset(struct usb_fifo *f)
{
struct usb_mbuf *m;
if (f == NULL) {
return;
}
while (1) {
USB_IF_DEQUEUE(&f->used_q, m);
if (m) {
USB_IF_ENQUEUE(&f->free_q, m);
} else {
break;
}
}
/* reset have fragment flag */
f->flag_have_fragment = 0;
}
/*------------------------------------------------------------------------*
* usb_fifo_close
*------------------------------------------------------------------------*/
static void
usb_fifo_close(struct usb_fifo *f, int fflags)
{
int err;
/* check if we are not opened */
if (f->curr_cpd == NULL) {
/* nothing to do - already closed */
return;
}
mtx_lock(f->priv_mtx);
/* clear current cdev private data pointer */
f->curr_cpd = NULL;
/* check if we are selected */
if (f->flag_isselect) {
selwakeup(&f->selinfo);
f->flag_isselect = 0;
}
/* check if a thread wants SIGIO */
if (f->async_p != NULL) {
PROC_LOCK(f->async_p);
- psignal(f->async_p, SIGIO);
+ kern_psignal(f->async_p, SIGIO);
PROC_UNLOCK(f->async_p);
f->async_p = NULL;
}
/* remove FWRITE and FREAD flags */
fflags &= ~(FWRITE | FREAD);
/* flush written data, if any */
if ((f->fifo_index & 1) == USB_FIFO_TX) {
if (!f->flag_iserror) {
/* set flushing flag */
f->flag_flushing = 1;
/* get the last packet in */
if (f->flag_have_fragment) {
struct usb_mbuf *m;
f->flag_have_fragment = 0;
USB_IF_DEQUEUE(&f->free_q, m);
if (m) {
USB_IF_ENQUEUE(&f->used_q, m);
}
}
/* start write transfer, if not already started */
(f->methods->f_start_write) (f);
/* check if flushed already */
while (f->flag_flushing &&
(!f->flag_iserror)) {
/* wait until all data has been written */
f->flag_sleeping = 1;
err = cv_wait_sig(&f->cv_io, f->priv_mtx);
if (err) {
DPRINTF("signal received\n");
break;
}
}
}
fflags |= FWRITE;
/* stop write transfer, if not already stopped */
(f->methods->f_stop_write) (f);
} else {
fflags |= FREAD;
/* stop read transfer, if not already stopped */
(f->methods->f_stop_read) (f);
}
/* check if we are sleeping */
if (f->flag_sleeping) {
DPRINTFN(2, "Sleeping at close!\n");
}
mtx_unlock(f->priv_mtx);
/* call close method */
(f->methods->f_close) (f, fflags);
DPRINTF("closed\n");
}
/*------------------------------------------------------------------------*
* usb_open - cdev callback
*------------------------------------------------------------------------*/
static int
usb_open(struct cdev *dev, int fflags, int devtype, struct thread *td)
{
struct usb_fs_privdata* pd = (struct usb_fs_privdata*)dev->si_drv1;
struct usb_cdev_refdata refs;
struct usb_cdev_privdata *cpd;
int err, ep;
DPRINTFN(2, "%s fflags=0x%08x\n", dev->si_name, fflags);
KASSERT(fflags & (FREAD|FWRITE), ("invalid open flags"));
if (((fflags & FREAD) && !(pd->mode & FREAD)) ||
((fflags & FWRITE) && !(pd->mode & FWRITE))) {
DPRINTFN(2, "access mode not supported\n");
return (EPERM);
}
cpd = malloc(sizeof(*cpd), M_USBDEV, M_WAITOK | M_ZERO);
ep = cpd->ep_addr = pd->ep_addr;
usb_loc_fill(pd, cpd);
err = usb_ref_device(cpd, &refs, 1);
if (err) {
DPRINTFN(2, "cannot ref device\n");
free(cpd, M_USBDEV);
return (ENXIO);
}
cpd->fflags = fflags; /* access mode for open lifetime */
/* create FIFOs, if any */
err = usb_fifo_create(cpd, &refs);
/* check for error */
if (err) {
DPRINTFN(2, "cannot create fifo\n");
usb_unref_device(cpd, &refs);
free(cpd, M_USBDEV);
return (err);
}
if (fflags & FREAD) {
err = usb_fifo_open(cpd, refs.rxfifo, fflags);
if (err) {
DPRINTFN(2, "read open failed\n");
usb_unref_device(cpd, &refs);
free(cpd, M_USBDEV);
return (err);
}
}
if (fflags & FWRITE) {
err = usb_fifo_open(cpd, refs.txfifo, fflags);
if (err) {
DPRINTFN(2, "write open failed\n");
if (fflags & FREAD) {
usb_fifo_close(refs.rxfifo, fflags);
}
usb_unref_device(cpd, &refs);
free(cpd, M_USBDEV);
return (err);
}
}
usb_unref_device(cpd, &refs);
devfs_set_cdevpriv(cpd, usb_close);
return (0);
}
/*------------------------------------------------------------------------*
* usb_close - cdev callback
*------------------------------------------------------------------------*/
static void
usb_close(void *arg)
{
struct usb_cdev_refdata refs;
struct usb_cdev_privdata *cpd = arg;
int err;
DPRINTFN(2, "cpd=%p\n", cpd);
err = usb_ref_device(cpd, &refs, 0);
if (err)
goto done;
/*
* If this function is not called directly from the root HUB
* thread, there is usually a need to lock the enumeration
* lock. Check this.
*/
if (!usbd_enum_is_locked(cpd->udev)) {
DPRINTFN(2, "Locking enumeration\n");
/* reference device */
err = usb_usb_ref_device(cpd, &refs);
if (err)
goto done;
}
if (cpd->fflags & FREAD) {
usb_fifo_close(refs.rxfifo, cpd->fflags);
}
if (cpd->fflags & FWRITE) {
usb_fifo_close(refs.txfifo, cpd->fflags);
}
usb_unref_device(cpd, &refs);
done:
free(cpd, M_USBDEV);
}
static void
usb_dev_init(void *arg)
{
mtx_init(&usb_ref_lock, "USB ref mutex", NULL, MTX_DEF);
sx_init(&usb_sym_lock, "USB sym mutex");
TAILQ_INIT(&usb_sym_head);
/* check the UGEN methods */
usb_fifo_check_methods(&usb_ugen_methods);
}
SYSINIT(usb_dev_init, SI_SUB_KLD, SI_ORDER_FIRST, usb_dev_init, NULL);
static void
usb_dev_init_post(void *arg)
{
/*
* Create /dev/usb - this is needed for usbconfig(8), which
* needs a well-known device name to access.
*/
usb_dev = make_dev(&usb_static_devsw, 0, UID_ROOT, GID_OPERATOR,
0644, USB_DEVICE_NAME);
if (usb_dev == NULL) {
DPRINTFN(0, "Could not create usb bus device\n");
}
}
SYSINIT(usb_dev_init_post, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, usb_dev_init_post, NULL);
static void
usb_dev_uninit(void *arg)
{
if (usb_dev != NULL) {
destroy_dev(usb_dev);
usb_dev = NULL;
}
mtx_destroy(&usb_ref_lock);
sx_destroy(&usb_sym_lock);
}
SYSUNINIT(usb_dev_uninit, SI_SUB_KICK_SCHEDULER, SI_ORDER_ANY, usb_dev_uninit, NULL);
static int
usb_ioctl_f_sub(struct usb_fifo *f, u_long cmd, void *addr,
struct thread *td)
{
int error = 0;
switch (cmd) {
case FIODTYPE:
*(int *)addr = 0; /* character device */
break;
case FIONBIO:
/* handled by upper FS layer */
break;
case FIOASYNC:
if (*(int *)addr) {
if (f->async_p != NULL) {
error = EBUSY;
break;
}
f->async_p = USB_TD_GET_PROC(td);
} else {
f->async_p = NULL;
}
break;
/* XXX this is not the most general solution */
case TIOCSPGRP:
if (f->async_p == NULL) {
error = EINVAL;
break;
}
if (*(int *)addr != USB_PROC_GET_GID(f->async_p)) {
error = EPERM;
break;
}
break;
default:
return (ENOIOCTL);
}
DPRINTFN(3, "cmd 0x%lx = %d\n", cmd, error);
return (error);
}
/*------------------------------------------------------------------------*
* usb_ioctl - cdev callback
*------------------------------------------------------------------------*/
static int
usb_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int fflag, struct thread* td)
{
struct usb_cdev_refdata refs;
struct usb_cdev_privdata* cpd;
struct usb_fifo *f;
int fflags;
int err;
DPRINTFN(2, "cmd=0x%lx\n", cmd);
err = devfs_get_cdevpriv((void **)&cpd);
if (err != 0)
return (err);
/*
* Performance optimisation: We try to check for IOCTLs that
* don't need the USB reference first. Then we grab the USB
* reference if we need it!
*/
err = usb_ref_device(cpd, &refs, 0 /* no uref */ );
if (err)
return (ENXIO);
fflags = cpd->fflags;
f = NULL; /* set default value */
err = ENOIOCTL; /* set default value */
if (fflags & FWRITE) {
f = refs.txfifo;
err = usb_ioctl_f_sub(f, cmd, addr, td);
}
if (fflags & FREAD) {
f = refs.rxfifo;
err = usb_ioctl_f_sub(f, cmd, addr, td);
}
KASSERT(f != NULL, ("fifo not found"));
if (err != ENOIOCTL)
goto done;
err = (f->methods->f_ioctl) (f, cmd, addr, fflags);
DPRINTFN(2, "f_ioctl cmd 0x%lx = %d\n", cmd, err);
if (err != ENOIOCTL)
goto done;
if (usb_usb_ref_device(cpd, &refs)) {
err = ENXIO;
goto done;
}
err = (f->methods->f_ioctl_post) (f, cmd, addr, fflags);
DPRINTFN(2, "f_ioctl_post cmd 0x%lx = %d\n", cmd, err);
if (err == ENOIOCTL)
err = ENOTTY;
if (err)
goto done;
/* Wait for re-enumeration, if any */
while (f->udev->re_enumerate_wait != 0) {
usb_unref_device(cpd, &refs);
usb_pause_mtx(NULL, hz / 128);
if (usb_ref_device(cpd, &refs, 1 /* need uref */)) {
err = ENXIO;
goto done;
}
}
done:
usb_unref_device(cpd, &refs);
return (err);
}
/* ARGSUSED */
static int
usb_poll(struct cdev* dev, int events, struct thread* td)
{
struct usb_cdev_refdata refs;
struct usb_cdev_privdata* cpd;
struct usb_fifo *f;
struct usb_mbuf *m;
int fflags, revents;
if (devfs_get_cdevpriv((void **)&cpd) != 0 ||
usb_ref_device(cpd, &refs, 0) != 0)
return (events &
(POLLHUP|POLLIN|POLLRDNORM|POLLOUT|POLLWRNORM));
fflags = cpd->fflags;
/* Figure out who needs service */
revents = 0;
if ((events & (POLLOUT | POLLWRNORM)) &&
(fflags & FWRITE)) {
f = refs.txfifo;
mtx_lock(f->priv_mtx);
if (!refs.is_usbfs) {
if (f->flag_iserror) {
/* we got an error */
m = (void *)1;
} else {
if (f->queue_data == NULL) {
/*
* start write transfer, if not
* already started
*/
(f->methods->f_start_write) (f);
}
/* check if any packets are available */
USB_IF_POLL(&f->free_q, m);
}
} else {
if (f->flag_iscomplete) {
m = (void *)1;
} else {
m = NULL;
}
}
if (m) {
revents |= events & (POLLOUT | POLLWRNORM);
} else {
f->flag_isselect = 1;
selrecord(td, &f->selinfo);
}
mtx_unlock(f->priv_mtx);
}
if ((events & (POLLIN | POLLRDNORM)) &&
(fflags & FREAD)) {
f = refs.rxfifo;
mtx_lock(f->priv_mtx);
if (!refs.is_usbfs) {
if (f->flag_iserror) {
/* we have an error */
m = (void *)1;
} else {
if (f->queue_data == NULL) {
/*
* start read transfer, if not
* already started
*/
(f->methods->f_start_read) (f);
}
/* check if any packets are available */
USB_IF_POLL(&f->used_q, m);
}
} else {
if (f->flag_iscomplete) {
m = (void *)1;
} else {
m = NULL;
}
}
if (m) {
revents |= events & (POLLIN | POLLRDNORM);
} else {
f->flag_isselect = 1;
selrecord(td, &f->selinfo);
if (!refs.is_usbfs) {
/* start reading data */
(f->methods->f_start_read) (f);
}
}
mtx_unlock(f->priv_mtx);
}
usb_unref_device(cpd, &refs);
return (revents);
}
static int
usb_read(struct cdev *dev, struct uio *uio, int ioflag)
{
struct usb_cdev_refdata refs;
struct usb_cdev_privdata* cpd;
struct usb_fifo *f;
struct usb_mbuf *m;
int fflags;
int resid;
int io_len;
int err;
uint8_t tr_data = 0;
err = devfs_get_cdevpriv((void **)&cpd);
if (err != 0)
return (err);
err = usb_ref_device(cpd, &refs, 0 /* no uref */ );
if (err) {
return (ENXIO);
}
fflags = cpd->fflags;
f = refs.rxfifo;
if (f == NULL) {
/* should not happen */
usb_unref_device(cpd, &refs);
return (EPERM);
}
resid = uio->uio_resid;
mtx_lock(f->priv_mtx);
/* check for permanent read error */
if (f->flag_iserror) {
err = EIO;
goto done;
}
/* check if USB-FS interface is active */
if (refs.is_usbfs) {
/*
* The queue is used for events that should be
* retrieved using the "USB_FS_COMPLETE" ioctl.
*/
err = EINVAL;
goto done;
}
while (uio->uio_resid > 0) {
USB_IF_DEQUEUE(&f->used_q, m);
if (m == NULL) {
/* start read transfer, if not already started */
(f->methods->f_start_read) (f);
if (ioflag & IO_NDELAY) {
if (tr_data) {
/* return length before error */
break;
}
err = EWOULDBLOCK;
break;
}
DPRINTF("sleeping\n");
err = usb_fifo_wait(f);
if (err) {
break;
}
continue;
}
if (f->methods->f_filter_read) {
/*
* Sometimes it is convenient to process data at the
* expense of a userland process instead of a kernel
* process.
*/
(f->methods->f_filter_read) (f, m);
}
tr_data = 1;
io_len = MIN(m->cur_data_len, uio->uio_resid);
DPRINTFN(2, "transfer %d bytes from %p\n",
io_len, m->cur_data_ptr);
err = usb_fifo_uiomove(f,
m->cur_data_ptr, io_len, uio);
m->cur_data_len -= io_len;
m->cur_data_ptr += io_len;
if (m->cur_data_len == 0) {
uint8_t last_packet;
last_packet = m->last_packet;
USB_IF_ENQUEUE(&f->free_q, m);
if (last_packet) {
/* keep framing */
break;
}
} else {
USB_IF_PREPEND(&f->used_q, m);
}
if (err) {
break;
}
}
done:
mtx_unlock(f->priv_mtx);
usb_unref_device(cpd, &refs);
return (err);
}
static int
usb_write(struct cdev *dev, struct uio *uio, int ioflag)
{
struct usb_cdev_refdata refs;
struct usb_cdev_privdata* cpd;
struct usb_fifo *f;
struct usb_mbuf *m;
uint8_t *pdata;
int fflags;
int resid;
int io_len;
int err;
uint8_t tr_data = 0;
DPRINTFN(2, "\n");
err = devfs_get_cdevpriv((void **)&cpd);
if (err != 0)
return (err);
err = usb_ref_device(cpd, &refs, 0 /* no uref */ );
if (err) {
return (ENXIO);
}
fflags = cpd->fflags;
f = refs.txfifo;
if (f == NULL) {
/* should not happen */
usb_unref_device(cpd, &refs);
return (EPERM);
}
resid = uio->uio_resid;
mtx_lock(f->priv_mtx);
/* check for permanent write error */
if (f->flag_iserror) {
err = EIO;
goto done;
}
/* check if USB-FS interface is active */
if (refs.is_usbfs) {
/*
* The queue is used for events that should be
* retrieved using the "USB_FS_COMPLETE" ioctl.
*/
err = EINVAL;
goto done;
}
if (f->queue_data == NULL) {
/* start write transfer, if not already started */
(f->methods->f_start_write) (f);
}
/* we allow writing zero length data */
do {
USB_IF_DEQUEUE(&f->free_q, m);
if (m == NULL) {
if (ioflag & IO_NDELAY) {
if (tr_data) {
/* return length before error */
break;
}
err = EWOULDBLOCK;
break;
}
DPRINTF("sleeping\n");
err = usb_fifo_wait(f);
if (err) {
break;
}
continue;
}
tr_data = 1;
if (f->flag_have_fragment == 0) {
USB_MBUF_RESET(m);
io_len = m->cur_data_len;
pdata = m->cur_data_ptr;
if (io_len > uio->uio_resid)
io_len = uio->uio_resid;
m->cur_data_len = io_len;
} else {
io_len = m->max_data_len - m->cur_data_len;
pdata = m->cur_data_ptr + m->cur_data_len;
if (io_len > uio->uio_resid)
io_len = uio->uio_resid;
m->cur_data_len += io_len;
}
DPRINTFN(2, "transfer %d bytes to %p\n",
io_len, pdata);
err = usb_fifo_uiomove(f, pdata, io_len, uio);
if (err) {
f->flag_have_fragment = 0;
USB_IF_ENQUEUE(&f->free_q, m);
break;
}
/* check if the buffer is ready to be transmitted */
if ((f->flag_write_defrag == 0) ||
(m->cur_data_len == m->max_data_len)) {
f->flag_have_fragment = 0;
/*
* Check for write filter:
*
* Sometimes it is convenient to process data
* at the expense of a userland process
* instead of a kernel process.
*/
if (f->methods->f_filter_write) {
(f->methods->f_filter_write) (f, m);
}
/* Put USB mbuf in the used queue */
USB_IF_ENQUEUE(&f->used_q, m);
/* Start writing data, if not already started */
(f->methods->f_start_write) (f);
} else {
/* Wait for more data or close */
f->flag_have_fragment = 1;
USB_IF_PREPEND(&f->free_q, m);
}
} while (uio->uio_resid > 0);
done:
mtx_unlock(f->priv_mtx);
usb_unref_device(cpd, &refs);
return (err);
}
int
usb_static_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag,
struct thread *td)
{
union {
struct usb_read_dir *urd;
void* data;
} u;
int err;
u.data = data;
switch (cmd) {
case USB_READ_DIR:
err = usb_read_symlink(u.urd->urd_data,
u.urd->urd_startentry, u.urd->urd_maxlen);
break;
case USB_DEV_QUIRK_GET:
case USB_QUIRK_NAME_GET:
case USB_DEV_QUIRK_ADD:
case USB_DEV_QUIRK_REMOVE:
err = usb_quirk_ioctl_p(cmd, data, fflag, td);
break;
case USB_GET_TEMPLATE:
*(int *)data = usb_template;
err = 0;
break;
case USB_SET_TEMPLATE:
err = priv_check(curthread, PRIV_DRIVER);
if (err)
break;
usb_template = *(int *)data;
break;
default:
err = ENOTTY;
break;
}
return (err);
}
static int
usb_fifo_uiomove(struct usb_fifo *f, void *cp,
int n, struct uio *uio)
{
int error;
mtx_unlock(f->priv_mtx);
/*
* "uiomove()" can sleep so one needs to make a wrapper,
* exiting the mutex and checking things:
*/
error = uiomove(cp, n, uio);
mtx_lock(f->priv_mtx);
return (error);
}
int
usb_fifo_wait(struct usb_fifo *f)
{
int err;
mtx_assert(f->priv_mtx, MA_OWNED);
if (f->flag_iserror) {
/* we are gone */
return (EIO);
}
f->flag_sleeping = 1;
err = cv_wait_sig(&f->cv_io, f->priv_mtx);
if (f->flag_iserror) {
/* we are gone */
err = EIO;
}
return (err);
}
void
usb_fifo_signal(struct usb_fifo *f)
{
if (f->flag_sleeping) {
f->flag_sleeping = 0;
cv_broadcast(&f->cv_io);
}
}
void
usb_fifo_wakeup(struct usb_fifo *f)
{
usb_fifo_signal(f);
if (f->flag_isselect) {
selwakeup(&f->selinfo);
f->flag_isselect = 0;
}
if (f->async_p != NULL) {
PROC_LOCK(f->async_p);
- psignal(f->async_p, SIGIO);
+ kern_psignal(f->async_p, SIGIO);
PROC_UNLOCK(f->async_p);
}
}
static int
usb_fifo_dummy_open(struct usb_fifo *fifo, int fflags)
{
return (0);
}
static void
usb_fifo_dummy_close(struct usb_fifo *fifo, int fflags)
{
return;
}
static int
usb_fifo_dummy_ioctl(struct usb_fifo *fifo, u_long cmd, void *addr, int fflags)
{
return (ENOIOCTL);
}
static void
usb_fifo_dummy_cmd(struct usb_fifo *fifo)
{
fifo->flag_flushing = 0; /* not flushing */
}
static void
usb_fifo_check_methods(struct usb_fifo_methods *pm)
{
/* check that all callback functions are OK */
if (pm->f_open == NULL)
pm->f_open = &usb_fifo_dummy_open;
if (pm->f_close == NULL)
pm->f_close = &usb_fifo_dummy_close;
if (pm->f_ioctl == NULL)
pm->f_ioctl = &usb_fifo_dummy_ioctl;
if (pm->f_ioctl_post == NULL)
pm->f_ioctl_post = &usb_fifo_dummy_ioctl;
if (pm->f_start_read == NULL)
pm->f_start_read = &usb_fifo_dummy_cmd;
if (pm->f_stop_read == NULL)
pm->f_stop_read = &usb_fifo_dummy_cmd;
if (pm->f_start_write == NULL)
pm->f_start_write = &usb_fifo_dummy_cmd;
if (pm->f_stop_write == NULL)
pm->f_stop_write = &usb_fifo_dummy_cmd;
}
/*------------------------------------------------------------------------*
* usb_fifo_attach
*
* The following function will create a duplex FIFO.
*
* Return values:
* 0: Success.
* Else: Failure.
*------------------------------------------------------------------------*/
int
usb_fifo_attach(struct usb_device *udev, void *priv_sc,
struct mtx *priv_mtx, struct usb_fifo_methods *pm,
struct usb_fifo_sc *f_sc, uint16_t unit, uint16_t subunit,
uint8_t iface_index, uid_t uid, gid_t gid, int mode)
{
struct usb_fifo *f_tx;
struct usb_fifo *f_rx;
char devname[32];
uint8_t n;
f_sc->fp[USB_FIFO_TX] = NULL;
f_sc->fp[USB_FIFO_RX] = NULL;
if (pm == NULL)
return (EINVAL);
/* check the methods */
usb_fifo_check_methods(pm);
if (priv_mtx == NULL)
priv_mtx = &Giant;
/* search for a free FIFO slot */
for (n = 0;; n += 2) {
if (n == USB_FIFO_MAX) {
/* end of FIFOs reached */
return (ENOMEM);
}
/* Check for TX FIFO */
if (udev->fifo[n + USB_FIFO_TX] != NULL) {
continue;
}
/* Check for RX FIFO */
if (udev->fifo[n + USB_FIFO_RX] != NULL) {
continue;
}
break;
}
f_tx = usb_fifo_alloc();
f_rx = usb_fifo_alloc();
if ((f_tx == NULL) || (f_rx == NULL)) {
usb_fifo_free(f_tx);
usb_fifo_free(f_rx);
return (ENOMEM);
}
/* initialise FIFO structures */
f_tx->fifo_index = n + USB_FIFO_TX;
f_tx->dev_ep_index = -1;
f_tx->priv_mtx = priv_mtx;
f_tx->priv_sc0 = priv_sc;
f_tx->methods = pm;
f_tx->iface_index = iface_index;
f_tx->udev = udev;
f_rx->fifo_index = n + USB_FIFO_RX;
f_rx->dev_ep_index = -1;
f_rx->priv_mtx = priv_mtx;
f_rx->priv_sc0 = priv_sc;
f_rx->methods = pm;
f_rx->iface_index = iface_index;
f_rx->udev = udev;
f_sc->fp[USB_FIFO_TX] = f_tx;
f_sc->fp[USB_FIFO_RX] = f_rx;
mtx_lock(&usb_ref_lock);
udev->fifo[f_tx->fifo_index] = f_tx;
udev->fifo[f_rx->fifo_index] = f_rx;
mtx_unlock(&usb_ref_lock);
for (n = 0; n != 4; n++) {
if (pm->basename[n] == NULL) {
continue;
}
if (subunit == 0xFFFF) {
if (snprintf(devname, sizeof(devname),
"%s%u%s", pm->basename[n],
unit, pm->postfix[n] ?
pm->postfix[n] : "")) {
/* ignore */
}
} else {
if (snprintf(devname, sizeof(devname),
"%s%u.%u%s", pm->basename[n],
unit, subunit, pm->postfix[n] ?
pm->postfix[n] : "")) {
/* ignore */
}
}
/*
* Distribute the symbolic links into two FIFO structures:
*/
if (n & 1) {
f_rx->symlink[n / 2] =
usb_alloc_symlink(devname);
} else {
f_tx->symlink[n / 2] =
usb_alloc_symlink(devname);
}
/* Create the device */
f_sc->dev = usb_make_dev(udev, devname, -1,
f_tx->fifo_index & f_rx->fifo_index,
FREAD|FWRITE, uid, gid, mode);
}
DPRINTFN(2, "attached %p/%p\n", f_tx, f_rx);
return (0);
}
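/*
* A hypothetical usage sketch (driver-side names are placeholders, not
* taken from this file): a driver normally pairs usb_fifo_attach() with
* usb_fifo_alloc_buffer() and a matching usb_fifo_detach():
*
*	static struct usb_fifo_methods my_methods = { ... };
*	struct usb_fifo_sc my_fifo_sc;
*
*	error = usb_fifo_attach(udev, sc, &sc_mtx, &my_methods,
*	    &my_fifo_sc, unit, 0xFFFF, iface_index,
*	    UID_ROOT, GID_OPERATOR, 0644);
*	if (error == 0)
*		error = usb_fifo_alloc_buffer(my_fifo_sc.fp[USB_FIFO_RX],
*		    bufsize, nbuf);
*	...
*	usb_fifo_detach(&my_fifo_sc);
*
* A subunit of 0xFFFF selects the "unit only" device name format used
* above.
*/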
/*------------------------------------------------------------------------*
* usb_fifo_alloc_buffer
*
* Return values:
* 0: Success
* Else failure
*------------------------------------------------------------------------*/
int
usb_fifo_alloc_buffer(struct usb_fifo *f, usb_size_t bufsize,
uint16_t nbuf)
{
usb_fifo_free_buffer(f);
/* allocate queue buffers */
f->free_q.ifq_maxlen = nbuf;
f->used_q.ifq_maxlen = nbuf;
f->queue_data = usb_alloc_mbufs(
M_USBDEV, &f->free_q, bufsize, nbuf);
if ((f->queue_data == NULL) && bufsize && nbuf) {
return (ENOMEM);
}
return (0); /* success */
}
/*------------------------------------------------------------------------*
* usb_fifo_free_buffer
*
* This function will free the buffers associated with a FIFO. This
* function can be called multiple times in a row.
*------------------------------------------------------------------------*/
void
usb_fifo_free_buffer(struct usb_fifo *f)
{
if (f->queue_data) {
/* free old buffer */
free(f->queue_data, M_USBDEV);
f->queue_data = NULL;
}
/* reset queues */
bzero(&f->free_q, sizeof(f->free_q));
bzero(&f->used_q, sizeof(f->used_q));
}
void
usb_fifo_detach(struct usb_fifo_sc *f_sc)
{
if (f_sc == NULL) {
return;
}
usb_fifo_free(f_sc->fp[USB_FIFO_TX]);
usb_fifo_free(f_sc->fp[USB_FIFO_RX]);
f_sc->fp[USB_FIFO_TX] = NULL;
f_sc->fp[USB_FIFO_RX] = NULL;
usb_destroy_dev(f_sc->dev);
f_sc->dev = NULL;
DPRINTFN(2, "detached %p\n", f_sc);
}
usb_size_t
usb_fifo_put_bytes_max(struct usb_fifo *f)
{
struct usb_mbuf *m;
usb_size_t len;
USB_IF_POLL(&f->free_q, m);
if (m) {
len = m->max_data_len;
} else {
len = 0;
}
return (len);
}
/*------------------------------------------------------------------------*
* usb_fifo_put_data
*
* what:
* 0 - normal operation
* 1 - set last packet flag to enforce framing
*------------------------------------------------------------------------*/
void
usb_fifo_put_data(struct usb_fifo *f, struct usb_page_cache *pc,
usb_frlength_t offset, usb_frlength_t len, uint8_t what)
{
struct usb_mbuf *m;
usb_frlength_t io_len;
while (len || (what == 1)) {
USB_IF_DEQUEUE(&f->free_q, m);
if (m) {
USB_MBUF_RESET(m);
io_len = MIN(len, m->cur_data_len);
usbd_copy_out(pc, offset, m->cur_data_ptr, io_len);
m->cur_data_len = io_len;
offset += io_len;
len -= io_len;
if ((len == 0) && (what == 1)) {
m->last_packet = 1;
}
USB_IF_ENQUEUE(&f->used_q, m);
usb_fifo_wakeup(f);
if ((len == 0) || (what == 1)) {
break;
}
} else {
break;
}
}
}
void
usb_fifo_put_data_linear(struct usb_fifo *f, void *ptr,
usb_size_t len, uint8_t what)
{
struct usb_mbuf *m;
usb_size_t io_len;
while (len || (what == 1)) {
USB_IF_DEQUEUE(&f->free_q, m);
if (m) {
USB_MBUF_RESET(m);
io_len = MIN(len, m->cur_data_len);
bcopy(ptr, m->cur_data_ptr, io_len);
m->cur_data_len = io_len;
ptr = USB_ADD_BYTES(ptr, io_len);
len -= io_len;
if ((len == 0) && (what == 1)) {
m->last_packet = 1;
}
USB_IF_ENQUEUE(&f->used_q, m);
usb_fifo_wakeup(f);
if ((len == 0) || (what == 1)) {
break;
}
} else {
break;
}
}
}
uint8_t
usb_fifo_put_data_buffer(struct usb_fifo *f, void *ptr, usb_size_t len)
{
struct usb_mbuf *m;
USB_IF_DEQUEUE(&f->free_q, m);
if (m) {
m->cur_data_len = len;
m->cur_data_ptr = ptr;
USB_IF_ENQUEUE(&f->used_q, m);
usb_fifo_wakeup(f);
return (1);
}
return (0);
}
void
usb_fifo_put_data_error(struct usb_fifo *f)
{
f->flag_iserror = 1;
usb_fifo_wakeup(f);
}
/*------------------------------------------------------------------------*
* usb_fifo_get_data
*
* what:
* 0 - normal operation
* 1 - only get one "usb_mbuf"
*
* returns:
* 0 - no more data
* 1 - data in buffer
*------------------------------------------------------------------------*/
uint8_t
usb_fifo_get_data(struct usb_fifo *f, struct usb_page_cache *pc,
usb_frlength_t offset, usb_frlength_t len, usb_frlength_t *actlen,
uint8_t what)
{
struct usb_mbuf *m;
usb_frlength_t io_len;
uint8_t tr_data = 0;
actlen[0] = 0;
while (1) {
USB_IF_DEQUEUE(&f->used_q, m);
if (m) {
tr_data = 1;
io_len = MIN(len, m->cur_data_len);
usbd_copy_in(pc, offset, m->cur_data_ptr, io_len);
len -= io_len;
offset += io_len;
actlen[0] += io_len;
m->cur_data_ptr += io_len;
m->cur_data_len -= io_len;
if ((m->cur_data_len == 0) || (what == 1)) {
USB_IF_ENQUEUE(&f->free_q, m);
usb_fifo_wakeup(f);
if (what == 1) {
break;
}
} else {
USB_IF_PREPEND(&f->used_q, m);
}
} else {
if (tr_data) {
/* wait for data to be written out */
break;
}
if (f->flag_flushing) {
/* check if we should send a short packet */
if (f->flag_short != 0) {
f->flag_short = 0;
tr_data = 1;
break;
}
/* flushing complete */
f->flag_flushing = 0;
usb_fifo_wakeup(f);
}
break;
}
if (len == 0) {
break;
}
}
return (tr_data);
}
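/*
* Sketch of the "what" argument used by the queueing functions above:
* usb_fifo_put_data(f, pc, off, len, 1) marks the final usb_mbuf with
* last_packet so that a reader stops at the packet boundary, while
* usb_fifo_get_data(f, pc, off, len, &actlen, 1) consumes at most one
* usb_mbuf per call. Passing 0 gives plain streaming behaviour.
*/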
uint8_t
usb_fifo_get_data_linear(struct usb_fifo *f, void *ptr,
usb_size_t len, usb_size_t *actlen, uint8_t what)
{
struct usb_mbuf *m;
usb_size_t io_len;
uint8_t tr_data = 0;
actlen[0] = 0;
while (1) {
USB_IF_DEQUEUE(&f->used_q, m);
if (m) {
tr_data = 1;
io_len = MIN(len, m->cur_data_len);
bcopy(m->cur_data_ptr, ptr, io_len);
len -= io_len;
ptr = USB_ADD_BYTES(ptr, io_len);
actlen[0] += io_len;
m->cur_data_ptr += io_len;
m->cur_data_len -= io_len;
if ((m->cur_data_len == 0) || (what == 1)) {
USB_IF_ENQUEUE(&f->free_q, m);
usb_fifo_wakeup(f);
if (what == 1) {
break;
}
} else {
USB_IF_PREPEND(&f->used_q, m);
}
} else {
if (tr_data) {
/* wait for data to be written out */
break;
}
if (f->flag_flushing) {
/* check if we should send a short packet */
if (f->flag_short != 0) {
f->flag_short = 0;
tr_data = 1;
break;
}
/* flushing complete */
f->flag_flushing = 0;
usb_fifo_wakeup(f);
}
break;
}
if (len == 0) {
break;
}
}
return (tr_data);
}
uint8_t
usb_fifo_get_data_buffer(struct usb_fifo *f, void **pptr, usb_size_t *plen)
{
struct usb_mbuf *m;
USB_IF_POLL(&f->used_q, m);
if (m) {
*plen = m->cur_data_len;
*pptr = m->cur_data_ptr;
return (1);
}
return (0);
}
void
usb_fifo_get_data_error(struct usb_fifo *f)
{
f->flag_iserror = 1;
usb_fifo_wakeup(f);
}
/*------------------------------------------------------------------------*
* usb_alloc_symlink
*
* Return values:
* NULL: Failure
* Else: Pointer to symlink entry
*------------------------------------------------------------------------*/
struct usb_symlink *
usb_alloc_symlink(const char *target)
{
struct usb_symlink *ps;
ps = malloc(sizeof(*ps), M_USBDEV, M_WAITOK);
if (ps == NULL) {
return (ps);
}
/* XXX no longer needed */
strlcpy(ps->src_path, target, sizeof(ps->src_path));
ps->src_len = strlen(ps->src_path);
strlcpy(ps->dst_path, target, sizeof(ps->dst_path));
ps->dst_len = strlen(ps->dst_path);
sx_xlock(&usb_sym_lock);
TAILQ_INSERT_TAIL(&usb_sym_head, ps, sym_entry);
sx_unlock(&usb_sym_lock);
return (ps);
}
/*------------------------------------------------------------------------*
* usb_free_symlink
*------------------------------------------------------------------------*/
void
usb_free_symlink(struct usb_symlink *ps)
{
if (ps == NULL) {
return;
}
sx_xlock(&usb_sym_lock);
TAILQ_REMOVE(&usb_sym_head, ps, sym_entry);
sx_unlock(&usb_sym_lock);
free(ps, M_USBDEV);
}
/*------------------------------------------------------------------------*
* usb_read_symlink
*
* Return value:
* 0: Success
* Else: Failure
*------------------------------------------------------------------------*/
int
usb_read_symlink(uint8_t *user_ptr, uint32_t startentry, uint32_t user_len)
{
struct usb_symlink *ps;
uint32_t temp;
uint32_t delta = 0;
uint8_t len;
int error = 0;
sx_xlock(&usb_sym_lock);
TAILQ_FOREACH(ps, &usb_sym_head, sym_entry) {
/*
* Compute total length of source and destination symlink
* strings plus one length byte and two NUL bytes:
*/
temp = ps->src_len + ps->dst_len + 3;
if (temp > 255) {
/*
* Skip entry because this length cannot fit
* into one byte:
*/
continue;
}
if (startentry != 0) {
/* decrement read offset */
startentry--;
continue;
}
if (temp > user_len) {
/* out of buffer space */
break;
}
len = temp;
/* copy out total length */
error = copyout(&len,
USB_ADD_BYTES(user_ptr, delta), 1);
if (error) {
break;
}
delta += 1;
/* copy out source string */
error = copyout(ps->src_path,
USB_ADD_BYTES(user_ptr, delta), ps->src_len);
if (error) {
break;
}
len = 0;
delta += ps->src_len;
error = copyout(&len,
USB_ADD_BYTES(user_ptr, delta), 1);
if (error) {
break;
}
delta += 1;
/* copy out destination string */
error = copyout(ps->dst_path,
USB_ADD_BYTES(user_ptr, delta), ps->dst_len);
if (error) {
break;
}
len = 0;
delta += ps->dst_len;
error = copyout(&len,
USB_ADD_BYTES(user_ptr, delta), 1);
if (error) {
break;
}
delta += 1;
user_len -= temp;
}
/* a zero length entry indicates the end */
if ((user_len != 0) && (error == 0)) {
len = 0;
error = copyout(&len,
USB_ADD_BYTES(user_ptr, delta), 1);
}
sx_unlock(&usb_sym_lock);
return (error);
}
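/*
* Layout of the buffer filled in above, per entry (a worked example;
* the strings shown are hypothetical):
*
*	[len][src string]['\0'][dst string]['\0']
*
* where "len" is a single byte counting the whole entry, i.e.
* src_len + dst_len + 3. For src "ugen0.2" (7 bytes) and dst
* "usb/0.2.0" (9 bytes) the length byte would be 19. A single zero
* length byte terminates the list when buffer space remains.
*/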
void
usb_fifo_set_close_zlp(struct usb_fifo *f, uint8_t onoff)
{
if (f == NULL)
return;
/* send a Zero Length Packet, ZLP, before close */
f->flag_short = onoff;
}
void
usb_fifo_set_write_defrag(struct usb_fifo *f, uint8_t onoff)
{
if (f == NULL)
return;
/* defrag written data */
f->flag_write_defrag = onoff;
/* reset defrag state */
f->flag_have_fragment = 0;
}
void *
usb_fifo_softc(struct usb_fifo *f)
{
return (f->priv_sc0);
}
#endif /* USB_HAVE_UGEN */
Index: head/sys/fs/nfsserver/nfs_nfsdport.c
===================================================================
--- head/sys/fs/nfsserver/nfs_nfsdport.c (revision 225616)
+++ head/sys/fs/nfsserver/nfs_nfsdport.c (revision 225617)
@@ -1,3331 +1,3331 @@
/*-
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Rick Macklem at The University of Guelph.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/capability.h>
/*
* Functions that perform the vfs operations required by the routines in
* nfsd_serv.c. It is hoped that this change will make the server more
* portable.
*/
#include <fs/nfs/nfsport.h>
#include <sys/hash.h>
#include <sys/sysctl.h>
#include <nlm/nlm_prot.h>
#include <nlm/nlm.h>
FEATURE(nfsd, "NFSv4 server");
extern u_int32_t newnfs_true, newnfs_false, newnfs_xdrneg1;
extern int nfsrv_useacl;
extern int newnfs_numnfsd;
extern struct mount nfsv4root_mnt;
extern struct nfsrv_stablefirst nfsrv_stablefirst;
extern void (*nfsd_call_servertimer)(void);
extern SVCPOOL *nfsrvd_pool;
struct vfsoptlist nfsv4root_opt, nfsv4root_newopt;
NFSDLOCKMUTEX;
struct mtx nfs_cache_mutex;
struct mtx nfs_v4root_mutex;
struct nfsrvfh nfs_rootfh, nfs_pubfh;
int nfs_pubfhset = 0, nfs_rootfhset = 0;
struct proc *nfsd_master_proc = NULL;
static pid_t nfsd_master_pid = (pid_t)-1;
static char nfsd_master_comm[MAXCOMLEN + 1];
static struct timeval nfsd_master_start;
static uint32_t nfsv4_sysid = 0;
static int nfssvc_srvcall(struct thread *, struct nfssvc_args *,
struct ucred *);
int nfsrv_enable_crossmntpt = 1;
static int nfs_commit_blks;
static int nfs_commit_miss;
extern int nfsrv_issuedelegs;
extern int nfsrv_dolocallocks;
SYSCTL_NODE(_vfs, OID_AUTO, nfsd, CTLFLAG_RW, 0, "New NFS server");
SYSCTL_INT(_vfs_nfsd, OID_AUTO, mirrormnt, CTLFLAG_RW,
&nfsrv_enable_crossmntpt, 0, "Enable nfsd to cross mount points");
SYSCTL_INT(_vfs_nfsd, OID_AUTO, commit_blks, CTLFLAG_RW, &nfs_commit_blks,
0, "");
SYSCTL_INT(_vfs_nfsd, OID_AUTO, commit_miss, CTLFLAG_RW, &nfs_commit_miss,
0, "");
SYSCTL_INT(_vfs_nfsd, OID_AUTO, issue_delegations, CTLFLAG_RW,
&nfsrv_issuedelegs, 0, "Enable nfsd to issue delegations");
SYSCTL_INT(_vfs_nfsd, OID_AUTO, enable_locallocks, CTLFLAG_RW,
&nfsrv_dolocallocks, 0, "Enable nfsd to acquire local locks on files");
#define NUM_HEURISTIC 1017
#define NHUSE_INIT 64
#define NHUSE_INC 16
#define NHUSE_MAX 2048
static struct nfsheur {
struct vnode *nh_vp; /* vp to match (unreferenced pointer) */
off_t nh_nextr; /* next offset for sequential detection */
int nh_use; /* use count for selection */
int nh_seqcount; /* heuristic */
} nfsheur[NUM_HEURISTIC];
/*
* Get attributes into nfsvattr structure.
*/
int
nfsvno_getattr(struct vnode *vp, struct nfsvattr *nvap, struct ucred *cred,
struct thread *p, int vpislocked)
{
int error, lockedit = 0;
if (vpislocked == 0) {
/*
* When vpislocked == 0, the vnode is either exclusively
* locked by this thread or not locked by this thread.
* As such, shared lock it, if not exclusively locked.
*/
if (NFSVOPISLOCKED(vp) != LK_EXCLUSIVE) {
lockedit = 1;
NFSVOPLOCK(vp, LK_SHARED | LK_RETRY);
}
}
error = VOP_GETATTR(vp, &nvap->na_vattr, cred);
if (lockedit != 0)
NFSVOPUNLOCK(vp, 0);
NFSEXITCODE(error);
return (error);
}
/*
* Get a file handle for a vnode.
*/
int
nfsvno_getfh(struct vnode *vp, fhandle_t *fhp, struct thread *p)
{
int error;
NFSBZERO((caddr_t)fhp, sizeof(fhandle_t));
fhp->fh_fsid = vp->v_mount->mnt_stat.f_fsid;
error = VOP_VPTOFH(vp, &fhp->fh_fid);
NFSEXITCODE(error);
return (error);
}
/*
* Perform access checking for vnodes obtained from file handles that would
* refer to files already opened by a Unix client. You cannot just use
* vn_writechk() and VOP_ACCESSX() for two reasons.
* 1 - You must check for exported rdonly as well as MNT_RDONLY for the write
* case.
* 2 - The owner is to be given access irrespective of mode bits for some
* operations, so that processes that chmod after opening a file don't
* break.
*/
int
nfsvno_accchk(struct vnode *vp, accmode_t accmode, struct ucred *cred,
struct nfsexstuff *exp, struct thread *p, int override, int vpislocked,
u_int32_t *supportedtypep)
{
struct vattr vattr;
int error = 0, getret = 0;
if (vpislocked == 0) {
if (NFSVOPLOCK(vp, LK_SHARED) != 0) {
error = EPERM;
goto out;
}
}
if (accmode & VWRITE) {
/* Just vn_writechk() changed to check rdonly */
/*
* Disallow write attempts on read-only file systems;
* unless the file is a socket or a block or character
* device resident on the file system.
*/
if (NFSVNO_EXRDONLY(exp) ||
(vp->v_mount->mnt_flag & MNT_RDONLY)) {
switch (vp->v_type) {
case VREG:
case VDIR:
case VLNK:
error = EROFS;
default:
break;
}
}
/*
* If there's shared text associated with
* the inode, try to free it up once. If
* we fail, we can't allow writing.
*/
if ((vp->v_vflag & VV_TEXT) != 0 && error == 0)
error = ETXTBSY;
}
if (error != 0) {
if (vpislocked == 0)
NFSVOPUNLOCK(vp, 0);
goto out;
}
/*
* Should the override still be applied when ACLs are enabled?
*/
error = VOP_ACCESSX(vp, accmode, cred, p);
if (error != 0 && (accmode & (VDELETE | VDELETE_CHILD))) {
/*
* Try again with VEXPLICIT_DENY, to see if the test for
* deletion is supported.
*/
error = VOP_ACCESSX(vp, accmode | VEXPLICIT_DENY, cred, p);
if (error == 0) {
if (vp->v_type == VDIR) {
accmode &= ~(VDELETE | VDELETE_CHILD);
accmode |= VWRITE;
error = VOP_ACCESSX(vp, accmode, cred, p);
} else if (supportedtypep != NULL) {
*supportedtypep &= ~NFSACCESS_DELETE;
}
}
}
/*
* Allow certain operations for the owner (reads and writes
* on files that are already open).
*/
if (override != NFSACCCHK_NOOVERRIDE &&
(error == EPERM || error == EACCES)) {
if (cred->cr_uid == 0 && (override & NFSACCCHK_ALLOWROOT))
error = 0;
else if (override & NFSACCCHK_ALLOWOWNER) {
getret = VOP_GETATTR(vp, &vattr, cred);
if (getret == 0 && cred->cr_uid == vattr.va_uid)
error = 0;
}
}
if (vpislocked == 0)
NFSVOPUNLOCK(vp, 0);
out:
NFSEXITCODE(error);
return (error);
}
/*
* Set attribute(s) vnop.
*/
int
nfsvno_setattr(struct vnode *vp, struct nfsvattr *nvap, struct ucred *cred,
struct thread *p, struct nfsexstuff *exp)
{
int error;
error = VOP_SETATTR(vp, &nvap->na_vattr, cred);
NFSEXITCODE(error);
return (error);
}
/*
* Set up nameidata for a lookup() call and do it
* For the cases where we are crossing mount points
* (looking up the public fh path or the v4 root path when
* not using a pseudo-root fs), set/release the Giant lock,
* as required.
*/
int
nfsvno_namei(struct nfsrv_descript *nd, struct nameidata *ndp,
struct vnode *dp, int islocked, struct nfsexstuff *exp, struct thread *p,
struct vnode **retdirp)
{
struct componentname *cnp = &ndp->ni_cnd;
int i;
struct iovec aiov;
struct uio auio;
int lockleaf = (cnp->cn_flags & LOCKLEAF) != 0, linklen;
int error = 0, crossmnt;
char *cp;
*retdirp = NULL;
cnp->cn_nameptr = cnp->cn_pnbuf;
ndp->ni_strictrelative = 0;
/*
* Extract and set starting directory.
*/
if (dp->v_type != VDIR) {
if (islocked)
vput(dp);
else
vrele(dp);
nfsvno_relpathbuf(ndp);
error = ENOTDIR;
goto out1;
}
if (islocked)
NFSVOPUNLOCK(dp, 0);
VREF(dp);
*retdirp = dp;
if (NFSVNO_EXRDONLY(exp))
cnp->cn_flags |= RDONLY;
ndp->ni_segflg = UIO_SYSSPACE;
crossmnt = 1;
if (nd->nd_flag & ND_PUBLOOKUP) {
ndp->ni_loopcnt = 0;
if (cnp->cn_pnbuf[0] == '/') {
vrele(dp);
/*
* Check for degenerate pathnames here, since lookup()
* panics on them.
*/
for (i = 1; i < ndp->ni_pathlen; i++)
if (cnp->cn_pnbuf[i] != '/')
break;
if (i == ndp->ni_pathlen) {
error = NFSERR_ACCES;
goto out;
}
dp = rootvnode;
VREF(dp);
}
} else if ((nfsrv_enable_crossmntpt == 0 && NFSVNO_EXPORTED(exp)) ||
(nd->nd_flag & ND_NFSV4) == 0) {
/*
* Only cross mount points for NFSv4 when doing a
* mount while traversing the file system above
* the mount point, unless nfsrv_enable_crossmntpt is set.
*/
cnp->cn_flags |= NOCROSSMOUNT;
crossmnt = 0;
}
/*
* Initialize for scan, set ni_startdir and bump ref on dp again
* because lookup() will dereference ni_startdir.
*/
cnp->cn_thread = p;
ndp->ni_startdir = dp;
ndp->ni_rootdir = rootvnode;
if (!lockleaf)
cnp->cn_flags |= LOCKLEAF;
for (;;) {
cnp->cn_nameptr = cnp->cn_pnbuf;
/*
* Call lookup() to do the real work. If an error occurs,
* ndp->ni_vp and ni_dvp are left uninitialized or NULL and
* we do not have to dereference anything before returning.
* In either case ni_startdir will be dereferenced and NULLed
* out.
*/
error = lookup(ndp);
if (error)
break;
/*
* Check for encountering a symbolic link. Trivial
* termination occurs if no symlink encountered.
*/
if ((cnp->cn_flags & ISSYMLINK) == 0) {
if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0)
nfsvno_relpathbuf(ndp);
if (ndp->ni_vp && !lockleaf)
NFSVOPUNLOCK(ndp->ni_vp, 0);
break;
}
/*
* Validate symlink
*/
if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1)
NFSVOPUNLOCK(ndp->ni_dvp, 0);
if (!(nd->nd_flag & ND_PUBLOOKUP)) {
error = EINVAL;
goto badlink2;
}
if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
error = ELOOP;
goto badlink2;
}
if (ndp->ni_pathlen > 1)
cp = uma_zalloc(namei_zone, M_WAITOK);
else
cp = cnp->cn_pnbuf;
aiov.iov_base = cp;
aiov.iov_len = MAXPATHLEN;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_offset = 0;
auio.uio_rw = UIO_READ;
auio.uio_segflg = UIO_SYSSPACE;
auio.uio_td = NULL;
auio.uio_resid = MAXPATHLEN;
error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred);
if (error) {
badlink1:
if (ndp->ni_pathlen > 1)
uma_zfree(namei_zone, cp);
badlink2:
vrele(ndp->ni_dvp);
vput(ndp->ni_vp);
break;
}
linklen = MAXPATHLEN - auio.uio_resid;
if (linklen == 0) {
error = ENOENT;
goto badlink1;
}
if (linklen + ndp->ni_pathlen >= MAXPATHLEN) {
error = ENAMETOOLONG;
goto badlink1;
}
/*
* Adjust or replace path
*/
if (ndp->ni_pathlen > 1) {
NFSBCOPY(ndp->ni_next, cp + linklen, ndp->ni_pathlen);
uma_zfree(namei_zone, cnp->cn_pnbuf);
cnp->cn_pnbuf = cp;
} else
cnp->cn_pnbuf[linklen] = '\0';
ndp->ni_pathlen += linklen;
/*
* Cleanup refs for next loop and check if root directory
* should replace current directory. Normally ni_dvp
* becomes the new base directory and is cleaned up when
* we loop. Explicitly null pointers after invalidation
* to clarify operation.
*/
vput(ndp->ni_vp);
ndp->ni_vp = NULL;
if (cnp->cn_pnbuf[0] == '/') {
vrele(ndp->ni_dvp);
ndp->ni_dvp = ndp->ni_rootdir;
VREF(ndp->ni_dvp);
}
ndp->ni_startdir = ndp->ni_dvp;
ndp->ni_dvp = NULL;
}
if (!lockleaf)
cnp->cn_flags &= ~LOCKLEAF;
out:
if (error) {
uma_zfree(namei_zone, cnp->cn_pnbuf);
ndp->ni_vp = NULL;
ndp->ni_dvp = NULL;
ndp->ni_startdir = NULL;
cnp->cn_flags &= ~HASBUF;
} else if ((ndp->ni_cnd.cn_flags & (WANTPARENT|LOCKPARENT)) == 0) {
ndp->ni_dvp = NULL;
}
out1:
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Set up a pathname buffer and return a pointer to it and, optionally,
* set a hash pointer.
*/
void
nfsvno_setpathbuf(struct nameidata *ndp, char **bufpp, u_long **hashpp)
{
struct componentname *cnp = &ndp->ni_cnd;
cnp->cn_flags |= (NOMACCHECK | HASBUF);
cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
if (hashpp != NULL)
*hashpp = NULL;
*bufpp = cnp->cn_pnbuf;
}
/*
* Release the above path buffer, if not released by nfsvno_namei().
*/
void
nfsvno_relpathbuf(struct nameidata *ndp)
{
if ((ndp->ni_cnd.cn_flags & HASBUF) == 0)
panic("nfsrelpath");
uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
ndp->ni_cnd.cn_flags &= ~HASBUF;
}
/*
* Readlink vnode op into an mbuf list.
*/
int
nfsvno_readlink(struct vnode *vp, struct ucred *cred, struct thread *p,
struct mbuf **mpp, struct mbuf **mpendp, int *lenp)
{
struct iovec iv[(NFS_MAXPATHLEN+MLEN-1)/MLEN];
struct iovec *ivp = iv;
struct uio io, *uiop = &io;
struct mbuf *mp, *mp2 = NULL, *mp3 = NULL;
int i, len, tlen, error = 0;
len = 0;
i = 0;
while (len < NFS_MAXPATHLEN) {
NFSMGET(mp);
MCLGET(mp, M_WAIT);
mp->m_len = NFSMSIZ(mp);
if (len == 0) {
mp3 = mp2 = mp;
} else {
mp2->m_next = mp;
mp2 = mp;
}
if ((len + mp->m_len) > NFS_MAXPATHLEN) {
mp->m_len = NFS_MAXPATHLEN - len;
len = NFS_MAXPATHLEN;
} else {
len += mp->m_len;
}
ivp->iov_base = mtod(mp, caddr_t);
ivp->iov_len = mp->m_len;
i++;
ivp++;
}
uiop->uio_iov = iv;
uiop->uio_iovcnt = i;
uiop->uio_offset = 0;
uiop->uio_resid = len;
uiop->uio_rw = UIO_READ;
uiop->uio_segflg = UIO_SYSSPACE;
uiop->uio_td = NULL;
error = VOP_READLINK(vp, uiop, cred);
if (error) {
m_freem(mp3);
*lenp = 0;
goto out;
}
if (uiop->uio_resid > 0) {
len -= uiop->uio_resid;
tlen = NFSM_RNDUP(len);
nfsrv_adj(mp3, NFS_MAXPATHLEN - tlen, tlen - len);
}
*lenp = len;
*mpp = mp3;
*mpendp = mp;
out:
NFSEXITCODE(error);
return (error);
}
/*
* Read vnode op call into mbuf list.
*/
int
nfsvno_read(struct vnode *vp, off_t off, int cnt, struct ucred *cred,
struct thread *p, struct mbuf **mpp, struct mbuf **mpendp)
{
struct mbuf *m;
int i;
struct iovec *iv;
struct iovec *iv2;
int error = 0, len, left, siz, tlen, ioflag = 0, hi, try = 32;
struct mbuf *m2 = NULL, *m3;
struct uio io, *uiop = &io;
struct nfsheur *nh;
/*
* Calculate seqcount for heuristic
*/
/*
* Locate best candidate
*/
hi = ((int)(vm_offset_t)vp / sizeof(struct vnode)) % NUM_HEURISTIC;
nh = &nfsheur[hi];
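/*
 * Probe up to "try" hash slots looking for this vnode, aging unused
 * entries and remembering the least-used slot as a replacement
 * candidate if the vnode is not found.
 */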
while (try--) {
if (nfsheur[hi].nh_vp == vp) {
nh = &nfsheur[hi];
break;
}
if (nfsheur[hi].nh_use > 0)
--nfsheur[hi].nh_use;
hi = (hi + 1) % NUM_HEURISTIC;
if (nfsheur[hi].nh_use < nh->nh_use)
nh = &nfsheur[hi];
}
if (nh->nh_vp != vp) {
nh->nh_vp = vp;
nh->nh_nextr = off;
nh->nh_use = NHUSE_INIT;
if (off == 0)
nh->nh_seqcount = 4;
else
nh->nh_seqcount = 1;
}
/*
* Calculate heuristic
*/
if ((off == 0 && nh->nh_seqcount > 0) || off == nh->nh_nextr) {
if (++nh->nh_seqcount > IO_SEQMAX)
nh->nh_seqcount = IO_SEQMAX;
} else if (nh->nh_seqcount > 1) {
nh->nh_seqcount = 1;
} else {
nh->nh_seqcount = 0;
}
nh->nh_use += NHUSE_INC;
if (nh->nh_use > NHUSE_MAX)
nh->nh_use = NHUSE_MAX;
ioflag |= nh->nh_seqcount << IO_SEQSHIFT;
len = left = NFSM_RNDUP(cnt);
m3 = NULL;
/*
* Generate the mbuf list with the uio_iov ref. to it.
*/
i = 0;
while (left > 0) {
NFSMGET(m);
MCLGET(m, M_WAIT);
m->m_len = 0;
siz = min(M_TRAILINGSPACE(m), left);
left -= siz;
i++;
if (m3)
m2->m_next = m;
else
m3 = m;
m2 = m;
}
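/*
 * Allocate an iovec for each mbuf and point it at the mbuf's free
 * space, so that VOP_READ() fills the mbuf chain directly.
 */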
MALLOC(iv, struct iovec *, i * sizeof (struct iovec),
M_TEMP, M_WAITOK);
uiop->uio_iov = iv2 = iv;
m = m3;
left = len;
i = 0;
while (left > 0) {
if (m == NULL)
panic("nfsvno_read iov");
siz = min(M_TRAILINGSPACE(m), left);
if (siz > 0) {
iv->iov_base = mtod(m, caddr_t) + m->m_len;
iv->iov_len = siz;
m->m_len += siz;
left -= siz;
iv++;
i++;
}
m = m->m_next;
}
uiop->uio_iovcnt = i;
uiop->uio_offset = off;
uiop->uio_resid = len;
uiop->uio_rw = UIO_READ;
uiop->uio_segflg = UIO_SYSSPACE;
error = VOP_READ(vp, uiop, IO_NODELOCKED | ioflag, cred);
FREE((caddr_t)iv2, M_TEMP);
if (error) {
m_freem(m3);
*mpp = NULL;
goto out;
}
tlen = len - uiop->uio_resid;
cnt = cnt < tlen ? cnt : tlen;
tlen = NFSM_RNDUP(cnt);
if (tlen == 0) {
m_freem(m3);
m3 = NULL;
} else if (len != tlen || tlen != cnt)
nfsrv_adj(m3, len - tlen, tlen - cnt);
*mpp = m3;
*mpendp = m2;
out:
NFSEXITCODE(error);
return (error);
}
/*
* Write vnode op from an mbuf list.
*/
int
nfsvno_write(struct vnode *vp, off_t off, int retlen, int cnt, int stable,
struct mbuf *mp, char *cp, struct ucred *cred, struct thread *p)
{
struct iovec *ivp;
int i, len;
struct iovec *iv;
int ioflags, error;
struct uio io, *uiop = &io;
MALLOC(ivp, struct iovec *, cnt * sizeof (struct iovec), M_TEMP,
M_WAITOK);
uiop->uio_iov = iv = ivp;
uiop->uio_iovcnt = cnt;
i = mtod(mp, caddr_t) + mp->m_len - cp;
len = retlen;
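/*
 * Walk the mbuf chain, building an iovec for the data held in each
 * mbuf, starting at cp within the first mbuf.
 */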
while (len > 0) {
if (mp == NULL)
panic("nfsvno_write");
if (i > 0) {
i = min(i, len);
ivp->iov_base = cp;
ivp->iov_len = i;
ivp++;
len -= i;
}
mp = mp->m_next;
if (mp) {
i = mp->m_len;
cp = mtod(mp, caddr_t);
}
}
if (stable == NFSWRITE_UNSTABLE)
ioflags = IO_NODELOCKED;
else
ioflags = (IO_SYNC | IO_NODELOCKED);
uiop->uio_resid = retlen;
uiop->uio_rw = UIO_WRITE;
uiop->uio_segflg = UIO_SYSSPACE;
NFSUIOPROC(uiop, p);
uiop->uio_offset = off;
error = VOP_WRITE(vp, uiop, ioflags, cred);
FREE((caddr_t)iv, M_TEMP);
NFSEXITCODE(error);
return (error);
}
/*
* Common code for creating a regular file (plus special files for V2).
*/
int
nfsvno_createsub(struct nfsrv_descript *nd, struct nameidata *ndp,
struct vnode **vpp, struct nfsvattr *nvap, int *exclusive_flagp,
int32_t *cverf, NFSDEV_T rdev, struct thread *p, struct nfsexstuff *exp)
{
u_quad_t tempsize;
int error;
error = nd->nd_repstat;
if (!error && ndp->ni_vp == NULL) {
if (nvap->na_type == VREG || nvap->na_type == VSOCK) {
vrele(ndp->ni_startdir);
error = VOP_CREATE(ndp->ni_dvp,
&ndp->ni_vp, &ndp->ni_cnd, &nvap->na_vattr);
vput(ndp->ni_dvp);
nfsvno_relpathbuf(ndp);
if (!error) {
if (*exclusive_flagp) {
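/*
 * For an exclusive create, stash the create verifier
 * in the file's atime via VOP_SETATTR(), so that a
 * retried create can be recognized (a common NFS
 * server convention).
 */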
*exclusive_flagp = 0;
NFSVNO_ATTRINIT(nvap);
nvap->na_atime.tv_sec = cverf[0];
nvap->na_atime.tv_nsec = cverf[1];
error = VOP_SETATTR(ndp->ni_vp,
&nvap->na_vattr, nd->nd_cred);
}
}
/*
* NFS V2 Only. nfsrvd_mknod() does this for V3.
* (This implies, just get out on an error.)
*/
} else if (nvap->na_type == VCHR || nvap->na_type == VBLK ||
nvap->na_type == VFIFO) {
if (nvap->na_type == VCHR && rdev == 0xffffffff)
nvap->na_type = VFIFO;
if (nvap->na_type != VFIFO &&
(error = priv_check_cred(nd->nd_cred,
PRIV_VFS_MKNOD_DEV, 0))) {
vrele(ndp->ni_startdir);
nfsvno_relpathbuf(ndp);
vput(ndp->ni_dvp);
goto out;
}
nvap->na_rdev = rdev;
error = VOP_MKNOD(ndp->ni_dvp, &ndp->ni_vp,
&ndp->ni_cnd, &nvap->na_vattr);
vput(ndp->ni_dvp);
nfsvno_relpathbuf(ndp);
vrele(ndp->ni_startdir);
if (error)
goto out;
} else {
vrele(ndp->ni_startdir);
nfsvno_relpathbuf(ndp);
vput(ndp->ni_dvp);
error = ENXIO;
goto out;
}
*vpp = ndp->ni_vp;
} else {
/*
* Handle cases where error is already set and/or
* the file exists.
* 1 - clean up the lookup
* 2 - iff !error and na_size set, truncate it
*/
vrele(ndp->ni_startdir);
nfsvno_relpathbuf(ndp);
*vpp = ndp->ni_vp;
if (ndp->ni_dvp == *vpp)
vrele(ndp->ni_dvp);
else
vput(ndp->ni_dvp);
if (!error && nvap->na_size != VNOVAL) {
error = nfsvno_accchk(*vpp, VWRITE,
nd->nd_cred, exp, p, NFSACCCHK_NOOVERRIDE,
NFSACCCHK_VPISLOCKED, NULL);
if (!error) {
tempsize = nvap->na_size;
NFSVNO_ATTRINIT(nvap);
nvap->na_size = tempsize;
error = VOP_SETATTR(*vpp,
&nvap->na_vattr, nd->nd_cred);
}
}
if (error)
vput(*vpp);
}
out:
NFSEXITCODE(error);
return (error);
}
/*
* Do a mknod vnode op.
*/
int
nfsvno_mknod(struct nameidata *ndp, struct nfsvattr *nvap, struct ucred *cred,
struct thread *p)
{
int error = 0;
enum vtype vtyp;
vtyp = nvap->na_type;
/*
* Iff doesn't exist, create it.
*/
if (ndp->ni_vp) {
vrele(ndp->ni_startdir);
nfsvno_relpathbuf(ndp);
vput(ndp->ni_dvp);
vrele(ndp->ni_vp);
error = EEXIST;
goto out;
}
if (vtyp != VCHR && vtyp != VBLK && vtyp != VSOCK && vtyp != VFIFO) {
vrele(ndp->ni_startdir);
nfsvno_relpathbuf(ndp);
vput(ndp->ni_dvp);
error = NFSERR_BADTYPE;
goto out;
}
if (vtyp == VSOCK) {
vrele(ndp->ni_startdir);
error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
&ndp->ni_cnd, &nvap->na_vattr);
vput(ndp->ni_dvp);
nfsvno_relpathbuf(ndp);
} else {
if (nvap->na_type != VFIFO &&
(error = priv_check_cred(cred, PRIV_VFS_MKNOD_DEV, 0))) {
vrele(ndp->ni_startdir);
nfsvno_relpathbuf(ndp);
vput(ndp->ni_dvp);
goto out;
}
error = VOP_MKNOD(ndp->ni_dvp, &ndp->ni_vp,
&ndp->ni_cnd, &nvap->na_vattr);
vput(ndp->ni_dvp);
nfsvno_relpathbuf(ndp);
vrele(ndp->ni_startdir);
/*
* Since VOP_MKNOD returns the ni_vp, I can't
* see any reason to do the lookup.
*/
}
out:
NFSEXITCODE(error);
return (error);
}
/*
* Mkdir vnode op.
*/
int
nfsvno_mkdir(struct nameidata *ndp, struct nfsvattr *nvap, uid_t saved_uid,
struct ucred *cred, struct thread *p, struct nfsexstuff *exp)
{
int error = 0;
if (ndp->ni_vp != NULL) {
if (ndp->ni_dvp == ndp->ni_vp)
vrele(ndp->ni_dvp);
else
vput(ndp->ni_dvp);
vrele(ndp->ni_vp);
nfsvno_relpathbuf(ndp);
error = EEXIST;
goto out;
}
error = VOP_MKDIR(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd,
&nvap->na_vattr);
vput(ndp->ni_dvp);
nfsvno_relpathbuf(ndp);
out:
NFSEXITCODE(error);
return (error);
}
/*
* symlink vnode op.
*/
int
nfsvno_symlink(struct nameidata *ndp, struct nfsvattr *nvap, char *pathcp,
int pathlen, int not_v2, uid_t saved_uid, struct ucred *cred, struct thread *p,
struct nfsexstuff *exp)
{
int error = 0;
if (ndp->ni_vp) {
vrele(ndp->ni_startdir);
nfsvno_relpathbuf(ndp);
if (ndp->ni_dvp == ndp->ni_vp)
vrele(ndp->ni_dvp);
else
vput(ndp->ni_dvp);
vrele(ndp->ni_vp);
error = EEXIST;
goto out;
}
error = VOP_SYMLINK(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd,
&nvap->na_vattr, pathcp);
vput(ndp->ni_dvp);
vrele(ndp->ni_startdir);
nfsvno_relpathbuf(ndp);
/*
* Although FreeBSD still had the lookup code in
* it for 7/current, there doesn't seem to be any
* point, since VOP_SYMLINK() returns the ni_vp.
* Just vput it for v2.
*/
if (!not_v2 && !error)
vput(ndp->ni_vp);
out:
NFSEXITCODE(error);
return (error);
}
/*
* Parse symbolic link arguments.
* This function has an ugly side effect. It will MALLOC() an area for
* the symlink and set *pathcpp to point to it, but only if it succeeds.
* So, if it returns with *pathcpp != NULL, that area must
* be FREE'd later.
*/
int
nfsvno_getsymlink(struct nfsrv_descript *nd, struct nfsvattr *nvap,
struct thread *p, char **pathcpp, int *lenp)
{
u_int32_t *tl;
char *pathcp = NULL;
int error = 0, len;
struct nfsv2_sattr *sp;
*pathcpp = NULL;
*lenp = 0;
if ((nd->nd_flag & ND_NFSV3) &&
(error = nfsrv_sattr(nd, nvap, NULL, NULL, p)))
goto nfsmout;
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
len = fxdr_unsigned(int, *tl);
if (len > NFS_MAXPATHLEN || len <= 0) {
error = EBADRPC;
goto nfsmout;
}
MALLOC(pathcp, caddr_t, len + 1, M_TEMP, M_WAITOK);
error = nfsrv_mtostr(nd, pathcp, len);
if (error)
goto nfsmout;
if (nd->nd_flag & ND_NFSV2) {
NFSM_DISSECT(sp, struct nfsv2_sattr *, NFSX_V2SATTR);
nvap->na_mode = fxdr_unsigned(u_int16_t, sp->sa_mode);
}
*pathcpp = pathcp;
*lenp = len;
NFSEXITCODE2(0, nd);
return (0);
nfsmout:
if (pathcp)
free(pathcp, M_TEMP);
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Remove a non-directory object.
*/
int
nfsvno_removesub(struct nameidata *ndp, int is_v4, struct ucred *cred,
struct thread *p, struct nfsexstuff *exp)
{
struct vnode *vp;
int error = 0;
vp = ndp->ni_vp;
if (vp->v_type == VDIR)
error = NFSERR_ISDIR;
else if (is_v4)
error = nfsrv_checkremove(vp, 1, p);
if (!error)
error = VOP_REMOVE(ndp->ni_dvp, vp, &ndp->ni_cnd);
if (ndp->ni_dvp == vp)
vrele(ndp->ni_dvp);
else
vput(ndp->ni_dvp);
vput(vp);
NFSEXITCODE(error);
return (error);
}
/*
* Remove a directory.
*/
int
nfsvno_rmdirsub(struct nameidata *ndp, int is_v4, struct ucred *cred,
struct thread *p, struct nfsexstuff *exp)
{
struct vnode *vp;
int error = 0;
vp = ndp->ni_vp;
if (vp->v_type != VDIR) {
error = ENOTDIR;
goto out;
}
/*
* No rmdir "." please.
*/
if (ndp->ni_dvp == vp) {
error = EINVAL;
goto out;
}
/*
* The root of a mounted filesystem cannot be deleted.
*/
if (vp->v_vflag & VV_ROOT)
error = EBUSY;
out:
if (!error)
error = VOP_RMDIR(ndp->ni_dvp, vp, &ndp->ni_cnd);
if (ndp->ni_dvp == vp)
vrele(ndp->ni_dvp);
else
vput(ndp->ni_dvp);
vput(vp);
NFSEXITCODE(error);
return (error);
}
/*
* Rename vnode op.
*/
int
nfsvno_rename(struct nameidata *fromndp, struct nameidata *tondp,
u_int32_t ndstat, u_int32_t ndflag, struct ucred *cred, struct thread *p)
{
struct vnode *fvp, *tvp, *tdvp;
int error = 0;
fvp = fromndp->ni_vp;
if (ndstat) {
vrele(fromndp->ni_dvp);
vrele(fvp);
error = ndstat;
goto out1;
}
tdvp = tondp->ni_dvp;
tvp = tondp->ni_vp;
if (tvp != NULL) {
if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
error = (ndflag & ND_NFSV2) ? EISDIR : EEXIST;
goto out;
} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
error = (ndflag & ND_NFSV2) ? ENOTDIR : EEXIST;
goto out;
}
if (tvp->v_type == VDIR && tvp->v_mountedhere) {
error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EXDEV;
goto out;
}
/*
* A rename to '.' or '..' results in a prematurely
* unlocked vnode on FreeBSD5, so I'm just going to fail that
* here.
*/
if ((tondp->ni_cnd.cn_namelen == 1 &&
tondp->ni_cnd.cn_nameptr[0] == '.') ||
(tondp->ni_cnd.cn_namelen == 2 &&
tondp->ni_cnd.cn_nameptr[0] == '.' &&
tondp->ni_cnd.cn_nameptr[1] == '.')) {
error = EINVAL;
goto out;
}
}
if (fvp->v_type == VDIR && fvp->v_mountedhere) {
error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EXDEV;
goto out;
}
if (fvp->v_mount != tdvp->v_mount) {
error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EXDEV;
goto out;
}
if (fvp == tdvp) {
error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EINVAL;
goto out;
}
if (fvp == tvp) {
/*
* If source and destination are the same, there is nothing to
* do. Set error to -1 to indicate this.
*/
error = -1;
goto out;
}
if (ndflag & ND_NFSV4) {
if (NFSVOPLOCK(fvp, LK_EXCLUSIVE) == 0) {
error = nfsrv_checkremove(fvp, 0, p);
NFSVOPUNLOCK(fvp, 0);
} else
error = EPERM;
if (tvp && !error)
error = nfsrv_checkremove(tvp, 1, p);
} else {
/*
* For NFSv2 and NFSv3, try to get rid of the delegation, so
* that the NFSv4 client won't be confused by the rename.
* Since nfsd_recalldelegation() can only be called on an
* unlocked vnode at this point and fvp is the file that will
* still exist after the rename, just do fvp.
*/
nfsd_recalldelegation(fvp, p);
}
out:
if (!error) {
error = VOP_RENAME(fromndp->ni_dvp, fromndp->ni_vp,
&fromndp->ni_cnd, tondp->ni_dvp, tondp->ni_vp,
&tondp->ni_cnd);
} else {
if (tdvp == tvp)
vrele(tdvp);
else
vput(tdvp);
if (tvp)
vput(tvp);
vrele(fromndp->ni_dvp);
vrele(fvp);
if (error == -1)
error = 0;
}
vrele(tondp->ni_startdir);
nfsvno_relpathbuf(tondp);
out1:
vrele(fromndp->ni_startdir);
nfsvno_relpathbuf(fromndp);
NFSEXITCODE(error);
return (error);
}
/*
* Link vnode op.
*/
int
nfsvno_link(struct nameidata *ndp, struct vnode *vp, struct ucred *cred,
struct thread *p, struct nfsexstuff *exp)
{
struct vnode *xp;
int error = 0;
xp = ndp->ni_vp;
if (xp != NULL) {
error = EEXIST;
} else {
xp = ndp->ni_dvp;
if (vp->v_mount != xp->v_mount)
error = EXDEV;
}
if (!error) {
NFSVOPLOCK(vp, LK_EXCLUSIVE | LK_RETRY);
if ((vp->v_iflag & VI_DOOMED) == 0)
error = VOP_LINK(ndp->ni_dvp, vp, &ndp->ni_cnd);
else
error = EPERM;
if (ndp->ni_dvp == vp)
vrele(ndp->ni_dvp);
else
vput(ndp->ni_dvp);
NFSVOPUNLOCK(vp, 0);
} else {
if (ndp->ni_dvp == ndp->ni_vp)
vrele(ndp->ni_dvp);
else
vput(ndp->ni_dvp);
if (ndp->ni_vp)
vrele(ndp->ni_vp);
}
nfsvno_relpathbuf(ndp);
NFSEXITCODE(error);
return (error);
}
/*
* Do the fsync() appropriate for the commit.
*/
int
nfsvno_fsync(struct vnode *vp, u_int64_t off, int cnt, struct ucred *cred,
struct thread *td)
{
int error = 0;
if (cnt > MAX_COMMIT_COUNT) {
/*
* Give up and do the whole thing
*/
if (vp->v_object &&
(vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
VM_OBJECT_LOCK(vp->v_object);
vm_object_page_clean(vp->v_object, 0, 0, OBJPC_SYNC);
VM_OBJECT_UNLOCK(vp->v_object);
}
error = VOP_FSYNC(vp, MNT_WAIT, td);
} else {
/*
* Locate and synchronously write any buffers that fall
* into the requested range. Note: we are assuming that
* f_iosize is a power of 2.
*/
int iosize = vp->v_mount->mnt_stat.f_iosize;
int iomask = iosize - 1;
struct bufobj *bo;
daddr_t lblkno;
/*
* Align to iosize boundary, super-align to page boundary.
*/
if (off & iomask) {
cnt += off & iomask;
off &= ~(u_quad_t)iomask;
}
if (off & PAGE_MASK) {
cnt += off & PAGE_MASK;
off &= ~(u_quad_t)PAGE_MASK;
}
lblkno = off / iosize;
if (vp->v_object &&
(vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
VM_OBJECT_LOCK(vp->v_object);
vm_object_page_clean(vp->v_object, off, off + cnt,
OBJPC_SYNC);
VM_OBJECT_UNLOCK(vp->v_object);
}
bo = &vp->v_bufobj;
BO_LOCK(bo);
while (cnt > 0) {
struct buf *bp;
/*
* If we have a buffer and it is marked B_DELWRI we
* have to lock and write it. Otherwise the prior
* write is assumed to have already been committed.
*
* gbincore() can return invalid buffers now so we
* have to check that bit as well (though B_DELWRI
* should not be set if B_INVAL is set there could be
* a race here since we haven't locked the buffer).
*/
if ((bp = gbincore(&vp->v_bufobj, lblkno)) != NULL) {
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
LK_INTERLOCK, BO_MTX(bo)) == ENOLCK) {
BO_LOCK(bo);
continue; /* retry */
}
if ((bp->b_flags & (B_DELWRI|B_INVAL)) ==
B_DELWRI) {
bremfree(bp);
bp->b_flags &= ~B_ASYNC;
bwrite(bp);
++nfs_commit_miss;
} else
BUF_UNLOCK(bp);
BO_LOCK(bo);
}
++nfs_commit_blks;
if (cnt < iosize)
break;
cnt -= iosize;
++lblkno;
}
BO_UNLOCK(bo);
}
NFSEXITCODE(error);
return (error);
}
/*
* Statfs vnode op.
*/
int
nfsvno_statfs(struct vnode *vp, struct statfs *sf)
{
int error;
error = VFS_STATFS(vp->v_mount, sf);
if (error == 0) {
/*
* Since NFS handles these values as unsigned on the
* wire, there is no way to represent negative values,
* so set them to 0. Without this, they will appear
* to be very large positive values for clients like
* Solaris10.
*/
if (sf->f_bavail < 0)
sf->f_bavail = 0;
if (sf->f_ffree < 0)
sf->f_ffree = 0;
}
NFSEXITCODE(error);
return (error);
}
/*
* Do the vnode op stuff for Open. Similar to nfsvno_createsub(), but
* must handle nfsrv_opencheck() calls after any other access checks.
*/
void
nfsvno_open(struct nfsrv_descript *nd, struct nameidata *ndp,
nfsquad_t clientid, nfsv4stateid_t *stateidp, struct nfsstate *stp,
int *exclusive_flagp, struct nfsvattr *nvap, int32_t *cverf, int create,
NFSACL_T *aclp, nfsattrbit_t *attrbitp, struct ucred *cred, struct thread *p,
struct nfsexstuff *exp, struct vnode **vpp)
{
struct vnode *vp = NULL;
u_quad_t tempsize;
struct nfsexstuff nes;
if (ndp->ni_vp == NULL)
nd->nd_repstat = nfsrv_opencheck(clientid,
stateidp, stp, NULL, nd, p, nd->nd_repstat);
if (!nd->nd_repstat) {
if (ndp->ni_vp == NULL) {
vrele(ndp->ni_startdir);
nd->nd_repstat = VOP_CREATE(ndp->ni_dvp,
&ndp->ni_vp, &ndp->ni_cnd, &nvap->na_vattr);
vput(ndp->ni_dvp);
nfsvno_relpathbuf(ndp);
if (!nd->nd_repstat) {
if (*exclusive_flagp) {
*exclusive_flagp = 0;
NFSVNO_ATTRINIT(nvap);
nvap->na_atime.tv_sec = cverf[0];
nvap->na_atime.tv_nsec = cverf[1];
nd->nd_repstat = VOP_SETATTR(ndp->ni_vp,
&nvap->na_vattr, cred);
} else {
nfsrv_fixattr(nd, ndp->ni_vp, nvap,
aclp, p, attrbitp, exp);
}
}
vp = ndp->ni_vp;
} else {
if (ndp->ni_startdir)
vrele(ndp->ni_startdir);
nfsvno_relpathbuf(ndp);
vp = ndp->ni_vp;
if (create == NFSV4OPEN_CREATE) {
if (ndp->ni_dvp == vp)
vrele(ndp->ni_dvp);
else
vput(ndp->ni_dvp);
}
if (NFSVNO_ISSETSIZE(nvap) && vp->v_type == VREG) {
if (ndp->ni_cnd.cn_flags & RDONLY)
NFSVNO_SETEXRDONLY(&nes);
else
NFSVNO_EXINIT(&nes);
nd->nd_repstat = nfsvno_accchk(vp,
VWRITE, cred, &nes, p,
NFSACCCHK_NOOVERRIDE,
NFSACCCHK_VPISLOCKED, NULL);
nd->nd_repstat = nfsrv_opencheck(clientid,
stateidp, stp, vp, nd, p, nd->nd_repstat);
if (!nd->nd_repstat) {
tempsize = nvap->na_size;
NFSVNO_ATTRINIT(nvap);
nvap->na_size = tempsize;
nd->nd_repstat = VOP_SETATTR(vp,
&nvap->na_vattr, cred);
}
} else if (vp->v_type == VREG) {
nd->nd_repstat = nfsrv_opencheck(clientid,
stateidp, stp, vp, nd, p, nd->nd_repstat);
}
}
} else {
if (ndp->ni_cnd.cn_flags & HASBUF)
nfsvno_relpathbuf(ndp);
if (ndp->ni_startdir && create == NFSV4OPEN_CREATE) {
vrele(ndp->ni_startdir);
if (ndp->ni_dvp == ndp->ni_vp)
vrele(ndp->ni_dvp);
else
vput(ndp->ni_dvp);
if (ndp->ni_vp)
vput(ndp->ni_vp);
}
}
*vpp = vp;
NFSEXITCODE2(0, nd);
}
/*
* Updates the file rev and sets the mtime and ctime
* to the current clock time, returning the va_filerev and va_Xtime
* values.
*/
void
nfsvno_updfilerev(struct vnode *vp, struct nfsvattr *nvap,
struct ucred *cred, struct thread *p)
{
struct vattr va;
VATTR_NULL(&va);
getnanotime(&va.va_mtime);
(void) VOP_SETATTR(vp, &va, cred);
(void) nfsvno_getattr(vp, nvap, cred, p, 1);
}
/*
* Glue routine to nfsv4_fillattr().
*/
int
nfsvno_fillattr(struct nfsrv_descript *nd, struct mount *mp, struct vnode *vp,
struct nfsvattr *nvap, fhandle_t *fhp, int rderror, nfsattrbit_t *attrbitp,
struct ucred *cred, struct thread *p, int isdgram, int reterr,
int supports_nfsv4acls, int at_root, uint64_t mounted_on_fileno)
{
int error;
error = nfsv4_fillattr(nd, mp, vp, NULL, &nvap->na_vattr, fhp, rderror,
attrbitp, cred, p, isdgram, reterr, supports_nfsv4acls, at_root,
mounted_on_fileno);
NFSEXITCODE2(0, nd);
return (error);
}
/* Since the Readdir vnode ops vary, put the entire functions in here. */
/*
* nfs readdir service
* - mallocs what it thinks is enough to read
* count rounded up to a multiple of DIRBLKSIZ <= NFS_MAXREADDIR
* - calls VOP_READDIR()
* - loops around building the reply
* if the output generated exceeds count break out of loop
* The NFSM_CLGET macro is used here so that the reply will be packed
* tightly in mbuf clusters.
* - it trims out records with d_fileno == 0
* this doesn't matter for Unix clients, but such records might confuse
* clients of other OSes.
* - it trims out records with d_type == DT_WHT
* these cannot be seen through NFS (unless we extend the protocol)
* The alternate call nfsrvd_readdirplus() does lookups as well.
* PS: The NFS protocol spec does not clarify what the "count" byte
* argument is a count of: just the name strings and file ids, the
* entire reply rpc, or something else.
* I tried just file name and id sizes and it confused the Sun client,
* so I am using the full rpc size now. The "paranoia.." comment refers
* to including the status longwords that are not a part of the dir.
* "entry" structures, but are in the rpc.
*/
int
nfsrvd_readdir(struct nfsrv_descript *nd, int isdgram,
struct vnode *vp, struct thread *p, struct nfsexstuff *exp)
{
struct dirent *dp;
u_int32_t *tl;
int dirlen;
char *cpos, *cend, *rbuf;
struct nfsvattr at;
int nlen, error = 0, getret = 1;
int siz, cnt, fullsiz, eofflag, ncookies;
u_int64_t off, toff, verf;
u_long *cookies = NULL, *cookiep;
struct uio io;
struct iovec iv;
int not_zfs;
if (nd->nd_repstat) {
nfsrv_postopattr(nd, getret, &at);
goto out;
}
if (nd->nd_flag & ND_NFSV2) {
NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
off = fxdr_unsigned(u_quad_t, *tl++);
} else {
NFSM_DISSECT(tl, u_int32_t *, 5 * NFSX_UNSIGNED);
off = fxdr_hyper(tl);
tl += 2;
verf = fxdr_hyper(tl);
tl += 2;
}
toff = off;
cnt = fxdr_unsigned(int, *tl);
if (cnt > NFS_SRVMAXDATA(nd) || cnt < 0)
cnt = NFS_SRVMAXDATA(nd);
siz = ((cnt + DIRBLKSIZ - 1) & ~(DIRBLKSIZ - 1));
fullsiz = siz;
if (nd->nd_flag & ND_NFSV3) {
nd->nd_repstat = getret = nfsvno_getattr(vp, &at, nd->nd_cred,
p, 1);
#if 0
/*
* va_filerev is not sufficient as a cookie verifier,
* since it is not supposed to change when entries are
* removed/added unless the offset cookies returned to
* the client are no longer valid.
*/
if (!nd->nd_repstat && toff && verf != at.na_filerev)
nd->nd_repstat = NFSERR_BAD_COOKIE;
#endif
}
if (nd->nd_repstat == 0 && cnt == 0) {
if (nd->nd_flag & ND_NFSV2)
/* NFSv2 does not have NFSERR_TOOSMALL */
nd->nd_repstat = EPERM;
else
nd->nd_repstat = NFSERR_TOOSMALL;
}
if (!nd->nd_repstat)
nd->nd_repstat = nfsvno_accchk(vp, VEXEC,
nd->nd_cred, exp, p, NFSACCCHK_NOOVERRIDE,
NFSACCCHK_VPISLOCKED, NULL);
if (nd->nd_repstat) {
vput(vp);
if (nd->nd_flag & ND_NFSV3)
nfsrv_postopattr(nd, getret, &at);
goto out;
}
not_zfs = strcmp(vp->v_mount->mnt_vfc->vfc_name, "zfs");
MALLOC(rbuf, caddr_t, siz, M_TEMP, M_WAITOK);
again:
eofflag = 0;
if (cookies) {
free((caddr_t)cookies, M_TEMP);
cookies = NULL;
}
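/*
 * Set up the uio so that VOP_READDIR() reads a block of directory
 * entries into rbuf.
 */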
iv.iov_base = rbuf;
iv.iov_len = siz;
io.uio_iov = &iv;
io.uio_iovcnt = 1;
io.uio_offset = (off_t)off;
io.uio_resid = siz;
io.uio_segflg = UIO_SYSSPACE;
io.uio_rw = UIO_READ;
io.uio_td = NULL;
nd->nd_repstat = VOP_READDIR(vp, &io, nd->nd_cred, &eofflag, &ncookies,
&cookies);
off = (u_int64_t)io.uio_offset;
if (io.uio_resid)
siz -= io.uio_resid;
if (!cookies && !nd->nd_repstat)
nd->nd_repstat = NFSERR_PERM;
if (nd->nd_flag & ND_NFSV3) {
getret = nfsvno_getattr(vp, &at, nd->nd_cred, p, 1);
if (!nd->nd_repstat)
nd->nd_repstat = getret;
}
/*
* Handles the failed cases. nd->nd_repstat == 0 past here.
*/
if (nd->nd_repstat) {
vput(vp);
free((caddr_t)rbuf, M_TEMP);
if (cookies)
free((caddr_t)cookies, M_TEMP);
if (nd->nd_flag & ND_NFSV3)
nfsrv_postopattr(nd, getret, &at);
goto out;
}
/*
* If nothing read, return eof
* rpc reply
*/
if (siz == 0) {
vput(vp);
if (nd->nd_flag & ND_NFSV2) {
NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
} else {
nfsrv_postopattr(nd, getret, &at);
NFSM_BUILD(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
txdr_hyper(at.na_filerev, tl);
tl += 2;
}
*tl++ = newnfs_false;
*tl = newnfs_true;
FREE((caddr_t)rbuf, M_TEMP);
FREE((caddr_t)cookies, M_TEMP);
goto out;
}
/*
* Check for degenerate cases of nothing useful read.
* If so go try again
*/
cpos = rbuf;
cend = rbuf + siz;
dp = (struct dirent *)cpos;
cookiep = cookies;
/*
* For some reason FreeBSD's ufs_readdir() chooses to back the
* directory offset up to a block boundary, so it is necessary to
* skip over the records that precede the requested offset. This
* requires the assumption that file offset cookies monotonically
* increase.
* Since the offset cookies don't monotonically increase for ZFS,
* this is not done when ZFS is the file system.
*/
while (cpos < cend && ncookies > 0 &&
(dp->d_fileno == 0 || dp->d_type == DT_WHT ||
(not_zfs != 0 && ((u_quad_t)(*cookiep)) <= toff))) {
cpos += dp->d_reclen;
dp = (struct dirent *)cpos;
cookiep++;
ncookies--;
}
if (cpos >= cend || ncookies == 0) {
siz = fullsiz;
toff = off;
goto again;
}
vput(vp);
/*
* dirlen is the size of the reply, including all XDR, and must
* not exceed cnt. For NFSv2, RFC1094 didn't clearly indicate
* if the XDR should be included in "count", but to be safe, we do.
* (Include the two booleans at the end of the reply in dirlen now.)
*/
if (nd->nd_flag & ND_NFSV3) {
nfsrv_postopattr(nd, getret, &at);
NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
txdr_hyper(at.na_filerev, tl);
dirlen = NFSX_V3POSTOPATTR + NFSX_VERF + 2 * NFSX_UNSIGNED;
} else {
dirlen = 2 * NFSX_UNSIGNED;
}
/* Loop through the records and build reply */
while (cpos < cend && ncookies > 0) {
nlen = dp->d_namlen;
if (dp->d_fileno != 0 && dp->d_type != DT_WHT &&
nlen <= NFS_MAXNAMLEN) {
if (nd->nd_flag & ND_NFSV3)
dirlen += (6*NFSX_UNSIGNED + NFSM_RNDUP(nlen));
else
dirlen += (4*NFSX_UNSIGNED + NFSM_RNDUP(nlen));
if (dirlen > cnt) {
eofflag = 0;
break;
}
/*
* Build the directory record xdr from
* the dirent entry.
*/
if (nd->nd_flag & ND_NFSV3) {
NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
*tl++ = newnfs_true;
*tl++ = 0;
} else {
NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
*tl++ = newnfs_true;
}
*tl = txdr_unsigned(dp->d_fileno);
(void) nfsm_strtom(nd, dp->d_name, nlen);
if (nd->nd_flag & ND_NFSV3) {
NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
*tl++ = 0;
} else
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = txdr_unsigned(*cookiep);
}
cpos += dp->d_reclen;
dp = (struct dirent *)cpos;
cookiep++;
ncookies--;
}
if (cpos < cend)
eofflag = 0;
NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
*tl++ = newnfs_false;
if (eofflag)
*tl = newnfs_true;
else
*tl = newnfs_false;
FREE((caddr_t)rbuf, M_TEMP);
FREE((caddr_t)cookies, M_TEMP);
out:
NFSEXITCODE2(0, nd);
return (0);
nfsmout:
vput(vp);
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Readdirplus for V3 and Readdir for V4.
*/
int
nfsrvd_readdirplus(struct nfsrv_descript *nd, int isdgram,
struct vnode *vp, struct thread *p, struct nfsexstuff *exp)
{
struct dirent *dp;
u_int32_t *tl;
int dirlen;
char *cpos, *cend, *rbuf;
struct vnode *nvp;
fhandle_t nfh;
struct nfsvattr nva, at, *nvap = &nva;
struct mbuf *mb0, *mb1;
struct nfsreferral *refp;
int nlen, r, error = 0, getret = 1, usevget = 1;
int siz, cnt, fullsiz, eofflag, ncookies, entrycnt;
caddr_t bpos0, bpos1;
u_int64_t off, toff, verf;
u_long *cookies = NULL, *cookiep;
nfsattrbit_t attrbits, rderrbits, savbits;
struct uio io;
struct iovec iv;
struct componentname cn;
int at_root, needs_unbusy, not_zfs, supports_nfsv4acls;
struct mount *mp, *new_mp;
uint64_t mounted_on_fileno;
if (nd->nd_repstat) {
nfsrv_postopattr(nd, getret, &at);
goto out;
}
NFSM_DISSECT(tl, u_int32_t *, 6 * NFSX_UNSIGNED);
off = fxdr_hyper(tl);
toff = off;
tl += 2;
verf = fxdr_hyper(tl);
tl += 2;
siz = fxdr_unsigned(int, *tl++);
cnt = fxdr_unsigned(int, *tl);
/*
* Use the server's maximum data transfer size as the upper bound
* on reply datalen.
*/
if (cnt > NFS_SRVMAXDATA(nd) || cnt < 0)
cnt = NFS_SRVMAXDATA(nd);
/*
* siz is a "hint" of how much directory information (name, fileid,
* cookie) should be in the reply. At least one client "hints" 0,
* so I set it to cnt for that case. I also round it up to the
* next multiple of DIRBLKSIZ.
*/
if (siz <= 0)
siz = cnt;
siz = ((siz + DIRBLKSIZ - 1) & ~(DIRBLKSIZ - 1));
if (nd->nd_flag & ND_NFSV4) {
error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL);
if (error)
goto nfsmout;
NFSSET_ATTRBIT(&savbits, &attrbits);
NFSCLRNOTFILLABLE_ATTRBIT(&attrbits);
NFSZERO_ATTRBIT(&rderrbits);
NFSSETBIT_ATTRBIT(&rderrbits, NFSATTRBIT_RDATTRERROR);
} else {
NFSZERO_ATTRBIT(&attrbits);
}
fullsiz = siz;
nd->nd_repstat = getret = nfsvno_getattr(vp, &at, nd->nd_cred, p, 1);
if (!nd->nd_repstat) {
if (off && verf != at.na_filerev) {
/*
* va_filerev is not sufficient as a cookie verifier,
* since it is not supposed to change when entries are
* removed/added unless the offset cookies returned to
* the client are no longer valid.
*/
#if 0
if (nd->nd_flag & ND_NFSV4) {
nd->nd_repstat = NFSERR_NOTSAME;
} else {
nd->nd_repstat = NFSERR_BAD_COOKIE;
}
#endif
} else if ((nd->nd_flag & ND_NFSV4) && off == 0 && verf != 0) {
nd->nd_repstat = NFSERR_BAD_COOKIE;
}
}
if (!nd->nd_repstat && vp->v_type != VDIR)
nd->nd_repstat = NFSERR_NOTDIR;
if (!nd->nd_repstat && cnt == 0)
nd->nd_repstat = NFSERR_TOOSMALL;
if (!nd->nd_repstat)
nd->nd_repstat = nfsvno_accchk(vp, VEXEC,
nd->nd_cred, exp, p, NFSACCCHK_NOOVERRIDE,
NFSACCCHK_VPISLOCKED, NULL);
if (nd->nd_repstat) {
vput(vp);
if (nd->nd_flag & ND_NFSV3)
nfsrv_postopattr(nd, getret, &at);
goto out;
}
not_zfs = strcmp(vp->v_mount->mnt_vfc->vfc_name, "zfs");
MALLOC(rbuf, caddr_t, siz, M_TEMP, M_WAITOK);
again:
eofflag = 0;
if (cookies) {
free((caddr_t)cookies, M_TEMP);
cookies = NULL;
}
iv.iov_base = rbuf;
iv.iov_len = siz;
io.uio_iov = &iv;
io.uio_iovcnt = 1;
io.uio_offset = (off_t)off;
io.uio_resid = siz;
io.uio_segflg = UIO_SYSSPACE;
io.uio_rw = UIO_READ;
io.uio_td = NULL;
nd->nd_repstat = VOP_READDIR(vp, &io, nd->nd_cred, &eofflag, &ncookies,
&cookies);
off = (u_int64_t)io.uio_offset;
if (io.uio_resid)
siz -= io.uio_resid;
getret = nfsvno_getattr(vp, &at, nd->nd_cred, p, 1);
if (!cookies && !nd->nd_repstat)
nd->nd_repstat = NFSERR_PERM;
if (!nd->nd_repstat)
nd->nd_repstat = getret;
if (nd->nd_repstat) {
vput(vp);
if (cookies)
free((caddr_t)cookies, M_TEMP);
free((caddr_t)rbuf, M_TEMP);
if (nd->nd_flag & ND_NFSV3)
nfsrv_postopattr(nd, getret, &at);
goto out;
}
/*
* If nothing read, return eof
* rpc reply
*/
if (siz == 0) {
vput(vp);
if (nd->nd_flag & ND_NFSV3)
nfsrv_postopattr(nd, getret, &at);
NFSM_BUILD(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
txdr_hyper(at.na_filerev, tl);
tl += 2;
*tl++ = newnfs_false;
*tl = newnfs_true;
free((caddr_t)cookies, M_TEMP);
free((caddr_t)rbuf, M_TEMP);
goto out;
}
/*
* Check for degenerate cases of nothing useful read.
* If so go try again
*/
cpos = rbuf;
cend = rbuf + siz;
dp = (struct dirent *)cpos;
cookiep = cookies;
/*
* For some reason FreeBSD's ufs_readdir() chooses to back the
* directory offset up to a block boundary, so it is necessary to
* skip over the records that precede the requested offset. This
* requires the assumption that file offset cookies monotonically
* increase.
* Since the offset cookies don't monotonically increase for ZFS,
* this is not done when ZFS is the file system.
*/
while (cpos < cend && ncookies > 0 &&
(dp->d_fileno == 0 || dp->d_type == DT_WHT ||
(not_zfs != 0 && ((u_quad_t)(*cookiep)) <= toff) ||
((nd->nd_flag & ND_NFSV4) &&
((dp->d_namlen == 1 && dp->d_name[0] == '.') ||
(dp->d_namlen==2 && dp->d_name[0]=='.' && dp->d_name[1]=='.'))))) {
cpos += dp->d_reclen;
dp = (struct dirent *)cpos;
cookiep++;
ncookies--;
}
if (cpos >= cend || ncookies == 0) {
siz = fullsiz;
toff = off;
goto again;
}
/*
* Busy the file system so that the mount point won't go away
* and, as such, VFS_VGET() can be used safely.
*/
mp = vp->v_mount;
vfs_ref(mp);
NFSVOPUNLOCK(vp, 0);
nd->nd_repstat = vfs_busy(mp, 0);
vfs_rel(mp);
if (nd->nd_repstat != 0) {
vrele(vp);
free(cookies, M_TEMP);
free(rbuf, M_TEMP);
if (nd->nd_flag & ND_NFSV3)
nfsrv_postopattr(nd, getret, &at);
goto out;
}
/*
* Save this position, in case there is an error before one entry
* is created.
*/
mb0 = nd->nd_mb;
bpos0 = nd->nd_bpos;
/*
* Fill in the first part of the reply.
* dirlen is the reply length in bytes and cannot exceed cnt.
* (Include the two booleans at the end of the reply in dirlen now,
* so we recognize when we have exceeded cnt.)
*/
if (nd->nd_flag & ND_NFSV3) {
dirlen = NFSX_V3POSTOPATTR + NFSX_VERF + 2 * NFSX_UNSIGNED;
nfsrv_postopattr(nd, getret, &at);
} else {
dirlen = NFSX_VERF + 2 * NFSX_UNSIGNED;
}
NFSM_BUILD(tl, u_int32_t *, NFSX_VERF);
txdr_hyper(at.na_filerev, tl);
/*
* Save this position, in case there is an empty reply needed.
*/
mb1 = nd->nd_mb;
bpos1 = nd->nd_bpos;
/* Loop through the records and build reply */
entrycnt = 0;
while (cpos < cend && ncookies > 0 && dirlen < cnt) {
nlen = dp->d_namlen;
if (dp->d_fileno != 0 && dp->d_type != DT_WHT &&
nlen <= NFS_MAXNAMLEN &&
((nd->nd_flag & ND_NFSV3) || nlen > 2 ||
(nlen==2 && (dp->d_name[0]!='.' || dp->d_name[1]!='.'))
|| (nlen == 1 && dp->d_name[0] != '.'))) {
/*
* Save the current position in the reply, in case
* this entry exceeds cnt.
*/
mb1 = nd->nd_mb;
bpos1 = nd->nd_bpos;
/*
* For readdir_and_lookup get the vnode using
* the file number.
*/
nvp = NULL;
refp = NULL;
r = 0;
at_root = 0;
needs_unbusy = 0;
new_mp = mp;
mounted_on_fileno = (uint64_t)dp->d_fileno;
if ((nd->nd_flag & ND_NFSV3) ||
NFSNONZERO_ATTRBIT(&savbits)) {
if (nd->nd_flag & ND_NFSV4)
refp = nfsv4root_getreferral(NULL,
vp, dp->d_fileno);
if (refp == NULL) {
if (usevget)
r = VFS_VGET(mp, dp->d_fileno,
LK_SHARED, &nvp);
else
r = EOPNOTSUPP;
if (r == EOPNOTSUPP) {
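/*
 * VFS_VGET() is not supported by
 * this file system, so fall back
 * to a VOP_LOOKUP() by name.
 */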
if (usevget) {
usevget = 0;
cn.cn_nameiop = LOOKUP;
cn.cn_lkflags =
LK_SHARED |
LK_RETRY;
cn.cn_cred =
nd->nd_cred;
cn.cn_thread = p;
}
cn.cn_nameptr = dp->d_name;
cn.cn_namelen = nlen;
cn.cn_flags = ISLASTCN |
NOFOLLOW | LOCKLEAF |
MPSAFE;
if (nlen == 2 &&
dp->d_name[0] == '.' &&
dp->d_name[1] == '.')
cn.cn_flags |=
ISDOTDOT;
if (NFSVOPLOCK(vp, LK_SHARED)
!= 0) {
nd->nd_repstat = EPERM;
break;
}
if ((vp->v_vflag & VV_ROOT) != 0
&& (cn.cn_flags & ISDOTDOT)
!= 0) {
vref(vp);
nvp = vp;
r = 0;
} else {
r = VOP_LOOKUP(vp, &nvp,
&cn);
if (vp != nvp)
NFSVOPUNLOCK(vp,
0);
}
}
/*
* For NFSv4, check to see if nvp is
* a mount point and get the mount
* point vnode, as required.
*/
if (r == 0 &&
nfsrv_enable_crossmntpt != 0 &&
(nd->nd_flag & ND_NFSV4) != 0 &&
nvp->v_type == VDIR &&
nvp->v_mountedhere != NULL) {
new_mp = nvp->v_mountedhere;
r = vfs_busy(new_mp, 0);
vput(nvp);
nvp = NULL;
if (r == 0) {
r = VFS_ROOT(new_mp,
LK_SHARED, &nvp);
needs_unbusy = 1;
if (r == 0)
at_root = 1;
}
}
}
if (!r) {
if (refp == NULL &&
((nd->nd_flag & ND_NFSV3) ||
NFSNONZERO_ATTRBIT(&attrbits))) {
r = nfsvno_getfh(nvp, &nfh, p);
if (!r)
r = nfsvno_getattr(nvp, nvap,
nd->nd_cred, p, 1);
}
} else {
nvp = NULL;
}
if (r) {
if (!NFSISSET_ATTRBIT(&attrbits,
NFSATTRBIT_RDATTRERROR)) {
if (nvp != NULL)
vput(nvp);
if (needs_unbusy != 0)
vfs_unbusy(new_mp);
nd->nd_repstat = r;
break;
}
}
}
/*
* Build the directory record xdr
*/
if (nd->nd_flag & ND_NFSV3) {
NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
*tl++ = newnfs_true;
*tl++ = 0;
*tl = txdr_unsigned(dp->d_fileno);
dirlen += nfsm_strtom(nd, dp->d_name, nlen);
NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
*tl++ = 0;
*tl = txdr_unsigned(*cookiep);
nfsrv_postopattr(nd, 0, nvap);
dirlen += nfsm_fhtom(nd,(u_int8_t *)&nfh,0,1);
dirlen += (5*NFSX_UNSIGNED+NFSX_V3POSTOPATTR);
if (nvp != NULL)
vput(nvp);
} else {
NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
*tl++ = newnfs_true;
*tl++ = 0;
*tl = txdr_unsigned(*cookiep);
dirlen += nfsm_strtom(nd, dp->d_name, nlen);
if (nvp != NULL) {
supports_nfsv4acls =
nfs_supportsnfsv4acls(nvp);
NFSVOPUNLOCK(nvp, 0);
} else
supports_nfsv4acls = 0;
if (refp != NULL) {
dirlen += nfsrv_putreferralattr(nd,
&savbits, refp, 0,
&nd->nd_repstat);
if (nd->nd_repstat) {
if (nvp != NULL)
vrele(nvp);
if (needs_unbusy != 0)
vfs_unbusy(new_mp);
break;
}
} else if (r) {
dirlen += nfsvno_fillattr(nd, new_mp,
nvp, nvap, &nfh, r, &rderrbits,
nd->nd_cred, p, isdgram, 0,
supports_nfsv4acls, at_root,
mounted_on_fileno);
} else {
dirlen += nfsvno_fillattr(nd, new_mp,
nvp, nvap, &nfh, r, &attrbits,
nd->nd_cred, p, isdgram, 0,
supports_nfsv4acls, at_root,
mounted_on_fileno);
}
if (nvp != NULL)
vrele(nvp);
dirlen += (3 * NFSX_UNSIGNED);
}
if (needs_unbusy != 0)
vfs_unbusy(new_mp);
if (dirlen <= cnt)
entrycnt++;
}
cpos += dp->d_reclen;
dp = (struct dirent *)cpos;
cookiep++;
ncookies--;
}
vrele(vp);
vfs_unbusy(mp);
/*
* If dirlen > cnt, we must strip off the last entry. If that
* results in an empty reply, report NFSERR_TOOSMALL.
*/
if (dirlen > cnt || nd->nd_repstat) {
if (!nd->nd_repstat && entrycnt == 0)
nd->nd_repstat = NFSERR_TOOSMALL;
if (nd->nd_repstat)
newnfs_trimtrailing(nd, mb0, bpos0);
else
newnfs_trimtrailing(nd, mb1, bpos1);
eofflag = 0;
} else if (cpos < cend)
eofflag = 0;
if (!nd->nd_repstat) {
NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
*tl++ = newnfs_false;
if (eofflag)
*tl = newnfs_true;
else
*tl = newnfs_false;
}
FREE((caddr_t)cookies, M_TEMP);
FREE((caddr_t)rbuf, M_TEMP);
out:
NFSEXITCODE2(0, nd);
return (0);
nfsmout:
vput(vp);
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Get the settable attributes out of the mbuf list.
* (Return 0 or EBADRPC)
*/
int
nfsrv_sattr(struct nfsrv_descript *nd, struct nfsvattr *nvap,
nfsattrbit_t *attrbitp, NFSACL_T *aclp, struct thread *p)
{
u_int32_t *tl;
struct nfsv2_sattr *sp;
struct timeval curtime;
int error = 0, toclient = 0;
switch (nd->nd_flag & (ND_NFSV2 | ND_NFSV3 | ND_NFSV4)) {
case ND_NFSV2:
NFSM_DISSECT(sp, struct nfsv2_sattr *, NFSX_V2SATTR);
/*
* Some old clients didn't fill in the high order 16 bits.
* --> check the low order 2 bytes for 0xffff
*/
if ((fxdr_unsigned(int, sp->sa_mode) & 0xffff) != 0xffff)
nvap->na_mode = nfstov_mode(sp->sa_mode);
if (sp->sa_uid != newnfs_xdrneg1)
nvap->na_uid = fxdr_unsigned(uid_t, sp->sa_uid);
if (sp->sa_gid != newnfs_xdrneg1)
nvap->na_gid = fxdr_unsigned(gid_t, sp->sa_gid);
if (sp->sa_size != newnfs_xdrneg1)
nvap->na_size = fxdr_unsigned(u_quad_t, sp->sa_size);
if (sp->sa_atime.nfsv2_sec != newnfs_xdrneg1) {
#ifdef notyet
fxdr_nfsv2time(&sp->sa_atime, &nvap->na_atime);
#else
nvap->na_atime.tv_sec =
fxdr_unsigned(u_int32_t,sp->sa_atime.nfsv2_sec);
nvap->na_atime.tv_nsec = 0;
#endif
}
if (sp->sa_mtime.nfsv2_sec != newnfs_xdrneg1)
fxdr_nfsv2time(&sp->sa_mtime, &nvap->na_mtime);
break;
case ND_NFSV3:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (*tl == newnfs_true) {
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
nvap->na_mode = nfstov_mode(*tl);
}
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (*tl == newnfs_true) {
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
nvap->na_uid = fxdr_unsigned(uid_t, *tl);
}
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (*tl == newnfs_true) {
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
nvap->na_gid = fxdr_unsigned(gid_t, *tl);
}
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (*tl == newnfs_true) {
NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
nvap->na_size = fxdr_hyper(tl);
}
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
switch (fxdr_unsigned(int, *tl)) {
case NFSV3SATTRTIME_TOCLIENT:
NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
fxdr_nfsv3time(tl, &nvap->na_atime);
toclient = 1;
break;
case NFSV3SATTRTIME_TOSERVER:
NFSGETTIME(&curtime);
nvap->na_atime.tv_sec = curtime.tv_sec;
nvap->na_atime.tv_nsec = curtime.tv_usec * 1000;
nvap->na_vaflags |= VA_UTIMES_NULL;
break;
};
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
switch (fxdr_unsigned(int, *tl)) {
case NFSV3SATTRTIME_TOCLIENT:
NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
fxdr_nfsv3time(tl, &nvap->na_mtime);
nvap->na_vaflags &= ~VA_UTIMES_NULL;
break;
case NFSV3SATTRTIME_TOSERVER:
NFSGETTIME(&curtime);
nvap->na_mtime.tv_sec = curtime.tv_sec;
nvap->na_mtime.tv_nsec = curtime.tv_usec * 1000;
if (!toclient)
nvap->na_vaflags |= VA_UTIMES_NULL;
break;
};
break;
case ND_NFSV4:
error = nfsv4_sattr(nd, nvap, attrbitp, aclp, p);
};
nfsmout:
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Handle the settable attributes for V4.
* Returns NFSERR_BADXDR if it can't be parsed, 0 otherwise.
*/
int
nfsv4_sattr(struct nfsrv_descript *nd, struct nfsvattr *nvap,
nfsattrbit_t *attrbitp, NFSACL_T *aclp, struct thread *p)
{
u_int32_t *tl;
int attrsum = 0;
int i, j;
int error, attrsize, bitpos, aclsize, aceerr, retnotsup = 0;
int toclient = 0;
u_char *cp, namestr[NFSV4_SMALLSTR + 1];
uid_t uid;
gid_t gid;
struct timeval curtime;
error = nfsrv_getattrbits(nd, attrbitp, NULL, &retnotsup);
if (error)
goto nfsmout;
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
attrsize = fxdr_unsigned(int, *tl);
/*
* Loop around getting the settable attributes. If an unsupported
* one is found, set nd_repstat to NFSERR_ATTRNOTSUPP and return.
*/
if (retnotsup) {
nd->nd_repstat = NFSERR_ATTRNOTSUPP;
bitpos = NFSATTRBIT_MAX;
} else {
bitpos = 0;
}
for (; bitpos < NFSATTRBIT_MAX; bitpos++) {
if (attrsum > attrsize) {
error = NFSERR_BADXDR;
goto nfsmout;
}
if (NFSISSET_ATTRBIT(attrbitp, bitpos))
switch (bitpos) {
case NFSATTRBIT_SIZE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
nvap->na_size = fxdr_hyper(tl);
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_ACL:
error = nfsrv_dissectacl(nd, aclp, &aceerr, &aclsize,
p);
if (error)
goto nfsmout;
if (aceerr && !nd->nd_repstat)
nd->nd_repstat = aceerr;
attrsum += aclsize;
break;
case NFSATTRBIT_ARCHIVE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (!nd->nd_repstat)
nd->nd_repstat = NFSERR_ATTRNOTSUPP;
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_HIDDEN:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (!nd->nd_repstat)
nd->nd_repstat = NFSERR_ATTRNOTSUPP;
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_MIMETYPE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
i = fxdr_unsigned(int, *tl);
error = nfsm_advance(nd, NFSM_RNDUP(i), -1);
if (error)
goto nfsmout;
if (!nd->nd_repstat)
nd->nd_repstat = NFSERR_ATTRNOTSUPP;
attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(i));
break;
case NFSATTRBIT_MODE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
nvap->na_mode = nfstov_mode(*tl);
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_OWNER:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
j = fxdr_unsigned(int, *tl);
if (j < 0) {
error = NFSERR_BADXDR;
goto nfsmout;
}
if (j > NFSV4_SMALLSTR)
cp = malloc(j + 1, M_NFSSTRING, M_WAITOK);
else
cp = namestr;
error = nfsrv_mtostr(nd, cp, j);
if (error) {
if (j > NFSV4_SMALLSTR)
free(cp, M_NFSSTRING);
goto nfsmout;
}
if (!nd->nd_repstat) {
nd->nd_repstat = nfsv4_strtouid(cp,j,&uid,p);
if (!nd->nd_repstat)
nvap->na_uid = uid;
}
if (j > NFSV4_SMALLSTR)
free(cp, M_NFSSTRING);
attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(j));
break;
case NFSATTRBIT_OWNERGROUP:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
j = fxdr_unsigned(int, *tl);
if (j < 0) {
error = NFSERR_BADXDR;
goto nfsmout;
}
if (j > NFSV4_SMALLSTR)
cp = malloc(j + 1, M_NFSSTRING, M_WAITOK);
else
cp = namestr;
error = nfsrv_mtostr(nd, cp, j);
if (error) {
if (j > NFSV4_SMALLSTR)
free(cp, M_NFSSTRING);
goto nfsmout;
}
if (!nd->nd_repstat) {
nd->nd_repstat = nfsv4_strtogid(cp,j,&gid,p);
if (!nd->nd_repstat)
nvap->na_gid = gid;
}
if (j > NFSV4_SMALLSTR)
free(cp, M_NFSSTRING);
attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(j));
break;
case NFSATTRBIT_SYSTEM:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (!nd->nd_repstat)
nd->nd_repstat = NFSERR_ATTRNOTSUPP;
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_TIMEACCESSSET:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
attrsum += NFSX_UNSIGNED;
if (fxdr_unsigned(int, *tl)==NFSV4SATTRTIME_TOCLIENT) {
NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
fxdr_nfsv4time(tl, &nvap->na_atime);
toclient = 1;
attrsum += NFSX_V4TIME;
} else {
NFSGETTIME(&curtime);
nvap->na_atime.tv_sec = curtime.tv_sec;
nvap->na_atime.tv_nsec = curtime.tv_usec * 1000;
nvap->na_vaflags |= VA_UTIMES_NULL;
}
break;
case NFSATTRBIT_TIMEBACKUP:
NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
if (!nd->nd_repstat)
nd->nd_repstat = NFSERR_ATTRNOTSUPP;
attrsum += NFSX_V4TIME;
break;
case NFSATTRBIT_TIMECREATE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
if (!nd->nd_repstat)
nd->nd_repstat = NFSERR_ATTRNOTSUPP;
attrsum += NFSX_V4TIME;
break;
case NFSATTRBIT_TIMEMODIFYSET:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
attrsum += NFSX_UNSIGNED;
if (fxdr_unsigned(int, *tl)==NFSV4SATTRTIME_TOCLIENT) {
NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
fxdr_nfsv4time(tl, &nvap->na_mtime);
nvap->na_vaflags &= ~VA_UTIMES_NULL;
attrsum += NFSX_V4TIME;
} else {
NFSGETTIME(&curtime);
nvap->na_mtime.tv_sec = curtime.tv_sec;
nvap->na_mtime.tv_nsec = curtime.tv_usec * 1000;
if (!toclient)
nvap->na_vaflags |= VA_UTIMES_NULL;
}
break;
default:
nd->nd_repstat = NFSERR_ATTRNOTSUPP;
/*
* set bitpos so we drop out of the loop.
*/
bitpos = NFSATTRBIT_MAX;
break;
};
}
/*
* some clients pad the attrlist, so we need to skip over the
* padding.
*/
if (attrsum > attrsize) {
error = NFSERR_BADXDR;
} else {
attrsize = NFSM_RNDUP(attrsize);
if (attrsum < attrsize)
error = nfsm_advance(nd, attrsize - attrsum, -1);
}
nfsmout:
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Check/setup export credentials.
*/
int
nfsd_excred(struct nfsrv_descript *nd, struct nfsexstuff *exp,
struct ucred *credanon)
{
int error = 0;
/*
* Check/setup credentials.
*/
if (nd->nd_flag & ND_GSS)
exp->nes_exflag &= ~MNT_EXPORTANON;
/*
* Check to see if the operation is allowed for this security flavor.
* RFC2623 suggests that the NFSv3 Fsinfo RPC be allowed for
* AUTH_NONE or AUTH_SYS for file systems requiring RPCSEC_GSS.
* Also, allow Secinfo, so that it can acquire the correct flavor(s).
*/
if (nfsvno_testexp(nd, exp) &&
nd->nd_procnum != NFSV4OP_SECINFO &&
nd->nd_procnum != NFSPROC_FSINFO) {
if (nd->nd_flag & ND_NFSV4)
error = NFSERR_WRONGSEC;
else
error = (NFSERR_AUTHERR | AUTH_TOOWEAK);
goto out;
}
/*
* Check to see if the file system is exported V4 only.
*/
if (NFSVNO_EXV4ONLY(exp) && !(nd->nd_flag & ND_NFSV4)) {
error = NFSERR_PROGNOTV4;
goto out;
}
/*
* Now, map the user credentials.
* (Note that ND_AUTHNONE will only be set for an NFSv3
* Fsinfo RPC. If set for anything else, this code might need
* to change.)
*/
if (NFSVNO_EXPORTED(exp) &&
((!(nd->nd_flag & ND_GSS) && nd->nd_cred->cr_uid == 0) ||
NFSVNO_EXPORTANON(exp) ||
(nd->nd_flag & ND_AUTHNONE))) {
nd->nd_cred->cr_uid = credanon->cr_uid;
nd->nd_cred->cr_gid = credanon->cr_gid;
crsetgroups(nd->nd_cred, credanon->cr_ngroups,
credanon->cr_groups);
}
out:
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Check exports.
*/
int
nfsvno_checkexp(struct mount *mp, struct sockaddr *nam, struct nfsexstuff *exp,
struct ucred **credp)
{
int i, error, *secflavors;
error = VFS_CHECKEXP(mp, nam, &exp->nes_exflag, credp,
&exp->nes_numsecflavor, &secflavors);
if (error) {
if (nfs_rootfhset) {
exp->nes_exflag = 0;
exp->nes_numsecflavor = 0;
error = 0;
}
} else {
/* Copy the security flavors. */
for (i = 0; i < exp->nes_numsecflavor; i++)
exp->nes_secflavors[i] = secflavors[i];
}
NFSEXITCODE(error);
return (error);
}
/*
* Get a vnode for a file handle and export stuff.
*/
int
nfsvno_fhtovp(struct mount *mp, fhandle_t *fhp, struct sockaddr *nam,
int lktype, struct vnode **vpp, struct nfsexstuff *exp,
struct ucred **credp)
{
int i, error, *secflavors;
*credp = NULL;
exp->nes_numsecflavor = 0;
if (VFS_NEEDSGIANT(mp))
error = ESTALE;
else
error = VFS_FHTOVP(mp, &fhp->fh_fid, LK_EXCLUSIVE, vpp);
if (error != 0)
/* Make sure the server replies ESTALE to the client. */
error = ESTALE;
if (nam && !error) {
error = VFS_CHECKEXP(mp, nam, &exp->nes_exflag, credp,
&exp->nes_numsecflavor, &secflavors);
if (error) {
if (nfs_rootfhset) {
exp->nes_exflag = 0;
exp->nes_numsecflavor = 0;
error = 0;
} else {
vput(*vpp);
}
} else {
/* Copy the security flavors. */
for (i = 0; i < exp->nes_numsecflavor; i++)
exp->nes_secflavors[i] = secflavors[i];
}
}
if (error == 0 && lktype == LK_SHARED)
/*
* It would be much better to pass lktype to VFS_FHTOVP(),
* but this will have to do until VFS_FHTOVP() has a lock
* type argument like VFS_VGET().
*/
NFSVOPLOCK(*vpp, LK_DOWNGRADE | LK_RETRY);
NFSEXITCODE(error);
return (error);
}
/*
* nfsd_fhtovp() - convert a fh to a vnode ptr
* - look up fsid in mount list (if not found ret error)
* - get vp and export rights by calling nfsvno_fhtovp()
* - if cred->cr_uid == 0 or MNT_EXPORTANON set it to credanon
* for AUTH_SYS
* - if mpp != NULL, return the mount point so that it can
* be used for vn_finished_write() by the caller
*/
void
nfsd_fhtovp(struct nfsrv_descript *nd, struct nfsrvfh *nfp, int lktype,
struct vnode **vpp, struct nfsexstuff *exp,
struct mount **mpp, int startwrite, struct thread *p)
{
struct mount *mp;
struct ucred *credanon;
fhandle_t *fhp;
fhp = (fhandle_t *)nfp->nfsrvfh_data;
/*
* Check for the special case of the nfsv4root_fh.
*/
mp = vfs_busyfs(&fhp->fh_fsid);
if (mpp != NULL)
*mpp = mp;
if (mp == NULL) {
*vpp = NULL;
nd->nd_repstat = ESTALE;
goto out;
}
if (startwrite)
vn_start_write(NULL, mpp, V_WAIT);
nd->nd_repstat = nfsvno_fhtovp(mp, fhp, nd->nd_nam, lktype, vpp, exp,
&credanon);
vfs_unbusy(mp);
/*
* For NFSv4 without a pseudo root fs, unexported file handles
* can be returned, so that Lookup works everywhere.
*/
if (!nd->nd_repstat && exp->nes_exflag == 0 &&
!(nd->nd_flag & ND_NFSV4)) {
vput(*vpp);
nd->nd_repstat = EACCES;
}
/*
* Personally, I've never seen any point in requiring a
* reserved port#, since only in the rare case where the
* clients are all boxes with secure system privileges,
* does it provide any enhanced security, but... some people
* believe it to be useful and keep putting this code back in.
* (There is also some "security checker" out there that
* complains if the nfs server doesn't enforce this.)
* However, note the following:
* RFC3530 (NFSv4) specifies that a reserved port# not be
* required.
* RFC2623 recommends that, if a reserved port# is checked for,
* that there be a way to turn that off--> ifdef'd.
*/
#ifdef NFS_REQRSVPORT
if (!nd->nd_repstat) {
struct sockaddr_in *saddr;
struct sockaddr_in6 *saddr6;
saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
saddr6 = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in6 *);
if (!(nd->nd_flag & ND_NFSV4) &&
((saddr->sin_family == AF_INET &&
ntohs(saddr->sin_port) >= IPPORT_RESERVED) ||
(saddr6->sin6_family == AF_INET6 &&
ntohs(saddr6->sin6_port) >= IPPORT_RESERVED))) {
vput(*vpp);
nd->nd_repstat = (NFSERR_AUTHERR | AUTH_TOOWEAK);
}
}
#endif /* NFS_REQRSVPORT */
/*
* Check/setup credentials.
*/
if (!nd->nd_repstat) {
nd->nd_saveduid = nd->nd_cred->cr_uid;
nd->nd_repstat = nfsd_excred(nd, exp, credanon);
if (nd->nd_repstat)
vput(*vpp);
}
if (credanon != NULL)
crfree(credanon);
if (nd->nd_repstat) {
if (startwrite)
vn_finished_write(mp);
*vpp = NULL;
if (mpp != NULL)
*mpp = NULL;
}
out:
NFSEXITCODE2(0, nd);
}
/*
* glue for fp.
*/
int
fp_getfvp(struct thread *p, int fd, struct file **fpp, struct vnode **vpp)
{
struct filedesc *fdp;
struct file *fp;
int error = 0;
fdp = p->td_proc->p_fd;
if (fd >= fdp->fd_nfiles ||
(fp = fdp->fd_ofiles[fd]) == NULL) {
error = EBADF;
goto out;
}
*fpp = fp;
out:
NFSEXITCODE(error);
return (error);
}
/*
* Called from nfssvc() to update the exports list. Just call
* vfs_export(). This has to be done, since the v4 root fake fs isn't
* in the mount list.
*/
int
nfsrv_v4rootexport(void *argp, struct ucred *cred, struct thread *p)
{
struct nfsex_args *nfsexargp = (struct nfsex_args *)argp;
int error = 0;
struct nameidata nd;
fhandle_t fh;
error = vfs_export(&nfsv4root_mnt, &nfsexargp->export);
if ((nfsexargp->export.ex_flags & MNT_DELEXPORT) != 0)
nfs_rootfhset = 0;
else if (error == 0) {
if (nfsexargp->fspec == NULL) {
error = EPERM;
goto out;
}
/*
* If fspec != NULL, this is the v4root path.
*/
NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_USERSPACE,
nfsexargp->fspec, p);
if ((error = namei(&nd)) != 0)
goto out;
error = nfsvno_getfh(nd.ni_vp, &fh, p);
vrele(nd.ni_vp);
if (!error) {
nfs_rootfh.nfsrvfh_len = NFSX_MYFH;
NFSBCOPY((caddr_t)&fh,
nfs_rootfh.nfsrvfh_data,
sizeof (fhandle_t));
nfs_rootfhset = 1;
}
}
out:
NFSEXITCODE(error);
return (error);
}
/*
* Get the tcp socket sequence numbers we need.
* (Maybe this should be moved to the tcp sources?)
*/
int
nfsrv_getsocksndseq(struct socket *so, tcp_seq *maxp, tcp_seq *unap)
{
struct inpcb *inp;
struct tcpcb *tp;
int error = 0;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("nfsrv_getsocksndseq: inp == NULL"));
INP_RLOCK(inp);
if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
INP_RUNLOCK(inp);
error = EPIPE;
goto out;
}
tp = intotcpcb(inp);
if (tp->t_state != TCPS_ESTABLISHED) {
INP_RUNLOCK(inp);
error = EPIPE;
goto out;
}
*maxp = tp->snd_max;
*unap = tp->snd_una;
INP_RUNLOCK(inp);
out:
NFSEXITCODE(error);
return (error);
}
/*
* This function needs to test to see if the system is near its limit
* for memory allocation via malloc() or mget() and return True iff
* either of these resources is near its limit.
* XXX (For now, this is just a stub.)
*/
int nfsrv_testmalloclimit = 0;
int
nfsrv_mallocmget_limit(void)
{
static int printmesg = 0;
static int testval = 1;
if (nfsrv_testmalloclimit && (testval++ % 1000) == 0) {
if ((printmesg++ % 100) == 0)
printf("nfsd: malloc/mget near limit\n");
return (1);
}
return (0);
}
/*
* BSD specific initialization of a mount point.
*/
void
nfsd_mntinit(void)
{
static int inited = 0;
if (inited)
return;
inited = 1;
nfsv4root_mnt.mnt_flag = (MNT_RDONLY | MNT_EXPORTED);
TAILQ_INIT(&nfsv4root_mnt.mnt_nvnodelist);
nfsv4root_mnt.mnt_export = NULL;
TAILQ_INIT(&nfsv4root_opt);
TAILQ_INIT(&nfsv4root_newopt);
nfsv4root_mnt.mnt_opt = &nfsv4root_opt;
nfsv4root_mnt.mnt_optnew = &nfsv4root_newopt;
nfsv4root_mnt.mnt_nvnodelistsize = 0;
}
/*
* Get a vnode for a file handle, without checking exports, etc.
*/
struct vnode *
nfsvno_getvp(fhandle_t *fhp)
{
struct mount *mp;
struct vnode *vp;
int error;
mp = vfs_busyfs(&fhp->fh_fsid);
if (mp == NULL)
return (NULL);
error = VFS_FHTOVP(mp, &fhp->fh_fid, LK_EXCLUSIVE, &vp);
vfs_unbusy(mp);
if (error)
return (NULL);
return (vp);
}
/*
* Do a local VOP_ADVLOCK().
*/
int
nfsvno_advlock(struct vnode *vp, int ftype, u_int64_t first,
u_int64_t end, struct thread *td)
{
int error = 0;
struct flock fl;
u_int64_t tlen;
if (nfsrv_dolocallocks == 0)
goto out;
/* Check for VI_DOOMED here, so that VOP_ADVLOCK() isn't performed. */
if ((vp->v_iflag & VI_DOOMED) != 0) {
error = EPERM;
goto out;
}
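/*
 * Convert the NFS byte range into a struct flock for the local
 * VOP_ADVLOCK() call; an end of NFS64BITSSET means lock to EOF
 * (l_len == 0).
 */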
fl.l_whence = SEEK_SET;
fl.l_type = ftype;
fl.l_start = (off_t)first;
if (end == NFS64BITSSET) {
fl.l_len = 0;
} else {
tlen = end - first;
fl.l_len = (off_t)tlen;
}
/*
* For FreeBSD8, the l_pid and l_sysid must be set to the same
* values for all calls, so that all locks will be held by the
* nfsd server. (The nfsd server handles conflicts between the
* various clients.)
* An NFSv4 lockowner is a ClientID plus an array of up to 1024
* bytes, so it can't be put in l_sysid.
*/
if (nfsv4_sysid == 0)
nfsv4_sysid = nlm_acquire_next_sysid();
fl.l_pid = (pid_t)0;
fl.l_sysid = (int)nfsv4_sysid;
NFSVOPUNLOCK(vp, 0);
if (ftype == F_UNLCK)
error = VOP_ADVLOCK(vp, (caddr_t)td->td_proc, F_UNLCK, &fl,
(F_POSIX | F_REMOTE));
else
error = VOP_ADVLOCK(vp, (caddr_t)td->td_proc, F_SETLK, &fl,
(F_POSIX | F_REMOTE));
NFSVOPLOCK(vp, LK_EXCLUSIVE | LK_RETRY);
out:
NFSEXITCODE(error);
return (error);
}
/*
* Check the nfsv4 root exports.
*/
int
nfsvno_v4rootexport(struct nfsrv_descript *nd)
{
struct ucred *credanon;
int exflags, error = 0, numsecflavor, *secflavors, i;
error = vfs_stdcheckexp(&nfsv4root_mnt, nd->nd_nam, &exflags,
&credanon, &numsecflavor, &secflavors);
if (error) {
error = NFSERR_PROGUNAVAIL;
goto out;
}
if (credanon != NULL)
crfree(credanon);
for (i = 0; i < numsecflavor; i++) {
if (secflavors[i] == AUTH_SYS)
nd->nd_flag |= ND_EXAUTHSYS;
else if (secflavors[i] == RPCSEC_GSS_KRB5)
nd->nd_flag |= ND_EXGSS;
else if (secflavors[i] == RPCSEC_GSS_KRB5I)
nd->nd_flag |= ND_EXGSSINTEGRITY;
else if (secflavors[i] == RPCSEC_GSS_KRB5P)
nd->nd_flag |= ND_EXGSSPRIVACY;
}
out:
NFSEXITCODE(error);
return (error);
}
/*
* NFS server pseudo system call for the nfsds
*/
/*
* MPSAFE
*/
static int
nfssvc_nfsd(struct thread *td, struct nfssvc_args *uap)
{
struct file *fp;
struct nfsd_addsock_args sockarg;
struct nfsd_nfsd_args nfsdarg;
int error;
if (uap->flag & NFSSVC_NFSDADDSOCK) {
error = copyin(uap->argp, (caddr_t)&sockarg, sizeof (sockarg));
if (error)
goto out;
/*
* Since we don't know what rights might be required,
* pretend that we need them all. It is better to be too
* careful than too reckless.
*/
if ((error = fget(td, sockarg.sock, CAP_SOCK_ALL, &fp)) != 0)
goto out;
if (fp->f_type != DTYPE_SOCKET) {
fdrop(fp, td);
error = EPERM;
goto out;
}
error = nfsrvd_addsock(fp);
fdrop(fp, td);
} else if (uap->flag & NFSSVC_NFSDNFSD) {
if (uap->argp == NULL) {
error = EINVAL;
goto out;
}
error = copyin(uap->argp, (caddr_t)&nfsdarg,
sizeof (nfsdarg));
if (error)
goto out;
error = nfsrvd_nfsd(td, &nfsdarg);
} else {
error = nfssvc_srvcall(td, uap, td->td_ucred);
}
out:
NFSEXITCODE(error);
return (error);
}
static int
nfssvc_srvcall(struct thread *p, struct nfssvc_args *uap, struct ucred *cred)
{
struct nfsex_args export;
struct file *fp = NULL;
int stablefd, len;
struct nfsd_clid adminrevoke;
struct nfsd_dumplist dumplist;
struct nfsd_dumpclients *dumpclients;
struct nfsd_dumplocklist dumplocklist;
struct nfsd_dumplocks *dumplocks;
struct nameidata nd;
vnode_t vp;
int error = EINVAL;
struct proc *procp;
if (uap->flag & NFSSVC_PUBLICFH) {
NFSBZERO((caddr_t)&nfs_pubfh.nfsrvfh_data,
sizeof (fhandle_t));
error = copyin(uap->argp,
&nfs_pubfh.nfsrvfh_data, sizeof (fhandle_t));
if (!error)
nfs_pubfhset = 1;
} else if (uap->flag & NFSSVC_V4ROOTEXPORT) {
error = copyin(uap->argp,(caddr_t)&export,
sizeof (struct nfsex_args));
if (!error)
error = nfsrv_v4rootexport(&export, cred, p);
} else if (uap->flag & NFSSVC_NOPUBLICFH) {
nfs_pubfhset = 0;
error = 0;
} else if (uap->flag & NFSSVC_STABLERESTART) {
error = copyin(uap->argp, (caddr_t)&stablefd,
sizeof (int));
if (!error)
error = fp_getfvp(p, stablefd, &fp, &vp);
if (!error && (NFSFPFLAG(fp) & (FREAD | FWRITE)) != (FREAD | FWRITE))
error = EBADF;
if (!error && newnfs_numnfsd != 0)
error = EPERM;
if (!error) {
nfsrv_stablefirst.nsf_fp = fp;
nfsrv_setupstable(p);
}
} else if (uap->flag & NFSSVC_ADMINREVOKE) {
error = copyin(uap->argp, (caddr_t)&adminrevoke,
sizeof (struct nfsd_clid));
if (!error)
error = nfsrv_adminrevoke(&adminrevoke, p);
} else if (uap->flag & NFSSVC_DUMPCLIENTS) {
error = copyin(uap->argp, (caddr_t)&dumplist,
sizeof (struct nfsd_dumplist));
if (!error && (dumplist.ndl_size < 1 ||
dumplist.ndl_size > NFSRV_MAXDUMPLIST))
error = EPERM;
if (!error) {
len = sizeof (struct nfsd_dumpclients) * dumplist.ndl_size;
dumpclients = (struct nfsd_dumpclients *)malloc(len,
M_TEMP, M_WAITOK);
nfsrv_dumpclients(dumpclients, dumplist.ndl_size);
error = copyout(dumpclients,
CAST_USER_ADDR_T(dumplist.ndl_list), len);
free((caddr_t)dumpclients, M_TEMP);
}
} else if (uap->flag & NFSSVC_DUMPLOCKS) {
error = copyin(uap->argp, (caddr_t)&dumplocklist,
sizeof (struct nfsd_dumplocklist));
if (!error && (dumplocklist.ndllck_size < 1 ||
dumplocklist.ndllck_size > NFSRV_MAXDUMPLIST))
error = EPERM;
if (!error)
error = nfsrv_lookupfilename(&nd,
dumplocklist.ndllck_fname, p);
if (!error) {
len = sizeof (struct nfsd_dumplocks) *
dumplocklist.ndllck_size;
dumplocks = (struct nfsd_dumplocks *)malloc(len,
M_TEMP, M_WAITOK);
nfsrv_dumplocks(nd.ni_vp, dumplocks,
dumplocklist.ndllck_size, p);
vput(nd.ni_vp);
error = copyout(dumplocks,
CAST_USER_ADDR_T(dumplocklist.ndllck_list), len);
free((caddr_t)dumplocks, M_TEMP);
}
} else if (uap->flag & NFSSVC_BACKUPSTABLE) {
procp = p->td_proc;
PROC_LOCK(procp);
nfsd_master_pid = procp->p_pid;
bcopy(procp->p_comm, nfsd_master_comm, MAXCOMLEN + 1);
nfsd_master_start = procp->p_stats->p_start;
nfsd_master_proc = procp;
PROC_UNLOCK(procp);
}
NFSEXITCODE(error);
return (error);
}
/*
* Check exports.
* Returns 0 if ok, 1 otherwise.
*/
int
nfsvno_testexp(struct nfsrv_descript *nd, struct nfsexstuff *exp)
{
int i;
/*
* This seems odd, but allow the case where the security flavor
* list is empty. This happens when NFSv4 is traversing non-exported
* file systems. Exported file systems should always have a non-empty
* security flavor list.
*/
if (exp->nes_numsecflavor == 0)
return (0);
for (i = 0; i < exp->nes_numsecflavor; i++) {
/*
* The tests for privacy and integrity must be first,
* since ND_GSS is set for everything but AUTH_SYS.
*/
if (exp->nes_secflavors[i] == RPCSEC_GSS_KRB5P &&
(nd->nd_flag & ND_GSSPRIVACY))
return (0);
if (exp->nes_secflavors[i] == RPCSEC_GSS_KRB5I &&
(nd->nd_flag & ND_GSSINTEGRITY))
return (0);
if (exp->nes_secflavors[i] == RPCSEC_GSS_KRB5 &&
(nd->nd_flag & ND_GSS))
return (0);
if (exp->nes_secflavors[i] == AUTH_SYS &&
(nd->nd_flag & ND_GSS) == 0)
return (0);
}
return (1);
}
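/*
 * Illustrative sketch, not part of this change: the per-flavor test applied
 * in the loop above, collapsed into a hypothetical helper.  Krb5p and Krb5i
 * are distinguished from plain Krb5 because ND_GSS is set for every GSS
 * flavor, and AUTH_SYS only matches when no GSS flag is set at all.
 */
static int
example_flavor_matches(int flavor, uint32_t ndflags)
{

	switch (flavor) {
	case RPCSEC_GSS_KRB5P:
		return ((ndflags & ND_GSSPRIVACY) != 0);
	case RPCSEC_GSS_KRB5I:
		return ((ndflags & ND_GSSINTEGRITY) != 0);
	case RPCSEC_GSS_KRB5:
		return ((ndflags & ND_GSS) != 0);
	case AUTH_SYS:
		return ((ndflags & ND_GSS) == 0);
	default:
		return (0);
	}
}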
/*
* Calculate a hash value for the fid in a file handle.
*/
uint32_t
nfsrv_hashfh(fhandle_t *fhp)
{
uint32_t hashval;
hashval = hash32_buf(&fhp->fh_fid, sizeof(struct fid), 0);
return (hashval);
}
/*
* Signal the userland master nfsd to backup the stable restart file.
*/
void
nfsrv_backupstable(void)
{
struct proc *procp;
if (nfsd_master_proc != NULL) {
procp = pfind(nfsd_master_pid);
/* Try to make sure it is the correct process. */
if (procp == nfsd_master_proc &&
procp->p_stats->p_start.tv_sec ==
nfsd_master_start.tv_sec &&
procp->p_stats->p_start.tv_usec ==
nfsd_master_start.tv_usec &&
strcmp(procp->p_comm, nfsd_master_comm) == 0)
- psignal(procp, SIGUSR2);
+ kern_psignal(procp, SIGUSR2);
else
nfsd_master_proc = NULL;
if (procp != NULL)
PROC_UNLOCK(procp);
}
}
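/*
 * Illustrative sketch, not part of this change: the userland master nfsd
 * that registered itself through NFSSVC_BACKUPSTABLE is expected to catch
 * the SIGUSR2 sent above and copy the stable restart file aside.  The
 * handler below is a hypothetical outline of that side; the actual backup
 * work would be done from the daemon's main loop.
 */
#include <signal.h>

static volatile sig_atomic_t backup_requested;

static void
example_backupstable_handler(int sig)
{

	(void)sig;
	backup_requested = 1;	/* main loop copies the stable restart file */
}

/* In the master nfsd's startup code (hypothetical):
 *	signal(SIGUSR2, example_backupstable_handler);
 */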
extern int (*nfsd_call_nfsd)(struct thread *, struct nfssvc_args *);
/*
* Called once to initialize data structures...
*/
static int
nfsd_modevent(module_t mod, int type, void *data)
{
int error = 0;
static int loaded = 0;
switch (type) {
case MOD_LOAD:
if (loaded)
goto out;
newnfs_portinit();
mtx_init(&nfs_cache_mutex, "nfs_cache_mutex", NULL, MTX_DEF);
mtx_init(&nfs_v4root_mutex, "nfs_v4root_mutex", NULL, MTX_DEF);
mtx_init(&nfsv4root_mnt.mnt_mtx, "struct mount mtx", NULL,
MTX_DEF);
lockinit(&nfsv4root_mnt.mnt_explock, PVFS, "explock", 0, 0);
nfsrvd_initcache();
nfsd_init();
NFSD_LOCK();
nfsrvd_init(0);
NFSD_UNLOCK();
nfsd_mntinit();
#ifdef VV_DISABLEDELEG
vn_deleg_ops.vndeleg_recall = nfsd_recalldelegation;
vn_deleg_ops.vndeleg_disable = nfsd_disabledelegation;
#endif
nfsd_call_servertimer = nfsrv_servertimer;
nfsd_call_nfsd = nfssvc_nfsd;
loaded = 1;
break;
case MOD_UNLOAD:
if (newnfs_numnfsd != 0) {
error = EBUSY;
break;
}
#ifdef VV_DISABLEDELEG
vn_deleg_ops.vndeleg_recall = NULL;
vn_deleg_ops.vndeleg_disable = NULL;
#endif
nfsd_call_servertimer = NULL;
nfsd_call_nfsd = NULL;
/* Clean out all NFSv4 state. */
nfsrv_throwawayallstate(curthread);
/* Clean the NFS server reply cache */
nfsrvd_cleancache();
/* Free up the krpc server pool. */
if (nfsrvd_pool != NULL)
svcpool_destroy(nfsrvd_pool);
/* and get rid of the locks */
mtx_destroy(&nfs_cache_mutex);
mtx_destroy(&nfs_v4root_mutex);
mtx_destroy(&nfsv4root_mnt.mnt_mtx);
lockdestroy(&nfsv4root_mnt.mnt_explock);
loaded = 0;
break;
default:
error = EOPNOTSUPP;
break;
}
out:
NFSEXITCODE(error);
return (error);
}
static moduledata_t nfsd_mod = {
"nfsd",
nfsd_modevent,
NULL,
};
DECLARE_MODULE(nfsd, nfsd_mod, SI_SUB_VFS, SI_ORDER_ANY);
/* So that loader and kldload(2) can find us, wherever we are.. */
MODULE_VERSION(nfsd, 1);
MODULE_DEPEND(nfsd, nfscommon, 1, 1, 1);
MODULE_DEPEND(nfsd, nfslock, 1, 1, 1);
MODULE_DEPEND(nfsd, nfslockd, 1, 1, 1);
MODULE_DEPEND(nfsd, krpc, 1, 1, 1);
MODULE_DEPEND(nfsd, nfssvc, 1, 1, 1);
Index: head/sys/fs/procfs/procfs_ctl.c
===================================================================
--- head/sys/fs/procfs/procfs_ctl.c (revision 225616)
+++ head/sys/fs/procfs/procfs_ctl.c (revision 225617)
@@ -1,358 +1,358 @@
/*-
* Copyright (c) 1993 Jan-Simon Pendry
* Copyright (c) 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)procfs_ctl.c 8.4 (Berkeley) 6/15/94
*
* From:
* $Id: procfs_ctl.c,v 1.51 2003/12/07 17:40:00 des Exp $
* $FreeBSD$
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/ptrace.h>
#include <sys/sbuf.h>
#include <sys/signalvar.h>
#include <sys/sx.h>
#include <sys/uio.h>
#include <fs/pseudofs/pseudofs.h>
#include <fs/procfs/procfs.h>
#include <vm/vm.h>
/*
* True iff process (p) is in trace wait state
* relative to process (curp)
*/
#define TRACE_WAIT_P(curp, p) \
(P_SHOULDSTOP(p) && \
(p)->p_pptr == (curp) && \
((p)->p_flag & P_TRACED))
#define PROCFS_CTL_ATTACH 1
#define PROCFS_CTL_DETACH 2
#define PROCFS_CTL_STEP 3
#define PROCFS_CTL_RUN 4
#define PROCFS_CTL_WAIT 5
struct namemap {
const char *nm_name;
int nm_val;
};
static struct namemap ctlnames[] = {
/* special /proc commands */
{ "attach", PROCFS_CTL_ATTACH },
{ "detach", PROCFS_CTL_DETACH },
{ "step", PROCFS_CTL_STEP },
{ "run", PROCFS_CTL_RUN },
{ "wait", PROCFS_CTL_WAIT },
{ 0 },
};
static struct namemap signames[] = {
/* regular signal names */
{ "hup", SIGHUP }, { "int", SIGINT },
{ "quit", SIGQUIT }, { "ill", SIGILL },
{ "trap", SIGTRAP }, { "abrt", SIGABRT },
{ "iot", SIGIOT }, { "emt", SIGEMT },
{ "fpe", SIGFPE }, { "kill", SIGKILL },
{ "bus", SIGBUS }, { "segv", SIGSEGV },
{ "sys", SIGSYS }, { "pipe", SIGPIPE },
{ "alrm", SIGALRM }, { "term", SIGTERM },
{ "urg", SIGURG }, { "stop", SIGSTOP },
{ "tstp", SIGTSTP }, { "cont", SIGCONT },
{ "chld", SIGCHLD }, { "ttin", SIGTTIN },
{ "ttou", SIGTTOU }, { "io", SIGIO },
{ "xcpu", SIGXCPU }, { "xfsz", SIGXFSZ },
{ "vtalrm", SIGVTALRM }, { "prof", SIGPROF },
{ "winch", SIGWINCH }, { "info", SIGINFO },
{ "usr1", SIGUSR1 }, { "usr2", SIGUSR2 },
{ 0 },
};
static int procfs_control(struct thread *td, struct proc *p, int op);
static int
procfs_control(struct thread *td, struct proc *p, int op)
{
int error = 0;
struct thread *temp;
/*
* Attach - attaches the target process for debugging
* by the calling process.
*/
if (op == PROCFS_CTL_ATTACH) {
sx_xlock(&proctree_lock);
PROC_LOCK(p);
if ((error = p_candebug(td, p)) != 0)
goto out;
if (p->p_flag & P_TRACED) {
error = EBUSY;
goto out;
}
/* Can't trace yourself! */
if (p->p_pid == td->td_proc->p_pid) {
error = EINVAL;
goto out;
}
/*
* Go ahead and set the trace flag.
* Save the old parent (it's reset in
* _DETACH, and also in kern_exit.c:wait4()).
* Reparent the process so that the tracing
* proc gets to see all the action.
* Stop the target.
*/
p->p_flag |= P_TRACED;
faultin(p);
p->p_xstat = 0; /* XXX ? */
if (p->p_pptr != td->td_proc) {
p->p_oppid = p->p_pptr->p_pid;
proc_reparent(p, td->td_proc);
}
- psignal(p, SIGSTOP);
+ kern_psignal(p, SIGSTOP);
out:
PROC_UNLOCK(p);
sx_xunlock(&proctree_lock);
return (error);
}
/*
* Authorization check: rely on normal debugging protection, except
* allow processes to disengage debugging on a process onto which
* they have previously attached, but no longer have permission to
* debug.
*/
PROC_LOCK(p);
if (op != PROCFS_CTL_DETACH &&
((error = p_candebug(td, p)))) {
PROC_UNLOCK(p);
return (error);
}
/*
* Target process must be stopped, owned by (td) and
* be set up for tracing (P_TRACED flag set).
* Allow DETACH to take place at any time for sanity.
* Allow WAIT any time, of course.
*/
switch (op) {
case PROCFS_CTL_DETACH:
case PROCFS_CTL_WAIT:
break;
default:
if (!TRACE_WAIT_P(td->td_proc, p)) {
PROC_UNLOCK(p);
return (EBUSY);
}
}
#ifdef FIX_SSTEP
/*
* do single-step fixup if needed
*/
FIX_SSTEP(FIRST_THREAD_IN_PROC(p));
#endif
/*
* Don't deliver any signal by default.
* To continue with a signal, just send
* the signal name to the ctl file
*/
p->p_xstat = 0;
switch (op) {
/*
* Detach. Cleans up the target process, reparent it if possible
* and set it running once more.
*/
case PROCFS_CTL_DETACH:
/* if not being traced, then this is a painless no-op */
if ((p->p_flag & P_TRACED) == 0) {
PROC_UNLOCK(p);
return (0);
}
/* not being traced any more */
p->p_flag &= ~(P_TRACED | P_STOPPED_TRACE);
/* remove pending SIGTRAP, else the process will die */
sigqueue_delete_proc(p, SIGTRAP);
FOREACH_THREAD_IN_PROC(p, temp)
temp->td_dbgflags &= ~TDB_SUSPEND;
PROC_UNLOCK(p);
/* give process back to original parent */
sx_xlock(&proctree_lock);
if (p->p_oppid != p->p_pptr->p_pid) {
struct proc *pp;
pp = pfind(p->p_oppid);
PROC_LOCK(p);
if (pp) {
PROC_UNLOCK(pp);
proc_reparent(p, pp);
}
} else
PROC_LOCK(p);
p->p_oppid = 0;
p->p_flag &= ~P_WAITED; /* XXX ? */
sx_xunlock(&proctree_lock);
wakeup(td->td_proc); /* XXX for CTL_WAIT below ? */
break;
/*
* Step. Let the target process execute a single instruction.
* What does it mean to single step a threaded program?
*/
case PROCFS_CTL_STEP:
error = proc_sstep(FIRST_THREAD_IN_PROC(p));
if (error) {
PROC_UNLOCK(p);
return (error);
}
break;
/*
* Run. Let the target process continue running until a breakpoint
* or some other trap.
*/
case PROCFS_CTL_RUN:
p->p_flag &= ~P_STOPPED_SIG; /* this uses SIGSTOP */
break;
/*
* Wait for the target process to stop.
* If the target is not being traced then just wait
* to enter
*/
case PROCFS_CTL_WAIT:
if (p->p_flag & P_TRACED) {
while (error == 0 &&
(P_SHOULDSTOP(p)) &&
(p->p_flag & P_TRACED) &&
(p->p_pptr == td->td_proc))
error = msleep(p, &p->p_mtx,
PWAIT|PCATCH, "procfsx", 0);
if (error == 0 && !TRACE_WAIT_P(td->td_proc, p))
error = EBUSY;
} else {
while (error == 0 && P_SHOULDSTOP(p))
error = msleep(p, &p->p_mtx,
PWAIT|PCATCH, "procfs", 0);
}
PROC_UNLOCK(p);
return (error);
default:
panic("procfs_control");
}
PROC_SLOCK(p);
thread_unsuspend(p); /* If it can run, let it do so. */
PROC_SUNLOCK(p);
PROC_UNLOCK(p);
return (0);
}
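/*
 * Illustrative sketch, not part of this change: the operations above are
 * driven from userland by writing command names to the target's
 * /proc/<pid>/ctl file (procfs must be mounted).  Writing a signal name
 * such as "hup" instead of a command delivers that signal, as handled by
 * procfs_doprocctl() below.  Error handling is omitted and the helper is
 * hypothetical.
 */
#include <sys/types.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static void
example_procfs_trace(pid_t pid)
{
	char path[64];
	int fd;

	snprintf(path, sizeof(path), "/proc/%d/ctl", (int)pid);
	fd = open(path, O_WRONLY);
	if (fd == -1)
		return;
	(void)write(fd, "attach", 6);	/* stop and reparent the target */
	(void)write(fd, "wait", 4);	/* block until it is stopped */
	(void)write(fd, "detach", 6);	/* hand it back and let it run */
	(void)close(fd);
}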
static struct namemap *
findname(struct namemap *nm, char *buf, int buflen)
{
for (; nm->nm_name; nm++)
if (bcmp(buf, nm->nm_name, buflen+1) == 0)
return (nm);
return (0);
}
int
procfs_doprocctl(PFS_FILL_ARGS)
{
int error;
struct namemap *nm;
if (uio == NULL || uio->uio_rw != UIO_WRITE)
return (EOPNOTSUPP);
/*
* Map signal names into signal generation
* or debug control. Unknown commands and/or signals
* return EOPNOTSUPP.
*
* Sending a signal while the process is being debugged
* also has the side effect of letting the target continue
* to run. There is no way to single-step a signal delivery.
*/
error = EOPNOTSUPP;
sbuf_trim(sb);
sbuf_finish(sb);
nm = findname(ctlnames, sbuf_data(sb), sbuf_len(sb));
if (nm) {
printf("procfs: got a %s command\n", sbuf_data(sb));
error = procfs_control(td, p, nm->nm_val);
} else {
nm = findname(signames, sbuf_data(sb), sbuf_len(sb));
if (nm) {
printf("procfs: got a sig%s\n", sbuf_data(sb));
PROC_LOCK(p);
if (TRACE_WAIT_P(td->td_proc, p)) {
p->p_xstat = nm->nm_val;
#ifdef FIX_SSTEP
FIX_SSTEP(FIRST_THREAD_IN_PROC(p));
#endif
p->p_flag &= ~P_STOPPED_SIG;
PROC_SLOCK(p);
thread_unsuspend(p);
PROC_SUNLOCK(p);
} else
- psignal(p, nm->nm_val);
+ kern_psignal(p, nm->nm_val);
PROC_UNLOCK(p);
error = 0;
}
}
return (error);
}
Index: head/sys/fs/procfs/procfs_ioctl.c
===================================================================
--- head/sys/fs/procfs/procfs_ioctl.c (revision 225616)
+++ head/sys/fs/procfs/procfs_ioctl.c (revision 225617)
@@ -1,220 +1,220 @@
/*-
* Copyright (c) 2001 Dag-Erling Coïdan Smørgrav
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* $FreeBSD$
*/
#include "opt_compat.h"
#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/pioctl.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/systm.h>
#include <fs/pseudofs/pseudofs.h>
#include <fs/procfs/procfs.h>
#ifdef COMPAT_FREEBSD32
struct procfs_status32 {
int state; /* Running, stopped, something else? */
int flags; /* Any flags */
unsigned int events; /* Events to stop on */
int why; /* What event, if any, proc stopped on */
unsigned int val; /* Any extra data */
};
#define PIOCWAIT32 _IOR('p', 4, struct procfs_status32)
#define PIOCSTATUS32 _IOR('p', 6, struct procfs_status32)
#endif
/*
* Process ioctls
*/
int
procfs_ioctl(PFS_IOCTL_ARGS)
{
struct procfs_status *ps;
#ifdef COMPAT_FREEBSD32
struct procfs_status32 *ps32;
#endif
int error, flags, sig;
#ifdef COMPAT_FREEBSD6
int ival;
#endif
KASSERT(p != NULL,
("%s() called without a process", __func__));
PROC_LOCK_ASSERT(p, MA_OWNED);
error = 0;
switch (cmd) {
#if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
case _IOC(IOC_IN, 'p', 1, 0):
#endif
#ifdef COMPAT_FREEBSD6
case _IO('p', 1):
ival = IOCPARM_IVAL(data);
data = &ival;
#endif
case PIOCBIS:
p->p_stops |= *(unsigned int *)data;
break;
#if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
case _IOC(IOC_IN, 'p', 2, 0):
#endif
#ifdef COMPAT_FREEBSD6
case _IO('p', 2):
ival = IOCPARM_IVAL(data);
data = &ival;
#endif
case PIOCBIC:
p->p_stops &= ~*(unsigned int *)data;
break;
#if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
case _IOC(IOC_IN, 'p', 3, 0):
#endif
#ifdef COMPAT_FREEBSD6
case _IO('p', 3):
ival = IOCPARM_IVAL(data);
data = &ival;
#endif
case PIOCSFL:
flags = *(unsigned int *)data;
if (flags & PF_ISUGID) {
/*
* XXXRW: Is this specific check required here, since
* p_candebug() should already implement it, or are other
* checks missing?
*/
error = priv_check(td, PRIV_DEBUG_SUGID);
if (error)
break;
}
p->p_pfsflags = flags;
break;
case PIOCGFL:
*(unsigned int *)data = p->p_pfsflags;
break;
case PIOCWAIT:
while (p->p_step == 0 && (p->p_flag & P_WEXIT) == 0) {
/* sleep until p stops */
_PHOLD(p);
error = msleep(&p->p_stype, &p->p_mtx,
PWAIT|PCATCH, "pioctl", 0);
_PRELE(p);
if (error != 0)
break;
}
/* fall through to PIOCSTATUS */
case PIOCSTATUS:
ps = (struct procfs_status *)data;
ps->state = (p->p_step == 0);
ps->flags = 0; /* nope */
ps->events = p->p_stops;
ps->why = p->p_step ? p->p_stype : 0;
ps->val = p->p_step ? p->p_xstat : 0;
break;
#ifdef COMPAT_FREEBSD32
case PIOCWAIT32:
while (p->p_step == 0 && (p->p_flag & P_WEXIT) == 0) {
/* sleep until p stops */
_PHOLD(p);
error = msleep(&p->p_stype, &p->p_mtx,
PWAIT|PCATCH, "pioctl", 0);
_PRELE(p);
if (error != 0)
break;
}
/* fall through to PIOCSTATUS32 */
case PIOCSTATUS32:
ps32 = (struct procfs_status32 *)data;
ps32->state = (p->p_step == 0);
ps32->flags = 0; /* nope */
ps32->events = p->p_stops;
ps32->why = p->p_step ? p->p_stype : 0;
ps32->val = p->p_step ? p->p_xstat : 0;
break;
#endif
#if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
case _IOC(IOC_IN, 'p', 5, 0):
#endif
#ifdef COMPAT_FREEBSD6
case _IO('p', 5):
ival = IOCPARM_IVAL(data);
data = &ival;
#endif
case PIOCCONT:
if (p->p_step == 0)
break;
sig = *(unsigned int *)data;
if (sig != 0 && !_SIG_VALID(sig)) {
error = EINVAL;
break;
}
#if 0
p->p_step = 0;
if (P_SHOULDSTOP(p)) {
p->p_xstat = sig;
p->p_flag &= ~(P_STOPPED_TRACE|P_STOPPED_SIG);
PROC_SLOCK(p);
thread_unsuspend(p);
PROC_SUNLOCK(p);
} else if (sig)
- psignal(p, sig);
+ kern_psignal(p, sig);
#else
if (sig)
- psignal(p, sig);
+ kern_psignal(p, sig);
p->p_step = 0;
wakeup(&p->p_step);
#endif
break;
default:
error = (ENOTTY);
}
return (error);
}
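/*
 * Illustrative sketch, not part of this change: a userland consumer of the
 * PIOC* ioctls handled above, using the flags from <sys/pioctl.h>.  The
 * descriptor is assumed to reference the target's procfs node
 * (traditionally /proc/<pid>/mem); opening it is not shown and the helper
 * is hypothetical.
 */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/pioctl.h>

static int
example_trace_stop_events(int procfd, struct procfs_status *ps)
{
	unsigned int events, sig;

	events = S_EXEC | S_EXIT;	/* stop on exec and exit */
	if (ioctl(procfd, PIOCBIS, &events) == -1)
		return (-1);
	if (ioctl(procfd, PIOCWAIT, ps) == -1)	/* sleep until the target stops */
		return (-1);
	sig = 0;			/* resume without delivering a signal */
	return (ioctl(procfd, PIOCCONT, &sig));
}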
/*
* Clean up on last close
*/
int
procfs_close(PFS_CLOSE_ARGS)
{
if (p != NULL && (p->p_pfsflags & PF_LINGER) == 0) {
PROC_LOCK_ASSERT(p, MA_OWNED);
p->p_pfsflags = 0;
p->p_stops = 0;
p->p_step = 0;
wakeup(&p->p_step);
}
return (0);
}
Index: head/sys/i386/i386/machdep.c
===================================================================
--- head/sys/i386/i386/machdep.c (revision 225616)
+++ head/sys/i386/i386/machdep.c (revision 225617)
@@ -1,3664 +1,3664 @@
/*-
* Copyright (c) 1992 Terrence R. Lambert.
* Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* William Jolitz.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_atalk.h"
#include "opt_compat.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_ipx.h"
#include "opt_isa.h"
#include "opt_kstack_pages.h"
#include "opt_maxmem.h"
#include "opt_mp_watchdog.h"
#include "opt_npx.h"
#include "opt_perfmon.h"
#include "opt_xbox.h"
#include "opt_kdtrace.h"
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/callout.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/reboot.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#ifdef SMP
#include <sys/smp.h>
#endif
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_param.h>
#ifdef DDB
#ifndef KDB
#error KDB must be enabled in order for DDB to work!
#endif
#include <ddb/ddb.h>
#include <ddb/db_sym.h>
#endif
#include <isa/rtc.h>
#include <net/netisr.h>
#include <machine/bootinfo.h>
#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/intr_machdep.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/mp_watchdog.h>
#include <machine/pc/bios.h>
#include <machine/pcb.h>
#include <machine/pcb_ext.h>
#include <machine/proc.h>
#include <machine/reg.h>
#include <machine/sigframe.h>
#include <machine/specialreg.h>
#include <machine/vm86.h>
#ifdef PERFMON
#include <machine/perfmon.h>
#endif
#ifdef SMP
#include <machine/smp.h>
#endif
#ifdef DEV_ISA
#include <x86/isa/icu.h>
#endif
#ifdef XBOX
#include <machine/xbox.h>
int arch_i386_is_xbox = 0;
uint32_t arch_i386_xbox_memsize = 0;
#endif
#ifdef XEN
/* XEN includes */
#include <machine/xen/xen-os.h>
#include <xen/hypervisor.h>
#include <machine/xen/xen-os.h>
#include <machine/xen/xenvar.h>
#include <machine/xen/xenfunc.h>
#include <xen/xen_intr.h>
void Xhypervisor_callback(void);
void failsafe_callback(void);
extern trap_info_t trap_table[];
struct proc_ldt default_proc_ldt;
extern int init_first;
int running_xen = 1;
extern unsigned long physfree;
#endif /* XEN */
/* Sanity check for __curthread() */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
extern void init386(int first);
extern void dblfault_handler(void);
extern void printcpuinfo(void); /* XXX header file */
extern void finishidentcpu(void);
extern void panicifcpuunsupported(void);
extern void initializecpu(void);
#define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
#define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
#define CPU_ENABLE_SSE
#endif
static void cpu_startup(void *);
static void fpstate_drop(struct thread *td);
static void get_fpcontext(struct thread *td, mcontext_t *mcp);
static int set_fpcontext(struct thread *td, const mcontext_t *mcp);
#ifdef CPU_ENABLE_SSE
static void set_fpregs_xmm(struct save87 *, struct savexmm *);
static void fill_fpregs_xmm(struct savexmm *, struct save87 *);
#endif /* CPU_ENABLE_SSE */
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
#ifdef DDB
extern vm_offset_t ksym_start, ksym_end;
#endif
/* Intel ICH registers */
#define ICH_PMBASE 0x400
#define ICH_SMI_EN ICH_PMBASE + 0x30
int _udatasel, _ucodesel;
u_int basemem;
int cold = 1;
#ifdef COMPAT_43
static void osendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask);
#endif
#ifdef COMPAT_FREEBSD4
static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask);
#endif
long Maxmem = 0;
long realmem = 0;
#ifdef PAE
FEATURE(pae, "Physical Address Extensions");
#endif
/*
* The number of PHYSMAP entries must be one less than the number of
* PHYSSEG entries because the PHYSMAP entry that spans the largest
* physical address that is accessible by ISA DMA is split into two
* PHYSSEG entries.
*/
#define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1))
vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];
/* must be 2 less so 0 0 can signal end of chunks */
#define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2)
#define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2)
struct kva_md_info kmi;
static struct trapframe proc0_tf;
struct pcpu __pcpu[MAXCPU];
struct mtx icu_lock;
static void
cpu_startup(dummy)
void *dummy;
{
uintmax_t memsize;
char *sysenv;
/*
* On MacBooks, we have to prevent the legacy USB circuit from
* generating an SMI# because this can cause several problems,
* namely: incorrect CPU frequency detection and failure to
* start the APs.
* We do this by disabling a bit in the SMI_EN (SMI Control and
* Enable register) of the Intel ICH LPC Interface Bridge.
*/
sysenv = getenv("smbios.system.product");
if (sysenv != NULL) {
if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
strncmp(sysenv, "MacBook3,1", 10) == 0 ||
strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
strncmp(sysenv, "Macmini1,1", 10) == 0) {
if (bootverbose)
printf("Disabling LEGACY_USB_EN bit on "
"Intel ICH.\n");
outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
}
freeenv(sysenv);
}
/*
* Good {morning,afternoon,evening,night}.
*/
startrtclock();
printcpuinfo();
panicifcpuunsupported();
#ifdef PERFMON
perfmon_init();
#endif
realmem = Maxmem;
/*
* Display physical memory if SMBIOS reports reasonable amount.
*/
memsize = 0;
sysenv = getenv("smbios.memory.enabled");
if (sysenv != NULL) {
memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
freeenv(sysenv);
}
if (memsize < ptoa((uintmax_t)cnt.v_free_count))
memsize = ptoa((uintmax_t)Maxmem);
printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20);
/*
* Display any holes after the first chunk of extended memory.
*/
if (bootverbose) {
int indx;
printf("Physical memory chunk(s):\n");
for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
vm_paddr_t size;
size = phys_avail[indx + 1] - phys_avail[indx];
printf(
"0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
(uintmax_t)phys_avail[indx],
(uintmax_t)phys_avail[indx + 1] - 1,
(uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
}
}
vm_ksubmap_init(&kmi);
printf("avail memory = %ju (%ju MB)\n",
ptoa((uintmax_t)cnt.v_free_count),
ptoa((uintmax_t)cnt.v_free_count) / 1048576);
/*
* Set up buffers, so they can be used to read disk labels.
*/
bufinit();
vm_pager_bufferinit();
#ifndef XEN
cpu_setregs();
#endif
}
/*
* Send an interrupt to process.
*
* Stack is set up to allow sigcode stored
* at top to call routine, followed by kcall
* to sigreturn routine below. After sigreturn
* resets the signal mask, the stack, and the
* frame pointer, it returns to the user
* specified pc, psl.
*/
#ifdef COMPAT_43
static void
osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
struct osigframe sf, *fp;
struct proc *p;
struct thread *td;
struct sigacts *psp;
struct trapframe *regs;
int sig;
int oonstack;
td = curthread;
p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
sig = ksi->ksi_signo;
psp = p->p_sigacts;
mtx_assert(&psp->ps_mtx, MA_OWNED);
regs = td->td_frame;
oonstack = sigonstack(regs->tf_esp);
/* Allocate space for the signal handler context. */
if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
SIGISMEMBER(psp->ps_sigonstack, sig)) {
fp = (struct osigframe *)(td->td_sigstk.ss_sp +
td->td_sigstk.ss_size - sizeof(struct osigframe));
#if defined(COMPAT_43)
td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
} else
fp = (struct osigframe *)regs->tf_esp - 1;
/* Translate the signal if appropriate. */
if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
/* Build the argument list for the signal handler. */
sf.sf_signum = sig;
sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc;
bzero(&sf.sf_siginfo, sizeof(sf.sf_siginfo));
if (SIGISMEMBER(psp->ps_siginfo, sig)) {
/* Signal handler installed with SA_SIGINFO. */
sf.sf_arg2 = (register_t)&fp->sf_siginfo;
sf.sf_siginfo.si_signo = sig;
sf.sf_siginfo.si_code = ksi->ksi_code;
sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher;
sf.sf_addr = 0;
} else {
/* Old FreeBSD-style arguments. */
sf.sf_arg2 = ksi->ksi_code;
sf.sf_addr = (register_t)ksi->ksi_addr;
sf.sf_ahu.sf_handler = catcher;
}
mtx_unlock(&psp->ps_mtx);
PROC_UNLOCK(p);
/* Save most if not all of trap frame. */
sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax;
sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx;
sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx;
sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx;
sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi;
sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi;
sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs;
sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds;
sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss;
sf.sf_siginfo.si_sc.sc_es = regs->tf_es;
sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs;
sf.sf_siginfo.si_sc.sc_gs = rgs();
sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp;
/* Build the signal context to be used by osigreturn(). */
sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0;
SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask);
sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp;
sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp;
sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip;
sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags;
sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno;
sf.sf_siginfo.si_sc.sc_err = regs->tf_err;
/*
* If we're a vm86 process, we want to save the segment registers.
* We also change eflags to be our emulated eflags, not the actual
* eflags.
*/
if (regs->tf_eflags & PSL_VM) {
/* XXX confusing names: `tf' isn't a trapframe; `regs' is. */
struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs;
sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs;
sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es;
sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds;
if (vm86->vm86_has_vme == 0)
sf.sf_siginfo.si_sc.sc_ps =
(tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
(vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
/* See sendsig() for comments. */
tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
}
/*
* Copy the sigframe out to the user's stack.
*/
if (copyout(&sf, fp, sizeof(*fp)) != 0) {
#ifdef DEBUG
printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
PROC_LOCK(p);
sigexit(td, SIGILL);
}
regs->tf_esp = (int)fp;
regs->tf_eip = PS_STRINGS - szosigcode;
regs->tf_eflags &= ~(PSL_T | PSL_D);
regs->tf_cs = _ucodesel;
regs->tf_ds = _udatasel;
regs->tf_es = _udatasel;
regs->tf_fs = _udatasel;
load_gs(_udatasel);
regs->tf_ss = _udatasel;
PROC_LOCK(p);
mtx_lock(&psp->ps_mtx);
}
#endif /* COMPAT_43 */
#ifdef COMPAT_FREEBSD4
static void
freebsd4_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
struct sigframe4 sf, *sfp;
struct proc *p;
struct thread *td;
struct sigacts *psp;
struct trapframe *regs;
int sig;
int oonstack;
td = curthread;
p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
sig = ksi->ksi_signo;
psp = p->p_sigacts;
mtx_assert(&psp->ps_mtx, MA_OWNED);
regs = td->td_frame;
oonstack = sigonstack(regs->tf_esp);
/* Save user context. */
bzero(&sf, sizeof(sf));
sf.sf_uc.uc_sigmask = *mask;
sf.sf_uc.uc_stack = td->td_sigstk;
sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
sf.sf_uc.uc_mcontext.mc_gs = rgs();
bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs));
bzero(sf.sf_uc.uc_mcontext.mc_fpregs,
sizeof(sf.sf_uc.uc_mcontext.mc_fpregs));
bzero(sf.sf_uc.uc_mcontext.__spare__,
sizeof(sf.sf_uc.uc_mcontext.__spare__));
bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
/* Allocate space for the signal handler context. */
if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
SIGISMEMBER(psp->ps_sigonstack, sig)) {
sfp = (struct sigframe4 *)(td->td_sigstk.ss_sp +
td->td_sigstk.ss_size - sizeof(struct sigframe4));
#if defined(COMPAT_43)
td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
} else
sfp = (struct sigframe4 *)regs->tf_esp - 1;
/* Translate the signal if appropriate. */
if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
/* Build the argument list for the signal handler. */
sf.sf_signum = sig;
sf.sf_ucontext = (register_t)&sfp->sf_uc;
bzero(&sf.sf_si, sizeof(sf.sf_si));
if (SIGISMEMBER(psp->ps_siginfo, sig)) {
/* Signal handler installed with SA_SIGINFO. */
sf.sf_siginfo = (register_t)&sfp->sf_si;
sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
/* Fill in POSIX parts */
sf.sf_si.si_signo = sig;
sf.sf_si.si_code = ksi->ksi_code;
sf.sf_si.si_addr = ksi->ksi_addr;
} else {
/* Old FreeBSD-style arguments. */
sf.sf_siginfo = ksi->ksi_code;
sf.sf_addr = (register_t)ksi->ksi_addr;
sf.sf_ahu.sf_handler = catcher;
}
mtx_unlock(&psp->ps_mtx);
PROC_UNLOCK(p);
/*
* If we're a vm86 process, we want to save the segment registers.
* We also change eflags to be our emulated eflags, not the actual
* eflags.
*/
if (regs->tf_eflags & PSL_VM) {
struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;
if (vm86->vm86_has_vme == 0)
sf.sf_uc.uc_mcontext.mc_eflags =
(tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
(vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
/*
* Clear PSL_NT to inhibit T_TSSFLT faults on return from
* syscalls made by the signal handler. This just avoids
* wasting time for our lazy fixup of such faults. PSL_NT
* does nothing in vm86 mode, but vm86 programs can set it
* almost legitimately in probes for old cpu types.
*/
tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
}
/*
* Copy the sigframe out to the user's stack.
*/
if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
#ifdef DEBUG
printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
PROC_LOCK(p);
sigexit(td, SIGILL);
}
regs->tf_esp = (int)sfp;
regs->tf_eip = PS_STRINGS - szfreebsd4_sigcode;
regs->tf_eflags &= ~(PSL_T | PSL_D);
regs->tf_cs = _ucodesel;
regs->tf_ds = _udatasel;
regs->tf_es = _udatasel;
regs->tf_fs = _udatasel;
regs->tf_ss = _udatasel;
PROC_LOCK(p);
mtx_lock(&psp->ps_mtx);
}
#endif /* COMPAT_FREEBSD4 */
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
struct sigframe sf, *sfp;
struct proc *p;
struct thread *td;
struct sigacts *psp;
char *sp;
struct trapframe *regs;
struct segment_descriptor *sdp;
int sig;
int oonstack;
td = curthread;
p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
sig = ksi->ksi_signo;
psp = p->p_sigacts;
mtx_assert(&psp->ps_mtx, MA_OWNED);
#ifdef COMPAT_FREEBSD4
if (SIGISMEMBER(psp->ps_freebsd4, sig)) {
freebsd4_sendsig(catcher, ksi, mask);
return;
}
#endif
#ifdef COMPAT_43
if (SIGISMEMBER(psp->ps_osigset, sig)) {
osendsig(catcher, ksi, mask);
return;
}
#endif
regs = td->td_frame;
oonstack = sigonstack(regs->tf_esp);
/* Save user context. */
bzero(&sf, sizeof(sf));
sf.sf_uc.uc_sigmask = *mask;
sf.sf_uc.uc_stack = td->td_sigstk;
sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
sf.sf_uc.uc_mcontext.mc_gs = rgs();
bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs));
sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
get_fpcontext(td, &sf.sf_uc.uc_mcontext);
fpstate_drop(td);
/*
* Unconditionally fill the fsbase and gsbase into the mcontext.
*/
sdp = &td->td_pcb->pcb_fsd;
sf.sf_uc.uc_mcontext.mc_fsbase = sdp->sd_hibase << 24 |
sdp->sd_lobase;
sdp = &td->td_pcb->pcb_gsd;
sf.sf_uc.uc_mcontext.mc_gsbase = sdp->sd_hibase << 24 |
sdp->sd_lobase;
bzero(sf.sf_uc.uc_mcontext.mc_spare1,
sizeof(sf.sf_uc.uc_mcontext.mc_spare1));
bzero(sf.sf_uc.uc_mcontext.mc_spare2,
sizeof(sf.sf_uc.uc_mcontext.mc_spare2));
bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
/* Allocate space for the signal handler context. */
if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
SIGISMEMBER(psp->ps_sigonstack, sig)) {
sp = td->td_sigstk.ss_sp +
td->td_sigstk.ss_size - sizeof(struct sigframe);
#if defined(COMPAT_43)
td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
} else
sp = (char *)regs->tf_esp - sizeof(struct sigframe);
/* Align to 16 bytes. */
sfp = (struct sigframe *)((unsigned int)sp & ~0xF);
/* Translate the signal if appropriate. */
if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
/* Build the argument list for the signal handler. */
sf.sf_signum = sig;
sf.sf_ucontext = (register_t)&sfp->sf_uc;
bzero(&sf.sf_si, sizeof(sf.sf_si));
if (SIGISMEMBER(psp->ps_siginfo, sig)) {
/* Signal handler installed with SA_SIGINFO. */
sf.sf_siginfo = (register_t)&sfp->sf_si;
sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
/* Fill in POSIX parts */
sf.sf_si = ksi->ksi_info;
sf.sf_si.si_signo = sig; /* maybe a translated signal */
} else {
/* Old FreeBSD-style arguments. */
sf.sf_siginfo = ksi->ksi_code;
sf.sf_addr = (register_t)ksi->ksi_addr;
sf.sf_ahu.sf_handler = catcher;
}
mtx_unlock(&psp->ps_mtx);
PROC_UNLOCK(p);
/*
* If we're a vm86 process, we want to save the segment registers.
* We also change eflags to be our emulated eflags, not the actual
* eflags.
*/
if (regs->tf_eflags & PSL_VM) {
struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;
if (vm86->vm86_has_vme == 0)
sf.sf_uc.uc_mcontext.mc_eflags =
(tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
(vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
/*
* Clear PSL_NT to inhibit T_TSSFLT faults on return from
* syscalls made by the signal handler. This just avoids
* wasting time for our lazy fixup of such faults. PSL_NT
* does nothing in vm86 mode, but vm86 programs can set it
* almost legitimately in probes for old cpu types.
*/
tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
}
/*
* Copy the sigframe out to the user's stack.
*/
if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
#ifdef DEBUG
printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
PROC_LOCK(p);
sigexit(td, SIGILL);
}
regs->tf_esp = (int)sfp;
regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
regs->tf_eflags &= ~(PSL_T | PSL_D);
regs->tf_cs = _ucodesel;
regs->tf_ds = _udatasel;
regs->tf_es = _udatasel;
regs->tf_fs = _udatasel;
regs->tf_ss = _udatasel;
PROC_LOCK(p);
mtx_lock(&psp->ps_mtx);
}
/*
* System call to cleanup state after a signal
* has been taken. Reset signal mask and
* stack state from context left by sendsig (above).
* Return to previous pc and psl as specified by
* context left by sendsig. Check carefully to
* make sure that the user has not modified the
* state to gain improper privileges.
*
* MPSAFE
*/
#ifdef COMPAT_43
int
osigreturn(td, uap)
struct thread *td;
struct osigreturn_args /* {
struct osigcontext *sigcntxp;
} */ *uap;
{
struct osigcontext sc;
struct trapframe *regs;
struct osigcontext *scp;
int eflags, error;
ksiginfo_t ksi;
regs = td->td_frame;
error = copyin(uap->sigcntxp, &sc, sizeof(sc));
if (error != 0)
return (error);
scp = &sc;
eflags = scp->sc_ps;
if (eflags & PSL_VM) {
struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
struct vm86_kernel *vm86;
/*
* if pcb_ext == 0 or vm86_inited == 0, the user hasn't
* set up the vm86 area, and we can't enter vm86 mode.
*/
if (td->td_pcb->pcb_ext == 0)
return (EINVAL);
vm86 = &td->td_pcb->pcb_ext->ext_vm86;
if (vm86->vm86_inited == 0)
return (EINVAL);
/* Go back to user mode if both flags are set. */
if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
ksiginfo_init_trap(&ksi);
ksi.ksi_signo = SIGBUS;
ksi.ksi_code = BUS_OBJERR;
ksi.ksi_addr = (void *)regs->tf_eip;
trapsignal(td, &ksi);
}
if (vm86->vm86_has_vme) {
eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
(eflags & VME_USERCHANGE) | PSL_VM;
} else {
vm86->vm86_eflags = eflags; /* save VIF, VIP */
eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
(eflags & VM_USERCHANGE) | PSL_VM;
}
tf->tf_vm86_ds = scp->sc_ds;
tf->tf_vm86_es = scp->sc_es;
tf->tf_vm86_fs = scp->sc_fs;
tf->tf_vm86_gs = scp->sc_gs;
tf->tf_ds = _udatasel;
tf->tf_es = _udatasel;
tf->tf_fs = _udatasel;
} else {
/*
* Don't allow users to change privileged or reserved flags.
*/
/*
* XXX do allow users to change the privileged flag PSL_RF.
* The cpu sets PSL_RF in tf_eflags for faults. Debuggers
* should sometimes set it there too. tf_eflags is kept in
* the signal context during signal handling and there is no
* other place to remember it, so the PSL_RF bit may be
* corrupted by the signal handler without us knowing.
* Corruption of the PSL_RF bit at worst causes one more or
* one less debugger trap, so allowing it is fairly harmless.
*/
if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
return (EINVAL);
}
/*
* Don't allow users to load a valid privileged %cs. Let the
* hardware check for invalid selectors, excess privilege in
* other selectors, invalid %eip's and invalid %esp's.
*/
if (!CS_SECURE(scp->sc_cs)) {
ksiginfo_init_trap(&ksi);
ksi.ksi_signo = SIGBUS;
ksi.ksi_code = BUS_OBJERR;
ksi.ksi_trapno = T_PROTFLT;
ksi.ksi_addr = (void *)regs->tf_eip;
trapsignal(td, &ksi);
return (EINVAL);
}
regs->tf_ds = scp->sc_ds;
regs->tf_es = scp->sc_es;
regs->tf_fs = scp->sc_fs;
}
/* Restore remaining registers. */
regs->tf_eax = scp->sc_eax;
regs->tf_ebx = scp->sc_ebx;
regs->tf_ecx = scp->sc_ecx;
regs->tf_edx = scp->sc_edx;
regs->tf_esi = scp->sc_esi;
regs->tf_edi = scp->sc_edi;
regs->tf_cs = scp->sc_cs;
regs->tf_ss = scp->sc_ss;
regs->tf_isp = scp->sc_isp;
regs->tf_ebp = scp->sc_fp;
regs->tf_esp = scp->sc_sp;
regs->tf_eip = scp->sc_pc;
regs->tf_eflags = eflags;
#if defined(COMPAT_43)
if (scp->sc_onstack & 1)
td->td_sigstk.ss_flags |= SS_ONSTACK;
else
td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif
kern_sigprocmask(td, SIG_SETMASK, (sigset_t *)&scp->sc_mask, NULL,
SIGPROCMASK_OLD);
return (EJUSTRETURN);
}
#endif /* COMPAT_43 */
#ifdef COMPAT_FREEBSD4
/*
* MPSAFE
*/
int
freebsd4_sigreturn(td, uap)
struct thread *td;
struct freebsd4_sigreturn_args /* {
const ucontext4 *sigcntxp;
} */ *uap;
{
struct ucontext4 uc;
struct trapframe *regs;
struct ucontext4 *ucp;
int cs, eflags, error;
ksiginfo_t ksi;
error = copyin(uap->sigcntxp, &uc, sizeof(uc));
if (error != 0)
return (error);
ucp = &uc;
regs = td->td_frame;
eflags = ucp->uc_mcontext.mc_eflags;
if (eflags & PSL_VM) {
struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
struct vm86_kernel *vm86;
/*
* if pcb_ext == 0 or vm86_inited == 0, the user hasn't
* set up the vm86 area, and we can't enter vm86 mode.
*/
if (td->td_pcb->pcb_ext == 0)
return (EINVAL);
vm86 = &td->td_pcb->pcb_ext->ext_vm86;
if (vm86->vm86_inited == 0)
return (EINVAL);
/* Go back to user mode if both flags are set. */
if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
ksiginfo_init_trap(&ksi);
ksi.ksi_signo = SIGBUS;
ksi.ksi_code = BUS_OBJERR;
ksi.ksi_addr = (void *)regs->tf_eip;
trapsignal(td, &ksi);
}
if (vm86->vm86_has_vme) {
eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
(eflags & VME_USERCHANGE) | PSL_VM;
} else {
vm86->vm86_eflags = eflags; /* save VIF, VIP */
eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
(eflags & VM_USERCHANGE) | PSL_VM;
}
bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe));
tf->tf_eflags = eflags;
tf->tf_vm86_ds = tf->tf_ds;
tf->tf_vm86_es = tf->tf_es;
tf->tf_vm86_fs = tf->tf_fs;
tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs;
tf->tf_ds = _udatasel;
tf->tf_es = _udatasel;
tf->tf_fs = _udatasel;
} else {
/*
* Don't allow users to change privileged or reserved flags.
*/
/*
* XXX do allow users to change the privileged flag PSL_RF.
* The cpu sets PSL_RF in tf_eflags for faults. Debuggers
* should sometimes set it there too. tf_eflags is kept in
* the signal context during signal handling and there is no
* other place to remember it, so the PSL_RF bit may be
* corrupted by the signal handler without us knowing.
* Corruption of the PSL_RF bit at worst causes one more or
* one less debugger trap, so allowing it is fairly harmless.
*/
if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
uprintf("pid %d (%s): freebsd4_sigreturn eflags = 0x%x\n",
td->td_proc->p_pid, td->td_name, eflags);
return (EINVAL);
}
/*
* Don't allow users to load a valid privileged %cs. Let the
* hardware check for invalid selectors, excess privilege in
* other selectors, invalid %eip's and invalid %esp's.
*/
cs = ucp->uc_mcontext.mc_cs;
if (!CS_SECURE(cs)) {
uprintf("pid %d (%s): freebsd4_sigreturn cs = 0x%x\n",
td->td_proc->p_pid, td->td_name, cs);
ksiginfo_init_trap(&ksi);
ksi.ksi_signo = SIGBUS;
ksi.ksi_code = BUS_OBJERR;
ksi.ksi_trapno = T_PROTFLT;
ksi.ksi_addr = (void *)regs->tf_eip;
trapsignal(td, &ksi);
return (EINVAL);
}
bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
}
#if defined(COMPAT_43)
if (ucp->uc_mcontext.mc_onstack & 1)
td->td_sigstk.ss_flags |= SS_ONSTACK;
else
td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif
kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
return (EJUSTRETURN);
}
#endif /* COMPAT_FREEBSD4 */
/*
* MPSAFE
*/
int
-sigreturn(td, uap)
+sys_sigreturn(td, uap)
struct thread *td;
struct sigreturn_args /* {
const struct __ucontext *sigcntxp;
} */ *uap;
{
ucontext_t uc;
struct trapframe *regs;
ucontext_t *ucp;
int cs, eflags, error, ret;
ksiginfo_t ksi;
error = copyin(uap->sigcntxp, &uc, sizeof(uc));
if (error != 0)
return (error);
ucp = &uc;
regs = td->td_frame;
eflags = ucp->uc_mcontext.mc_eflags;
if (eflags & PSL_VM) {
struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
struct vm86_kernel *vm86;
/*
* if pcb_ext == 0 or vm86_inited == 0, the user hasn't
* set up the vm86 area, and we can't enter vm86 mode.
*/
if (td->td_pcb->pcb_ext == 0)
return (EINVAL);
vm86 = &td->td_pcb->pcb_ext->ext_vm86;
if (vm86->vm86_inited == 0)
return (EINVAL);
/* Go back to user mode if both flags are set. */
if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
ksiginfo_init_trap(&ksi);
ksi.ksi_signo = SIGBUS;
ksi.ksi_code = BUS_OBJERR;
ksi.ksi_addr = (void *)regs->tf_eip;
trapsignal(td, &ksi);
}
if (vm86->vm86_has_vme) {
eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
(eflags & VME_USERCHANGE) | PSL_VM;
} else {
vm86->vm86_eflags = eflags; /* save VIF, VIP */
eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
(eflags & VM_USERCHANGE) | PSL_VM;
}
bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe));
tf->tf_eflags = eflags;
tf->tf_vm86_ds = tf->tf_ds;
tf->tf_vm86_es = tf->tf_es;
tf->tf_vm86_fs = tf->tf_fs;
tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs;
tf->tf_ds = _udatasel;
tf->tf_es = _udatasel;
tf->tf_fs = _udatasel;
} else {
/*
* Don't allow users to change privileged or reserved flags.
*/
/*
* XXX do allow users to change the privileged flag PSL_RF.
* The cpu sets PSL_RF in tf_eflags for faults. Debuggers
* should sometimes set it there too. tf_eflags is kept in
* the signal context during signal handling and there is no
* other place to remember it, so the PSL_RF bit may be
* corrupted by the signal handler without us knowing.
* Corruption of the PSL_RF bit at worst causes one more or
* one less debugger trap, so allowing it is fairly harmless.
*/
if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
uprintf("pid %d (%s): sigreturn eflags = 0x%x\n",
td->td_proc->p_pid, td->td_name, eflags);
return (EINVAL);
}
/*
* Don't allow users to load a valid privileged %cs. Let the
* hardware check for invalid selectors, excess privilege in
* other selectors, invalid %eip's and invalid %esp's.
*/
cs = ucp->uc_mcontext.mc_cs;
if (!CS_SECURE(cs)) {
uprintf("pid %d (%s): sigreturn cs = 0x%x\n",
td->td_proc->p_pid, td->td_name, cs);
ksiginfo_init_trap(&ksi);
ksi.ksi_signo = SIGBUS;
ksi.ksi_code = BUS_OBJERR;
ksi.ksi_trapno = T_PROTFLT;
ksi.ksi_addr = (void *)regs->tf_eip;
trapsignal(td, &ksi);
return (EINVAL);
}
ret = set_fpcontext(td, &ucp->uc_mcontext);
if (ret != 0)
return (ret);
bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
}
#if defined(COMPAT_43)
if (ucp->uc_mcontext.mc_onstack & 1)
td->td_sigstk.ss_flags |= SS_ONSTACK;
else
td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif
kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
return (EJUSTRETURN);
}
/*
* Machine dependent boot() routine
*
* I haven't seen anything to put here yet
* Possibly some stuff might be grafted back here from boot()
*/
void
cpu_boot(int howto)
{
}
/*
* Flush the D-cache for non-DMA I/O so that the I-cache can
* be made coherent later.
*/
void
cpu_flush_dcache(void *ptr, size_t len)
{
/* Not applicable */
}
/* Get current clock frequency for the given cpu id. */
int
cpu_est_clockrate(int cpu_id, uint64_t *rate)
{
uint64_t tsc1, tsc2;
uint64_t acnt, mcnt, perf;
register_t reg;
if (pcpu_find(cpu_id) == NULL || rate == NULL)
return (EINVAL);
if ((cpu_feature & CPUID_TSC) == 0)
return (EOPNOTSUPP);
/*
* If TSC is P-state invariant and APERF/MPERF MSRs do not exist,
* DELAY(9) based logic fails.
*/
if (tsc_is_invariant && !tsc_perf_stat)
return (EOPNOTSUPP);
#ifdef SMP
if (smp_cpus > 1) {
/* Schedule ourselves on the indicated cpu. */
thread_lock(curthread);
sched_bind(curthread, cpu_id);
thread_unlock(curthread);
}
#endif
/* Calibrate by measuring a short delay. */
reg = intr_disable();
if (tsc_is_invariant) {
wrmsr(MSR_MPERF, 0);
wrmsr(MSR_APERF, 0);
tsc1 = rdtsc();
DELAY(1000);
mcnt = rdmsr(MSR_MPERF);
acnt = rdmsr(MSR_APERF);
tsc2 = rdtsc();
intr_restore(reg);
perf = 1000 * acnt / mcnt;
*rate = (tsc2 - tsc1) * perf;
} else {
tsc1 = rdtsc();
DELAY(1000);
tsc2 = rdtsc();
intr_restore(reg);
*rate = (tsc2 - tsc1) * 1000;
}
#ifdef SMP
if (smp_cpus > 1) {
thread_lock(curthread);
sched_unbind(curthread);
thread_unlock(curthread);
}
#endif
return (0);
}
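/*
 * Illustrative sketch, not part of this change: the arithmetic used above.
 * Over a DELAY(1000) window the raw TSC delta is scaled to Hz, and on
 * invariant-TSC parts the APERF/MPERF ratio corrects for an effective
 * frequency that differs from the TSC rate.  The sample values in the
 * comments are made up for illustration only.
 */
static uint64_t
example_est_rate(uint64_t tsc_delta, uint64_t acnt, uint64_t mcnt)
{
	uint64_t perf;

	perf = 1000 * acnt / mcnt;	/* 1000 when acnt == mcnt */
	return (tsc_delta * perf);	/* e.g. 2200000 * 1000 = 2.2 GHz */
}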
#ifdef XEN
void
cpu_halt(void)
{
HYPERVISOR_shutdown(SHUTDOWN_poweroff);
}
int scheduler_running;
static void
cpu_idle_hlt(int busy)
{
scheduler_running = 1;
enable_intr();
idle_block();
}
#else
/*
* Shutdown the CPU as much as possible
*/
void
cpu_halt(void)
{
for (;;)
__asm__ ("hlt");
}
#endif
void (*cpu_idle_hook)(void) = NULL; /* ACPI idle hook. */
static int cpu_ident_amdc1e = 0; /* AMD C1E supported. */
static int idle_mwait = 1; /* Use MONITOR/MWAIT for short idle. */
TUNABLE_INT("machdep.idle_mwait", &idle_mwait);
SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RW, &idle_mwait,
0, "Use MONITOR/MWAIT for short idle");
#define STATE_RUNNING 0x0
#define STATE_MWAIT 0x1
#define STATE_SLEEPING 0x2
static void
cpu_idle_acpi(int busy)
{
int *state;
state = (int *)PCPU_PTR(monitorbuf);
*state = STATE_SLEEPING;
disable_intr();
if (sched_runnable())
enable_intr();
else if (cpu_idle_hook)
cpu_idle_hook();
else
__asm __volatile("sti; hlt");
*state = STATE_RUNNING;
}
#ifndef XEN
static void
cpu_idle_hlt(int busy)
{
int *state;
state = (int *)PCPU_PTR(monitorbuf);
*state = STATE_SLEEPING;
/*
* We must absolutely guarantee that hlt is the next instruction
* after sti or we introduce a timing window.
*/
disable_intr();
if (sched_runnable())
enable_intr();
else
__asm __volatile("sti; hlt");
*state = STATE_RUNNING;
}
#endif
/*
* MWAIT cpu power states. Lower 4 bits are sub-states.
*/
#define MWAIT_C0 0xf0
#define MWAIT_C1 0x00
#define MWAIT_C2 0x10
#define MWAIT_C3 0x20
#define MWAIT_C4 0x30
static void
cpu_idle_mwait(int busy)
{
int *state;
state = (int *)PCPU_PTR(monitorbuf);
*state = STATE_MWAIT;
if (!sched_runnable()) {
cpu_monitor(state, 0, 0);
if (*state == STATE_MWAIT)
cpu_mwait(0, MWAIT_C1);
}
*state = STATE_RUNNING;
}
static void
cpu_idle_spin(int busy)
{
int *state;
int i;
state = (int *)PCPU_PTR(monitorbuf);
*state = STATE_RUNNING;
for (i = 0; i < 1000; i++) {
if (sched_runnable())
return;
cpu_spinwait();
}
}
/*
* C1E renders the local APIC timer dead, so we disable it by
* reading the Interrupt Pending Message register and clearing
* both C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27).
*
* Reference:
* "BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh Processors"
* #32559 revision 3.00+
*/
#define MSR_AMDK8_IPM 0xc0010055
#define AMDK8_SMIONCMPHALT (1ULL << 27)
#define AMDK8_C1EONCMPHALT (1ULL << 28)
#define AMDK8_CMPHALT (AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT)
static void
cpu_probe_amdc1e(void)
{
/*
* Detect the presence of the C1E capability, mostly on recent
* dual-core (and later) K8 family processors.
*/
if (cpu_vendor_id == CPU_VENDOR_AMD &&
(cpu_id & 0x00000f00) == 0x00000f00 &&
(cpu_id & 0x0fff0000) >= 0x00040000) {
cpu_ident_amdc1e = 1;
}
}
#ifdef XEN
void (*cpu_idle_fn)(int) = cpu_idle_hlt;
#else
void (*cpu_idle_fn)(int) = cpu_idle_acpi;
#endif
void
cpu_idle(int busy)
{
#ifndef XEN
uint64_t msr;
#endif
CTR2(KTR_SPARE2, "cpu_idle(%d) at %d",
busy, curcpu);
#if defined(MP_WATCHDOG) && !defined(XEN)
ap_watchdog(PCPU_GET(cpuid));
#endif
#ifndef XEN
/* If we are busy - try to use fast methods. */
if (busy) {
if ((cpu_feature2 & CPUID2_MON) && idle_mwait) {
cpu_idle_mwait(busy);
goto out;
}
}
#endif
/* If we have time - switch timers into idle mode. */
if (!busy) {
critical_enter();
cpu_idleclock();
}
#ifndef XEN
/* Apply AMD APIC timer C1E workaround. */
if (cpu_ident_amdc1e && cpu_disable_deep_sleep) {
msr = rdmsr(MSR_AMDK8_IPM);
if (msr & AMDK8_CMPHALT)
wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT);
}
#endif
/* Call main idle method. */
cpu_idle_fn(busy);
/* Switch timers back into active mode. */
if (!busy) {
cpu_activeclock();
critical_exit();
}
#ifndef XEN
out:
#endif
CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done",
busy, curcpu);
}
int
cpu_idle_wakeup(int cpu)
{
struct pcpu *pcpu;
int *state;
pcpu = pcpu_find(cpu);
state = (int *)pcpu->pc_monitorbuf;
/*
* This doesn't need to be atomic since missing the race will
* simply result in unnecessary IPIs.
*/
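/*
 * A CPU parked in hlt/acpi idle (STATE_SLEEPING) can only be woken by
 * an IPI, so report failure; a CPU in mwait is woken by the store below.
 */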
if (*state == STATE_SLEEPING)
return (0);
if (*state == STATE_MWAIT)
*state = STATE_RUNNING;
return (1);
}
/*
* Ordered by speed/power consumption.
*/
struct {
void *id_fn;
char *id_name;
} idle_tbl[] = {
{ cpu_idle_spin, "spin" },
{ cpu_idle_mwait, "mwait" },
{ cpu_idle_hlt, "hlt" },
{ cpu_idle_acpi, "acpi" },
{ NULL, NULL }
};
static int
idle_sysctl_available(SYSCTL_HANDLER_ARGS)
{
char *avail, *p;
int error;
int i;
avail = malloc(256, M_TEMP, M_WAITOK);
p = avail;
for (i = 0; idle_tbl[i].id_name != NULL; i++) {
if (strstr(idle_tbl[i].id_name, "mwait") &&
(cpu_feature2 & CPUID2_MON) == 0)
continue;
if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
cpu_idle_hook == NULL)
continue;
p += sprintf(p, "%s%s", p != avail ? ", " : "",
idle_tbl[i].id_name);
}
error = sysctl_handle_string(oidp, avail, 0, req);
free(avail, M_TEMP);
return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD,
0, 0, idle_sysctl_available, "A", "list of available idle functions");
static int
idle_sysctl(SYSCTL_HANDLER_ARGS)
{
char buf[16];
int error;
char *p;
int i;
p = "unknown";
for (i = 0; idle_tbl[i].id_name != NULL; i++) {
if (idle_tbl[i].id_fn == cpu_idle_fn) {
p = idle_tbl[i].id_name;
break;
}
}
strncpy(buf, p, sizeof(buf));
error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
if (error != 0 || req->newptr == NULL)
return (error);
for (i = 0; idle_tbl[i].id_name != NULL; i++) {
if (strstr(idle_tbl[i].id_name, "mwait") &&
(cpu_feature2 & CPUID2_MON) == 0)
continue;
if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
cpu_idle_hook == NULL)
continue;
if (strcmp(idle_tbl[i].id_name, buf))
continue;
cpu_idle_fn = idle_tbl[i].id_fn;
return (0);
}
return (EINVAL);
}
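/*
 * machdep.idle selects the idle routine at run time, e.g.
 * "sysctl machdep.idle=hlt"; machdep.idle_available lists the methods
 * usable on this CPU.
 */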
SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0,
idle_sysctl, "A", "currently selected idle function");
uint64_t (*atomic_load_acq_64)(volatile uint64_t *) =
atomic_load_acq_64_i386;
void (*atomic_store_rel_64)(volatile uint64_t *, uint64_t) =
atomic_store_rel_64_i386;
static void
cpu_probe_cmpxchg8b(void)
{
if ((cpu_feature & CPUID_CX8) != 0 ||
cpu_vendor_id == CPU_VENDOR_RISE) {
atomic_load_acq_64 = atomic_load_acq_64_i586;
atomic_store_rel_64 = atomic_store_rel_64_i586;
}
}
/*
* Reset registers to default values on exec.
*/
void
exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
struct trapframe *regs = td->td_frame;
struct pcb *pcb = td->td_pcb;
/* Reset pcb->pcb_gs and %gs before possibly invalidating it. */
pcb->pcb_gs = _udatasel;
load_gs(_udatasel);
mtx_lock_spin(&dt_lock);
if (td->td_proc->p_md.md_ldt)
user_ldt_free(td);
else
mtx_unlock_spin(&dt_lock);
bzero((char *)regs, sizeof(struct trapframe));
regs->tf_eip = imgp->entry_addr;
regs->tf_esp = stack;
regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T);
regs->tf_ss = _udatasel;
regs->tf_ds = _udatasel;
regs->tf_es = _udatasel;
regs->tf_fs = _udatasel;
regs->tf_cs = _ucodesel;
/* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */
regs->tf_ebx = imgp->ps_strings;
/*
* Reset the hardware debug registers if they were in use.
* They won't have any meaning for the newly exec'd process.
*/
if (pcb->pcb_flags & PCB_DBREGS) {
pcb->pcb_dr0 = 0;
pcb->pcb_dr1 = 0;
pcb->pcb_dr2 = 0;
pcb->pcb_dr3 = 0;
pcb->pcb_dr6 = 0;
pcb->pcb_dr7 = 0;
if (pcb == PCPU_GET(curpcb)) {
/*
* Clear the debug registers on the running
* CPU, otherwise they will end up affecting
* the next process we switch to.
*/
reset_dbregs();
}
pcb->pcb_flags &= ~PCB_DBREGS;
}
/*
* Initialize the math emulator (if any) for the current process.
* Actually, just clear the bit that says that the emulator has
* been initialized. Initialization is delayed until the process
* traps to the emulator (if it is done at all) mainly because
* emulators don't provide an entry point for initialization.
*/
td->td_pcb->pcb_flags &= ~FP_SOFTFP;
pcb->pcb_initial_npxcw = __INITIAL_NPXCW__;
/*
* Drop the FP state if we hold it, so that the process gets a
* clean FP state if it uses the FPU again.
*/
fpstate_drop(td);
/*
* XXX - Linux emulator
* Make sure edx is 0x0 on entry. Linux binaries depend
* on it.
*/
td->td_retval[1] = 0;
}
void
cpu_setregs(void)
{
unsigned int cr0;
cr0 = rcr0();
/*
* CR0_MP, CR0_NE and CR0_TS are set for NPX (FPU) support:
*
* Prepare to trap all ESC (i.e., NPX) instructions and all WAIT
* instructions. We must set the CR0_MP bit and use the CR0_TS
* bit to control the trap, because setting the CR0_EM bit does
* not cause WAIT instructions to trap. It's important to trap
* WAIT instructions - otherwise the "wait" variants of no-wait
* control instructions would degenerate to the "no-wait" variants
* after FP context switches but work correctly otherwise. It's
* particularly important to trap WAITs when there is no NPX -
* otherwise the "wait" variants would always degenerate.
*
* Try setting CR0_NE to get correct error reporting on 486DX's.
* Setting it should fail or do nothing on lesser processors.
*/
cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
load_cr0(cr0);
load_gs(_udatasel);
}
u_long bootdev; /* not a struct cdev *- encoding is different */
SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev,
CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)");
/*
* Initialize 386 and configure to run kernel
*/
/*
* Initialize segments & interrupt table
*/
int _default_ldt;
#ifdef XEN
union descriptor *gdt;
union descriptor *ldt;
#else
union descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */
union descriptor ldt[NLDT]; /* local descriptor table */
#endif
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */
struct region_descriptor r_gdt, r_idt; /* table descriptors */
struct mtx dt_lock; /* lock for GDT and LDT */
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
extern int has_f00f_bug;
#endif
static struct i386tss dblfault_tss;
static char dblfault_stack[PAGE_SIZE];
extern vm_offset_t proc0kstack;
/*
* software prototypes -- in more palatable form.
*
* GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret
* GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it)
*/
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL 0 Null Descriptor */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = SEL_KPL,
.ssd_p = 0,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GPRIV_SEL 1 SMP Per-Processor Private Data Descriptor */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_KPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GUFS_SEL 2 %fs Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GUGS_SEL 3 %gs Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GCODE_SEL 4 Code Descriptor for kernel */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = SEL_KPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GDATA_SEL 5 Data Descriptor for kernel */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_KPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GUCODE_SEL 6 Code Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GUDATA_SEL 7 Data Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */
{ .ssd_base = 0x400,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_KPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
#ifndef XEN
/* GPROC0_SEL 9 Proc 0 Tss Descriptor */
{
.ssd_base = 0x0,
.ssd_limit = sizeof(struct i386tss)-1,
.ssd_type = SDT_SYS386TSS,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GLDT_SEL 10 LDT Descriptor */
{ .ssd_base = (int) ldt,
.ssd_limit = sizeof(ldt)-1,
.ssd_type = SDT_SYSLDT,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GUSERLDT_SEL 11 User LDT Descriptor per process */
{ .ssd_base = (int) ldt,
.ssd_limit = (512 * sizeof(union descriptor)-1),
.ssd_type = SDT_SYSLDT,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GPANIC_SEL 12 Panic Tss Descriptor */
{ .ssd_base = (int) &dblfault_tss,
.ssd_limit = sizeof(struct i386tss)-1,
.ssd_type = SDT_SYS386TSS,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */
{ .ssd_base = 0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 1 },
/* GBIOSCODE16_SEL 14 BIOS 32-bit interface (16bit Code) */
{ .ssd_base = 0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 1 },
/* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */
{ .ssd_base = 0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GBIOSUTIL_SEL 16 BIOS 16-bit interface (Utility) */
{ .ssd_base = 0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 1 },
/* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */
{ .ssd_base = 0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 1 },
/* GNDIS_SEL 18 NDIS Descriptor */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
#endif /* !XEN */
};
static struct soft_segment_descriptor ldt_segs[] = {
/* Null Descriptor - overwritten by call gate */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* Null Descriptor - overwritten by call gate */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* Null Descriptor - overwritten by call gate */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* Code Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* Null Descriptor - overwritten by call gate */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* Data Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
};
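/*
 * Install a handler in the IDT: record the handler offset and code
 * segment selector in the gate, set its type and privilege level, and
 * mark the descriptor present.
 */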
void
setidt(idx, func, typ, dpl, selec)
int idx;
inthand_t *func;
int typ;
int dpl;
int selec;
{
struct gate_descriptor *ip;
ip = idt + idx;
ip->gd_looffset = (int)func;
ip->gd_selector = selec;
ip->gd_stkcpy = 0;
ip->gd_xx = 0;
ip->gd_type = typ;
ip->gd_dpl = dpl;
ip->gd_p = 1;
ip->gd_hioffset = ((int)func)>>16 ;
}
extern inthand_t
IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
IDTVEC(xmm),
#ifdef KDTRACE_HOOKS
IDTVEC(dtrace_ret),
#endif
IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall);
#ifdef DDB
/*
* Display the index and function name of any IDT entries that don't use
* the default 'rsvd' entry point.
*/
DB_SHOW_COMMAND(idt, db_show_idt)
{
struct gate_descriptor *ip;
int idx;
uintptr_t func;
ip = idt;
for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
func = (ip->gd_hioffset << 16 | ip->gd_looffset);
if (func != (uintptr_t)&IDTVEC(rsvd)) {
db_printf("%3d\t", idx);
db_printsym(func, DB_STGY_PROC);
db_printf("\n");
}
ip++;
}
}
/* Show privileged registers. */
DB_SHOW_COMMAND(sysregs, db_show_sysregs)
{
uint64_t idtr, gdtr;
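/*
 * ridt()/rgdt() return the 6-byte pseudo-descriptor as a 64-bit value:
 * the 16-bit limit in the low word and the 32-bit linear base above it.
 */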
idtr = ridt();
db_printf("idtr\t0x%08x/%04x\n",
(u_int)(idtr >> 16), (u_int)idtr & 0xffff);
gdtr = rgdt();
db_printf("gdtr\t0x%08x/%04x\n",
(u_int)(gdtr >> 16), (u_int)gdtr & 0xffff);
db_printf("ldtr\t0x%04x\n", rldt());
db_printf("tr\t0x%04x\n", rtr());
db_printf("cr0\t0x%08x\n", rcr0());
db_printf("cr2\t0x%08x\n", rcr2());
db_printf("cr3\t0x%08x\n", rcr3());
db_printf("cr4\t0x%08x\n", rcr4());
}
#endif
void
sdtossd(sd, ssd)
struct segment_descriptor *sd;
struct soft_segment_descriptor *ssd;
{
ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
ssd->ssd_type = sd->sd_type;
ssd->ssd_dpl = sd->sd_dpl;
ssd->ssd_p = sd->sd_p;
ssd->ssd_def32 = sd->sd_def32;
ssd->ssd_gran = sd->sd_gran;
}
#ifndef XEN
static int
add_smap_entry(struct bios_smap *smap, vm_paddr_t *physmap, int *physmap_idxp)
{
int i, insert_idx, physmap_idx;
physmap_idx = *physmap_idxp;
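/*
 * physmap[] is an array of [start, end) physical address pairs;
 * physmap_idx is the index of the start entry of the last pair in use.
 */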
if (boothowto & RB_VERBOSE)
printf("SMAP type=%02x base=%016llx len=%016llx\n",
smap->type, smap->base, smap->length);
if (smap->type != SMAP_TYPE_MEMORY)
return (1);
if (smap->length == 0)
return (1);
#ifndef PAE
if (smap->base > 0xffffffff) {
printf("%uK of memory above 4GB ignored\n",
(u_int)(smap->length / 1024));
return (1);
}
#endif
/*
* Find insertion point while checking for overlap. Start off by
* assuming the new entry will be added to the end.
*/
insert_idx = physmap_idx + 2;
for (i = 0; i <= physmap_idx; i += 2) {
if (smap->base < physmap[i + 1]) {
if (smap->base + smap->length <= physmap[i]) {
insert_idx = i;
break;
}
if (boothowto & RB_VERBOSE)
printf(
"Overlapping memory regions, ignoring second region\n");
return (1);
}
}
/* See if we can prepend to the next entry. */
if (insert_idx <= physmap_idx &&
smap->base + smap->length == physmap[insert_idx]) {
physmap[insert_idx] = smap->base;
return (1);
}
/* See if we can append to the previous entry. */
if (insert_idx > 0 && smap->base == physmap[insert_idx - 1]) {
physmap[insert_idx - 1] += smap->length;
return (1);
}
physmap_idx += 2;
*physmap_idxp = physmap_idx;
if (physmap_idx == PHYSMAP_SIZE) {
printf(
"Too many segments in the physical address map, giving up\n");
return (0);
}
/*
* Move the last 'N' entries down to make room for the new
* entry if needed.
*/
for (i = physmap_idx; i > insert_idx; i -= 2) {
physmap[i] = physmap[i - 2];
physmap[i + 1] = physmap[i - 1];
}
/* Insert the new entry. */
physmap[insert_idx] = smap->base;
physmap[insert_idx + 1] = smap->base + smap->length;
return (1);
}
static void
basemem_setup(void)
{
vm_paddr_t pa;
pt_entry_t *pte;
int i;
if (basemem > 640) {
printf("Preposterous BIOS basemem of %uK, truncating to 640K\n",
basemem);
basemem = 640;
}
/*
* XXX if biosbasemem is now < 640, there is a `hole'
* between the end of base memory and the start of
* ISA memory. The hole may be empty or it may
* contain BIOS code or data. Map it read/write so
* that the BIOS can write to it. (Memory from 0 to
* the physical end of the kernel is mapped read-only
* to begin with and then parts of it are remapped.
* The parts that aren't remapped form holes that
* remain read-only and are unused by the kernel.
* The base memory area is below the physical end of
* the kernel and right now forms a read-only hole.
* The part of it from PAGE_SIZE to
* (trunc_page(biosbasemem * 1024) - 1) will be
* remapped and used by the kernel later.)
*
* This code is similar to the code used in
* pmap_mapdev, but since no memory needs to be
* allocated we simply change the mapping.
*/
for (pa = trunc_page(basemem * 1024);
pa < ISA_HOLE_START; pa += PAGE_SIZE)
pmap_kenter(KERNBASE + pa, pa);
/*
* Map pages between basemem and ISA_HOLE_START, if any, r/w into
* the vm86 page table so that vm86 can scribble on them using
* the vm86 map too. XXX: why 2 ways for this and only 1 way for
* page 0, at least as initialized here?
*/
pte = (pt_entry_t *)vm86paddr;
for (i = basemem / 4; i < 160; i++)
pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U;
}
#endif
/*
* Populate the (physmap) array with base/bound pairs describing the
* available physical memory in the system, then test this memory and
* build the phys_avail array describing the actually-available memory.
*
* If we cannot accurately determine the physical memory map, then use
* the value from the 0xE801 call, and failing that, the RTC.
*
* Total memory size may be set by the kernel environment variable
* hw.physmem or the compile-time define MAXMEM.
*
* XXX first should be vm_paddr_t.
*/
static void
getmemsize(int first)
{
int has_smap, off, physmap_idx, pa_indx, da_indx;
u_long physmem_tunable, memtest;
vm_paddr_t physmap[PHYSMAP_SIZE];
pt_entry_t *pte;
quad_t dcons_addr, dcons_size;
#ifndef XEN
int hasbrokenint12, i;
u_int extmem;
struct vm86frame vmf;
struct vm86context vmc;
vm_paddr_t pa;
struct bios_smap *smap, *smapbase, *smapend;
u_int32_t smapsize;
caddr_t kmdp;
#endif
has_smap = 0;
#if defined(XEN)
Maxmem = xen_start_info->nr_pages - init_first;
physmem = Maxmem;
basemem = 0;
physmap[0] = init_first << PAGE_SHIFT;
physmap[1] = ptoa(Maxmem) - round_page(msgbufsize);
physmap_idx = 0;
#else
#ifdef XBOX
if (arch_i386_is_xbox) {
/*
* We queried the memory size before, so chop off 4MB for
* the framebuffer and inform the OS of this.
*/
physmap[0] = 0;
physmap[1] = (arch_i386_xbox_memsize * 1024 * 1024) - XBOX_FB_SIZE;
physmap_idx = 0;
goto physmap_done;
}
#endif
bzero(&vmf, sizeof(vmf));
bzero(physmap, sizeof(physmap));
basemem = 0;
/*
* Check if the loader supplied an SMAP memory map. If so,
* use that and do not make any VM86 calls.
*/
physmap_idx = 0;
smapbase = NULL;
kmdp = preload_search_by_type("elf kernel");
if (kmdp == NULL)
kmdp = preload_search_by_type("elf32 kernel");
if (kmdp != NULL)
smapbase = (struct bios_smap *)preload_search_info(kmdp,
MODINFO_METADATA | MODINFOMD_SMAP);
if (smapbase != NULL) {
/*
* subr_module.c says:
* "Consumer may safely assume that size value precedes data."
* ie: an int32_t immediately precedes SMAP.
*/
smapsize = *((u_int32_t *)smapbase - 1);
smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
has_smap = 1;
for (smap = smapbase; smap < smapend; smap++)
if (!add_smap_entry(smap, physmap, &physmap_idx))
break;
goto have_smap;
}
/*
* Some newer BIOSes have a broken INT 12H implementation
* which causes a kernel panic immediately. In this case, we
* need to use the SMAP to determine the base memory size.
*/
hasbrokenint12 = 0;
TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12);
if (hasbrokenint12 == 0) {
/* Use INT12 to determine base memory size. */
vm86_intcall(0x12, &vmf);
basemem = vmf.vmf_ax;
basemem_setup();
}
/*
* Fetch the memory map with INT 15:E820. Map page 1 R/W into
* the kernel page table so we can use it as a buffer. The
* kernel will unmap this page later.
*/
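/*
 * INT 15h/E820 convention: %eax = 0xE820, %edx = 'SMAP', %ecx = buffer
 * size, %es:%di -> buffer; the BIOS returns a continuation value in
 * %ebx, which is zero once the whole map has been returned.
 */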
pmap_kenter(KERNBASE + (1 << PAGE_SHIFT), 1 << PAGE_SHIFT);
vmc.npages = 0;
smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT));
vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di);
vmf.vmf_ebx = 0;
do {
vmf.vmf_eax = 0xE820;
vmf.vmf_edx = SMAP_SIG;
vmf.vmf_ecx = sizeof(struct bios_smap);
i = vm86_datacall(0x15, &vmf, &vmc);
if (i || vmf.vmf_eax != SMAP_SIG)
break;
has_smap = 1;
if (!add_smap_entry(smap, physmap, &physmap_idx))
break;
} while (vmf.vmf_ebx != 0);
have_smap:
/*
* If we didn't fetch the "base memory" size from INT12,
* figure it out from the SMAP (or just guess).
*/
if (basemem == 0) {
for (i = 0; i <= physmap_idx; i += 2) {
if (physmap[i] == 0x00000000) {
basemem = physmap[i + 1] / 1024;
break;
}
}
/* XXX: If we couldn't find basemem from SMAP, just guess. */
if (basemem == 0)
basemem = 640;
basemem_setup();
}
if (physmap[1] != 0)
goto physmap_done;
/*
* If we failed to find an SMAP, figure out the extended
* memory size. We will then build a simple memory map with
* two segments, one for "base memory" and the second for
* "extended memory". Note that "extended memory" starts at a
* physical address of 1MB and that both basemem and extmem
* are in units of 1KB.
*
* First, try to fetch the extended memory size via INT 15:E801.
*/
vmf.vmf_ax = 0xE801;
if (vm86_intcall(0x15, &vmf) == 0) {
extmem = vmf.vmf_cx + vmf.vmf_dx * 64;
} else {
/*
* If INT15:E801 fails, this is our last ditch effort
* to determine the extended memory size. Currently
* we prefer the RTC value over INT15:88.
*/
#if 0
vmf.vmf_ah = 0x88;
vm86_intcall(0x15, &vmf);
extmem = vmf.vmf_ax;
#else
extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8);
#endif
}
/*
* Special hack for chipsets that still remap the 384k hole when
* there's 16MB of memory - this really confuses people that
* are trying to use bus mastering ISA controllers with the
* "16MB limit"; they only have 16MB, but the remapping puts
* them beyond the limit.
*
* If extended memory is between 15-16MB (16-17MB phys address range),
* chop it to 15MB.
*/
if ((extmem > 15 * 1024) && (extmem < 16 * 1024))
extmem = 15 * 1024;
physmap[0] = 0;
physmap[1] = basemem * 1024;
physmap_idx = 2;
physmap[physmap_idx] = 0x100000;
physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024;
physmap_done:
#endif
/*
* Now, physmap contains a map of physical memory.
*/
#ifdef SMP
/* make hole for AP bootstrap code */
physmap[1] = mp_bootaddress(physmap[1]);
#endif
/*
* Maxmem isn't the "maximum memory", it's one larger than the
* highest page of the physical address space. It should be
* called something like "Maxphyspage". We may adjust this
* based on ``hw.physmem'' and the results of the memory test.
*/
Maxmem = atop(physmap[physmap_idx + 1]);
#ifdef MAXMEM
Maxmem = MAXMEM / 4;
#endif
if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
Maxmem = atop(physmem_tunable);
/*
* If we have an SMAP, don't allow MAXMEM or hw.physmem to extend
* the amount of memory in the system.
*/
if (has_smap && Maxmem > atop(physmap[physmap_idx + 1]))
Maxmem = atop(physmap[physmap_idx + 1]);
/*
* By default keep the memtest enabled. Use a general name so that
* one could eventually do more with the code than just disable it.
*/
memtest = 1;
TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
if (atop(physmap[physmap_idx + 1]) != Maxmem &&
(boothowto & RB_VERBOSE))
printf("Physical memory use set to %ldK\n", Maxmem * 4);
/*
* If Maxmem has been increased beyond what the system has detected,
* extend the last memory segment to the new limit.
*/
if (atop(physmap[physmap_idx + 1]) < Maxmem)
physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem);
/* call pmap initialization to make new kernel address space */
pmap_bootstrap(first);
/*
* Size up each available chunk of physical memory.
*/
physmap[0] = PAGE_SIZE; /* mask off page 0 */
pa_indx = 0;
da_indx = 1;
phys_avail[pa_indx++] = physmap[0];
phys_avail[pa_indx] = physmap[0];
dump_avail[da_indx] = physmap[0];
pte = CMAP1;
/*
* Get dcons buffer address
*/
if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
getenv_quad("dcons.size", &dcons_size) == 0)
dcons_addr = 0;
#ifndef XEN
/*
* physmap is in bytes, so when converting to page boundaries,
* round up the start address and round down the end address.
*/
for (i = 0; i <= physmap_idx; i += 2) {
vm_paddr_t end;
end = ptoa((vm_paddr_t)Maxmem);
if (physmap[i + 1] < end)
end = trunc_page(physmap[i + 1]);
for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
int tmp, page_bad, full;
int *ptr = (int *)CADDR1;
full = FALSE;
/*
* block out kernel memory as not available.
*/
if (pa >= KERNLOAD && pa < first)
goto do_dump_avail;
/*
* block out dcons buffer
*/
if (dcons_addr > 0
&& pa >= trunc_page(dcons_addr)
&& pa < dcons_addr + dcons_size)
goto do_dump_avail;
page_bad = FALSE;
if (memtest == 0)
goto skip_memtest;
/*
* map the page into the kernel: valid, read/write, non-cacheable
*/
*pte = pa | PG_V | PG_RW | PG_N;
invltlb();
tmp = *(int *)ptr;
/*
* Test for alternating 1's and 0's
*/
*(volatile int *)ptr = 0xaaaaaaaa;
if (*(volatile int *)ptr != 0xaaaaaaaa)
page_bad = TRUE;
/*
* Test for alternating 0's and 1's
*/
*(volatile int *)ptr = 0x55555555;
if (*(volatile int *)ptr != 0x55555555)
page_bad = TRUE;
/*
* Test for all 1's
*/
*(volatile int *)ptr = 0xffffffff;
if (*(volatile int *)ptr != 0xffffffff)
page_bad = TRUE;
/*
* Test for all 0's
*/
*(volatile int *)ptr = 0x0;
if (*(volatile int *)ptr != 0x0)
page_bad = TRUE;
/*
* Restore original value.
*/
*(int *)ptr = tmp;
skip_memtest:
/*
* Adjust array of valid/good pages.
*/
if (page_bad == TRUE)
continue;
/*
* If this good page is a continuation of the
* previous set of good pages, then just increase
* the end pointer. Otherwise start a new chunk.
* Note that "end" points one higher than end,
* making the range >= start and < end.
* If we're also doing a speculative memory
* test and we are at or past the end, bump up Maxmem
* so that we keep going. The first bad page
* will terminate the loop.
*/
if (phys_avail[pa_indx] == pa) {
phys_avail[pa_indx] += PAGE_SIZE;
} else {
pa_indx++;
if (pa_indx == PHYS_AVAIL_ARRAY_END) {
printf(
"Too many holes in the physical address space, giving up\n");
pa_indx--;
full = TRUE;
goto do_dump_avail;
}
phys_avail[pa_indx++] = pa; /* start */
phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
}
physmem++;
do_dump_avail:
if (dump_avail[da_indx] == pa) {
dump_avail[da_indx] += PAGE_SIZE;
} else {
da_indx++;
if (da_indx == DUMP_AVAIL_ARRAY_END) {
da_indx--;
goto do_next;
}
dump_avail[da_indx++] = pa; /* start */
dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
}
do_next:
if (full)
break;
}
}
*pte = 0;
invltlb();
#else
phys_avail[0] = physfree;
phys_avail[1] = xen_start_info->nr_pages*PAGE_SIZE;
dump_avail[0] = 0;
dump_avail[1] = xen_start_info->nr_pages*PAGE_SIZE;
#endif
/*
* XXX
* The last chunk must contain at least one page plus the message
* buffer to avoid complicating other code (message buffer address
* calculation, etc.).
*/
while (phys_avail[pa_indx - 1] + PAGE_SIZE +
round_page(msgbufsize) >= phys_avail[pa_indx]) {
physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
phys_avail[pa_indx--] = 0;
phys_avail[pa_indx--] = 0;
}
Maxmem = atop(phys_avail[pa_indx]);
/* Trim off space for the message buffer. */
phys_avail[pa_indx] -= round_page(msgbufsize);
/* Map the message buffer. */
for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE)
pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] +
off);
PT_UPDATES_FLUSH();
}
#ifdef XEN
#define MTOPSIZE (1<<(14 + PAGE_SHIFT))
void
init386(first)
int first;
{
unsigned long gdtmachpfn;
int error, gsel_tss, metadata_missing, x, pa;
size_t kstack0_sz;
struct pcpu *pc;
struct callback_register event = {
.type = CALLBACKTYPE_event,
.address = {GSEL(GCODE_SEL, SEL_KPL), (unsigned long)Xhypervisor_callback },
};
struct callback_register failsafe = {
.type = CALLBACKTYPE_failsafe,
.address = {GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback },
};
thread0.td_kstack = proc0kstack;
thread0.td_kstack_pages = KSTACK_PAGES;
kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
thread0.td_pcb = (struct pcb *)(thread0.td_kstack + kstack0_sz) - 1;
/*
* This may be done better later if it gets more high level
* components in it. If so just link td->td_proc here.
*/
proc_linkup0(&proc0, &thread0);
metadata_missing = 0;
if (xen_start_info->mod_start) {
preload_metadata = (caddr_t)xen_start_info->mod_start;
preload_bootstrap_relocate(KERNBASE);
} else {
metadata_missing = 1;
}
if (envmode == 1)
kern_envp = static_env;
else if ((caddr_t)xen_start_info->cmd_line)
kern_envp = xen_setbootenv((caddr_t)xen_start_info->cmd_line);
boothowto |= xen_boothowto(kern_envp);
/* Init basic tunables, hz etc */
init_param1();
/*
* XEN occupies a portion of the upper virtual address space.
* At its base it manages an array mapping machine page frames
* to physical page frames - hence we need to be able to
* access 4GB - (64MB - 4MB + 64k)
*/
gdt_segs[GPRIV_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
gdt_segs[GUFS_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
gdt_segs[GUGS_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
gdt_segs[GCODE_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
gdt_segs[GDATA_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
gdt_segs[GUCODE_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
gdt_segs[GUDATA_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
gdt_segs[GBIOSLOWMEM_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
pc = &__pcpu[0];
gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;
PT_SET_MA(gdt, xpmap_ptom(VTOP(gdt)) | PG_V | PG_RW);
bzero(gdt, PAGE_SIZE);
for (x = 0; x < NGDT; x++)
ssdtosd(&gdt_segs[x], &gdt[x].sd);
mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN);
gdtmachpfn = vtomach(gdt) >> PAGE_SHIFT;
PT_SET_MA(gdt, xpmap_ptom(VTOP(gdt)) | PG_V);
PANIC_IF(HYPERVISOR_set_gdt(&gdtmachpfn, 512) != 0);
lgdt(&r_gdt);
gdtset = 1;
if ((error = HYPERVISOR_set_trap_table(trap_table)) != 0) {
panic("set_trap_table failed - error %d\n", error);
}
error = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
if (error == 0)
error = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
#if CONFIG_XEN_COMPAT <= 0x030002
if (error == -ENOXENSYS)
HYPERVISOR_set_callbacks(GSEL(GCODE_SEL, SEL_KPL),
(unsigned long)Xhypervisor_callback,
GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback);
#endif
pcpu_init(pc, 0, sizeof(struct pcpu));
for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE)
pmap_kenter(pa + KERNBASE, pa);
dpcpu_init((void *)(first + KERNBASE), 0);
first += DPCPU_SIZE;
physfree += DPCPU_SIZE;
init_first += DPCPU_SIZE / PAGE_SIZE;
PCPU_SET(prvspace, pc);
PCPU_SET(curthread, &thread0);
PCPU_SET(curpcb, thread0.td_pcb);
/*
* Initialize mutexes.
*
* icu_lock: in order to allow an interrupt to occur in a critical
* section, to set pcpu->ipending (etc...) properly, we
* must be able to get the icu lock, so it can't be
* under witness.
*/
mutex_init();
mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE);
/* make ldt memory segments */
PT_SET_MA(ldt, xpmap_ptom(VTOP(ldt)) | PG_V | PG_RW);
bzero(ldt, PAGE_SIZE);
ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1);
ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1);
for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++)
ssdtosd(&ldt_segs[x], &ldt[x].sd);
default_proc_ldt.ldt_base = (caddr_t)ldt;
default_proc_ldt.ldt_len = 6;
_default_ldt = (int)&default_proc_ldt;
PCPU_SET(currentldt, _default_ldt);
PT_SET_MA(ldt, *vtopte((unsigned long)ldt) & ~PG_RW);
xen_set_ldt((unsigned long) ldt, (sizeof ldt_segs / sizeof ldt_segs[0]));
#if defined(XEN_PRIVILEGED)
/*
* Initialize the i8254 before the console so that console
* initialization can use DELAY().
*/
i8254_init();
#endif
/*
* Initialize the console before we print anything out.
*/
cninit();
if (metadata_missing)
printf("WARNING: loader(8) metadata is missing!\n");
#ifdef DEV_ISA
elcr_probe();
atpic_startup();
#endif
#ifdef DDB
ksym_start = bootinfo.bi_symtab;
ksym_end = bootinfo.bi_esymtab;
#endif
kdb_init();
#ifdef KDB
if (boothowto & RB_KDB)
kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
finishidentcpu(); /* Final stage of CPU initialization */
setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
initializecpu(); /* Initialize CPU registers */
/* make an initial tss so cpu can get interrupt stack on syscall! */
/* Note: -16 is so we can grow the trapframe if we came from vm86 */
PCPU_SET(common_tss.tss_esp0, thread0.td_kstack +
kstack0_sz - sizeof(struct pcb) - 16);
PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL),
PCPU_GET(common_tss.tss_esp0));
/* pointer to selector slot for %fs/%gs */
PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);
dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 =
dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)];
dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 =
dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL);
#ifdef PAE
dblfault_tss.tss_cr3 = (int)IdlePDPT;
#else
dblfault_tss.tss_cr3 = (int)IdlePTD;
#endif
dblfault_tss.tss_eip = (int)dblfault_handler;
dblfault_tss.tss_eflags = PSL_KERNEL;
dblfault_tss.tss_ds = dblfault_tss.tss_es =
dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL);
dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL);
dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL);
dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL);
vm86_initialize();
getmemsize(first);
init_param2(physmem);
/* now running on new page tables, configured, and u/iom is accessible */
msgbufinit(msgbufp, msgbufsize);
/* transfer to user mode */
_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
/* setup proc 0's pcb */
thread0.td_pcb->pcb_flags = 0;
#ifdef PAE
thread0.td_pcb->pcb_cr3 = (int)IdlePDPT;
#else
thread0.td_pcb->pcb_cr3 = (int)IdlePTD;
#endif
thread0.td_pcb->pcb_ext = 0;
thread0.td_frame = &proc0_tf;
thread0.td_pcb->pcb_fsd = PCPU_GET(fsgs_gdt)[0];
thread0.td_pcb->pcb_gsd = PCPU_GET(fsgs_gdt)[1];
cpu_probe_amdc1e();
cpu_probe_cmpxchg8b();
}
#else
void
init386(first)
int first;
{
struct gate_descriptor *gdp;
int gsel_tss, metadata_missing, x, pa;
size_t kstack0_sz;
struct pcpu *pc;
thread0.td_kstack = proc0kstack;
thread0.td_kstack_pages = KSTACK_PAGES;
kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
thread0.td_pcb = (struct pcb *)(thread0.td_kstack + kstack0_sz) - 1;
/*
* This may be done better later if it gets more high level
* components in it. If so just link td->td_proc here.
*/
proc_linkup0(&proc0, &thread0);
metadata_missing = 0;
if (bootinfo.bi_modulep) {
preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
preload_bootstrap_relocate(KERNBASE);
} else {
metadata_missing = 1;
}
if (envmode == 1)
kern_envp = static_env;
else if (bootinfo.bi_envp)
kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;
/* Init basic tunables, hz etc */
init_param1();
/*
* Make gdt memory segments. All segments cover the full 4GB
* of address space and permissions are enforced at page level.
*/
gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1);
gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1);
gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1);
gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1);
gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1);
gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1);
pc = &__pcpu[0];
gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1);
gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;
for (x = 0; x < NGDT; x++)
ssdtosd(&gdt_segs[x], &gdt[x].sd);
r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
r_gdt.rd_base = (int) gdt;
mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN);
lgdt(&r_gdt);
pcpu_init(pc, 0, sizeof(struct pcpu));
for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE)
pmap_kenter(pa + KERNBASE, pa);
dpcpu_init((void *)(first + KERNBASE), 0);
first += DPCPU_SIZE;
PCPU_SET(prvspace, pc);
PCPU_SET(curthread, &thread0);
PCPU_SET(curpcb, thread0.td_pcb);
/*
* Initialize mutexes.
*
* icu_lock: in order to allow an interrupt to occur in a critical
* section, to set pcpu->ipending (etc...) properly, we
* must be able to get the icu lock, so it can't be
* under witness.
*/
mutex_init();
mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE);
/* make ldt memory segments */
ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1);
ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1);
for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++)
ssdtosd(&ldt_segs[x], &ldt[x].sd);
_default_ldt = GSEL(GLDT_SEL, SEL_KPL);
lldt(_default_ldt);
PCPU_SET(currentldt, _default_ldt);
/* exceptions */
for (x = 0; x < NIDT; x++)
setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_DE, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_DB, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYS386IGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_BP, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL
, GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_DF, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL));
setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_PF, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_AC, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL,
GSEL(GCODE_SEL, SEL_KPL));
#ifdef KDTRACE_HOOKS
setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYS386TGT, SEL_UPL,
GSEL(GCODE_SEL, SEL_KPL));
#endif
r_idt.rd_limit = sizeof(idt0) - 1;
r_idt.rd_base = (int) idt;
lidt(&r_idt);
#ifdef XBOX
/*
* The following code queries the PCI ID of 0:0:0. For the XBOX,
* this should be 0x10de / 0x02a5.
*
* This is exactly what Linux does.
*/
outl(0xcf8, 0x80000000);
if (inl(0xcfc) == 0x02a510de) {
arch_i386_is_xbox = 1;
pic16l_setled(XBOX_LED_GREEN);
/*
* We are an XBOX, but we may have either 64MB or 128MB of
* memory. The PCI host bridge should be programmed for this,
* so we just query it.
*/
outl(0xcf8, 0x80000084);
arch_i386_xbox_memsize = (inl(0xcfc) == 0x7FFFFFF) ? 128 : 64;
}
#endif /* XBOX */
/*
* Initialize the i8254 before the console so that console
* initialization can use DELAY().
*/
i8254_init();
/*
* Initialize the console before we print anything out.
*/
cninit();
if (metadata_missing)
printf("WARNING: loader(8) metadata is missing!\n");
#ifdef DEV_ISA
elcr_probe();
atpic_startup();
#endif
#ifdef DDB
ksym_start = bootinfo.bi_symtab;
ksym_end = bootinfo.bi_esymtab;
#endif
kdb_init();
#ifdef KDB
if (boothowto & RB_KDB)
kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
finishidentcpu(); /* Final stage of CPU initialization */
setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
initializecpu(); /* Initialize CPU registers */
/* make an initial tss so cpu can get interrupt stack on syscall! */
/* Note: -16 is so we can grow the trapframe if we came from vm86 */
PCPU_SET(common_tss.tss_esp0, thread0.td_kstack +
kstack0_sz - sizeof(struct pcb) - 16);
PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd);
PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
ltr(gsel_tss);
/* pointer to selector slot for %fs/%gs */
PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);
dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 =
dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)];
dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 =
dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL);
#ifdef PAE
dblfault_tss.tss_cr3 = (int)IdlePDPT;
#else
dblfault_tss.tss_cr3 = (int)IdlePTD;
#endif
dblfault_tss.tss_eip = (int)dblfault_handler;
dblfault_tss.tss_eflags = PSL_KERNEL;
dblfault_tss.tss_ds = dblfault_tss.tss_es =
dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL);
dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL);
dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL);
dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL);
vm86_initialize();
getmemsize(first);
init_param2(physmem);
/* now running on new page tables, configured, and u/iom is accessible */
msgbufinit(msgbufp, msgbufsize);
/* make a call gate to reenter kernel with */
gdp = &ldt[LSYS5CALLS_SEL].gd;
x = (int) &IDTVEC(lcall_syscall);
gdp->gd_looffset = x;
gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL);
gdp->gd_stkcpy = 1;
gdp->gd_type = SDT_SYS386CGT;
gdp->gd_dpl = SEL_UPL;
gdp->gd_p = 1;
gdp->gd_hioffset = x >> 16;
/* XXX does this work? */
/* XXX yes! */
ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL];
ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL];
/* transfer to user mode */
_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
/* setup proc 0's pcb */
thread0.td_pcb->pcb_flags = 0;
#ifdef PAE
thread0.td_pcb->pcb_cr3 = (int)IdlePDPT;
#else
thread0.td_pcb->pcb_cr3 = (int)IdlePTD;
#endif
thread0.td_pcb->pcb_ext = 0;
thread0.td_frame = &proc0_tf;
cpu_probe_amdc1e();
cpu_probe_cmpxchg8b();
}
#endif
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{
pcpu->pc_acpi_id = 0xffffffff;
}
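/*
 * spinlock_enter()/spinlock_exit() keep a per-thread nesting count:
 * interrupts are disabled and the flags saved only on the outermost
 * enter, and restored only when the count drops back to zero.
 */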
void
spinlock_enter(void)
{
struct thread *td;
register_t flags;
td = curthread;
if (td->td_md.md_spinlock_count == 0) {
flags = intr_disable();
td->td_md.md_spinlock_count = 1;
td->td_md.md_saved_flags = flags;
} else
td->td_md.md_spinlock_count++;
critical_enter();
}
void
spinlock_exit(void)
{
struct thread *td;
register_t flags;
td = curthread;
critical_exit();
flags = td->td_md.md_saved_flags;
td->td_md.md_spinlock_count--;
if (td->td_md.md_spinlock_count == 0)
intr_restore(flags);
}
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
static void f00f_hack(void *unused);
SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL);
static void
f00f_hack(void *unused)
{
struct gate_descriptor *new_idt;
vm_offset_t tmp;
if (!has_f00f_bug)
return;
GIANT_REQUIRED;
printf("Intel Pentium detected, installing workaround for F00F bug\n");
tmp = kmem_alloc(kernel_map, PAGE_SIZE * 2);
if (tmp == 0)
panic("kmem_alloc returned 0");
/* Put the problematic entry (#6) at the end of the lower page. */
new_idt = (struct gate_descriptor*)
(tmp + PAGE_SIZE - 7 * sizeof(struct gate_descriptor));
bcopy(idt, new_idt, sizeof(idt0));
r_idt.rd_base = (u_int)new_idt;
lidt(&r_idt);
idt = new_idt;
if (vm_map_protect(kernel_map, tmp, tmp + PAGE_SIZE,
VM_PROT_READ, FALSE) != KERN_SUCCESS)
panic("vm_map_protect failed");
}
#endif /* defined(I586_CPU) && !defined(NO_F00F_HACK) */
/*
* Construct a PCB from a trapframe. This is called from kdb_trap() where
* we want to start a backtrace from the function that caused us to enter
* the debugger. We have the context in the trapframe, but base the trace
* on the PCB. The PCB doesn't have to be perfect, as long as it contains
* enough for a backtrace.
*/
void
makectx(struct trapframe *tf, struct pcb *pcb)
{
pcb->pcb_edi = tf->tf_edi;
pcb->pcb_esi = tf->tf_esi;
pcb->pcb_ebp = tf->tf_ebp;
pcb->pcb_ebx = tf->tf_ebx;
pcb->pcb_eip = tf->tf_eip;
pcb->pcb_esp = (ISPL(tf->tf_cs)) ? tf->tf_esp : (int)(tf + 1) - 8;
}
int
ptrace_set_pc(struct thread *td, u_long addr)
{
td->td_frame->tf_eip = addr;
return (0);
}
int
ptrace_single_step(struct thread *td)
{
td->td_frame->tf_eflags |= PSL_T;
return (0);
}
int
ptrace_clear_single_step(struct thread *td)
{
td->td_frame->tf_eflags &= ~PSL_T;
return (0);
}
int
fill_regs(struct thread *td, struct reg *regs)
{
struct pcb *pcb;
struct trapframe *tp;
tp = td->td_frame;
pcb = td->td_pcb;
regs->r_gs = pcb->pcb_gs;
return (fill_frame_regs(tp, regs));
}
int
fill_frame_regs(struct trapframe *tp, struct reg *regs)
{
regs->r_fs = tp->tf_fs;
regs->r_es = tp->tf_es;
regs->r_ds = tp->tf_ds;
regs->r_edi = tp->tf_edi;
regs->r_esi = tp->tf_esi;
regs->r_ebp = tp->tf_ebp;
regs->r_ebx = tp->tf_ebx;
regs->r_edx = tp->tf_edx;
regs->r_ecx = tp->tf_ecx;
regs->r_eax = tp->tf_eax;
regs->r_eip = tp->tf_eip;
regs->r_cs = tp->tf_cs;
regs->r_eflags = tp->tf_eflags;
regs->r_esp = tp->tf_esp;
regs->r_ss = tp->tf_ss;
return (0);
}
int
set_regs(struct thread *td, struct reg *regs)
{
struct pcb *pcb;
struct trapframe *tp;
tp = td->td_frame;
if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) ||
!CS_SECURE(regs->r_cs))
return (EINVAL);
pcb = td->td_pcb;
tp->tf_fs = regs->r_fs;
tp->tf_es = regs->r_es;
tp->tf_ds = regs->r_ds;
tp->tf_edi = regs->r_edi;
tp->tf_esi = regs->r_esi;
tp->tf_ebp = regs->r_ebp;
tp->tf_ebx = regs->r_ebx;
tp->tf_edx = regs->r_edx;
tp->tf_ecx = regs->r_ecx;
tp->tf_eax = regs->r_eax;
tp->tf_eip = regs->r_eip;
tp->tf_cs = regs->r_cs;
tp->tf_eflags = regs->r_eflags;
tp->tf_esp = regs->r_esp;
tp->tf_ss = regs->r_ss;
pcb->pcb_gs = regs->r_gs;
return (0);
}
#ifdef CPU_ENABLE_SSE
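/*
 * Convert between the FXSAVE (savexmm) layout and the legacy FNSAVE
 * (save87) layout so that fpreg consumers always see the classic
 * 80x87 environment and register stack.
 */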
static void
fill_fpregs_xmm(sv_xmm, sv_87)
struct savexmm *sv_xmm;
struct save87 *sv_87;
{
register struct env87 *penv_87 = &sv_87->sv_env;
register struct envxmm *penv_xmm = &sv_xmm->sv_env;
int i;
bzero(sv_87, sizeof(*sv_87));
/* FPU control/status */
penv_87->en_cw = penv_xmm->en_cw;
penv_87->en_sw = penv_xmm->en_sw;
penv_87->en_tw = penv_xmm->en_tw;
penv_87->en_fip = penv_xmm->en_fip;
penv_87->en_fcs = penv_xmm->en_fcs;
penv_87->en_opcode = penv_xmm->en_opcode;
penv_87->en_foo = penv_xmm->en_foo;
penv_87->en_fos = penv_xmm->en_fos;
/* FPU registers */
for (i = 0; i < 8; ++i)
sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;
}
static void
set_fpregs_xmm(sv_87, sv_xmm)
struct save87 *sv_87;
struct savexmm *sv_xmm;
{
register struct env87 *penv_87 = &sv_87->sv_env;
register struct envxmm *penv_xmm = &sv_xmm->sv_env;
int i;
/* FPU control/status */
penv_xmm->en_cw = penv_87->en_cw;
penv_xmm->en_sw = penv_87->en_sw;
penv_xmm->en_tw = penv_87->en_tw;
penv_xmm->en_fip = penv_87->en_fip;
penv_xmm->en_fcs = penv_87->en_fcs;
penv_xmm->en_opcode = penv_87->en_opcode;
penv_xmm->en_foo = penv_87->en_foo;
penv_xmm->en_fos = penv_87->en_fos;
/* FPU registers */
for (i = 0; i < 8; ++i)
sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];
}
#endif /* CPU_ENABLE_SSE */
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{
KASSERT(td == curthread || TD_IS_SUSPENDED(td),
("not suspended thread %p", td));
#ifdef DEV_NPX
npxgetregs(td);
#else
bzero(fpregs, sizeof(*fpregs));
#endif
#ifdef CPU_ENABLE_SSE
if (cpu_fxsr)
fill_fpregs_xmm(&td->td_pcb->pcb_user_save.sv_xmm,
(struct save87 *)fpregs);
else
#endif /* CPU_ENABLE_SSE */
bcopy(&td->td_pcb->pcb_user_save.sv_87, fpregs,
sizeof(*fpregs));
return (0);
}
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{
#ifdef CPU_ENABLE_SSE
if (cpu_fxsr)
set_fpregs_xmm((struct save87 *)fpregs,
&td->td_pcb->pcb_user_save.sv_xmm);
else
#endif /* CPU_ENABLE_SSE */
bcopy(fpregs, &td->td_pcb->pcb_user_save.sv_87,
sizeof(*fpregs));
#ifdef DEV_NPX
npxuserinited(td);
#endif
return (0);
}
/*
* Get machine context.
*/
int
get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
{
struct trapframe *tp;
struct segment_descriptor *sdp;
tp = td->td_frame;
PROC_LOCK(curthread->td_proc);
mcp->mc_onstack = sigonstack(tp->tf_esp);
PROC_UNLOCK(curthread->td_proc);
mcp->mc_gs = td->td_pcb->pcb_gs;
mcp->mc_fs = tp->tf_fs;
mcp->mc_es = tp->tf_es;
mcp->mc_ds = tp->tf_ds;
mcp->mc_edi = tp->tf_edi;
mcp->mc_esi = tp->tf_esi;
mcp->mc_ebp = tp->tf_ebp;
mcp->mc_isp = tp->tf_isp;
mcp->mc_eflags = tp->tf_eflags;
if (flags & GET_MC_CLEAR_RET) {
mcp->mc_eax = 0;
mcp->mc_edx = 0;
mcp->mc_eflags &= ~PSL_C;
} else {
mcp->mc_eax = tp->tf_eax;
mcp->mc_edx = tp->tf_edx;
}
mcp->mc_ebx = tp->tf_ebx;
mcp->mc_ecx = tp->tf_ecx;
mcp->mc_eip = tp->tf_eip;
mcp->mc_cs = tp->tf_cs;
mcp->mc_esp = tp->tf_esp;
mcp->mc_ss = tp->tf_ss;
mcp->mc_len = sizeof(*mcp);
get_fpcontext(td, mcp);
sdp = &td->td_pcb->pcb_fsd;
mcp->mc_fsbase = sdp->sd_hibase << 24 | sdp->sd_lobase;
sdp = &td->td_pcb->pcb_gsd;
mcp->mc_gsbase = sdp->sd_hibase << 24 | sdp->sd_lobase;
bzero(mcp->mc_spare1, sizeof(mcp->mc_spare1));
bzero(mcp->mc_spare2, sizeof(mcp->mc_spare2));
return (0);
}
/*
* Set machine context.
*
* However, we don't set any but the user modifiable flags, and we won't
* touch the cs selector.
*/
int
set_mcontext(struct thread *td, const mcontext_t *mcp)
{
struct trapframe *tp;
int eflags, ret;
tp = td->td_frame;
if (mcp->mc_len != sizeof(*mcp))
return (EINVAL);
eflags = (mcp->mc_eflags & PSL_USERCHANGE) |
(tp->tf_eflags & ~PSL_USERCHANGE);
if ((ret = set_fpcontext(td, mcp)) == 0) {
tp->tf_fs = mcp->mc_fs;
tp->tf_es = mcp->mc_es;
tp->tf_ds = mcp->mc_ds;
tp->tf_edi = mcp->mc_edi;
tp->tf_esi = mcp->mc_esi;
tp->tf_ebp = mcp->mc_ebp;
tp->tf_ebx = mcp->mc_ebx;
tp->tf_edx = mcp->mc_edx;
tp->tf_ecx = mcp->mc_ecx;
tp->tf_eax = mcp->mc_eax;
tp->tf_eip = mcp->mc_eip;
tp->tf_eflags = eflags;
tp->tf_esp = mcp->mc_esp;
tp->tf_ss = mcp->mc_ss;
td->td_pcb->pcb_gs = mcp->mc_gs;
ret = 0;
}
return (ret);
}
static void
get_fpcontext(struct thread *td, mcontext_t *mcp)
{
#ifndef DEV_NPX
mcp->mc_fpformat = _MC_FPFMT_NODEV;
mcp->mc_ownedfp = _MC_FPOWNED_NONE;
bzero(mcp->mc_fpstate, sizeof(mcp->mc_fpstate));
#else
mcp->mc_ownedfp = npxgetregs(td);
bcopy(&td->td_pcb->pcb_user_save, &mcp->mc_fpstate,
sizeof(mcp->mc_fpstate));
mcp->mc_fpformat = npxformat();
#endif
}
static int
set_fpcontext(struct thread *td, const mcontext_t *mcp)
{
if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
return (0);
else if (mcp->mc_fpformat != _MC_FPFMT_387 &&
mcp->mc_fpformat != _MC_FPFMT_XMM)
return (EINVAL);
else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE)
/* We don't care what state is left in the FPU or PCB. */
fpstate_drop(td);
else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
#ifdef DEV_NPX
#ifdef CPU_ENABLE_SSE
if (cpu_fxsr)
((union savefpu *)&mcp->mc_fpstate)->sv_xmm.sv_env.
en_mxcsr &= cpu_mxcsr_mask;
#endif
npxsetregs(td, (union savefpu *)&mcp->mc_fpstate);
#endif
} else
return (EINVAL);
return (0);
}
static void
fpstate_drop(struct thread *td)
{
KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
critical_enter();
#ifdef DEV_NPX
if (PCPU_GET(fpcurthread) == td)
npxdrop();
#endif
/*
* XXX force a full drop of the npx. The above only drops it if we
* owned it. npxgetregs() has the same bug in the !cpu_fxsr case.
*
* XXX I don't much like npxgetregs()'s semantics of doing a full
* drop. Dropping only to the pcb matches fnsave's behaviour.
* We only need to drop to !PCB_INITDONE in sendsig(). But
* sendsig() is the only caller of npxgetregs()... perhaps we just
* have too many layers.
*/
curthread->td_pcb->pcb_flags &= ~(PCB_NPXINITDONE |
PCB_NPXUSERINITDONE);
critical_exit();
}
int
fill_dbregs(struct thread *td, struct dbreg *dbregs)
{
struct pcb *pcb;
if (td == NULL) {
dbregs->dr[0] = rdr0();
dbregs->dr[1] = rdr1();
dbregs->dr[2] = rdr2();
dbregs->dr[3] = rdr3();
dbregs->dr[4] = rdr4();
dbregs->dr[5] = rdr5();
dbregs->dr[6] = rdr6();
dbregs->dr[7] = rdr7();
} else {
pcb = td->td_pcb;
dbregs->dr[0] = pcb->pcb_dr0;
dbregs->dr[1] = pcb->pcb_dr1;
dbregs->dr[2] = pcb->pcb_dr2;
dbregs->dr[3] = pcb->pcb_dr3;
dbregs->dr[4] = 0;
dbregs->dr[5] = 0;
dbregs->dr[6] = pcb->pcb_dr6;
dbregs->dr[7] = pcb->pcb_dr7;
}
return (0);
}
int
set_dbregs(struct thread *td, struct dbreg *dbregs)
{
struct pcb *pcb;
int i;
if (td == NULL) {
load_dr0(dbregs->dr[0]);
load_dr1(dbregs->dr[1]);
load_dr2(dbregs->dr[2]);
load_dr3(dbregs->dr[3]);
load_dr4(dbregs->dr[4]);
load_dr5(dbregs->dr[5]);
load_dr6(dbregs->dr[6]);
load_dr7(dbregs->dr[7]);
} else {
/*
* Don't let an illegal value for dr7 get set. Specifically,
* check for undefined settings. Setting these bit patterns
* results in undefined behaviour and can lead to an unexpected
* TRCTRAP.
*/
for (i = 0; i < 4; i++) {
if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
return (EINVAL);
if (DBREG_DR7_LEN(dbregs->dr[7], i) == 0x02)
return (EINVAL);
}
pcb = td->td_pcb;
/*
* Don't let a process set a breakpoint that is not within the
* process's address space. If a process could do this, it
* could halt the system by setting a breakpoint in the kernel
* (if ddb was enabled). Thus, we need to check to make sure
* that no breakpoints are being enabled for addresses outside
* the process's address space.
*
* XXX - what about when the watched area of the user's
* address space is written into from within the kernel
* ... wouldn't that still cause a breakpoint to be generated
* from within kernel mode?
*/
if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
/* dr0 is enabled */
if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
return (EINVAL);
}
if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
/* dr1 is enabled */
if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
return (EINVAL);
}
if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
/* dr2 is enabled */
if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
return (EINVAL);
}
if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
/* dr3 is enabled */
if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
return (EINVAL);
}
pcb->pcb_dr0 = dbregs->dr[0];
pcb->pcb_dr1 = dbregs->dr[1];
pcb->pcb_dr2 = dbregs->dr[2];
pcb->pcb_dr3 = dbregs->dr[3];
pcb->pcb_dr6 = dbregs->dr[6];
pcb->pcb_dr7 = dbregs->dr[7];
pcb->pcb_flags |= PCB_DBREGS;
}
return (0);
}
/*
* Return > 0 if a hardware breakpoint has been hit, and the
* breakpoint was in user space. Return 0, otherwise.
*/
int
user_dbreg_trap(void)
{
u_int32_t dr7, dr6; /* debug registers dr6 and dr7 */
u_int32_t bp; /* breakpoint bits extracted from dr6 */
int nbp; /* number of breakpoints that triggered */
caddr_t addr[4]; /* breakpoint addresses */
int i;
dr7 = rdr7();
if ((dr7 & 0x000000ff) == 0) {
/*
* none of the local/global breakpoint enable bits (L0-L3, G0-G3)
* in the dr7 register are set, so the trap cannot have been
* caused by the hardware debug registers
*/
return 0;
}
nbp = 0;
dr6 = rdr6();
bp = dr6 & 0x0000000f;
if (!bp) {
/*
* None of the breakpoint status bits are set, meaning this
* trap was not caused by any of the debug registers
*/
return 0;
}
/*
* at least one of the breakpoints was hit; check to see
* which ones and if any of them are user space addresses
*/
if (bp & 0x01) {
addr[nbp++] = (caddr_t)rdr0();
}
if (bp & 0x02) {
addr[nbp++] = (caddr_t)rdr1();
}
if (bp & 0x04) {
addr[nbp++] = (caddr_t)rdr2();
}
if (bp & 0x08) {
addr[nbp++] = (caddr_t)rdr3();
}
for (i = 0; i < nbp; i++) {
if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
/*
* addr[i] is in user space
*/
return nbp;
}
}
/*
* None of the breakpoints are in user space.
*/
return 0;
}
#ifdef KDB
/*
* Provide inb() and outb() as functions. They are normally only available as
* inline functions, thus cannot be called from the debugger.
*/
/* silence compiler warnings */
u_char inb_(u_short);
void outb_(u_short, u_char);
u_char
inb_(u_short port)
{
return inb(port);
}
void
outb_(u_short port, u_char data)
{
outb(port, data);
}
#endif /* KDB */
Index: head/sys/i386/i386/trap.c
===================================================================
--- head/sys/i386/i386/trap.c (revision 225616)
+++ head/sys/i386/i386/trap.c (revision 225617)
@@ -1,1106 +1,1106 @@
/*-
* Copyright (C) 1994, David Greenman
* Copyright (c) 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the University of Utah, and William Jolitz.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)trap.c 7.4 (Berkeley) 5/13/91
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
* 386 Trap and System call handling
*/
#include "opt_clock.h"
#include "opt_cpu.h"
#include "opt_hwpmc_hooks.h"
#include "opt_isa.h"
#include "opt_kdb.h"
#include "opt_kdtrace.h"
#include "opt_npx.h"
#include "opt_trap.h"
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/pioctl.h>
#include <sys/ptrace.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/syscall.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/uio.h>
#include <sys/vmmeter.h>
#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif
#include <security/audit/audit.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_extern.h>
#include <machine/cpu.h>
#include <machine/intr_machdep.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#include <machine/tss.h>
#include <machine/vm86.h>
#ifdef POWERFAIL_NMI
#include <sys/syslog.h>
#include <machine/clock.h>
#endif
#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
/*
* This is a hook which is initialised by the dtrace module
* to handle traps which might occur during DTrace probe
* execution.
*/
dtrace_trap_func_t dtrace_trap_func;
dtrace_doubletrap_func_t dtrace_doubletrap_func;
/*
* This is a hook which is initialised by the systrace module
* when it is loaded. This keeps the DTrace syscall provider
* implementation opaque.
*/
systrace_probe_func_t systrace_probe_func;
/*
* These hooks are necessary for the pid, usdt and fasttrap providers.
*/
dtrace_fasttrap_probe_ptr_t dtrace_fasttrap_probe_ptr;
dtrace_pid_probe_ptr_t dtrace_pid_probe_ptr;
dtrace_return_probe_ptr_t dtrace_return_probe_ptr;
#endif
extern void trap(struct trapframe *frame);
extern void syscall(struct trapframe *frame);
static int trap_pfault(struct trapframe *, int, vm_offset_t);
static void trap_fatal(struct trapframe *, vm_offset_t);
void dblfault_handler(void);
extern inthand_t IDTVEC(lcall_syscall);
#define MAX_TRAP_MSG 30
static char *trap_msg[] = {
"", /* 0 unused */
"privileged instruction fault", /* 1 T_PRIVINFLT */
"", /* 2 unused */
"breakpoint instruction fault", /* 3 T_BPTFLT */
"", /* 4 unused */
"", /* 5 unused */
"arithmetic trap", /* 6 T_ARITHTRAP */
"", /* 7 unused */
"", /* 8 unused */
"general protection fault", /* 9 T_PROTFLT */
"trace trap", /* 10 T_TRCTRAP */
"", /* 11 unused */
"page fault", /* 12 T_PAGEFLT */
"", /* 13 unused */
"alignment fault", /* 14 T_ALIGNFLT */
"", /* 15 unused */
"", /* 16 unused */
"", /* 17 unused */
"integer divide fault", /* 18 T_DIVIDE */
"non-maskable interrupt trap", /* 19 T_NMI */
"overflow trap", /* 20 T_OFLOW */
"FPU bounds check fault", /* 21 T_BOUND */
"FPU device not available", /* 22 T_DNA */
"double fault", /* 23 T_DOUBLEFLT */
"FPU operand fetch fault", /* 24 T_FPOPFLT */
"invalid TSS fault", /* 25 T_TSSFLT */
"segment not present fault", /* 26 T_SEGNPFLT */
"stack fault", /* 27 T_STKFLT */
"machine check trap", /* 28 T_MCHK */
"SIMD floating-point exception", /* 29 T_XMMFLT */
"reserved (unknown) fault", /* 30 T_RESERVED */
};
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
extern int has_f00f_bug;
#endif
#ifdef KDB
static int kdb_on_nmi = 1;
SYSCTL_INT(_machdep, OID_AUTO, kdb_on_nmi, CTLFLAG_RW,
&kdb_on_nmi, 0, "Go to KDB on NMI");
TUNABLE_INT("machdep.kdb_on_nmi", &kdb_on_nmi);
#endif
static int panic_on_nmi = 1;
SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW,
&panic_on_nmi, 0, "Panic on NMI");
TUNABLE_INT("machdep.panic_on_nmi", &panic_on_nmi);
static int prot_fault_translation = 0;
SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RW,
&prot_fault_translation, 0, "Select signal to deliver on protection fault");
/*
* Exception, fault, and trap interface to the FreeBSD kernel.
* This common code is called from assembly language IDT gate entry
* routines that prepare a suitable stack frame, and restore this
* frame after the exception has been processed.
*/
void
trap(struct trapframe *frame)
{
struct thread *td = curthread;
struct proc *p = td->td_proc;
int i = 0, ucode = 0, code;
u_int type;
register_t addr = 0;
vm_offset_t eva;
ksiginfo_t ksi;
#ifdef POWERFAIL_NMI
static int lastalert = 0;
#endif
PCPU_INC(cnt.v_trap);
type = frame->tf_trapno;
#ifdef SMP
/* Handler for NMI IPIs used for stopping CPUs. */
if (type == T_NMI) {
if (ipi_nmi_handler() == 0)
goto out;
}
#endif /* SMP */
#ifdef KDB
if (kdb_active) {
kdb_reenter();
goto out;
}
#endif
if (type == T_RESERVED) {
trap_fatal(frame, 0);
goto out;
}
#ifdef HWPMC_HOOKS
/*
* CPU PMCs interrupt using an NMI so we check for that first.
* If the HWPMC module is active, 'pmc_hook' will point to
* the function to be called. A return value of '1' from the
* hook means that the NMI was handled by it and that we can
* return immediately.
*/
if (type == T_NMI && pmc_intr &&
(*pmc_intr)(PCPU_GET(cpuid), frame))
goto out;
#endif
if (type == T_MCHK) {
if (!mca_intr())
trap_fatal(frame, 0);
goto out;
}
#ifdef KDTRACE_HOOKS
/*
* A trap can occur while DTrace executes a probe. Before
* executing the probe, DTrace blocks re-scheduling and sets
* a flag in its per-cpu flags to indicate that it doesn't
* want to fault. On returning from the probe, the no-fault
* flag is cleared and finally re-scheduling is enabled.
*
* If the DTrace kernel module has registered a trap handler,
* call it and if it returns non-zero, assume that it has
* handled the trap and modified the trap frame so that this
* function can return normally.
*/
if ((type == T_PROTFLT || type == T_PAGEFLT) &&
dtrace_trap_func != NULL)
if ((*dtrace_trap_func)(frame, type))
goto out;
if (type == T_DTRACE_PROBE || type == T_DTRACE_RET ||
type == T_BPTFLT) {
struct reg regs;
fill_frame_regs(frame, &regs);
if (type == T_DTRACE_PROBE &&
dtrace_fasttrap_probe_ptr != NULL &&
dtrace_fasttrap_probe_ptr(&regs) == 0)
goto out;
if (type == T_BPTFLT &&
dtrace_pid_probe_ptr != NULL &&
dtrace_pid_probe_ptr(&regs) == 0)
goto out;
if (type == T_DTRACE_RET &&
dtrace_return_probe_ptr != NULL &&
dtrace_return_probe_ptr(&regs) == 0)
goto out;
}
#endif
if ((frame->tf_eflags & PSL_I) == 0) {
/*
* Buggy application or kernel code has disabled
* interrupts and then trapped. Enabling interrupts
* now is wrong, but it is better than running with
* interrupts disabled until they are accidentally
* enabled later.
*/
if (ISPL(frame->tf_cs) == SEL_UPL || (frame->tf_eflags & PSL_VM))
uprintf(
"pid %ld (%s): trap %d with interrupts disabled\n",
(long)curproc->p_pid, curthread->td_name, type);
else if (type != T_BPTFLT && type != T_TRCTRAP &&
frame->tf_eip != (int)cpu_switch_load_gs) {
/*
* XXX not quite right, since this may be for a
* multiple fault in user mode.
*/
printf("kernel trap %d with interrupts disabled\n",
type);
/*
* Page faults need interrupts disabled until later,
* and we shouldn't enable interrupts while holding
* a spin lock or if servicing an NMI.
*/
if (type != T_NMI && type != T_PAGEFLT &&
td->td_md.md_spinlock_count == 0)
enable_intr();
}
}
eva = 0;
code = frame->tf_err;
if (type == T_PAGEFLT) {
/*
* For some Cyrix CPUs, %cr2 is clobbered by
* interrupts. This problem is worked around by using
* an interrupt gate for the pagefault handler. We
* are finally ready to read %cr2 and then must
* reenable interrupts.
*
* If we get a page fault while in a critical section, then
* it is most likely a fatal kernel page fault. The kernel
* is already going to panic trying to get a sleep lock to
* do the VM lookup, so just consider it a fatal trap so the
* kernel can print out a useful trap message and even get
* to the debugger.
*
* If we get a page fault while holding a non-sleepable
* lock, then it is most likely a fatal kernel page fault.
* If WITNESS is enabled, then it's going to whine about
* bogus LORs with various VM locks, so just skip to the
* fatal trap handling directly.
*/
eva = rcr2();
if (td->td_critnest != 0 ||
WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL,
"Kernel page fault") != 0)
trap_fatal(frame, eva);
else
enable_intr();
}
if ((ISPL(frame->tf_cs) == SEL_UPL) ||
((frame->tf_eflags & PSL_VM) &&
!(PCPU_GET(curpcb)->pcb_flags & PCB_VM86CALL))) {
/* user trap */
td->td_pticks = 0;
td->td_frame = frame;
addr = frame->tf_eip;
if (td->td_ucred != p->p_ucred)
cred_update_thread(td);
switch (type) {
case T_PRIVINFLT: /* privileged instruction fault */
i = SIGILL;
ucode = ILL_PRVOPC;
break;
case T_BPTFLT: /* bpt instruction fault */
case T_TRCTRAP: /* trace trap */
enable_intr();
frame->tf_eflags &= ~PSL_T;
i = SIGTRAP;
ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT);
break;
case T_ARITHTRAP: /* arithmetic trap */
#ifdef DEV_NPX
ucode = npxtrap();
if (ucode == -1)
goto userout;
#else
ucode = 0;
#endif
i = SIGFPE;
break;
/*
* The following two traps can happen in
* vm86 mode, and, if so, we want to handle
* them specially.
*/
case T_PROTFLT: /* general protection fault */
case T_STKFLT: /* stack fault */
if (frame->tf_eflags & PSL_VM) {
i = vm86_emulate((struct vm86frame *)frame);
if (i == 0)
goto user;
break;
}
i = SIGBUS;
ucode = (type == T_PROTFLT) ? BUS_OBJERR : BUS_ADRERR;
break;
case T_SEGNPFLT: /* segment not present fault */
i = SIGBUS;
ucode = BUS_ADRERR;
break;
case T_TSSFLT: /* invalid TSS fault */
i = SIGBUS;
ucode = BUS_OBJERR;
break;
case T_DOUBLEFLT: /* double fault */
default:
i = SIGBUS;
ucode = BUS_OBJERR;
break;
case T_PAGEFLT: /* page fault */
i = trap_pfault(frame, TRUE, eva);
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
if (i == -2) {
/*
* The f00f hack workaround has triggered, so
* treat the fault as an illegal instruction
* (T_PRIVINFLT) instead of a page fault.
*/
type = frame->tf_trapno = T_PRIVINFLT;
/* Proceed as in that case. */
ucode = ILL_PRVOPC;
i = SIGILL;
break;
}
#endif
if (i == -1)
goto userout;
if (i == 0)
goto user;
if (i == SIGSEGV)
ucode = SEGV_MAPERR;
else {
if (prot_fault_translation == 0) {
/*
* Autodetect.
* This check also covers the images
* without the ABI-tag ELF note.
*/
if (SV_CURPROC_ABI() == SV_ABI_FREEBSD
&& p->p_osrel >= P_OSREL_SIGSEGV) {
i = SIGSEGV;
ucode = SEGV_ACCERR;
} else {
i = SIGBUS;
ucode = BUS_PAGE_FAULT;
}
} else if (prot_fault_translation == 1) {
/*
* Always compat mode.
*/
i = SIGBUS;
ucode = BUS_PAGE_FAULT;
} else {
/*
* Always SIGSEGV mode.
*/
i = SIGSEGV;
ucode = SEGV_ACCERR;
}
}
addr = eva;
break;
case T_DIVIDE: /* integer divide fault */
ucode = FPE_INTDIV;
i = SIGFPE;
break;
#ifdef DEV_ISA
case T_NMI:
#ifdef POWERFAIL_NMI
#ifndef TIMER_FREQ
# define TIMER_FREQ 1193182
#endif
if (time_second - lastalert > 10) {
log(LOG_WARNING, "NMI: power fail\n");
sysbeep(880, hz);
lastalert = time_second;
}
goto userout;
#else /* !POWERFAIL_NMI */
/* machine/parity/power fail/"kitchen sink" faults */
if (isa_nmi(code) == 0) {
#ifdef KDB
/*
* NMI can be hooked up to a pushbutton
* for debugging.
*/
if (kdb_on_nmi) {
printf ("NMI ... going to debugger\n");
kdb_trap(type, 0, frame);
}
#endif /* KDB */
goto userout;
} else if (panic_on_nmi)
panic("NMI indicates hardware failure");
break;
#endif /* POWERFAIL_NMI */
#endif /* DEV_ISA */
case T_OFLOW: /* integer overflow fault */
ucode = FPE_INTOVF;
i = SIGFPE;
break;
case T_BOUND: /* bounds check fault */
ucode = FPE_FLTSUB;
i = SIGFPE;
break;
case T_DNA:
#ifdef DEV_NPX
KASSERT(PCB_USER_FPU(td->td_pcb),
("kernel FPU ctx has leaked"));
/* transparent fault (due to context switch "late") */
if (npxdna())
goto userout;
#endif
uprintf("pid %d killed due to lack of floating point\n",
p->p_pid);
i = SIGKILL;
ucode = 0;
break;
case T_FPOPFLT: /* FPU operand fetch fault */
ucode = ILL_COPROC;
i = SIGILL;
break;
case T_XMMFLT: /* SIMD floating-point exception */
ucode = 0; /* XXX */
i = SIGFPE;
break;
}
} else {
/* kernel trap */
KASSERT(cold || td->td_ucred != NULL,
("kernel trap doesn't have ucred"));
switch (type) {
case T_PAGEFLT: /* page fault */
(void) trap_pfault(frame, FALSE, eva);
goto out;
case T_DNA:
#ifdef DEV_NPX
KASSERT(!PCB_USER_FPU(td->td_pcb),
("Unregistered use of FPU in kernel"));
if (npxdna())
goto out;
#endif
break;
case T_ARITHTRAP: /* arithmetic trap */
case T_XMMFLT: /* SIMD floating-point exception */
case T_FPOPFLT: /* FPU operand fetch fault */
/*
* XXXKIB for now disable any FPU traps in kernel;
* handler registration seems to be overkill
*/
trap_fatal(frame, 0);
goto out;
/*
* The following two traps can happen in
* vm86 mode, and, if so, we want to handle
* them specially.
*/
case T_PROTFLT: /* general protection fault */
case T_STKFLT: /* stack fault */
if (frame->tf_eflags & PSL_VM) {
i = vm86_emulate((struct vm86frame *)frame);
if (i != 0)
/*
* returns to original process
*/
vm86_trap((struct vm86frame *)frame);
goto out;
}
if (type == T_STKFLT)
break;
/* FALL THROUGH */
case T_SEGNPFLT: /* segment not present fault */
if (PCPU_GET(curpcb)->pcb_flags & PCB_VM86CALL)
break;
/*
* Invalid %fs's and %gs's can be created using
* procfs or PT_SETREGS or by invalidating the
* underlying LDT entry. This causes a fault
* in kernel mode when the kernel attempts to
* switch contexts. Lose the bad context
* (XXX) so that we can continue, and generate
* a signal.
*/
if (frame->tf_eip == (int)cpu_switch_load_gs) {
PCPU_GET(curpcb)->pcb_gs = 0;
#if 0
PROC_LOCK(p);
- psignal(p, SIGBUS);
+ kern_psignal(p, SIGBUS);
PROC_UNLOCK(p);
#endif
goto out;
}
if (td->td_intr_nesting_level != 0)
break;
/*
* Invalid segment selectors and out of bounds
* %eip's and %esp's can be set up in user mode.
* This causes a fault in kernel mode when the
* kernel tries to return to user mode. We want
* to get this fault so that we can fix the
* problem here and not have to check all the
* selectors and pointers when the user changes
* them.
*/
if (frame->tf_eip == (int)doreti_iret) {
frame->tf_eip = (int)doreti_iret_fault;
goto out;
}
if (frame->tf_eip == (int)doreti_popl_ds) {
frame->tf_eip = (int)doreti_popl_ds_fault;
goto out;
}
if (frame->tf_eip == (int)doreti_popl_es) {
frame->tf_eip = (int)doreti_popl_es_fault;
goto out;
}
if (frame->tf_eip == (int)doreti_popl_fs) {
frame->tf_eip = (int)doreti_popl_fs_fault;
goto out;
}
if (PCPU_GET(curpcb)->pcb_onfault != NULL) {
frame->tf_eip =
(int)PCPU_GET(curpcb)->pcb_onfault;
goto out;
}
break;
case T_TSSFLT:
/*
* PSL_NT can be set in user mode and isn't cleared
* automatically when the kernel is entered. This
* causes a TSS fault when the kernel attempts to
* `iret' because the TSS link is uninitialized. We
* want to get this fault so that we can fix the
* problem here and not every time the kernel is
* entered.
*/
if (frame->tf_eflags & PSL_NT) {
frame->tf_eflags &= ~PSL_NT;
goto out;
}
break;
case T_TRCTRAP: /* trace trap */
if (frame->tf_eip == (int)IDTVEC(lcall_syscall)) {
/*
* We've just entered system mode via the
* syscall lcall. Continue single stepping
* silently until the syscall handler has
* saved the flags.
*/
goto out;
}
if (frame->tf_eip == (int)IDTVEC(lcall_syscall) + 1) {
/*
* The syscall handler has now saved the
* flags. Stop single stepping it.
*/
frame->tf_eflags &= ~PSL_T;
goto out;
}
/*
* Ignore debug register trace traps due to
* accesses in the user's address space, which
* can happen under several conditions such as
* if a user sets a watchpoint on a buffer and
* then passes that buffer to a system call.
* We still want to get TRCTRAPS for addresses
* in kernel space because that is useful when
* debugging the kernel.
*/
if (user_dbreg_trap() &&
!(PCPU_GET(curpcb)->pcb_flags & PCB_VM86CALL)) {
/*
* Reset breakpoint bits because the
* processor doesn't clear them itself.
*/
load_dr6(rdr6() & 0xfffffff0);
goto out;
}
/*
* FALLTHROUGH (TRCTRAP kernel mode, kernel address)
*/
case T_BPTFLT:
/*
* If KDB is enabled, let it handle the debugger trap.
* Otherwise, debugger traps "can't happen".
*/
#ifdef KDB
if (kdb_trap(type, 0, frame))
goto out;
#endif
break;
#ifdef DEV_ISA
case T_NMI:
#ifdef POWERFAIL_NMI
if (time_second - lastalert > 10) {
log(LOG_WARNING, "NMI: power fail\n");
sysbeep(880, hz);
lastalert = time_second;
}
goto out;
#else /* !POWERFAIL_NMI */
/* machine/parity/power fail/"kitchen sink" faults */
if (isa_nmi(code) == 0) {
#ifdef KDB
/*
* NMI can be hooked up to a pushbutton
* for debugging.
*/
if (kdb_on_nmi) {
printf ("NMI ... going to debugger\n");
kdb_trap(type, 0, frame);
}
#endif /* KDB */
goto out;
} else if (panic_on_nmi == 0)
goto out;
/* FALLTHROUGH */
#endif /* POWERFAIL_NMI */
#endif /* DEV_ISA */
}
trap_fatal(frame, eva);
goto out;
}
/* Translate fault for emulators (e.g. Linux) */
if (*p->p_sysent->sv_transtrap)
i = (*p->p_sysent->sv_transtrap)(i, type);
ksiginfo_init_trap(&ksi);
ksi.ksi_signo = i;
ksi.ksi_code = ucode;
ksi.ksi_addr = (void *)addr;
ksi.ksi_trapno = type;
trapsignal(td, &ksi);
#ifdef DEBUG
if (type <= MAX_TRAP_MSG) {
uprintf("fatal process exception: %s",
trap_msg[type]);
if ((type == T_PAGEFLT) || (type == T_PROTFLT))
uprintf(", fault VA = 0x%lx", (u_long)eva);
uprintf("\n");
}
#endif
user:
userret(td, frame);
mtx_assert(&Giant, MA_NOTOWNED);
KASSERT(PCB_USER_FPU(td->td_pcb),
("Return from trap with kernel FPU ctx leaked"));
userout:
out:
return;
}
static int
trap_pfault(frame, usermode, eva)
struct trapframe *frame;
int usermode;
vm_offset_t eva;
{
vm_offset_t va;
struct vmspace *vm = NULL;
vm_map_t map;
int rv = 0;
vm_prot_t ftype;
struct thread *td = curthread;
struct proc *p = td->td_proc;
va = trunc_page(eva);
if (va >= KERNBASE) {
/*
* Don't allow user-mode faults in kernel address space.
* An exception: if the faulting address is the invalid
* instruction entry in the IDT, then the Intel Pentium
* F00F bug workaround was triggered, and we need to
* treat it as an illegal instruction, and not a page
* fault.
*/
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
if ((eva == (unsigned int)&idt[6]) && has_f00f_bug)
return -2;
#endif
if (usermode)
goto nogo;
map = kernel_map;
} else {
/*
* This is a fault on non-kernel virtual memory.
* vm is initialized above to NULL. If curproc is NULL
* or curproc->p_vmspace is NULL the fault is fatal.
*/
if (p != NULL)
vm = p->p_vmspace;
if (vm == NULL)
goto nogo;
map = &vm->vm_map;
}
/*
* PGEX_I is defined only if the execute disable bit capability is
* supported and enabled.
*/
if (frame->tf_err & PGEX_W)
ftype = VM_PROT_WRITE;
#ifdef PAE
else if ((frame->tf_err & PGEX_I) && pg_nx != 0)
ftype = VM_PROT_EXECUTE;
#endif
else
ftype = VM_PROT_READ;
if (map != kernel_map) {
/*
* Keep swapout from messing with us during this
* critical time.
*/
PROC_LOCK(p);
++p->p_lock;
PROC_UNLOCK(p);
/* Fault in the user page: */
rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
PROC_LOCK(p);
--p->p_lock;
PROC_UNLOCK(p);
} else {
/*
* Don't have to worry about process locking or stacks in the
* kernel.
*/
rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
}
if (rv == KERN_SUCCESS)
return (0);
nogo:
if (!usermode) {
if (td->td_intr_nesting_level == 0 &&
PCPU_GET(curpcb)->pcb_onfault != NULL) {
frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault;
return (0);
}
trap_fatal(frame, eva);
return (-1);
}
return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
}
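/*
 * Sketch of the pcb_onfault protocol consumed in the nogo: path above.
 * The real users (copyin(), copyout(), fubyte(), ...) live in assembly in
 * support.s; this C-level paraphrase is an assumption written only to show
 * the idea: arm a recovery address before touching user memory, and let a
 * faulting access resume there with an error instead of panicking.
 */
#if 0
static int
copyin_like(const void *uaddr, void *kaddr, size_t len)
{
	struct pcb *pcb = PCPU_GET(curpcb);

	pcb->pcb_onfault = (caddr_t)&&fault;	/* trap_pfault() will point tf_eip here */
	bcopy(uaddr, kaddr, len);		/* may fault on a bad user address */
	pcb->pcb_onfault = NULL;
	return (0);
fault:						/* reached only via the fixup above */
	pcb->pcb_onfault = NULL;
	return (EFAULT);
}
#endif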
static void
trap_fatal(frame, eva)
struct trapframe *frame;
vm_offset_t eva;
{
int code, ss, esp;
u_int type;
struct soft_segment_descriptor softseg;
char *msg;
code = frame->tf_err;
type = frame->tf_trapno;
sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg);
if (type <= MAX_TRAP_MSG)
msg = trap_msg[type];
else
msg = "UNKNOWN";
printf("\n\nFatal trap %d: %s while in %s mode\n", type, msg,
frame->tf_eflags & PSL_VM ? "vm86" :
ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel");
#ifdef SMP
/* two separate prints in case of a trap on an unmapped page */
printf("cpuid = %d; ", PCPU_GET(cpuid));
printf("apic id = %02x\n", PCPU_GET(apic_id));
#endif
if (type == T_PAGEFLT) {
printf("fault virtual address = 0x%x\n", eva);
printf("fault code = %s %s, %s\n",
code & PGEX_U ? "user" : "supervisor",
code & PGEX_W ? "write" : "read",
code & PGEX_P ? "protection violation" : "page not present");
}
printf("instruction pointer = 0x%x:0x%x\n",
frame->tf_cs & 0xffff, frame->tf_eip);
if ((ISPL(frame->tf_cs) == SEL_UPL) || (frame->tf_eflags & PSL_VM)) {
ss = frame->tf_ss & 0xffff;
esp = frame->tf_esp;
} else {
ss = GSEL(GDATA_SEL, SEL_KPL);
esp = (int)&frame->tf_esp;
}
printf("stack pointer = 0x%x:0x%x\n", ss, esp);
printf("frame pointer = 0x%x:0x%x\n", ss, frame->tf_ebp);
printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n",
softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type);
printf(" = DPL %d, pres %d, def32 %d, gran %d\n",
softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32,
softseg.ssd_gran);
printf("processor eflags = ");
if (frame->tf_eflags & PSL_T)
printf("trace trap, ");
if (frame->tf_eflags & PSL_I)
printf("interrupt enabled, ");
if (frame->tf_eflags & PSL_NT)
printf("nested task, ");
if (frame->tf_eflags & PSL_RF)
printf("resume, ");
if (frame->tf_eflags & PSL_VM)
printf("vm86, ");
printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12);
printf("current process = ");
if (curproc) {
printf("%lu (%s)\n", (u_long)curproc->p_pid, curthread->td_name);
} else {
printf("Idle\n");
}
#ifdef KDB
if (debugger_on_panic || kdb_active) {
frame->tf_err = eva; /* smuggle fault address to ddb */
if (kdb_trap(type, 0, frame)) {
frame->tf_err = code; /* restore error code */
return;
}
frame->tf_err = code; /* restore error code */
}
#endif
printf("trap number = %d\n", type);
if (type <= MAX_TRAP_MSG)
panic("%s", trap_msg[type]);
else
panic("unknown/reserved trap");
}
/*
* Double fault handler. Called when a fault occurs while writing
* a frame for a trap/exception onto the stack. This usually occurs
* when the stack overflows (such is the case with infinite recursion,
* for example).
*
* XXX Note that the current PTD gets replaced by IdlePTD when the
* task switch occurs. This means that the stack that was active at
* the time of the double fault is not available at <kstack> unless
* the machine was idle when the double fault occurred. The downside
* of this is that "trace <ebp>" in ddb won't work.
*/
void
dblfault_handler()
{
#ifdef KDTRACE_HOOKS
if (dtrace_doubletrap_func != NULL)
(*dtrace_doubletrap_func)();
#endif
printf("\nFatal double fault:\n");
printf("eip = 0x%x\n", PCPU_GET(common_tss.tss_eip));
printf("esp = 0x%x\n", PCPU_GET(common_tss.tss_esp));
printf("ebp = 0x%x\n", PCPU_GET(common_tss.tss_ebp));
#ifdef SMP
/* two separate prints in case of a trap on an unmapped page */
printf("cpuid = %d; ", PCPU_GET(cpuid));
printf("apic id = %02x\n", PCPU_GET(apic_id));
#endif
panic("double fault");
}
int
cpu_fetch_syscall_args(struct thread *td, struct syscall_args *sa)
{
struct proc *p;
struct trapframe *frame;
caddr_t params;
int error;
p = td->td_proc;
frame = td->td_frame;
params = (caddr_t)frame->tf_esp + sizeof(int);
sa->code = frame->tf_eax;
/*
* Need to check if this is a 32 bit or 64 bit syscall.
*/
if (sa->code == SYS_syscall) {
/*
* Code is first argument, followed by actual args.
*/
sa->code = fuword(params);
params += sizeof(int);
} else if (sa->code == SYS___syscall) {
/*
* Like syscall, but code is a quad, so as to maintain
* quad alignment for the rest of the arguments.
*/
sa->code = fuword(params);
params += sizeof(quad_t);
}
if (p->p_sysent->sv_mask)
sa->code &= p->p_sysent->sv_mask;
if (sa->code >= p->p_sysent->sv_size)
sa->callp = &p->p_sysent->sv_table[0];
else
sa->callp = &p->p_sysent->sv_table[sa->code];
sa->narg = sa->callp->sy_narg;
if (params != NULL && sa->narg != 0)
error = copyin(params, (caddr_t)sa->args,
(u_int)(sa->narg * sizeof(int)));
else
error = 0;
if (error == 0) {
td->td_retval[0] = 0;
td->td_retval[1] = frame->tf_edx;
}
return (error);
}
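/*
 * A minimal userland sketch of the convention cpu_fetch_syscall_args()
 * decodes: the syscall number travels in %eax and the arguments sit on the
 * user stack just above the return address (which is why params starts at
 * tf_esp + sizeof(int)).  Written as an assumption for illustration only;
 * real programs use the libc stubs, and error reporting via the carry flag
 * is omitted here.
 */
#if 0	/* userland example only */
#include <sys/syscall.h>

static int
raw_getpid(void)
{
	int pid;

	__asm__ __volatile__(
	    "int $0x80"		/* enter syscall(); getpid() takes no stack arguments */
	    : "=a" (pid)
	    : "0" (SYS_getpid)
	    : "memory", "cc");
	return (pid);
}
#endif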
#include "../../kern/subr_syscall.c"
/*
* syscall - system call request C handler
*
* A system call is essentially treated as a trap.
*/
void
syscall(struct trapframe *frame)
{
struct thread *td;
struct syscall_args sa;
register_t orig_tf_eflags;
int error;
ksiginfo_t ksi;
#ifdef DIAGNOSTIC
if (ISPL(frame->tf_cs) != SEL_UPL) {
panic("syscall");
/* NOT REACHED */
}
#endif
orig_tf_eflags = frame->tf_eflags;
td = curthread;
td->td_frame = frame;
error = syscallenter(td, &sa);
/*
* Traced syscall.
*/
if ((orig_tf_eflags & PSL_T) && !(orig_tf_eflags & PSL_VM)) {
frame->tf_eflags &= ~PSL_T;
ksiginfo_init_trap(&ksi);
ksi.ksi_signo = SIGTRAP;
ksi.ksi_code = TRAP_TRACE;
ksi.ksi_addr = (void *)frame->tf_eip;
trapsignal(td, &ksi);
}
KASSERT(PCB_USER_FPU(td->td_pcb),
("System call %s returning with kernel FPU ctx leaked",
syscallname(td->td_proc, sa.code)));
KASSERT(td->td_pcb->pcb_save == &td->td_pcb->pcb_user_save,
("System call %s returning with mangled pcb_save",
syscallname(td->td_proc, sa.code)));
syscallret(td, error, &sa);
}
Index: head/sys/i386/ibcs2/ibcs2_ioctl.c
===================================================================
--- head/sys/i386/ibcs2/ibcs2_ioctl.c (revision 225616)
+++ head/sys/i386/ibcs2/ibcs2_ioctl.c (revision 225617)
@@ -1,687 +1,687 @@
/* $NetBSD: ibcs2_ioctl.c,v 1.6 1995/03/14 15:12:28 scottb Exp $ */
/*-
* Copyright (c) 1994, 1995 Scott Bartram
* All rights reserved.
*
* based on compat/sunos/sun_ioctl.c
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capability.h>
#include <sys/consio.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/kbio.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/tty.h>
#include <i386/ibcs2/ibcs2_signal.h>
#include <i386/ibcs2/ibcs2_socksys.h>
#include <i386/ibcs2/ibcs2_stropts.h>
#include <i386/ibcs2/ibcs2_proto.h>
#include <i386/ibcs2/ibcs2_termios.h>
#include <i386/ibcs2/ibcs2_util.h>
#include <i386/ibcs2/ibcs2_ioctl.h>
static void stios2btios(struct ibcs2_termios *, struct termios *);
static void btios2stios(struct termios *, struct ibcs2_termios *);
static void stios2stio(struct ibcs2_termios *, struct ibcs2_termio *);
static void stio2stios(struct ibcs2_termio *, struct ibcs2_termios *);
/*
* iBCS2 ioctl calls.
*/
struct speedtab {
int sp_speed; /* Speed. */
int sp_code; /* Code. */
};
static struct speedtab sptab[] = {
{ 0, 0 },
{ 50, 1 },
{ 75, 2 },
{ 110, 3 },
{ 134, 4 },
{ 135, 4 },
{ 150, 5 },
{ 200, 6 },
{ 300, 7 },
{ 600, 8 },
{ 1200, 9 },
{ 1800, 10 },
{ 2400, 11 },
{ 4800, 12 },
{ 9600, 13 },
{ 19200, 14 },
{ 38400, 15 },
{ -1, -1 }
};
static u_long s2btab[] = {
0,
50,
75,
110,
134,
150,
200,
300,
600,
1200,
1800,
2400,
4800,
9600,
19200,
38400,
};
static int
ttspeedtab(int speed, struct speedtab *table)
{
for ( ; table->sp_speed != -1; table++)
if (table->sp_speed == speed)
return (table->sp_code);
return (-1);
}
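/*
 * Example of the two tables working together (illustrative only):
 * ttspeedtab(9600, sptab) returns 13, and s2btab[13] maps back to 9600;
 * a speed with no table entry yields -1.
 */
#if 0
static void
example_speed_mapping(void)
{
	int code = ttspeedtab(9600, sptab);		/* -> 13 */
	u_long speed = (code >= 0) ? s2btab[code] : 0;	/* -> 9600 */
}
#endif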
static void
stios2btios(st, bt)
struct ibcs2_termios *st;
struct termios *bt;
{
register u_long l, r;
l = st->c_iflag; r = 0;
if (l & IBCS2_IGNBRK) r |= IGNBRK;
if (l & IBCS2_BRKINT) r |= BRKINT;
if (l & IBCS2_IGNPAR) r |= IGNPAR;
if (l & IBCS2_PARMRK) r |= PARMRK;
if (l & IBCS2_INPCK) r |= INPCK;
if (l & IBCS2_ISTRIP) r |= ISTRIP;
if (l & IBCS2_INLCR) r |= INLCR;
if (l & IBCS2_IGNCR) r |= IGNCR;
if (l & IBCS2_ICRNL) r |= ICRNL;
if (l & IBCS2_IXON) r |= IXON;
if (l & IBCS2_IXANY) r |= IXANY;
if (l & IBCS2_IXOFF) r |= IXOFF;
if (l & IBCS2_IMAXBEL) r |= IMAXBEL;
bt->c_iflag = r;
l = st->c_oflag; r = 0;
if (l & IBCS2_OPOST) r |= OPOST;
if (l & IBCS2_ONLCR) r |= ONLCR;
if (l & IBCS2_TAB3) r |= TAB3;
bt->c_oflag = r;
l = st->c_cflag; r = 0;
switch (l & IBCS2_CSIZE) {
case IBCS2_CS5: r |= CS5; break;
case IBCS2_CS6: r |= CS6; break;
case IBCS2_CS7: r |= CS7; break;
case IBCS2_CS8: r |= CS8; break;
}
if (l & IBCS2_CSTOPB) r |= CSTOPB;
if (l & IBCS2_CREAD) r |= CREAD;
if (l & IBCS2_PARENB) r |= PARENB;
if (l & IBCS2_PARODD) r |= PARODD;
if (l & IBCS2_HUPCL) r |= HUPCL;
if (l & IBCS2_CLOCAL) r |= CLOCAL;
bt->c_cflag = r;
bt->c_ispeed = bt->c_ospeed = s2btab[l & 0x0000000f];
l = st->c_lflag; r = 0;
if (l & IBCS2_ISIG) r |= ISIG;
if (l & IBCS2_ICANON) r |= ICANON;
if (l & IBCS2_ECHO) r |= ECHO;
if (l & IBCS2_ECHOE) r |= ECHOE;
if (l & IBCS2_ECHOK) r |= ECHOK;
if (l & IBCS2_ECHONL) r |= ECHONL;
if (l & IBCS2_NOFLSH) r |= NOFLSH;
if (l & IBCS2_TOSTOP) r |= TOSTOP;
bt->c_lflag = r;
bt->c_cc[VINTR] =
st->c_cc[IBCS2_VINTR] ? st->c_cc[IBCS2_VINTR] : _POSIX_VDISABLE;
bt->c_cc[VQUIT] =
st->c_cc[IBCS2_VQUIT] ? st->c_cc[IBCS2_VQUIT] : _POSIX_VDISABLE;
bt->c_cc[VERASE] =
st->c_cc[IBCS2_VERASE] ? st->c_cc[IBCS2_VERASE] : _POSIX_VDISABLE;
bt->c_cc[VKILL] =
st->c_cc[IBCS2_VKILL] ? st->c_cc[IBCS2_VKILL] : _POSIX_VDISABLE;
if (bt->c_lflag & ICANON) {
bt->c_cc[VEOF] =
st->c_cc[IBCS2_VEOF] ? st->c_cc[IBCS2_VEOF] : _POSIX_VDISABLE;
bt->c_cc[VEOL] =
st->c_cc[IBCS2_VEOL] ? st->c_cc[IBCS2_VEOL] : _POSIX_VDISABLE;
} else {
bt->c_cc[VMIN] = st->c_cc[IBCS2_VMIN];
bt->c_cc[VTIME] = st->c_cc[IBCS2_VTIME];
}
bt->c_cc[VEOL2] =
st->c_cc[IBCS2_VEOL2] ? st->c_cc[IBCS2_VEOL2] : _POSIX_VDISABLE;
#if 0
bt->c_cc[VSWTCH] =
st->c_cc[IBCS2_VSWTCH] ? st->c_cc[IBCS2_VSWTCH] : _POSIX_VDISABLE;
#endif
bt->c_cc[VSTART] =
st->c_cc[IBCS2_VSTART] ? st->c_cc[IBCS2_VSTART] : _POSIX_VDISABLE;
bt->c_cc[VSTOP] =
st->c_cc[IBCS2_VSTOP] ? st->c_cc[IBCS2_VSTOP] : _POSIX_VDISABLE;
bt->c_cc[VSUSP] =
st->c_cc[IBCS2_VSUSP] ? st->c_cc[IBCS2_VSUSP] : _POSIX_VDISABLE;
bt->c_cc[VDSUSP] = _POSIX_VDISABLE;
bt->c_cc[VREPRINT] = _POSIX_VDISABLE;
bt->c_cc[VDISCARD] = _POSIX_VDISABLE;
bt->c_cc[VWERASE] = _POSIX_VDISABLE;
bt->c_cc[VLNEXT] = _POSIX_VDISABLE;
bt->c_cc[VSTATUS] = _POSIX_VDISABLE;
}
static void
btios2stios(bt, st)
struct termios *bt;
struct ibcs2_termios *st;
{
register u_long l, r;
l = bt->c_iflag; r = 0;
if (l & IGNBRK) r |= IBCS2_IGNBRK;
if (l & BRKINT) r |= IBCS2_BRKINT;
if (l & IGNPAR) r |= IBCS2_IGNPAR;
if (l & PARMRK) r |= IBCS2_PARMRK;
if (l & INPCK) r |= IBCS2_INPCK;
if (l & ISTRIP) r |= IBCS2_ISTRIP;
if (l & INLCR) r |= IBCS2_INLCR;
if (l & IGNCR) r |= IBCS2_IGNCR;
if (l & ICRNL) r |= IBCS2_ICRNL;
if (l & IXON) r |= IBCS2_IXON;
if (l & IXANY) r |= IBCS2_IXANY;
if (l & IXOFF) r |= IBCS2_IXOFF;
if (l & IMAXBEL) r |= IBCS2_IMAXBEL;
st->c_iflag = r;
l = bt->c_oflag; r = 0;
if (l & OPOST) r |= IBCS2_OPOST;
if (l & ONLCR) r |= IBCS2_ONLCR;
if (l & TAB3) r |= IBCS2_TAB3;
st->c_oflag = r;
l = bt->c_cflag; r = 0;
switch (l & CSIZE) {
case CS5: r |= IBCS2_CS5; break;
case CS6: r |= IBCS2_CS6; break;
case CS7: r |= IBCS2_CS7; break;
case CS8: r |= IBCS2_CS8; break;
}
if (l & CSTOPB) r |= IBCS2_CSTOPB;
if (l & CREAD) r |= IBCS2_CREAD;
if (l & PARENB) r |= IBCS2_PARENB;
if (l & PARODD) r |= IBCS2_PARODD;
if (l & HUPCL) r |= IBCS2_HUPCL;
if (l & CLOCAL) r |= IBCS2_CLOCAL;
st->c_cflag = r;
l = bt->c_lflag; r = 0;
if (l & ISIG) r |= IBCS2_ISIG;
if (l & ICANON) r |= IBCS2_ICANON;
if (l & ECHO) r |= IBCS2_ECHO;
if (l & ECHOE) r |= IBCS2_ECHOE;
if (l & ECHOK) r |= IBCS2_ECHOK;
if (l & ECHONL) r |= IBCS2_ECHONL;
if (l & NOFLSH) r |= IBCS2_NOFLSH;
if (l & TOSTOP) r |= IBCS2_TOSTOP;
st->c_lflag = r;
l = ttspeedtab(bt->c_ospeed, sptab);
if ((int)l >= 0)
st->c_cflag |= l;
st->c_cc[IBCS2_VINTR] =
bt->c_cc[VINTR] != _POSIX_VDISABLE ? bt->c_cc[VINTR] : 0;
st->c_cc[IBCS2_VQUIT] =
bt->c_cc[VQUIT] != _POSIX_VDISABLE ? bt->c_cc[VQUIT] : 0;
st->c_cc[IBCS2_VERASE] =
bt->c_cc[VERASE] != _POSIX_VDISABLE ? bt->c_cc[VERASE] : 0;
st->c_cc[IBCS2_VKILL] =
bt->c_cc[VKILL] != _POSIX_VDISABLE ? bt->c_cc[VKILL] : 0;
if (bt->c_lflag & ICANON) {
st->c_cc[IBCS2_VEOF] =
bt->c_cc[VEOF] != _POSIX_VDISABLE ? bt->c_cc[VEOF] : 0;
st->c_cc[IBCS2_VEOL] =
bt->c_cc[VEOL] != _POSIX_VDISABLE ? bt->c_cc[VEOL] : 0;
} else {
st->c_cc[IBCS2_VMIN] = bt->c_cc[VMIN];
st->c_cc[IBCS2_VTIME] = bt->c_cc[VTIME];
}
st->c_cc[IBCS2_VEOL2] =
bt->c_cc[VEOL2] != _POSIX_VDISABLE ? bt->c_cc[VEOL2] : 0;
st->c_cc[IBCS2_VSWTCH] =
0;
st->c_cc[IBCS2_VSUSP] =
bt->c_cc[VSUSP] != _POSIX_VDISABLE ? bt->c_cc[VSUSP] : 0;
st->c_cc[IBCS2_VSTART] =
bt->c_cc[VSTART] != _POSIX_VDISABLE ? bt->c_cc[VSTART] : 0;
st->c_cc[IBCS2_VSTOP] =
bt->c_cc[VSTOP] != _POSIX_VDISABLE ? bt->c_cc[VSTOP] : 0;
st->c_line = 0;
}
static void
stios2stio(ts, t)
struct ibcs2_termios *ts;
struct ibcs2_termio *t;
{
t->c_iflag = ts->c_iflag;
t->c_oflag = ts->c_oflag;
t->c_cflag = ts->c_cflag;
t->c_lflag = ts->c_lflag;
t->c_line = ts->c_line;
bcopy(ts->c_cc, t->c_cc, IBCS2_NCC);
}
static void
stio2stios(t, ts)
struct ibcs2_termio *t;
struct ibcs2_termios *ts;
{
ts->c_iflag = t->c_iflag;
ts->c_oflag = t->c_oflag;
ts->c_cflag = t->c_cflag;
ts->c_lflag = t->c_lflag;
ts->c_line = t->c_line;
bcopy(t->c_cc, ts->c_cc, IBCS2_NCC);
}
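/*
 * Taken together, the four converters above implement the get/modify/set
 * pattern used by the IBCS2_TCSETA* cases below.  A condensed sketch of
 * that path follows (illustrative only; error handling is omitted and the
 * function name is made up):
 */
#if 0
static int
example_tcseta_path(struct file *fp, struct ibcs2_termio *st, struct thread *td)
{
	struct termios bts;
	struct ibcs2_termios sts;

	fo_ioctl(fp, TIOCGETA, (caddr_t)&bts, td->td_ucred, td);
	btios2stios(&bts, &sts);	/* native termios -> iBCS2 termios */
	stio2stios(st, &sts);		/* overlay the caller's smaller termio */
	stios2btios(&sts, &bts);	/* and convert back to native */
	return (fo_ioctl(fp, TIOCSETA, (caddr_t)&bts, td->td_ucred, td));
}
#endif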
int
ibcs2_ioctl(td, uap)
struct thread *td;
struct ibcs2_ioctl_args *uap;
{
struct proc *p = td->td_proc;
struct file *fp;
int error;
if ((error = fget(td, uap->fd, CAP_IOCTL, &fp)) != 0) {
DPRINTF(("ibcs2_ioctl(%d): bad fd %d ", p->p_pid,
uap->fd));
return EBADF;
}
if ((fp->f_flag & (FREAD|FWRITE)) == 0) {
fdrop(fp, td);
DPRINTF(("ibcs2_ioctl(%d): bad fp flag ", p->p_pid));
return EBADF;
}
switch (uap->cmd) {
case IBCS2_TCGETA:
case IBCS2_XCGETA:
case IBCS2_OXCGETA:
{
struct termios bts;
struct ibcs2_termios sts;
struct ibcs2_termio st;
if ((error = fo_ioctl(fp, TIOCGETA, (caddr_t)&bts,
td->td_ucred, td)) != 0)
break;
btios2stios (&bts, &sts);
if (uap->cmd == IBCS2_TCGETA) {
stios2stio (&sts, &st);
error = copyout((caddr_t)&st, uap->data,
sizeof (st));
#ifdef DEBUG_IBCS2
if (error)
DPRINTF(("ibcs2_ioctl(%d): copyout failed ",
p->p_pid));
#endif
break;
} else {
error = copyout((caddr_t)&sts, uap->data,
sizeof (sts));
break;
}
/*NOTREACHED*/
}
case IBCS2_TCSETA:
case IBCS2_TCSETAW:
case IBCS2_TCSETAF:
{
struct termios bts;
struct ibcs2_termios sts;
struct ibcs2_termio st;
if ((error = copyin(uap->data, (caddr_t)&st,
sizeof(st))) != 0) {
DPRINTF(("ibcs2_ioctl(%d): TCSET copyin failed ",
p->p_pid));
break;
}
/* get full BSD termios so we don't lose information */
if ((error = fo_ioctl(fp, TIOCGETA, (caddr_t)&bts,
td->td_ucred, td)) != 0) {
DPRINTF(("ibcs2_ioctl(%d): TCSET ctl failed fd %d ",
p->p_pid, uap->fd));
break;
}
/*
* convert to iBCS2 termios, copy in information from
* termio, and convert back, then set new values.
*/
btios2stios(&bts, &sts);
stio2stios(&st, &sts);
stios2btios(&sts, &bts);
error = fo_ioctl(fp, uap->cmd - IBCS2_TCSETA + TIOCSETA,
(caddr_t)&bts, td->td_ucred, td);
break;
}
case IBCS2_XCSETA:
case IBCS2_XCSETAW:
case IBCS2_XCSETAF:
{
struct termios bts;
struct ibcs2_termios sts;
if ((error = copyin(uap->data, (caddr_t)&sts,
sizeof (sts))) != 0)
break;
stios2btios (&sts, &bts);
error = fo_ioctl(fp, uap->cmd - IBCS2_XCSETA + TIOCSETA,
(caddr_t)&bts, td->td_ucred, td);
break;
}
case IBCS2_OXCSETA:
case IBCS2_OXCSETAW:
case IBCS2_OXCSETAF:
{
struct termios bts;
struct ibcs2_termios sts;
if ((error = copyin(uap->data, (caddr_t)&sts,
sizeof (sts))) != 0)
break;
stios2btios (&sts, &bts);
error = fo_ioctl(fp, uap->cmd - IBCS2_OXCSETA + TIOCSETA,
(caddr_t)&bts, td->td_ucred, td);
break;
}
case IBCS2_TCSBRK:
DPRINTF(("ibcs2_ioctl(%d): TCSBRK ", p->p_pid));
error = ENOSYS;
break;
case IBCS2_TCXONC:
{
switch ((int)uap->data) {
case 0:
case 1:
DPRINTF(("ibcs2_ioctl(%d): TCXONC ", p->p_pid));
error = ENOSYS;
break;
case 2:
error = fo_ioctl(fp, TIOCSTOP, (caddr_t)0,
td->td_ucred, td);
break;
case 3:
error = fo_ioctl(fp, TIOCSTART, (caddr_t)1,
td->td_ucred, td);
break;
default:
error = EINVAL;
break;
}
break;
}
case IBCS2_TCFLSH:
{
int arg;
switch ((int)uap->data) {
case 0:
arg = FREAD;
break;
case 1:
arg = FWRITE;
break;
case 2:
arg = FREAD | FWRITE;
break;
default:
fdrop(fp, td);
return EINVAL;
}
error = fo_ioctl(fp, TIOCFLUSH, (caddr_t)&arg, td->td_ucred,
td);
break;
}
case IBCS2_TIOCGWINSZ:
uap->cmd = TIOCGWINSZ;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_TIOCSWINSZ:
uap->cmd = TIOCSWINSZ;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_TIOCGPGRP:
{
pid_t pg_id;
PROC_LOCK(p);
pg_id = p->p_pgrp->pg_id;
PROC_UNLOCK(p);
error = copyout((caddr_t)&pg_id, uap->data,
sizeof(pg_id));
break;
}
case IBCS2_TIOCSPGRP: /* XXX - is uap->data a pointer to pgid? */
{
struct setpgid_args sa;
sa.pid = 0;
sa.pgid = (int)uap->data;
- error = setpgid(td, &sa);
+ error = sys_setpgid(td, &sa);
break;
}
case IBCS2_TCGETSC: /* SCO console - get scancode flags */
error = EINTR; /* ENOSYS; */
break;
case IBCS2_TCSETSC: /* SCO console - set scancode flags */
error = 0; /* ENOSYS; */
break;
case IBCS2_JWINSIZE: /* Unix to Jerq I/O control */
{
struct ibcs2_jwinsize {
char bytex, bytey;
short bitx, bity;
} ibcs2_jwinsize;
PROC_LOCK(p);
SESS_LOCK(p->p_session);
ibcs2_jwinsize.bytex = 80;
/* p->p_session->s_ttyp->t_winsize.ws_col; XXX */
ibcs2_jwinsize.bytey = 25;
/* p->p_session->s_ttyp->t_winsize.ws_row; XXX */
ibcs2_jwinsize.bitx =
p->p_session->s_ttyp->t_winsize.ws_xpixel;
ibcs2_jwinsize.bity =
p->p_session->s_ttyp->t_winsize.ws_ypixel;
SESS_UNLOCK(p->p_session);
PROC_UNLOCK(p);
error = copyout((caddr_t)&ibcs2_jwinsize, uap->data,
sizeof(ibcs2_jwinsize));
break;
}
/* keyboard and display ioctl's -- type 'K' */
case IBCS2_KDGKBMODE: /* get keyboard translation mode */
uap->cmd = KDGKBMODE;
/* printf("ioctl KDGKBMODE = %x\n", uap->cmd);*/
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_KDSKBMODE: /* set keyboard translation mode */
uap->cmd = KDSKBMODE;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_KDMKTONE: /* sound tone */
uap->cmd = KDMKTONE;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_KDGETMODE: /* get text/graphics mode */
uap->cmd = KDGETMODE;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_KDSETMODE: /* set text/graphics mode */
uap->cmd = KDSETMODE;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_KDSBORDER: /* set ega color border */
uap->cmd = KDSBORDER;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_KDGKBSTATE:
uap->cmd = KDGKBSTATE;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_KDSETRAD:
uap->cmd = KDSETRAD;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_KDENABIO: /* enable direct I/O to ports */
uap->cmd = KDENABIO;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_KDDISABIO: /* disable direct I/O to ports */
uap->cmd = KDDISABIO;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_KIOCSOUND: /* start sound generation */
uap->cmd = KIOCSOUND;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_KDGKBTYPE: /* get keyboard type */
uap->cmd = KDGKBTYPE;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_KDGETLED: /* get keyboard LED status */
uap->cmd = KDGETLED;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_KDSETLED: /* set keyboard LED status */
uap->cmd = KDSETLED;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
/* Xenix keyboard and display ioctl's from sys/kd.h -- type 'k' */
case IBCS2_GETFKEY: /* Get function key */
uap->cmd = GETFKEY;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_SETFKEY: /* Set function key */
uap->cmd = SETFKEY;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_GIO_SCRNMAP: /* Get screen output map table */
uap->cmd = GIO_SCRNMAP;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_PIO_SCRNMAP: /* Set screen output map table */
uap->cmd = PIO_SCRNMAP;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_GIO_KEYMAP: /* Get keyboard map table */
uap->cmd = OGIO_KEYMAP;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_PIO_KEYMAP: /* Set keyboard map table */
uap->cmd = OPIO_KEYMAP;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
/* socksys */
case IBCS2_SIOCSOCKSYS:
error = ibcs2_socksys(td, (struct ibcs2_socksys_args *)uap);
break;
case IBCS2_FIONREAD:
case IBCS2_I_NREAD: /* STREAMS */
uap->cmd = FIONREAD;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
default:
DPRINTF(("ibcs2_ioctl(%d): unknown cmd 0x%lx ",
p->p_pid, uap->cmd));
error = ENOSYS;
break;
}
fdrop(fp, td);
return error;
}
Index: head/sys/i386/ibcs2/ibcs2_ipc.c
===================================================================
--- head/sys/i386/ibcs2/ibcs2_ipc.c (revision 225616)
+++ head/sys/i386/ibcs2/ibcs2_ipc.c (revision 225617)
@@ -1,560 +1,560 @@
/*-
* Copyright (c) 1995 Scott Bartram
* Copyright (c) 1995 Steven Wallace
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/limits.h>
#include <sys/msg.h>
#include <sys/sem.h>
#include <sys/shm.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>
#include <i386/ibcs2/ibcs2_types.h>
#include <i386/ibcs2/ibcs2_signal.h>
#include <i386/ibcs2/ibcs2_proto.h>
#include <i386/ibcs2/ibcs2_util.h>
#include <i386/ibcs2/ibcs2_ipc.h>
#define IBCS2_IPC_RMID 0
#define IBCS2_IPC_SET 1
#define IBCS2_IPC_STAT 2
#define IBCS2_SETVAL 8
static void cvt_msqid2imsqid(struct msqid_ds *, struct ibcs2_msqid_ds *);
static void cvt_imsqid2msqid(struct ibcs2_msqid_ds *, struct msqid_ds *);
#ifdef unused
static void cvt_sem2isem(struct sem *, struct ibcs2_sem *);
static void cvt_isem2sem(struct ibcs2_sem *, struct sem *);
#endif
static void cvt_semid2isemid(struct semid_ds *, struct ibcs2_semid_ds *);
static void cvt_isemid2semid(struct ibcs2_semid_ds *, struct semid_ds *);
static void cvt_shmid2ishmid(struct shmid_ds *, struct ibcs2_shmid_ds *);
static void cvt_ishmid2shmid(struct ibcs2_shmid_ds *, struct shmid_ds *);
static void cvt_perm2iperm(struct ipc_perm *, struct ibcs2_ipc_perm *);
static void cvt_iperm2perm(struct ibcs2_ipc_perm *, struct ipc_perm *);
/*
* iBCS2 msgsys call
*/
static void
cvt_msqid2imsqid(bp, ibp)
struct msqid_ds *bp;
struct ibcs2_msqid_ds *ibp;
{
cvt_perm2iperm(&bp->msg_perm, &ibp->msg_perm);
ibp->msg_first = bp->msg_first;
ibp->msg_last = bp->msg_last;
ibp->msg_cbytes = (u_short)bp->msg_cbytes;
ibp->msg_qnum = (u_short)bp->msg_qnum;
ibp->msg_qbytes = (u_short)bp->msg_qbytes;
ibp->msg_lspid = (u_short)bp->msg_lspid;
ibp->msg_lrpid = (u_short)bp->msg_lrpid;
ibp->msg_stime = bp->msg_stime;
ibp->msg_rtime = bp->msg_rtime;
ibp->msg_ctime = bp->msg_ctime;
return;
}
static void
cvt_imsqid2msqid(ibp, bp)
struct ibcs2_msqid_ds *ibp;
struct msqid_ds *bp;
{
cvt_iperm2perm(&ibp->msg_perm, &bp->msg_perm);
bp->msg_first = ibp->msg_first;
bp->msg_last = ibp->msg_last;
bp->msg_cbytes = ibp->msg_cbytes;
bp->msg_qnum = ibp->msg_qnum;
bp->msg_qbytes = ibp->msg_qbytes;
bp->msg_lspid = ibp->msg_lspid;
bp->msg_lrpid = ibp->msg_lrpid;
bp->msg_stime = ibp->msg_stime;
bp->msg_rtime = ibp->msg_rtime;
bp->msg_ctime = ibp->msg_ctime;
return;
}
struct ibcs2_msgget_args {
int what;
ibcs2_key_t key;
int msgflg;
};
static int
ibcs2_msgget(struct thread *td, void *v)
{
struct ibcs2_msgget_args *uap = v;
struct msgget_args ap;
ap.key = uap->key;
ap.msgflg = uap->msgflg;
- return msgget(td, &ap);
+ return sys_msgget(td, &ap);
}
struct ibcs2_msgctl_args {
int what;
int msqid;
int cmd;
struct ibcs2_msqid_ds *buf;
};
static int
ibcs2_msgctl(struct thread *td, void *v)
{
struct ibcs2_msgctl_args *uap = v;
struct ibcs2_msqid_ds is;
struct msqid_ds bs;
int error;
switch (uap->cmd) {
case IBCS2_IPC_STAT:
error = kern_msgctl(td, uap->msqid, IPC_STAT, &bs);
if (!error) {
cvt_msqid2imsqid(&bs, &is);
error = copyout(&is, uap->buf, sizeof(is));
}
return (error);
case IBCS2_IPC_SET:
error = copyin(uap->buf, &is, sizeof(is));
if (error)
return (error);
cvt_imsqid2msqid(&is, &bs);
return (kern_msgctl(td, uap->msqid, IPC_SET, &bs));
case IBCS2_IPC_RMID:
return (kern_msgctl(td, uap->msqid, IPC_RMID, NULL));
}
return (EINVAL);
}
struct ibcs2_msgrcv_args {
int what;
int msqid;
void *msgp;
size_t msgsz;
long msgtyp;
int msgflg;
};
static int
ibcs2_msgrcv(struct thread *td, void *v)
{
struct ibcs2_msgrcv_args *uap = v;
struct msgrcv_args ap;
ap.msqid = uap->msqid;
ap.msgp = uap->msgp;
ap.msgsz = uap->msgsz;
ap.msgtyp = uap->msgtyp;
ap.msgflg = uap->msgflg;
- return (msgrcv(td, &ap));
+ return (sys_msgrcv(td, &ap));
}
struct ibcs2_msgsnd_args {
int what;
int msqid;
void *msgp;
size_t msgsz;
int msgflg;
};
static int
ibcs2_msgsnd(struct thread *td, void *v)
{
struct ibcs2_msgsnd_args *uap = v;
struct msgsnd_args ap;
ap.msqid = uap->msqid;
ap.msgp = uap->msgp;
ap.msgsz = uap->msgsz;
ap.msgflg = uap->msgflg;
- return (msgsnd(td, &ap));
+ return (sys_msgsnd(td, &ap));
}
int
ibcs2_msgsys(td, uap)
struct thread *td;
struct ibcs2_msgsys_args *uap;
{
switch (uap->which) {
case 0:
return (ibcs2_msgget(td, uap));
case 1:
return (ibcs2_msgctl(td, uap));
case 2:
return (ibcs2_msgrcv(td, uap));
case 3:
return (ibcs2_msgsnd(td, uap));
default:
return (EINVAL);
}
}
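/*
 * Layout note (illustrative): handing the same uap to each helper above
 * works because the per-operation args structs defined in this file
 * (e.g. ibcs2_msgget_args) start with "int what;" and list their
 * arguments in the same order as the generic msgsys argument block, so on
 * i386 the fields overlay one-to-one with the selector in the first slot.
 */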
/*
* iBCS2 semsys call
*/
#ifdef unused
static void
cvt_sem2isem(bp, ibp)
struct sem *bp;
struct ibcs2_sem *ibp;
{
ibp->semval = bp->semval;
ibp->sempid = bp->sempid;
ibp->semncnt = bp->semncnt;
ibp->semzcnt = bp->semzcnt;
return;
}
static void
cvt_isem2sem(ibp, bp)
struct ibcs2_sem *ibp;
struct sem *bp;
{
bp->semval = ibp->semval;
bp->sempid = ibp->sempid;
bp->semncnt = ibp->semncnt;
bp->semzcnt = ibp->semzcnt;
return;
}
#endif
static void
cvt_iperm2perm(ipp, pp)
struct ibcs2_ipc_perm *ipp;
struct ipc_perm *pp;
{
pp->uid = ipp->uid;
pp->gid = ipp->gid;
pp->cuid = ipp->cuid;
pp->cgid = ipp->cgid;
pp->mode = ipp->mode;
pp->seq = ipp->seq;
pp->key = ipp->key;
}
static void
cvt_perm2iperm(pp, ipp)
struct ipc_perm *pp;
struct ibcs2_ipc_perm *ipp;
{
ipp->uid = pp->uid;
ipp->gid = pp->gid;
ipp->cuid = pp->cuid;
ipp->cgid = pp->cgid;
ipp->mode = pp->mode;
ipp->seq = pp->seq;
ipp->key = pp->key;
}
static void
cvt_semid2isemid(bp, ibp)
struct semid_ds *bp;
struct ibcs2_semid_ds *ibp;
{
cvt_perm2iperm(&bp->sem_perm, &ibp->sem_perm);
ibp->sem_base = (struct ibcs2_sem *)bp->sem_base;
ibp->sem_nsems = bp->sem_nsems;
ibp->sem_otime = bp->sem_otime;
ibp->sem_ctime = bp->sem_ctime;
return;
}
static void
cvt_isemid2semid(ibp, bp)
struct ibcs2_semid_ds *ibp;
struct semid_ds *bp;
{
cvt_iperm2perm(&ibp->sem_perm, &bp->sem_perm);
bp->sem_base = (struct sem *)ibp->sem_base;
bp->sem_nsems = ibp->sem_nsems;
bp->sem_otime = ibp->sem_otime;
bp->sem_ctime = ibp->sem_ctime;
return;
}
struct ibcs2_semctl_args {
int what;
int semid;
int semnum;
int cmd;
union semun arg;
};
static int
ibcs2_semctl(struct thread *td, void *v)
{
struct ibcs2_semctl_args *uap = v;
struct ibcs2_semid_ds is;
struct semid_ds bs;
union semun semun;
register_t rval;
int error;
switch(uap->cmd) {
case IBCS2_IPC_STAT:
semun.buf = &bs;
error = kern_semctl(td, uap->semid, uap->semnum, IPC_STAT,
&semun, &rval);
if (error)
return (error);
cvt_semid2isemid(&bs, &is);
error = copyout(&is, uap->arg.buf, sizeof(is));
if (error == 0)
td->td_retval[0] = rval;
return (error);
case IBCS2_IPC_SET:
error = copyin(uap->arg.buf, &is, sizeof(is));
if (error)
return (error);
cvt_isemid2semid(&is, &bs);
semun.buf = &bs;
return (kern_semctl(td, uap->semid, uap->semnum, IPC_SET,
&semun, td->td_retval));
}
return (kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &uap->arg,
td->td_retval));
}
struct ibcs2_semget_args {
int what;
ibcs2_key_t key;
int nsems;
int semflg;
};
static int
ibcs2_semget(struct thread *td, void *v)
{
struct ibcs2_semget_args *uap = v;
struct semget_args ap;
ap.key = uap->key;
ap.nsems = uap->nsems;
ap.semflg = uap->semflg;
- return (semget(td, &ap));
+ return (sys_semget(td, &ap));
}
struct ibcs2_semop_args {
int what;
int semid;
struct sembuf *sops;
size_t nsops;
};
static int
ibcs2_semop(struct thread *td, void *v)
{
struct ibcs2_semop_args *uap = v;
struct semop_args ap;
ap.semid = uap->semid;
ap.sops = uap->sops;
ap.nsops = uap->nsops;
- return (semop(td, &ap));
+ return (sys_semop(td, &ap));
}
int
ibcs2_semsys(td, uap)
struct thread *td;
struct ibcs2_semsys_args *uap;
{
switch (uap->which) {
case 0:
return (ibcs2_semctl(td, uap));
case 1:
return (ibcs2_semget(td, uap));
case 2:
return (ibcs2_semop(td, uap));
}
return (EINVAL);
}
/*
* iBCS2 shmsys call
*/
static void
cvt_shmid2ishmid(bp, ibp)
struct shmid_ds *bp;
struct ibcs2_shmid_ds *ibp;
{
cvt_perm2iperm(&bp->shm_perm, &ibp->shm_perm);
ibp->shm_segsz = bp->shm_segsz;
ibp->shm_lpid = bp->shm_lpid;
ibp->shm_cpid = bp->shm_cpid;
if (bp->shm_nattch > SHRT_MAX)
ibp->shm_nattch = SHRT_MAX;
else
ibp->shm_nattch = bp->shm_nattch;
ibp->shm_cnattch = 0; /* ignored anyway */
ibp->shm_atime = bp->shm_atime;
ibp->shm_dtime = bp->shm_dtime;
ibp->shm_ctime = bp->shm_ctime;
return;
}
static void
cvt_ishmid2shmid(ibp, bp)
struct ibcs2_shmid_ds *ibp;
struct shmid_ds *bp;
{
cvt_iperm2perm(&ibp->shm_perm, &bp->shm_perm);
bp->shm_segsz = ibp->shm_segsz;
bp->shm_lpid = ibp->shm_lpid;
bp->shm_cpid = ibp->shm_cpid;
bp->shm_nattch = ibp->shm_nattch;
bp->shm_atime = ibp->shm_atime;
bp->shm_dtime = ibp->shm_dtime;
bp->shm_ctime = ibp->shm_ctime;
return;
}
struct ibcs2_shmat_args {
int what;
int shmid;
const void *shmaddr;
int shmflg;
};
static int
ibcs2_shmat(struct thread *td, void *v)
{
struct ibcs2_shmat_args *uap = v;
struct shmat_args ap;
ap.shmid = uap->shmid;
ap.shmaddr = uap->shmaddr;
ap.shmflg = uap->shmflg;
- return (shmat(td, &ap));
+ return (sys_shmat(td, &ap));
}
struct ibcs2_shmctl_args {
int what;
int shmid;
int cmd;
struct ibcs2_shmid_ds *buf;
};
static int
ibcs2_shmctl(struct thread *td, void *v)
{
struct ibcs2_shmctl_args *uap = v;
struct ibcs2_shmid_ds is;
struct shmid_ds bs;
int error;
switch(uap->cmd) {
case IBCS2_IPC_STAT:
error = kern_shmctl(td, uap->shmid, IPC_STAT, &bs, NULL);
if (error)
return (error);
cvt_shmid2ishmid(&bs, &is);
return (copyout(&is, uap->buf, sizeof(is)));
case IBCS2_IPC_SET:
error = copyin(uap->buf, &is, sizeof(is));
if (error)
return (error);
cvt_ishmid2shmid(&is, &bs);
return (kern_shmctl(td, uap->shmid, IPC_SET, &bs, NULL));
case IPC_INFO:
case SHM_INFO:
case SHM_STAT:
/* XXX: */
return (EINVAL);
}
return (kern_shmctl(td, uap->shmid, uap->cmd, NULL, NULL));
}
struct ibcs2_shmdt_args {
int what;
const void *shmaddr;
};
static int
ibcs2_shmdt(struct thread *td, void *v)
{
struct ibcs2_shmdt_args *uap = v;
struct shmdt_args ap;
ap.shmaddr = uap->shmaddr;
- return (shmdt(td, &ap));
+ return (sys_shmdt(td, &ap));
}
struct ibcs2_shmget_args {
int what;
ibcs2_key_t key;
size_t size;
int shmflg;
};
static int
ibcs2_shmget(struct thread *td, void *v)
{
struct ibcs2_shmget_args *uap = v;
struct shmget_args ap;
ap.key = uap->key;
ap.size = uap->size;
ap.shmflg = uap->shmflg;
- return (shmget(td, &ap));
+ return (sys_shmget(td, &ap));
}
int
ibcs2_shmsys(td, uap)
struct thread *td;
struct ibcs2_shmsys_args *uap;
{
switch (uap->which) {
case 0:
return (ibcs2_shmat(td, uap));
case 1:
return (ibcs2_shmctl(td, uap));
case 2:
return (ibcs2_shmdt(td, uap));
case 3:
return (ibcs2_shmget(td, uap));
}
return (EINVAL);
}
MODULE_DEPEND(ibcs2, sysvmsg, 1, 1, 1);
MODULE_DEPEND(ibcs2, sysvsem, 1, 1, 1);
MODULE_DEPEND(ibcs2, sysvshm, 1, 1, 1);
Index: head/sys/i386/ibcs2/ibcs2_misc.c
===================================================================
--- head/sys/i386/ibcs2/ibcs2_misc.c (revision 225616)
+++ head/sys/i386/ibcs2/ibcs2_misc.c (revision 225617)
@@ -1,1267 +1,1267 @@
/*-
* Copyright (c) 1995 Steven Wallace
* Copyright (c) 1994, 1995 Scott Bartram
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This software was developed by the Computer Systems Engineering group
* at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
* contributed to Berkeley.
*
* All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Lawrence Berkeley Laboratory.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Header: sun_misc.c,v 1.16 93/04/07 02:46:27 torek Exp
*
* @(#)sun_misc.c 8.1 (Berkeley) 6/18/93
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
* IBCS2 compatibility module.
*
* IBCS2 system calls that are implemented differently in BSD are
* handled here.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capability.h>
#include <sys/dirent.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/imgact.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/file.h> /* Must come after sys/malloc.h */
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/priv.h>
#include <sys/reboot.h>
#include <sys/resourcevar.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>
#include <sys/time.h>
#include <sys/times.h>
#include <sys/vnode.h>
#include <sys/wait.h>
#include <machine/cpu.h>
#include <i386/ibcs2/ibcs2_dirent.h>
#include <i386/ibcs2/ibcs2_signal.h>
#include <i386/ibcs2/ibcs2_proto.h>
#include <i386/ibcs2/ibcs2_unistd.h>
#include <i386/ibcs2/ibcs2_util.h>
#include <i386/ibcs2/ibcs2_utime.h>
#include <i386/ibcs2/ibcs2_xenix.h>
#include <security/mac/mac_framework.h>
int
ibcs2_ulimit(td, uap)
struct thread *td;
struct ibcs2_ulimit_args *uap;
{
struct rlimit rl;
struct proc *p;
int error;
#define IBCS2_GETFSIZE 1
#define IBCS2_SETFSIZE 2
#define IBCS2_GETPSIZE 3
#define IBCS2_GETDTABLESIZE 4
p = td->td_proc;
switch (uap->cmd) {
case IBCS2_GETFSIZE:
PROC_LOCK(p);
td->td_retval[0] = lim_cur(p, RLIMIT_FSIZE);
PROC_UNLOCK(p);
if (td->td_retval[0] == -1)
td->td_retval[0] = 0x7fffffff;
return 0;
case IBCS2_SETFSIZE:
PROC_LOCK(p);
rl.rlim_max = lim_max(p, RLIMIT_FSIZE);
PROC_UNLOCK(p);
rl.rlim_cur = uap->newlimit;
error = kern_setrlimit(td, RLIMIT_FSIZE, &rl);
if (!error) {
PROC_LOCK(p);
td->td_retval[0] = lim_cur(p, RLIMIT_FSIZE);
PROC_UNLOCK(p);
} else {
DPRINTF(("failed "));
}
return error;
case IBCS2_GETPSIZE:
PROC_LOCK(p);
td->td_retval[0] = lim_cur(p, RLIMIT_RSS); /* XXX */
PROC_UNLOCK(p);
return 0;
case IBCS2_GETDTABLESIZE:
uap->cmd = IBCS2_SC_OPEN_MAX;
return ibcs2_sysconf(td, (struct ibcs2_sysconf_args *)uap);
default:
return ENOSYS;
}
}
#define IBCS2_WSTOPPED 0177
#define IBCS2_STOPCODE(sig) ((sig) << 8 | IBCS2_WSTOPPED)
int
ibcs2_wait(td, uap)
struct thread *td;
struct ibcs2_wait_args *uap;
{
int error, options, status;
int *statusp;
pid_t pid;
struct trapframe *tf = td->td_frame;
if ((tf->tf_eflags & (PSL_Z|PSL_PF|PSL_N|PSL_V))
== (PSL_Z|PSL_PF|PSL_N|PSL_V)) {
/* waitpid */
pid = uap->a1;
statusp = (int *)uap->a2;
options = uap->a3;
} else {
/* wait */
pid = WAIT_ANY;
statusp = (int *)uap->a1;
options = 0;
}
error = kern_wait(td, pid, &status, options, NULL);
if (error)
return error;
if (statusp) {
/*
* Convert status/signal result.
*/
if (WIFSTOPPED(status)) {
if (WSTOPSIG(status) <= 0 ||
WSTOPSIG(status) > IBCS2_SIGTBLSZ)
return (EINVAL);
status =
IBCS2_STOPCODE(bsd_to_ibcs2_sig[_SIG_IDX(WSTOPSIG(status))]);
} else if (WIFSIGNALED(status)) {
if (WTERMSIG(status) <= 0 ||
WTERMSIG(status) > IBCS2_SIGTBLSZ)
return (EINVAL);
status = bsd_to_ibcs2_sig[_SIG_IDX(WTERMSIG(status))];
}
/* else exit status -- identical */
/* record result/status */
td->td_retval[1] = status;
return copyout(&status, statusp, sizeof(status));
}
return 0;
}
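/*
 * Worked example of the status conversion above (using the signal tables
 * in ibcs2_signal.c, assuming the usual SVR3 numbering): a child stopped
 * by SIGTSTP reports IBCS2_STOPCODE(IBCS2_SIGTSTP) = (24 << 8) | 0177, a
 * child terminated by SIGTERM reports plain IBCS2_SIGTERM (15), and a
 * normal exit status is passed through unchanged.
 */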
int
ibcs2_execv(td, uap)
struct thread *td;
struct ibcs2_execv_args *uap;
{
struct image_args eargs;
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = exec_copyin_args(&eargs, path, UIO_SYSSPACE, uap->argp, NULL);
free(path, M_TEMP);
if (error == 0)
error = kern_execve(td, &eargs, NULL);
return (error);
}
int
ibcs2_execve(td, uap)
struct thread *td;
struct ibcs2_execve_args *uap;
{
struct image_args eargs;
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = exec_copyin_args(&eargs, path, UIO_SYSSPACE, uap->argp,
uap->envp);
free(path, M_TEMP);
if (error == 0)
error = kern_execve(td, &eargs, NULL);
return (error);
}
int
ibcs2_umount(td, uap)
struct thread *td;
struct ibcs2_umount_args *uap;
{
struct unmount_args um;
um.path = uap->name;
um.flags = 0;
- return unmount(td, &um);
+ return sys_unmount(td, &um);
}
int
ibcs2_mount(td, uap)
struct thread *td;
struct ibcs2_mount_args *uap;
{
#ifdef notyet
int oflags = uap->flags, nflags, error;
char fsname[MFSNAMELEN];
if (oflags & (IBCS2_MS_NOSUB | IBCS2_MS_SYS5))
return (EINVAL);
if ((oflags & IBCS2_MS_NEWTYPE) == 0)
return (EINVAL);
nflags = 0;
if (oflags & IBCS2_MS_RDONLY)
nflags |= MNT_RDONLY;
if (oflags & IBCS2_MS_NOSUID)
nflags |= MNT_NOSUID;
if (oflags & IBCS2_MS_REMOUNT)
nflags |= MNT_UPDATE;
uap->flags = nflags;
if (error = copyinstr((caddr_t)uap->type, fsname, sizeof fsname,
(u_int *)0))
return (error);
if (strcmp(fsname, "4.2") == 0) {
uap->type = (caddr_t)STACK_ALLOC();
if (error = copyout("ufs", uap->type, sizeof("ufs")))
return (error);
} else if (strcmp(fsname, "nfs") == 0) {
struct ibcs2_nfs_args sna;
struct sockaddr_in sain;
struct nfs_args na;
struct sockaddr sa;
if (error = copyin(uap->data, &sna, sizeof sna))
return (error);
if (error = copyin(sna.addr, &sain, sizeof sain))
return (error);
bcopy(&sain, &sa, sizeof sa);
sa.sa_len = sizeof(sain);
uap->data = (caddr_t)STACK_ALLOC();
na.addr = (struct sockaddr *)((int)uap->data + sizeof na);
na.sotype = SOCK_DGRAM;
na.proto = IPPROTO_UDP;
na.fh = (nfsv2fh_t *)sna.fh;
na.flags = sna.flags;
na.wsize = sna.wsize;
na.rsize = sna.rsize;
na.timeo = sna.timeo;
na.retrans = sna.retrans;
na.hostname = sna.hostname;
if (error = copyout(&sa, na.addr, sizeof sa))
return (error);
if (error = copyout(&na, uap->data, sizeof na))
return (error);
}
return (mount(td, uap));
#else
return EINVAL;
#endif
}
/*
* Read iBCS2-style directory entries. We suck them into kernel space so
* that they can be massaged before being copied out to user code. Like
* SunOS, we squish out `empty' entries.
*
* This is quite ugly, but what do you expect from compatibility code?
*/
int
ibcs2_getdents(td, uap)
struct thread *td;
register struct ibcs2_getdents_args *uap;
{
register struct vnode *vp;
register caddr_t inp, buf; /* BSD-format */
register int len, reclen; /* BSD-format */
register caddr_t outp; /* iBCS2-format */
register int resid; /* iBCS2-format */
struct file *fp;
struct uio auio;
struct iovec aiov;
struct ibcs2_dirent idb;
off_t off; /* true file offset */
int buflen, error, eofflag, vfslocked;
u_long *cookies = NULL, *cookiep;
int ncookies;
#define BSD_DIRENT(cp) ((struct dirent *)(cp))
#define IBCS2_RECLEN(reclen) (reclen + sizeof(u_short))
if ((error = getvnode(td->td_proc->p_fd, uap->fd,
CAP_READ | CAP_SEEK, &fp)) != 0)
return (error);
if ((fp->f_flag & FREAD) == 0) {
fdrop(fp, td);
return (EBADF);
}
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
if (vp->v_type != VDIR) { /* XXX vnode readdir op should do this */
VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
return (EINVAL);
}
off = fp->f_offset;
#define DIRBLKSIZ 512 /* XXX we used to use ufs's DIRBLKSIZ */
buflen = max(DIRBLKSIZ, uap->nbytes);
buflen = min(buflen, MAXBSIZE);
buf = malloc(buflen, M_TEMP, M_WAITOK);
vn_lock(vp, LK_SHARED | LK_RETRY);
again:
aiov.iov_base = buf;
aiov.iov_len = buflen;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_rw = UIO_READ;
auio.uio_segflg = UIO_SYSSPACE;
auio.uio_td = td;
auio.uio_resid = buflen;
auio.uio_offset = off;
if (cookies) {
free(cookies, M_TEMP);
cookies = NULL;
}
#ifdef MAC
error = mac_vnode_check_readdir(td->td_ucred, vp);
if (error)
goto out;
#endif
/*
* First we read into the malloc'ed buffer, then
* we massage it into user space, one record at a time.
*/
if ((error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, &ncookies, &cookies)) != 0)
goto out;
inp = buf;
outp = uap->buf;
resid = uap->nbytes;
if ((len = buflen - auio.uio_resid) <= 0)
goto eof;
cookiep = cookies;
if (cookies) {
/*
* When using cookies, the vfs has the option of reading from
* a different offset than that supplied (UFS truncates the
* offset to a block boundary to make sure that it never reads
* partway through a directory entry, even if the directory
* has been compacted).
*/
while (len > 0 && ncookies > 0 && *cookiep <= off) {
len -= BSD_DIRENT(inp)->d_reclen;
inp += BSD_DIRENT(inp)->d_reclen;
cookiep++;
ncookies--;
}
}
for (; len > 0; len -= reclen) {
if (cookiep && ncookies == 0)
break;
reclen = BSD_DIRENT(inp)->d_reclen;
if (reclen & 3) {
printf("ibcs2_getdents: reclen=%d\n", reclen);
error = EFAULT;
goto out;
}
if (BSD_DIRENT(inp)->d_fileno == 0) {
inp += reclen; /* it is a hole; squish it out */
if (cookiep) {
off = *cookiep++;
ncookies--;
} else
off += reclen;
continue;
}
if (reclen > len || resid < IBCS2_RECLEN(reclen)) {
/* entry too big for buffer, so just stop */
outp++;
break;
}
/*
* Massage in place to make an iBCS2-shaped dirent (otherwise
* we have to worry about touching user memory outside of
* the copyout() call).
*/
idb.d_ino = (ibcs2_ino_t)BSD_DIRENT(inp)->d_fileno;
idb.d_off = (ibcs2_off_t)off;
idb.d_reclen = (u_short)IBCS2_RECLEN(reclen);
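/*
 * The 10 bytes copied out below appear to be the fixed ibcs2_dirent
 * header -- d_ino (4), d_off (4) and d_reclen (2) on i386 -- with the
 * NUL-terminated name copied separately at offset 10.
 */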
if ((error = copyout((caddr_t)&idb, outp, 10)) != 0 ||
(error = copyout(BSD_DIRENT(inp)->d_name, outp + 10,
BSD_DIRENT(inp)->d_namlen + 1)) != 0)
goto out;
/* advance past this real entry */
if (cookiep) {
off = *cookiep++;
ncookies--;
} else
off += reclen;
inp += reclen;
/* advance output past iBCS2-shaped entry */
outp += IBCS2_RECLEN(reclen);
resid -= IBCS2_RECLEN(reclen);
}
/* if we squished out the whole block, try again */
if (outp == uap->buf)
goto again;
fp->f_offset = off; /* update the vnode offset */
eof:
td->td_retval[0] = uap->nbytes - resid;
out:
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
if (cookies)
free(cookies, M_TEMP);
free(buf, M_TEMP);
return (error);
}
int
ibcs2_read(td, uap)
struct thread *td;
struct ibcs2_read_args *uap;
{
register struct vnode *vp;
register caddr_t inp, buf; /* BSD-format */
register int len, reclen; /* BSD-format */
register caddr_t outp; /* iBCS2-format */
register int resid; /* iBCS2-format */
struct file *fp;
struct uio auio;
struct iovec aiov;
struct ibcs2_direct {
ibcs2_ino_t ino;
char name[14];
} idb;
off_t off; /* true file offset */
int buflen, error, eofflag, size, vfslocked;
u_long *cookies = NULL, *cookiep;
int ncookies;
if ((error = getvnode(td->td_proc->p_fd, uap->fd,
CAP_READ | CAP_SEEK, &fp)) != 0) {
if (error == EINVAL)
- return read(td, (struct read_args *)uap);
+ return sys_read(td, (struct read_args *)uap);
else
return error;
}
if ((fp->f_flag & FREAD) == 0) {
fdrop(fp, td);
return (EBADF);
}
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
if (vp->v_type != VDIR) {
VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
- return read(td, (struct read_args *)uap);
+ return sys_read(td, (struct read_args *)uap);
}
off = fp->f_offset;
DPRINTF(("ibcs2_read: read directory\n"));
buflen = max(DIRBLKSIZ, uap->nbytes);
buflen = min(buflen, MAXBSIZE);
buf = malloc(buflen, M_TEMP, M_WAITOK);
vn_lock(vp, LK_SHARED | LK_RETRY);
again:
aiov.iov_base = buf;
aiov.iov_len = buflen;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_rw = UIO_READ;
auio.uio_segflg = UIO_SYSSPACE;
auio.uio_td = td;
auio.uio_resid = buflen;
auio.uio_offset = off;
if (cookies) {
free(cookies, M_TEMP);
cookies = NULL;
}
#ifdef MAC
error = mac_vnode_check_readdir(td->td_ucred, vp);
if (error)
goto out;
#endif
/*
* First we read into the malloc'ed buffer, then
* we massage it into user space, one record at a time.
*/
if ((error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, &ncookies, &cookies)) != 0) {
DPRINTF(("VOP_READDIR failed: %d\n", error));
goto out;
}
inp = buf;
outp = uap->buf;
resid = uap->nbytes;
if ((len = buflen - auio.uio_resid) <= 0)
goto eof;
cookiep = cookies;
if (cookies) {
/*
* When using cookies, the vfs has the option of reading from
* a different offset than that supplied (UFS truncates the
* offset to a block boundary to make sure that it never reads
* partway through a directory entry, even if the directory
* has been compacted).
*/
while (len > 0 && ncookies > 0 && *cookiep <= off) {
len -= BSD_DIRENT(inp)->d_reclen;
inp += BSD_DIRENT(inp)->d_reclen;
cookiep++;
ncookies--;
}
}
for (; len > 0 && resid > 0; len -= reclen) {
if (cookiep && ncookies == 0)
break;
reclen = BSD_DIRENT(inp)->d_reclen;
if (reclen & 3) {
printf("ibcs2_read: reclen=%d\n", reclen);
error = EFAULT;
goto out;
}
if (BSD_DIRENT(inp)->d_fileno == 0) {
inp += reclen; /* it is a hole; squish it out */
if (cookiep) {
off = *cookiep++;
ncookies--;
} else
off += reclen;
continue;
}
if (reclen > len || resid < sizeof(struct ibcs2_direct)) {
/* entry too big for buffer, so just stop */
outp++;
break;
}
/*
* Massage in place to make an iBCS2-shaped dirent (otherwise
* we have to worry about touching user memory outside of
* the copyout() call).
*
* TODO: if length(filename) > 14, then break filename into
* multiple entries and set inode = 0xffff except last
*/
idb.ino = (BSD_DIRENT(inp)->d_fileno > 0xfffe) ? 0xfffe :
BSD_DIRENT(inp)->d_fileno;
(void)copystr(BSD_DIRENT(inp)->d_name, idb.name, 14, &size);
bzero(idb.name + size, 14 - size);
if ((error = copyout(&idb, outp, sizeof(struct ibcs2_direct))) != 0)
goto out;
/* advance past this real entry */
if (cookiep) {
off = *cookiep++;
ncookies--;
} else
off += reclen;
inp += reclen;
/* advance output past iBCS2-shaped entry */
outp += sizeof(struct ibcs2_direct);
resid -= sizeof(struct ibcs2_direct);
}
/* if we squished out the whole block, try again */
if (outp == uap->buf)
goto again;
fp->f_offset = off; /* update the vnode offset */
eof:
td->td_retval[0] = uap->nbytes - resid;
out:
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
if (cookies)
free(cookies, M_TEMP);
free(buf, M_TEMP);
return (error);
}
int
ibcs2_mknod(td, uap)
struct thread *td;
struct ibcs2_mknod_args *uap;
{
char *path;
int error;
CHECKALTCREAT(td, uap->path, &path);
if (S_ISFIFO(uap->mode))
error = kern_mkfifo(td, path, UIO_SYSSPACE, uap->mode);
else
error = kern_mknod(td, path, UIO_SYSSPACE, uap->mode, uap->dev);
free(path, M_TEMP);
return (error);
}
int
ibcs2_getgroups(td, uap)
struct thread *td;
struct ibcs2_getgroups_args *uap;
{
ibcs2_gid_t *iset;
gid_t *gp;
u_int i, ngrp;
int error;
if (uap->gidsetsize < td->td_ucred->cr_ngroups) {
if (uap->gidsetsize == 0)
ngrp = 0;
else
return (EINVAL);
} else
ngrp = td->td_ucred->cr_ngroups;
gp = malloc(ngrp * sizeof(*gp), M_TEMP, M_WAITOK);
error = kern_getgroups(td, &ngrp, gp);
if (error)
goto out;
if (uap->gidsetsize > 0) {
iset = malloc(ngrp * sizeof(*iset), M_TEMP, M_WAITOK);
for (i = 0; i < ngrp; i++)
iset[i] = (ibcs2_gid_t)gp[i];
error = copyout(iset, uap->gidset, ngrp * sizeof(ibcs2_gid_t));
free(iset, M_TEMP);
}
if (error == 0)
td->td_retval[0] = ngrp;
out:
free(gp, M_TEMP);
return (error);
}
int
ibcs2_setgroups(td, uap)
struct thread *td;
struct ibcs2_setgroups_args *uap;
{
ibcs2_gid_t *iset;
gid_t *gp;
int error, i;
if (uap->gidsetsize < 0 || uap->gidsetsize > ngroups_max + 1)
return (EINVAL);
if (uap->gidsetsize && uap->gidset == NULL)
return (EINVAL);
gp = malloc(uap->gidsetsize * sizeof(*gp), M_TEMP, M_WAITOK);
if (uap->gidsetsize) {
iset = malloc(uap->gidsetsize * sizeof(*iset), M_TEMP, M_WAITOK);
error = copyin(uap->gidset, iset, sizeof(ibcs2_gid_t) *
uap->gidsetsize);
if (error) {
free(iset, M_TEMP);
goto out;
}
for (i = 0; i < uap->gidsetsize; i++)
gp[i] = (gid_t)iset[i];
}
error = kern_setgroups(td, uap->gidsetsize, gp);
out:
free(gp, M_TEMP);
return (error);
}
int
ibcs2_setuid(td, uap)
struct thread *td;
struct ibcs2_setuid_args *uap;
{
struct setuid_args sa;
sa.uid = (uid_t)uap->uid;
- return setuid(td, &sa);
+ return sys_setuid(td, &sa);
}
int
ibcs2_setgid(td, uap)
struct thread *td;
struct ibcs2_setgid_args *uap;
{
struct setgid_args sa;
sa.gid = (gid_t)uap->gid;
- return setgid(td, &sa);
+ return sys_setgid(td, &sa);
}
int
ibcs2_time(td, uap)
struct thread *td;
struct ibcs2_time_args *uap;
{
struct timeval tv;
microtime(&tv);
td->td_retval[0] = tv.tv_sec;
if (uap->tp)
return copyout((caddr_t)&tv.tv_sec, (caddr_t)uap->tp,
sizeof(ibcs2_time_t));
else
return 0;
}
int
ibcs2_pathconf(td, uap)
struct thread *td;
struct ibcs2_pathconf_args *uap;
{
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
uap->name++; /* iBCS2 _PC_* defines are offset by one */
error = kern_pathconf(td, path, UIO_SYSSPACE, uap->name, FOLLOW);
free(path, M_TEMP);
return (error);
}
int
ibcs2_fpathconf(td, uap)
struct thread *td;
struct ibcs2_fpathconf_args *uap;
{
uap->name++; /* iBCS2 _PC_* defines are offset by one */
- return fpathconf(td, (struct fpathconf_args *)uap);
+ return sys_fpathconf(td, (struct fpathconf_args *)uap);
}
int
ibcs2_sysconf(td, uap)
struct thread *td;
struct ibcs2_sysconf_args *uap;
{
int mib[2], value, len, error;
struct proc *p;
p = td->td_proc;
switch(uap->name) {
case IBCS2_SC_ARG_MAX:
mib[1] = KERN_ARGMAX;
break;
case IBCS2_SC_CHILD_MAX:
PROC_LOCK(p);
td->td_retval[0] = lim_cur(td->td_proc, RLIMIT_NPROC);
PROC_UNLOCK(p);
return 0;
case IBCS2_SC_CLK_TCK:
td->td_retval[0] = hz;
return 0;
case IBCS2_SC_NGROUPS_MAX:
mib[1] = KERN_NGROUPS;
break;
case IBCS2_SC_OPEN_MAX:
PROC_LOCK(p);
td->td_retval[0] = lim_cur(td->td_proc, RLIMIT_NOFILE);
PROC_UNLOCK(p);
return 0;
case IBCS2_SC_JOB_CONTROL:
mib[1] = KERN_JOB_CONTROL;
break;
case IBCS2_SC_SAVED_IDS:
mib[1] = KERN_SAVED_IDS;
break;
case IBCS2_SC_VERSION:
mib[1] = KERN_POSIX1;
break;
case IBCS2_SC_PASS_MAX:
td->td_retval[0] = 128; /* XXX - should we create PASS_MAX ? */
return 0;
case IBCS2_SC_XOPEN_VERSION:
td->td_retval[0] = 2; /* XXX: What should that be? */
return 0;
default:
return EINVAL;
}
mib[0] = CTL_KERN;
len = sizeof(value);
error = kernel_sysctl(td, mib, 2, &value, &len, NULL, 0, NULL, 0);
if (error)
return error;
td->td_retval[0] = value;
return 0;
}
int
ibcs2_alarm(td, uap)
struct thread *td;
struct ibcs2_alarm_args *uap;
{
struct itimerval itv, oitv;
int error;
timevalclear(&itv.it_interval);
itv.it_value.tv_sec = uap->sec;
itv.it_value.tv_usec = 0;
error = kern_setitimer(td, ITIMER_REAL, &itv, &oitv);
if (error)
return (error);
if (oitv.it_value.tv_usec != 0)
oitv.it_value.tv_sec++;
td->td_retval[0] = oitv.it_value.tv_sec;
return (0);
}
int
ibcs2_times(td, uap)
struct thread *td;
struct ibcs2_times_args *uap;
{
struct rusage ru;
struct timeval t;
struct tms tms;
int error;
#define CONVTCK(r) (r.tv_sec * hz + r.tv_usec / (1000000 / hz))
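/* Example: with hz = 100, 1 s 500000 us yields 1*100 + 500000/10000 = 150 ticks. */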
error = kern_getrusage(td, RUSAGE_SELF, &ru);
if (error)
return (error);
tms.tms_utime = CONVTCK(ru.ru_utime);
tms.tms_stime = CONVTCK(ru.ru_stime);
error = kern_getrusage(td, RUSAGE_CHILDREN, &ru);
if (error)
return (error);
tms.tms_cutime = CONVTCK(ru.ru_utime);
tms.tms_cstime = CONVTCK(ru.ru_stime);
microtime(&t);
td->td_retval[0] = CONVTCK(t);
return (copyout(&tms, uap->tp, sizeof(struct tms)));
}
int
ibcs2_stime(td, uap)
struct thread *td;
struct ibcs2_stime_args *uap;
{
struct timeval tv;
long secs;
int error;
error = copyin(uap->timep, &secs, sizeof(long));
if (error)
return (error);
tv.tv_sec = secs;
tv.tv_usec = 0;
error = kern_settimeofday(td, &tv, NULL);
if (error)
error = EPERM;
return (error);
}
int
ibcs2_utime(td, uap)
struct thread *td;
struct ibcs2_utime_args *uap;
{
struct ibcs2_utimbuf ubuf;
struct timeval tbuf[2], *tp;
char *path;
int error;
if (uap->buf) {
error = copyin(uap->buf, &ubuf, sizeof(ubuf));
if (error)
return (error);
tbuf[0].tv_sec = ubuf.actime;
tbuf[0].tv_usec = 0;
tbuf[1].tv_sec = ubuf.modtime;
tbuf[1].tv_usec = 0;
tp = tbuf;
} else
tp = NULL;
CHECKALTEXIST(td, uap->path, &path);
error = kern_utimes(td, path, UIO_SYSSPACE, tp, UIO_SYSSPACE);
free(path, M_TEMP);
return (error);
}
int
ibcs2_nice(td, uap)
struct thread *td;
struct ibcs2_nice_args *uap;
{
int error;
struct setpriority_args sa;
sa.which = PRIO_PROCESS;
sa.who = 0;
sa.prio = td->td_proc->p_nice + uap->incr;
- if ((error = setpriority(td, &sa)) != 0)
+ if ((error = sys_setpriority(td, &sa)) != 0)
return EPERM;
td->td_retval[0] = td->td_proc->p_nice;
return 0;
}
/*
* iBCS2 getpgrp, setpgrp, setsid, and setpgid
*/
int
ibcs2_pgrpsys(td, uap)
struct thread *td;
struct ibcs2_pgrpsys_args *uap;
{
struct proc *p = td->td_proc;
switch (uap->type) {
case 0: /* getpgrp */
PROC_LOCK(p);
td->td_retval[0] = p->p_pgrp->pg_id;
PROC_UNLOCK(p);
return 0;
case 1: /* setpgrp */
{
struct setpgid_args sa;
sa.pid = 0;
sa.pgid = 0;
- setpgid(td, &sa);
+ sys_setpgid(td, &sa);
PROC_LOCK(p);
td->td_retval[0] = p->p_pgrp->pg_id;
PROC_UNLOCK(p);
return 0;
}
case 2: /* setpgid */
{
struct setpgid_args sa;
sa.pid = uap->pid;
sa.pgid = uap->pgid;
- return setpgid(td, &sa);
+ return sys_setpgid(td, &sa);
}
case 3: /* setsid */
- return setsid(td, NULL);
+ return sys_setsid(td, NULL);
default:
return EINVAL;
}
}
/*
* XXX - need to check for nested calls
*/
int
ibcs2_plock(td, uap)
struct thread *td;
struct ibcs2_plock_args *uap;
{
int error;
#define IBCS2_UNLOCK 0
#define IBCS2_PROCLOCK 1
#define IBCS2_TEXTLOCK 2
#define IBCS2_DATALOCK 4
switch(uap->cmd) {
case IBCS2_UNLOCK:
error = priv_check(td, PRIV_VM_MUNLOCK);
if (error)
return (error);
/* XXX - TODO */
return (0);
case IBCS2_PROCLOCK:
case IBCS2_TEXTLOCK:
case IBCS2_DATALOCK:
error = priv_check(td, PRIV_VM_MLOCK);
if (error)
return (error);
/* XXX - TODO */
return 0;
}
return EINVAL;
}
int
ibcs2_uadmin(td, uap)
struct thread *td;
struct ibcs2_uadmin_args *uap;
{
#define SCO_A_REBOOT 1
#define SCO_A_SHUTDOWN 2
#define SCO_A_REMOUNT 4
#define SCO_A_CLOCK 8
#define SCO_A_SETCONFIG 128
#define SCO_A_GETDEV 130
#define SCO_AD_HALT 0
#define SCO_AD_BOOT 1
#define SCO_AD_IBOOT 2
#define SCO_AD_PWRDOWN 3
#define SCO_AD_PWRNAP 4
#define SCO_AD_PANICBOOT 1
#define SCO_AD_GETBMAJ 0
#define SCO_AD_GETCMAJ 1
switch(uap->cmd) {
case SCO_A_REBOOT:
case SCO_A_SHUTDOWN:
switch(uap->func) {
struct reboot_args r;
case SCO_AD_HALT:
case SCO_AD_PWRDOWN:
case SCO_AD_PWRNAP:
r.opt = RB_HALT;
- return (reboot(td, &r));
+ return (sys_reboot(td, &r));
case SCO_AD_BOOT:
case SCO_AD_IBOOT:
r.opt = RB_AUTOBOOT;
- return (reboot(td, &r));
+ return (sys_reboot(td, &r));
}
return EINVAL;
case SCO_A_REMOUNT:
case SCO_A_CLOCK:
case SCO_A_SETCONFIG:
return 0;
case SCO_A_GETDEV:
return EINVAL; /* XXX - TODO */
}
return EINVAL;
}
int
ibcs2_sysfs(td, uap)
struct thread *td;
struct ibcs2_sysfs_args *uap;
{
#define IBCS2_GETFSIND 1
#define IBCS2_GETFSTYP 2
#define IBCS2_GETNFSTYP 3
switch(uap->cmd) {
case IBCS2_GETFSIND:
case IBCS2_GETFSTYP:
case IBCS2_GETNFSTYP:
break;
}
return EINVAL; /* XXX - TODO */
}
int
ibcs2_unlink(td, uap)
struct thread *td;
struct ibcs2_unlink_args *uap;
{
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = kern_unlink(td, path, UIO_SYSSPACE);
free(path, M_TEMP);
return (error);
}
int
ibcs2_chdir(td, uap)
struct thread *td;
struct ibcs2_chdir_args *uap;
{
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = kern_chdir(td, path, UIO_SYSSPACE);
free(path, M_TEMP);
return (error);
}
int
ibcs2_chmod(td, uap)
struct thread *td;
struct ibcs2_chmod_args *uap;
{
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = kern_chmod(td, path, UIO_SYSSPACE, uap->mode);
free(path, M_TEMP);
return (error);
}
int
ibcs2_chown(td, uap)
struct thread *td;
struct ibcs2_chown_args *uap;
{
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = kern_chown(td, path, UIO_SYSSPACE, uap->uid, uap->gid);
free(path, M_TEMP);
return (error);
}
int
ibcs2_rmdir(td, uap)
struct thread *td;
struct ibcs2_rmdir_args *uap;
{
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = kern_rmdir(td, path, UIO_SYSSPACE);
free(path, M_TEMP);
return (error);
}
int
ibcs2_mkdir(td, uap)
struct thread *td;
struct ibcs2_mkdir_args *uap;
{
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = kern_mkdir(td, path, UIO_SYSSPACE, uap->mode);
free(path, M_TEMP);
return (error);
}
int
ibcs2_symlink(td, uap)
struct thread *td;
struct ibcs2_symlink_args *uap;
{
char *path, *link;
int error;
CHECKALTEXIST(td, uap->path, &path);
/*
* Have to expand CHECKALTCREAT() so that 'path' can be freed on
* errors.
*/
error = ibcs2_emul_find(td, uap->link, UIO_USERSPACE, &link, 1);
if (link == NULL) {
free(path, M_TEMP);
return (error);
}
error = kern_symlink(td, path, link, UIO_SYSSPACE);
free(path, M_TEMP);
free(link, M_TEMP);
return (error);
}
int
ibcs2_rename(td, uap)
struct thread *td;
struct ibcs2_rename_args *uap;
{
char *from, *to;
int error;
CHECKALTEXIST(td, uap->from, &from);
/*
* Have to expand CHECKALTCREAT() so that 'from' can be freed on
* errors.
*/
error = ibcs2_emul_find(td, uap->to, UIO_USERSPACE, &to, 1);
if (to == NULL) {
free(from, M_TEMP);
return (error);
}
error = kern_rename(td, from, to, UIO_SYSSPACE);
free(from, M_TEMP);
free(to, M_TEMP);
return (error);
}
int
ibcs2_readlink(td, uap)
struct thread *td;
struct ibcs2_readlink_args *uap;
{
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = kern_readlink(td, path, UIO_SYSSPACE, uap->buf, UIO_USERSPACE,
uap->count);
free(path, M_TEMP);
return (error);
}
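/*
 * Note on the pattern used throughout this file (an observation, not part
 * of this change): CHECKALTEXIST()/CHECKALTCREAT() translate the user
 * path through the emulation root and return from the calling function on
 * failure, so the usual shape is
 *
 *	CHECKALTEXIST(td, uap->path, &path);
 *	error = kern_xxx(td, path, UIO_SYSSPACE, ...);	// kern_xxx is a placeholder
 *	free(path, M_TEMP);
 *	return (error);
 *
 * ibcs2_symlink() and ibcs2_rename() open-code the second lookup via
 * ibcs2_emul_find() precisely because the macro's early return would
 * otherwise leak the first translated path.
 */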
Index: head/sys/i386/ibcs2/ibcs2_other.c
===================================================================
--- head/sys/i386/ibcs2/ibcs2_other.c (revision 225616)
+++ head/sys/i386/ibcs2/ibcs2_other.c (revision 225617)
@@ -1,118 +1,118 @@
/*-
* Copyright (c) 1995 Steven Wallace
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
* IBCS2 compatibility module.
*/
#include "opt_spx_hack.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>
#include <sys/un.h>
#include <i386/ibcs2/ibcs2_types.h>
#include <i386/ibcs2/ibcs2_signal.h>
#include <i386/ibcs2/ibcs2_util.h>
#include <i386/ibcs2/ibcs2_proto.h>
#define IBCS2_SECURE_GETLUID 1
#define IBCS2_SECURE_SETLUID 2
int
ibcs2_secure(struct thread *td, struct ibcs2_secure_args *uap)
{
switch (uap->cmd) {
case IBCS2_SECURE_GETLUID: /* get login uid */
td->td_retval[0] = td->td_ucred->cr_uid;
return 0;
case IBCS2_SECURE_SETLUID: /* set login uid */
return EPERM;
default:
printf("IBCS2: 'secure' cmd=%d not implemented\n", uap->cmd);
}
return EINVAL;
}
int
ibcs2_lseek(struct thread *td, register struct ibcs2_lseek_args *uap)
{
struct lseek_args largs;
int error;
largs.fd = uap->fd;
largs.offset = uap->offset;
largs.whence = uap->whence;
- error = lseek(td, &largs);
+ error = sys_lseek(td, &largs);
return (error);
}
#ifdef SPX_HACK
#include <sys/socket.h>
#include <sys/un.h>
int
spx_open(struct thread *td)
{
struct socket_args sock;
struct sockaddr_un sun;
int fd, error;
/* obtain a socket. */
DPRINTF(("SPX: open socket\n"));
sock.domain = AF_UNIX;
sock.type = SOCK_STREAM;
sock.protocol = 0;
- error = socket(td, &sock);
+ error = sys_socket(td, &sock);
if (error)
return error;
fd = td->td_retval[0];
/* connect the socket to standard X socket */
DPRINTF(("SPX: connect to /tmp/X11-unix/X0\n"));
sun.sun_family = AF_UNIX;
strcpy(sun.sun_path, "/tmp/.X11-unix/X0");
sun.sun_len = sizeof(struct sockaddr_un) - sizeof(sun.sun_path) +
strlen(sun.sun_path) + 1;
error = kern_connect(td, fd, (struct sockaddr *)&sun);
if (error) {
kern_close(td, fd);
return error;
}
td->td_retval[0] = fd;
return 0;
}
#endif /* SPX_HACK */
Index: head/sys/i386/ibcs2/ibcs2_signal.c
===================================================================
--- head/sys/i386/ibcs2/ibcs2_signal.c (revision 225616)
+++ head/sys/i386/ibcs2/ibcs2_signal.c (revision 225617)
@@ -1,441 +1,441 @@
/*-
* Copyright (c) 1995 Scott Bartram
* Copyright (c) 1995 Steven Wallace
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>
#include <i386/ibcs2/ibcs2_types.h>
#include <i386/ibcs2/ibcs2_signal.h>
#include <i386/ibcs2/ibcs2_proto.h>
#include <i386/ibcs2/ibcs2_xenix.h>
#include <i386/ibcs2/ibcs2_util.h>
#define sigemptyset(s) SIGEMPTYSET(*(s))
#define sigismember(s, n) SIGISMEMBER(*(s), n)
#define sigaddset(s, n) SIGADDSET(*(s), n)
#define ibcs2_sigmask(n) (1 << ((n) - 1))
#define ibcs2_sigemptyset(s) bzero((s), sizeof(*(s)))
#define ibcs2_sigismember(s, n) (*(s) & ibcs2_sigmask(n))
#define ibcs2_sigaddset(s, n) (*(s) |= ibcs2_sigmask(n))
static void ibcs2_to_bsd_sigset(const ibcs2_sigset_t *, sigset_t *);
static void bsd_to_ibcs2_sigset(const sigset_t *, ibcs2_sigset_t *);
static void ibcs2_to_bsd_sigaction(struct ibcs2_sigaction *,
struct sigaction *);
static void bsd_to_ibcs2_sigaction(struct sigaction *,
struct ibcs2_sigaction *);
int bsd_to_ibcs2_sig[IBCS2_SIGTBLSZ] = {
IBCS2_SIGHUP, /* 1 */
IBCS2_SIGINT, /* 2 */
IBCS2_SIGQUIT, /* 3 */
IBCS2_SIGILL, /* 4 */
IBCS2_SIGTRAP, /* 5 */
IBCS2_SIGABRT, /* 6 */
IBCS2_SIGEMT, /* 7 */
IBCS2_SIGFPE, /* 8 */
IBCS2_SIGKILL, /* 9 */
IBCS2_SIGBUS, /* 10 */
IBCS2_SIGSEGV, /* 11 */
IBCS2_SIGSYS, /* 12 */
IBCS2_SIGPIPE, /* 13 */
IBCS2_SIGALRM, /* 14 */
IBCS2_SIGTERM, /* 15 */
0, /* 16 - SIGURG */
IBCS2_SIGSTOP, /* 17 */
IBCS2_SIGTSTP, /* 18 */
IBCS2_SIGCONT, /* 19 */
IBCS2_SIGCLD, /* 20 */
IBCS2_SIGTTIN, /* 21 */
IBCS2_SIGTTOU, /* 22 */
IBCS2_SIGPOLL, /* 23 */
0, /* 24 - SIGXCPU */
0, /* 25 - SIGXFSZ */
IBCS2_SIGVTALRM, /* 26 */
IBCS2_SIGPROF, /* 27 */
IBCS2_SIGWINCH, /* 28 */
0, /* 29 */
IBCS2_SIGUSR1, /* 30 */
IBCS2_SIGUSR2, /* 31 */
0 /* 32 */
};
static int ibcs2_to_bsd_sig[IBCS2_SIGTBLSZ] = {
SIGHUP, /* 1 */
SIGINT, /* 2 */
SIGQUIT, /* 3 */
SIGILL, /* 4 */
SIGTRAP, /* 5 */
SIGABRT, /* 6 */
SIGEMT, /* 7 */
SIGFPE, /* 8 */
SIGKILL, /* 9 */
SIGBUS, /* 10 */
SIGSEGV, /* 11 */
SIGSYS, /* 12 */
SIGPIPE, /* 13 */
SIGALRM, /* 14 */
SIGTERM, /* 15 */
SIGUSR1, /* 16 */
SIGUSR2, /* 17 */
SIGCHLD, /* 18 */
0, /* 19 - SIGPWR */
SIGWINCH, /* 20 */
0, /* 21 */
SIGIO, /* 22 */
SIGSTOP, /* 23 */
SIGTSTP, /* 24 */
SIGCONT, /* 25 */
SIGTTIN, /* 26 */
SIGTTOU, /* 27 */
SIGVTALRM, /* 28 */
SIGPROF, /* 29 */
0, /* 30 */
0, /* 31 */
0 /* 32 */
};
void
ibcs2_to_bsd_sigset(iss, bss)
const ibcs2_sigset_t *iss;
sigset_t *bss;
{
int i, newsig;
sigemptyset(bss);
for (i = 1; i <= IBCS2_SIGTBLSZ; i++) {
if (ibcs2_sigismember(iss, i)) {
newsig = ibcs2_to_bsd_sig[_SIG_IDX(i)];
if (newsig)
sigaddset(bss, newsig);
}
}
}
static void
bsd_to_ibcs2_sigset(bss, iss)
const sigset_t *bss;
ibcs2_sigset_t *iss;
{
int i, newsig;
ibcs2_sigemptyset(iss);
for (i = 1; i <= IBCS2_SIGTBLSZ; i++) {
if (sigismember(bss, i)) {
newsig = bsd_to_ibcs2_sig[_SIG_IDX(i)];
if (newsig)
ibcs2_sigaddset(iss, newsig);
}
}
}
static void
ibcs2_to_bsd_sigaction(isa, bsa)
struct ibcs2_sigaction *isa;
struct sigaction *bsa;
{
bsa->sa_handler = isa->isa_handler;
ibcs2_to_bsd_sigset(&isa->isa_mask, &bsa->sa_mask);
bsa->sa_flags = 0; /* ??? SA_NODEFER */
if ((isa->isa_flags & IBCS2_SA_NOCLDSTOP) != 0)
bsa->sa_flags |= SA_NOCLDSTOP;
}
static void
bsd_to_ibcs2_sigaction(bsa, isa)
struct sigaction *bsa;
struct ibcs2_sigaction *isa;
{
isa->isa_handler = bsa->sa_handler;
bsd_to_ibcs2_sigset(&bsa->sa_mask, &isa->isa_mask);
isa->isa_flags = 0;
if ((bsa->sa_flags & SA_NOCLDSTOP) != 0)
isa->isa_flags |= IBCS2_SA_NOCLDSTOP;
}
int
ibcs2_sigaction(td, uap)
register struct thread *td;
struct ibcs2_sigaction_args *uap;
{
struct ibcs2_sigaction isa;
struct sigaction nbsa, obsa;
struct sigaction *nbsap;
int error;
if (uap->act != NULL) {
if ((error = copyin(uap->act, &isa, sizeof(isa))) != 0)
return (error);
ibcs2_to_bsd_sigaction(&isa, &nbsa);
nbsap = &nbsa;
} else
nbsap = NULL;
if (uap->sig <= 0 || uap->sig > IBCS2_NSIG)
return (EINVAL);
error = kern_sigaction(td, ibcs2_to_bsd_sig[_SIG_IDX(uap->sig)], &nbsa,
&obsa, 0);
if (error == 0 && uap->oact != NULL) {
bsd_to_ibcs2_sigaction(&obsa, &isa);
error = copyout(&isa, uap->oact, sizeof(isa));
}
return (error);
}
int
ibcs2_sigsys(td, uap)
register struct thread *td;
struct ibcs2_sigsys_args *uap;
{
struct proc *p = td->td_proc;
struct sigaction sa;
int signum = IBCS2_SIGNO(uap->sig);
int error;
if (signum <= 0 || signum > IBCS2_NSIG) {
if (IBCS2_SIGCALL(uap->sig) == IBCS2_SIGNAL_MASK ||
IBCS2_SIGCALL(uap->sig) == IBCS2_SIGSET_MASK)
td->td_retval[0] = (int)IBCS2_SIG_ERR;
return EINVAL;
}
signum = ibcs2_to_bsd_sig[_SIG_IDX(signum)];
switch (IBCS2_SIGCALL(uap->sig)) {
case IBCS2_SIGSET_MASK:
/*
* Check for SIG_HOLD action.
* Otherwise, perform signal() except with different sa_flags.
*/
if (uap->fp != IBCS2_SIG_HOLD) {
/* add sig to mask before executing signal handler */
sa.sa_flags = 0;
goto ibcs2_sigset;
}
/* else FALLTHROUGH to sighold */
case IBCS2_SIGHOLD_MASK:
{
sigset_t mask;
SIGEMPTYSET(mask);
SIGADDSET(mask, signum);
return (kern_sigprocmask(td, SIG_BLOCK, &mask, NULL,
0));
}
case IBCS2_SIGNAL_MASK:
{
struct sigaction osa;
/* do not automatically block signal */
sa.sa_flags = SA_NODEFER;
#ifdef SA_RESETHAND
if((signum != IBCS2_SIGILL) &&
(signum != IBCS2_SIGTRAP) &&
(signum != IBCS2_SIGPWR))
/* set to SIG_DFL before executing handler */
sa.sa_flags |= SA_RESETHAND;
#endif
ibcs2_sigset:
sa.sa_handler = uap->fp;
sigemptyset(&sa.sa_mask);
#if 0
if (signum != SIGALRM)
sa.sa_flags |= SA_RESTART;
#endif
error = kern_sigaction(td, signum, &sa, &osa, 0);
if (error != 0) {
DPRINTF(("signal: sigaction failed: %d\n",
error));
td->td_retval[0] = (int)IBCS2_SIG_ERR;
return (error);
}
td->td_retval[0] = (int)osa.sa_handler;
/* special sigset() check */
if(IBCS2_SIGCALL(uap->sig) == IBCS2_SIGSET_MASK) {
PROC_LOCK(p);
/* check to make sure signal is not blocked */
if(sigismember(&td->td_sigmask, signum)) {
/* return SIG_HOLD and unblock signal*/
td->td_retval[0] = (int)IBCS2_SIG_HOLD;
SIGDELSET(td->td_sigmask, signum);
signotify(td);
}
PROC_UNLOCK(p);
}
return 0;
}
case IBCS2_SIGRELSE_MASK:
{
sigset_t mask;
SIGEMPTYSET(mask);
SIGADDSET(mask, signum);
return (kern_sigprocmask(td, SIG_UNBLOCK, &mask, NULL,
0));
}
case IBCS2_SIGIGNORE_MASK:
{
sa.sa_handler = SIG_IGN;
sigemptyset(&sa.sa_mask);
sa.sa_flags = 0;
error = kern_sigaction(td, signum, &sa, NULL, 0);
if (error != 0)
DPRINTF(("sigignore: sigaction failed\n"));
return (error);
}
case IBCS2_SIGPAUSE_MASK:
{
sigset_t mask;
PROC_LOCK(p);
mask = td->td_sigmask;
PROC_UNLOCK(p);
SIGDELSET(mask, signum);
return kern_sigsuspend(td, mask);
}
default:
return ENOSYS;
}
}
int
ibcs2_sigprocmask(td, uap)
register struct thread *td;
struct ibcs2_sigprocmask_args *uap;
{
ibcs2_sigset_t iss;
sigset_t oss, nss;
sigset_t *nssp;
int error, how;
switch (uap->how) {
case IBCS2_SIG_BLOCK:
how = SIG_BLOCK;
break;
case IBCS2_SIG_UNBLOCK:
how = SIG_UNBLOCK;
break;
case IBCS2_SIG_SETMASK:
how = SIG_SETMASK;
break;
default:
return (EINVAL);
}
if (uap->set != NULL) {
if ((error = copyin(uap->set, &iss, sizeof(iss))) != 0)
return error;
ibcs2_to_bsd_sigset(&iss, &nss);
nssp = &nss;
} else
nssp = NULL;
error = kern_sigprocmask(td, how, nssp, &oss, 0);
if (error == 0 && uap->oset != NULL) {
bsd_to_ibcs2_sigset(&oss, &iss);
error = copyout(&iss, uap->oset, sizeof(iss));
}
return (error);
}
int
ibcs2_sigpending(td, uap)
register struct thread *td;
struct ibcs2_sigpending_args *uap;
{
struct proc *p = td->td_proc;
sigset_t bss;
ibcs2_sigset_t iss;
PROC_LOCK(p);
bss = td->td_siglist;
SIGSETOR(bss, p->p_siglist);
SIGSETAND(bss, td->td_sigmask);
PROC_UNLOCK(p);
bsd_to_ibcs2_sigset(&bss, &iss);
return copyout(&iss, uap->mask, sizeof(iss));
}
int
ibcs2_sigsuspend(td, uap)
register struct thread *td;
struct ibcs2_sigsuspend_args *uap;
{
ibcs2_sigset_t sss;
sigset_t bss;
int error;
if ((error = copyin(uap->mask, &sss, sizeof(sss))) != 0)
return error;
ibcs2_to_bsd_sigset(&sss, &bss);
return kern_sigsuspend(td, bss);
}
int
ibcs2_pause(td, uap)
register struct thread *td;
struct ibcs2_pause_args *uap;
{
sigset_t mask;
PROC_LOCK(td->td_proc);
mask = td->td_sigmask;
PROC_UNLOCK(td->td_proc);
return kern_sigsuspend(td, mask);
}
int
ibcs2_kill(td, uap)
register struct thread *td;
struct ibcs2_kill_args *uap;
{
struct kill_args ka;
if (uap->signo <= 0 || uap->signo > IBCS2_NSIG)
return (EINVAL);
ka.pid = uap->pid;
ka.signum = ibcs2_to_bsd_sig[_SIG_IDX(uap->signo)];
- return kill(td, &ka);
+ return sys_kill(td, &ka);
}
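/*
 * Example of the table translation above (values read from the
 * bsd_to_ibcs2_sig[] / ibcs2_to_bsd_sig[] arrays):
 *
 *	ka.signum = ibcs2_to_bsd_sig[_SIG_IDX(18)];	// IBCS2_SIGCLD -> SIGCHLD (20)
 *	bsd_to_ibcs2_sig[_SIG_IDX(SIGCHLD)];		// SIGCHLD (20) -> IBCS2_SIGCLD (18)
 *
 * Signals one side lacks (e.g. SIGURG, SIGXCPU) map to 0 and are dropped.
 */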
Index: head/sys/i386/ibcs2/ibcs2_socksys.c
===================================================================
--- head/sys/i386/ibcs2/ibcs2_socksys.c (revision 225616)
+++ head/sys/i386/ibcs2/ibcs2_socksys.c (revision 225617)
@@ -1,210 +1,210 @@
/*-
* Copyright (c) 1994, 1995 Scott Bartram
* Copyright (c) 1994 Arne H Juul
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <i386/ibcs2/ibcs2_socksys.h>
#include <i386/ibcs2/ibcs2_util.h>
/* Local structures */
struct getipdomainname_args {
char *ipdomainname;
int len;
};
struct setipdomainname_args {
char *ipdomainname;
int len;
};
/* Local prototypes */
static int ibcs2_getipdomainname(struct thread *,
struct getipdomainname_args *);
static int ibcs2_setipdomainname(struct thread *,
struct setipdomainname_args *);
/*
* iBCS2 socksys calls.
*/
int
ibcs2_socksys(td, uap)
register struct thread *td;
register struct ibcs2_socksys_args *uap;
{
int error;
int realargs[7]; /* 1 for command, 6 for recvfrom */
void *passargs;
/*
* SOCKET should only be legal on /dev/socksys.
* GETIPDOMAINNAME should only be legal on /dev/socksys ?
* The others are (and should be) only legal on sockets.
*/
if ((error = copyin(uap->argsp, (caddr_t)realargs, sizeof(realargs))) != 0)
return error;
DPRINTF(("ibcs2_socksys: %08x %08x %08x %08x %08x %08x %08x\n",
realargs[0], realargs[1], realargs[2], realargs[3],
realargs[4], realargs[5], realargs[6]));
passargs = (void *)(realargs + 1);
switch (realargs[0]) {
case SOCKSYS_ACCEPT:
- return accept(td, passargs);
+ return sys_accept(td, passargs);
case SOCKSYS_BIND:
- return bind(td, passargs);
+ return sys_bind(td, passargs);
case SOCKSYS_CONNECT:
- return connect(td, passargs);
+ return sys_connect(td, passargs);
case SOCKSYS_GETPEERNAME:
- return getpeername(td, passargs);
+ return sys_getpeername(td, passargs);
case SOCKSYS_GETSOCKNAME:
- return getsockname(td, passargs);
+ return sys_getsockname(td, passargs);
case SOCKSYS_GETSOCKOPT:
- return getsockopt(td, passargs);
+ return sys_getsockopt(td, passargs);
case SOCKSYS_LISTEN:
- return listen(td, passargs);
+ return sys_listen(td, passargs);
case SOCKSYS_RECV:
realargs[5] = realargs[6] = 0;
/* FALLTHROUGH */
case SOCKSYS_RECVFROM:
- return recvfrom(td, passargs);
+ return sys_recvfrom(td, passargs);
case SOCKSYS_SEND:
realargs[5] = realargs[6] = 0;
/* FALLTHROUGH */
case SOCKSYS_SENDTO:
- return sendto(td, passargs);
+ return sys_sendto(td, passargs);
case SOCKSYS_SETSOCKOPT:
- return setsockopt(td, passargs);
+ return sys_setsockopt(td, passargs);
case SOCKSYS_SHUTDOWN:
- return shutdown(td, passargs);
+ return sys_shutdown(td, passargs);
case SOCKSYS_SOCKET:
- return socket(td, passargs);
+ return sys_socket(td, passargs);
case SOCKSYS_SELECT:
- return select(td, passargs);
+ return sys_select(td, passargs);
case SOCKSYS_GETIPDOMAIN:
return ibcs2_getipdomainname(td, passargs);
case SOCKSYS_SETIPDOMAIN:
return ibcs2_setipdomainname(td, passargs);
case SOCKSYS_ADJTIME:
- return adjtime(td, passargs);
+ return sys_adjtime(td, passargs);
case SOCKSYS_SETREUID:
- return setreuid(td, passargs);
+ return sys_setreuid(td, passargs);
case SOCKSYS_SETREGID:
- return setregid(td, passargs);
+ return sys_setregid(td, passargs);
case SOCKSYS_GETTIME:
- return gettimeofday(td, passargs);
+ return sys_gettimeofday(td, passargs);
case SOCKSYS_SETTIME:
- return settimeofday(td, passargs);
+ return sys_settimeofday(td, passargs);
case SOCKSYS_GETITIMER:
- return getitimer(td, passargs);
+ return sys_getitimer(td, passargs);
case SOCKSYS_SETITIMER:
- return setitimer(td, passargs);
+ return sys_setitimer(td, passargs);
default:
printf("socksys unknown %08x %08x %08x %08x %08x %08x %08x\n",
realargs[0], realargs[1], realargs[2], realargs[3],
realargs[4], realargs[5], realargs[6]);
return EINVAL;
}
/* NOTREACHED */
}
/* ARGSUSED */
static int
ibcs2_getipdomainname(td, uap)
struct thread *td;
struct getipdomainname_args *uap;
{
char hname[MAXHOSTNAMELEN], *dptr;
int len;
/* Get the domain name. */
getcredhostname(td->td_ucred, hname, sizeof(hname));
dptr = index(hname, '.');
if ( dptr )
dptr++;
else
/* Make it effectively an empty string */
dptr = hname + strlen(hname);
len = strlen(dptr) + 1;
if ((u_int)uap->len > len + 1)
uap->len = len + 1;
return (copyout((caddr_t)dptr, (caddr_t)uap->ipdomainname, uap->len));
}
/* ARGSUSED */
static int
ibcs2_setipdomainname(td, uap)
struct thread *td;
struct setipdomainname_args *uap;
{
char hname[MAXHOSTNAMELEN], *ptr;
int error, sctl[2], hlen;
/* Get the domain name */
getcredhostname(td->td_ucred, hname, sizeof(hname));
/* W/out a hostname a domain-name is nonsense */
if ( strlen(hname) == 0 )
return EINVAL;
/* Get the host's unqualified name (strip off the domain) */
ptr = index(hname, '.');
if ( ptr != NULL ) {
ptr++;
*ptr = '\0';
} else {
if (strlcat(hname, ".", sizeof(hname)) >= sizeof(hname))
return (EINVAL);
}
/* Set ptr to the end of the string so we can append to it */
hlen = strlen(hname);
ptr = hname + hlen;
if ((u_int)uap->len > (sizeof (hname) - hlen - 1))
return EINVAL;
/* Append the ipdomain to the end */
error = copyinstr((caddr_t)uap->ipdomainname, ptr, uap->len, NULL);
if (error)
return (error);
/* 'sethostname' with the new information */
sctl[0] = CTL_KERN;
sctl[1] = KERN_HOSTNAME;
hlen = strlen(hname) + 1;
return (kernel_sysctl(td, sctl, 2, 0, 0, hname, hlen, 0, 0));
}
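/*
 * Illustrative sketch (an assumption, not part of this change): the
 * socksys dispatcher above works because the six words following the
 * command are laid out exactly like the native argument structure of the
 * forwarded syscall.  For SOCKSYS_SOCKET, for instance:
 *
 *	realargs[0] = SOCKSYS_SOCKET;
 *	realargs[1] = AF_INET;		// socket_args.domain
 *	realargs[2] = SOCK_STREAM;	// socket_args.type
 *	realargs[3] = 0;		// socket_args.protocol
 *	...
 *	return (sys_socket(td, (void *)(realargs + 1)));
 */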
Index: head/sys/i386/ibcs2/ibcs2_xenix.c
===================================================================
--- head/sys/i386/ibcs2/ibcs2_xenix.c (revision 225616)
+++ head/sys/i386/ibcs2/ibcs2_xenix.c (revision 225617)
@@ -1,215 +1,215 @@
/*-
* Copyright (c) 1994 Sean Eric Fagan
* Copyright (c) 1994 Søren Schmidt
* Copyright (c) 1995 Steven Wallace
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/sysproto.h>
#include <sys/clock.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/filio.h>
#include <sys/vnode.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/unistd.h>
#include <machine/cpu.h>
#include <i386/ibcs2/ibcs2_types.h>
#include <i386/ibcs2/ibcs2_unistd.h>
#include <i386/ibcs2/ibcs2_signal.h>
#include <i386/ibcs2/ibcs2_util.h>
#include <i386/ibcs2/ibcs2_proto.h>
#include <i386/ibcs2/ibcs2_xenix.h>
#include <i386/ibcs2/ibcs2_xenix_syscall.h>
extern struct sysent xenix_sysent[];
int
ibcs2_xenix(struct thread *td, struct ibcs2_xenix_args *uap)
{
struct trapframe *tf = td->td_frame;
struct sysent *callp;
u_int code;
int error;
code = (tf->tf_eax & 0xff00) >> 8;
callp = &xenix_sysent[code];
if (code < IBCS2_XENIX_MAXSYSCALL)
error = ((*callp->sy_call)(td, (void *)uap));
else
error = ENOSYS;
return (error);
}
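/*
 * Example (derived from the dispatch above): a Xenix sub-call arrives with
 * its number encoded in bits 8-15 of %eax, so for an rdchk() request
 * (tf->tf_eax & 0xff00) >> 8 selects the xenix_sysent[] slot whose sy_call
 * is xenix_rdchk(), which is then invoked with the original uap.
 */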
int
xenix_rdchk(td, uap)
struct thread *td;
struct xenix_rdchk_args *uap;
{
int data, error;
DPRINTF(("IBCS2: 'xenix rdchk'\n"));
error = kern_ioctl(td, uap->fd, FIONREAD, (caddr_t)&data);
if (error)
return (error);
td->td_retval[0] = data ? 1 : 0;
return (0);
}
int
xenix_chsize(td, uap)
struct thread *td;
struct xenix_chsize_args *uap;
{
struct ftruncate_args sa;
DPRINTF(("IBCS2: 'xenix chsize'\n"));
sa.fd = uap->fd;
sa.length = uap->size;
- return ftruncate(td, &sa);
+ return sys_ftruncate(td, &sa);
}
int
xenix_ftime(td, uap)
struct thread *td;
struct xenix_ftime_args *uap;
{
struct timeval tv;
struct ibcs2_timeb {
unsigned long time __packed;
unsigned short millitm;
short timezone;
short dstflag;
} itb;
DPRINTF(("IBCS2: 'xenix ftime'\n"));
microtime(&tv);
itb.time = tv.tv_sec;
itb.millitm = (tv.tv_usec / 1000);
itb.timezone = tz_minuteswest;
itb.dstflag = tz_dsttime != DST_NONE;
return copyout((caddr_t)&itb, (caddr_t)uap->tp,
sizeof(struct ibcs2_timeb));
}
int
xenix_nap(struct thread *td, struct xenix_nap_args *uap)
{
long period;
DPRINTF(("IBCS2: 'xenix nap %d ms'\n", uap->millisec));
period = (long)uap->millisec / (1000/hz);
if (period)
pause("nap", period);
return 0;
}
int
xenix_utsname(struct thread *td, struct xenix_utsname_args *uap)
{
struct ibcs2_sco_utsname {
char sysname[9];
char nodename[9];
char release[16];
char kernelid[20];
char machine[9];
char bustype[9];
char sysserial[10];
unsigned short sysorigin;
unsigned short sysoem;
char numusers[9];
unsigned short numcpu;
} ibcs2_sco_uname;
DPRINTF(("IBCS2: 'xenix sco_utsname'\n"));
bzero(&ibcs2_sco_uname, sizeof(struct ibcs2_sco_utsname));
strncpy(ibcs2_sco_uname.sysname, ostype,
sizeof(ibcs2_sco_uname.sysname) - 1);
getcredhostname(td->td_ucred, ibcs2_sco_uname.nodename,
sizeof(ibcs2_sco_uname.nodename) - 1);
strncpy(ibcs2_sco_uname.release, osrelease,
sizeof(ibcs2_sco_uname.release) - 1);
strncpy(ibcs2_sco_uname.kernelid, version,
sizeof(ibcs2_sco_uname.kernelid) - 1);
strncpy(ibcs2_sco_uname.machine, machine,
sizeof(ibcs2_sco_uname.machine) - 1);
strncpy(ibcs2_sco_uname.bustype, "ISA/EISA",
sizeof(ibcs2_sco_uname.bustype) - 1);
strncpy(ibcs2_sco_uname.sysserial, "no charge",
sizeof(ibcs2_sco_uname.sysserial) - 1);
strncpy(ibcs2_sco_uname.numusers, "unlim",
sizeof(ibcs2_sco_uname.numusers) - 1);
ibcs2_sco_uname.sysorigin = 0xFFFF;
ibcs2_sco_uname.sysoem = 0xFFFF;
ibcs2_sco_uname.numcpu = 1;
return copyout((caddr_t)&ibcs2_sco_uname,
(caddr_t)(void *)(intptr_t)uap->addr,
sizeof(struct ibcs2_sco_utsname));
}
int
xenix_scoinfo(struct thread *td, struct xenix_scoinfo_args *uap)
{
/* scoinfo (not documented) */
td->td_retval[0] = 0;
return 0;
}
int
xenix_eaccess(struct thread *td, struct xenix_eaccess_args *uap)
{
char *path;
int error, bsd_flags;
bsd_flags = 0;
if (uap->flags & IBCS2_R_OK)
bsd_flags |= R_OK;
if (uap->flags & IBCS2_W_OK)
bsd_flags |= W_OK;
if (uap->flags & IBCS2_X_OK)
bsd_flags |= X_OK;
CHECKALTEXIST(td, uap->path, &path);
error = kern_eaccess(td, path, UIO_SYSSPACE, bsd_flags);
free(path, M_TEMP);
return (error);
}
Index: head/sys/i386/linux/linux_machdep.c
===================================================================
--- head/sys/i386/linux/linux_machdep.c (revision 225616)
+++ head/sys/i386/linux/linux_machdep.c (revision 225617)
@@ -1,1120 +1,1120 @@
/*-
* Copyright (c) 2000 Marcel Moolenaar
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capability.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/imgact.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/sx.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>
#include <sys/unistd.h>
#include <sys/wait.h>
#include <sys/sched.h>
#include <machine/frame.h>
#include <machine/psl.h>
#include <machine/segments.h>
#include <machine/sysarch.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <i386/linux/linux.h>
#include <i386/linux/linux_proto.h>
#include <compat/linux/linux_ipc.h>
#include <compat/linux/linux_misc.h>
#include <compat/linux/linux_signal.h>
#include <compat/linux/linux_util.h>
#include <compat/linux/linux_emul.h>
#include <i386/include/pcb.h> /* needed for pcb definition in linux_set_thread_area */
#include "opt_posix.h"
extern struct sysentvec elf32_freebsd_sysvec; /* defined in i386/i386/elf_machdep.c */
struct l_descriptor {
l_uint entry_number;
l_ulong base_addr;
l_uint limit;
l_uint seg_32bit:1;
l_uint contents:2;
l_uint read_exec_only:1;
l_uint limit_in_pages:1;
l_uint seg_not_present:1;
l_uint useable:1;
};
struct l_old_select_argv {
l_int nfds;
l_fd_set *readfds;
l_fd_set *writefds;
l_fd_set *exceptfds;
struct l_timeval *timeout;
};
static int linux_mmap_common(struct thread *td, l_uintptr_t addr,
l_size_t len, l_int prot, l_int flags, l_int fd,
l_loff_t pos);
int
linux_to_bsd_sigaltstack(int lsa)
{
int bsa = 0;
if (lsa & LINUX_SS_DISABLE)
bsa |= SS_DISABLE;
if (lsa & LINUX_SS_ONSTACK)
bsa |= SS_ONSTACK;
return (bsa);
}
int
bsd_to_linux_sigaltstack(int bsa)
{
int lsa = 0;
if (bsa & SS_DISABLE)
lsa |= LINUX_SS_DISABLE;
if (bsa & SS_ONSTACK)
lsa |= LINUX_SS_ONSTACK;
return (lsa);
}
int
linux_execve(struct thread *td, struct linux_execve_args *args)
{
int error;
char *newpath;
struct image_args eargs;
LCONVPATHEXIST(td, args->path, &newpath);
#ifdef DEBUG
if (ldebug(execve))
printf(ARGS(execve, "%s"), newpath);
#endif
error = exec_copyin_args(&eargs, newpath, UIO_SYSSPACE,
args->argp, args->envp);
free(newpath, M_TEMP);
if (error == 0)
error = kern_execve(td, &eargs, NULL);
if (error == 0)
/* A Linux process can exec a FreeBSD one; don't attempt
 * to create emuldata for such a process using
 * linux_proc_init, as this leads to a panic on a KASSERT
 * because such a process has p->p_emuldata == NULL.
 */
if (SV_PROC_ABI(td->td_proc) == SV_ABI_LINUX)
error = linux_proc_init(td, 0, 0);
return (error);
}
struct l_ipc_kludge {
struct l_msgbuf *msgp;
l_long msgtyp;
};
int
linux_ipc(struct thread *td, struct linux_ipc_args *args)
{
switch (args->what & 0xFFFF) {
case LINUX_SEMOP: {
struct linux_semop_args a;
a.semid = args->arg1;
a.tsops = args->ptr;
a.nsops = args->arg2;
return (linux_semop(td, &a));
}
case LINUX_SEMGET: {
struct linux_semget_args a;
a.key = args->arg1;
a.nsems = args->arg2;
a.semflg = args->arg3;
return (linux_semget(td, &a));
}
case LINUX_SEMCTL: {
struct linux_semctl_args a;
int error;
a.semid = args->arg1;
a.semnum = args->arg2;
a.cmd = args->arg3;
error = copyin(args->ptr, &a.arg, sizeof(a.arg));
if (error)
return (error);
return (linux_semctl(td, &a));
}
case LINUX_MSGSND: {
struct linux_msgsnd_args a;
a.msqid = args->arg1;
a.msgp = args->ptr;
a.msgsz = args->arg2;
a.msgflg = args->arg3;
return (linux_msgsnd(td, &a));
}
case LINUX_MSGRCV: {
struct linux_msgrcv_args a;
a.msqid = args->arg1;
a.msgsz = args->arg2;
a.msgflg = args->arg3;
if ((args->what >> 16) == 0) {
struct l_ipc_kludge tmp;
int error;
if (args->ptr == NULL)
return (EINVAL);
error = copyin(args->ptr, &tmp, sizeof(tmp));
if (error)
return (error);
a.msgp = tmp.msgp;
a.msgtyp = tmp.msgtyp;
} else {
a.msgp = args->ptr;
a.msgtyp = args->arg5;
}
return (linux_msgrcv(td, &a));
}
case LINUX_MSGGET: {
struct linux_msgget_args a;
a.key = args->arg1;
a.msgflg = args->arg2;
return (linux_msgget(td, &a));
}
case LINUX_MSGCTL: {
struct linux_msgctl_args a;
a.msqid = args->arg1;
a.cmd = args->arg2;
a.buf = args->ptr;
return (linux_msgctl(td, &a));
}
case LINUX_SHMAT: {
struct linux_shmat_args a;
a.shmid = args->arg1;
a.shmaddr = args->ptr;
a.shmflg = args->arg2;
a.raddr = (l_ulong *)args->arg3;
return (linux_shmat(td, &a));
}
case LINUX_SHMDT: {
struct linux_shmdt_args a;
a.shmaddr = args->ptr;
return (linux_shmdt(td, &a));
}
case LINUX_SHMGET: {
struct linux_shmget_args a;
a.key = args->arg1;
a.size = args->arg2;
a.shmflg = args->arg3;
return (linux_shmget(td, &a));
}
case LINUX_SHMCTL: {
struct linux_shmctl_args a;
a.shmid = args->arg1;
a.cmd = args->arg2;
a.buf = args->ptr;
return (linux_shmctl(td, &a));
}
default:
break;
}
return (EINVAL);
}
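/*
 * Illustrative sketch of how the multiplexed ipc(2) entry point above is
 * reached: a Linux binary performing a semop() would issue roughly
 *
 *	syscall(SYS_ipc, SEMOP, semid, nsops, 0, sops, 0);
 *
 * and the LINUX_SEMOP case then maps that to linux_semop() with
 * a.semid = arg1, a.nsops = arg2 and a.tsops = ptr, exactly as the switch
 * above shows.  The syscall spelling and argument order in this sketch are
 * assumptions for illustration; only the mapping in the switch is taken
 * from the code.
 */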
int
linux_old_select(struct thread *td, struct linux_old_select_args *args)
{
struct l_old_select_argv linux_args;
struct linux_select_args newsel;
int error;
#ifdef DEBUG
if (ldebug(old_select))
printf(ARGS(old_select, "%p"), args->ptr);
#endif
error = copyin(args->ptr, &linux_args, sizeof(linux_args));
if (error)
return (error);
newsel.nfds = linux_args.nfds;
newsel.readfds = linux_args.readfds;
newsel.writefds = linux_args.writefds;
newsel.exceptfds = linux_args.exceptfds;
newsel.timeout = linux_args.timeout;
return (linux_select(td, &newsel));
}
int
linux_set_cloned_tls(struct thread *td, void *desc)
{
struct segment_descriptor sd;
struct l_user_desc info;
int idx, error;
int a[2];
error = copyin(desc, &info, sizeof(struct l_user_desc));
if (error) {
printf(LMSG("copyin failed!"));
} else {
idx = info.entry_number;
/*
* looks like we're getting the idx we returned
* in the set_thread_area() syscall
*/
if (idx != 6 && idx != 3) {
printf(LMSG("resetting idx!"));
idx = 3;
}
/* this doesn't happen in practice */
if (idx == 6) {
/* we might copy out the entry_number as 3 */
info.entry_number = 3;
error = copyout(&info, desc, sizeof(struct l_user_desc));
if (error)
printf(LMSG("copyout failed!"));
}
a[0] = LINUX_LDT_entry_a(&info);
a[1] = LINUX_LDT_entry_b(&info);
memcpy(&sd, &a, sizeof(a));
#ifdef DEBUG
if (ldebug(clone))
printf("Segment created in clone with "
"CLONE_SETTLS: lobase: %x, hibase: %x, "
"lolimit: %x, hilimit: %x, type: %i, "
"dpl: %i, p: %i, xx: %i, def32: %i, "
"gran: %i\n", sd.sd_lobase, sd.sd_hibase,
sd.sd_lolimit, sd.sd_hilimit, sd.sd_type,
sd.sd_dpl, sd.sd_p, sd.sd_xx,
sd.sd_def32, sd.sd_gran);
#endif
/* set %gs */
td->td_pcb->pcb_gsd = sd;
td->td_pcb->pcb_gs = GSEL(GUGS_SEL, SEL_UPL);
}
return (error);
}
int
linux_set_upcall_kse(struct thread *td, register_t stack)
{
td->td_frame->tf_esp = stack;
return (0);
}
#define STACK_SIZE (2 * 1024 * 1024)
#define GUARD_SIZE (4 * PAGE_SIZE)
int
linux_mmap2(struct thread *td, struct linux_mmap2_args *args)
{
#ifdef DEBUG
if (ldebug(mmap2))
printf(ARGS(mmap2, "%p, %d, %d, 0x%08x, %d, %d"),
(void *)args->addr, args->len, args->prot,
args->flags, args->fd, args->pgoff);
#endif
return (linux_mmap_common(td, args->addr, args->len, args->prot,
args->flags, args->fd, (uint64_t)(uint32_t)args->pgoff *
PAGE_SIZE));
}
int
linux_mmap(struct thread *td, struct linux_mmap_args *args)
{
int error;
struct l_mmap_argv linux_args;
error = copyin(args->ptr, &linux_args, sizeof(linux_args));
if (error)
return (error);
#ifdef DEBUG
if (ldebug(mmap))
printf(ARGS(mmap, "%p, %d, %d, 0x%08x, %d, %d"),
(void *)linux_args.addr, linux_args.len, linux_args.prot,
linux_args.flags, linux_args.fd, linux_args.pgoff);
#endif
return (linux_mmap_common(td, linux_args.addr, linux_args.len,
linux_args.prot, linux_args.flags, linux_args.fd,
(uint32_t)linux_args.pgoff));
}
static int
linux_mmap_common(struct thread *td, l_uintptr_t addr, l_size_t len, l_int prot,
l_int flags, l_int fd, l_loff_t pos)
{
struct proc *p = td->td_proc;
struct mmap_args /* {
caddr_t addr;
size_t len;
int prot;
int flags;
int fd;
long pad;
off_t pos;
} */ bsd_args;
int error;
struct file *fp;
error = 0;
bsd_args.flags = 0;
fp = NULL;
/*
* Linux mmap(2):
* You must specify exactly one of MAP_SHARED and MAP_PRIVATE
*/
if (!((flags & LINUX_MAP_SHARED) ^ (flags & LINUX_MAP_PRIVATE)))
return (EINVAL);
if (flags & LINUX_MAP_SHARED)
bsd_args.flags |= MAP_SHARED;
if (flags & LINUX_MAP_PRIVATE)
bsd_args.flags |= MAP_PRIVATE;
if (flags & LINUX_MAP_FIXED)
bsd_args.flags |= MAP_FIXED;
if (flags & LINUX_MAP_ANON) {
/* Enforce pos to be on page boundary, then ignore. */
if ((pos & PAGE_MASK) != 0)
return (EINVAL);
pos = 0;
bsd_args.flags |= MAP_ANON;
} else
bsd_args.flags |= MAP_NOSYNC;
if (flags & LINUX_MAP_GROWSDOWN)
bsd_args.flags |= MAP_STACK;
/*
* PROT_READ, PROT_WRITE, or PROT_EXEC implies PROT_READ and PROT_EXEC
* on Linux/i386. We do this to ensure maximum compatibility.
* Linux/ia64 does the same in i386 emulation mode.
*/
bsd_args.prot = prot;
if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC))
bsd_args.prot |= PROT_READ | PROT_EXEC;
/* Linux does not check file descriptor when MAP_ANONYMOUS is set. */
bsd_args.fd = (bsd_args.flags & MAP_ANON) ? -1 : fd;
if (bsd_args.fd != -1) {
/*
* Linux follows Solaris mmap(2) description:
* The file descriptor fildes is opened with
* read permission, regardless of the
* protection options specified.
*
* Checking just CAP_MMAP is fine here, since the real work
* is done in the FreeBSD mmap().
*/
if ((error = fget(td, bsd_args.fd, CAP_MMAP, &fp)) != 0)
return (error);
if (fp->f_type != DTYPE_VNODE) {
fdrop(fp, td);
return (EINVAL);
}
/* Linux mmap() just fails for O_WRONLY files */
if (!(fp->f_flag & FREAD)) {
fdrop(fp, td);
return (EACCES);
}
fdrop(fp, td);
}
if (flags & LINUX_MAP_GROWSDOWN) {
/*
* The Linux MAP_GROWSDOWN option does not limit auto
* growth of the region. Linux mmap with this option
* takes as addr the initial BOS, and as len, the initial
* region size. It can then grow down from addr without
* limit. However, Linux threads have an implicit internal
* limit on stack size of STACK_SIZE; it's just not
* enforced explicitly in Linux. But here we impose
* a limit of (STACK_SIZE - GUARD_SIZE) on the stack
* region, since we can do this with our mmap.
*
* Our mmap with MAP_STACK takes addr as the maximum
* downsize limit on BOS, and as len the max size of
* the region. It then maps the top SGROWSIZ bytes,
* and auto grows the region down, up to the limit
* in addr.
*
* If we don't use the MAP_STACK option, the effect
* of this code is to allocate a stack region of a
* fixed size of (STACK_SIZE - GUARD_SIZE).
*/
if ((caddr_t)PTRIN(addr) + len > p->p_vmspace->vm_maxsaddr) {
/*
* Some linux apps will attempt to mmap
* thread stacks near the top of their
* address space. If their TOS is greater
* than vm_maxsaddr, vm_map_growstack()
* will confuse the thread stack with the
* process stack and deliver a SEGV if they
* attempt to grow the thread stack past their
* current stacksize rlimit. To avoid this,
* adjust vm_maxsaddr upwards to reflect
* the current stacksize rlimit rather
* than the maximum possible stacksize.
* It would be better to adjust the
* mmap'ed region, but some apps do not check
* mmap's return value.
*/
PROC_LOCK(p);
p->p_vmspace->vm_maxsaddr = (char *)USRSTACK -
lim_cur(p, RLIMIT_STACK);
PROC_UNLOCK(p);
}
/*
* This gives us our maximum stack size and a new BOS.
* If we're using VM_STACK, then mmap will just map
* the top SGROWSIZ bytes, and let the stack grow down
* to the limit at BOS. If we're not using VM_STACK
* we map the full stack, since we don't have a way
* to autogrow it.
*/
if (len > STACK_SIZE - GUARD_SIZE) {
bsd_args.addr = (caddr_t)PTRIN(addr);
bsd_args.len = len;
} else {
bsd_args.addr = (caddr_t)PTRIN(addr) -
(STACK_SIZE - GUARD_SIZE - len);
bsd_args.len = STACK_SIZE - GUARD_SIZE;
}
} else {
bsd_args.addr = (caddr_t)PTRIN(addr);
bsd_args.len = len;
}
bsd_args.pos = pos;
#ifdef DEBUG
if (ldebug(mmap))
printf("-> %s(%p, %d, %d, 0x%08x, %d, 0x%x)\n",
__func__,
(void *)bsd_args.addr, bsd_args.len, bsd_args.prot,
bsd_args.flags, bsd_args.fd, (int)bsd_args.pos);
#endif
- error = mmap(td, &bsd_args);
+ error = sys_mmap(td, &bsd_args);
#ifdef DEBUG
if (ldebug(mmap))
printf("-> %s() return: 0x%x (0x%08x)\n",
__func__, error, (u_int)td->td_retval[0]);
#endif
return (error);
}
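/*
 * Worked example of the MAP_GROWSDOWN sizing above (illustrative numbers,
 * assuming the usual i386 PAGE_SIZE of 4096, so that
 * STACK_SIZE - GUARD_SIZE = 2MB - 16KB = 0x1fc000): a thread library
 * mapping a 256KB stack at addr = 0x20000000, len = 0x40000 takes the
 * "else" branch, giving
 *
 *	bsd_args.addr = 0x20000000 - (0x1fc000 - 0x40000) = 0x1fe44000
 *	bsd_args.len  = 0x1fc000
 *
 * so the top of the region (addr + len = 0x20040000) is preserved while
 * the bottom is pushed down, letting MAP_STACK auto-grow the stack within
 * the enlarged range.
 */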
int
linux_mprotect(struct thread *td, struct linux_mprotect_args *uap)
{
struct mprotect_args bsd_args;
bsd_args.addr = uap->addr;
bsd_args.len = uap->len;
bsd_args.prot = uap->prot;
if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC))
bsd_args.prot |= PROT_READ | PROT_EXEC;
- return (mprotect(td, &bsd_args));
+ return (sys_mprotect(td, &bsd_args));
}
int
linux_pipe(struct thread *td, struct linux_pipe_args *args)
{
int error;
int fildes[2];
#ifdef DEBUG
if (ldebug(pipe))
printf(ARGS(pipe, "*"));
#endif
error = kern_pipe(td, fildes);
if (error)
return (error);
/* XXX: Close descriptors on error. */
return (copyout(fildes, args->pipefds, sizeof fildes));
}
int
linux_ioperm(struct thread *td, struct linux_ioperm_args *args)
{
int error;
struct i386_ioperm_args iia;
iia.start = args->start;
iia.length = args->length;
iia.enable = args->enable;
error = i386_set_ioperm(td, &iia);
return (error);
}
int
linux_iopl(struct thread *td, struct linux_iopl_args *args)
{
int error;
if (args->level < 0 || args->level > 3)
return (EINVAL);
if ((error = priv_check(td, PRIV_IO)) != 0)
return (error);
if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
return (error);
td->td_frame->tf_eflags = (td->td_frame->tf_eflags & ~PSL_IOPL) |
(args->level * (PSL_IOPL / 3));
return (0);
}
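/*
 * Arithmetic note on the IOPL computation above: PSL_IOPL covers both
 * IOPL bits in %eflags (0x3000), so PSL_IOPL / 3 == 0x1000 and
 * args->level * (PSL_IOPL / 3) places the requested level 0..3 directly
 * into the IOPL field, e.g. level 3 yields 0x3000 and level 1 yields
 * 0x1000.
 */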
int
linux_modify_ldt(struct thread *td, struct linux_modify_ldt_args *uap)
{
int error;
struct i386_ldt_args ldt;
struct l_descriptor ld;
union descriptor desc;
int size, written;
switch (uap->func) {
case 0x00: /* read_ldt */
ldt.start = 0;
ldt.descs = uap->ptr;
ldt.num = uap->bytecount / sizeof(union descriptor);
error = i386_get_ldt(td, &ldt);
td->td_retval[0] *= sizeof(union descriptor);
break;
case 0x02: /* read_default_ldt = 0 */
size = 5*sizeof(struct l_desc_struct);
if (size > uap->bytecount)
size = uap->bytecount;
for (written = error = 0; written < size && error == 0; written++)
error = subyte((char *)uap->ptr + written, 0);
td->td_retval[0] = written;
break;
case 0x01: /* write_ldt */
case 0x11: /* write_ldt */
if (uap->bytecount != sizeof(ld))
return (EINVAL);
error = copyin(uap->ptr, &ld, sizeof(ld));
if (error)
return (error);
ldt.start = ld.entry_number;
ldt.descs = &desc;
ldt.num = 1;
desc.sd.sd_lolimit = (ld.limit & 0x0000ffff);
desc.sd.sd_hilimit = (ld.limit & 0x000f0000) >> 16;
desc.sd.sd_lobase = (ld.base_addr & 0x00ffffff);
desc.sd.sd_hibase = (ld.base_addr & 0xff000000) >> 24;
desc.sd.sd_type = SDT_MEMRO | ((ld.read_exec_only ^ 1) << 1) |
(ld.contents << 2);
desc.sd.sd_dpl = 3;
desc.sd.sd_p = (ld.seg_not_present ^ 1);
desc.sd.sd_xx = 0;
desc.sd.sd_def32 = ld.seg_32bit;
desc.sd.sd_gran = ld.limit_in_pages;
error = i386_set_ldt(td, &ldt, &desc);
break;
default:
error = ENOSYS;
break;
}
if (error == EOPNOTSUPP) {
printf("linux: modify_ldt needs kernel option USER_LDT\n");
error = ENOSYS;
}
return (error);
}
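/*
 * Worked example of the write_ldt packing above (illustrative values): a
 * request with ld.base_addr = 0x12345678, ld.limit = 0xfffff,
 * ld.limit_in_pages = 1, ld.seg_32bit = 1, ld.read_exec_only = 0 and
 * ld.seg_not_present = 0 is split into
 *
 *	sd_lobase = 0x345678, sd_hibase = 0x12,
 *	sd_lolimit = 0xffff,  sd_hilimit = 0xf,
 *	sd_type = SDT_MEMRO | (1 << 1)	(a read/write data segment),
 *	sd_p = 1, sd_def32 = 1, sd_gran = 1
 *
 * which is the standard i386 descriptor layout: a 24+8 bit base, a
 * 16+4 bit limit, and the granularity bit selecting byte vs. page units.
 */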
int
linux_sigaction(struct thread *td, struct linux_sigaction_args *args)
{
l_osigaction_t osa;
l_sigaction_t act, oact;
int error;
#ifdef DEBUG
if (ldebug(sigaction))
printf(ARGS(sigaction, "%d, %p, %p"),
args->sig, (void *)args->nsa, (void *)args->osa);
#endif
if (args->nsa != NULL) {
error = copyin(args->nsa, &osa, sizeof(l_osigaction_t));
if (error)
return (error);
act.lsa_handler = osa.lsa_handler;
act.lsa_flags = osa.lsa_flags;
act.lsa_restorer = osa.lsa_restorer;
LINUX_SIGEMPTYSET(act.lsa_mask);
act.lsa_mask.__bits[0] = osa.lsa_mask;
}
error = linux_do_sigaction(td, args->sig, args->nsa ? &act : NULL,
args->osa ? &oact : NULL);
if (args->osa != NULL && !error) {
osa.lsa_handler = oact.lsa_handler;
osa.lsa_flags = oact.lsa_flags;
osa.lsa_restorer = oact.lsa_restorer;
osa.lsa_mask = oact.lsa_mask.__bits[0];
error = copyout(&osa, args->osa, sizeof(l_osigaction_t));
}
return (error);
}
/*
* Linux has two extra args, restart and oldmask. We don't use these,
* but it seems that "restart" is actually a context pointer that
* enables the signal to happen with a different register set.
*/
int
linux_sigsuspend(struct thread *td, struct linux_sigsuspend_args *args)
{
sigset_t sigmask;
l_sigset_t mask;
#ifdef DEBUG
if (ldebug(sigsuspend))
printf(ARGS(sigsuspend, "%08lx"), (unsigned long)args->mask);
#endif
LINUX_SIGEMPTYSET(mask);
mask.__bits[0] = args->mask;
linux_to_bsd_sigset(&mask, &sigmask);
return (kern_sigsuspend(td, sigmask));
}
int
linux_rt_sigsuspend(struct thread *td, struct linux_rt_sigsuspend_args *uap)
{
l_sigset_t lmask;
sigset_t sigmask;
int error;
#ifdef DEBUG
if (ldebug(rt_sigsuspend))
printf(ARGS(rt_sigsuspend, "%p, %d"),
(void *)uap->newset, uap->sigsetsize);
#endif
if (uap->sigsetsize != sizeof(l_sigset_t))
return (EINVAL);
error = copyin(uap->newset, &lmask, sizeof(l_sigset_t));
if (error)
return (error);
linux_to_bsd_sigset(&lmask, &sigmask);
return (kern_sigsuspend(td, sigmask));
}
int
linux_pause(struct thread *td, struct linux_pause_args *args)
{
struct proc *p = td->td_proc;
sigset_t sigmask;
#ifdef DEBUG
if (ldebug(pause))
printf(ARGS(pause, ""));
#endif
PROC_LOCK(p);
sigmask = td->td_sigmask;
PROC_UNLOCK(p);
return (kern_sigsuspend(td, sigmask));
}
int
linux_sigaltstack(struct thread *td, struct linux_sigaltstack_args *uap)
{
stack_t ss, oss;
l_stack_t lss;
int error;
#ifdef DEBUG
if (ldebug(sigaltstack))
printf(ARGS(sigaltstack, "%p, %p"), uap->uss, uap->uoss);
#endif
if (uap->uss != NULL) {
error = copyin(uap->uss, &lss, sizeof(l_stack_t));
if (error)
return (error);
ss.ss_sp = lss.ss_sp;
ss.ss_size = lss.ss_size;
ss.ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags);
}
error = kern_sigaltstack(td, (uap->uss != NULL) ? &ss : NULL,
(uap->uoss != NULL) ? &oss : NULL);
if (!error && uap->uoss != NULL) {
lss.ss_sp = oss.ss_sp;
lss.ss_size = oss.ss_size;
lss.ss_flags = bsd_to_linux_sigaltstack(oss.ss_flags);
error = copyout(&lss, uap->uoss, sizeof(l_stack_t));
}
return (error);
}
int
linux_ftruncate64(struct thread *td, struct linux_ftruncate64_args *args)
{
struct ftruncate_args sa;
#ifdef DEBUG
if (ldebug(ftruncate64))
printf(ARGS(ftruncate64, "%u, %jd"), args->fd,
(intmax_t)args->length);
#endif
sa.fd = args->fd;
sa.length = args->length;
- return ftruncate(td, &sa);
+ return sys_ftruncate(td, &sa);
}
int
linux_set_thread_area(struct thread *td, struct linux_set_thread_area_args *args)
{
struct l_user_desc info;
int error;
int idx;
int a[2];
struct segment_descriptor sd;
error = copyin(args->desc, &info, sizeof(struct l_user_desc));
if (error)
return (error);
#ifdef DEBUG
if (ldebug(set_thread_area))
printf(ARGS(set_thread_area, "%i, %x, %x, %i, %i, %i, %i, %i, %i\n"),
info.entry_number,
info.base_addr,
info.limit,
info.seg_32bit,
info.contents,
info.read_exec_only,
info.limit_in_pages,
info.seg_not_present,
info.useable);
#endif
idx = info.entry_number;
/*
* Semantics of linux version: every thread in the system has array of
* 3 tls descriptors. 1st is GLIBC TLS, 2nd is WINE, 3rd unknown. This
* syscall loads the selected tls descriptor with a value and
* also loads GDT descriptors 6, 7 and 8 with the content of the
* per-thread descriptors.
*
* Semantics of fbsd version: I think we can ignore that linux has 3
* per-thread descriptors and use just the 1st one. The tls_array[]
* is used only in set/get-thread_area() syscalls and for loading the
* GDT descriptors. In fbsd we use just one GDT descriptor for TLS so
* we will load just one.
*
* XXX: this doesn't work when a user space process tries to use more
* than 1 TLS segment. Comment in the linux sources says wine might do
* this.
*/
/*
* we support just GLIBC TLS for now
* we should also let 3 proceed, because we use this segment, so
* if code makes two subsequent calls it should succeed
*/
if (idx != 6 && idx != -1 && idx != 3)
return (EINVAL);
/*
* we have to copy out the GDT entry we use
* FreeBSD uses GDT entry #3 for storing %gs so load that
*
* XXX: what if a user space program doesn't check this value and tries
* to use 6, 7 or 8?
*/
idx = info.entry_number = 3;
error = copyout(&info, args->desc, sizeof(struct l_user_desc));
if (error)
return (error);
if (LINUX_LDT_empty(&info)) {
a[0] = 0;
a[1] = 0;
} else {
a[0] = LINUX_LDT_entry_a(&info);
a[1] = LINUX_LDT_entry_b(&info);
}
memcpy(&sd, &a, sizeof(a));
#ifdef DEBUG
if (ldebug(set_thread_area))
printf("Segment created in set_thread_area: lobase: %x, hibase: %x, lolimit: %x, hilimit: %x, type: %i, dpl: %i, p: %i, xx: %i, def32: %i, gran: %i\n", sd.sd_lobase,
sd.sd_hibase,
sd.sd_lolimit,
sd.sd_hilimit,
sd.sd_type,
sd.sd_dpl,
sd.sd_p,
sd.sd_xx,
sd.sd_def32,
sd.sd_gran);
#endif
/* this is taken from i386 version of cpu_set_user_tls() */
critical_enter();
/* set %gs */
td->td_pcb->pcb_gsd = sd;
PCPU_GET(fsgs_gdt)[1] = sd;
load_gs(GSEL(GUGS_SEL, SEL_UPL));
critical_exit();
return (0);
}
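/*
 * Usage sketch (hypothetical user space, to illustrate the semantics
 * described above): a Linux libc setting up TLS typically does roughly
 *
 *	struct user_desc d;
 *	memset(&d, 0, sizeof(d));
 *	d.entry_number = -1;		(ask the kernel to pick a slot)
 *	d.base_addr = (unsigned)tcb;
 *	d.limit = 0xfffff;
 *	d.seg_32bit = 1; d.limit_in_pages = 1; d.useable = 1;
 *	set_thread_area(&d);
 *	asm volatile("movw %w0, %%gs" :: "q"(d.entry_number * 8 + 3));
 *
 * Since entry_number is forced to 3 and copied back above, the selector
 * loaded into %gs is 3 * 8 + 3 = 0x1b, i.e. GSEL(GUGS_SEL, SEL_UPL),
 * matching the value installed by the cpu_set_user_tls()-style code at
 * the end of the function.  The libc call sequence and struct layout are
 * assumptions for illustration; only the idx/copyout handling in the
 * function itself is taken from the code.
 */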
int
linux_get_thread_area(struct thread *td, struct linux_get_thread_area_args *args)
{
struct l_user_desc info;
int error;
int idx;
struct l_desc_struct desc;
struct segment_descriptor sd;
#ifdef DEBUG
if (ldebug(get_thread_area))
printf(ARGS(get_thread_area, "%p"), args->desc);
#endif
error = copyin(args->desc, &info, sizeof(struct l_user_desc));
if (error)
return (error);
idx = info.entry_number;
/* XXX: I am not sure if we want 3 to be allowed too. */
if (idx != 6 && idx != 3)
return (EINVAL);
idx = 3;
memset(&info, 0, sizeof(info));
sd = PCPU_GET(fsgs_gdt)[1];
memcpy(&desc, &sd, sizeof(desc));
info.entry_number = idx;
info.base_addr = LINUX_GET_BASE(&desc);
info.limit = LINUX_GET_LIMIT(&desc);
info.seg_32bit = LINUX_GET_32BIT(&desc);
info.contents = LINUX_GET_CONTENTS(&desc);
info.read_exec_only = !LINUX_GET_WRITABLE(&desc);
info.limit_in_pages = LINUX_GET_LIMIT_PAGES(&desc);
info.seg_not_present = !LINUX_GET_PRESENT(&desc);
info.useable = LINUX_GET_USEABLE(&desc);
error = copyout(&info, args->desc, sizeof(struct l_user_desc));
if (error)
return (EFAULT);
return (0);
}
/* copied from kern/kern_time.c */
int
linux_timer_create(struct thread *td, struct linux_timer_create_args *args)
{
- return ktimer_create(td, (struct ktimer_create_args *) args);
+ return sys_ktimer_create(td, (struct ktimer_create_args *) args);
}
int
linux_timer_settime(struct thread *td, struct linux_timer_settime_args *args)
{
- return ktimer_settime(td, (struct ktimer_settime_args *) args);
+ return sys_ktimer_settime(td, (struct ktimer_settime_args *) args);
}
int
linux_timer_gettime(struct thread *td, struct linux_timer_gettime_args *args)
{
- return ktimer_gettime(td, (struct ktimer_gettime_args *) args);
+ return sys_ktimer_gettime(td, (struct ktimer_gettime_args *) args);
}
int
linux_timer_getoverrun(struct thread *td, struct linux_timer_getoverrun_args *args)
{
- return ktimer_getoverrun(td, (struct ktimer_getoverrun_args *) args);
+ return sys_ktimer_getoverrun(td, (struct ktimer_getoverrun_args *) args);
}
int
linux_timer_delete(struct thread *td, struct linux_timer_delete_args *args)
{
- return ktimer_delete(td, (struct ktimer_delete_args *) args);
+ return sys_ktimer_delete(td, (struct ktimer_delete_args *) args);
}
/* XXX: this won't work with a module - convert it */
int
linux_mq_open(struct thread *td, struct linux_mq_open_args *args)
{
#ifdef P1003_1B_MQUEUE
- return kmq_open(td, (struct kmq_open_args *) args);
+ return sys_kmq_open(td, (struct kmq_open_args *) args);
#else
return (ENOSYS);
#endif
}
int
linux_mq_unlink(struct thread *td, struct linux_mq_unlink_args *args)
{
#ifdef P1003_1B_MQUEUE
- return kmq_unlink(td, (struct kmq_unlink_args *) args);
+ return sys_kmq_unlink(td, (struct kmq_unlink_args *) args);
#else
return (ENOSYS);
#endif
}
int
linux_mq_timedsend(struct thread *td, struct linux_mq_timedsend_args *args)
{
#ifdef P1003_1B_MQUEUE
- return kmq_timedsend(td, (struct kmq_timedsend_args *) args);
+ return sys_kmq_timedsend(td, (struct kmq_timedsend_args *) args);
#else
return (ENOSYS);
#endif
}
int
linux_mq_timedreceive(struct thread *td, struct linux_mq_timedreceive_args *args)
{
#ifdef P1003_1B_MQUEUE
- return kmq_timedreceive(td, (struct kmq_timedreceive_args *) args);
+ return sys_kmq_timedreceive(td, (struct kmq_timedreceive_args *) args);
#else
return (ENOSYS);
#endif
}
int
linux_mq_notify(struct thread *td, struct linux_mq_notify_args *args)
{
#ifdef P1003_1B_MQUEUE
- return kmq_notify(td, (struct kmq_notify_args *) args);
+ return sys_kmq_notify(td, (struct kmq_notify_args *) args);
#else
return (ENOSYS);
#endif
}
int
linux_mq_getsetattr(struct thread *td, struct linux_mq_getsetattr_args *args)
{
#ifdef P1003_1B_MQUEUE
- return kmq_setattr(td, (struct kmq_setattr_args *) args);
+ return sys_kmq_setattr(td, (struct kmq_setattr_args *) args);
#else
return (ENOSYS);
#endif
}
int
linux_wait4(struct thread *td, struct linux_wait4_args *args)
{
int error, options;
struct rusage ru, *rup;
#ifdef DEBUG
if (ldebug(wait4))
printf(ARGS(wait4, "%d, %p, %d, %p"),
args->pid, (void *)args->status, args->options,
(void *)args->rusage);
#endif
options = (args->options & (WNOHANG | WUNTRACED));
/* WLINUXCLONE should be equal to __WCLONE, but we map it explicitly to be sure */
if (args->options & __WCLONE)
options |= WLINUXCLONE;
if (args->rusage != NULL)
rup = &ru;
else
rup = NULL;
error = linux_common_wait(td, args->pid, args->status, options, rup);
if (error)
return (error);
if (args->rusage != NULL)
error = copyout(&ru, args->rusage, sizeof(ru));
return (error);
}
Index: head/sys/ia64/ia32/ia32_signal.c
===================================================================
--- head/sys/ia64/ia32/ia32_signal.c (revision 225616)
+++ head/sys/ia64/ia32/ia32_signal.c (revision 225617)
@@ -1,298 +1,298 @@
/*-
* Copyright (c) 2002 Doug Rabson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#define __ELF_WORD_SIZE 32
#include <sys/param.h>
#include <sys/exec.h>
#include <sys/fcntl.h>
#include <sys/imgact.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/mman.h>
#include <sys/namei.h>
#include <sys/pioctl.h>
#include <sys/proc.h>
#include <sys/procfs.h>
#include <sys/resourcevar.h>
#include <sys/systm.h>
#include <sys/signalvar.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/syscall.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/vnode.h>
#include <sys/imgact_elf.h>
#include <sys/sysproto.h>
#include <machine/frame.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <vm/vm.h>
#include <vm/vm_kern.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <compat/freebsd32/freebsd32_signal.h>
#include <compat/freebsd32/freebsd32_util.h>
#include <compat/freebsd32/freebsd32_proto.h>
#include <compat/ia32/ia32_signal.h>
#include <i386/include/psl.h>
#include <i386/include/segments.h>
#include <i386/include/specialreg.h>
char ia32_sigcode[] = {
0xff, 0x54, 0x24, 0x10, /* call *SIGF_HANDLER(%esp) */
0x8d, 0x44, 0x24, 0x14, /* lea SIGF_UC(%esp),%eax */
0x50, /* pushl %eax */
0xf7, 0x40, 0x54, 0x00, 0x00, 0x02, 0x02, /* testl $PSL_VM,UC_EFLAGS(%eax) */
0x75, 0x03, /* jne 9f */
0x8e, 0x68, 0x14, /* movl UC_GS(%eax),%gs */
0xb8, 0x57, 0x01, 0x00, 0x00, /* 9: movl $SYS_sigreturn,%eax */
0x50, /* pushl %eax */
0xcd, 0x80, /* int $0x80 */
0xeb, 0xfe, /* 0: jmp 0b */
0
};
int sz_ia32_sigcode = sizeof(ia32_sigcode);
#ifdef COMPAT_43
int
ofreebsd32_sigreturn(struct thread *td, struct ofreebsd32_sigreturn_args *uap)
{
return (EOPNOTSUPP);
}
#endif
/*
* Signal sending has not been implemented on ia64. This causes
* the sigtramp code to not understand the arguments and the application
* will generally crash if it tries to handle a signal. Calling
* sendsig() means that at least untrapped signals will work.
*/
void
ia32_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
sendsig(catcher, ksi, mask);
}
#ifdef COMPAT_FREEBSD4
int
freebsd4_freebsd32_sigreturn(struct thread *td, struct freebsd4_freebsd32_sigreturn_args *uap)
{
- return (sigreturn(td, (struct sigreturn_args *)uap));
+ return (sys_sigreturn(td, (struct sigreturn_args *)uap));
}
#endif
int
freebsd32_sigreturn(struct thread *td, struct freebsd32_sigreturn_args *uap)
{
- return (sigreturn(td, (struct sigreturn_args *)uap));
+ return (sys_sigreturn(td, (struct sigreturn_args *)uap));
}
void
ia32_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
struct trapframe *tf = td->td_frame;
vm_offset_t gdt, ldt;
u_int64_t codesel, datasel, ldtsel;
u_int64_t codeseg, dataseg, gdtseg, ldtseg;
struct segment_descriptor desc;
struct vmspace *vmspace = td->td_proc->p_vmspace;
struct sysentvec *sv;
sv = td->td_proc->p_sysent;
exec_setregs(td, imgp, stack);
/* Non-syscall frames are cleared by exec_setregs() */
if (tf->tf_flags & FRAME_SYSCALL) {
bzero(&tf->tf_scratch, sizeof(tf->tf_scratch));
bzero(&tf->tf_scratch_fp, sizeof(tf->tf_scratch_fp));
} else
tf->tf_special.ndirty = 0;
tf->tf_special.psr |= IA64_PSR_IS;
tf->tf_special.sp = stack;
/* Point the RSE backstore to something harmless. */
tf->tf_special.bspstore = (sv->sv_psstrings - sz_ia32_sigcode -
SPARE_USRSPACE + 15) & ~15;
codesel = LSEL(LUCODE_SEL, SEL_UPL);
datasel = LSEL(LUDATA_SEL, SEL_UPL);
ldtsel = GSEL(GLDT_SEL, SEL_UPL);
/* Setup ia32 segment registers. */
tf->tf_scratch.gr16 = (datasel << 48) | (datasel << 32) |
(datasel << 16) | datasel;
tf->tf_scratch.gr17 = (ldtsel << 32) | (datasel << 16) | codesel;
/*
* Build the GDT and LDT.
*/
gdt = sv->sv_usrstack;
vm_map_find(&vmspace->vm_map, 0, 0, &gdt, IA32_PAGE_SIZE << 1, 0,
VM_PROT_ALL, VM_PROT_ALL, 0);
ldt = gdt + IA32_PAGE_SIZE;
desc.sd_lolimit = 8*NLDT-1;
desc.sd_lobase = ldt & 0xffffff;
desc.sd_type = SDT_SYSLDT;
desc.sd_dpl = SEL_UPL;
desc.sd_p = 1;
desc.sd_hilimit = 0;
desc.sd_def32 = 0;
desc.sd_gran = 0;
desc.sd_hibase = ldt >> 24;
copyout(&desc, (caddr_t) gdt + 8*GLDT_SEL, sizeof(desc));
desc.sd_lolimit = ((sv->sv_usrstack >> 12) - 1) & 0xffff;
desc.sd_lobase = 0;
desc.sd_type = SDT_MEMERA;
desc.sd_dpl = SEL_UPL;
desc.sd_p = 1;
desc.sd_hilimit = ((sv->sv_usrstack >> 12) - 1) >> 16;
desc.sd_def32 = 1;
desc.sd_gran = 1;
desc.sd_hibase = 0;
copyout(&desc, (caddr_t) ldt + 8*LUCODE_SEL, sizeof(desc));
desc.sd_type = SDT_MEMRWA;
copyout(&desc, (caddr_t) ldt + 8*LUDATA_SEL, sizeof(desc));
codeseg = 0 /* base */
+ (((sv->sv_usrstack >> 12) - 1) << 32) /* limit */
+ ((long)SDT_MEMERA << 52)
+ ((long)SEL_UPL << 57)
+ (1L << 59) /* present */
+ (1L << 62) /* 32 bits */
+ (1L << 63); /* page granularity */
dataseg = 0 /* base */
+ (((sv->sv_usrstack >> 12) - 1) << 32) /* limit */
+ ((long)SDT_MEMRWA << 52)
+ ((long)SEL_UPL << 57)
+ (1L << 59) /* present */
+ (1L << 62) /* 32 bits */
+ (1L << 63); /* page granularity */
tf->tf_scratch.csd = codeseg;
tf->tf_scratch.ssd = dataseg;
tf->tf_scratch.gr24 = dataseg; /* ESD */
tf->tf_scratch.gr27 = dataseg; /* DSD */
tf->tf_scratch.gr28 = dataseg; /* FSD */
tf->tf_scratch.gr29 = dataseg; /* GSD */
gdtseg = gdt /* base */
+ ((8L*NGDT - 1) << 32) /* limit */
+ ((long)SDT_SYSNULL << 52)
+ ((long)SEL_UPL << 57)
+ (1L << 59) /* present */
+ (0L << 62) /* 16 bits */
+ (0L << 63); /* byte granularity */
ldtseg = ldt /* base */
+ ((8L*NLDT - 1) << 32) /* limit */
+ ((long)SDT_SYSLDT << 52)
+ ((long)SEL_UPL << 57)
+ (1L << 59) /* present */
+ (0L << 62) /* 16 bits */
+ (0L << 63); /* byte granularity */
tf->tf_scratch.gr30 = ldtseg; /* LDTD */
tf->tf_scratch.gr31 = gdtseg; /* GDTD */
/* Set ia32 control registers on this processor. */
ia64_set_cflg(CR0_PE | CR0_PG | ((long)(CR4_XMM | CR4_FXSR) << 32));
ia64_set_eflag(PSL_USER);
/* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */
tf->tf_scratch.gr11 = td->td_proc->p_sysent->sv_psstrings;
/*
* XXX - Linux emulator
* Make sure edx is 0x0 on entry. Linux binaries depend
* on it.
*/
td->td_retval[1] = 0;
}
void
ia32_restorectx(struct pcb *pcb)
{
ia64_set_cflg(pcb->pcb_ia32_cflg);
ia64_set_eflag(pcb->pcb_ia32_eflag);
ia64_set_fcr(pcb->pcb_ia32_fcr);
ia64_set_fdr(pcb->pcb_ia32_fdr);
ia64_set_fir(pcb->pcb_ia32_fir);
ia64_set_fsr(pcb->pcb_ia32_fsr);
}
void
ia32_savectx(struct pcb *pcb)
{
pcb->pcb_ia32_cflg = ia64_get_cflg();
pcb->pcb_ia32_eflag = ia64_get_eflag();
pcb->pcb_ia32_fcr = ia64_get_fcr();
pcb->pcb_ia32_fdr = ia64_get_fdr();
pcb->pcb_ia32_fir = ia64_get_fir();
pcb->pcb_ia32_fsr = ia64_get_fsr();
}
int
freebsd32_getcontext(struct thread *td, struct freebsd32_getcontext_args *uap)
{
return (nosys(td, NULL));
}
int
freebsd32_setcontext(struct thread *td, struct freebsd32_setcontext_args *uap)
{
return (nosys(td, NULL));
}
int
freebsd32_swapcontext(struct thread *td, struct freebsd32_swapcontext_args *uap)
{
return (nosys(td, NULL));
}
Index: head/sys/ia64/ia64/machdep.c
===================================================================
--- head/sys/ia64/ia64/machdep.c (revision 225616)
+++ head/sys/ia64/ia64/machdep.c (revision 225617)
@@ -1,1586 +1,1586 @@
/*-
* Copyright (c) 2003,2004 Marcel Moolenaar
* Copyright (c) 2000,2001 Doug Rabson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_ddb.h"
#include "opt_kstack_pages.h"
#include "opt_sched.h"
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/msgbuf.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/random.h>
#include <sys/reboot.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/syscall.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/uio.h>
#include <sys/uuid.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <ddb/ddb.h>
#include <net/netisr.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <machine/bootinfo.h>
#include <machine/cpu.h>
#include <machine/efi.h>
#include <machine/elf.h>
#include <machine/fpu.h>
#include <machine/intr.h>
#include <machine/mca.h>
#include <machine/md_var.h>
#include <machine/pal.h>
#include <machine/pcb.h>
#include <machine/reg.h>
#include <machine/sal.h>
#include <machine/sigframe.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#include <machine/unwind.h>
#include <machine/vmparam.h>
SYSCTL_NODE(_hw, OID_AUTO, freq, CTLFLAG_RD, 0, "");
SYSCTL_NODE(_machdep, OID_AUTO, cpu, CTLFLAG_RD, 0, "");
static u_int bus_freq;
SYSCTL_UINT(_hw_freq, OID_AUTO, bus, CTLFLAG_RD, &bus_freq, 0,
"Bus clock frequency");
static u_int cpu_freq;
SYSCTL_UINT(_hw_freq, OID_AUTO, cpu, CTLFLAG_RD, &cpu_freq, 0,
"CPU clock frequency");
static u_int itc_freq;
SYSCTL_UINT(_hw_freq, OID_AUTO, itc, CTLFLAG_RD, &itc_freq, 0,
"ITC frequency");
int cold = 1;
struct bootinfo *bootinfo;
struct pcpu pcpu0;
extern u_int64_t kernel_text[], _end[];
extern u_int64_t ia64_gateway_page[];
extern u_int64_t break_sigtramp[];
extern u_int64_t epc_sigtramp[];
struct fpswa_iface *fpswa_iface;
vm_size_t ia64_pal_size;
vm_paddr_t ia64_pal_base;
vm_offset_t ia64_port_base;
u_int64_t ia64_lapic_addr = PAL_PIB_DEFAULT_ADDR;
struct ia64_pib *ia64_pib;
static int ia64_sync_icache_needed;
char machine[] = MACHINE;
SYSCTL_STRING(_hw, HW_MACHINE, machine, CTLFLAG_RD, machine, 0, "");
static char cpu_model[64];
SYSCTL_STRING(_hw, HW_MODEL, model, CTLFLAG_RD, cpu_model, 0,
"The CPU model name");
static char cpu_family[64];
SYSCTL_STRING(_hw, OID_AUTO, family, CTLFLAG_RD, cpu_family, 0,
"The CPU family name");
#ifdef DDB
extern vm_offset_t ksym_start, ksym_end;
#endif
struct msgbuf *msgbufp = NULL;
/* Other subsystems (e.g., ACPI) can hook this later. */
void (*cpu_idle_hook)(void) = NULL;
long Maxmem = 0;
long realmem = 0;
#define PHYSMAP_SIZE (2 * VM_PHYSSEG_MAX)
vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
/* must be 2 less so that a terminating 0/0 pair can signal the end of chunks */
#define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(vm_offset_t)) - 2)
struct kva_md_info kmi;
#define Mhz 1000000L
#define Ghz (1000L*Mhz)
static void
identifycpu(void)
{
char vendor[17];
char *family_name, *model_name;
u_int64_t features, tmp;
int number, revision, model, family, archrev;
/*
* Assumes little-endian.
*/
*(u_int64_t *) &vendor[0] = ia64_get_cpuid(0);
*(u_int64_t *) &vendor[8] = ia64_get_cpuid(1);
vendor[16] = '\0';
tmp = ia64_get_cpuid(3);
number = (tmp >> 0) & 0xff;
revision = (tmp >> 8) & 0xff;
model = (tmp >> 16) & 0xff;
family = (tmp >> 24) & 0xff;
archrev = (tmp >> 32) & 0xff;
family_name = model_name = "unknown";
switch (family) {
case 0x07:
family_name = "Itanium";
model_name = "Merced";
break;
case 0x1f:
family_name = "Itanium 2";
switch (model) {
case 0x00:
model_name = "McKinley";
break;
case 0x01:
/*
* Deerfield is a low-voltage variant based on the
* Madison core. We need circumstantial evidence
* (i.e. the clock frequency) to identify those.
* Allow for roughly 1% error margin.
*/
if (cpu_freq > 990 && cpu_freq < 1010)
model_name = "Deerfield";
else
model_name = "Madison";
break;
case 0x02:
model_name = "Madison II";
break;
}
break;
case 0x20:
ia64_sync_icache_needed = 1;
family_name = "Itanium 2";
switch (model) {
case 0x00:
model_name = "Montecito";
break;
case 0x01:
model_name = "Montvale";
break;
}
break;
}
snprintf(cpu_family, sizeof(cpu_family), "%s", family_name);
snprintf(cpu_model, sizeof(cpu_model), "%s", model_name);
features = ia64_get_cpuid(4);
printf("CPU: %s (", model_name);
if (cpu_freq)
printf("%u Mhz ", cpu_freq);
printf("%s)\n", family_name);
printf(" Origin = \"%s\" Revision = %d\n", vendor, revision);
printf(" Features = 0x%b\n", (u_int32_t) features,
"\020"
"\001LB" /* long branch (brl) instruction. */
"\002SD" /* Spontaneous deferral. */
"\003AO" /* 16-byte atomic operations (ld, st, cmpxchg). */ );
}
static void
cpu_startup(void *dummy)
{
char nodename[16];
struct pcpu *pc;
struct pcpu_stats *pcs;
/*
* Good {morning,afternoon,evening,night}.
*/
identifycpu();
#ifdef PERFMON
perfmon_init();
#endif
printf("real memory = %ld (%ld MB)\n", ia64_ptob(Maxmem),
ia64_ptob(Maxmem) / 1048576);
realmem = Maxmem;
/*
* Display any holes after the first chunk of extended memory.
*/
if (bootverbose) {
int indx;
printf("Physical memory chunk(s):\n");
for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
long size1 = phys_avail[indx + 1] - phys_avail[indx];
printf("0x%08lx - 0x%08lx, %ld bytes (%ld pages)\n",
phys_avail[indx], phys_avail[indx + 1] - 1, size1,
size1 >> PAGE_SHIFT);
}
}
vm_ksubmap_init(&kmi);
printf("avail memory = %ld (%ld MB)\n", ptoa(cnt.v_free_count),
ptoa(cnt.v_free_count) / 1048576);
if (fpswa_iface == NULL)
printf("Warning: no FPSWA package supplied\n");
else
printf("FPSWA Revision = 0x%lx, Entry = %p\n",
(long)fpswa_iface->if_rev, (void *)fpswa_iface->if_fpswa);
/*
* Set up buffers, so they can be used to read disk labels.
*/
bufinit();
vm_pager_bufferinit();
/*
* Traverse the MADT to discover IOSAPIC and Local SAPIC
* information.
*/
ia64_probe_sapics();
ia64_pib = pmap_mapdev(ia64_lapic_addr, sizeof(*ia64_pib));
ia64_mca_init();
/*
* Create sysctl tree for per-CPU information.
*/
STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
snprintf(nodename, sizeof(nodename), "%u", pc->pc_cpuid);
sysctl_ctx_init(&pc->pc_md.sysctl_ctx);
pc->pc_md.sysctl_tree = SYSCTL_ADD_NODE(&pc->pc_md.sysctl_ctx,
SYSCTL_STATIC_CHILDREN(_machdep_cpu), OID_AUTO, nodename,
CTLFLAG_RD, NULL, "");
if (pc->pc_md.sysctl_tree == NULL)
continue;
pcs = &pc->pc_md.stats;
SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
"nasts", CTLFLAG_RD, &pcs->pcs_nasts,
"Number of IPI_AST interrupts");
SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
"nclks", CTLFLAG_RD, &pcs->pcs_nclks,
"Number of clock interrupts");
SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
"nextints", CTLFLAG_RD, &pcs->pcs_nextints,
"Number of ExtINT interrupts");
SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
"nhardclocks", CTLFLAG_RD, &pcs->pcs_nhardclocks,
"Number of IPI_HARDCLOCK interrupts");
SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
"nhighfps", CTLFLAG_RD, &pcs->pcs_nhighfps,
"Number of IPI_HIGH_FP interrupts");
SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
"nhwints", CTLFLAG_RD, &pcs->pcs_nhwints,
"Number of hardware (device) interrupts");
SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
"npreempts", CTLFLAG_RD, &pcs->pcs_npreempts,
"Number of IPI_PREEMPT interrupts");
SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
"nrdvs", CTLFLAG_RD, &pcs->pcs_nrdvs,
"Number of IPI_RENDEZVOUS interrupts");
SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
"nstops", CTLFLAG_RD, &pcs->pcs_nstops,
"Number of IPI_STOP interrupts");
SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
"nstrays", CTLFLAG_RD, &pcs->pcs_nstrays,
"Number of stray interrupts");
}
}
SYSINIT(cpu_startup, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
void
cpu_flush_dcache(void *ptr, size_t len)
{
vm_offset_t lim, va;
va = (uintptr_t)ptr & ~31;
lim = (uintptr_t)ptr + len;
while (va < lim) {
ia64_fc(va);
va += 32;
}
ia64_srlz_d();
}
/* Get current clock frequency for the given cpu id. */
int
cpu_est_clockrate(int cpu_id, uint64_t *rate)
{
if (pcpu_find(cpu_id) == NULL || rate == NULL)
return (EINVAL);
*rate = (u_long)cpu_freq * 1000000ul;
return (0);
}
void
cpu_halt()
{
efi_reset_system();
}
void
cpu_idle(int busy)
{
register_t ie;
if (!busy) {
critical_enter();
cpu_idleclock();
}
ie = intr_disable();
KASSERT(ie != 0, ("%s called with interrupts disabled\n", __func__));
if (sched_runnable())
ia64_enable_intr();
else if (cpu_idle_hook != NULL) {
(*cpu_idle_hook)();
/* The hook must enable interrupts! */
} else {
ia64_call_pal_static(PAL_HALT_LIGHT, 0, 0, 0);
ia64_enable_intr();
}
if (!busy) {
cpu_activeclock();
critical_exit();
}
}
int
cpu_idle_wakeup(int cpu)
{
return (0);
}
void
cpu_reset()
{
efi_reset_system();
}
void
cpu_switch(struct thread *old, struct thread *new, struct mtx *mtx)
{
struct pcb *oldpcb, *newpcb;
oldpcb = old->td_pcb;
#ifdef COMPAT_FREEBSD32
ia32_savectx(oldpcb);
#endif
if (PCPU_GET(fpcurthread) == old)
old->td_frame->tf_special.psr |= IA64_PSR_DFH;
if (!savectx(oldpcb)) {
newpcb = new->td_pcb;
oldpcb->pcb_current_pmap =
pmap_switch(newpcb->pcb_current_pmap);
atomic_store_rel_ptr(&old->td_lock, mtx);
#if defined(SCHED_ULE) && defined(SMP)
while (atomic_load_acq_ptr(&new->td_lock) == &blocked_lock)
cpu_spinwait();
#endif
PCPU_SET(curthread, new);
#ifdef COMPAT_FREEBSD32
ia32_restorectx(newpcb);
#endif
if (PCPU_GET(fpcurthread) == new)
new->td_frame->tf_special.psr &= ~IA64_PSR_DFH;
restorectx(newpcb);
/* We should not get here. */
panic("cpu_switch: restorectx() returned");
/* NOTREACHED */
}
}
void
cpu_throw(struct thread *old __unused, struct thread *new)
{
struct pcb *newpcb;
newpcb = new->td_pcb;
(void)pmap_switch(newpcb->pcb_current_pmap);
#if defined(SCHED_ULE) && defined(SMP)
while (atomic_load_acq_ptr(&new->td_lock) == &blocked_lock)
cpu_spinwait();
#endif
PCPU_SET(curthread, new);
#ifdef COMPAT_FREEBSD32
ia32_restorectx(newpcb);
#endif
restorectx(newpcb);
/* We should not get here. */
panic("cpu_throw: restorectx() returned");
/* NOTREACHED */
}
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{
/*
* Set pc_acpi_id to "uninitialized".
* See sys/dev/acpica/acpi_cpu.c
*/
pcpu->pc_acpi_id = 0xffffffff;
}
void
spinlock_enter(void)
{
struct thread *td;
int intr;
td = curthread;
if (td->td_md.md_spinlock_count == 0) {
intr = intr_disable();
td->td_md.md_spinlock_count = 1;
td->td_md.md_saved_intr = intr;
} else
td->td_md.md_spinlock_count++;
critical_enter();
}
void
spinlock_exit(void)
{
struct thread *td;
int intr;
td = curthread;
critical_exit();
intr = td->td_md.md_saved_intr;
td->td_md.md_spinlock_count--;
if (td->td_md.md_spinlock_count == 0)
intr_restore(intr);
}
void
map_vhpt(uintptr_t vhpt)
{
pt_entry_t pte;
uint64_t psr;
pte = PTE_PRESENT | PTE_MA_WB | PTE_ACCESSED | PTE_DIRTY |
PTE_PL_KERN | PTE_AR_RW;
pte |= vhpt & PTE_PPN_MASK;
__asm __volatile("ptr.d %0,%1" :: "r"(vhpt),
"r"(pmap_vhpt_log2size << 2));
__asm __volatile("mov %0=psr" : "=r"(psr));
__asm __volatile("rsm psr.ic|psr.i");
ia64_srlz_i();
ia64_set_ifa(vhpt);
ia64_set_itir(pmap_vhpt_log2size << 2);
ia64_srlz_d();
__asm __volatile("itr.d dtr[%0]=%1" :: "r"(3), "r"(pte));
__asm __volatile("mov psr.l=%0" :: "r" (psr));
ia64_srlz_i();
}
void
map_pal_code(void)
{
pt_entry_t pte;
vm_offset_t va;
vm_size_t sz;
uint64_t psr;
u_int shft;
if (ia64_pal_size == 0)
return;
va = IA64_PHYS_TO_RR7(ia64_pal_base);
sz = ia64_pal_size;
shft = 0;
while (sz > 1) {
shft++;
sz >>= 1;
}
pte = PTE_PRESENT | PTE_MA_WB | PTE_ACCESSED | PTE_DIRTY |
PTE_PL_KERN | PTE_AR_RWX;
pte |= ia64_pal_base & PTE_PPN_MASK;
__asm __volatile("ptr.d %0,%1; ptr.i %0,%1" :: "r"(va), "r"(shft<<2));
__asm __volatile("mov %0=psr" : "=r"(psr));
__asm __volatile("rsm psr.ic|psr.i");
ia64_srlz_i();
ia64_set_ifa(va);
ia64_set_itir(shft << 2);
ia64_srlz_d();
__asm __volatile("itr.d dtr[%0]=%1" :: "r"(4), "r"(pte));
ia64_srlz_d();
__asm __volatile("itr.i itr[%0]=%1" :: "r"(1), "r"(pte));
__asm __volatile("mov psr.l=%0" :: "r" (psr));
ia64_srlz_i();
}
void
map_gateway_page(void)
{
pt_entry_t pte;
uint64_t psr;
pte = PTE_PRESENT | PTE_MA_WB | PTE_ACCESSED | PTE_DIRTY |
PTE_PL_KERN | PTE_AR_X_RX;
pte |= ia64_tpa((uint64_t)ia64_gateway_page) & PTE_PPN_MASK;
__asm __volatile("ptr.d %0,%1; ptr.i %0,%1" ::
"r"(VM_MAXUSER_ADDRESS), "r"(PAGE_SHIFT << 2));
__asm __volatile("mov %0=psr" : "=r"(psr));
__asm __volatile("rsm psr.ic|psr.i");
ia64_srlz_i();
ia64_set_ifa(VM_MAXUSER_ADDRESS);
ia64_set_itir(PAGE_SHIFT << 2);
ia64_srlz_d();
__asm __volatile("itr.d dtr[%0]=%1" :: "r"(5), "r"(pte));
ia64_srlz_d();
__asm __volatile("itr.i itr[%0]=%1" :: "r"(2), "r"(pte));
__asm __volatile("mov psr.l=%0" :: "r" (psr));
ia64_srlz_i();
/* Expose the mapping to userland in ar.k5 */
ia64_set_k5(VM_MAXUSER_ADDRESS);
}
static u_int
freq_ratio(u_long base, u_long ratio)
{
u_long f;
f = (base * (ratio >> 32)) / (ratio & 0xfffffffful);
return ((f + 500000) / 1000000);
}
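/*
 * Worked example (illustrative numbers): with a platform clock of
 * base = 200000000 Hz and a PAL ratio of 7/1 packed as
 * ratio = (7 << 32) | 1, freq_ratio() computes
 *
 *	f = (200000000 * 7) / 1 = 1400000000
 *	return (1400000000 + 500000) / 1000000 = 1400
 *
 * i.e. the result is the frequency in MHz, rounded to the nearest MHz,
 * which is what calculate_frequencies() stores in cpu_freq, bus_freq and
 * itc_freq.
 */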
static void
calculate_frequencies(void)
{
struct ia64_sal_result sal;
struct ia64_pal_result pal;
register_t ie;
ie = intr_disable();
sal = ia64_sal_entry(SAL_FREQ_BASE, 0, 0, 0, 0, 0, 0, 0);
pal = ia64_call_pal_static(PAL_FREQ_RATIOS, 0, 0, 0);
intr_restore(ie);
if (sal.sal_status == 0 && pal.pal_status == 0) {
if (bootverbose) {
printf("Platform clock frequency %ld Hz\n",
sal.sal_result[0]);
printf("Processor ratio %ld/%ld, Bus ratio %ld/%ld, "
"ITC ratio %ld/%ld\n",
pal.pal_result[0] >> 32,
pal.pal_result[0] & ((1L << 32) - 1),
pal.pal_result[1] >> 32,
pal.pal_result[1] & ((1L << 32) - 1),
pal.pal_result[2] >> 32,
pal.pal_result[2] & ((1L << 32) - 1));
}
cpu_freq = freq_ratio(sal.sal_result[0], pal.pal_result[0]);
bus_freq = freq_ratio(sal.sal_result[0], pal.pal_result[1]);
itc_freq = freq_ratio(sal.sal_result[0], pal.pal_result[2]);
}
}
struct ia64_init_return
ia64_init(void)
{
struct ia64_init_return ret;
int phys_avail_cnt;
vm_offset_t kernstart, kernend;
vm_offset_t kernstartpfn, kernendpfn, pfn0, pfn1;
char *p;
struct efi_md *md;
int metadata_missing;
/* NO OUTPUT ALLOWED UNTIL FURTHER NOTICE */
/*
* TODO: Disable interrupts, floating point etc.
* Maybe flush cache and tlb
*/
ia64_set_fpsr(IA64_FPSR_DEFAULT);
/*
* TODO: Get critical system information (if possible, from the
* information provided by the boot program).
*/
/*
* Look for the I/O ports first - we need them for console
* probing.
*/
for (md = efi_md_first(); md != NULL; md = efi_md_next(md)) {
switch (md->md_type) {
case EFI_MD_TYPE_IOPORT:
ia64_port_base = (uintptr_t)pmap_mapdev(md->md_phys,
md->md_pages * EFI_PAGE_SIZE);
break;
case EFI_MD_TYPE_PALCODE:
ia64_pal_size = md->md_pages * EFI_PAGE_SIZE;
ia64_pal_base = md->md_phys;
break;
}
}
metadata_missing = 0;
if (bootinfo->bi_modulep)
preload_metadata = (caddr_t)bootinfo->bi_modulep;
else
metadata_missing = 1;
if (envmode == 0 && bootinfo->bi_envp)
kern_envp = (caddr_t)bootinfo->bi_envp;
else
kern_envp = static_env;
/*
* Look at arguments passed to us and compute boothowto.
*/
boothowto = bootinfo->bi_boothowto;
if (boothowto & RB_VERBOSE)
bootverbose = 1;
/*
* Find the beginning and end of the kernel.
*/
kernstart = trunc_page(kernel_text);
#ifdef DDB
ksym_start = bootinfo->bi_symtab;
ksym_end = bootinfo->bi_esymtab;
kernend = (vm_offset_t)round_page(ksym_end);
#else
kernend = (vm_offset_t)round_page(_end);
#endif
/* But if the bootstrap tells us otherwise, believe it! */
if (bootinfo->bi_kernend)
kernend = round_page(bootinfo->bi_kernend);
/*
* Region 6 is direct mapped UC and region 7 is direct mapped
* WC. The details of this are controlled by the Alt {I,D}TLB
* handlers. Here we just make sure that they have the largest
* possible page size to minimise TLB usage.
*/
ia64_set_rr(IA64_RR_BASE(6), (6 << 8) | (PAGE_SHIFT << 2));
ia64_set_rr(IA64_RR_BASE(7), (7 << 8) | (PAGE_SHIFT << 2));
ia64_srlz_d();
/*
* Wire things up so we can call the firmware.
*/
map_pal_code();
efi_boot_minimal(bootinfo->bi_systab);
ia64_xiv_init();
ia64_sal_init();
calculate_frequencies();
set_cputicker(ia64_get_itc, (u_long)itc_freq * 1000000, 0);
/*
* Setup the PCPU data for the bootstrap processor. It is needed
* by printf(). Also, since printf() has critical sections, we
* need to initialize at least pc_curthread.
*/
pcpup = &pcpu0;
ia64_set_k4((u_int64_t)pcpup);
pcpu_init(pcpup, 0, sizeof(pcpu0));
dpcpu_init((void *)kernend, 0);
PCPU_SET(md.lid, ia64_get_lid());
kernend += DPCPU_SIZE;
PCPU_SET(curthread, &thread0);
/*
* Initialize the console before we print anything out.
*/
cninit();
/* OUTPUT NOW ALLOWED */
if (metadata_missing)
printf("WARNING: loader(8) metadata is missing!\n");
/* Get FPSWA interface */
fpswa_iface = (bootinfo->bi_fpswa == 0) ? NULL :
(struct fpswa_iface *)IA64_PHYS_TO_RR7(bootinfo->bi_fpswa);
/* Init basic tunables, including hz */
init_param1();
p = getenv("kernelname");
if (p != NULL) {
strlcpy(kernelname, p, sizeof(kernelname));
freeenv(p);
}
kernstartpfn = atop(IA64_RR_MASK(kernstart));
kernendpfn = atop(IA64_RR_MASK(kernend));
/*
* Size the memory regions and load phys_avail[] with the results.
*/
/*
* Find out how much memory is available, by looking at
* the memory descriptors.
*/
#ifdef DEBUG_MD
printf("Memory descriptor count: %d\n", mdcount);
#endif
phys_avail_cnt = 0;
for (md = efi_md_first(); md != NULL; md = efi_md_next(md)) {
#ifdef DEBUG_MD
printf("MD %p: type %d pa 0x%lx cnt 0x%lx\n", md,
md->md_type, md->md_phys, md->md_pages);
#endif
pfn0 = ia64_btop(round_page(md->md_phys));
pfn1 = ia64_btop(trunc_page(md->md_phys + md->md_pages * 4096));
if (pfn1 <= pfn0)
continue;
if (md->md_type != EFI_MD_TYPE_FREE)
continue;
/*
* We have a memory descriptor that describes conventional
* memory that is for general use. We must determine if the
* loader has put the kernel in this region.
*/
physmem += (pfn1 - pfn0);
if (pfn0 <= kernendpfn && kernstartpfn <= pfn1) {
/*
* Must compute the location of the kernel
* within the segment.
*/
#ifdef DEBUG_MD
printf("Descriptor %p contains kernel\n", mp);
#endif
if (pfn0 < kernstartpfn) {
/*
* There is a chunk before the kernel.
*/
#ifdef DEBUG_MD
printf("Loading chunk before kernel: "
"0x%lx / 0x%lx\n", pfn0, kernstartpfn);
#endif
phys_avail[phys_avail_cnt] = ia64_ptob(pfn0);
phys_avail[phys_avail_cnt+1] = ia64_ptob(kernstartpfn);
phys_avail_cnt += 2;
}
if (kernendpfn < pfn1) {
/*
* There is a chunk after the kernel.
*/
#ifdef DEBUG_MD
printf("Loading chunk after kernel: "
"0x%lx / 0x%lx\n", kernendpfn, pfn1);
#endif
phys_avail[phys_avail_cnt] = ia64_ptob(kernendpfn);
phys_avail[phys_avail_cnt+1] = ia64_ptob(pfn1);
phys_avail_cnt += 2;
}
} else {
/*
* Just load this cluster as one chunk.
*/
#ifdef DEBUG_MD
printf("Loading descriptor %d: 0x%lx / 0x%lx\n", i,
pfn0, pfn1);
#endif
phys_avail[phys_avail_cnt] = ia64_ptob(pfn0);
phys_avail[phys_avail_cnt+1] = ia64_ptob(pfn1);
phys_avail_cnt += 2;
}
}
phys_avail[phys_avail_cnt] = 0;
Maxmem = physmem;
init_param2(physmem);
/*
* Initialize error message buffer (at end of core).
*/
msgbufp = (struct msgbuf *)pmap_steal_memory(msgbufsize);
msgbufinit(msgbufp, msgbufsize);
proc_linkup0(&proc0, &thread0);
/*
* Init mapping for kernel stack for proc 0
*/
thread0.td_kstack = pmap_steal_memory(KSTACK_PAGES * PAGE_SIZE);
thread0.td_kstack_pages = KSTACK_PAGES;
mutex_init();
/*
* Initialize the rest of proc 0's PCB.
*
* Set the kernel sp, reserving space for an (empty) trapframe,
* and make proc0's trapframe pointer point to it for sanity.
* Initialise proc0's backing store to start after u area.
*/
cpu_thread_alloc(&thread0);
thread0.td_frame->tf_flags = FRAME_SYSCALL;
thread0.td_pcb->pcb_special.sp =
(u_int64_t)thread0.td_frame - 16;
thread0.td_pcb->pcb_special.bspstore = thread0.td_kstack;
/*
* Initialize the virtual memory system.
*/
pmap_bootstrap();
/*
* Initialize debuggers, and break into them if appropriate.
*/
kdb_init();
#ifdef KDB
if (boothowto & RB_KDB)
kdb_enter(KDB_WHY_BOOTFLAGS,
"Boot flags requested debugger\n");
#endif
ia64_set_tpr(0);
ia64_srlz_d();
ret.bspstore = thread0.td_pcb->pcb_special.bspstore;
ret.sp = thread0.td_pcb->pcb_special.sp;
return (ret);
}
uint64_t
ia64_get_hcdp(void)
{
return (bootinfo->bi_hcdp);
}
void
bzero(void *buf, size_t len)
{
caddr_t p = buf;
while (((vm_offset_t) p & (sizeof(u_long) - 1)) && len) {
*p++ = 0;
len--;
}
while (len >= sizeof(u_long) * 8) {
*(u_long*) p = 0;
*((u_long*) p + 1) = 0;
*((u_long*) p + 2) = 0;
*((u_long*) p + 3) = 0;
len -= sizeof(u_long) * 8;
*((u_long*) p + 4) = 0;
*((u_long*) p + 5) = 0;
*((u_long*) p + 6) = 0;
*((u_long*) p + 7) = 0;
p += sizeof(u_long) * 8;
}
while (len >= sizeof(u_long)) {
*(u_long*) p = 0;
len -= sizeof(u_long);
p += sizeof(u_long);
}
while (len) {
*p++ = 0;
len--;
}
}
u_int
ia64_itc_freq(void)
{
return (itc_freq);
}
void
DELAY(int n)
{
u_int64_t start, end, now;
sched_pin();
start = ia64_get_itc();
end = start + itc_freq * n;
/* printf("DELAY from 0x%lx to 0x%lx\n", start, end); */
do {
now = ia64_get_itc();
} while (now < end || (now > start && end < start));
sched_unpin();
}
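/*
 * Illustrative note: itc_freq is kept in MHz (see freq_ratio() above),
 * i.e. ITC ticks per microsecond, so DELAY(n) busy-waits roughly n
 * microseconds.  For example, with itc_freq = 400 a DELAY(10) spins until
 * 4000 ITC ticks have elapsed; the second half of the loop condition
 * handles the case where "end" has wrapped around the 64-bit counter
 * while "now" has not yet done so.
 */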
/*
* Send an interrupt (signal) to a process.
*/
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
struct proc *p;
struct thread *td;
struct trapframe *tf;
struct sigacts *psp;
struct sigframe sf, *sfp;
u_int64_t sbs, sp;
int oonstack;
int sig;
u_long code;
td = curthread;
p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
sig = ksi->ksi_signo;
code = ksi->ksi_code;
psp = p->p_sigacts;
mtx_assert(&psp->ps_mtx, MA_OWNED);
tf = td->td_frame;
sp = tf->tf_special.sp;
oonstack = sigonstack(sp);
sbs = 0;
/* save user context */
bzero(&sf, sizeof(struct sigframe));
sf.sf_uc.uc_sigmask = *mask;
sf.sf_uc.uc_stack = td->td_sigstk;
sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
/*
* Allocate and validate space for the signal handler
* context. Note that if the stack is in P0 space, the
* call to grow() is a nop, and the useracc() check
* will fail if the process has not already allocated
* the space with a `brk'.
*/
if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
SIGISMEMBER(psp->ps_sigonstack, sig)) {
sbs = (u_int64_t)td->td_sigstk.ss_sp;
sbs = (sbs + 15) & ~15;
sfp = (struct sigframe *)(sbs + td->td_sigstk.ss_size);
#if defined(COMPAT_43)
td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
} else
sfp = (struct sigframe *)sp;
sfp = (struct sigframe *)((u_int64_t)(sfp - 1) & ~15);
/* Fill in the siginfo structure for POSIX handlers. */
if (SIGISMEMBER(psp->ps_siginfo, sig)) {
sf.sf_si = ksi->ksi_info;
sf.sf_si.si_signo = sig;
/*
* XXX this shouldn't be here after code in trap.c
* is fixed
*/
sf.sf_si.si_addr = (void*)tf->tf_special.ifa;
code = (u_int64_t)&sfp->sf_si;
}
mtx_unlock(&psp->ps_mtx);
PROC_UNLOCK(p);
get_mcontext(td, &sf.sf_uc.uc_mcontext, 0);
/* Copy the frame out to userland. */
if (copyout(&sf, sfp, sizeof(sf)) != 0) {
/*
* Process has trashed its stack; give it an illegal
* instruction to halt it in its tracks.
*/
PROC_LOCK(p);
sigexit(td, SIGILL);
return;
}
if ((tf->tf_flags & FRAME_SYSCALL) == 0) {
tf->tf_special.psr &= ~IA64_PSR_RI;
tf->tf_special.iip = ia64_get_k5() +
((uint64_t)break_sigtramp - (uint64_t)ia64_gateway_page);
} else
tf->tf_special.iip = ia64_get_k5() +
((uint64_t)epc_sigtramp - (uint64_t)ia64_gateway_page);
/*
* Setup the trapframe to return to the signal trampoline. We pass
* information to the trampoline in the following registers:
*
* gp new backing store or NULL
* r8 signal number
* r9 signal code or siginfo pointer
* r10 signal handler (function descriptor)
*/
tf->tf_special.sp = (u_int64_t)sfp - 16;
tf->tf_special.gp = sbs;
tf->tf_special.bspstore = sf.sf_uc.uc_mcontext.mc_special.bspstore;
tf->tf_special.ndirty = 0;
tf->tf_special.rnat = sf.sf_uc.uc_mcontext.mc_special.rnat;
tf->tf_scratch.gr8 = sig;
tf->tf_scratch.gr9 = code;
tf->tf_scratch.gr10 = (u_int64_t)catcher;
PROC_LOCK(p);
mtx_lock(&psp->ps_mtx);
}
/*
* System call to cleanup state after a signal
* has been taken. Reset signal mask and
* stack state from context left by sendsig (above).
* Return to previous pc and psl as specified by
* context left by sendsig. Check carefully to
* make sure that the user has not modified the
* state to gain improper privileges.
*
* MPSAFE
*/
int
-sigreturn(struct thread *td,
+sys_sigreturn(struct thread *td,
struct sigreturn_args /* {
ucontext_t *sigcntxp;
} */ *uap)
{
ucontext_t uc;
struct trapframe *tf;
struct pcb *pcb;
tf = td->td_frame;
pcb = td->td_pcb;
/*
* Fetch the entire context structure at once for speed.
* We don't use a normal argument to simplify RSE handling.
*/
if (copyin(uap->sigcntxp, (caddr_t)&uc, sizeof(uc)))
return (EFAULT);
set_mcontext(td, &uc.uc_mcontext);
#if defined(COMPAT_43)
if (sigonstack(tf->tf_special.sp))
td->td_sigstk.ss_flags |= SS_ONSTACK;
else
td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif
kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0);
return (EJUSTRETURN);
}
#ifdef COMPAT_FREEBSD4
int
freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
{
- return sigreturn(td, (struct sigreturn_args *)uap);
+ return sys_sigreturn(td, (struct sigreturn_args *)uap);
}
#endif
/*
* Construct a PCB from a trapframe. This is called from kdb_trap() where
* we want to start a backtrace from the function that caused us to enter
* the debugger. We have the context in the trapframe, but base the trace
* on the PCB. The PCB doesn't have to be perfect, as long as it contains
* enough for a backtrace.
*/
void
makectx(struct trapframe *tf, struct pcb *pcb)
{
pcb->pcb_special = tf->tf_special;
pcb->pcb_special.__spare = ~0UL; /* XXX see unwind.c */
save_callee_saved(&pcb->pcb_preserved);
save_callee_saved_fp(&pcb->pcb_preserved_fp);
}
int
ia64_flush_dirty(struct thread *td, struct _special *r)
{
struct iovec iov;
struct uio uio;
uint64_t bspst, kstk, rnat;
int error, locked;
if (r->ndirty == 0)
return (0);
kstk = td->td_kstack + (r->bspstore & 0x1ffUL);
if (td == curthread) {
__asm __volatile("mov ar.rsc=0;;");
__asm __volatile("mov %0=ar.bspstore" : "=r"(bspst));
/* Make sure we have all the user registers written out. */
if (bspst - kstk < r->ndirty) {
__asm __volatile("flushrs;;");
__asm __volatile("mov %0=ar.bspstore" : "=r"(bspst));
}
__asm __volatile("mov %0=ar.rnat;;" : "=r"(rnat));
__asm __volatile("mov ar.rsc=3");
error = copyout((void*)kstk, (void*)r->bspstore, r->ndirty);
kstk += r->ndirty;
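/*
 * The RSE stores a NaT collection word at every backing store
 * address whose low 9 bits are 0x1f8.  If the kernel bspstore
 * crossed such a boundary past kstk, the NaT bits covering the
 * flushed registers are already in memory at kstk | 0x1f8;
 * otherwise they are still in ar.rnat.
 */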
r->rnat = (bspst > kstk && (bspst & 0x1ffL) < (kstk & 0x1ffL))
? *(uint64_t*)(kstk | 0x1f8L) : rnat;
} else {
locked = PROC_LOCKED(td->td_proc);
if (!locked)
PHOLD(td->td_proc);
iov.iov_base = (void*)(uintptr_t)kstk;
iov.iov_len = r->ndirty;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = r->bspstore;
uio.uio_resid = r->ndirty;
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_rw = UIO_WRITE;
uio.uio_td = td;
error = proc_rwmem(td->td_proc, &uio);
/*
* XXX proc_rwmem() doesn't currently return ENOSPC,
* so I think it can bogusly return 0. Neither do
* we allow short writes.
*/
if (uio.uio_resid != 0 && error == 0)
error = ENOSPC;
if (!locked)
PRELE(td->td_proc);
}
r->bspstore += r->ndirty;
r->ndirty = 0;
return (error);
}
int
get_mcontext(struct thread *td, mcontext_t *mc, int flags)
{
struct trapframe *tf;
int error;
tf = td->td_frame;
bzero(mc, sizeof(*mc));
mc->mc_special = tf->tf_special;
error = ia64_flush_dirty(td, &mc->mc_special);
if (tf->tf_flags & FRAME_SYSCALL) {
mc->mc_flags |= _MC_FLAGS_SYSCALL_CONTEXT;
mc->mc_scratch = tf->tf_scratch;
if (flags & GET_MC_CLEAR_RET) {
mc->mc_scratch.gr8 = 0;
mc->mc_scratch.gr9 = 0;
mc->mc_scratch.gr10 = 0;
mc->mc_scratch.gr11 = 0;
}
} else {
mc->mc_flags |= _MC_FLAGS_ASYNC_CONTEXT;
mc->mc_scratch = tf->tf_scratch;
mc->mc_scratch_fp = tf->tf_scratch_fp;
/*
* XXX If the thread never used the high FP registers, we
* probably shouldn't waste time saving them.
*/
ia64_highfp_save(td);
mc->mc_flags |= _MC_FLAGS_HIGHFP_VALID;
mc->mc_high_fp = td->td_pcb->pcb_high_fp;
}
save_callee_saved(&mc->mc_preserved);
save_callee_saved_fp(&mc->mc_preserved_fp);
return (error);
}
int
set_mcontext(struct thread *td, const mcontext_t *mc)
{
struct _special s;
struct trapframe *tf;
uint64_t psrmask;
tf = td->td_frame;
KASSERT((tf->tf_special.ndirty & ~PAGE_MASK) == 0,
("Whoa there! We have more than 8KB of dirty registers!"));
s = mc->mc_special;
/*
* Only copy the user mask and the restart instruction bit from
* the new context.
*/
psrmask = IA64_PSR_BE | IA64_PSR_UP | IA64_PSR_AC | IA64_PSR_MFL |
IA64_PSR_MFH | IA64_PSR_RI;
s.psr = (tf->tf_special.psr & ~psrmask) | (s.psr & psrmask);
/* We don't have any dirty registers of the new context. */
s.ndirty = 0;
if (mc->mc_flags & _MC_FLAGS_ASYNC_CONTEXT) {
/*
* We can get an async context passed to us even though we
* entered the kernel through a syscall: sigreturn(2)
* takes contexts that could previously be the result of
* a trap or interrupt.
* Hence, we cannot assert that the trapframe is not
* a syscall frame, but we can assert that it's at
* least an expected syscall.
*/
if (tf->tf_flags & FRAME_SYSCALL) {
KASSERT(tf->tf_scratch.gr15 == SYS_sigreturn, ("foo"));
tf->tf_flags &= ~FRAME_SYSCALL;
}
tf->tf_scratch = mc->mc_scratch;
tf->tf_scratch_fp = mc->mc_scratch_fp;
if (mc->mc_flags & _MC_FLAGS_HIGHFP_VALID)
td->td_pcb->pcb_high_fp = mc->mc_high_fp;
} else {
KASSERT((tf->tf_flags & FRAME_SYSCALL) != 0, ("foo"));
if ((mc->mc_flags & _MC_FLAGS_SYSCALL_CONTEXT) == 0) {
s.cfm = tf->tf_special.cfm;
s.iip = tf->tf_special.iip;
tf->tf_scratch.gr15 = 0; /* Clear syscall nr. */
} else
tf->tf_scratch = mc->mc_scratch;
}
tf->tf_special = s;
restore_callee_saved(&mc->mc_preserved);
restore_callee_saved_fp(&mc->mc_preserved_fp);
return (0);
}
/*
* Clear registers on exec.
*/
void
exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
struct trapframe *tf;
uint64_t *ksttop, *kst;
tf = td->td_frame;
ksttop = (uint64_t*)(td->td_kstack + tf->tf_special.ndirty +
(tf->tf_special.bspstore & 0x1ffUL));
/*
* We can ignore up to 8KB of dirty registers by masking off the
* lower 13 bits in exception_restore() or epc_syscall(). This
* should be enough for a couple of years, but if there are more
* than 8KB of dirty registers, we lose track of the bottom of
* the kernel stack. The solution is to copy the active part of
* the kernel stack down 1 page (or 2, but not more than that)
* so that we always have less than 8KB of dirty registers.
*/
KASSERT((tf->tf_special.ndirty & ~PAGE_MASK) == 0,
("Whoa there! We have more than 8KB of dirty registers!"));
bzero(&tf->tf_special, sizeof(tf->tf_special));
if ((tf->tf_flags & FRAME_SYSCALL) == 0) { /* break syscalls. */
bzero(&tf->tf_scratch, sizeof(tf->tf_scratch));
bzero(&tf->tf_scratch_fp, sizeof(tf->tf_scratch_fp));
tf->tf_special.cfm = (1UL<<63) | (3UL<<7) | 3UL;
tf->tf_special.bspstore = IA64_BACKINGSTORE;
/*
* Copy the arguments onto the kernel register stack so that
* they get loaded by the loadrs instruction. Skip over the
* NaT collection points.
*/
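/*
 * Every 64th slot of the backing store (an address whose low
 * 9 bits are 0x1f8) holds a NaT collection word rather than a
 * register, hence the checks below.
 */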
kst = ksttop - 1;
if (((uintptr_t)kst & 0x1ff) == 0x1f8)
*kst-- = 0;
*kst-- = 0;
if (((uintptr_t)kst & 0x1ff) == 0x1f8)
*kst-- = 0;
*kst-- = imgp->ps_strings;
if (((uintptr_t)kst & 0x1ff) == 0x1f8)
*kst-- = 0;
*kst = stack;
tf->tf_special.ndirty = (ksttop - kst) << 3;
} else { /* epc syscalls (default). */
tf->tf_special.cfm = (3UL<<62) | (3UL<<7) | 3UL;
tf->tf_special.bspstore = IA64_BACKINGSTORE + 24;
/*
* Write values for out0, out1 and out2 to the user's backing
* store and arrange for them to be restored into the user's
* initial register frame.
* Assumes that (bspstore & 0x1f8) < 0x1e0.
*/
suword((caddr_t)tf->tf_special.bspstore - 24, stack);
suword((caddr_t)tf->tf_special.bspstore - 16, imgp->ps_strings);
suword((caddr_t)tf->tf_special.bspstore - 8, 0);
}
tf->tf_special.iip = imgp->entry_addr;
tf->tf_special.sp = (stack & ~15) - 16;
tf->tf_special.rsc = 0xf;
tf->tf_special.fpsr = IA64_FPSR_DEFAULT;
tf->tf_special.psr = IA64_PSR_IC | IA64_PSR_I | IA64_PSR_IT |
IA64_PSR_DT | IA64_PSR_RT | IA64_PSR_DFH | IA64_PSR_BN |
IA64_PSR_CPL_USER;
}
int
ptrace_set_pc(struct thread *td, unsigned long addr)
{
uint64_t slot;
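/*
 * ia64 instructions come in 16-byte bundles of three slots; the
 * low bits of the address select the slot, and psr.ri must be
 * set to match so that execution resumes at that slot.
 */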
switch (addr & 0xFUL) {
case 0:
slot = IA64_PSR_RI_0;
break;
case 1:
/* XXX we need to deal with MLX bundles here */
slot = IA64_PSR_RI_1;
break;
case 2:
slot = IA64_PSR_RI_2;
break;
default:
return (EINVAL);
}
td->td_frame->tf_special.iip = addr & ~0x0FULL;
td->td_frame->tf_special.psr =
(td->td_frame->tf_special.psr & ~IA64_PSR_RI) | slot;
return (0);
}
int
ptrace_single_step(struct thread *td)
{
struct trapframe *tf;
/*
* There's no way to set single stepping when we're leaving the
* kernel through the EPC syscall path. The way we solve this is
* by enabling the lower-privilege trap so that we re-enter the
* kernel as soon as the privilege level changes. See trap.c for
* how we proceed from there.
*/
tf = td->td_frame;
if (tf->tf_flags & FRAME_SYSCALL)
tf->tf_special.psr |= IA64_PSR_LP;
else
tf->tf_special.psr |= IA64_PSR_SS;
return (0);
}
int
ptrace_clear_single_step(struct thread *td)
{
struct trapframe *tf;
/*
* Clear any and all status bits we may use to implement single
* stepping.
*/
tf = td->td_frame;
tf->tf_special.psr &= ~IA64_PSR_SS;
tf->tf_special.psr &= ~IA64_PSR_LP;
tf->tf_special.psr &= ~IA64_PSR_TB;
return (0);
}
int
fill_regs(struct thread *td, struct reg *regs)
{
struct trapframe *tf;
tf = td->td_frame;
regs->r_special = tf->tf_special;
regs->r_scratch = tf->tf_scratch;
save_callee_saved(&regs->r_preserved);
return (0);
}
int
set_regs(struct thread *td, struct reg *regs)
{
struct trapframe *tf;
int error;
tf = td->td_frame;
error = ia64_flush_dirty(td, &tf->tf_special);
if (!error) {
tf->tf_special = regs->r_special;
tf->tf_special.bspstore += tf->tf_special.ndirty;
tf->tf_special.ndirty = 0;
tf->tf_scratch = regs->r_scratch;
restore_callee_saved(&regs->r_preserved);
}
return (error);
}
int
fill_dbregs(struct thread *td, struct dbreg *dbregs)
{
return (ENOSYS);
}
int
set_dbregs(struct thread *td, struct dbreg *dbregs)
{
return (ENOSYS);
}
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{
struct trapframe *frame = td->td_frame;
struct pcb *pcb = td->td_pcb;
/* Save the high FP registers. */
ia64_highfp_save(td);
fpregs->fpr_scratch = frame->tf_scratch_fp;
save_callee_saved_fp(&fpregs->fpr_preserved);
fpregs->fpr_high = pcb->pcb_high_fp;
return (0);
}
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{
struct trapframe *frame = td->td_frame;
struct pcb *pcb = td->td_pcb;
/* Throw away the high FP registers (should be redundant). */
ia64_highfp_drop(td);
frame->tf_scratch_fp = fpregs->fpr_scratch;
restore_callee_saved_fp(&fpregs->fpr_preserved);
pcb->pcb_high_fp = fpregs->fpr_high;
return (0);
}
void
ia64_sync_icache(vm_offset_t va, vm_offset_t sz)
{
vm_offset_t lim;
if (!ia64_sync_icache_needed)
return;
lim = va + sz;
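/*
 * fc.i flushes the cache line containing va; 32 bytes is
 * presumably the smallest line size we can rely on, hence the
 * conservative stride (see the XXX below).
 */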
while (va < lim) {
ia64_fc_i(va);
va += 32; /* XXX */
}
ia64_sync_i();
ia64_srlz_i();
}
Index: head/sys/kern/init_main.c
===================================================================
--- head/sys/kern/init_main.c (revision 225616)
+++ head/sys/kern/init_main.c (revision 225617)
@@ -1,832 +1,832 @@
/*-
* Copyright (c) 1995 Terrence R. Lambert
* All rights reserved.
*
* Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)init_main.c 8.9 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ddb.h"
#include "opt_init_path.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/exec.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/jail.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/loginclass.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/proc.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/systm.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/sysent.h>
#include <sys/reboot.h>
#include <sys/sched.h>
#include <sys/sx.h>
#include <sys/sysproto.h>
#include <sys/vmmeter.h>
#include <sys/unistd.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/cpuset.h>
#include <machine/cpu.h>
#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <sys/copyright.h>
#include <ddb/ddb.h>
#include <ddb/db_sym.h>
void mi_startup(void); /* Should be elsewhere */
/* Components of the first process -- never freed. */
static struct session session0;
static struct pgrp pgrp0;
struct proc proc0;
struct thread thread0 __aligned(16);
struct vmspace vmspace0;
struct proc *initproc;
int boothowto = 0; /* initialized so that it can be patched */
SYSCTL_INT(_debug, OID_AUTO, boothowto, CTLFLAG_RD, &boothowto, 0,
"Boot control flags, passed from loader");
int bootverbose;
SYSCTL_INT(_debug, OID_AUTO, bootverbose, CTLFLAG_RW, &bootverbose, 0,
"Control the output of verbose kernel messages");
/*
* This ensures that there is at least one entry so that the sysinit_set
* symbol is not undefined. A subsystem ID of SI_SUB_DUMMY is never
* executed.
*/
SYSINIT(placeholder, SI_SUB_DUMMY, SI_ORDER_ANY, NULL, NULL);
/*
* The sysinit table itself. Items are checked off as they are run.
* If we want to register new sysinit types, add them to newsysinit.
*/
SET_DECLARE(sysinit_set, struct sysinit);
struct sysinit **sysinit, **sysinit_end;
struct sysinit **newsysinit, **newsysinit_end;
/*
* Merge a new sysinit set into the current set, reallocating it if
* necessary. This can only be called after malloc is running.
*/
void
sysinit_add(struct sysinit **set, struct sysinit **set_end)
{
struct sysinit **newset;
struct sysinit **sipp;
struct sysinit **xipp;
int count;
count = set_end - set;
if (newsysinit)
count += newsysinit_end - newsysinit;
else
count += sysinit_end - sysinit;
newset = malloc(count * sizeof(*sipp), M_TEMP, M_NOWAIT);
if (newset == NULL)
panic("cannot malloc for sysinit");
xipp = newset;
if (newsysinit)
for (sipp = newsysinit; sipp < newsysinit_end; sipp++)
*xipp++ = *sipp;
else
for (sipp = sysinit; sipp < sysinit_end; sipp++)
*xipp++ = *sipp;
for (sipp = set; sipp < set_end; sipp++)
*xipp++ = *sipp;
if (newsysinit)
free(newsysinit, M_TEMP);
newsysinit = newset;
newsysinit_end = newset + count;
}
/*
* System startup; initialize the world, create process 0, mount root
* filesystem, and fork to create init and pagedaemon. Most of the
* hard work is done in the lower-level initialization routines including
* startup(), which does memory initialization and autoconfiguration.
*
* This allows simple addition of new kernel subsystems that require
* boot time initialization. It also allows substitution of subsystem
* (for instance, a scheduler, kernel profiler, or VM system) by object
* module. Finally, it allows for optional "kernel threads".
*/
void
mi_startup(void)
{
register struct sysinit **sipp; /* system initialization*/
register struct sysinit **xipp; /* interior loop of sort*/
register struct sysinit *save; /* bubble*/
#if defined(VERBOSE_SYSINIT)
int last;
int verbose;
#endif
if (boothowto & RB_VERBOSE)
bootverbose++;
if (sysinit == NULL) {
sysinit = SET_BEGIN(sysinit_set);
sysinit_end = SET_LIMIT(sysinit_set);
}
restart:
/*
* Perform a bubble sort of the system initialization objects by
* their subsystem (primary key) and order (secondary key).
*/
for (sipp = sysinit; sipp < sysinit_end; sipp++) {
for (xipp = sipp + 1; xipp < sysinit_end; xipp++) {
if ((*sipp)->subsystem < (*xipp)->subsystem ||
((*sipp)->subsystem == (*xipp)->subsystem &&
(*sipp)->order <= (*xipp)->order))
continue; /* skip*/
save = *sipp;
*sipp = *xipp;
*xipp = save;
}
}
#if defined(VERBOSE_SYSINIT)
last = SI_SUB_COPYRIGHT;
verbose = 0;
#if !defined(DDB)
printf("VERBOSE_SYSINIT: DDB not enabled, symbol lookups disabled.\n");
#endif
#endif
/*
* Traverse the (now) ordered list of system initialization tasks.
* Perform each task, and continue on to the next task.
*
* The last item on the list is expected to be the scheduler,
* which will not return.
*/
for (sipp = sysinit; sipp < sysinit_end; sipp++) {
if ((*sipp)->subsystem == SI_SUB_DUMMY)
continue; /* skip dummy task(s)*/
if ((*sipp)->subsystem == SI_SUB_DONE)
continue;
#if defined(VERBOSE_SYSINIT)
if ((*sipp)->subsystem > last) {
verbose = 1;
last = (*sipp)->subsystem;
printf("subsystem %x\n", last);
}
if (verbose) {
#if defined(DDB)
const char *name;
c_db_sym_t sym;
db_expr_t offset;
sym = db_search_symbol((vm_offset_t)(*sipp)->func,
DB_STGY_PROC, &offset);
db_symbol_values(sym, &name, NULL);
if (name != NULL)
printf(" %s(%p)... ", name, (*sipp)->udata);
else
#endif
printf(" %p(%p)... ", (*sipp)->func,
(*sipp)->udata);
}
#endif
/* Call function */
(*((*sipp)->func))((*sipp)->udata);
#if defined(VERBOSE_SYSINIT)
if (verbose)
printf("done.\n");
#endif
/* Check off the one we've just done */
(*sipp)->subsystem = SI_SUB_DONE;
/* Check if we've installed more sysinit items via KLD */
if (newsysinit != NULL) {
if (sysinit != SET_BEGIN(sysinit_set))
free(sysinit, M_TEMP);
sysinit = newsysinit;
sysinit_end = newsysinit_end;
newsysinit = NULL;
newsysinit_end = NULL;
goto restart;
}
}
panic("Shouldn't get here!");
/* NOTREACHED*/
}
/*
***************************************************************************
****
**** The following SYSINIT's belong elsewhere, but have not yet
**** been moved.
****
***************************************************************************
*/
static void
print_caddr_t(void *data)
{
printf("%s", (char *)data);
}
static void
print_version(void *data __unused)
{
int len;
/* Strip a trailing newline from version. */
len = strlen(version);
while (len > 0 && version[len - 1] == '\n')
len--;
printf("%.*s %s\n", len, version, machine);
}
SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t,
copyright);
SYSINIT(trademark, SI_SUB_COPYRIGHT, SI_ORDER_SECOND, print_caddr_t,
trademark);
SYSINIT(version, SI_SUB_COPYRIGHT, SI_ORDER_THIRD, print_version, NULL);
#ifdef WITNESS
static char wit_warn[] =
"WARNING: WITNESS option enabled, expect reduced performance.\n";
SYSINIT(witwarn, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 1,
print_caddr_t, wit_warn);
SYSINIT(witwarn2, SI_SUB_RUN_SCHEDULER, SI_ORDER_THIRD + 1,
print_caddr_t, wit_warn);
#endif
#ifdef DIAGNOSTIC
static char diag_warn[] =
"WARNING: DIAGNOSTIC option enabled, expect reduced performance.\n";
SYSINIT(diagwarn, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 2,
print_caddr_t, diag_warn);
SYSINIT(diagwarn2, SI_SUB_RUN_SCHEDULER, SI_ORDER_THIRD + 2,
print_caddr_t, diag_warn);
#endif
static int
null_fetch_syscall_args(struct thread *td __unused,
struct syscall_args *sa __unused)
{
panic("null_fetch_syscall_args");
}
static void
null_set_syscall_retval(struct thread *td __unused, int error __unused)
{
panic("null_set_syscall_retval");
}
struct sysentvec null_sysvec = {
.sv_size = 0,
.sv_table = NULL,
.sv_mask = 0,
.sv_sigsize = 0,
.sv_sigtbl = NULL,
.sv_errsize = 0,
.sv_errtbl = NULL,
.sv_transtrap = NULL,
.sv_fixup = NULL,
.sv_sendsig = NULL,
.sv_sigcode = NULL,
.sv_szsigcode = NULL,
.sv_prepsyscall = NULL,
.sv_name = "null",
.sv_coredump = NULL,
.sv_imgact_try = NULL,
.sv_minsigstksz = 0,
.sv_pagesize = PAGE_SIZE,
.sv_minuser = VM_MIN_ADDRESS,
.sv_maxuser = VM_MAXUSER_ADDRESS,
.sv_usrstack = USRSTACK,
.sv_psstrings = PS_STRINGS,
.sv_stackprot = VM_PROT_ALL,
.sv_copyout_strings = NULL,
.sv_setregs = NULL,
.sv_fixlimit = NULL,
.sv_maxssiz = NULL,
.sv_flags = 0,
.sv_set_syscall_retval = null_set_syscall_retval,
.sv_fetch_syscall_args = null_fetch_syscall_args,
.sv_syscallnames = NULL,
.sv_schedtail = NULL,
};
/*
***************************************************************************
****
**** The two following SYSINIT's are proc0 specific glue code. I am not
**** convinced that they can not be safely combined, but their order of
**** operation has been maintained as the same as the original init_main.c
**** for right now.
****
**** These probably belong in init_proc.c or kern_proc.c, since they
**** deal with proc0 (the fork template process).
****
***************************************************************************
*/
/* ARGSUSED*/
static void
proc0_init(void *dummy __unused)
{
struct proc *p;
struct thread *td;
vm_paddr_t pageablemem;
int i;
GIANT_REQUIRED;
p = &proc0;
td = &thread0;
/*
* Initialize magic number and osrel.
*/
p->p_magic = P_MAGIC;
p->p_osrel = osreldate;
/*
* Initialize thread and process structures.
*/
procinit(); /* set up proc zone */
threadinit(); /* set up UMA zones */
/*
* Initialise scheduler resources.
* Add scheduler specific parts to proc, thread as needed.
*/
schedinit(); /* scheduler gets its house in order */
/*
* Initialize sleep queue hash table
*/
sleepinit();
/*
* additional VM structures
*/
vm_init2();
/*
* Create process 0 (the swapper).
*/
LIST_INSERT_HEAD(&allproc, p, p_list);
LIST_INSERT_HEAD(PIDHASH(0), p, p_hash);
mtx_init(&pgrp0.pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK);
p->p_pgrp = &pgrp0;
LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash);
LIST_INIT(&pgrp0.pg_members);
LIST_INSERT_HEAD(&pgrp0.pg_members, p, p_pglist);
pgrp0.pg_session = &session0;
mtx_init(&session0.s_mtx, "session", NULL, MTX_DEF);
refcount_init(&session0.s_count, 1);
session0.s_leader = p;
p->p_sysent = &null_sysvec;
p->p_flag = P_SYSTEM | P_INMEM;
p->p_state = PRS_NORMAL;
knlist_init_mtx(&p->p_klist, &p->p_mtx);
STAILQ_INIT(&p->p_ktr);
p->p_nice = NZERO;
td->td_tid = PID_MAX + 1;
LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash);
td->td_state = TDS_RUNNING;
td->td_pri_class = PRI_TIMESHARE;
td->td_user_pri = PUSER;
td->td_base_user_pri = PUSER;
td->td_lend_user_pri = PRI_MAX;
td->td_priority = PVM;
td->td_base_pri = PVM;
td->td_oncpu = 0;
td->td_flags = TDF_INMEM|TDP_KTHREAD;
td->td_cpuset = cpuset_thread0();
prison0.pr_cpuset = cpuset_ref(td->td_cpuset);
p->p_peers = 0;
p->p_leader = p;
strncpy(p->p_comm, "kernel", sizeof (p->p_comm));
strncpy(td->td_name, "swapper", sizeof (td->td_name));
callout_init(&p->p_itcallout, CALLOUT_MPSAFE);
callout_init_mtx(&p->p_limco, &p->p_mtx, 0);
callout_init(&td->td_slpcallout, CALLOUT_MPSAFE);
/* Create credentials. */
p->p_ucred = crget();
p->p_ucred->cr_ngroups = 1; /* group 0 */
p->p_ucred->cr_uidinfo = uifind(0);
p->p_ucred->cr_ruidinfo = uifind(0);
p->p_ucred->cr_prison = &prison0;
p->p_ucred->cr_loginclass = loginclass_find("default");
#ifdef AUDIT
audit_cred_kproc0(p->p_ucred);
#endif
#ifdef MAC
mac_cred_create_swapper(p->p_ucred);
#endif
td->td_ucred = crhold(p->p_ucred);
/* Create sigacts. */
p->p_sigacts = sigacts_alloc();
/* Initialize signal state for process 0. */
siginit(&proc0);
/* Create the file descriptor table. */
p->p_fd = fdinit(NULL);
p->p_fdtol = NULL;
/* Create the limits structures. */
p->p_limit = lim_alloc();
for (i = 0; i < RLIM_NLIMITS; i++)
p->p_limit->pl_rlimit[i].rlim_cur =
p->p_limit->pl_rlimit[i].rlim_max = RLIM_INFINITY;
p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_cur =
p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles;
p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_cur =
p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc;
p->p_limit->pl_rlimit[RLIMIT_DATA].rlim_cur = dfldsiz;
p->p_limit->pl_rlimit[RLIMIT_DATA].rlim_max = maxdsiz;
p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_cur = dflssiz;
p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_max = maxssiz;
/* Cast to avoid overflow on i386/PAE. */
pageablemem = ptoa((vm_paddr_t)cnt.v_free_count);
p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_cur =
p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_max = pageablemem;
p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = pageablemem / 3;
p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_max = pageablemem;
p->p_cpulimit = RLIM_INFINITY;
/* Initialize resource accounting structures. */
racct_create(&p->p_racct);
p->p_stats = pstats_alloc();
/* Allocate a prototype map so we have something to fork. */
pmap_pinit0(vmspace_pmap(&vmspace0));
p->p_vmspace = &vmspace0;
vmspace0.vm_refcnt = 1;
/*
* proc0 is not expected to enter usermode, so there is no special
* handling for sv_minuser here, like is done for exec_new_vmspace().
*/
vm_map_init(&vmspace0.vm_map, vmspace_pmap(&vmspace0),
p->p_sysent->sv_minuser, p->p_sysent->sv_maxuser);
/*
* Call the init and ctor for the new thread and proc. We wait
* to do this until all other structures are fairly sane.
*/
EVENTHANDLER_INVOKE(process_init, p);
EVENTHANDLER_INVOKE(thread_init, td);
EVENTHANDLER_INVOKE(process_ctor, p);
EVENTHANDLER_INVOKE(thread_ctor, td);
/*
* Charge root for one process.
*/
(void)chgproccnt(p->p_ucred->cr_ruidinfo, 1, 0);
PROC_LOCK(p);
racct_add_force(p, RACCT_NPROC, 1);
PROC_UNLOCK(p);
}
SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL);
/* ARGSUSED*/
static void
proc0_post(void *dummy __unused)
{
struct timespec ts;
struct proc *p;
struct rusage ru;
struct thread *td;
/*
* Now we can look at the time, having had a chance to verify the
* time from the filesystem. Pretend that proc0 started now.
*/
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
microuptime(&p->p_stats->p_start);
PROC_SLOCK(p);
rufetch(p, &ru); /* Clears thread stats */
PROC_SUNLOCK(p);
p->p_rux.rux_runtime = 0;
p->p_rux.rux_uticks = 0;
p->p_rux.rux_sticks = 0;
p->p_rux.rux_iticks = 0;
FOREACH_THREAD_IN_PROC(p, td) {
td->td_runtime = 0;
}
}
sx_sunlock(&allproc_lock);
PCPU_SET(switchtime, cpu_ticks());
PCPU_SET(switchticks, ticks);
/*
* Give the ``random'' number generator a thump.
*/
nanotime(&ts);
srandom(ts.tv_sec ^ ts.tv_nsec);
}
SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL);
static void
random_init(void *dummy __unused)
{
/*
* After CPU has been started we have some randomness on most
* platforms via get_cyclecount(). For platforms that don't
* we will reseed random(9) in proc0_post() as well.
*/
srandom(get_cyclecount());
}
SYSINIT(random, SI_SUB_RANDOM, SI_ORDER_FIRST, random_init, NULL);
/*
***************************************************************************
****
**** The following SYSINIT's and glue code should be moved to the
**** respective files on a per subsystem basis.
****
***************************************************************************
*/
/*
***************************************************************************
****
**** The following code probably belongs in another file, like
**** kern/init_init.c.
****
***************************************************************************
*/
/*
* List of paths to try when searching for "init".
*/
static char init_path[MAXPATHLEN] =
#ifdef INIT_PATH
__XSTRING(INIT_PATH);
#else
"/sbin/init:/sbin/oinit:/sbin/init.bak:/rescue/init:/stand/sysinstall";
#endif
SYSCTL_STRING(_kern, OID_AUTO, init_path, CTLFLAG_RD, init_path, 0,
"Path used to search the init process");
/*
* Shutdown timeout of init(8).
* Unused within kernel, but used to control init(8), hence do not remove.
*/
#ifndef INIT_SHUTDOWN_TIMEOUT
#define INIT_SHUTDOWN_TIMEOUT 120
#endif
static int init_shutdown_timeout = INIT_SHUTDOWN_TIMEOUT;
SYSCTL_INT(_kern, OID_AUTO, init_shutdown_timeout,
CTLFLAG_RW, &init_shutdown_timeout, 0, "Shutdown timeout of init(8). "
"Unused within kernel, but used to control init(8)");
/*
* Start the initial user process; try exec'ing each pathname in init_path.
* The program is invoked with one argument containing the boot flags.
*/
static void
start_init(void *dummy)
{
vm_offset_t addr;
struct execve_args args;
int options, error;
char *var, *path, *next, *s;
char *ucp, **uap, *arg0, *arg1;
struct thread *td;
struct proc *p;
mtx_lock(&Giant);
GIANT_REQUIRED;
td = curthread;
p = td->td_proc;
vfs_mountroot();
/*
* Need just enough stack to hold the faked-up "execve()" arguments.
*/
addr = p->p_sysent->sv_usrstack - PAGE_SIZE;
if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE,
FALSE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0)
panic("init: couldn't allocate argument space");
p->p_vmspace->vm_maxsaddr = (caddr_t)addr;
p->p_vmspace->vm_ssize = 1;
if ((var = getenv("init_path")) != NULL) {
strlcpy(init_path, var, sizeof(init_path));
freeenv(var);
}
for (path = init_path; *path != '\0'; path = next) {
while (*path == ':')
path++;
if (*path == '\0')
break;
for (next = path; *next != '\0' && *next != ':'; next++)
/* nothing */ ;
if (bootverbose)
printf("start_init: trying %.*s\n", (int)(next - path),
path);
/*
* Move out the boot flag argument.
*/
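/*
 * The flag string is built backwards, byte by byte, below the
 * top of the user stack: first the terminating NUL, then any
 * option letters, then the leading '-'.  Booting single-user,
 * for example, yields "-s".
 */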
options = 0;
ucp = (char *)p->p_sysent->sv_usrstack;
(void)subyte(--ucp, 0); /* trailing zero */
if (boothowto & RB_SINGLE) {
(void)subyte(--ucp, 's');
options = 1;
}
#ifdef notyet
if (boothowto & RB_FASTBOOT) {
(void)subyte(--ucp, 'f');
options = 1;
}
#endif
#ifdef BOOTCDROM
(void)subyte(--ucp, 'C');
options = 1;
#endif
if (options == 0)
(void)subyte(--ucp, '-');
(void)subyte(--ucp, '-'); /* leading hyphen */
arg1 = ucp;
/*
* Move out the file name (also arg 0).
*/
(void)subyte(--ucp, 0);
for (s = next - 1; s >= path; s--)
(void)subyte(--ucp, *s);
arg0 = ucp;
/*
* Move out the arg pointers.
*/
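/*
 * Round ucp down to pointer alignment; the argv array (arg0,
 * arg1 and the NULL terminator) is then pushed below the
 * strings just written out.
 */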
uap = (char **)((intptr_t)ucp & ~(sizeof(intptr_t)-1));
(void)suword((caddr_t)--uap, (long)0); /* terminator */
(void)suword((caddr_t)--uap, (long)(intptr_t)arg1);
(void)suword((caddr_t)--uap, (long)(intptr_t)arg0);
/*
* Point at the arguments.
*/
args.fname = arg0;
args.argv = uap;
args.envv = NULL;
/*
* Now try to exec the program. If can't for any reason
* other than it doesn't exist, complain.
*
* Otherwise, return via fork_trampoline() all the way
* to user mode as init!
*/
- if ((error = execve(td, &args)) == 0) {
+ if ((error = sys_execve(td, &args)) == 0) {
mtx_unlock(&Giant);
return;
}
if (error != ENOENT)
printf("exec %.*s: error %d\n", (int)(next - path),
path, error);
}
printf("init: not found in path %s\n", init_path);
panic("no init");
}
/*
* Like kproc_create(), but runs in its own address space.
* We do this early to reserve pid 1.
*
* Note special case - do not make it runnable yet. Other work
* in progress will change this more.
*/
static void
create_init(const void *udata __unused)
{
struct ucred *newcred, *oldcred;
int error;
error = fork1(&thread0, RFFDG | RFPROC | RFSTOPPED, 0, &initproc,
NULL, 0);
if (error)
panic("cannot fork init: %d\n", error);
KASSERT(initproc->p_pid == 1, ("create_init: initproc->p_pid != 1"));
/* divorce init's credentials from the kernel's */
newcred = crget();
PROC_LOCK(initproc);
initproc->p_flag |= P_SYSTEM | P_INMEM;
oldcred = initproc->p_ucred;
crcopy(newcred, oldcred);
#ifdef MAC
mac_cred_create_init(newcred);
#endif
#ifdef AUDIT
audit_cred_proc1(newcred);
#endif
initproc->p_ucred = newcred;
PROC_UNLOCK(initproc);
crfree(oldcred);
cred_update_thread(FIRST_THREAD_IN_PROC(initproc));
cpu_set_fork_handler(FIRST_THREAD_IN_PROC(initproc), start_init, NULL);
}
SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL);
/*
* Make it runnable now.
*/
static void
kick_init(const void *udata __unused)
{
struct thread *td;
td = FIRST_THREAD_IN_PROC(initproc);
thread_lock(td);
TD_SET_CAN_RUN(td);
sched_add(td, SRQ_BORING);
thread_unlock(td);
}
SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kick_init, NULL);
Index: head/sys/kern/kern_acct.c
===================================================================
--- head/sys/kern/kern_acct.c (revision 225616)
+++ head/sys/kern/kern_acct.c (revision 225617)
@@ -1,654 +1,654 @@
/*-
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* Copyright (c) 2005 Robert N. M. Watson
* All rights reserved.
*
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* Copyright (c) 1994 Christopher G. Demetriou
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_acct.c 8.1 (Berkeley) 6/14/93
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/acct.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/syslog.h>
#include <sys/sysproto.h>
#include <sys/tty.h>
#include <sys/vnode.h>
#include <security/mac/mac_framework.h>
/*
* The routines implemented in this file are described in:
* Leffler, et al.: The Design and Implementation of the 4.3BSD
* UNIX Operating System (Addison-Wesley, 1989)
* on pages 62-63.
* In May 2007 the historic 3-bit base-8 exponent, 13-bit fraction
* comp_t representation described in the above reference was replaced
* with that of IEEE-754 floats.
*
* Arguably, to simplify accounting operations, this mechanism should
* be replaced by one in which an accounting log file (similar to /dev/klog)
* is read by a user process, etc. However, that has its own problems.
*/
/* Floating point definitions from <float.h>. */
#define FLT_MANT_DIG 24 /* p */
#define FLT_MAX_EXP 128 /* emax */
/*
* Internal accounting functions.
* The former's operation is described in Leffler, et al., and the latter
* was provided by UCB with the 4.4BSD-Lite release
*/
static uint32_t encode_timeval(struct timeval);
static uint32_t encode_long(long);
static void acctwatch(void);
static void acct_thread(void *);
static int acct_disable(struct thread *);
/*
* Accounting vnode pointer, saved vnode pointer, and flags for each.
* acct_sx protects against changes to the active vnode and credentials
* while accounting records are being committed to disk.
*/
static int acct_configured;
static int acct_suspended;
static struct vnode *acct_vp;
static struct ucred *acct_cred;
static int acct_flags;
static struct sx acct_sx;
SX_SYSINIT(acct, &acct_sx, "acct_sx");
/*
* State of the accounting kthread.
*/
static int acct_state;
#define ACCT_RUNNING 1 /* Accounting kthread is running. */
#define ACCT_EXITREQ 2 /* Accounting kthread should exit. */
/*
* Values associated with enabling and disabling accounting
*/
static int acctsuspend = 2; /* stop accounting when < 2% free space left */
SYSCTL_INT(_kern, OID_AUTO, acct_suspend, CTLFLAG_RW,
&acctsuspend, 0, "percentage of free disk space below which accounting stops");
static int acctresume = 4; /* resume when free space risen to > 4% */
SYSCTL_INT(_kern, OID_AUTO, acct_resume, CTLFLAG_RW,
&acctresume, 0, "percentage of free disk space above which accounting resumes");
static int acctchkfreq = 15; /* frequency (in seconds) to check space */
static int
sysctl_acct_chkfreq(SYSCTL_HANDLER_ARGS)
{
int error, value;
/* Write out the old value. */
error = SYSCTL_OUT(req, &acctchkfreq, sizeof(int));
if (error || req->newptr == NULL)
return (error);
/* Read in and verify the new value. */
error = SYSCTL_IN(req, &value, sizeof(int));
if (error)
return (error);
if (value <= 0)
return (EINVAL);
acctchkfreq = value;
return (0);
}
SYSCTL_PROC(_kern, OID_AUTO, acct_chkfreq, CTLTYPE_INT|CTLFLAG_RW,
&acctchkfreq, 0, sysctl_acct_chkfreq, "I",
"frequency for checking the free space");
SYSCTL_INT(_kern, OID_AUTO, acct_configured, CTLFLAG_RD, &acct_configured, 0,
"Accounting configured or not");
SYSCTL_INT(_kern, OID_AUTO, acct_suspended, CTLFLAG_RD, &acct_suspended, 0,
"Accounting suspended or not");
/*
* Accounting system call. Written based on the specification and previous
* implementation done by Mark Tinguely.
*/
int
-acct(struct thread *td, struct acct_args *uap)
+sys_acct(struct thread *td, struct acct_args *uap)
{
struct nameidata nd;
int error, flags, vfslocked;
error = priv_check(td, PRIV_ACCT);
if (error)
return (error);
/*
* If accounting is to be started to a file, open that file for
* appending and make sure it's a 'normal' file.
*/
if (uap->path != NULL) {
NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1,
UIO_USERSPACE, uap->path, td);
flags = FWRITE | O_APPEND;
error = vn_open(&nd, &flags, 0, NULL);
if (error)
return (error);
vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
#ifdef MAC
error = mac_system_check_acct(td->td_ucred, nd.ni_vp);
if (error) {
VOP_UNLOCK(nd.ni_vp, 0);
vn_close(nd.ni_vp, flags, td->td_ucred, td);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
#endif
VOP_UNLOCK(nd.ni_vp, 0);
if (nd.ni_vp->v_type != VREG) {
vn_close(nd.ni_vp, flags, td->td_ucred, td);
VFS_UNLOCK_GIANT(vfslocked);
return (EACCES);
}
VFS_UNLOCK_GIANT(vfslocked);
#ifdef MAC
} else {
error = mac_system_check_acct(td->td_ucred, NULL);
if (error)
return (error);
#endif
}
/*
* Disallow concurrent access to the accounting vnode while we swap
* it out, in order to prevent access after close.
*/
sx_xlock(&acct_sx);
/*
* If accounting was previously enabled, kill the old space-watcher,
* close the file and, if no new file was specified, leave. Reset
* the suspended state regardless of whether accounting remains
* enabled.
*/
acct_suspended = 0;
if (acct_vp != NULL) {
vfslocked = VFS_LOCK_GIANT(acct_vp->v_mount);
error = acct_disable(td);
VFS_UNLOCK_GIANT(vfslocked);
}
if (uap->path == NULL) {
if (acct_state & ACCT_RUNNING) {
acct_state |= ACCT_EXITREQ;
wakeup(&acct_state);
}
sx_xunlock(&acct_sx);
return (error);
}
/*
* Save the new accounting file vnode, and schedule the new
* free space watcher.
*/
acct_vp = nd.ni_vp;
acct_cred = crhold(td->td_ucred);
acct_flags = flags;
if (acct_state & ACCT_RUNNING)
acct_state &= ~ACCT_EXITREQ;
else {
/*
* Try to start up an accounting kthread. We may start more
* than one, but if so the extras will commit suicide as
* soon as they start up.
*/
error = kproc_create(acct_thread, NULL, NULL, 0, 0,
"accounting");
if (error) {
vfslocked = VFS_LOCK_GIANT(acct_vp->v_mount);
(void) vn_close(acct_vp, acct_flags, acct_cred, td);
VFS_UNLOCK_GIANT(vfslocked);
crfree(acct_cred);
acct_configured = 0;
acct_vp = NULL;
acct_cred = NULL;
acct_flags = 0;
sx_xunlock(&acct_sx);
log(LOG_NOTICE, "Unable to start accounting thread\n");
return (error);
}
}
acct_configured = 1;
sx_xunlock(&acct_sx);
log(LOG_NOTICE, "Accounting enabled\n");
return (error);
}
/*
* Disable currently in-progress accounting by closing the vnode, dropping
* our reference to the credential, and clearing the vnode's flags.
*/
static int
acct_disable(struct thread *td)
{
int error;
sx_assert(&acct_sx, SX_XLOCKED);
error = vn_close(acct_vp, acct_flags, acct_cred, td);
crfree(acct_cred);
acct_configured = 0;
acct_vp = NULL;
acct_cred = NULL;
acct_flags = 0;
log(LOG_NOTICE, "Accounting disabled\n");
return (error);
}
/*
* Write out process accounting information on process exit.
* The data to be written out are specified in Leffler, et al.,
* and are enumerated below. (They're also noted in the system
* "acct.h" header file.)
*/
int
acct_process(struct thread *td)
{
struct acctv2 acct;
struct timeval ut, st, tmp;
struct plimit *newlim, *oldlim;
struct proc *p;
struct rusage ru;
int t, ret, vfslocked;
/*
* Lockless check of accounting condition before doing the hard
* work.
*/
if (acct_vp == NULL || acct_suspended)
return (0);
sx_slock(&acct_sx);
/*
* If accounting isn't enabled, don't bother. Have to check again
* once we own the lock in case we raced with disabling of accounting
* by another thread.
*/
if (acct_vp == NULL || acct_suspended) {
sx_sunlock(&acct_sx);
return (0);
}
p = td->td_proc;
/*
* Get process accounting information.
*/
sx_slock(&proctree_lock);
PROC_LOCK(p);
/* (1) The terminal from which the process was started */
if ((p->p_flag & P_CONTROLT) && p->p_pgrp->pg_session->s_ttyp)
acct.ac_tty = tty_udev(p->p_pgrp->pg_session->s_ttyp);
else
acct.ac_tty = NODEV;
sx_sunlock(&proctree_lock);
/* (2) The name of the command that ran */
bcopy(p->p_comm, acct.ac_comm, sizeof acct.ac_comm);
/* (3) The amount of user and system time that was used */
rufetchcalc(p, &ru, &ut, &st);
acct.ac_utime = encode_timeval(ut);
acct.ac_stime = encode_timeval(st);
/* (4) The elapsed time the command ran (and its starting time) */
tmp = boottime;
timevaladd(&tmp, &p->p_stats->p_start);
acct.ac_btime = tmp.tv_sec;
microuptime(&tmp);
timevalsub(&tmp, &p->p_stats->p_start);
acct.ac_etime = encode_timeval(tmp);
/* (5) The average amount of memory used */
tmp = ut;
timevaladd(&tmp, &st);
/* Convert tmp (i.e. u + s) into hz units to match ru_i*. */
t = tmp.tv_sec * hz + tmp.tv_usec / tick;
if (t)
acct.ac_mem = encode_long((ru.ru_ixrss + ru.ru_idrss +
ru.ru_isrss) / t);
else
acct.ac_mem = 0;
/* (6) The number of disk I/O operations done */
acct.ac_io = encode_long(ru.ru_inblock + ru.ru_oublock);
/* (7) The UID and GID of the process */
acct.ac_uid = p->p_ucred->cr_ruid;
acct.ac_gid = p->p_ucred->cr_rgid;
/* (8) The boolean flags that tell how the process terminated, etc. */
acct.ac_flagx = p->p_acflag;
PROC_UNLOCK(p);
/* Setup ancillary structure fields. */
acct.ac_flagx |= ANVER;
acct.ac_zero = 0;
acct.ac_version = 2;
acct.ac_len = acct.ac_len2 = sizeof(acct);
/*
* Eliminate any file size rlimit.
*/
newlim = lim_alloc();
PROC_LOCK(p);
oldlim = p->p_limit;
lim_copy(newlim, oldlim);
newlim->pl_rlimit[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
p->p_limit = newlim;
PROC_UNLOCK(p);
lim_free(oldlim);
/*
* Write the accounting information to the file.
*/
vfslocked = VFS_LOCK_GIANT(acct_vp->v_mount);
ret = vn_rdwr(UIO_WRITE, acct_vp, (caddr_t)&acct, sizeof (acct),
(off_t)0, UIO_SYSSPACE, IO_APPEND|IO_UNIT, acct_cred, NOCRED,
NULL, td);
VFS_UNLOCK_GIANT(vfslocked);
sx_sunlock(&acct_sx);
return (ret);
}
/* FLOAT_CONVERSION_START (Regression testing; don't remove this line.) */
/* Convert timevals and longs into IEEE-754 bit patterns. */
/* Mantissa mask (MSB is implied, so subtract 1). */
#define MANT_MASK ((1 << (FLT_MANT_DIG - 1)) - 1)
/*
* We calculate integer values to a precision of approximately
* 28 bits.
* This is high enough precision to fill the 24 float bits
* and low enough to avoid overflowing the 32 int bits.
*/
#define CALC_BITS 28
/* log_2(1000000). */
#define LOG2_1M 20
/*
* Convert the elements of a timeval into a 32-bit word holding
* the bits of an IEEE-754 float.
* The float value represents the timeval's value in microsecond units.
*/
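/*
 * For example, { .tv_sec = 1, .tv_usec = 500000 } is 1500000us:
 * fls(1500000) - 1 = 20, so the result is
 * ((127 + 20) << 23) | ((1500000 << 3) & MANT_MASK) = 0x49b71b00,
 * which is the bit pattern of the float 1.5e6.
 */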
static uint32_t
encode_timeval(struct timeval tv)
{
int log2_s;
int val, exp; /* Unnormalized value and exponent */
int norm_exp; /* Normalized exponent */
int shift;
/*
* First calculate value and exponent to about CALC_BITS precision.
* Note that the following conditionals have been ordered so that
* the most common cases appear first.
*/
if (tv.tv_sec == 0) {
if (tv.tv_usec == 0)
return (0);
exp = 0;
val = tv.tv_usec;
} else {
/*
* Calculate the value to a precision of approximately
* CALC_BITS.
*/
log2_s = fls(tv.tv_sec) - 1;
if (log2_s + LOG2_1M < CALC_BITS) {
exp = 0;
val = 1000000 * tv.tv_sec + tv.tv_usec;
} else {
exp = log2_s + LOG2_1M - CALC_BITS;
val = (unsigned int)(((uint64_t)1000000 * tv.tv_sec +
tv.tv_usec) >> exp);
}
}
/* Now normalize and pack the value into an IEEE-754 float. */
norm_exp = fls(val) - 1;
shift = FLT_MANT_DIG - norm_exp - 1;
#ifdef ACCT_DEBUG
printf("val=%d exp=%d shift=%d log2(val)=%d\n",
val, exp, shift, norm_exp);
printf("exp=%x mant=%x\n", FLT_MAX_EXP - 1 + exp + norm_exp,
((shift > 0 ? (val << shift) : (val >> -shift)) & MANT_MASK));
#endif
return (((FLT_MAX_EXP - 1 + exp + norm_exp) << (FLT_MANT_DIG - 1)) |
((shift > 0 ? val << shift : val >> -shift) & MANT_MASK));
}
/*
* Convert a non-negative long value into the bit pattern of
* an IEEE-754 float value.
*/
static uint32_t
encode_long(long val)
{
int norm_exp; /* Normalized exponent */
int shift;
if (val == 0)
return (0);
if (val < 0) {
log(LOG_NOTICE,
"encode_long: negative value %ld in accounting record\n",
val);
val = LONG_MAX;
}
norm_exp = fls(val) - 1;
shift = FLT_MANT_DIG - norm_exp - 1;
#ifdef ACCT_DEBUG
printf("val=%d shift=%d log2(val)=%d\n",
val, shift, norm_exp);
printf("exp=%x mant=%x\n", FLT_MAX_EXP - 1 + exp + norm_exp,
((shift > 0 ? (val << shift) : (val >> -shift)) & MANT_MASK));
#endif
return (((FLT_MAX_EXP - 1 + norm_exp) << (FLT_MANT_DIG - 1)) |
((shift > 0 ? val << shift : val >> -shift) & MANT_MASK));
}
/* FLOAT_CONVERSION_END (Regression testing; don't remove this line.) */
/*
* Periodically check the filesystem to see if accounting
* should be turned on or off. Beware the case where the vnode
* has been vgone()'d out from underneath us, e.g. when the file
* system containing the accounting file has been forcibly unmounted.
*/
/* ARGSUSED */
static void
acctwatch(void)
{
struct statfs sb;
int vfslocked;
sx_assert(&acct_sx, SX_XLOCKED);
/*
* If accounting was disabled before our kthread was scheduled,
* then acct_vp might be NULL. If so, just ask our kthread to
* exit and return.
*/
if (acct_vp == NULL) {
acct_state |= ACCT_EXITREQ;
return;
}
/*
* If our vnode is no longer valid, tear it down and signal the
* accounting thread to die.
*/
vfslocked = VFS_LOCK_GIANT(acct_vp->v_mount);
if (acct_vp->v_type == VBAD) {
(void) acct_disable(NULL);
VFS_UNLOCK_GIANT(vfslocked);
acct_state |= ACCT_EXITREQ;
return;
}
/*
* Stopping here is better than continuing; maybe it will be VBAD
* next time around.
*/
if (VFS_STATFS(acct_vp->v_mount, &sb) < 0) {
VFS_UNLOCK_GIANT(vfslocked);
return;
}
VFS_UNLOCK_GIANT(vfslocked);
if (acct_suspended) {
if (sb.f_bavail > (int64_t)(acctresume * sb.f_blocks /
100)) {
acct_suspended = 0;
log(LOG_NOTICE, "Accounting resumed\n");
}
} else {
if (sb.f_bavail <= (int64_t)(acctsuspend * sb.f_blocks /
100)) {
acct_suspended = 1;
log(LOG_NOTICE, "Accounting suspended\n");
}
}
}
/*
* The main loop for the dedicated kernel thread that periodically calls
* acctwatch().
*/
static void
acct_thread(void *dummy)
{
u_char pri;
/* This is a low-priority kernel thread. */
pri = PRI_MAX_KERN;
thread_lock(curthread);
sched_prio(curthread, pri);
thread_unlock(curthread);
/* If another accounting kthread is already running, just die. */
sx_xlock(&acct_sx);
if (acct_state & ACCT_RUNNING) {
sx_xunlock(&acct_sx);
kproc_exit(0);
}
acct_state |= ACCT_RUNNING;
/* Loop until we are asked to exit. */
while (!(acct_state & ACCT_EXITREQ)) {
/* Perform our periodic checks. */
acctwatch();
/*
* We check this flag again before sleeping since the
* acctwatch() might have shut down accounting and asked us
* to exit.
*/
if (!(acct_state & ACCT_EXITREQ)) {
sx_sleep(&acct_state, &acct_sx, 0, "-",
acctchkfreq * hz);
}
}
/*
* Acknowledge the exit request and shutdown. We clear both the
* exit request and running flags.
*/
acct_state = 0;
sx_xunlock(&acct_sx);
kproc_exit(0);
}
Index: head/sys/kern/kern_context.c
===================================================================
--- head/sys/kern/kern_context.c (revision 225616)
+++ head/sys/kern/kern_context.c (revision 225617)
@@ -1,129 +1,129 @@
/*-
* Copyright (c) 2002 Daniel M. Eischen <deischen@freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/signalvar.h>
#include <sys/ucontext.h>
/*
* The first two fields of a ucontext_t are the signal mask and the machine
* context. The next field is uc_link; we want to avoid destroying the link
* when copying out contexts.
*/
#define UC_COPY_SIZE offsetof(ucontext_t, uc_link)
#ifndef _SYS_SYSPROTO_H_
struct getcontext_args {
struct __ucontext *ucp;
};
struct setcontext_args {
const struct __ucontext *ucp;
};
struct swapcontext_args {
struct __ucontext *oucp;
const struct __ucontext *ucp;
};
#endif
int
-getcontext(struct thread *td, struct getcontext_args *uap)
+sys_getcontext(struct thread *td, struct getcontext_args *uap)
{
ucontext_t uc;
int ret;
if (uap->ucp == NULL)
ret = EINVAL;
else {
get_mcontext(td, &uc.uc_mcontext, GET_MC_CLEAR_RET);
PROC_LOCK(td->td_proc);
uc.uc_sigmask = td->td_sigmask;
PROC_UNLOCK(td->td_proc);
bzero(uc.__spare__, sizeof(uc.__spare__));
ret = copyout(&uc, uap->ucp, UC_COPY_SIZE);
}
return (ret);
}
int
-setcontext(struct thread *td, struct setcontext_args *uap)
+sys_setcontext(struct thread *td, struct setcontext_args *uap)
{
ucontext_t uc;
int ret;
if (uap->ucp == NULL)
ret = EINVAL;
else {
ret = copyin(uap->ucp, &uc, UC_COPY_SIZE);
if (ret == 0) {
ret = set_mcontext(td, &uc.uc_mcontext);
if (ret == 0) {
kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask,
NULL, 0);
}
}
}
return (ret == 0 ? EJUSTRETURN : ret);
}
int
-swapcontext(struct thread *td, struct swapcontext_args *uap)
+sys_swapcontext(struct thread *td, struct swapcontext_args *uap)
{
ucontext_t uc;
int ret;
if (uap->oucp == NULL || uap->ucp == NULL)
ret = EINVAL;
else {
get_mcontext(td, &uc.uc_mcontext, GET_MC_CLEAR_RET);
bzero(uc.__spare__, sizeof(uc.__spare__));
PROC_LOCK(td->td_proc);
uc.uc_sigmask = td->td_sigmask;
PROC_UNLOCK(td->td_proc);
ret = copyout(&uc, uap->oucp, UC_COPY_SIZE);
if (ret == 0) {
ret = copyin(uap->ucp, &uc, UC_COPY_SIZE);
if (ret == 0) {
ret = set_mcontext(td, &uc.uc_mcontext);
if (ret == 0) {
kern_sigprocmask(td, SIG_SETMASK,
&uc.uc_sigmask, NULL, 0);
}
}
}
}
return (ret == 0 ? EJUSTRETURN : ret);
}
Index: head/sys/kern/kern_cpuset.c
===================================================================
--- head/sys/kern/kern_cpuset.c (revision 225616)
+++ head/sys/kern/kern_cpuset.c (revision 225617)
@@ -1,1173 +1,1173 @@
/*-
* Copyright (c) 2008, Jeffrey Roberson <jeff@freebsd.org>
* All rights reserved.
*
* Copyright (c) 2008 Nokia Corporation
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice unmodified, this list of conditions, and the following
* disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ddb.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/cpuset.h>
#include <sys/sx.h>
#include <sys/queue.h>
#include <sys/libkern.h>
#include <sys/limits.h>
#include <sys/bus.h>
#include <sys/interrupt.h>
#include <vm/uma.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif /* DDB */
/*
* cpusets provide a mechanism for creating and manipulating sets of
* processors for the purpose of constraining the scheduling of threads to
* specific processors.
*
* Each process belongs to an identified set; by default this is set 1. Each
* thread may further restrict the cpus it may run on to a subset of this
* named set. This creates an anonymous set which other threads and processes
* may not join by number.
*
* The named set is referred to herein as the 'base' set to avoid ambiguity.
* This set is usually a child of a 'root' set while the anonymous set may
* simply be referred to as a mask. In the syscall api these are referred to
* as the ROOT, CPUSET, and MASK levels where CPUSET is called 'base' here.
*
* Threads inherit their set from their creator whether it be anonymous or
* not. This means that anonymous sets are immutable because they may be
* shared. To modify an anonymous set a new set is created with the desired
* mask and the same parent as the existing anonymous set. This gives the
* illusion of each thread having a private mask.
*
* Via the syscall apis a user may ask to retrieve or modify the root, base,
* or mask that is discovered via a pid, tid, or setid. Modifying a set
* modifies all numbered and anonymous child sets to comply with the new mask.
* Modifying a pid or tid's mask applies only to that tid but must still
* exist within the assigned parent set.
*
* A thread may not be assigned to a group separate from other threads in
* the process. This is to remove ambiguity when the setid is queried with
* a pid argument. There is no other technical limitation.
*
* This somewhat complex arrangement is intended to make it easy for
* applications to query available processors and bind their threads to
* specific processors while also allowing administrators to dynamically
* reprovision by changing sets which apply to groups of processes.
*
* A simple application should not concern itself with sets at all and
* rather apply masks to its own threads via CPU_WHICH_TID and a -1 id
* meaning 'curthread'. It may query available cpus for that tid with a
* getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...).
*/
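/*
 * Editor's note: an illustrative userland sketch (not part of this
 * revision) of the "simple application" pattern recommended above: query
 * the cpus available to the current process with cpuset_getaffinity(2),
 * then pin the calling thread to one of them with an anonymous mask via
 * cpuset_setaffinity(2).  Error handling is deliberately minimal.
 */
#include <sys/param.h>
#include <sys/cpuset.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	cpuset_t mask;
	int cpu;

	if (cpuset_getaffinity(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1,
	    sizeof(mask), &mask) != 0)
		err(1, "cpuset_getaffinity");
	for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
		if (CPU_ISSET(cpu, &mask))
			break;
	printf("pinning current thread to cpu %d\n", cpu);
	CPU_ZERO(&mask);
	CPU_SET(cpu, &mask);
	/* -1 with CPU_WHICH_TID means "the calling thread". */
	if (cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
	    sizeof(mask), &mask) != 0)
		err(1, "cpuset_setaffinity");
	return (0);
}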
static uma_zone_t cpuset_zone;
static struct mtx cpuset_lock;
static struct setlist cpuset_ids;
static struct unrhdr *cpuset_unr;
static struct cpuset *cpuset_zero;
/* Return the size of cpuset_t at the kernel level */
SYSCTL_INT(_kern_sched, OID_AUTO, cpusetsize, CTLFLAG_RD,
0, sizeof(cpuset_t), "sizeof(cpuset_t)");
cpuset_t *cpuset_root;
/*
* Acquire a reference to a cpuset; all pointers must be tracked with refs.
*/
struct cpuset *
cpuset_ref(struct cpuset *set)
{
refcount_acquire(&set->cs_ref);
return (set);
}
/*
* Walks up the tree from 'set' to find the root. Returns the root
* referenced.
*/
static struct cpuset *
cpuset_refroot(struct cpuset *set)
{
for (; set->cs_parent != NULL; set = set->cs_parent)
if (set->cs_flags & CPU_SET_ROOT)
break;
cpuset_ref(set);
return (set);
}
/*
* Find the first non-anonymous set starting from 'set'. Returns this set
* referenced. May return the passed in set with an extra ref if it is
* not anonymous.
*/
static struct cpuset *
cpuset_refbase(struct cpuset *set)
{
if (set->cs_id == CPUSET_INVALID)
set = set->cs_parent;
cpuset_ref(set);
return (set);
}
/*
* Release a reference in a context where it is safe to allocate.
*/
void
cpuset_rel(struct cpuset *set)
{
cpusetid_t id;
if (refcount_release(&set->cs_ref) == 0)
return;
mtx_lock_spin(&cpuset_lock);
LIST_REMOVE(set, cs_siblings);
id = set->cs_id;
if (id != CPUSET_INVALID)
LIST_REMOVE(set, cs_link);
mtx_unlock_spin(&cpuset_lock);
cpuset_rel(set->cs_parent);
uma_zfree(cpuset_zone, set);
if (id != CPUSET_INVALID)
free_unr(cpuset_unr, id);
}
/*
* Deferred release must be used when in a context that is not safe to
* allocate/free. This places any unreferenced sets on the list 'head'.
*/
static void
cpuset_rel_defer(struct setlist *head, struct cpuset *set)
{
if (refcount_release(&set->cs_ref) == 0)
return;
mtx_lock_spin(&cpuset_lock);
LIST_REMOVE(set, cs_siblings);
if (set->cs_id != CPUSET_INVALID)
LIST_REMOVE(set, cs_link);
LIST_INSERT_HEAD(head, set, cs_link);
mtx_unlock_spin(&cpuset_lock);
}
/*
* Complete a deferred release. Removes the set from the list provided to
* cpuset_rel_defer.
*/
static void
cpuset_rel_complete(struct cpuset *set)
{
LIST_REMOVE(set, cs_link);
cpuset_rel(set->cs_parent);
uma_zfree(cpuset_zone, set);
}
/*
* Find a set based on an id. Returns it with a ref.
*/
static struct cpuset *
cpuset_lookup(cpusetid_t setid, struct thread *td)
{
struct cpuset *set;
if (setid == CPUSET_INVALID)
return (NULL);
mtx_lock_spin(&cpuset_lock);
LIST_FOREACH(set, &cpuset_ids, cs_link)
if (set->cs_id == setid)
break;
if (set)
cpuset_ref(set);
mtx_unlock_spin(&cpuset_lock);
KASSERT(td != NULL, ("[%s:%d] td is NULL", __func__, __LINE__));
if (set != NULL && jailed(td->td_ucred)) {
struct cpuset *jset, *tset;
jset = td->td_ucred->cr_prison->pr_cpuset;
for (tset = set; tset != NULL; tset = tset->cs_parent)
if (tset == jset)
break;
if (tset == NULL) {
cpuset_rel(set);
set = NULL;
}
}
return (set);
}
/*
* Create a set in the space provided in 'set' with the provided parameters.
* The set is returned with a single ref. May return EDEADLK if the set
* will have no valid cpu based on restrictions from the parent.
*/
static int
_cpuset_create(struct cpuset *set, struct cpuset *parent, const cpuset_t *mask,
cpusetid_t id)
{
if (!CPU_OVERLAP(&parent->cs_mask, mask))
return (EDEADLK);
CPU_COPY(mask, &set->cs_mask);
LIST_INIT(&set->cs_children);
refcount_init(&set->cs_ref, 1);
set->cs_flags = 0;
mtx_lock_spin(&cpuset_lock);
CPU_AND(&set->cs_mask, &parent->cs_mask);
set->cs_id = id;
set->cs_parent = cpuset_ref(parent);
LIST_INSERT_HEAD(&parent->cs_children, set, cs_siblings);
if (set->cs_id != CPUSET_INVALID)
LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
mtx_unlock_spin(&cpuset_lock);
return (0);
}
/*
* Create a new non-anonymous set with the requested parent and mask. May
* return failures if the mask is invalid or a new number can not be
* allocated.
*/
static int
cpuset_create(struct cpuset **setp, struct cpuset *parent, const cpuset_t *mask)
{
struct cpuset *set;
cpusetid_t id;
int error;
id = alloc_unr(cpuset_unr);
if (id == -1)
return (ENFILE);
*setp = set = uma_zalloc(cpuset_zone, M_WAITOK);
error = _cpuset_create(set, parent, mask, id);
if (error == 0)
return (0);
free_unr(cpuset_unr, id);
uma_zfree(cpuset_zone, set);
return (error);
}
/*
* Recursively check for errors that would occur from applying mask to
* the tree of sets starting at 'set'. Checks for sets that would become
* empty as well as RDONLY flags.
*/
static int
cpuset_testupdate(struct cpuset *set, cpuset_t *mask)
{
struct cpuset *nset;
cpuset_t newmask;
int error;
mtx_assert(&cpuset_lock, MA_OWNED);
if (set->cs_flags & CPU_SET_RDONLY)
return (EPERM);
if (!CPU_OVERLAP(&set->cs_mask, mask))
return (EDEADLK);
CPU_COPY(&set->cs_mask, &newmask);
CPU_AND(&newmask, mask);
error = 0;
LIST_FOREACH(nset, &set->cs_children, cs_siblings)
if ((error = cpuset_testupdate(nset, &newmask)) != 0)
break;
return (error);
}
/*
* Applies the mask 'mask' without checking for empty sets or permissions.
*/
static void
cpuset_update(struct cpuset *set, cpuset_t *mask)
{
struct cpuset *nset;
mtx_assert(&cpuset_lock, MA_OWNED);
CPU_AND(&set->cs_mask, mask);
LIST_FOREACH(nset, &set->cs_children, cs_siblings)
cpuset_update(nset, &set->cs_mask);
return;
}
/*
* Modify the set 'set' to use a copy of the mask provided. Apply this new
* mask to restrict all children in the tree. Checks for validity before
* applying the changes.
*/
static int
cpuset_modify(struct cpuset *set, cpuset_t *mask)
{
struct cpuset *root;
int error;
error = priv_check(curthread, PRIV_SCHED_CPUSET);
if (error)
return (error);
/*
* In case we are called from within the jail,
* we do not allow modifying the dedicated root
* cpuset of the jail, but we may still allow
* changing child sets.
*/
if (jailed(curthread->td_ucred) &&
set->cs_flags & CPU_SET_ROOT)
return (EPERM);
/*
* Verify that we have access to this set of
* cpus.
*/
root = set->cs_parent;
if (root && !CPU_SUBSET(&root->cs_mask, mask))
return (EINVAL);
mtx_lock_spin(&cpuset_lock);
error = cpuset_testupdate(set, mask);
if (error)
goto out;
cpuset_update(set, mask);
CPU_COPY(mask, &set->cs_mask);
out:
mtx_unlock_spin(&cpuset_lock);
return (error);
}
/*
* Resolve the 'which' parameter of several cpuset apis.
*
* For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid. Also
* checks for permission via p_cansched().
*
* For WHICH_SET returns a valid set with a new reference.
*
* -1 may be supplied for any argument to mean the current proc/thread or
* the base set of the current thread. May fail with ESRCH/EPERM.
*/
static int
cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp,
struct cpuset **setp)
{
struct cpuset *set;
struct thread *td;
struct proc *p;
int error;
*pp = p = NULL;
*tdp = td = NULL;
*setp = set = NULL;
switch (which) {
case CPU_WHICH_PID:
if (id == -1) {
PROC_LOCK(curproc);
p = curproc;
break;
}
if ((p = pfind(id)) == NULL)
return (ESRCH);
break;
case CPU_WHICH_TID:
if (id == -1) {
PROC_LOCK(curproc);
p = curproc;
td = curthread;
break;
}
td = tdfind(id, -1);
if (td == NULL)
return (ESRCH);
p = td->td_proc;
break;
case CPU_WHICH_CPUSET:
if (id == -1) {
thread_lock(curthread);
set = cpuset_refbase(curthread->td_cpuset);
thread_unlock(curthread);
} else
set = cpuset_lookup(id, curthread);
if (set) {
*setp = set;
return (0);
}
return (ESRCH);
case CPU_WHICH_JAIL:
{
/* Find `set' for prison with given id. */
struct prison *pr;
sx_slock(&allprison_lock);
pr = prison_find_child(curthread->td_ucred->cr_prison, id);
sx_sunlock(&allprison_lock);
if (pr == NULL)
return (ESRCH);
cpuset_ref(pr->pr_cpuset);
*setp = pr->pr_cpuset;
mtx_unlock(&pr->pr_mtx);
return (0);
}
case CPU_WHICH_IRQ:
return (0);
default:
return (EINVAL);
}
error = p_cansched(curthread, p);
if (error) {
PROC_UNLOCK(p);
return (error);
}
if (td == NULL)
td = FIRST_THREAD_IN_PROC(p);
*pp = p;
*tdp = td;
return (0);
}
/*
* Create an anonymous set with the provided mask in the space provided by
* 'fset'. If the passed in set is anonymous we use its parent otherwise
* the new set is a child of 'set'.
*/
static int
cpuset_shadow(struct cpuset *set, struct cpuset *fset, const cpuset_t *mask)
{
struct cpuset *parent;
if (set->cs_id == CPUSET_INVALID)
parent = set->cs_parent;
else
parent = set;
if (!CPU_SUBSET(&parent->cs_mask, mask))
return (EDEADLK);
return (_cpuset_create(fset, parent, mask, CPUSET_INVALID));
}
/*
* Handle two cases for replacing the base set or mask of an entire process.
*
* 1) Set is non-null and mask is null. This reparents all anonymous sets
* to the provided set and replaces all non-anonymous td_cpusets with the
* provided set.
* 2) Mask is non-null and set is null. This replaces or creates anonymous
* sets for every thread with the existing base as a parent.
*
* This is overly complicated because we can't allocate while holding a
* spinlock and spinlocks must be held while changing and examining thread
* state.
*/
static int
cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask)
{
struct setlist freelist;
struct setlist droplist;
struct cpuset *tdset;
struct cpuset *nset;
struct thread *td;
struct proc *p;
int threads;
int nfree;
int error;
/*
* The algorithm requires two passes due to locking considerations.
*
* 1) Lookup the process and acquire the locks in the required order.
* 2) If enough cpusets have not been allocated release the locks and
* allocate them. Loop.
*/
LIST_INIT(&freelist);
LIST_INIT(&droplist);
nfree = 0;
for (;;) {
error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset);
if (error)
goto out;
if (nfree >= p->p_numthreads)
break;
threads = p->p_numthreads;
PROC_UNLOCK(p);
for (; nfree < threads; nfree++) {
nset = uma_zalloc(cpuset_zone, M_WAITOK);
LIST_INSERT_HEAD(&freelist, nset, cs_link);
}
}
PROC_LOCK_ASSERT(p, MA_OWNED);
/*
* Now that the appropriate locks are held and we have enough cpusets,
* make sure the operation will succeed before applying changes. The
* proc lock prevents td_cpuset from changing between calls.
*/
error = 0;
FOREACH_THREAD_IN_PROC(p, td) {
thread_lock(td);
tdset = td->td_cpuset;
/*
* Verify that a new mask doesn't specify cpus outside of
* the set the thread is a member of.
*/
if (mask) {
if (tdset->cs_id == CPUSET_INVALID)
tdset = tdset->cs_parent;
if (!CPU_SUBSET(&tdset->cs_mask, mask))
error = EDEADLK;
/*
* Verify that a new set won't leave an existing thread
* mask without a cpu to run on. It can, however, restrict
* the set.
*/
} else if (tdset->cs_id == CPUSET_INVALID) {
if (!CPU_OVERLAP(&set->cs_mask, &tdset->cs_mask))
error = EDEADLK;
}
thread_unlock(td);
if (error)
goto unlock_out;
}
/*
* Replace each thread's cpuset while using deferred release. We
* must do this because the thread lock must be held while operating
* on the thread and this limits the type of operations allowed.
*/
FOREACH_THREAD_IN_PROC(p, td) {
thread_lock(td);
/*
* If we presently have an anonymous set or are applying a
* mask we must create an anonymous shadow set. That is
* either parented to our existing base or the supplied set.
*
* If we have a base set with no anonymous shadow we simply
* replace it outright.
*/
tdset = td->td_cpuset;
if (tdset->cs_id == CPUSET_INVALID || mask) {
nset = LIST_FIRST(&freelist);
LIST_REMOVE(nset, cs_link);
if (mask)
error = cpuset_shadow(tdset, nset, mask);
else
error = _cpuset_create(nset, set,
&tdset->cs_mask, CPUSET_INVALID);
if (error) {
LIST_INSERT_HEAD(&freelist, nset, cs_link);
thread_unlock(td);
break;
}
} else
nset = cpuset_ref(set);
cpuset_rel_defer(&droplist, tdset);
td->td_cpuset = nset;
sched_affinity(td);
thread_unlock(td);
}
unlock_out:
PROC_UNLOCK(p);
out:
while ((nset = LIST_FIRST(&droplist)) != NULL)
cpuset_rel_complete(nset);
while ((nset = LIST_FIRST(&freelist)) != NULL) {
LIST_REMOVE(nset, cs_link);
uma_zfree(cpuset_zone, nset);
}
return (error);
}
/*
* Calculate the ffs() of the cpuset.
*/
int
cpusetobj_ffs(const cpuset_t *set)
{
size_t i;
int cbit;
cbit = 0;
for (i = 0; i < _NCPUWORDS; i++) {
if (set->__bits[i] != 0) {
cbit = ffsl(set->__bits[i]);
cbit += i * _NCPUBITS;
break;
}
}
return (cbit);
}
/*
* Return a string representing a valid layout for a cpuset_t object.
* It expects an incoming buffer of at least CPUSETBUFSIZ bytes.
*/
char *
cpusetobj_strprint(char *buf, const cpuset_t *set)
{
char *tbuf;
size_t i, bytesp, bufsiz;
tbuf = buf;
bytesp = 0;
bufsiz = CPUSETBUFSIZ;
for (i = _NCPUWORDS - 1; i > 0; i--) {
bytesp = snprintf(tbuf, bufsiz, "%lx, ", set->__bits[i]);
bufsiz -= bytesp;
tbuf += bytesp;
}
snprintf(tbuf, bufsiz, "%lx", set->__bits[0]);
return (buf);
}
/*
* Build a valid cpuset_t object from a string representation.
* It expects an incoming buffer of at least CPUSETBUFSIZ bytes.
*/
int
cpusetobj_strscan(cpuset_t *set, const char *buf)
{
u_int nwords;
int i, ret;
if (strlen(buf) > CPUSETBUFSIZ - 1)
return (-1);
/* Allow passing a shorter version of the mask when necessary. */
nwords = 1;
for (i = 0; buf[i] != '\0'; i++)
if (buf[i] == ',')
nwords++;
if (nwords > _NCPUWORDS)
return (-1);
CPU_ZERO(set);
for (i = nwords - 1; i > 0; i--) {
ret = sscanf(buf, "%lx, ", &set->__bits[i]);
if (ret == 0 || ret == -1)
return (-1);
buf = strstr(buf, " ");
if (buf == NULL)
return (-1);
buf++;
}
ret = sscanf(buf, "%lx", &set->__bits[0]);
if (ret == 0 || ret == -1)
return (-1);
return (0);
}
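/*
 * Editor's note: an illustrative userland mimic (not part of this
 * revision) of the string layout handled by cpusetobj_strprint() and
 * cpusetobj_strscan() above: hex words, most significant first, separated
 * by ", ".  The two-word layout assumes a hypothetical kernel where
 * _NCPUWORDS is 2 (128 cpus with 64-bit words).
 */
#include <stdio.h>

int
main(void)
{
	unsigned long bits[2] = { 0x9UL, 0x1UL };	/* cpus 0, 3 and 64 */
	char buf[64];

	snprintf(buf, sizeof(buf), "%lx, %lx", bits[1], bits[0]);
	printf("printed form: \"%s\"\n", buf);		/* "1, 9" */
	sscanf(buf, "%lx, %lx", &bits[1], &bits[0]);	/* round trip */
	return (0);
}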
/*
* Apply an anonymous mask to a single thread.
*/
int
cpuset_setthread(lwpid_t id, cpuset_t *mask)
{
struct cpuset *nset;
struct cpuset *set;
struct thread *td;
struct proc *p;
int error;
nset = uma_zalloc(cpuset_zone, M_WAITOK);
error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &set);
if (error)
goto out;
set = NULL;
thread_lock(td);
error = cpuset_shadow(td->td_cpuset, nset, mask);
if (error == 0) {
set = td->td_cpuset;
td->td_cpuset = nset;
sched_affinity(td);
nset = NULL;
}
thread_unlock(td);
PROC_UNLOCK(p);
if (set)
cpuset_rel(set);
out:
if (nset)
uma_zfree(cpuset_zone, nset);
return (error);
}
/*
* Creates the cpuset for thread0. We make two sets:
*
* 0 - The root set which should represent all valid processors in the
* system. It is initially created with a mask of all processors
* because we don't know what processors are valid until cpuset_init()
* runs. This set is immutable.
* 1 - The default set which all processes are a member of until changed.
* This allows an administrator to move all threads off of given cpus to
* dedicate them to high priority tasks or save power etc.
*/
struct cpuset *
cpuset_thread0(void)
{
struct cpuset *set;
int error;
cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, 0);
mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE);
/*
* Create the root system set for the whole machine. Doesn't use
* cpuset_create() due to NULL parent.
*/
set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
CPU_FILL(&set->cs_mask);
LIST_INIT(&set->cs_children);
LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
set->cs_ref = 1;
set->cs_flags = CPU_SET_ROOT;
cpuset_zero = set;
cpuset_root = &set->cs_mask;
/*
* Now derive a default, modifiable set from that to give out.
*/
set = uma_zalloc(cpuset_zone, M_WAITOK);
error = _cpuset_create(set, cpuset_zero, &cpuset_zero->cs_mask, 1);
KASSERT(error == 0, ("Error creating default set: %d\n", error));
/*
* Initialize the unit allocator. 0 and 1 are allocated above.
*/
cpuset_unr = new_unrhdr(2, INT_MAX, NULL);
return (set);
}
/*
* Create a cpuset, which would be cpuset_create() but
* mark the new 'set' as root.
*
* We are not going to reparent the td to it. Use cpuset_setproc_update_set()
* for that.
*
* In case of no error, returns the set in *setp locked with a reference.
*/
int
cpuset_create_root(struct prison *pr, struct cpuset **setp)
{
struct cpuset *set;
int error;
KASSERT(pr != NULL, ("[%s:%d] invalid pr", __func__, __LINE__));
KASSERT(setp != NULL, ("[%s:%d] invalid setp", __func__, __LINE__));
error = cpuset_create(setp, pr->pr_cpuset, &pr->pr_cpuset->cs_mask);
if (error)
return (error);
KASSERT(*setp != NULL, ("[%s:%d] cpuset_create returned invalid data",
__func__, __LINE__));
/* Mark the set as root. */
set = *setp;
set->cs_flags |= CPU_SET_ROOT;
return (0);
}
int
cpuset_setproc_update_set(struct proc *p, struct cpuset *set)
{
int error;
KASSERT(p != NULL, ("[%s:%d] invalid proc", __func__, __LINE__));
KASSERT(set != NULL, ("[%s:%d] invalid set", __func__, __LINE__));
cpuset_ref(set);
error = cpuset_setproc(p->p_pid, set, NULL);
if (error)
return (error);
cpuset_rel(set);
return (0);
}
/*
* This is called once the final set of system cpus is known. Modifies
* the root set and all children and marks the root read-only.
*/
static void
cpuset_init(void *arg)
{
cpuset_t mask;
mask = all_cpus;
if (cpuset_modify(cpuset_zero, &mask))
panic("Can't set initial cpuset mask.\n");
cpuset_zero->cs_flags |= CPU_SET_RDONLY;
}
SYSINIT(cpuset, SI_SUB_SMP, SI_ORDER_ANY, cpuset_init, NULL);
#ifndef _SYS_SYSPROTO_H_
struct cpuset_args {
cpusetid_t *setid;
};
#endif
int
-cpuset(struct thread *td, struct cpuset_args *uap)
+sys_cpuset(struct thread *td, struct cpuset_args *uap)
{
struct cpuset *root;
struct cpuset *set;
int error;
thread_lock(td);
root = cpuset_refroot(td->td_cpuset);
thread_unlock(td);
error = cpuset_create(&set, root, &root->cs_mask);
cpuset_rel(root);
if (error)
return (error);
error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id));
if (error == 0)
error = cpuset_setproc(-1, set, NULL);
cpuset_rel(set);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct cpuset_setid_args {
cpuwhich_t which;
id_t id;
cpusetid_t setid;
};
#endif
int
-cpuset_setid(struct thread *td, struct cpuset_setid_args *uap)
+sys_cpuset_setid(struct thread *td, struct cpuset_setid_args *uap)
{
struct cpuset *set;
int error;
/*
* Presently we only support per-process sets.
*/
if (uap->which != CPU_WHICH_PID)
return (EINVAL);
set = cpuset_lookup(uap->setid, td);
if (set == NULL)
return (ESRCH);
error = cpuset_setproc(uap->id, set, NULL);
cpuset_rel(set);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct cpuset_getid_args {
cpulevel_t level;
cpuwhich_t which;
id_t id;
cpusetid_t *setid;
};
#endif
int
-cpuset_getid(struct thread *td, struct cpuset_getid_args *uap)
+sys_cpuset_getid(struct thread *td, struct cpuset_getid_args *uap)
{
struct cpuset *nset;
struct cpuset *set;
struct thread *ttd;
struct proc *p;
cpusetid_t id;
int error;
if (uap->level == CPU_LEVEL_WHICH && uap->which != CPU_WHICH_CPUSET)
return (EINVAL);
error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
if (error)
return (error);
switch (uap->which) {
case CPU_WHICH_TID:
case CPU_WHICH_PID:
thread_lock(ttd);
set = cpuset_refbase(ttd->td_cpuset);
thread_unlock(ttd);
PROC_UNLOCK(p);
break;
case CPU_WHICH_CPUSET:
case CPU_WHICH_JAIL:
break;
case CPU_WHICH_IRQ:
return (EINVAL);
}
switch (uap->level) {
case CPU_LEVEL_ROOT:
nset = cpuset_refroot(set);
cpuset_rel(set);
set = nset;
break;
case CPU_LEVEL_CPUSET:
break;
case CPU_LEVEL_WHICH:
break;
}
id = set->cs_id;
cpuset_rel(set);
if (error == 0)
error = copyout(&id, uap->setid, sizeof(id));
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct cpuset_getaffinity_args {
cpulevel_t level;
cpuwhich_t which;
id_t id;
size_t cpusetsize;
cpuset_t *mask;
};
#endif
int
-cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap)
+sys_cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap)
{
struct thread *ttd;
struct cpuset *nset;
struct cpuset *set;
struct proc *p;
cpuset_t *mask;
int error;
size_t size;
if (uap->cpusetsize < sizeof(cpuset_t) ||
uap->cpusetsize > CPU_MAXSIZE / NBBY)
return (ERANGE);
size = uap->cpusetsize;
mask = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
if (error)
goto out;
switch (uap->level) {
case CPU_LEVEL_ROOT:
case CPU_LEVEL_CPUSET:
switch (uap->which) {
case CPU_WHICH_TID:
case CPU_WHICH_PID:
thread_lock(ttd);
set = cpuset_ref(ttd->td_cpuset);
thread_unlock(ttd);
break;
case CPU_WHICH_CPUSET:
case CPU_WHICH_JAIL:
break;
case CPU_WHICH_IRQ:
error = EINVAL;
goto out;
}
if (uap->level == CPU_LEVEL_ROOT)
nset = cpuset_refroot(set);
else
nset = cpuset_refbase(set);
CPU_COPY(&nset->cs_mask, mask);
cpuset_rel(nset);
break;
case CPU_LEVEL_WHICH:
switch (uap->which) {
case CPU_WHICH_TID:
thread_lock(ttd);
CPU_COPY(&ttd->td_cpuset->cs_mask, mask);
thread_unlock(ttd);
break;
case CPU_WHICH_PID:
FOREACH_THREAD_IN_PROC(p, ttd) {
thread_lock(ttd);
CPU_OR(mask, &ttd->td_cpuset->cs_mask);
thread_unlock(ttd);
}
break;
case CPU_WHICH_CPUSET:
case CPU_WHICH_JAIL:
CPU_COPY(&set->cs_mask, mask);
break;
case CPU_WHICH_IRQ:
error = intr_getaffinity(uap->id, mask);
break;
}
break;
default:
error = EINVAL;
break;
}
if (set)
cpuset_rel(set);
if (p)
PROC_UNLOCK(p);
if (error == 0)
error = copyout(mask, uap->mask, size);
out:
free(mask, M_TEMP);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct cpuset_setaffinity_args {
cpulevel_t level;
cpuwhich_t which;
id_t id;
size_t cpusetsize;
const cpuset_t *mask;
};
#endif
int
-cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap)
+sys_cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap)
{
struct cpuset *nset;
struct cpuset *set;
struct thread *ttd;
struct proc *p;
cpuset_t *mask;
int error;
if (uap->cpusetsize < sizeof(cpuset_t) ||
uap->cpusetsize > CPU_MAXSIZE / NBBY)
return (ERANGE);
mask = malloc(uap->cpusetsize, M_TEMP, M_WAITOK | M_ZERO);
error = copyin(uap->mask, mask, uap->cpusetsize);
if (error)
goto out;
/*
* Verify that no high bits are set.
*/
if (uap->cpusetsize > sizeof(cpuset_t)) {
char *end;
char *cp;
end = cp = (char *)&mask->__bits;
end += uap->cpusetsize;
cp += sizeof(cpuset_t);
while (cp != end)
if (*cp++ != 0) {
error = EINVAL;
goto out;
}
}
switch (uap->level) {
case CPU_LEVEL_ROOT:
case CPU_LEVEL_CPUSET:
error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
if (error)
break;
switch (uap->which) {
case CPU_WHICH_TID:
case CPU_WHICH_PID:
thread_lock(ttd);
set = cpuset_ref(ttd->td_cpuset);
thread_unlock(ttd);
PROC_UNLOCK(p);
break;
case CPU_WHICH_CPUSET:
case CPU_WHICH_JAIL:
break;
case CPU_WHICH_IRQ:
error = EINVAL;
goto out;
}
if (uap->level == CPU_LEVEL_ROOT)
nset = cpuset_refroot(set);
else
nset = cpuset_refbase(set);
error = cpuset_modify(nset, mask);
cpuset_rel(nset);
cpuset_rel(set);
break;
case CPU_LEVEL_WHICH:
switch (uap->which) {
case CPU_WHICH_TID:
error = cpuset_setthread(uap->id, mask);
break;
case CPU_WHICH_PID:
error = cpuset_setproc(uap->id, NULL, mask);
break;
case CPU_WHICH_CPUSET:
case CPU_WHICH_JAIL:
error = cpuset_which(uap->which, uap->id, &p,
&ttd, &set);
if (error == 0) {
error = cpuset_modify(set, mask);
cpuset_rel(set);
}
break;
case CPU_WHICH_IRQ:
error = intr_setaffinity(uap->id, mask);
break;
default:
error = EINVAL;
break;
}
break;
default:
error = EINVAL;
break;
}
out:
free(mask, M_TEMP);
return (error);
}
#ifdef DDB
DB_SHOW_COMMAND(cpusets, db_show_cpusets)
{
struct cpuset *set;
int cpu, once;
LIST_FOREACH(set, &cpuset_ids, cs_link) {
db_printf("set=%p id=%-6u ref=%-6d flags=0x%04x parent id=%d\n",
set, set->cs_id, set->cs_ref, set->cs_flags,
(set->cs_parent != NULL) ? set->cs_parent->cs_id : 0);
db_printf(" mask=");
for (once = 0, cpu = 0; cpu < CPU_SETSIZE; cpu++) {
if (CPU_ISSET(cpu, &set->cs_mask)) {
if (once == 0) {
db_printf("%d", cpu);
once = 1;
} else
db_printf(",%d", cpu);
}
}
db_printf("\n");
if (db_pager_quit)
break;
}
}
#endif /* DDB */
Index: head/sys/kern/kern_descrip.c
===================================================================
--- head/sys/kern/kern_descrip.c (revision 225616)
+++ head/sys/kern/kern_descrip.c (revision 225617)
@@ -1,3904 +1,3904 @@
/*-
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_capsicum.h"
#include "opt_compat.h"
#include "opt_ddb.h"
#include "opt_ktrace.h"
#include "opt_procdesc.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capability.h>
#include <sys/conf.h>
#include <sys/domain.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mqueue.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/selinfo.h>
#include <sys/pipe.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/procdesc.h>
#include <sys/protosw.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/tty.h>
#include <sys/unistd.h>
#include <sys/un.h>
#include <sys/unpcb.h>
#include <sys/user.h>
#include <sys/vnode.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <security/audit/audit.h>
#include <vm/uma.h>
#include <vm/vm.h>
#include <ddb/ddb.h>
static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
"file desc to leader structures");
static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
static uma_zone_t file_zone;
/* Flags for do_dup() */
#define DUP_FIXED 0x1 /* Force fixed allocation */
#define DUP_FCNTL 0x2 /* fcntl()-style errors */
static int do_dup(struct thread *td, int flags, int old, int new,
register_t *retval);
static int fd_first_free(struct filedesc *, int, int);
static int fd_last_used(struct filedesc *, int, int);
static void fdgrowtable(struct filedesc *, int);
static void fdunused(struct filedesc *fdp, int fd);
static void fdused(struct filedesc *fdp, int fd);
static int fill_vnode_info(struct vnode *vp, struct kinfo_file *kif);
static int fill_socket_info(struct socket *so, struct kinfo_file *kif);
static int fill_pts_info(struct tty *tp, struct kinfo_file *kif);
static int fill_pipe_info(struct pipe *pi, struct kinfo_file *kif);
static int fill_procdesc_info(struct procdesc *pdp,
struct kinfo_file *kif);
/*
* A process is initially started out with NDFILE descriptors stored within
* this structure, selected to be enough for typical applications based on
* the historical limit of 20 open files (and the usage of descriptors by
* shells). If these descriptors are exhausted, a larger descriptor table
* may be allocated, up to a process' resource limit; the internal arrays
* are then unused.
*/
#define NDFILE 20
#define NDSLOTSIZE sizeof(NDSLOTTYPE)
#define NDENTRIES (NDSLOTSIZE * __CHAR_BIT)
#define NDSLOT(x) ((x) / NDENTRIES)
#define NDBIT(x) ((NDSLOTTYPE)1 << ((x) % NDENTRIES))
#define NDSLOTS(x) (((x) + NDENTRIES - 1) / NDENTRIES)
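/*
 * Editor's note (worked example, not part of this revision): assuming
 * NDSLOTTYPE is a 64-bit u_long as on amd64, NDSLOTSIZE is 8 and
 * NDENTRIES is 64, so descriptor 70 lives in bitmap word
 * NDSLOT(70) = 70 / 64 = 1 at bit NDBIT(70) = 1 << (70 % 64) = 1 << 6,
 * and a table of 70 descriptors needs NDSLOTS(70) = (70 + 63) / 64 = 2
 * bitmap words.
 */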
/*
* Storage required per open file descriptor.
*/
#define OFILESIZE (sizeof(struct file *) + sizeof(char))
/*
* Storage to hold unused ofiles that need to be reclaimed.
*/
struct freetable {
struct file **ft_table;
SLIST_ENTRY(freetable) ft_next;
};
/*
* Basic allocation of descriptors:
* one of the above, plus arrays for NDFILE descriptors.
*/
struct filedesc0 {
struct filedesc fd_fd;
/*
* ofiles which need to be reclaimed on free.
*/
SLIST_HEAD(,freetable) fd_free;
/*
* These arrays are used when the number of open files is
* <= NDFILE, and are then pointed to by the pointers above.
*/
struct file *fd_dfiles[NDFILE];
char fd_dfileflags[NDFILE];
NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)];
};
/*
* Descriptor management.
*/
volatile int openfiles; /* actual number of open files */
struct mtx sigio_lock; /* mtx to protect pointers to sigio */
void (*mq_fdclose)(struct thread *td, int fd, struct file *fp);
/* A mutex to protect the association between a proc and filedesc. */
static struct mtx fdesc_mtx;
/*
* Find the first zero bit in the given bitmap, starting at low and not
* exceeding size - 1.
*/
static int
fd_first_free(struct filedesc *fdp, int low, int size)
{
NDSLOTTYPE *map = fdp->fd_map;
NDSLOTTYPE mask;
int off, maxoff;
if (low >= size)
return (low);
off = NDSLOT(low);
if (low % NDENTRIES) {
mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES)));
if ((mask &= ~map[off]) != 0UL)
return (off * NDENTRIES + ffsl(mask) - 1);
++off;
}
for (maxoff = NDSLOTS(size); off < maxoff; ++off)
if (map[off] != ~0UL)
return (off * NDENTRIES + ffsl(~map[off]) - 1);
return (size);
}
/*
* Find the highest non-zero bit in the given bitmap, starting at low and
* not exceeding size - 1.
*/
static int
fd_last_used(struct filedesc *fdp, int low, int size)
{
NDSLOTTYPE *map = fdp->fd_map;
NDSLOTTYPE mask;
int off, minoff;
if (low >= size)
return (-1);
off = NDSLOT(size);
if (size % NDENTRIES) {
mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES));
if ((mask &= map[off]) != 0)
return (off * NDENTRIES + flsl(mask) - 1);
--off;
}
for (minoff = NDSLOT(low); off >= minoff; --off)
if (map[off] != 0)
return (off * NDENTRIES + flsl(map[off]) - 1);
return (low - 1);
}
static int
fdisused(struct filedesc *fdp, int fd)
{
KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
}
/*
* Mark a file descriptor as used.
*/
static void
fdused(struct filedesc *fdp, int fd)
{
FILEDESC_XLOCK_ASSERT(fdp);
KASSERT(!fdisused(fdp, fd),
("fd already used"));
fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
if (fd > fdp->fd_lastfile)
fdp->fd_lastfile = fd;
if (fd == fdp->fd_freefile)
fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles);
}
/*
* Mark a file descriptor as unused.
*/
static void
fdunused(struct filedesc *fdp, int fd)
{
FILEDESC_XLOCK_ASSERT(fdp);
KASSERT(fdisused(fdp, fd),
("fd is already unused"));
KASSERT(fdp->fd_ofiles[fd] == NULL,
("fd is still in use"));
fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
if (fd < fdp->fd_freefile)
fdp->fd_freefile = fd;
if (fd == fdp->fd_lastfile)
fdp->fd_lastfile = fd_last_used(fdp, 0, fd);
}
/*
* System calls on descriptors.
*/
#ifndef _SYS_SYSPROTO_H_
struct getdtablesize_args {
int dummy;
};
#endif
/* ARGSUSED */
int
-getdtablesize(struct thread *td, struct getdtablesize_args *uap)
+sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap)
{
struct proc *p = td->td_proc;
uint64_t lim;
PROC_LOCK(p);
td->td_retval[0] =
min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
lim = racct_get_limit(td->td_proc, RACCT_NOFILE);
PROC_UNLOCK(p);
if (lim < td->td_retval[0])
td->td_retval[0] = lim;
return (0);
}
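/*
 * Editor's note: an illustrative userland sketch (not part of this
 * revision) comparing the limits reconciled by sys_getdtablesize() above:
 * the RLIMIT_NOFILE soft limit versus the value getdtablesize(2) reports.
 */
#include <sys/types.h>
#include <sys/resource.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct rlimit rl;

	if (getrlimit(RLIMIT_NOFILE, &rl) == 0)
		printf("RLIMIT_NOFILE soft limit: %ju\n",
		    (uintmax_t)rl.rlim_cur);
	printf("getdtablesize(): %d\n", getdtablesize());
	return (0);
}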
/*
* Duplicate a file descriptor to a particular value.
*
* Note: keep in mind that a potential race condition exists when closing
* descriptors from a shared descriptor table (via rfork).
*/
#ifndef _SYS_SYSPROTO_H_
struct dup2_args {
u_int from;
u_int to;
};
#endif
/* ARGSUSED */
int
-dup2(struct thread *td, struct dup2_args *uap)
+sys_dup2(struct thread *td, struct dup2_args *uap)
{
return (do_dup(td, DUP_FIXED, (int)uap->from, (int)uap->to,
td->td_retval));
}
/*
* Duplicate a file descriptor.
*/
#ifndef _SYS_SYSPROTO_H_
struct dup_args {
u_int fd;
};
#endif
/* ARGSUSED */
int
-dup(struct thread *td, struct dup_args *uap)
+sys_dup(struct thread *td, struct dup_args *uap)
{
return (do_dup(td, 0, (int)uap->fd, 0, td->td_retval));
}
/*
* The file control system call.
*/
#ifndef _SYS_SYSPROTO_H_
struct fcntl_args {
int fd;
int cmd;
long arg;
};
#endif
/* ARGSUSED */
int
-fcntl(struct thread *td, struct fcntl_args *uap)
+sys_fcntl(struct thread *td, struct fcntl_args *uap)
{
struct flock fl;
struct oflock ofl;
intptr_t arg;
int error;
int cmd;
error = 0;
cmd = uap->cmd;
switch (uap->cmd) {
case F_OGETLK:
case F_OSETLK:
case F_OSETLKW:
/*
* Convert old flock structure to new.
*/
error = copyin((void *)(intptr_t)uap->arg, &ofl, sizeof(ofl));
fl.l_start = ofl.l_start;
fl.l_len = ofl.l_len;
fl.l_pid = ofl.l_pid;
fl.l_type = ofl.l_type;
fl.l_whence = ofl.l_whence;
fl.l_sysid = 0;
switch (uap->cmd) {
case F_OGETLK:
cmd = F_GETLK;
break;
case F_OSETLK:
cmd = F_SETLK;
break;
case F_OSETLKW:
cmd = F_SETLKW;
break;
}
arg = (intptr_t)&fl;
break;
case F_GETLK:
case F_SETLK:
case F_SETLKW:
case F_SETLK_REMOTE:
error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl));
arg = (intptr_t)&fl;
break;
default:
arg = uap->arg;
break;
}
if (error)
return (error);
error = kern_fcntl(td, uap->fd, cmd, arg);
if (error)
return (error);
if (uap->cmd == F_OGETLK) {
ofl.l_start = fl.l_start;
ofl.l_len = fl.l_len;
ofl.l_pid = fl.l_pid;
ofl.l_type = fl.l_type;
ofl.l_whence = fl.l_whence;
error = copyout(&ofl, (void *)(intptr_t)uap->arg, sizeof(ofl));
} else if (uap->cmd == F_GETLK) {
error = copyout(&fl, (void *)(intptr_t)uap->arg, sizeof(fl));
}
return (error);
}
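/*
 * Editor's note: a minimal userland sketch (not part of this revision) of
 * the modern F_SETLK path handled by sys_fcntl()/kern_fcntl() above; the
 * file name is an illustrative assumption.
 */
#include <fcntl.h>
#include <unistd.h>
#include <err.h>

int
main(void)
{
	struct flock fl;
	int fd;

	if ((fd = open("/tmp/lockfile", O_RDWR | O_CREAT, 0644)) < 0)
		err(1, "open");
	fl.l_start = 0;			/* lock the whole file ... */
	fl.l_len = 0;
	fl.l_whence = SEEK_SET;		/* ... from its beginning */
	fl.l_type = F_WRLCK;		/* exclusive (write) lock */
	fl.l_pid = 0;
	fl.l_sysid = 0;
	if (fcntl(fd, F_SETLK, &fl) == -1)
		err(1, "fcntl(F_SETLK)");
	/* ... critical section ... */
	fl.l_type = F_UNLCK;
	(void)fcntl(fd, F_SETLK, &fl);
	close(fd);
	return (0);
}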
static inline struct file *
fdtofp(int fd, struct filedesc *fdp)
{
struct file *fp;
FILEDESC_LOCK_ASSERT(fdp);
if ((unsigned)fd >= fdp->fd_nfiles ||
(fp = fdp->fd_ofiles[fd]) == NULL)
return (NULL);
return (fp);
}
static inline int
fdunwrap(int fd, cap_rights_t rights, struct filedesc *fdp, struct file **fpp)
{
*fpp = fdtofp(fd, fdp);
if (*fpp == NULL)
return (EBADF);
#ifdef CAPABILITIES
if ((*fpp)->f_type == DTYPE_CAPABILITY) {
int err = cap_funwrap(*fpp, rights, fpp);
if (err != 0) {
*fpp = NULL;
return (err);
}
}
#endif /* CAPABILITIES */
return (0);
}
int
kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
{
struct filedesc *fdp;
struct flock *flp;
struct file *fp;
struct proc *p;
char *pop;
struct vnode *vp;
int error, flg, tmp;
int vfslocked;
u_int old, new;
uint64_t bsize;
vfslocked = 0;
error = 0;
flg = F_POSIX;
p = td->td_proc;
fdp = p->p_fd;
switch (cmd) {
case F_DUPFD:
tmp = arg;
error = do_dup(td, DUP_FCNTL, fd, tmp, td->td_retval);
break;
case F_DUP2FD:
tmp = arg;
error = do_dup(td, DUP_FIXED, fd, tmp, td->td_retval);
break;
case F_GETFD:
FILEDESC_SLOCK(fdp);
if ((fp = fdtofp(fd, fdp)) == NULL) {
FILEDESC_SUNLOCK(fdp);
error = EBADF;
break;
}
pop = &fdp->fd_ofileflags[fd];
td->td_retval[0] = (*pop & UF_EXCLOSE) ? FD_CLOEXEC : 0;
FILEDESC_SUNLOCK(fdp);
break;
case F_SETFD:
FILEDESC_XLOCK(fdp);
if ((fp = fdtofp(fd, fdp)) == NULL) {
FILEDESC_XUNLOCK(fdp);
error = EBADF;
break;
}
pop = &fdp->fd_ofileflags[fd];
*pop = (*pop &~ UF_EXCLOSE) |
(arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
FILEDESC_XUNLOCK(fdp);
break;
case F_GETFL:
FILEDESC_SLOCK(fdp);
error = fdunwrap(fd, CAP_FCNTL, fdp, &fp);
if (error != 0) {
FILEDESC_SUNLOCK(fdp);
break;
}
td->td_retval[0] = OFLAGS(fp->f_flag);
FILEDESC_SUNLOCK(fdp);
break;
case F_SETFL:
FILEDESC_SLOCK(fdp);
error = fdunwrap(fd, CAP_FCNTL, fdp, &fp);
if (error != 0) {
FILEDESC_SUNLOCK(fdp);
break;
}
fhold(fp);
FILEDESC_SUNLOCK(fdp);
do {
tmp = flg = fp->f_flag;
tmp &= ~FCNTLFLAGS;
tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
} while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
tmp = fp->f_flag & FNONBLOCK;
error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
if (error) {
fdrop(fp, td);
break;
}
tmp = fp->f_flag & FASYNC;
error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
if (error == 0) {
fdrop(fp, td);
break;
}
atomic_clear_int(&fp->f_flag, FNONBLOCK);
tmp = 0;
(void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
fdrop(fp, td);
break;
case F_GETOWN:
FILEDESC_SLOCK(fdp);
error = fdunwrap(fd, CAP_FCNTL, fdp, &fp);
if (error != 0) {
FILEDESC_SUNLOCK(fdp);
break;
}
fhold(fp);
FILEDESC_SUNLOCK(fdp);
error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
if (error == 0)
td->td_retval[0] = tmp;
fdrop(fp, td);
break;
case F_SETOWN:
FILEDESC_SLOCK(fdp);
error = fdunwrap(fd, CAP_FCNTL, fdp, &fp);
if (error != 0) {
FILEDESC_SUNLOCK(fdp);
break;
}
fhold(fp);
FILEDESC_SUNLOCK(fdp);
tmp = arg;
error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
fdrop(fp, td);
break;
case F_SETLK_REMOTE:
error = priv_check(td, PRIV_NFS_LOCKD);
if (error)
return (error);
flg = F_REMOTE;
goto do_setlk;
case F_SETLKW:
flg |= F_WAIT;
/* FALLTHROUGH F_SETLK */
case F_SETLK:
do_setlk:
FILEDESC_SLOCK(fdp);
error = fdunwrap(fd, CAP_FLOCK, fdp, &fp);
if (error != 0) {
FILEDESC_SUNLOCK(fdp);
break;
}
if (fp->f_type != DTYPE_VNODE) {
FILEDESC_SUNLOCK(fdp);
error = EBADF;
break;
}
flp = (struct flock *)arg;
if (flp->l_whence == SEEK_CUR) {
if (fp->f_offset < 0 ||
(flp->l_start > 0 &&
fp->f_offset > OFF_MAX - flp->l_start)) {
FILEDESC_SUNLOCK(fdp);
error = EOVERFLOW;
break;
}
flp->l_start += fp->f_offset;
}
/*
* VOP_ADVLOCK() may block.
*/
fhold(fp);
FILEDESC_SUNLOCK(fdp);
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
switch (flp->l_type) {
case F_RDLCK:
if ((fp->f_flag & FREAD) == 0) {
error = EBADF;
break;
}
PROC_LOCK(p->p_leader);
p->p_leader->p_flag |= P_ADVLOCK;
PROC_UNLOCK(p->p_leader);
error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
flp, flg);
break;
case F_WRLCK:
if ((fp->f_flag & FWRITE) == 0) {
error = EBADF;
break;
}
PROC_LOCK(p->p_leader);
p->p_leader->p_flag |= P_ADVLOCK;
PROC_UNLOCK(p->p_leader);
error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
flp, flg);
break;
case F_UNLCK:
error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
flp, flg);
break;
case F_UNLCKSYS:
/*
* Temporary api for testing remote lock
* infrastructure.
*/
if (flg != F_REMOTE) {
error = EINVAL;
break;
}
error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
F_UNLCKSYS, flp, flg);
break;
default:
error = EINVAL;
break;
}
VFS_UNLOCK_GIANT(vfslocked);
vfslocked = 0;
/* Check for race with close */
FILEDESC_SLOCK(fdp);
if ((unsigned) fd >= fdp->fd_nfiles ||
fp != fdp->fd_ofiles[fd]) {
FILEDESC_SUNLOCK(fdp);
flp->l_whence = SEEK_SET;
flp->l_start = 0;
flp->l_len = 0;
flp->l_type = F_UNLCK;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
F_UNLCK, flp, F_POSIX);
VFS_UNLOCK_GIANT(vfslocked);
vfslocked = 0;
} else
FILEDESC_SUNLOCK(fdp);
fdrop(fp, td);
break;
case F_GETLK:
FILEDESC_SLOCK(fdp);
error = fdunwrap(fd, CAP_FLOCK, fdp, &fp);
if (error != 0) {
FILEDESC_SUNLOCK(fdp);
break;
}
if (fp->f_type != DTYPE_VNODE) {
FILEDESC_SUNLOCK(fdp);
error = EBADF;
break;
}
flp = (struct flock *)arg;
if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
flp->l_type != F_UNLCK) {
FILEDESC_SUNLOCK(fdp);
error = EINVAL;
break;
}
if (flp->l_whence == SEEK_CUR) {
if ((flp->l_start > 0 &&
fp->f_offset > OFF_MAX - flp->l_start) ||
(flp->l_start < 0 &&
fp->f_offset < OFF_MIN - flp->l_start)) {
FILEDESC_SUNLOCK(fdp);
error = EOVERFLOW;
break;
}
flp->l_start += fp->f_offset;
}
/*
* VOP_ADVLOCK() may block.
*/
fhold(fp);
FILEDESC_SUNLOCK(fdp);
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
F_POSIX);
VFS_UNLOCK_GIANT(vfslocked);
vfslocked = 0;
fdrop(fp, td);
break;
case F_RDAHEAD:
arg = arg ? 128 * 1024: 0;
/* FALLTHROUGH */
case F_READAHEAD:
FILEDESC_SLOCK(fdp);
if ((fp = fdtofp(fd, fdp)) == NULL) {
FILEDESC_SUNLOCK(fdp);
error = EBADF;
break;
}
if (fp->f_type != DTYPE_VNODE) {
FILEDESC_SUNLOCK(fdp);
error = EBADF;
break;
}
fhold(fp);
FILEDESC_SUNLOCK(fdp);
if (arg != 0) {
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
error = vn_lock(vp, LK_SHARED);
if (error != 0)
goto readahead_vnlock_fail;
bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize;
VOP_UNLOCK(vp, 0);
fp->f_seqcount = (arg + bsize - 1) / bsize;
do {
new = old = fp->f_flag;
new |= FRDAHEAD;
} while (!atomic_cmpset_rel_int(&fp->f_flag, old, new));
readahead_vnlock_fail:
VFS_UNLOCK_GIANT(vfslocked);
vfslocked = 0;
} else {
do {
new = old = fp->f_flag;
new &= ~FRDAHEAD;
} while (!atomic_cmpset_rel_int(&fp->f_flag, old, new));
}
fdrop(fp, td);
break;
default:
error = EINVAL;
break;
}
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD).
*/
static int
do_dup(struct thread *td, int flags, int old, int new,
register_t *retval)
{
struct filedesc *fdp;
struct proc *p;
struct file *fp;
struct file *delfp;
int error, holdleaders, maxfd;
p = td->td_proc;
fdp = p->p_fd;
/*
* Verify we have a valid descriptor to dup from and possibly to
* dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should
* return EINVAL when the new descriptor is out of bounds.
*/
if (old < 0)
return (EBADF);
if (new < 0)
return (flags & DUP_FCNTL ? EINVAL : EBADF);
PROC_LOCK(p);
maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
PROC_UNLOCK(p);
if (new >= maxfd)
return (flags & DUP_FCNTL ? EINVAL : EMFILE);
FILEDESC_XLOCK(fdp);
if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) {
FILEDESC_XUNLOCK(fdp);
return (EBADF);
}
if (flags & DUP_FIXED && old == new) {
*retval = new;
FILEDESC_XUNLOCK(fdp);
return (0);
}
fp = fdp->fd_ofiles[old];
fhold(fp);
/*
* If the caller specified a file descriptor, make sure the file
* table is large enough to hold it, and grab it. Otherwise, just
* allocate a new descriptor the usual way. Since the filedesc
* lock may be temporarily dropped in the process, we have to look
* out for a race.
*/
if (flags & DUP_FIXED) {
if (new >= fdp->fd_nfiles) {
/*
* The resource limits are here instead of e.g. fdalloc(),
* because the file descriptor table may be shared between
* processes, so we can't really use racct_add()/racct_sub().
* Instead of counting the number of actually allocated
* descriptors, just put the limit on the size of the file
* descriptor table.
*/
#ifdef RACCT
PROC_LOCK(p);
error = racct_set(p, RACCT_NOFILE, new + 1);
PROC_UNLOCK(p);
if (error != 0) {
FILEDESC_XUNLOCK(fdp);
fdrop(fp, td);
return (EMFILE);
}
#endif
fdgrowtable(fdp, new + 1);
}
if (fdp->fd_ofiles[new] == NULL)
fdused(fdp, new);
} else {
if ((error = fdalloc(td, new, &new)) != 0) {
FILEDESC_XUNLOCK(fdp);
fdrop(fp, td);
return (error);
}
}
/*
* If the old file changed out from under us then treat it as a
* bad file descriptor. Userland should do its own locking to
* avoid this case.
*/
if (fdp->fd_ofiles[old] != fp) {
/* we've allocated a descriptor which we won't use */
if (fdp->fd_ofiles[new] == NULL)
fdunused(fdp, new);
FILEDESC_XUNLOCK(fdp);
fdrop(fp, td);
return (EBADF);
}
KASSERT(old != new,
("new fd is same as old"));
/*
* Save info on the descriptor being overwritten. We cannot close
* it without introducing an ownership race for the slot, since we
* need to drop the filedesc lock to call closef().
*
* XXX this duplicates parts of close().
*/
delfp = fdp->fd_ofiles[new];
holdleaders = 0;
if (delfp != NULL) {
if (td->td_proc->p_fdtol != NULL) {
/*
* Ask fdfree() to sleep to ensure that all relevant
* process leaders can be traversed in closef().
*/
fdp->fd_holdleaderscount++;
holdleaders = 1;
}
}
/*
* Duplicate the source descriptor
*/
fdp->fd_ofiles[new] = fp;
fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE;
if (new > fdp->fd_lastfile)
fdp->fd_lastfile = new;
*retval = new;
/*
* If we dup'd over a valid file, we now own the reference to it
* and must dispose of it using closef() semantics (as if a
* close() were performed on it).
*
* XXX this duplicates parts of close().
*/
if (delfp != NULL) {
knote_fdclose(td, new);
if (delfp->f_type == DTYPE_MQUEUE)
mq_fdclose(td, new, delfp);
FILEDESC_XUNLOCK(fdp);
(void) closef(delfp, td);
if (holdleaders) {
FILEDESC_XLOCK(fdp);
fdp->fd_holdleaderscount--;
if (fdp->fd_holdleaderscount == 0 &&
fdp->fd_holdleaderswakeup != 0) {
fdp->fd_holdleaderswakeup = 0;
wakeup(&fdp->fd_holdleaderscount);
}
FILEDESC_XUNLOCK(fdp);
}
} else {
FILEDESC_XUNLOCK(fdp);
}
return (0);
}
/*
* If sigio is on the list associated with a process or process group,
* disable signalling from the device, remove sigio from the list and
* free sigio.
*/
void
funsetown(struct sigio **sigiop)
{
struct sigio *sigio;
SIGIO_LOCK();
sigio = *sigiop;
if (sigio == NULL) {
SIGIO_UNLOCK();
return;
}
*(sigio->sio_myref) = NULL;
if ((sigio)->sio_pgid < 0) {
struct pgrp *pg = (sigio)->sio_pgrp;
PGRP_LOCK(pg);
SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
sigio, sio_pgsigio);
PGRP_UNLOCK(pg);
} else {
struct proc *p = (sigio)->sio_proc;
PROC_LOCK(p);
SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
sigio, sio_pgsigio);
PROC_UNLOCK(p);
}
SIGIO_UNLOCK();
crfree(sigio->sio_ucred);
free(sigio, M_SIGIO);
}
/*
* Free a list of sigio structures.
* We only need to lock the SIGIO_LOCK because we have made ourselves
* inaccessible to callers of fsetown and therefore do not need to lock
* the proc or pgrp struct for the list manipulation.
*/
void
funsetownlst(struct sigiolst *sigiolst)
{
struct proc *p;
struct pgrp *pg;
struct sigio *sigio;
sigio = SLIST_FIRST(sigiolst);
if (sigio == NULL)
return;
p = NULL;
pg = NULL;
/*
* Every entry of the list should belong
* to a single proc or pgrp.
*/
if (sigio->sio_pgid < 0) {
pg = sigio->sio_pgrp;
PGRP_LOCK_ASSERT(pg, MA_NOTOWNED);
} else /* if (sigio->sio_pgid > 0) */ {
p = sigio->sio_proc;
PROC_LOCK_ASSERT(p, MA_NOTOWNED);
}
SIGIO_LOCK();
while ((sigio = SLIST_FIRST(sigiolst)) != NULL) {
*(sigio->sio_myref) = NULL;
if (pg != NULL) {
KASSERT(sigio->sio_pgid < 0,
("Proc sigio in pgrp sigio list"));
KASSERT(sigio->sio_pgrp == pg,
("Bogus pgrp in sigio list"));
PGRP_LOCK(pg);
SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio,
sio_pgsigio);
PGRP_UNLOCK(pg);
} else /* if (p != NULL) */ {
KASSERT(sigio->sio_pgid > 0,
("Pgrp sigio in proc sigio list"));
KASSERT(sigio->sio_proc == p,
("Bogus proc in sigio list"));
PROC_LOCK(p);
SLIST_REMOVE(&p->p_sigiolst, sigio, sigio,
sio_pgsigio);
PROC_UNLOCK(p);
}
SIGIO_UNLOCK();
crfree(sigio->sio_ucred);
free(sigio, M_SIGIO);
SIGIO_LOCK();
}
SIGIO_UNLOCK();
}
/*
* This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
*
* After permission checking, add a sigio structure to the sigio list for
* the process or process group.
*/
int
fsetown(pid_t pgid, struct sigio **sigiop)
{
struct proc *proc;
struct pgrp *pgrp;
struct sigio *sigio;
int ret;
if (pgid == 0) {
funsetown(sigiop);
return (0);
}
ret = 0;
/* Allocate and fill in the new sigio out of locks. */
sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK);
sigio->sio_pgid = pgid;
sigio->sio_ucred = crhold(curthread->td_ucred);
sigio->sio_myref = sigiop;
sx_slock(&proctree_lock);
if (pgid > 0) {
proc = pfind(pgid);
if (proc == NULL) {
ret = ESRCH;
goto fail;
}
/*
* Policy - Don't allow a process to FSETOWN a process
* in another session.
*
* Remove this test to allow maximum flexibility or
* restrict FSETOWN to the current process or process
* group for maximum safety.
*/
PROC_UNLOCK(proc);
if (proc->p_session != curthread->td_proc->p_session) {
ret = EPERM;
goto fail;
}
pgrp = NULL;
} else /* if (pgid < 0) */ {
pgrp = pgfind(-pgid);
if (pgrp == NULL) {
ret = ESRCH;
goto fail;
}
PGRP_UNLOCK(pgrp);
/*
* Policy - Don't allow a process to FSETOWN a process
* in another session.
*
* Remove this test to allow maximum flexibility or
* restrict FSETOWN to the current process or process
* group for maximum safety.
*/
if (pgrp->pg_session != curthread->td_proc->p_session) {
ret = EPERM;
goto fail;
}
proc = NULL;
}
funsetown(sigiop);
if (pgid > 0) {
PROC_LOCK(proc);
/*
* Since funsetownlst() is called without the proctree
* locked, we need to check for P_WEXIT.
* XXX: is ESRCH correct?
*/
if ((proc->p_flag & P_WEXIT) != 0) {
PROC_UNLOCK(proc);
ret = ESRCH;
goto fail;
}
SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
sigio->sio_proc = proc;
PROC_UNLOCK(proc);
} else {
PGRP_LOCK(pgrp);
SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
sigio->sio_pgrp = pgrp;
PGRP_UNLOCK(pgrp);
}
sx_sunlock(&proctree_lock);
SIGIO_LOCK();
*sigiop = sigio;
SIGIO_UNLOCK();
return (0);
fail:
sx_sunlock(&proctree_lock);
crfree(sigio->sio_ucred);
free(sigio, M_SIGIO);
return (ret);
}
/*
* This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
*/
pid_t
fgetown(sigiop)
struct sigio **sigiop;
{
pid_t pgid;
SIGIO_LOCK();
pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0;
SIGIO_UNLOCK();
return (pgid);
}
/*
* Close a file descriptor.
*/
#ifndef _SYS_SYSPROTO_H_
struct close_args {
int fd;
};
#endif
/* ARGSUSED */
int
-close(td, uap)
+sys_close(td, uap)
struct thread *td;
struct close_args *uap;
{
return (kern_close(td, uap->fd));
}
int
kern_close(td, fd)
struct thread *td;
int fd;
{
struct filedesc *fdp;
struct file *fp, *fp_object;
int error;
int holdleaders;
error = 0;
holdleaders = 0;
fdp = td->td_proc->p_fd;
AUDIT_SYSCLOSE(td, fd);
FILEDESC_XLOCK(fdp);
if ((unsigned)fd >= fdp->fd_nfiles ||
(fp = fdp->fd_ofiles[fd]) == NULL) {
FILEDESC_XUNLOCK(fdp);
return (EBADF);
}
fdp->fd_ofiles[fd] = NULL;
fdp->fd_ofileflags[fd] = 0;
fdunused(fdp, fd);
if (td->td_proc->p_fdtol != NULL) {
/*
* Ask fdfree() to sleep to ensure that all relevant
* process leaders can be traversed in closef().
*/
fdp->fd_holdleaderscount++;
holdleaders = 1;
}
/*
* We now hold the fp reference that used to be owned by the
* descriptor array. We have to unlock the FILEDESC *AFTER*
* knote_fdclose to prevent a race of the fd getting opened, a knote
* added, and deleting a knote for the new fd.
*/
knote_fdclose(td, fd);
/*
* When we're closing an fd with a capability, we need to notify
* mqueue if the underlying object is of type mqueue.
*/
(void)cap_funwrap(fp, 0, &fp_object);
if (fp_object->f_type == DTYPE_MQUEUE)
mq_fdclose(td, fd, fp_object);
FILEDESC_XUNLOCK(fdp);
error = closef(fp, td);
if (holdleaders) {
FILEDESC_XLOCK(fdp);
fdp->fd_holdleaderscount--;
if (fdp->fd_holdleaderscount == 0 &&
fdp->fd_holdleaderswakeup != 0) {
fdp->fd_holdleaderswakeup = 0;
wakeup(&fdp->fd_holdleaderscount);
}
FILEDESC_XUNLOCK(fdp);
}
return (error);
}
/*
* Close open file descriptors.
*/
#ifndef _SYS_SYSPROTO_H_
struct closefrom_args {
int lowfd;
};
#endif
/* ARGSUSED */
int
-closefrom(struct thread *td, struct closefrom_args *uap)
+sys_closefrom(struct thread *td, struct closefrom_args *uap)
{
struct filedesc *fdp;
int fd;
fdp = td->td_proc->p_fd;
AUDIT_ARG_FD(uap->lowfd);
/*
* Treat negative starting file descriptor values identical to
* closefrom(0) which closes all files.
*/
if (uap->lowfd < 0)
uap->lowfd = 0;
FILEDESC_SLOCK(fdp);
for (fd = uap->lowfd; fd < fdp->fd_nfiles; fd++) {
if (fdp->fd_ofiles[fd] != NULL) {
FILEDESC_SUNLOCK(fdp);
(void)kern_close(td, fd);
FILEDESC_SLOCK(fdp);
}
}
FILEDESC_SUNLOCK(fdp);
return (0);
}
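/*
 * Editor's note: an illustrative userland sketch (not part of this
 * revision) of the common idiom served by sys_closefrom() above: keep the
 * standard descriptors and close every other inherited one, e.g. before
 * exec'ing a helper.
 */
#include <unistd.h>

static void
close_inherited(void)
{
	/* Descriptors 0-2 stay open; 3 and above are closed. */
	closefrom(3);
}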
#if defined(COMPAT_43)
/*
* Return status information about a file descriptor.
*/
#ifndef _SYS_SYSPROTO_H_
struct ofstat_args {
int fd;
struct ostat *sb;
};
#endif
/* ARGSUSED */
int
ofstat(struct thread *td, struct ofstat_args *uap)
{
struct ostat oub;
struct stat ub;
int error;
error = kern_fstat(td, uap->fd, &ub);
if (error == 0) {
cvtstat(&ub, &oub);
error = copyout(&oub, uap->sb, sizeof(oub));
}
return (error);
}
#endif /* COMPAT_43 */
/*
* Return status information about a file descriptor.
*/
#ifndef _SYS_SYSPROTO_H_
struct fstat_args {
int fd;
struct stat *sb;
};
#endif
/* ARGSUSED */
int
-fstat(struct thread *td, struct fstat_args *uap)
+sys_fstat(struct thread *td, struct fstat_args *uap)
{
struct stat ub;
int error;
error = kern_fstat(td, uap->fd, &ub);
if (error == 0)
error = copyout(&ub, uap->sb, sizeof(ub));
return (error);
}
int
kern_fstat(struct thread *td, int fd, struct stat *sbp)
{
struct file *fp;
int error;
AUDIT_ARG_FD(fd);
if ((error = fget(td, fd, CAP_FSTAT, &fp)) != 0)
return (error);
AUDIT_ARG_FILE(td->td_proc, fp);
error = fo_stat(fp, sbp, td->td_ucred, td);
fdrop(fp, td);
#ifdef KTRACE
if (error == 0 && KTRPOINT(td, KTR_STRUCT))
ktrstat(sbp);
#endif
return (error);
}
/*
* Return status information about a file descriptor.
*/
#ifndef _SYS_SYSPROTO_H_
struct nfstat_args {
int fd;
struct nstat *sb;
};
#endif
/* ARGSUSED */
int
-nfstat(struct thread *td, struct nfstat_args *uap)
+sys_nfstat(struct thread *td, struct nfstat_args *uap)
{
struct nstat nub;
struct stat ub;
int error;
error = kern_fstat(td, uap->fd, &ub);
if (error == 0) {
cvtnstat(&ub, &nub);
error = copyout(&nub, uap->sb, sizeof(nub));
}
return (error);
}
/*
* Return pathconf information about a file descriptor.
*/
#ifndef _SYS_SYSPROTO_H_
struct fpathconf_args {
int fd;
int name;
};
#endif
/* ARGSUSED */
int
-fpathconf(struct thread *td, struct fpathconf_args *uap)
+sys_fpathconf(struct thread *td, struct fpathconf_args *uap)
{
struct file *fp;
struct vnode *vp;
int error;
if ((error = fget(td, uap->fd, CAP_FPATHCONF, &fp)) != 0)
return (error);
/* If asynchronous I/O is available, it works for all descriptors. */
if (uap->name == _PC_ASYNC_IO) {
td->td_retval[0] = async_io_version;
goto out;
}
vp = fp->f_vnode;
if (vp != NULL) {
int vfslocked;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_lock(vp, LK_SHARED | LK_RETRY);
error = VOP_PATHCONF(vp, uap->name, td->td_retval);
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
} else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) {
if (uap->name != _PC_PIPE_BUF) {
error = EINVAL;
} else {
td->td_retval[0] = PIPE_BUF;
error = 0;
}
} else {
error = EOPNOTSUPP;
}
out:
fdrop(fp, td);
return (error);
}
/*
* Grow the file table to accommodate (at least) nfd descriptors. This may
* block and drop the filedesc lock, but it will reacquire it before
* returning.
*/
static void
fdgrowtable(struct filedesc *fdp, int nfd)
{
struct filedesc0 *fdp0;
struct freetable *fo;
struct file **ntable;
struct file **otable;
char *nfileflags;
int nnfiles, onfiles;
NDSLOTTYPE *nmap;
FILEDESC_XLOCK_ASSERT(fdp);
KASSERT(fdp->fd_nfiles > 0,
("zero-length file table"));
/* compute the size of the new table */
onfiles = fdp->fd_nfiles;
nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */
if (nnfiles <= onfiles)
/* the table is already large enough */
return;
/* allocate a new table and (if required) new bitmaps */
FILEDESC_XUNLOCK(fdp);
ntable = malloc((nnfiles * OFILESIZE) + sizeof(struct freetable),
M_FILEDESC, M_ZERO | M_WAITOK);
nfileflags = (char *)&ntable[nnfiles];
if (NDSLOTS(nnfiles) > NDSLOTS(onfiles))
nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE,
M_FILEDESC, M_ZERO | M_WAITOK);
else
nmap = NULL;
FILEDESC_XLOCK(fdp);
/*
* We now have new tables ready to go. Since we dropped the
* filedesc lock to call malloc(), watch out for a race.
*/
onfiles = fdp->fd_nfiles;
if (onfiles >= nnfiles) {
/* we lost the race, but that's OK */
free(ntable, M_FILEDESC);
if (nmap != NULL)
free(nmap, M_FILEDESC);
return;
}
bcopy(fdp->fd_ofiles, ntable, onfiles * sizeof(*ntable));
bcopy(fdp->fd_ofileflags, nfileflags, onfiles);
otable = fdp->fd_ofiles;
fdp->fd_ofileflags = nfileflags;
fdp->fd_ofiles = ntable;
/*
* We must preserve ofiles until the process exits because we can't
* be certain that no threads have references to the old table via
* _fget().
*/
if (onfiles > NDFILE) {
fo = (struct freetable *)&otable[onfiles];
fdp0 = (struct filedesc0 *)fdp;
fo->ft_table = otable;
SLIST_INSERT_HEAD(&fdp0->fd_free, fo, ft_next);
}
if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
bcopy(fdp->fd_map, nmap, NDSLOTS(onfiles) * sizeof(*nmap));
if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))
free(fdp->fd_map, M_FILEDESC);
fdp->fd_map = nmap;
}
fdp->fd_nfiles = nnfiles;
}
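/*
 * Worked example of the round-up above, assuming a 64-bit NDSLOTTYPE (so
 * NDENTRIES == 64): a request for nfd == 65 gives NDSLOTS(65) == 2 and
 * hence nnfiles == 128, i.e. the table always grows to a whole number of
 * bitmap slots.
 */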
/*
* Allocate a file descriptor for the process.
*/
int
fdalloc(struct thread *td, int minfd, int *result)
{
struct proc *p = td->td_proc;
struct filedesc *fdp = p->p_fd;
int fd = -1, maxfd;
#ifdef RACCT
int error;
#endif
FILEDESC_XLOCK_ASSERT(fdp);
if (fdp->fd_freefile > minfd)
minfd = fdp->fd_freefile;
PROC_LOCK(p);
maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
PROC_UNLOCK(p);
/*
* Search the bitmap for a free descriptor. If none is found, try
* to grow the file table. Keep at it until we either get a file
* descriptor or run into process or system limits; fdgrowtable()
* may drop the filedesc lock, so we're in a race.
*/
for (;;) {
fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
if (fd >= maxfd)
return (EMFILE);
if (fd < fdp->fd_nfiles)
break;
#ifdef RACCT
PROC_LOCK(p);
error = racct_set(p, RACCT_NOFILE, min(fdp->fd_nfiles * 2, maxfd));
PROC_UNLOCK(p);
if (error != 0)
return (EMFILE);
#endif
fdgrowtable(fdp, min(fdp->fd_nfiles * 2, maxfd));
}
/*
* Perform some sanity checks, then mark the file descriptor as
* used and return it to the caller.
*/
KASSERT(!fdisused(fdp, fd),
("fd_first_free() returned non-free descriptor"));
KASSERT(fdp->fd_ofiles[fd] == NULL,
("free descriptor isn't"));
fdp->fd_ofileflags[fd] = 0; /* XXX needed? */
fdused(fdp, fd);
*result = fd;
return (0);
}
/*
* Check to see whether n user file descriptors are available to the process
* p.
*/
int
fdavail(struct thread *td, int n)
{
struct proc *p = td->td_proc;
struct filedesc *fdp = td->td_proc->p_fd;
struct file **fpp;
int i, lim, last;
FILEDESC_LOCK_ASSERT(fdp);
/*
* XXX: This is only called from uipc_usrreq.c:unp_externalize();
* call racct_add() from there instead of dealing with containers
* here.
*/
PROC_LOCK(p);
lim = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
PROC_UNLOCK(p);
if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0)
return (1);
last = min(fdp->fd_nfiles, lim);
fpp = &fdp->fd_ofiles[fdp->fd_freefile];
for (i = last - fdp->fd_freefile; --i >= 0; fpp++) {
if (*fpp == NULL && --n <= 0)
return (1);
}
return (0);
}
/*
* Create a new open file structure and allocate a file descriptor for the
* process that refers to it. We add one reference to the file for the
* descriptor table and one reference for resultfp. This is to prevent us
* from being preempted and having the entry in the descriptor table closed
* after we release the FILEDESC lock.
*/
int
falloc(struct thread *td, struct file **resultfp, int *resultfd, int flags)
{
struct file *fp;
int error, fd;
error = falloc_noinstall(td, &fp);
if (error)
return (error); /* no reference held on error */
error = finstall(td, fp, &fd, flags);
if (error) {
fdrop(fp, td); /* one reference (fp only) */
return (error);
}
if (resultfp != NULL)
*resultfp = fp; /* copy out result */
else
fdrop(fp, td); /* release local reference */
if (resultfd != NULL)
*resultfd = fd;
return (0);
}
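/*
 * A minimal sketch of the usual falloc() consumer pattern; the flags,
 * fileops and data below are illustrative, not taken from any particular
 * subsystem:
 *
 *     struct file *fp;
 *     int error, fd;
 *
 *     error = falloc(td, &fp, &fd, 0);
 *     if (error != 0)
 *         return (error);
 *     finit(fp, FREAD | FWRITE, DTYPE_PIPE, pipe_data, &pipeops);
 *     td->td_retval[0] = fd;
 *     fdrop(fp, td);
 *
 * The final fdrop() releases the local reference; the descriptor table
 * keeps its own reference until the descriptor is closed.
 */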
/*
* Create a new open file structure without allocating a file descriptor.
*/
int
falloc_noinstall(struct thread *td, struct file **resultfp)
{
struct file *fp;
int maxuserfiles = maxfiles - (maxfiles / 20);
static struct timeval lastfail;
static int curfail;
KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__));
if ((openfiles >= maxuserfiles &&
priv_check(td, PRIV_MAXFILES) != 0) ||
openfiles >= maxfiles) {
if (ppsratecheck(&lastfail, &curfail, 1)) {
printf("kern.maxfiles limit exceeded by uid %i, "
"please see tuning(7).\n", td->td_ucred->cr_ruid);
}
return (ENFILE);
}
atomic_add_int(&openfiles, 1);
fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO);
refcount_init(&fp->f_count, 1);
fp->f_cred = crhold(td->td_ucred);
fp->f_ops = &badfileops;
fp->f_data = NULL;
fp->f_vnode = NULL;
*resultfp = fp;
return (0);
}
/*
* Install a file in a file descriptor table.
*/
int
finstall(struct thread *td, struct file *fp, int *fd, int flags)
{
struct filedesc *fdp = td->td_proc->p_fd;
int error;
KASSERT(fd != NULL, ("%s: fd == NULL", __func__));
KASSERT(fp != NULL, ("%s: fp == NULL", __func__));
FILEDESC_XLOCK(fdp);
if ((error = fdalloc(td, 0, fd))) {
FILEDESC_XUNLOCK(fdp);
return (error);
}
fhold(fp);
fdp->fd_ofiles[*fd] = fp;
if ((flags & O_CLOEXEC) != 0)
fdp->fd_ofileflags[*fd] |= UF_EXCLOSE;
FILEDESC_XUNLOCK(fdp);
return (0);
}
/*
* Build a new filedesc structure from another.
* Copy the current, root, and jail root vnode references.
*/
struct filedesc *
fdinit(struct filedesc *fdp)
{
struct filedesc0 *newfdp;
newfdp = malloc(sizeof *newfdp, M_FILEDESC, M_WAITOK | M_ZERO);
FILEDESC_LOCK_INIT(&newfdp->fd_fd);
if (fdp != NULL) {
FILEDESC_XLOCK(fdp);
newfdp->fd_fd.fd_cdir = fdp->fd_cdir;
if (newfdp->fd_fd.fd_cdir)
VREF(newfdp->fd_fd.fd_cdir);
newfdp->fd_fd.fd_rdir = fdp->fd_rdir;
if (newfdp->fd_fd.fd_rdir)
VREF(newfdp->fd_fd.fd_rdir);
newfdp->fd_fd.fd_jdir = fdp->fd_jdir;
if (newfdp->fd_fd.fd_jdir)
VREF(newfdp->fd_fd.fd_jdir);
FILEDESC_XUNLOCK(fdp);
}
/* Create the file descriptor table. */
newfdp->fd_fd.fd_refcnt = 1;
newfdp->fd_fd.fd_holdcnt = 1;
newfdp->fd_fd.fd_cmask = CMASK;
newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles;
newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags;
newfdp->fd_fd.fd_nfiles = NDFILE;
newfdp->fd_fd.fd_map = newfdp->fd_dmap;
newfdp->fd_fd.fd_lastfile = -1;
return (&newfdp->fd_fd);
}
static struct filedesc *
fdhold(struct proc *p)
{
struct filedesc *fdp;
mtx_lock(&fdesc_mtx);
fdp = p->p_fd;
if (fdp != NULL)
fdp->fd_holdcnt++;
mtx_unlock(&fdesc_mtx);
return (fdp);
}
static void
fddrop(struct filedesc *fdp)
{
struct filedesc0 *fdp0;
struct freetable *ft;
int i;
mtx_lock(&fdesc_mtx);
i = --fdp->fd_holdcnt;
mtx_unlock(&fdesc_mtx);
if (i > 0)
return;
FILEDESC_LOCK_DESTROY(fdp);
fdp0 = (struct filedesc0 *)fdp;
while ((ft = SLIST_FIRST(&fdp0->fd_free)) != NULL) {
SLIST_REMOVE_HEAD(&fdp0->fd_free, ft_next);
free(ft->ft_table, M_FILEDESC);
}
free(fdp, M_FILEDESC);
}
/*
* Share a filedesc structure.
*/
struct filedesc *
fdshare(struct filedesc *fdp)
{
FILEDESC_XLOCK(fdp);
fdp->fd_refcnt++;
FILEDESC_XUNLOCK(fdp);
return (fdp);
}
/*
* Unshare a filedesc structure, if necessary by making a copy
*/
void
fdunshare(struct proc *p, struct thread *td)
{
FILEDESC_XLOCK(p->p_fd);
if (p->p_fd->fd_refcnt > 1) {
struct filedesc *tmp;
FILEDESC_XUNLOCK(p->p_fd);
tmp = fdcopy(p->p_fd);
fdfree(td);
p->p_fd = tmp;
} else
FILEDESC_XUNLOCK(p->p_fd);
}
/*
* Copy a filedesc structure. A NULL pointer argument returns a NULL
* reference; this is to ease callers, not to catch errors.
*/
struct filedesc *
fdcopy(struct filedesc *fdp)
{
struct filedesc *newfdp;
int i;
/* Certain daemons might not have file descriptors. */
if (fdp == NULL)
return (NULL);
newfdp = fdinit(fdp);
FILEDESC_SLOCK(fdp);
while (fdp->fd_lastfile >= newfdp->fd_nfiles) {
FILEDESC_SUNLOCK(fdp);
FILEDESC_XLOCK(newfdp);
fdgrowtable(newfdp, fdp->fd_lastfile + 1);
FILEDESC_XUNLOCK(newfdp);
FILEDESC_SLOCK(fdp);
}
/* copy all passable descriptors (i.e. not kqueue) */
newfdp->fd_freefile = -1;
for (i = 0; i <= fdp->fd_lastfile; ++i) {
if (fdisused(fdp, i) &&
(fdp->fd_ofiles[i]->f_ops->fo_flags & DFLAG_PASSABLE) &&
fdp->fd_ofiles[i]->f_ops != &badfileops) {
newfdp->fd_ofiles[i] = fdp->fd_ofiles[i];
newfdp->fd_ofileflags[i] = fdp->fd_ofileflags[i];
fhold(newfdp->fd_ofiles[i]);
newfdp->fd_lastfile = i;
} else {
if (newfdp->fd_freefile == -1)
newfdp->fd_freefile = i;
}
}
newfdp->fd_cmask = fdp->fd_cmask;
FILEDESC_SUNLOCK(fdp);
FILEDESC_XLOCK(newfdp);
for (i = 0; i <= newfdp->fd_lastfile; ++i)
if (newfdp->fd_ofiles[i] != NULL)
fdused(newfdp, i);
if (newfdp->fd_freefile == -1)
newfdp->fd_freefile = i;
FILEDESC_XUNLOCK(newfdp);
return (newfdp);
}
/*
* Release a filedesc structure.
*/
void
fdfree(struct thread *td)
{
struct filedesc *fdp;
struct file **fpp;
int i, locked;
struct filedesc_to_leader *fdtol;
struct file *fp;
struct vnode *cdir, *jdir, *rdir, *vp;
struct flock lf;
/* Certain daemons might not have file descriptors. */
fdp = td->td_proc->p_fd;
if (fdp == NULL)
return;
#ifdef RACCT
PROC_LOCK(td->td_proc);
racct_set(td->td_proc, RACCT_NOFILE, 0);
PROC_UNLOCK(td->td_proc);
#endif
/* Check for special need to clear POSIX style locks */
fdtol = td->td_proc->p_fdtol;
if (fdtol != NULL) {
FILEDESC_XLOCK(fdp);
KASSERT(fdtol->fdl_refcount > 0,
("filedesc_to_refcount botch: fdl_refcount=%d",
fdtol->fdl_refcount));
if (fdtol->fdl_refcount == 1 &&
(td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
for (i = 0, fpp = fdp->fd_ofiles;
i <= fdp->fd_lastfile;
i++, fpp++) {
if (*fpp == NULL ||
(*fpp)->f_type != DTYPE_VNODE)
continue;
fp = *fpp;
fhold(fp);
FILEDESC_XUNLOCK(fdp);
lf.l_whence = SEEK_SET;
lf.l_start = 0;
lf.l_len = 0;
lf.l_type = F_UNLCK;
vp = fp->f_vnode;
locked = VFS_LOCK_GIANT(vp->v_mount);
(void) VOP_ADVLOCK(vp,
(caddr_t)td->td_proc->
p_leader,
F_UNLCK,
&lf,
F_POSIX);
VFS_UNLOCK_GIANT(locked);
FILEDESC_XLOCK(fdp);
fdrop(fp, td);
fpp = fdp->fd_ofiles + i;
}
}
retry:
if (fdtol->fdl_refcount == 1) {
if (fdp->fd_holdleaderscount > 0 &&
(td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
/*
* close() or do_dup() has cleared a reference
* in a shared file descriptor table.
*/
fdp->fd_holdleaderswakeup = 1;
sx_sleep(&fdp->fd_holdleaderscount,
FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0);
goto retry;
}
if (fdtol->fdl_holdcount > 0) {
/*
* Ensure that fdtol->fdl_leader remains
* valid in closef().
*/
fdtol->fdl_wakeup = 1;
sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK,
"fdlhold", 0);
goto retry;
}
}
fdtol->fdl_refcount--;
if (fdtol->fdl_refcount == 0 &&
fdtol->fdl_holdcount == 0) {
fdtol->fdl_next->fdl_prev = fdtol->fdl_prev;
fdtol->fdl_prev->fdl_next = fdtol->fdl_next;
} else
fdtol = NULL;
td->td_proc->p_fdtol = NULL;
FILEDESC_XUNLOCK(fdp);
if (fdtol != NULL)
free(fdtol, M_FILEDESC_TO_LEADER);
}
FILEDESC_XLOCK(fdp);
i = --fdp->fd_refcnt;
FILEDESC_XUNLOCK(fdp);
if (i > 0)
return;
fpp = fdp->fd_ofiles;
for (i = fdp->fd_lastfile; i-- >= 0; fpp++) {
if (*fpp) {
FILEDESC_XLOCK(fdp);
fp = *fpp;
*fpp = NULL;
FILEDESC_XUNLOCK(fdp);
(void) closef(fp, td);
}
}
FILEDESC_XLOCK(fdp);
/* XXX This should happen earlier. */
mtx_lock(&fdesc_mtx);
td->td_proc->p_fd = NULL;
mtx_unlock(&fdesc_mtx);
if (fdp->fd_nfiles > NDFILE)
free(fdp->fd_ofiles, M_FILEDESC);
if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE))
free(fdp->fd_map, M_FILEDESC);
fdp->fd_nfiles = 0;
cdir = fdp->fd_cdir;
fdp->fd_cdir = NULL;
rdir = fdp->fd_rdir;
fdp->fd_rdir = NULL;
jdir = fdp->fd_jdir;
fdp->fd_jdir = NULL;
FILEDESC_XUNLOCK(fdp);
if (cdir) {
locked = VFS_LOCK_GIANT(cdir->v_mount);
vrele(cdir);
VFS_UNLOCK_GIANT(locked);
}
if (rdir) {
locked = VFS_LOCK_GIANT(rdir->v_mount);
vrele(rdir);
VFS_UNLOCK_GIANT(locked);
}
if (jdir) {
locked = VFS_LOCK_GIANT(jdir->v_mount);
vrele(jdir);
VFS_UNLOCK_GIANT(locked);
}
fddrop(fdp);
}
/*
* For setugid programs, we don't want people to use that setugidness
* to generate error messages which write to a file which would otherwise
* be off-limits to the process. We check for filesystems where
* the vnode can change out from under us after execve (like [lin]procfs).
*
* Since setugidsafety calls this only for fd 0, 1 and 2, this check is
* sufficient. We also don't check for setugidness since we know we are.
*/
static int
is_unsafe(struct file *fp)
{
if (fp->f_type == DTYPE_VNODE) {
struct vnode *vp = fp->f_vnode;
if ((vp->v_vflag & VV_PROCDEP) != 0)
return (1);
}
return (0);
}
/*
* Make this setugid thing safe, if at all possible.
*/
void
setugidsafety(struct thread *td)
{
struct filedesc *fdp;
int i;
/* Certain daemons might not have file descriptors. */
fdp = td->td_proc->p_fd;
if (fdp == NULL)
return;
/*
* Note: fdp->fd_ofiles may be reallocated out from under us while
* we are blocked in a close. Be careful!
*/
FILEDESC_XLOCK(fdp);
for (i = 0; i <= fdp->fd_lastfile; i++) {
if (i > 2)
break;
if (fdp->fd_ofiles[i] && is_unsafe(fdp->fd_ofiles[i])) {
struct file *fp;
knote_fdclose(td, i);
/*
* NULL-out descriptor prior to close to avoid
* a race while close blocks.
*/
fp = fdp->fd_ofiles[i];
fdp->fd_ofiles[i] = NULL;
fdp->fd_ofileflags[i] = 0;
fdunused(fdp, i);
FILEDESC_XUNLOCK(fdp);
(void) closef(fp, td);
FILEDESC_XLOCK(fdp);
}
}
FILEDESC_XUNLOCK(fdp);
}
/*
* If a specific file object occupies a specific file descriptor, close the
* file descriptor entry and drop a reference on the file object. This is a
* convenience function to handle a subsequent error in a function that calls
* falloc() that handles the race that another thread might have closed the
* file descriptor out from under the thread creating the file object.
*/
void
fdclose(struct filedesc *fdp, struct file *fp, int idx, struct thread *td)
{
FILEDESC_XLOCK(fdp);
if (fdp->fd_ofiles[idx] == fp) {
fdp->fd_ofiles[idx] = NULL;
fdunused(fdp, idx);
FILEDESC_XUNLOCK(fdp);
fdrop(fp, td);
} else
FILEDESC_XUNLOCK(fdp);
}
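/*
 * A minimal sketch of the recovery pattern described above; some_setup()
 * stands in for whatever initialization might fail after falloc():
 *
 *     error = falloc(td, &fp, &fd, 0);
 *     if (error != 0)
 *         return (error);
 *     error = some_setup(fp);
 *     if (error != 0) {
 *         fdclose(fdp, fp, fd, td);
 *         fdrop(fp, td);
 *         return (error);
 *     }
 */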
/*
* Close any files on exec?
*/
void
fdcloseexec(struct thread *td)
{
struct filedesc *fdp;
int i;
/* Certain daemons might not have file descriptors. */
fdp = td->td_proc->p_fd;
if (fdp == NULL)
return;
FILEDESC_XLOCK(fdp);
/*
* We cannot cache fd_ofiles or fd_ofileflags since operations
* may block and rip them out from under us.
*/
for (i = 0; i <= fdp->fd_lastfile; i++) {
if (fdp->fd_ofiles[i] != NULL &&
(fdp->fd_ofiles[i]->f_type == DTYPE_MQUEUE ||
(fdp->fd_ofileflags[i] & UF_EXCLOSE))) {
struct file *fp;
knote_fdclose(td, i);
/*
* NULL-out descriptor prior to close to avoid
* a race while close blocks.
*/
fp = fdp->fd_ofiles[i];
fdp->fd_ofiles[i] = NULL;
fdp->fd_ofileflags[i] = 0;
fdunused(fdp, i);
if (fp->f_type == DTYPE_MQUEUE)
mq_fdclose(td, i, fp);
FILEDESC_XUNLOCK(fdp);
(void) closef(fp, td);
FILEDESC_XLOCK(fdp);
}
}
FILEDESC_XUNLOCK(fdp);
}
/*
* It is unsafe for set[ug]id processes to be started with file
* descriptors 0..2 closed, as these descriptors are given implicit
* significance in the Standard C library. fdcheckstd() will create a
* descriptor referencing /dev/null for each of stdin, stdout, and
* stderr that is not already open.
*/
int
fdcheckstd(struct thread *td)
{
struct filedesc *fdp;
register_t retval, save;
int i, error, devnull;
fdp = td->td_proc->p_fd;
if (fdp == NULL)
return (0);
KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
devnull = -1;
error = 0;
for (i = 0; i < 3; i++) {
if (fdp->fd_ofiles[i] != NULL)
continue;
if (devnull < 0) {
save = td->td_retval[0];
error = kern_open(td, "/dev/null", UIO_SYSSPACE,
O_RDWR, 0);
devnull = td->td_retval[0];
td->td_retval[0] = save;
if (error)
break;
KASSERT(devnull == i, ("oof, we didn't get our fd"));
} else {
error = do_dup(td, DUP_FIXED, devnull, i, &retval);
if (error != 0)
break;
}
}
return (error);
}
/*
* Internal form of close. Decrement reference count on file structure.
* Note: td may be NULL when closing a file that was being passed in a
* message.
*
* XXXRW: Giant is not required for the caller, but often will be held; this
* makes it moderately likely the Giant will be recursed in the VFS case.
*/
int
closef(struct file *fp, struct thread *td)
{
struct vnode *vp;
struct flock lf;
struct filedesc_to_leader *fdtol;
struct filedesc *fdp;
struct file *fp_object;
/*
* POSIX record locking dictates that any close releases ALL
* locks owned by this process. This is handled by setting
* a flag in the unlock to free ONLY locks obeying POSIX
* semantics, and not to free BSD-style file locks.
* If the descriptor was in a message, POSIX-style locks
* aren't passed with the descriptor, and the thread pointer
* will be NULL. Callers should be careful only to pass a
* NULL thread pointer when there really is no owning
* context that might have locks, or the locks will be
* leaked.
*
* If this is a capability, we do lock processing under the underlying
* node, not the capability itself.
*/
(void)cap_funwrap(fp, 0, &fp_object);
if ((fp_object->f_type == DTYPE_VNODE) && (td != NULL)) {
int vfslocked;
vp = fp_object->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
lf.l_whence = SEEK_SET;
lf.l_start = 0;
lf.l_len = 0;
lf.l_type = F_UNLCK;
(void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
F_UNLCK, &lf, F_POSIX);
}
fdtol = td->td_proc->p_fdtol;
if (fdtol != NULL) {
/*
* Handle special case where file descriptor table is
* shared between multiple process leaders.
*/
fdp = td->td_proc->p_fd;
FILEDESC_XLOCK(fdp);
for (fdtol = fdtol->fdl_next;
fdtol != td->td_proc->p_fdtol;
fdtol = fdtol->fdl_next) {
if ((fdtol->fdl_leader->p_flag &
P_ADVLOCK) == 0)
continue;
fdtol->fdl_holdcount++;
FILEDESC_XUNLOCK(fdp);
lf.l_whence = SEEK_SET;
lf.l_start = 0;
lf.l_len = 0;
lf.l_type = F_UNLCK;
vp = fp_object->f_vnode;
(void) VOP_ADVLOCK(vp,
(caddr_t)fdtol->fdl_leader,
F_UNLCK, &lf, F_POSIX);
FILEDESC_XLOCK(fdp);
fdtol->fdl_holdcount--;
if (fdtol->fdl_holdcount == 0 &&
fdtol->fdl_wakeup != 0) {
fdtol->fdl_wakeup = 0;
wakeup(fdtol);
}
}
FILEDESC_XUNLOCK(fdp);
}
VFS_UNLOCK_GIANT(vfslocked);
}
return (fdrop(fp, td));
}
/*
* Initialize the file pointer with the specified properties.
*
* The ops are set with release semantics to be certain that the flags, type,
* and data are visible when ops is. This is to prevent ops methods from being
* called with bad data.
*/
void
finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops)
{
fp->f_data = data;
fp->f_flag = flag;
fp->f_type = type;
atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops);
}
struct file *
fget_unlocked(struct filedesc *fdp, int fd)
{
struct file *fp;
u_int count;
if (fd < 0 || fd >= fdp->fd_nfiles)
return (NULL);
/*
* Fetch the descriptor locklessly. We avoid fdrop() races by never
* raising a refcount that has already dropped to 0. To accomplish this
* we have to use a cmpset loop rather than an atomic_add. The descriptor
* must be re-verified once we acquire a reference to be certain
* that the identity is still correct and we did not lose a race
* due to preemption.
*/
for (;;) {
fp = fdp->fd_ofiles[fd];
if (fp == NULL)
break;
count = fp->f_count;
if (count == 0)
continue;
/*
* Use an acquire barrier to prevent caching of fd_ofiles
* so it is refreshed for verification.
*/
if (atomic_cmpset_acq_int(&fp->f_count, count, count + 1) != 1)
continue;
if (fp == fdp->fd_ofiles[fd])
break;
fdrop(fp, curthread);
}
return (fp);
}
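/*
 * A minimal sketch of how a caller uses the lockless helper above; on
 * success the caller owns a reference and must release it with fdrop():
 *
 *     fp = fget_unlocked(fdp, fd);
 *     if (fp == NULL)
 *         return (EBADF);
 *     ... use fp ...
 *     fdrop(fp, td);
 */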
/*
* Extract the file pointer associated with the specified descriptor for the
* current user process.
*
* If the descriptor doesn't exist or doesn't match 'flags', EBADF is
* returned.
*
* If the FGET_GETCAP flag is set, the capability itself will be returned.
* Calling _fget() with FGET_GETCAP on a non-capability will return EINVAL.
* Otherwise, if the file is a capability, its rights will be checked against
* the capability rights mask, and if successful, the object will be unwrapped.
*
* If an error occurred, the non-zero error is returned and *fpp is set to
* NULL. Otherwise *fpp is held and set and zero is returned. Caller is
* responsible for fdrop().
*/
#define FGET_GETCAP 0x00000001
static __inline int
_fget(struct thread *td, int fd, struct file **fpp, int flags,
cap_rights_t needrights, cap_rights_t *haverightsp, u_char *maxprotp,
int fget_flags)
{
struct filedesc *fdp;
struct file *fp;
#ifdef CAPABILITIES
struct file *fp_fromcap;
int error;
#endif
*fpp = NULL;
if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
return (EBADF);
if ((fp = fget_unlocked(fdp, fd)) == NULL)
return (EBADF);
if (fp->f_ops == &badfileops) {
fdrop(fp, td);
return (EBADF);
}
#ifdef CAPABILITIES
/*
* If this is a capability, what rights does it have?
*/
if (haverightsp != NULL) {
if (fp->f_type == DTYPE_CAPABILITY)
*haverightsp = cap_rights(fp);
else
*haverightsp = CAP_MASK_VALID;
}
/*
* If a capability has been requested, return the capability directly.
* Otherwise, check capability rights, extract the underlying object,
* and check its access flags.
*/
if (fget_flags & FGET_GETCAP) {
if (fp->f_type != DTYPE_CAPABILITY) {
fdrop(fp, td);
return (EINVAL);
}
} else {
if (maxprotp == NULL)
error = cap_funwrap(fp, needrights, &fp_fromcap);
else
error = cap_funwrap_mmap(fp, needrights, maxprotp,
&fp_fromcap);
if (error) {
fdrop(fp, td);
return (error);
}
/*
* If we've unwrapped a file, drop the original capability
* and hold the new descriptor. fp after this point refers to
* the actual (unwrapped) object, not the capability.
*/
if (fp != fp_fromcap) {
fhold(fp_fromcap);
fdrop(fp, td);
fp = fp_fromcap;
}
}
#else /* !CAPABILITIES */
KASSERT(fp->f_type != DTYPE_CAPABILITY,
("%s: saw capability", __func__));
if (maxprotp != NULL)
*maxprotp = VM_PROT_ALL;
#endif /* CAPABILITIES */
/*
* FREAD and FWRITE failure return EBADF as per POSIX.
*
* Only one flag, or 0, may be specified.
*/
if ((flags == FREAD && (fp->f_flag & FREAD) == 0) ||
(flags == FWRITE && (fp->f_flag & FWRITE) == 0)) {
fdrop(fp, td);
return (EBADF);
}
*fpp = fp;
return (0);
}
int
fget(struct thread *td, int fd, cap_rights_t rights, struct file **fpp)
{
return(_fget(td, fd, fpp, 0, rights, NULL, NULL, 0));
}
int
fget_mmap(struct thread *td, int fd, cap_rights_t rights, u_char *maxprotp,
struct file **fpp)
{
return (_fget(td, fd, fpp, 0, rights, NULL, maxprotp, 0));
}
int
fget_read(struct thread *td, int fd, cap_rights_t rights, struct file **fpp)
{
return(_fget(td, fd, fpp, FREAD, rights, NULL, NULL, 0));
}
int
fget_write(struct thread *td, int fd, cap_rights_t rights, struct file **fpp)
{
return (_fget(td, fd, fpp, FWRITE, rights, NULL, NULL, 0));
}
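/*
 * Typical caller pattern for the wrappers above, as a minimal sketch
 * (CAP_READ is illustrative; callers pass whatever capability rights the
 * operation actually requires):
 *
 *     error = fget_read(td, fd, CAP_READ, &fp);
 *     if (error != 0)
 *         return (error);
 *     ... operate on fp ...
 *     fdrop(fp, td);
 */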
/*
* Unlike the other fget() calls, which accept and check capability rights
* but never return capabilities, fgetcap() returns the capability but doesn't
* check capability rights.
*/
int
fgetcap(struct thread *td, int fd, struct file **fpp)
{
return (_fget(td, fd, fpp, 0, 0, NULL, NULL, FGET_GETCAP));
}
/*
* Like fget() but loads the underlying vnode, or returns an error if the
* descriptor does not represent a vnode. Note that pipes use vnodes but
* never have VM objects. The returned vnode will be vref()'d.
*
* XXX: what about the unused flags ?
*/
static __inline int
_fgetvp(struct thread *td, int fd, int flags, cap_rights_t needrights,
cap_rights_t *haverightsp, struct vnode **vpp)
{
struct file *fp;
int error;
*vpp = NULL;
if ((error = _fget(td, fd, &fp, flags, needrights, haverightsp,
NULL, 0)) != 0)
return (error);
if (fp->f_vnode == NULL) {
error = EINVAL;
} else {
*vpp = fp->f_vnode;
vref(*vpp);
}
fdrop(fp, td);
return (error);
}
int
fgetvp(struct thread *td, int fd, cap_rights_t rights, struct vnode **vpp)
{
return (_fgetvp(td, fd, 0, rights, NULL, vpp));
}
int
fgetvp_rights(struct thread *td, int fd, cap_rights_t need, cap_rights_t *have,
struct vnode **vpp)
{
return (_fgetvp(td, fd, 0, need, have, vpp));
}
int
fgetvp_read(struct thread *td, int fd, cap_rights_t rights, struct vnode **vpp)
{
return (_fgetvp(td, fd, FREAD, rights, NULL, vpp));
}
#ifdef notyet
int
fgetvp_write(struct thread *td, int fd, cap_rights_t rights,
struct vnode **vpp)
{
return (_fgetvp(td, fd, FWRITE, rights, NULL, vpp));
}
#endif
/*
* Like fget() but loads the underlying socket, or returns an error if the
* descriptor does not represent a socket.
*
* We bump the ref count on the returned socket. XXX Also obtain the SX lock
* in the future.
*
* Note: fgetsock() and fputsock() are deprecated, as consumers should rely
* on their file descriptor reference to prevent the socket from being free'd
* during use.
*/
int
fgetsock(struct thread *td, int fd, cap_rights_t rights, struct socket **spp,
u_int *fflagp)
{
struct file *fp;
int error;
*spp = NULL;
if (fflagp != NULL)
*fflagp = 0;
if ((error = _fget(td, fd, &fp, 0, rights, NULL, NULL, 0)) != 0)
return (error);
if (fp->f_type != DTYPE_SOCKET) {
error = ENOTSOCK;
} else {
*spp = fp->f_data;
if (fflagp)
*fflagp = fp->f_flag;
SOCK_LOCK(*spp);
soref(*spp);
SOCK_UNLOCK(*spp);
}
fdrop(fp, td);
return (error);
}
/*
* Drop the reference count on the socket and XXX release the SX lock in the
* future. The last reference closes the socket.
*
* Note: fputsock() is deprecated, see comment for fgetsock().
*/
void
fputsock(struct socket *so)
{
ACCEPT_LOCK();
SOCK_LOCK(so);
CURVNET_SET(so->so_vnet);
sorele(so);
CURVNET_RESTORE();
}
/*
* Handle the last reference to a file being closed.
*
* No special capability handling here, as the capability's fo_close will run
* instead of the object here, and perform any necessary drop on the object.
*/
int
_fdrop(struct file *fp, struct thread *td)
{
int error;
error = 0;
if (fp->f_count != 0)
panic("fdrop: count %d", fp->f_count);
if (fp->f_ops != &badfileops)
error = fo_close(fp, td);
/*
* The f_cdevpriv cannot be assigned non-NULL value while we
* are destroying the file.
*/
if (fp->f_cdevpriv != NULL)
devfs_fpdrop(fp);
atomic_subtract_int(&openfiles, 1);
crfree(fp->f_cred);
uma_zfree(file_zone, fp);
return (error);
}
/*
* Apply an advisory lock on a file descriptor.
*
* Just attempt to get a record lock of the requested type on the entire file
* (l_whence = SEEK_SET, l_start = 0, l_len = 0).
*/
#ifndef _SYS_SYSPROTO_H_
struct flock_args {
int fd;
int how;
};
#endif
/* ARGSUSED */
int
-flock(struct thread *td, struct flock_args *uap)
+sys_flock(struct thread *td, struct flock_args *uap)
{
struct file *fp;
struct vnode *vp;
struct flock lf;
int vfslocked;
int error;
if ((error = fget(td, uap->fd, CAP_FLOCK, &fp)) != 0)
return (error);
if (fp->f_type != DTYPE_VNODE) {
fdrop(fp, td);
return (EOPNOTSUPP);
}
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
lf.l_whence = SEEK_SET;
lf.l_start = 0;
lf.l_len = 0;
if (uap->how & LOCK_UN) {
lf.l_type = F_UNLCK;
atomic_clear_int(&fp->f_flag, FHASLOCK);
error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
goto done2;
}
if (uap->how & LOCK_EX)
lf.l_type = F_WRLCK;
else if (uap->how & LOCK_SH)
lf.l_type = F_RDLCK;
else {
error = EBADF;
goto done2;
}
atomic_set_int(&fp->f_flag, FHASLOCK);
error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
(uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
done2:
fdrop(fp, td);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
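/*
 * Illustrative userland usage, as a minimal sketch:
 *
 *     if (flock(fd, LOCK_EX | LOCK_NB) == -1)
 *         err(1, "flock");
 *     ... critical section ...
 *     flock(fd, LOCK_UN);
 *
 * Note that LOCK_UN is tested first above, so combining it with LOCK_EX
 * or LOCK_SH simply unlocks.
 */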
/*
* Duplicate the specified descriptor to a free descriptor.
*/
int
dupfdopen(struct thread *td, struct filedesc *fdp, int indx, int dfd, int mode, int error)
{
struct file *wfp;
struct file *fp;
/*
* If the to-be-dup'd fd number is greater than the allowed number
* of file descriptors, or the fd to be dup'd has already been
* closed, then reject.
*/
FILEDESC_XLOCK(fdp);
if (dfd < 0 || dfd >= fdp->fd_nfiles ||
(wfp = fdp->fd_ofiles[dfd]) == NULL) {
FILEDESC_XUNLOCK(fdp);
return (EBADF);
}
/*
* There are two cases of interest here.
*
* For ENODEV simply dup (dfd) to file descriptor (indx) and return.
*
* For ENXIO steal away the file structure from (dfd) and store it in
* (indx). (dfd) is effectively closed by this operation.
*
* Any other error code is just returned.
*/
switch (error) {
case ENODEV:
/*
* Check that the mode the file is being opened for is a
* subset of the mode of the existing descriptor.
*/
if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) {
FILEDESC_XUNLOCK(fdp);
return (EACCES);
}
fp = fdp->fd_ofiles[indx];
fdp->fd_ofiles[indx] = wfp;
fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
if (fp == NULL)
fdused(fdp, indx);
fhold(wfp);
FILEDESC_XUNLOCK(fdp);
if (fp != NULL)
/*
* We now own the reference to fp that the ofiles[]
* array used to own. Release it.
*/
fdrop(fp, td);
return (0);
case ENXIO:
/*
* Steal away the file pointer from dfd and stuff it into indx.
*/
fp = fdp->fd_ofiles[indx];
fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
fdp->fd_ofiles[dfd] = NULL;
fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
fdp->fd_ofileflags[dfd] = 0;
fdunused(fdp, dfd);
if (fp == NULL)
fdused(fdp, indx);
FILEDESC_XUNLOCK(fdp);
/*
* We now own the reference to fp that the ofiles[] array
* used to own. Release it.
*/
if (fp != NULL)
fdrop(fp, td);
return (0);
default:
FILEDESC_XUNLOCK(fdp);
return (error);
}
/* NOTREACHED */
}
/*
* Scan all active processes and prisons to see if any of them have a current
* or root directory of `olddp'. If so, replace them with the new mount point.
*/
void
mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
{
struct filedesc *fdp;
struct prison *pr;
struct proc *p;
int nrele;
if (vrefcnt(olddp) == 1)
return;
nrele = 0;
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
fdp = fdhold(p);
if (fdp == NULL)
continue;
FILEDESC_XLOCK(fdp);
if (fdp->fd_cdir == olddp) {
vref(newdp);
fdp->fd_cdir = newdp;
nrele++;
}
if (fdp->fd_rdir == olddp) {
vref(newdp);
fdp->fd_rdir = newdp;
nrele++;
}
if (fdp->fd_jdir == olddp) {
vref(newdp);
fdp->fd_jdir = newdp;
nrele++;
}
FILEDESC_XUNLOCK(fdp);
fddrop(fdp);
}
sx_sunlock(&allproc_lock);
if (rootvnode == olddp) {
vref(newdp);
rootvnode = newdp;
nrele++;
}
mtx_lock(&prison0.pr_mtx);
if (prison0.pr_root == olddp) {
vref(newdp);
prison0.pr_root = newdp;
nrele++;
}
mtx_unlock(&prison0.pr_mtx);
sx_slock(&allprison_lock);
TAILQ_FOREACH(pr, &allprison, pr_list) {
mtx_lock(&pr->pr_mtx);
if (pr->pr_root == olddp) {
vref(newdp);
pr->pr_root = newdp;
nrele++;
}
mtx_unlock(&pr->pr_mtx);
}
sx_sunlock(&allprison_lock);
while (nrele--)
vrele(olddp);
}
struct filedesc_to_leader *
filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader)
{
struct filedesc_to_leader *fdtol;
fdtol = malloc(sizeof(struct filedesc_to_leader),
M_FILEDESC_TO_LEADER,
M_WAITOK);
fdtol->fdl_refcount = 1;
fdtol->fdl_holdcount = 0;
fdtol->fdl_wakeup = 0;
fdtol->fdl_leader = leader;
if (old != NULL) {
FILEDESC_XLOCK(fdp);
fdtol->fdl_next = old->fdl_next;
fdtol->fdl_prev = old;
old->fdl_next = fdtol;
fdtol->fdl_next->fdl_prev = fdtol;
FILEDESC_XUNLOCK(fdp);
} else {
fdtol->fdl_next = fdtol;
fdtol->fdl_prev = fdtol;
}
return (fdtol);
}
/*
* Get file structures globally.
*/
static int
sysctl_kern_file(SYSCTL_HANDLER_ARGS)
{
struct xfile xf;
struct filedesc *fdp;
struct file *fp;
struct proc *p;
int error, n;
error = sysctl_wire_old_buffer(req, 0);
if (error != 0)
return (error);
if (req->oldptr == NULL) {
n = 0;
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
if (p->p_state == PRS_NEW)
continue;
fdp = fdhold(p);
if (fdp == NULL)
continue;
/* overestimates sparse tables. */
if (fdp->fd_lastfile > 0)
n += fdp->fd_lastfile;
fddrop(fdp);
}
sx_sunlock(&allproc_lock);
return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
}
error = 0;
bzero(&xf, sizeof(xf));
xf.xf_size = sizeof(xf);
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_state == PRS_NEW) {
PROC_UNLOCK(p);
continue;
}
if (p_cansee(req->td, p) != 0) {
PROC_UNLOCK(p);
continue;
}
xf.xf_pid = p->p_pid;
xf.xf_uid = p->p_ucred->cr_uid;
PROC_UNLOCK(p);
fdp = fdhold(p);
if (fdp == NULL)
continue;
FILEDESC_SLOCK(fdp);
for (n = 0; fdp->fd_refcnt > 0 && n < fdp->fd_nfiles; ++n) {
if ((fp = fdp->fd_ofiles[n]) == NULL)
continue;
xf.xf_fd = n;
xf.xf_file = fp;
xf.xf_data = fp->f_data;
xf.xf_vnode = fp->f_vnode;
xf.xf_type = fp->f_type;
xf.xf_count = fp->f_count;
xf.xf_msgcount = 0;
xf.xf_offset = fp->f_offset;
xf.xf_flag = fp->f_flag;
error = SYSCTL_OUT(req, &xf, sizeof(xf));
if (error)
break;
}
FILEDESC_SUNLOCK(fdp);
fddrop(fdp);
if (error)
break;
}
sx_sunlock(&allproc_lock);
return (error);
}
SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD,
0, 0, sysctl_kern_file, "S,xfile", "Entire file table");
#ifdef KINFO_OFILE_SIZE
CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE);
#endif
#ifdef COMPAT_FREEBSD7
static int
export_vnode_for_osysctl(struct vnode *vp, int type,
struct kinfo_ofile *kif, struct filedesc *fdp, struct sysctl_req *req)
{
int error;
char *fullpath, *freepath;
int vfslocked;
bzero(kif, sizeof(*kif));
kif->kf_structsize = sizeof(*kif);
vref(vp);
kif->kf_fd = type;
kif->kf_type = KF_TYPE_VNODE;
/* This function only handles directories. */
if (vp->v_type != VDIR) {
vrele(vp);
return (ENOTDIR);
}
kif->kf_vnode_type = KF_VTYPE_VDIR;
/*
* This is not a true file descriptor, so we set a bogus refcount
* and offset to indicate these fields should be ignored.
*/
kif->kf_ref_count = -1;
kif->kf_offset = -1;
freepath = NULL;
fullpath = "-";
FILEDESC_SUNLOCK(fdp);
vn_fullpath(curthread, vp, &fullpath, &freepath);
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
if (freepath != NULL)
free(freepath, M_TEMP);
error = SYSCTL_OUT(req, kif, sizeof(*kif));
FILEDESC_SLOCK(fdp);
return (error);
}
/*
* Get per-process file descriptors for use by procstat(1), et al.
*/
static int
sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS)
{
char *fullpath, *freepath;
struct kinfo_ofile *kif;
struct filedesc *fdp;
int error, i, *name;
struct socket *so;
struct vnode *vp;
struct file *fp;
struct proc *p;
struct tty *tp;
int vfslocked;
name = (int *)arg1;
if ((p = pfind((pid_t)name[0])) == NULL)
return (ESRCH);
if ((error = p_candebug(curthread, p))) {
PROC_UNLOCK(p);
return (error);
}
fdp = fdhold(p);
PROC_UNLOCK(p);
if (fdp == NULL)
return (ENOENT);
kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK);
FILEDESC_SLOCK(fdp);
if (fdp->fd_cdir != NULL)
export_vnode_for_osysctl(fdp->fd_cdir, KF_FD_TYPE_CWD, kif,
fdp, req);
if (fdp->fd_rdir != NULL)
export_vnode_for_osysctl(fdp->fd_rdir, KF_FD_TYPE_ROOT, kif,
fdp, req);
if (fdp->fd_jdir != NULL)
export_vnode_for_osysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif,
fdp, req);
for (i = 0; i < fdp->fd_nfiles; i++) {
if ((fp = fdp->fd_ofiles[i]) == NULL)
continue;
bzero(kif, sizeof(*kif));
kif->kf_structsize = sizeof(*kif);
vp = NULL;
so = NULL;
tp = NULL;
kif->kf_fd = i;
#ifdef CAPABILITIES
/*
* When reporting a capability, most fields will be from the
* underlying object, but do mark as a capability. With
* ofiledesc, we don't have a field to export the cap_rights_t,
* but we do with the new filedesc.
*/
if (fp->f_type == DTYPE_CAPABILITY) {
kif->kf_flags |= KF_FLAG_CAPABILITY;
(void)cap_funwrap(fp, 0, &fp);
}
#else
KASSERT(fp->f_type != DTYPE_CAPABILITY,
("sysctl_kern_proc_ofiledesc: saw capability"));
#endif
switch (fp->f_type) {
case DTYPE_VNODE:
kif->kf_type = KF_TYPE_VNODE;
vp = fp->f_vnode;
break;
case DTYPE_SOCKET:
kif->kf_type = KF_TYPE_SOCKET;
so = fp->f_data;
break;
case DTYPE_PIPE:
kif->kf_type = KF_TYPE_PIPE;
break;
case DTYPE_FIFO:
kif->kf_type = KF_TYPE_FIFO;
vp = fp->f_vnode;
break;
case DTYPE_KQUEUE:
kif->kf_type = KF_TYPE_KQUEUE;
break;
case DTYPE_CRYPTO:
kif->kf_type = KF_TYPE_CRYPTO;
break;
case DTYPE_MQUEUE:
kif->kf_type = KF_TYPE_MQUEUE;
break;
case DTYPE_SHM:
kif->kf_type = KF_TYPE_SHM;
break;
case DTYPE_SEM:
kif->kf_type = KF_TYPE_SEM;
break;
case DTYPE_PTS:
kif->kf_type = KF_TYPE_PTS;
tp = fp->f_data;
break;
#ifdef PROCDESC
case DTYPE_PROCDESC:
kif->kf_type = KF_TYPE_PROCDESC;
break;
#endif
default:
kif->kf_type = KF_TYPE_UNKNOWN;
break;
}
kif->kf_ref_count = fp->f_count;
if (fp->f_flag & FREAD)
kif->kf_flags |= KF_FLAG_READ;
if (fp->f_flag & FWRITE)
kif->kf_flags |= KF_FLAG_WRITE;
if (fp->f_flag & FAPPEND)
kif->kf_flags |= KF_FLAG_APPEND;
if (fp->f_flag & FASYNC)
kif->kf_flags |= KF_FLAG_ASYNC;
if (fp->f_flag & FFSYNC)
kif->kf_flags |= KF_FLAG_FSYNC;
if (fp->f_flag & FNONBLOCK)
kif->kf_flags |= KF_FLAG_NONBLOCK;
if (fp->f_flag & O_DIRECT)
kif->kf_flags |= KF_FLAG_DIRECT;
if (fp->f_flag & FHASLOCK)
kif->kf_flags |= KF_FLAG_HASLOCK;
kif->kf_offset = fp->f_offset;
if (vp != NULL) {
vref(vp);
switch (vp->v_type) {
case VNON:
kif->kf_vnode_type = KF_VTYPE_VNON;
break;
case VREG:
kif->kf_vnode_type = KF_VTYPE_VREG;
break;
case VDIR:
kif->kf_vnode_type = KF_VTYPE_VDIR;
break;
case VBLK:
kif->kf_vnode_type = KF_VTYPE_VBLK;
break;
case VCHR:
kif->kf_vnode_type = KF_VTYPE_VCHR;
break;
case VLNK:
kif->kf_vnode_type = KF_VTYPE_VLNK;
break;
case VSOCK:
kif->kf_vnode_type = KF_VTYPE_VSOCK;
break;
case VFIFO:
kif->kf_vnode_type = KF_VTYPE_VFIFO;
break;
case VBAD:
kif->kf_vnode_type = KF_VTYPE_VBAD;
break;
default:
kif->kf_vnode_type = KF_VTYPE_UNKNOWN;
break;
}
/*
* It is OK to drop the filedesc lock here as we will
* re-validate and re-evaluate its properties when
* the loop continues.
*/
freepath = NULL;
fullpath = "-";
FILEDESC_SUNLOCK(fdp);
vn_fullpath(curthread, vp, &fullpath, &freepath);
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
strlcpy(kif->kf_path, fullpath,
sizeof(kif->kf_path));
if (freepath != NULL)
free(freepath, M_TEMP);
FILEDESC_SLOCK(fdp);
}
if (so != NULL) {
struct sockaddr *sa;
if (so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa)
== 0 && sa->sa_len <= sizeof(kif->kf_sa_local)) {
bcopy(sa, &kif->kf_sa_local, sa->sa_len);
free(sa, M_SONAME);
}
if (so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa)
== 0 && sa->sa_len <= sizeof(kif->kf_sa_peer)) {
bcopy(sa, &kif->kf_sa_peer, sa->sa_len);
free(sa, M_SONAME);
}
kif->kf_sock_domain =
so->so_proto->pr_domain->dom_family;
kif->kf_sock_type = so->so_type;
kif->kf_sock_protocol = so->so_proto->pr_protocol;
}
if (tp != NULL) {
strlcpy(kif->kf_path, tty_devname(tp),
sizeof(kif->kf_path));
}
error = SYSCTL_OUT(req, kif, sizeof(*kif));
if (error)
break;
}
FILEDESC_SUNLOCK(fdp);
fddrop(fdp);
free(kif, M_TEMP);
return (0);
}
static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc, CTLFLAG_RD,
sysctl_kern_proc_ofiledesc, "Process ofiledesc entries");
#endif /* COMPAT_FREEBSD7 */
#ifdef KINFO_FILE_SIZE
CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE);
#endif
static int
export_fd_for_sysctl(void *data, int type, int fd, int fflags, int refcnt,
int64_t offset, struct kinfo_file *kif, struct sysctl_req *req)
{
struct {
int fflag;
int kf_fflag;
} fflags_table[] = {
{ FAPPEND, KF_FLAG_APPEND },
{ FASYNC, KF_FLAG_ASYNC },
{ FFSYNC, KF_FLAG_FSYNC },
{ FHASLOCK, KF_FLAG_HASLOCK },
{ FNONBLOCK, KF_FLAG_NONBLOCK },
{ FREAD, KF_FLAG_READ },
{ FWRITE, KF_FLAG_WRITE },
{ O_CREAT, KF_FLAG_CREAT },
{ O_DIRECT, KF_FLAG_DIRECT },
{ O_EXCL, KF_FLAG_EXCL },
{ O_EXEC, KF_FLAG_EXEC },
{ O_EXLOCK, KF_FLAG_EXLOCK },
{ O_NOFOLLOW, KF_FLAG_NOFOLLOW },
{ O_SHLOCK, KF_FLAG_SHLOCK },
{ O_TRUNC, KF_FLAG_TRUNC }
};
#define NFFLAGS (sizeof(fflags_table) / sizeof(*fflags_table))
struct vnode *vp;
int error, vfslocked;
unsigned int i;
bzero(kif, sizeof(*kif));
switch (type) {
case KF_TYPE_FIFO:
case KF_TYPE_VNODE:
vp = (struct vnode *)data;
error = fill_vnode_info(vp, kif);
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
break;
case KF_TYPE_SOCKET:
error = fill_socket_info((struct socket *)data, kif);
break;
case KF_TYPE_PIPE:
error = fill_pipe_info((struct pipe *)data, kif);
break;
case KF_TYPE_PTS:
error = fill_pts_info((struct tty *)data, kif);
break;
case KF_TYPE_PROCDESC:
error = fill_procdesc_info((struct procdesc *)data, kif);
break;
default:
error = 0;
}
if (error == 0)
kif->kf_status |= KF_ATTR_VALID;
/*
* Translate file access flags.
*/
for (i = 0; i < NFFLAGS; i++)
if (fflags & fflags_table[i].fflag)
kif->kf_flags |= fflags_table[i].kf_fflag;
kif->kf_fd = fd;
kif->kf_type = type;
kif->kf_ref_count = refcnt;
kif->kf_offset = offset;
/* Pack record size down */
kif->kf_structsize = offsetof(struct kinfo_file, kf_path) +
strlen(kif->kf_path) + 1;
kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t));
error = SYSCTL_OUT(req, kif, kif->kf_structsize);
return (error);
}
/*
* Get per-process file descriptors for use by procstat(1), et al.
*/
static int
sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)
{
struct file *fp;
struct filedesc *fdp;
struct kinfo_file *kif;
struct proc *p;
struct vnode *cttyvp, *textvp, *tracevp;
size_t oldidx;
int64_t offset;
void *data;
int error, i, *name;
int type, refcnt, fflags;
name = (int *)arg1;
if ((p = pfind((pid_t)name[0])) == NULL)
return (ESRCH);
if ((error = p_candebug(curthread, p))) {
PROC_UNLOCK(p);
return (error);
}
/* ktrace vnode */
tracevp = p->p_tracevp;
if (tracevp != NULL)
vref(tracevp);
/* text vnode */
textvp = p->p_textvp;
if (textvp != NULL)
vref(textvp);
/* Controlling tty. */
cttyvp = NULL;
if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) {
cttyvp = p->p_pgrp->pg_session->s_ttyvp;
if (cttyvp != NULL)
vref(cttyvp);
}
fdp = fdhold(p);
PROC_UNLOCK(p);
kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK);
if (tracevp != NULL)
export_fd_for_sysctl(tracevp, KF_TYPE_VNODE, KF_FD_TYPE_TRACE,
FREAD | FWRITE, -1, -1, kif, req);
if (textvp != NULL)
export_fd_for_sysctl(textvp, KF_TYPE_VNODE, KF_FD_TYPE_TEXT,
FREAD, -1, -1, kif, req);
if (cttyvp != NULL)
export_fd_for_sysctl(cttyvp, KF_TYPE_VNODE, KF_FD_TYPE_CTTY,
FREAD | FWRITE, -1, -1, kif, req);
if (fdp == NULL)
goto fail;
FILEDESC_SLOCK(fdp);
/* working directory */
if (fdp->fd_cdir != NULL) {
vref(fdp->fd_cdir);
data = fdp->fd_cdir;
FILEDESC_SUNLOCK(fdp);
export_fd_for_sysctl(data, KF_TYPE_VNODE, KF_FD_TYPE_CWD,
FREAD, -1, -1, kif, req);
FILEDESC_SLOCK(fdp);
}
/* root directory */
if (fdp->fd_rdir != NULL) {
vref(fdp->fd_rdir);
data = fdp->fd_rdir;
FILEDESC_SUNLOCK(fdp);
export_fd_for_sysctl(data, KF_TYPE_VNODE, KF_FD_TYPE_ROOT,
FREAD, -1, -1, kif, req);
FILEDESC_SLOCK(fdp);
}
/* jail directory */
if (fdp->fd_jdir != NULL) {
vref(fdp->fd_jdir);
data = fdp->fd_jdir;
FILEDESC_SUNLOCK(fdp);
export_fd_for_sysctl(data, KF_TYPE_VNODE, KF_FD_TYPE_JAIL,
FREAD, -1, -1, kif, req);
FILEDESC_SLOCK(fdp);
}
for (i = 0; i < fdp->fd_nfiles; i++) {
if ((fp = fdp->fd_ofiles[i]) == NULL)
continue;
data = NULL;
#ifdef CAPABILITIES
/*
* When reporting a capability, most fields will be from the
* underlying object, but do mark as a capability and export
* the capability rights mask.
*/
if (fp->f_type == DTYPE_CAPABILITY) {
kif->kf_flags |= KF_FLAG_CAPABILITY;
kif->kf_cap_rights = cap_rights(fp);
(void)cap_funwrap(fp, 0, &fp);
}
#else /* !CAPABILITIES */
KASSERT(fp->f_type != DTYPE_CAPABILITY,
("sysctl_kern_proc_filedesc: saw capability"));
#endif
switch (fp->f_type) {
case DTYPE_VNODE:
type = KF_TYPE_VNODE;
vref(fp->f_vnode);
data = fp->f_vnode;
break;
case DTYPE_SOCKET:
type = KF_TYPE_SOCKET;
data = fp->f_data;
break;
case DTYPE_PIPE:
type = KF_TYPE_PIPE;
data = fp->f_data;
break;
case DTYPE_FIFO:
type = KF_TYPE_FIFO;
vref(fp->f_vnode);
data = fp->f_vnode;
break;
case DTYPE_KQUEUE:
type = KF_TYPE_KQUEUE;
break;
case DTYPE_CRYPTO:
type = KF_TYPE_CRYPTO;
break;
case DTYPE_MQUEUE:
type = KF_TYPE_MQUEUE;
break;
case DTYPE_SHM:
type = KF_TYPE_SHM;
break;
case DTYPE_SEM:
type = KF_TYPE_SEM;
break;
case DTYPE_PTS:
type = KF_TYPE_PTS;
data = fp->f_data;
break;
#ifdef PROCDESC
case DTYPE_PROCDESC:
type = KF_TYPE_PROCDESC;
data = fp->f_data;
break;
#endif
default:
type = KF_TYPE_UNKNOWN;
break;
}
refcnt = fp->f_count;
fflags = fp->f_flag;
offset = fp->f_offset;
/*
* Create sysctl entry.
* It is OK to drop the filedesc lock here as we will
* re-validate and re-evaluate its properties when
* the loop continues.
*/
oldidx = req->oldidx;
if (type == KF_TYPE_VNODE || type == KF_TYPE_FIFO)
FILEDESC_SUNLOCK(fdp);
error = export_fd_for_sysctl(data, type, i,
fflags, refcnt, offset, kif, req);
if (type == KF_TYPE_VNODE || type == KF_TYPE_FIFO)
FILEDESC_SLOCK(fdp);
if (error) {
if (error == ENOMEM) {
/*
* The hack to keep the ABI of sysctl
* kern.proc.filedesc intact, but not
* to account a partially copied
* kinfo_file into the oldidx.
*/
req->oldidx = oldidx;
error = 0;
}
break;
}
}
FILEDESC_SUNLOCK(fdp);
fail:
if (fdp != NULL)
fddrop(fdp);
free(kif, M_TEMP);
return (error);
}
int
vntype_to_kinfo(int vtype)
{
struct {
int vtype;
int kf_vtype;
} vtypes_table[] = {
{ VBAD, KF_VTYPE_VBAD },
{ VBLK, KF_VTYPE_VBLK },
{ VCHR, KF_VTYPE_VCHR },
{ VDIR, KF_VTYPE_VDIR },
{ VFIFO, KF_VTYPE_VFIFO },
{ VLNK, KF_VTYPE_VLNK },
{ VNON, KF_VTYPE_VNON },
{ VREG, KF_VTYPE_VREG },
{ VSOCK, KF_VTYPE_VSOCK }
};
#define NVTYPES (sizeof(vtypes_table) / sizeof(*vtypes_table))
unsigned int i;
/*
* Perform vtype translation.
*/
for (i = 0; i < NVTYPES; i++)
if (vtypes_table[i].vtype == vtype)
break;
if (i < NVTYPES)
return (vtypes_table[i].kf_vtype);
return (KF_VTYPE_UNKNOWN);
}
static int
fill_vnode_info(struct vnode *vp, struct kinfo_file *kif)
{
struct vattr va;
char *fullpath, *freepath;
int error, vfslocked;
if (vp == NULL)
return (1);
kif->kf_vnode_type = vntype_to_kinfo(vp->v_type);
freepath = NULL;
fullpath = "-";
error = vn_fullpath(curthread, vp, &fullpath, &freepath);
if (error == 0) {
strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
}
if (freepath != NULL)
free(freepath, M_TEMP);
/*
* Retrieve vnode attributes.
*/
va.va_fsid = VNOVAL;
va.va_rdev = NODEV;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_lock(vp, LK_SHARED | LK_RETRY);
error = VOP_GETATTR(vp, &va, curthread->td_ucred);
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
if (error != 0)
return (error);
if (va.va_fsid != VNOVAL)
kif->kf_un.kf_file.kf_file_fsid = va.va_fsid;
else
kif->kf_un.kf_file.kf_file_fsid =
vp->v_mount->mnt_stat.f_fsid.val[0];
kif->kf_un.kf_file.kf_file_fileid = va.va_fileid;
kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode);
kif->kf_un.kf_file.kf_file_size = va.va_size;
kif->kf_un.kf_file.kf_file_rdev = va.va_rdev;
return (0);
}
static int
fill_socket_info(struct socket *so, struct kinfo_file *kif)
{
struct sockaddr *sa;
struct inpcb *inpcb;
struct unpcb *unpcb;
int error;
if (so == NULL)
return (1);
kif->kf_sock_domain = so->so_proto->pr_domain->dom_family;
kif->kf_sock_type = so->so_type;
kif->kf_sock_protocol = so->so_proto->pr_protocol;
kif->kf_un.kf_sock.kf_sock_pcb = (uintptr_t)so->so_pcb;
switch(kif->kf_sock_domain) {
case AF_INET:
case AF_INET6:
if (kif->kf_sock_protocol == IPPROTO_TCP) {
if (so->so_pcb != NULL) {
inpcb = (struct inpcb *)(so->so_pcb);
kif->kf_un.kf_sock.kf_sock_inpcb =
(uintptr_t)inpcb->inp_ppcb;
}
}
break;
case AF_UNIX:
if (so->so_pcb != NULL) {
unpcb = (struct unpcb *)(so->so_pcb);
if (unpcb->unp_conn) {
kif->kf_un.kf_sock.kf_sock_unpconn =
(uintptr_t)unpcb->unp_conn;
kif->kf_un.kf_sock.kf_sock_rcv_sb_state =
so->so_rcv.sb_state;
kif->kf_un.kf_sock.kf_sock_snd_sb_state =
so->so_snd.sb_state;
}
}
break;
}
error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa);
if (error == 0 && sa->sa_len <= sizeof(kif->kf_sa_local)) {
bcopy(sa, &kif->kf_sa_local, sa->sa_len);
free(sa, M_SONAME);
}
error = so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa);
if (error == 0 && sa->sa_len <= sizeof(kif->kf_sa_peer)) {
bcopy(sa, &kif->kf_sa_peer, sa->sa_len);
free(sa, M_SONAME);
}
strncpy(kif->kf_path, so->so_proto->pr_domain->dom_name,
sizeof(kif->kf_path));
return (0);
}
static int
fill_pts_info(struct tty *tp, struct kinfo_file *kif)
{
if (tp == NULL)
return (1);
kif->kf_un.kf_pts.kf_pts_dev = tty_udev(tp);
strlcpy(kif->kf_path, tty_devname(tp), sizeof(kif->kf_path));
return (0);
}
static int
fill_pipe_info(struct pipe *pi, struct kinfo_file *kif)
{
if (pi == NULL)
return (1);
kif->kf_un.kf_pipe.kf_pipe_addr = (uintptr_t)pi;
kif->kf_un.kf_pipe.kf_pipe_peer = (uintptr_t)pi->pipe_peer;
kif->kf_un.kf_pipe.kf_pipe_buffer_cnt = pi->pipe_buffer.cnt;
return (0);
}
static int
fill_procdesc_info(struct procdesc *pdp, struct kinfo_file *kif)
{
if (pdp == NULL)
return (1);
kif->kf_un.kf_proc.kf_pid = pdp->pd_pid;
return (0);
}
static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc, CTLFLAG_RD,
sysctl_kern_proc_filedesc, "Process filedesc entries");
#ifdef DDB
/*
* For the purposes of debugging, generate a human-readable string for the
* file type.
*/
static const char *
file_type_to_name(short type)
{
switch (type) {
case 0:
return ("zero");
case DTYPE_VNODE:
return ("vnod");
case DTYPE_SOCKET:
return ("sock");
case DTYPE_PIPE:
return ("pipe");
case DTYPE_FIFO:
return ("fifo");
case DTYPE_KQUEUE:
return ("kque");
case DTYPE_CRYPTO:
return ("crpt");
case DTYPE_MQUEUE:
return ("mque");
case DTYPE_SHM:
return ("shm");
case DTYPE_SEM:
return ("ksem");
default:
return ("unkn");
}
}
/*
* For the purposes of debugging, identify a process (if any, perhaps one of
* many) that references the passed file in its file descriptor array. Return
* NULL if none.
*/
static struct proc *
file_to_first_proc(struct file *fp)
{
struct filedesc *fdp;
struct proc *p;
int n;
FOREACH_PROC_IN_SYSTEM(p) {
if (p->p_state == PRS_NEW)
continue;
fdp = p->p_fd;
if (fdp == NULL)
continue;
for (n = 0; n < fdp->fd_nfiles; n++) {
if (fp == fdp->fd_ofiles[n])
return (p);
}
}
return (NULL);
}
static void
db_print_file(struct file *fp, int header)
{
struct proc *p;
if (header)
db_printf("%8s %4s %8s %8s %4s %5s %6s %8s %5s %12s\n",
"File", "Type", "Data", "Flag", "GCFl", "Count",
"MCount", "Vnode", "FPID", "FCmd");
p = file_to_first_proc(fp);
db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp,
file_type_to_name(fp->f_type), fp->f_data, fp->f_flag,
0, fp->f_count, 0, fp->f_vnode,
p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-");
}
DB_SHOW_COMMAND(file, db_show_file)
{
struct file *fp;
if (!have_addr) {
db_printf("usage: show file <addr>\n");
return;
}
fp = (struct file *)addr;
db_print_file(fp, 1);
}
DB_SHOW_COMMAND(files, db_show_files)
{
struct filedesc *fdp;
struct file *fp;
struct proc *p;
int header;
int n;
header = 1;
FOREACH_PROC_IN_SYSTEM(p) {
if (p->p_state == PRS_NEW)
continue;
if ((fdp = p->p_fd) == NULL)
continue;
for (n = 0; n < fdp->fd_nfiles; ++n) {
if ((fp = fdp->fd_ofiles[n]) == NULL)
continue;
db_print_file(fp, header);
header = 0;
}
}
}
#endif
SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
&maxfilesperproc, 0, "Maximum files allowed open per process");
SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
&maxfiles, 0, "Maximum number of files");
SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
__DEVOLATILE(int *, &openfiles), 0, "System-wide number of open files");
/* ARGSUSED*/
static void
filelistinit(void *dummy)
{
file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
mtx_init(&fdesc_mtx, "fdesc", NULL, MTX_DEF);
}
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL);
/*-------------------------------------------------------------------*/
static int
badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td)
{
return (EBADF);
}
static int
badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td)
{
return (EINVAL);
}
static int
badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td)
{
return (EBADF);
}
static int
badfo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td)
{
return (0);
}
static int
badfo_kqfilter(struct file *fp, struct knote *kn)
{
return (EBADF);
}
static int
badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td)
{
return (EBADF);
}
static int
badfo_close(struct file *fp, struct thread *td)
{
return (EBADF);
}
static int
badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
struct thread *td)
{
return (EBADF);
}
static int
badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
struct thread *td)
{
return (EBADF);
}
struct fileops badfileops = {
.fo_read = badfo_readwrite,
.fo_write = badfo_readwrite,
.fo_truncate = badfo_truncate,
.fo_ioctl = badfo_ioctl,
.fo_poll = badfo_poll,
.fo_kqfilter = badfo_kqfilter,
.fo_stat = badfo_stat,
.fo_close = badfo_close,
.fo_chmod = badfo_chmod,
.fo_chown = badfo_chown,
};
int
invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
struct thread *td)
{
return (EINVAL);
}
int
invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
struct thread *td)
{
return (EINVAL);
}
/*-------------------------------------------------------------------*/
/*
* File Descriptor pseudo-device driver (/dev/fd/).
*
* Opening minor device N dup()s the file (if any) connected to file
* descriptor N belonging to the calling process. Note that this driver
* consists of only the ``open()'' routine, because all subsequent
* references to this file will be direct to the other driver.
*
* XXX: we could give this one a cloning event handler if necessary.
*/
/* ARGSUSED */
static int
fdopen(struct cdev *dev, int mode, int type, struct thread *td)
{
/*
* XXX Kludge: set curthread->td_dupfd to contain the value of the
* file descriptor being sought for duplication. The error
* return ensures that the vnode for this device will be released
* by vn_open. Open will detect this special error and take the
* actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
* will simply report the error.
*/
td->td_dupfd = dev2unit(dev);
return (ENODEV);
}
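/*
 * Illustrative userland behaviour, as a minimal sketch:
 *
 *     fd = open("/dev/fd/0", O_RDONLY);
 *
 * returns a new descriptor referencing the same file as descriptor 0 of
 * the calling process; the special ENODEV return above is detected in the
 * open(2) path and handled by dupfdopen().
 */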
static struct cdevsw fildesc_cdevsw = {
.d_version = D_VERSION,
.d_open = fdopen,
.d_name = "FD",
};
static void
fildesc_drvinit(void *unused)
{
struct cdev *dev;
dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL,
UID_ROOT, GID_WHEEL, 0666, "fd/0");
make_dev_alias(dev, "stdin");
dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL,
UID_ROOT, GID_WHEEL, 0666, "fd/1");
make_dev_alias(dev, "stdout");
dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL,
UID_ROOT, GID_WHEEL, 0666, "fd/2");
make_dev_alias(dev, "stderr");
}
SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL);
Index: head/sys/kern/kern_environment.c
===================================================================
--- head/sys/kern/kern_environment.c (revision 225616)
+++ head/sys/kern/kern_environment.c (revision 225617)
@@ -1,623 +1,623 @@
/*-
* Copyright (c) 1998 Michael Smith
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* The unified bootloader passes us a pointer to a preserved copy of
* bootstrap/kernel environment variables. We convert them to a
* dynamic array of strings later when the VM subsystem is up.
*
* We make these available through the kenv(2) syscall for userland
* and through getenv()/freeenv(), setenv(), unsetenv() and testenv() for
* the kernel.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/libkern.h>
#include <sys/kenv.h>
#include <security/mac/mac_framework.h>
static MALLOC_DEFINE(M_KENV, "kenv", "kernel environment");
#define KENV_SIZE 512 /* Maximum number of environment strings */
/* pointer to the static environment */
char *kern_envp;
static int env_len;
static int env_pos;
static char *kernenv_next(char *);
/* dynamic environment variables */
char **kenvp;
struct mtx kenv_lock;
/*
* No need to protect this with a mutex since SYSINITS are single threaded.
*/
int dynamic_kenv = 0;
#define KENV_CHECK if (!dynamic_kenv) \
panic("%s: called before SI_SUB_KMEM", __func__)
int
-kenv(td, uap)
+sys_kenv(td, uap)
struct thread *td;
struct kenv_args /* {
int what;
const char *name;
char *value;
int len;
} */ *uap;
{
char *name, *value, *buffer = NULL;
size_t len, done, needed, buflen;
int error, i;
KASSERT(dynamic_kenv, ("kenv: dynamic_kenv = 0"));
error = 0;
if (uap->what == KENV_DUMP) {
#ifdef MAC
error = mac_kenv_check_dump(td->td_ucred);
if (error)
return (error);
#endif
done = needed = 0;
buflen = uap->len;
if (buflen > KENV_SIZE * (KENV_MNAMELEN + KENV_MVALLEN + 2))
buflen = KENV_SIZE * (KENV_MNAMELEN +
KENV_MVALLEN + 2);
if (uap->len > 0 && uap->value != NULL)
buffer = malloc(buflen, M_TEMP, M_WAITOK|M_ZERO);
mtx_lock(&kenv_lock);
for (i = 0; kenvp[i] != NULL; i++) {
len = strlen(kenvp[i]) + 1;
needed += len;
len = min(len, buflen - done);
/*
* If called with a NULL or insufficiently large
* buffer, just keep computing the required size.
*/
if (uap->value != NULL && buffer != NULL && len > 0) {
bcopy(kenvp[i], buffer + done, len);
done += len;
}
}
mtx_unlock(&kenv_lock);
if (buffer != NULL) {
error = copyout(buffer, uap->value, done);
free(buffer, M_TEMP);
}
td->td_retval[0] = ((done == needed) ? 0 : needed);
return (error);
}
switch (uap->what) {
case KENV_SET:
error = priv_check(td, PRIV_KENV_SET);
if (error)
return (error);
break;
case KENV_UNSET:
error = priv_check(td, PRIV_KENV_UNSET);
if (error)
return (error);
break;
}
name = malloc(KENV_MNAMELEN, M_TEMP, M_WAITOK);
error = copyinstr(uap->name, name, KENV_MNAMELEN, NULL);
if (error)
goto done;
switch (uap->what) {
case KENV_GET:
#ifdef MAC
error = mac_kenv_check_get(td->td_ucred, name);
if (error)
goto done;
#endif
value = getenv(name);
if (value == NULL) {
error = ENOENT;
goto done;
}
len = strlen(value) + 1;
if (len > uap->len)
len = uap->len;
error = copyout(value, uap->value, len);
freeenv(value);
if (error)
goto done;
td->td_retval[0] = len;
break;
case KENV_SET:
len = uap->len;
if (len < 1) {
error = EINVAL;
goto done;
}
if (len > KENV_MVALLEN)
len = KENV_MVALLEN;
value = malloc(len, M_TEMP, M_WAITOK);
error = copyinstr(uap->value, value, len, NULL);
if (error) {
free(value, M_TEMP);
goto done;
}
#ifdef MAC
error = mac_kenv_check_set(td->td_ucred, name, value);
if (error == 0)
#endif
setenv(name, value);
free(value, M_TEMP);
break;
case KENV_UNSET:
#ifdef MAC
error = mac_kenv_check_unset(td->td_ucred, name);
if (error)
goto done;
#endif
error = unsetenv(name);
if (error)
error = ENOENT;
break;
default:
error = EINVAL;
break;
}
done:
free(name, M_TEMP);
return (error);
}
void
init_static_kenv(char *buf, size_t len)
{
kern_envp = buf;
env_len = len;
env_pos = 0;
}
/*
* Set up the dynamic kernel environment.
*/
static void
init_dynamic_kenv(void *data __unused)
{
char *cp;
size_t len;
int i;
kenvp = malloc((KENV_SIZE + 1) * sizeof(char *), M_KENV,
M_WAITOK | M_ZERO);
i = 0;
for (cp = kern_envp; cp != NULL; cp = kernenv_next(cp)) {
len = strlen(cp) + 1;
if (len > KENV_MNAMELEN + 1 + KENV_MVALLEN + 1) {
printf("WARNING: too long kenv string, ignoring %s\n",
cp);
continue;
}
if (i < KENV_SIZE) {
kenvp[i] = malloc(len, M_KENV, M_WAITOK);
strcpy(kenvp[i++], cp);
} else
printf(
"WARNING: too many kenv strings, ignoring %s\n",
cp);
}
kenvp[i] = NULL;
mtx_init(&kenv_lock, "kernel environment", NULL, MTX_DEF);
dynamic_kenv = 1;
}
SYSINIT(kenv, SI_SUB_KMEM, SI_ORDER_ANY, init_dynamic_kenv, NULL);
void
freeenv(char *env)
{
if (dynamic_kenv)
free(env, M_KENV);
}
/*
* Internal functions for string lookup.
*/
static char *
_getenv_dynamic(const char *name, int *idx)
{
char *cp;
int len, i;
mtx_assert(&kenv_lock, MA_OWNED);
len = strlen(name);
for (cp = kenvp[0], i = 0; cp != NULL; cp = kenvp[++i]) {
if ((strncmp(cp, name, len) == 0) &&
(cp[len] == '=')) {
if (idx != NULL)
*idx = i;
return (cp + len + 1);
}
}
return (NULL);
}
static char *
_getenv_static(const char *name)
{
char *cp, *ep;
int len;
for (cp = kern_envp; cp != NULL; cp = kernenv_next(cp)) {
for (ep = cp; (*ep != '=') && (*ep != 0); ep++)
;
if (*ep != '=')
continue;
len = ep - cp;
ep++;
if (!strncmp(name, cp, len) && name[len] == 0)
return (ep);
}
return (NULL);
}
/*
* Look up an environment variable by name.
* Return a pointer to the string if found.
* The pointer has to be freed with freeenv()
* after use.
*/
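/*
 * Typical in-kernel usage sketch (the variable name is only an example):
 *
 *	char *val;
 *
 *	val = getenv("hw.physmem");
 *	if (val != NULL) {
 *		... parse val ...
 *		freeenv(val);
 *	}
 */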
char *
getenv(const char *name)
{
char buf[KENV_MNAMELEN + 1 + KENV_MVALLEN + 1];
char *ret, *cp;
int len;
if (dynamic_kenv) {
mtx_lock(&kenv_lock);
cp = _getenv_dynamic(name, NULL);
if (cp != NULL) {
strcpy(buf, cp);
mtx_unlock(&kenv_lock);
len = strlen(buf) + 1;
ret = malloc(len, M_KENV, M_WAITOK);
strcpy(ret, buf);
} else {
mtx_unlock(&kenv_lock);
ret = NULL;
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
"getenv");
}
} else
ret = _getenv_static(name);
return (ret);
}
/*
* Test if an environment variable is defined.
*/
int
testenv(const char *name)
{
char *cp;
if (dynamic_kenv) {
mtx_lock(&kenv_lock);
cp = _getenv_dynamic(name, NULL);
mtx_unlock(&kenv_lock);
} else
cp = _getenv_static(name);
if (cp != NULL)
return (1);
return (0);
}
static int
setenv_static(const char *name, const char *value)
{
int len;
if (env_pos >= env_len)
return (-1);
/* Check space for x=y and two nuls */
len = strlen(name) + strlen(value);
if (len + 3 < env_len - env_pos) {
len = sprintf(&kern_envp[env_pos], "%s=%s", name, value);
env_pos += len+1;
kern_envp[env_pos] = '\0';
return (0);
} else
return (-1);
}
/*
* Set an environment variable by name.
*/
int
setenv(const char *name, const char *value)
{
char *buf, *cp, *oldenv;
int namelen, vallen, i;
if (dynamic_kenv == 0 && env_len > 0)
return (setenv_static(name, value));
KENV_CHECK;
namelen = strlen(name) + 1;
if (namelen > KENV_MNAMELEN)
return (-1);
vallen = strlen(value) + 1;
if (vallen > KENV_MVALLEN)
return (-1);
buf = malloc(namelen + vallen, M_KENV, M_WAITOK);
sprintf(buf, "%s=%s", name, value);
mtx_lock(&kenv_lock);
cp = _getenv_dynamic(name, &i);
if (cp != NULL) {
oldenv = kenvp[i];
kenvp[i] = buf;
mtx_unlock(&kenv_lock);
free(oldenv, M_KENV);
} else {
/* We add the option if it wasn't found */
for (i = 0; (cp = kenvp[i]) != NULL; i++)
;
/* Bounds checking */
if (i < 0 || i >= KENV_SIZE) {
free(buf, M_KENV);
mtx_unlock(&kenv_lock);
return (-1);
}
kenvp[i] = buf;
kenvp[i + 1] = NULL;
mtx_unlock(&kenv_lock);
}
return (0);
}
/*
* Unset an environment variable string.
*/
int
unsetenv(const char *name)
{
char *cp, *oldenv;
int i, j;
KENV_CHECK;
mtx_lock(&kenv_lock);
cp = _getenv_dynamic(name, &i);
if (cp != NULL) {
oldenv = kenvp[i];
for (j = i + 1; kenvp[j] != NULL; j++)
kenvp[i++] = kenvp[j];
kenvp[i] = NULL;
mtx_unlock(&kenv_lock);
free(oldenv, M_KENV);
return (0);
}
mtx_unlock(&kenv_lock);
return (-1);
}
/*
* Return a string value from an environment variable.
*/
int
getenv_string(const char *name, char *data, int size)
{
char *tmp;
tmp = getenv(name);
if (tmp != NULL) {
strlcpy(data, tmp, size);
freeenv(tmp);
return (1);
} else
return (0);
}
/*
* Return an integer value from an environment variable.
*/
int
getenv_int(const char *name, int *data)
{
quad_t tmp;
int rval;
rval = getenv_quad(name, &tmp);
if (rval)
*data = (int) tmp;
return (rval);
}
/*
* Return an unsigned integer value from an environment variable.
*/
int
getenv_uint(const char *name, unsigned int *data)
{
quad_t tmp;
int rval;
rval = getenv_quad(name, &tmp);
if (rval)
*data = (unsigned int) tmp;
return (rval);
}
/*
* Return a long value from an environment variable.
*/
int
getenv_long(const char *name, long *data)
{
quad_t tmp;
int rval;
rval = getenv_quad(name, &tmp);
if (rval)
*data = (long) tmp;
return (rval);
}
/*
* Return an unsigned long value from an environment variable.
*/
int
getenv_ulong(const char *name, unsigned long *data)
{
quad_t tmp;
int rval;
rval = getenv_quad(name, &tmp);
if (rval)
*data = (unsigned long) tmp;
return (rval);
}
/*
* Return a quad_t value from an environment variable.
*/
int
getenv_quad(const char *name, quad_t *data)
{
char *value;
char *vtp;
quad_t iv;
value = getenv(name);
if (value == NULL)
return (0);
iv = strtoq(value, &vtp, 0);
if (vtp == value || (vtp[0] != '\0' && vtp[1] != '\0')) {
freeenv(value);
return (0);
}
switch (vtp[0]) {
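/*
 * The cases below fall through intentionally: each recognized suffix
 * accumulates one more factor of 1024, so 'k' scales by 2^10, 'm' by
 * 2^20, 'g' by 2^30 and 't' by 2^40.
 */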
case 't': case 'T':
iv *= 1024;
case 'g': case 'G':
iv *= 1024;
case 'm': case 'M':
iv *= 1024;
case 'k': case 'K':
iv *= 1024;
case '\0':
break;
default:
freeenv(value);
return (0);
}
*data = iv;
freeenv(value);
return (1);
}
/*
* Find the next entry after the one that (cp) falls within; return a
* pointer to its start, or NULL if there are no more.
*/
static char *
kernenv_next(char *cp)
{
if (cp != NULL) {
while (*cp != 0)
cp++;
cp++;
if (*cp == 0)
cp = NULL;
}
return (cp);
}
void
tunable_int_init(void *data)
{
struct tunable_int *d = (struct tunable_int *)data;
TUNABLE_INT_FETCH(d->path, d->var);
}
void
tunable_long_init(void *data)
{
struct tunable_long *d = (struct tunable_long *)data;
TUNABLE_LONG_FETCH(d->path, d->var);
}
void
tunable_ulong_init(void *data)
{
struct tunable_ulong *d = (struct tunable_ulong *)data;
TUNABLE_ULONG_FETCH(d->path, d->var);
}
void
tunable_quad_init(void *data)
{
struct tunable_quad *d = (struct tunable_quad *)data;
TUNABLE_QUAD_FETCH(d->path, d->var);
}
void
tunable_str_init(void *data)
{
struct tunable_str *d = (struct tunable_str *)data;
TUNABLE_STR_FETCH(d->path, d->var, d->size);
}
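/*
 * Illustrative sketch of how a driver typically consumes these helpers
 * (the tunable name and variable are hypothetical):
 *
 *	static int mydrv_debug = 0;
 *	TUNABLE_INT("hw.mydrv.debug", &mydrv_debug);
 *
 * The TUNABLE_INT() macro arranges for tunable_int_init() to run early in
 * boot, fetching "hw.mydrv.debug" from the kernel environment (for example
 * a loader.conf setting) into mydrv_debug.
 */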
Index: head/sys/kern/kern_event.c
===================================================================
--- head/sys/kern/kern_event.c (revision 225616)
+++ head/sys/kern/kern_event.c (revision 225617)
@@ -1,2201 +1,2201 @@
/*-
* Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
* Copyright 2004 John-Mark Gurney <jmg@FreeBSD.org>
* Copyright (c) 2009 Apple, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ktrace.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capability.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/unistd.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/kthread.h>
#include <sys/selinfo.h>
#include <sys/queue.h>
#include <sys/event.h>
#include <sys/eventvar.h>
#include <sys/poll.h>
#include <sys/protosw.h>
#include <sys/sigio.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/syscallsubr.h>
#include <sys/taskqueue.h>
#include <sys/uio.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <vm/uma.h>
static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
/*
* This lock is used if multiple kq locks are required. This possibly
* should be made into a per proc lock.
*/
static struct mtx kq_global;
MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF);
#define KQ_GLOBAL_LOCK(lck, haslck) do { \
if (!haslck) \
mtx_lock(lck); \
haslck = 1; \
} while (0)
#define KQ_GLOBAL_UNLOCK(lck, haslck) do { \
if (haslck) \
mtx_unlock(lck); \
haslck = 0; \
} while (0)
TASKQUEUE_DEFINE_THREAD(kqueue);
static int kevent_copyout(void *arg, struct kevent *kevp, int count);
static int kevent_copyin(void *arg, struct kevent *kevp, int count);
static int kqueue_register(struct kqueue *kq, struct kevent *kev,
struct thread *td, int waitok);
static int kqueue_acquire(struct file *fp, struct kqueue **kqp);
static void kqueue_release(struct kqueue *kq, int locked);
static int kqueue_expand(struct kqueue *kq, struct filterops *fops,
uintptr_t ident, int waitok);
static void kqueue_task(void *arg, int pending);
static int kqueue_scan(struct kqueue *kq, int maxevents,
struct kevent_copyops *k_ops,
const struct timespec *timeout,
struct kevent *keva, struct thread *td);
static void kqueue_wakeup(struct kqueue *kq);
static struct filterops *kqueue_fo_find(int filt);
static void kqueue_fo_release(int filt);
static fo_rdwr_t kqueue_read;
static fo_rdwr_t kqueue_write;
static fo_truncate_t kqueue_truncate;
static fo_ioctl_t kqueue_ioctl;
static fo_poll_t kqueue_poll;
static fo_kqfilter_t kqueue_kqfilter;
static fo_stat_t kqueue_stat;
static fo_close_t kqueue_close;
static struct fileops kqueueops = {
.fo_read = kqueue_read,
.fo_write = kqueue_write,
.fo_truncate = kqueue_truncate,
.fo_ioctl = kqueue_ioctl,
.fo_poll = kqueue_poll,
.fo_kqfilter = kqueue_kqfilter,
.fo_stat = kqueue_stat,
.fo_close = kqueue_close,
.fo_chmod = invfo_chmod,
.fo_chown = invfo_chown,
};
static int knote_attach(struct knote *kn, struct kqueue *kq);
static void knote_drop(struct knote *kn, struct thread *td);
static void knote_enqueue(struct knote *kn);
static void knote_dequeue(struct knote *kn);
static void knote_init(void);
static struct knote *knote_alloc(int waitok);
static void knote_free(struct knote *kn);
static void filt_kqdetach(struct knote *kn);
static int filt_kqueue(struct knote *kn, long hint);
static int filt_procattach(struct knote *kn);
static void filt_procdetach(struct knote *kn);
static int filt_proc(struct knote *kn, long hint);
static int filt_fileattach(struct knote *kn);
static void filt_timerexpire(void *knx);
static int filt_timerattach(struct knote *kn);
static void filt_timerdetach(struct knote *kn);
static int filt_timer(struct knote *kn, long hint);
static int filt_userattach(struct knote *kn);
static void filt_userdetach(struct knote *kn);
static int filt_user(struct knote *kn, long hint);
static void filt_usertouch(struct knote *kn, struct kevent *kev,
u_long type);
static struct filterops file_filtops = {
.f_isfd = 1,
.f_attach = filt_fileattach,
};
static struct filterops kqread_filtops = {
.f_isfd = 1,
.f_detach = filt_kqdetach,
.f_event = filt_kqueue,
};
/* XXX - move to kern_proc.c? */
static struct filterops proc_filtops = {
.f_isfd = 0,
.f_attach = filt_procattach,
.f_detach = filt_procdetach,
.f_event = filt_proc,
};
static struct filterops timer_filtops = {
.f_isfd = 0,
.f_attach = filt_timerattach,
.f_detach = filt_timerdetach,
.f_event = filt_timer,
};
static struct filterops user_filtops = {
.f_attach = filt_userattach,
.f_detach = filt_userdetach,
.f_event = filt_user,
.f_touch = filt_usertouch,
};
static uma_zone_t knote_zone;
static int kq_ncallouts = 0;
static int kq_calloutmax = (4 * 1024);
SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
&kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
/* XXX - ensure not KN_INFLUX?? */
#define KNOTE_ACTIVATE(kn, islock) do { \
if ((islock)) \
mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED); \
else \
KQ_LOCK((kn)->kn_kq); \
(kn)->kn_status |= KN_ACTIVE; \
if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) \
knote_enqueue((kn)); \
if (!(islock)) \
KQ_UNLOCK((kn)->kn_kq); \
} while(0)
#define KQ_LOCK(kq) do { \
mtx_lock(&(kq)->kq_lock); \
} while (0)
#define KQ_FLUX_WAKEUP(kq) do { \
if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) { \
(kq)->kq_state &= ~KQ_FLUXWAIT; \
wakeup((kq)); \
} \
} while (0)
#define KQ_UNLOCK_FLUX(kq) do { \
KQ_FLUX_WAKEUP(kq); \
mtx_unlock(&(kq)->kq_lock); \
} while (0)
#define KQ_UNLOCK(kq) do { \
mtx_unlock(&(kq)->kq_lock); \
} while (0)
#define KQ_OWNED(kq) do { \
mtx_assert(&(kq)->kq_lock, MA_OWNED); \
} while (0)
#define KQ_NOTOWNED(kq) do { \
mtx_assert(&(kq)->kq_lock, MA_NOTOWNED); \
} while (0)
#define KN_LIST_LOCK(kn) do { \
if (kn->kn_knlist != NULL) \
kn->kn_knlist->kl_lock(kn->kn_knlist->kl_lockarg); \
} while (0)
#define KN_LIST_UNLOCK(kn) do { \
if (kn->kn_knlist != NULL) \
kn->kn_knlist->kl_unlock(kn->kn_knlist->kl_lockarg); \
} while (0)
#define KNL_ASSERT_LOCK(knl, islocked) do { \
if (islocked) \
KNL_ASSERT_LOCKED(knl); \
else \
KNL_ASSERT_UNLOCKED(knl); \
} while (0)
#ifdef INVARIANTS
#define KNL_ASSERT_LOCKED(knl) do { \
knl->kl_assert_locked((knl)->kl_lockarg); \
} while (0)
#define KNL_ASSERT_UNLOCKED(knl) do { \
knl->kl_assert_unlocked((knl)->kl_lockarg); \
} while (0)
#else /* !INVARIANTS */
#define KNL_ASSERT_LOCKED(knl) do {} while(0)
#define KNL_ASSERT_UNLOCKED(knl) do {} while (0)
#endif /* INVARIANTS */
#define KN_HASHSIZE 64 /* XXX should be tunable */
#define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
static int
filt_nullattach(struct knote *kn)
{
return (ENXIO);
};
struct filterops null_filtops = {
.f_isfd = 0,
.f_attach = filt_nullattach,
};
/* XXX - make SYSINIT to add these, and move into respective modules. */
extern struct filterops sig_filtops;
extern struct filterops fs_filtops;
/*
* Table for all system-defined filters.
*/
static struct mtx filterops_lock;
MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops",
MTX_DEF);
static struct {
struct filterops *for_fop;
int for_refcnt;
} sysfilt_ops[EVFILT_SYSCOUNT] = {
{ &file_filtops }, /* EVFILT_READ */
{ &file_filtops }, /* EVFILT_WRITE */
{ &null_filtops }, /* EVFILT_AIO */
{ &file_filtops }, /* EVFILT_VNODE */
{ &proc_filtops }, /* EVFILT_PROC */
{ &sig_filtops }, /* EVFILT_SIGNAL */
{ &timer_filtops }, /* EVFILT_TIMER */
{ &null_filtops }, /* former EVFILT_NETDEV */
{ &fs_filtops }, /* EVFILT_FS */
{ &null_filtops }, /* EVFILT_LIO */
{ &user_filtops }, /* EVFILT_USER */
};
/*
* Simple redirection for all cdevsw style objects to call their fo_kqfilter
* method.
*/
static int
filt_fileattach(struct knote *kn)
{
return (fo_kqfilter(kn->kn_fp, kn));
}
/*ARGSUSED*/
static int
kqueue_kqfilter(struct file *fp, struct knote *kn)
{
struct kqueue *kq = kn->kn_fp->f_data;
if (kn->kn_filter != EVFILT_READ)
return (EINVAL);
kn->kn_status |= KN_KQUEUE;
kn->kn_fop = &kqread_filtops;
knlist_add(&kq->kq_sel.si_note, kn, 0);
return (0);
}
static void
filt_kqdetach(struct knote *kn)
{
struct kqueue *kq = kn->kn_fp->f_data;
knlist_remove(&kq->kq_sel.si_note, kn, 0);
}
/*ARGSUSED*/
static int
filt_kqueue(struct knote *kn, long hint)
{
struct kqueue *kq = kn->kn_fp->f_data;
kn->kn_data = kq->kq_count;
return (kn->kn_data > 0);
}
/* XXX - move to kern_proc.c? */
static int
filt_procattach(struct knote *kn)
{
struct proc *p;
int immediate;
int error;
immediate = 0;
p = pfind(kn->kn_id);
if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) {
p = zpfind(kn->kn_id);
immediate = 1;
} else if (p != NULL && (p->p_flag & P_WEXIT)) {
immediate = 1;
}
if (p == NULL)
return (ESRCH);
if ((error = p_cansee(curthread, p))) {
PROC_UNLOCK(p);
return (error);
}
kn->kn_ptr.p_proc = p;
kn->kn_flags |= EV_CLEAR; /* automatically set */
/*
* internal flag indicating registration done by kernel
*/
if (kn->kn_flags & EV_FLAG1) {
kn->kn_data = kn->kn_sdata; /* ppid */
kn->kn_fflags = NOTE_CHILD;
kn->kn_flags &= ~EV_FLAG1;
}
if (immediate == 0)
knlist_add(&p->p_klist, kn, 1);
/*
* Immediately activate any exit notes if the target process is a
* zombie. This is necessary to handle the case where the target
* process, e.g. a child, dies before the kevent is registered.
*/
if (immediate && filt_proc(kn, NOTE_EXIT))
KNOTE_ACTIVATE(kn, 0);
PROC_UNLOCK(p);
return (0);
}
/*
* The knote may be attached to a different process, which may exit,
* leaving nothing for the knote to be attached to. So when the process
* exits, the knote is marked as DETACHED and also flagged as ONESHOT so
* it will be deleted when read out. However, as part of the knote deletion,
* this routine is called, so a check is needed to avoid actually performing
* a detach, because the original process does not exist any more.
*/
/* XXX - move to kern_proc.c? */
static void
filt_procdetach(struct knote *kn)
{
struct proc *p;
p = kn->kn_ptr.p_proc;
knlist_remove(&p->p_klist, kn, 0);
kn->kn_ptr.p_proc = NULL;
}
/* XXX - move to kern_proc.c? */
static int
filt_proc(struct knote *kn, long hint)
{
struct proc *p = kn->kn_ptr.p_proc;
u_int event;
/*
* mask off extra data
*/
event = (u_int)hint & NOTE_PCTRLMASK;
/*
* if the user is interested in this event, record it.
*/
if (kn->kn_sfflags & event)
kn->kn_fflags |= event;
/*
* process is gone, so flag the event as finished.
*/
if (event == NOTE_EXIT) {
if (!(kn->kn_status & KN_DETACHED))
knlist_remove_inevent(&p->p_klist, kn);
kn->kn_flags |= (EV_EOF | EV_ONESHOT);
kn->kn_data = p->p_xstat;
kn->kn_ptr.p_proc = NULL;
return (1);
}
return (kn->kn_fflags != 0);
}
/*
* Called when a process forks. It does mostly the same as knote(),
* activating all knotes registered to be activated when the process
* forks. Additionally, for each knote attached to the parent, check
* whether the user wants to track the new process. If so, attach a new
* knote to the child and immediately report an event with the child's
* pid.
*/
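/*
 * Illustrative userland sketch of NOTE_TRACK (error handling omitted; kq
 * and pid are assumed to exist):
 *
 *	#include <sys/types.h>
 *	#include <sys/event.h>
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, pid, EVFILT_PROC, EV_ADD,
 *	    NOTE_EXIT | NOTE_FORK | NOTE_TRACK, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 * When "pid" forks, the parent's knote reports NOTE_FORK, and a new knote
 * is registered on the child; that knote's event is identified by the
 * child's pid and carries NOTE_CHILD, with the parent's pid in data.
 */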
void
knote_fork(struct knlist *list, int pid)
{
struct kqueue *kq;
struct knote *kn;
struct kevent kev;
int error;
if (list == NULL)
return;
list->kl_lock(list->kl_lockarg);
SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
if ((kn->kn_status & KN_INFLUX) == KN_INFLUX)
continue;
kq = kn->kn_kq;
KQ_LOCK(kq);
if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
KQ_UNLOCK(kq);
continue;
}
/*
* The same as knote(), activate the event.
*/
if ((kn->kn_sfflags & NOTE_TRACK) == 0) {
kn->kn_status |= KN_HASKQLOCK;
if (kn->kn_fop->f_event(kn, NOTE_FORK | pid))
KNOTE_ACTIVATE(kn, 1);
kn->kn_status &= ~KN_HASKQLOCK;
KQ_UNLOCK(kq);
continue;
}
/*
* The NOTE_TRACK case. In addition to the activation
* of the event, we need to register new event to
* track the child. Drop the locks in preparation for
* the call to kqueue_register().
*/
kn->kn_status |= KN_INFLUX;
KQ_UNLOCK(kq);
list->kl_unlock(list->kl_lockarg);
/*
* Activate existing knote and register a knote with
* new process.
*/
kev.ident = pid;
kev.filter = kn->kn_filter;
kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
kev.fflags = kn->kn_sfflags;
kev.data = kn->kn_id; /* parent */
kev.udata = kn->kn_kevent.udata;/* preserve udata */
error = kqueue_register(kq, &kev, NULL, 0);
if (kn->kn_fop->f_event(kn, NOTE_FORK | pid))
KNOTE_ACTIVATE(kn, 0);
if (error)
kn->kn_fflags |= NOTE_TRACKERR;
KQ_LOCK(kq);
kn->kn_status &= ~KN_INFLUX;
KQ_UNLOCK_FLUX(kq);
list->kl_lock(list->kl_lockarg);
}
list->kl_unlock(list->kl_lockarg);
}
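/*
 * Convert a timer period expressed in milliseconds (the knote's kn_sdata)
 * into callout ticks.
 */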
static int
timertoticks(intptr_t data)
{
struct timeval tv;
int tticks;
tv.tv_sec = data / 1000;
tv.tv_usec = (data % 1000) * 1000;
tticks = tvtohz(&tv);
return tticks;
}
/* XXX - move to kern_timeout.c? */
static void
filt_timerexpire(void *knx)
{
struct knote *kn = knx;
struct callout *calloutp;
kn->kn_data++;
KNOTE_ACTIVATE(kn, 0); /* XXX - handle locking */
if ((kn->kn_flags & EV_ONESHOT) != EV_ONESHOT) {
calloutp = (struct callout *)kn->kn_hook;
callout_reset_curcpu(calloutp, timertoticks(kn->kn_sdata),
filt_timerexpire, kn);
}
}
/*
* data contains the amount of time to sleep, in milliseconds
*/
/* XXX - move to kern_timeout.c? */
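/*
 * Illustrative userland sketch (the ident and period are arbitrary):
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD, 0, 500, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 * arms a periodic timer firing roughly every 500 ms; each expiry bumps
 * kn_data, and since EV_CLEAR is set automatically the accumulated count
 * is returned and reset when the event is harvested.
 */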
static int
filt_timerattach(struct knote *kn)
{
struct callout *calloutp;
atomic_add_int(&kq_ncallouts, 1);
if (kq_ncallouts >= kq_calloutmax) {
atomic_add_int(&kq_ncallouts, -1);
return (ENOMEM);
}
kn->kn_flags |= EV_CLEAR; /* automatically set */
kn->kn_status &= ~KN_DETACHED; /* knlist_add usually sets it */
calloutp = malloc(sizeof(*calloutp), M_KQUEUE, M_WAITOK);
callout_init(calloutp, CALLOUT_MPSAFE);
kn->kn_hook = calloutp;
callout_reset_curcpu(calloutp, timertoticks(kn->kn_sdata),
filt_timerexpire, kn);
return (0);
}
/* XXX - move to kern_timeout.c? */
static void
filt_timerdetach(struct knote *kn)
{
struct callout *calloutp;
calloutp = (struct callout *)kn->kn_hook;
callout_drain(calloutp);
free(calloutp, M_KQUEUE);
atomic_add_int(&kq_ncallouts, -1);
kn->kn_status |= KN_DETACHED; /* knlist_remove usually clears it */
}
/* XXX - move to kern_timeout.c? */
static int
filt_timer(struct knote *kn, long hint)
{
return (kn->kn_data != 0);
}
static int
filt_userattach(struct knote *kn)
{
/*
* EVFILT_USER knotes are not attached to anything in the kernel.
*/
kn->kn_hook = NULL;
if (kn->kn_fflags & NOTE_TRIGGER)
kn->kn_hookid = 1;
else
kn->kn_hookid = 0;
return (0);
}
static void
filt_userdetach(__unused struct knote *kn)
{
/*
* EVFILT_USER knotes are not attached to anything in the kernel.
*/
}
static int
filt_user(struct knote *kn, __unused long hint)
{
return (kn->kn_hookid);
}
static void
filt_usertouch(struct knote *kn, struct kevent *kev, u_long type)
{
u_int ffctrl;
switch (type) {
case EVENT_REGISTER:
if (kev->fflags & NOTE_TRIGGER)
kn->kn_hookid = 1;
ffctrl = kev->fflags & NOTE_FFCTRLMASK;
kev->fflags &= NOTE_FFLAGSMASK;
switch (ffctrl) {
case NOTE_FFNOP:
break;
case NOTE_FFAND:
kn->kn_sfflags &= kev->fflags;
break;
case NOTE_FFOR:
kn->kn_sfflags |= kev->fflags;
break;
case NOTE_FFCOPY:
kn->kn_sfflags = kev->fflags;
break;
default:
/* XXX Return error? */
break;
}
kn->kn_sdata = kev->data;
if (kev->flags & EV_CLEAR) {
kn->kn_hookid = 0;
kn->kn_data = 0;
kn->kn_fflags = 0;
}
break;
case EVENT_PROCESS:
*kev = kn->kn_kevent;
kev->fflags = kn->kn_sfflags;
kev->data = kn->kn_sdata;
if (kn->kn_flags & EV_CLEAR) {
kn->kn_hookid = 0;
kn->kn_data = 0;
kn->kn_fflags = 0;
}
break;
default:
panic("filt_usertouch() - invalid type (%ld)", type);
break;
}
}
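/*
 * Illustrative userland sketch of EVFILT_USER (the ident is arbitrary;
 * error handling omitted):
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, 1, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 *	EV_SET(&kev, 1, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 * The second change sets kn_hookid through filt_usertouch(), so the next
 * kevent() wait returns the event; EV_CLEAR rearms it afterwards.
 */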
int
-kqueue(struct thread *td, struct kqueue_args *uap)
+sys_kqueue(struct thread *td, struct kqueue_args *uap)
{
struct filedesc *fdp;
struct kqueue *kq;
struct file *fp;
int fd, error;
fdp = td->td_proc->p_fd;
error = falloc(td, &fp, &fd, 0);
if (error)
goto done2;
/* An extra reference on `nfp' has been held for us by falloc(). */
kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO);
mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF|MTX_DUPOK);
TAILQ_INIT(&kq->kq_head);
kq->kq_fdp = fdp;
knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock);
TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
FILEDESC_XLOCK(fdp);
SLIST_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
FILEDESC_XUNLOCK(fdp);
finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
fdrop(fp, td);
td->td_retval[0] = fd;
done2:
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct kevent_args {
int fd;
const struct kevent *changelist;
int nchanges;
struct kevent *eventlist;
int nevents;
const struct timespec *timeout;
};
#endif
int
-kevent(struct thread *td, struct kevent_args *uap)
+sys_kevent(struct thread *td, struct kevent_args *uap)
{
struct timespec ts, *tsp;
struct kevent_copyops k_ops = { uap,
kevent_copyout,
kevent_copyin};
int error;
#ifdef KTRACE
struct uio ktruio;
struct iovec ktriov;
struct uio *ktruioin = NULL;
struct uio *ktruioout = NULL;
#endif
if (uap->timeout != NULL) {
error = copyin(uap->timeout, &ts, sizeof(ts));
if (error)
return (error);
tsp = &ts;
} else
tsp = NULL;
#ifdef KTRACE
if (KTRPOINT(td, KTR_GENIO)) {
ktriov.iov_base = uap->changelist;
ktriov.iov_len = uap->nchanges * sizeof(struct kevent);
ktruio = (struct uio){ .uio_iov = &ktriov, .uio_iovcnt = 1,
.uio_segflg = UIO_USERSPACE, .uio_rw = UIO_READ,
.uio_td = td };
ktruioin = cloneuio(&ktruio);
ktriov.iov_base = uap->eventlist;
ktriov.iov_len = uap->nevents * sizeof(struct kevent);
ktruioout = cloneuio(&ktruio);
}
#endif
error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
&k_ops, tsp);
#ifdef KTRACE
if (ktruioin != NULL) {
ktruioin->uio_resid = uap->nchanges * sizeof(struct kevent);
ktrgenio(uap->fd, UIO_WRITE, ktruioin, 0);
ktruioout->uio_resid = td->td_retval[0] * sizeof(struct kevent);
ktrgenio(uap->fd, UIO_READ, ktruioout, error);
}
#endif
return (error);
}
/*
* Copy 'count' items into the destination list pointed to by uap->eventlist.
*/
static int
kevent_copyout(void *arg, struct kevent *kevp, int count)
{
struct kevent_args *uap;
int error;
KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
uap = (struct kevent_args *)arg;
error = copyout(kevp, uap->eventlist, count * sizeof *kevp);
if (error == 0)
uap->eventlist += count;
return (error);
}
/*
* Copy 'count' items from the list pointed to by uap->changelist.
*/
static int
kevent_copyin(void *arg, struct kevent *kevp, int count)
{
struct kevent_args *uap;
int error;
KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
uap = (struct kevent_args *)arg;
error = copyin(uap->changelist, kevp, count * sizeof *kevp);
if (error == 0)
uap->changelist += count;
return (error);
}
int
kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
struct kevent_copyops *k_ops, const struct timespec *timeout)
{
struct kevent keva[KQ_NEVENTS];
struct kevent *kevp, *changes;
struct kqueue *kq;
struct file *fp;
int i, n, nerrors, error;
if ((error = fget(td, fd, CAP_POST_EVENT, &fp)) != 0)
return (error);
if ((error = kqueue_acquire(fp, &kq)) != 0)
goto done_norel;
nerrors = 0;
while (nchanges > 0) {
n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges;
error = k_ops->k_copyin(k_ops->arg, keva, n);
if (error)
goto done;
changes = keva;
for (i = 0; i < n; i++) {
kevp = &changes[i];
if (!kevp->filter)
continue;
kevp->flags &= ~EV_SYSFLAGS;
error = kqueue_register(kq, kevp, td, 1);
if (error || (kevp->flags & EV_RECEIPT)) {
if (nevents != 0) {
kevp->flags = EV_ERROR;
kevp->data = error;
(void) k_ops->k_copyout(k_ops->arg,
kevp, 1);
nevents--;
nerrors++;
} else {
goto done;
}
}
}
nchanges -= n;
}
if (nerrors) {
td->td_retval[0] = nerrors;
error = 0;
goto done;
}
error = kqueue_scan(kq, nevents, k_ops, timeout, keva, td);
done:
kqueue_release(kq, 0);
done_norel:
fdrop(fp, td);
return (error);
}
int
kqueue_add_filteropts(int filt, struct filterops *filtops)
{
int error;
error = 0;
if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) {
printf(
"trying to add a filterop that is out of range: %d is beyond %d\n",
~filt, EVFILT_SYSCOUNT);
return EINVAL;
}
mtx_lock(&filterops_lock);
if (sysfilt_ops[~filt].for_fop != &null_filtops &&
sysfilt_ops[~filt].for_fop != NULL)
error = EEXIST;
else {
sysfilt_ops[~filt].for_fop = filtops;
sysfilt_ops[~filt].for_refcnt = 0;
}
mtx_unlock(&filterops_lock);
return (error);
}
int
kqueue_del_filteropts(int filt)
{
int error;
error = 0;
if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
return EINVAL;
mtx_lock(&filterops_lock);
if (sysfilt_ops[~filt].for_fop == &null_filtops ||
sysfilt_ops[~filt].for_fop == NULL)
error = EINVAL;
else if (sysfilt_ops[~filt].for_refcnt != 0)
error = EBUSY;
else {
sysfilt_ops[~filt].for_fop = &null_filtops;
sysfilt_ops[~filt].for_refcnt = 0;
}
mtx_unlock(&filterops_lock);
return error;
}
static struct filterops *
kqueue_fo_find(int filt)
{
if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
return NULL;
mtx_lock(&filterops_lock);
sysfilt_ops[~filt].for_refcnt++;
if (sysfilt_ops[~filt].for_fop == NULL)
sysfilt_ops[~filt].for_fop = &null_filtops;
mtx_unlock(&filterops_lock);
return sysfilt_ops[~filt].for_fop;
}
static void
kqueue_fo_release(int filt)
{
if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
return;
mtx_lock(&filterops_lock);
KASSERT(sysfilt_ops[~filt].for_refcnt > 0,
("filter object refcount not valid on release"));
sysfilt_ops[~filt].for_refcnt--;
mtx_unlock(&filterops_lock);
}
/*
* A ref to kq (obtained via kqueue_acquire) must be held. waitok will
* influence whether memory allocation should wait. Make sure it is 0 if you
* hold any mutexes.
*/
static int
kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int waitok)
{
struct filterops *fops;
struct file *fp;
struct knote *kn, *tkn;
int error, filt, event;
int haskqglobal;
fp = NULL;
kn = NULL;
error = 0;
haskqglobal = 0;
filt = kev->filter;
fops = kqueue_fo_find(filt);
if (fops == NULL)
return EINVAL;
tkn = knote_alloc(waitok); /* prevent waiting with locks */
findkn:
if (fops->f_isfd) {
KASSERT(td != NULL, ("td is NULL"));
error = fget(td, kev->ident, CAP_POLL_EVENT, &fp);
if (error)
goto done;
if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops,
kev->ident, 0) != 0) {
/* try again */
fdrop(fp, td);
fp = NULL;
error = kqueue_expand(kq, fops, kev->ident, waitok);
if (error)
goto done;
goto findkn;
}
if (fp->f_type == DTYPE_KQUEUE) {
/*
* if we add some intelligence about what we are doing,
* we should be able to support events on ourselves.
* We need to know when we are doing this to prevent
* getting both the knlist lock and the kq lock since
* they are the same thing.
*/
if (fp->f_data == kq) {
error = EINVAL;
goto done;
}
KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
}
KQ_LOCK(kq);
if (kev->ident < kq->kq_knlistsize) {
SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link)
if (kev->filter == kn->kn_filter)
break;
}
} else {
if ((kev->flags & EV_ADD) == EV_ADD)
kqueue_expand(kq, fops, kev->ident, waitok);
KQ_LOCK(kq);
if (kq->kq_knhashmask != 0) {
struct klist *list;
list = &kq->kq_knhash[
KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
SLIST_FOREACH(kn, list, kn_link)
if (kev->ident == kn->kn_id &&
kev->filter == kn->kn_filter)
break;
}
}
/* knote is in the process of changing, wait for it to stabilize. */
if (kn != NULL && (kn->kn_status & KN_INFLUX) == KN_INFLUX) {
KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
kq->kq_state |= KQ_FLUXWAIT;
msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0);
if (fp != NULL) {
fdrop(fp, td);
fp = NULL;
}
goto findkn;
}
/*
* kn now contains the matching knote, or NULL if no match
*/
if (kn == NULL) {
if (kev->flags & EV_ADD) {
kn = tkn;
tkn = NULL;
if (kn == NULL) {
KQ_UNLOCK(kq);
error = ENOMEM;
goto done;
}
kn->kn_fp = fp;
kn->kn_kq = kq;
kn->kn_fop = fops;
/*
* apply reference counts to knote structure, and
* do not release it at the end of this routine.
*/
fops = NULL;
fp = NULL;
kn->kn_sfflags = kev->fflags;
kn->kn_sdata = kev->data;
kev->fflags = 0;
kev->data = 0;
kn->kn_kevent = *kev;
kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE |
EV_ENABLE | EV_DISABLE);
kn->kn_status = KN_INFLUX|KN_DETACHED;
error = knote_attach(kn, kq);
KQ_UNLOCK(kq);
if (error != 0) {
tkn = kn;
goto done;
}
if ((error = kn->kn_fop->f_attach(kn)) != 0) {
knote_drop(kn, td);
goto done;
}
KN_LIST_LOCK(kn);
goto done_ev_add;
} else {
/* No matching knote and the EV_ADD flag is not set. */
KQ_UNLOCK(kq);
error = ENOENT;
goto done;
}
}
if (kev->flags & EV_DELETE) {
kn->kn_status |= KN_INFLUX;
KQ_UNLOCK(kq);
if (!(kn->kn_status & KN_DETACHED))
kn->kn_fop->f_detach(kn);
knote_drop(kn, td);
goto done;
}
/*
* The user may change some filter values after the initial EV_ADD,
* but doing so will not reset any filter which has already been
* triggered.
*/
kn->kn_status |= KN_INFLUX;
KQ_UNLOCK(kq);
KN_LIST_LOCK(kn);
kn->kn_kevent.udata = kev->udata;
if (!fops->f_isfd && fops->f_touch != NULL) {
fops->f_touch(kn, kev, EVENT_REGISTER);
} else {
kn->kn_sfflags = kev->fflags;
kn->kn_sdata = kev->data;
}
/*
* We can get here with kn->kn_knlist == NULL. This can happen when
* the initial attach event decides that the event is "completed"
* already. i.e. filt_procattach is called on a zombie process. It
* will call filt_proc which will remove it from the list, and NULL
* kn_knlist.
*/
done_ev_add:
event = kn->kn_fop->f_event(kn, 0);
KQ_LOCK(kq);
if (event)
KNOTE_ACTIVATE(kn, 1);
kn->kn_status &= ~KN_INFLUX;
KN_LIST_UNLOCK(kn);
if ((kev->flags & EV_DISABLE) &&
((kn->kn_status & KN_DISABLED) == 0)) {
kn->kn_status |= KN_DISABLED;
}
if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
kn->kn_status &= ~KN_DISABLED;
if ((kn->kn_status & KN_ACTIVE) &&
((kn->kn_status & KN_QUEUED) == 0))
knote_enqueue(kn);
}
KQ_UNLOCK_FLUX(kq);
done:
KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
if (fp != NULL)
fdrop(fp, td);
if (tkn != NULL)
knote_free(tkn);
if (fops != NULL)
kqueue_fo_release(filt);
return (error);
}
static int
kqueue_acquire(struct file *fp, struct kqueue **kqp)
{
int error;
struct kqueue *kq;
error = 0;
kq = fp->f_data;
if (fp->f_type != DTYPE_KQUEUE || kq == NULL)
return (EBADF);
*kqp = kq;
KQ_LOCK(kq);
if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) {
KQ_UNLOCK(kq);
return (EBADF);
}
kq->kq_refcnt++;
KQ_UNLOCK(kq);
return error;
}
static void
kqueue_release(struct kqueue *kq, int locked)
{
if (locked)
KQ_OWNED(kq);
else
KQ_LOCK(kq);
kq->kq_refcnt--;
if (kq->kq_refcnt == 1)
wakeup(&kq->kq_refcnt);
if (!locked)
KQ_UNLOCK(kq);
}
static void
kqueue_schedtask(struct kqueue *kq)
{
KQ_OWNED(kq);
KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN),
("scheduling kqueue task while draining"));
if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) {
taskqueue_enqueue(taskqueue_kqueue, &kq->kq_task);
kq->kq_state |= KQ_TASKSCHED;
}
}
/*
* Expand the kq to make sure we have storage for fops/ident pair.
*
* Return 0 on success (or no work necessary), return errno on failure.
*
* Not calling hashinit w/ waitok (proper malloc flag) should be safe.
* If kqueue_register is called from a non-fd context, there usually
* should be no locks held.
*/
static int
kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident,
int waitok)
{
struct klist *list, *tmp_knhash, *to_free;
u_long tmp_knhashmask;
int size;
int fd;
int mflag = waitok ? M_WAITOK : M_NOWAIT;
KQ_NOTOWNED(kq);
to_free = NULL;
if (fops->f_isfd) {
fd = ident;
if (kq->kq_knlistsize <= fd) {
size = kq->kq_knlistsize;
while (size <= fd)
size += KQEXTENT;
list = malloc(size * sizeof(*list), M_KQUEUE, mflag);
if (list == NULL)
return ENOMEM;
KQ_LOCK(kq);
if (kq->kq_knlistsize > fd) {
to_free = list;
list = NULL;
} else {
if (kq->kq_knlist != NULL) {
bcopy(kq->kq_knlist, list,
kq->kq_knlistsize * sizeof(*list));
to_free = kq->kq_knlist;
kq->kq_knlist = NULL;
}
bzero((caddr_t)list +
kq->kq_knlistsize * sizeof(*list),
(size - kq->kq_knlistsize) * sizeof(*list));
kq->kq_knlistsize = size;
kq->kq_knlist = list;
}
KQ_UNLOCK(kq);
}
} else {
if (kq->kq_knhashmask == 0) {
tmp_knhash = hashinit(KN_HASHSIZE, M_KQUEUE,
&tmp_knhashmask);
if (tmp_knhash == NULL)
return ENOMEM;
KQ_LOCK(kq);
if (kq->kq_knhashmask == 0) {
kq->kq_knhash = tmp_knhash;
kq->kq_knhashmask = tmp_knhashmask;
} else {
to_free = tmp_knhash;
}
KQ_UNLOCK(kq);
}
}
free(to_free, M_KQUEUE);
KQ_NOTOWNED(kq);
return 0;
}
static void
kqueue_task(void *arg, int pending)
{
struct kqueue *kq;
int haskqglobal;
haskqglobal = 0;
kq = arg;
KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
KQ_LOCK(kq);
KNOTE_LOCKED(&kq->kq_sel.si_note, 0);
kq->kq_state &= ~KQ_TASKSCHED;
if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) {
wakeup(&kq->kq_state);
}
KQ_UNLOCK(kq);
KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
}
/*
* Scan, update kn_data (if not ONESHOT), and copyout triggered events.
* We treat KN_MARKER knotes as if they are INFLUX.
*/
static int
kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops,
const struct timespec *tsp, struct kevent *keva, struct thread *td)
{
struct kevent *kevp;
struct timeval atv, rtv, ttv;
struct knote *kn, *marker;
int count, timeout, nkev, error, influx;
int haskqglobal, touch;
count = maxevents;
nkev = 0;
error = 0;
haskqglobal = 0;
if (maxevents == 0)
goto done_nl;
if (tsp != NULL) {
TIMESPEC_TO_TIMEVAL(&atv, tsp);
if (itimerfix(&atv)) {
error = EINVAL;
goto done_nl;
}
if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
timeout = -1;
else
timeout = atv.tv_sec > 24 * 60 * 60 ?
24 * 60 * 60 * hz : tvtohz(&atv);
getmicrouptime(&rtv);
timevaladd(&atv, &rtv);
} else {
atv.tv_sec = 0;
atv.tv_usec = 0;
timeout = 0;
}
marker = knote_alloc(1);
if (marker == NULL) {
error = ENOMEM;
goto done_nl;
}
marker->kn_status = KN_MARKER;
KQ_LOCK(kq);
goto start;
retry:
if (atv.tv_sec || atv.tv_usec) {
getmicrouptime(&rtv);
if (timevalcmp(&rtv, &atv, >=))
goto done;
ttv = atv;
timevalsub(&ttv, &rtv);
timeout = ttv.tv_sec > 24 * 60 * 60 ?
24 * 60 * 60 * hz : tvtohz(&ttv);
}
start:
kevp = keva;
if (kq->kq_count == 0) {
if (timeout < 0) {
error = EWOULDBLOCK;
} else {
kq->kq_state |= KQ_SLEEP;
error = msleep(kq, &kq->kq_lock, PSOCK | PCATCH,
"kqread", timeout);
}
if (error == 0)
goto retry;
/* don't restart after signals... */
if (error == ERESTART)
error = EINTR;
else if (error == EWOULDBLOCK)
error = 0;
goto done;
}
TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
influx = 0;
while (count) {
KQ_OWNED(kq);
kn = TAILQ_FIRST(&kq->kq_head);
if ((kn->kn_status == KN_MARKER && kn != marker) ||
(kn->kn_status & KN_INFLUX) == KN_INFLUX) {
if (influx) {
influx = 0;
KQ_FLUX_WAKEUP(kq);
}
kq->kq_state |= KQ_FLUXWAIT;
error = msleep(kq, &kq->kq_lock, PSOCK,
"kqflxwt", 0);
continue;
}
TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) {
kn->kn_status &= ~KN_QUEUED;
kq->kq_count--;
continue;
}
if (kn == marker) {
KQ_FLUX_WAKEUP(kq);
if (count == maxevents)
goto retry;
goto done;
}
KASSERT((kn->kn_status & KN_INFLUX) == 0,
("KN_INFLUX set when not suppose to be"));
if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) {
kn->kn_status &= ~KN_QUEUED;
kn->kn_status |= KN_INFLUX;
kq->kq_count--;
KQ_UNLOCK(kq);
/*
* We don't need to lock the list since we've marked
* it _INFLUX.
*/
*kevp = kn->kn_kevent;
if (!(kn->kn_status & KN_DETACHED))
kn->kn_fop->f_detach(kn);
knote_drop(kn, td);
KQ_LOCK(kq);
kn = NULL;
} else {
kn->kn_status |= KN_INFLUX;
KQ_UNLOCK(kq);
if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE)
KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
KN_LIST_LOCK(kn);
if (kn->kn_fop->f_event(kn, 0) == 0) {
KQ_LOCK(kq);
KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
kn->kn_status &=
~(KN_QUEUED | KN_ACTIVE | KN_INFLUX);
kq->kq_count--;
KN_LIST_UNLOCK(kn);
influx = 1;
continue;
}
touch = (!kn->kn_fop->f_isfd &&
kn->kn_fop->f_touch != NULL);
if (touch)
kn->kn_fop->f_touch(kn, kevp, EVENT_PROCESS);
else
*kevp = kn->kn_kevent;
KQ_LOCK(kq);
KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) {
/*
* Manually clear knotes that weren't
* 'touch'ed.
*/
if (touch == 0 && kn->kn_flags & EV_CLEAR) {
kn->kn_data = 0;
kn->kn_fflags = 0;
}
if (kn->kn_flags & EV_DISPATCH)
kn->kn_status |= KN_DISABLED;
kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
kq->kq_count--;
} else
TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
kn->kn_status &= ~(KN_INFLUX);
KN_LIST_UNLOCK(kn);
influx = 1;
}
/* we are returning a copy to the user */
kevp++;
nkev++;
count--;
if (nkev == KQ_NEVENTS) {
influx = 0;
KQ_UNLOCK_FLUX(kq);
error = k_ops->k_copyout(k_ops->arg, keva, nkev);
nkev = 0;
kevp = keva;
KQ_LOCK(kq);
if (error)
break;
}
}
TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
done:
KQ_OWNED(kq);
KQ_UNLOCK_FLUX(kq);
knote_free(marker);
done_nl:
KQ_NOTOWNED(kq);
if (nkev != 0)
error = k_ops->k_copyout(k_ops->arg, keva, nkev);
td->td_retval[0] = maxevents - count;
return (error);
}
/*
* XXX
* This could be expanded to call kqueue_scan, if desired.
*/
/*ARGSUSED*/
static int
kqueue_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
int flags, struct thread *td)
{
return (ENXIO);
}
/*ARGSUSED*/
static int
kqueue_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
int flags, struct thread *td)
{
return (ENXIO);
}
/*ARGSUSED*/
static int
kqueue_truncate(struct file *fp, off_t length, struct ucred *active_cred,
struct thread *td)
{
return (EINVAL);
}
/*ARGSUSED*/
static int
kqueue_ioctl(struct file *fp, u_long cmd, void *data,
struct ucred *active_cred, struct thread *td)
{
/*
* Enabling sigio causes two major problems:
* 1) infinite recursion:
* Synopsis: kevent is being used to track signals and has FIOASYNC
* set. On receipt of a signal this will cause a kqueue to recurse
* into itself over and over. Sending the sigio causes the kqueue
* to become ready, which in turn posts sigio again, forever.
* Solution: this can be solved by setting a flag in the kqueue that
* we have a SIGIO in progress.
* 2) locking problems:
* Synopsis: Kqueue is a leaf subsystem, but adding signalling puts
* us above the proc and pgrp locks.
* Solution: Post a signal using an async mechanism, being sure to
* record a generation count in the delivery so that we do not deliver
* a signal to the wrong process.
*
* Note, these two mechanisms are somewhat mutually exclusive!
*/
#if 0
struct kqueue *kq;
kq = fp->f_data;
switch (cmd) {
case FIOASYNC:
if (*(int *)data) {
kq->kq_state |= KQ_ASYNC;
} else {
kq->kq_state &= ~KQ_ASYNC;
}
return (0);
case FIOSETOWN:
return (fsetown(*(int *)data, &kq->kq_sigio));
case FIOGETOWN:
*(int *)data = fgetown(&kq->kq_sigio);
return (0);
}
#endif
return (ENOTTY);
}
/*ARGSUSED*/
static int
kqueue_poll(struct file *fp, int events, struct ucred *active_cred,
struct thread *td)
{
struct kqueue *kq;
int revents = 0;
int error;
if ((error = kqueue_acquire(fp, &kq)))
return POLLERR;
KQ_LOCK(kq);
if (events & (POLLIN | POLLRDNORM)) {
if (kq->kq_count) {
revents |= events & (POLLIN | POLLRDNORM);
} else {
selrecord(td, &kq->kq_sel);
if (SEL_WAITING(&kq->kq_sel))
kq->kq_state |= KQ_SEL;
}
}
kqueue_release(kq, 1);
KQ_UNLOCK(kq);
return (revents);
}
/*ARGSUSED*/
static int
kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
struct thread *td)
{
bzero((void *)st, sizeof *st);
/*
* We no longer return kq_count because the unlocked value is useless.
* If you spent all this time getting the count, why not spend your
* syscall better by calling kevent?
*
* XXX - This is needed for libc_r.
*/
st->st_mode = S_IFIFO;
return (0);
}
/*ARGSUSED*/
static int
kqueue_close(struct file *fp, struct thread *td)
{
struct kqueue *kq = fp->f_data;
struct filedesc *fdp;
struct knote *kn;
int i;
int error;
if ((error = kqueue_acquire(fp, &kq)))
return error;
KQ_LOCK(kq);
KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING,
("kqueue already closing"));
kq->kq_state |= KQ_CLOSING;
if (kq->kq_refcnt > 1)
msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0);
KASSERT(kq->kq_refcnt == 1, ("other refs are out there!"));
fdp = kq->kq_fdp;
KASSERT(knlist_empty(&kq->kq_sel.si_note),
("kqueue's knlist not empty"));
for (i = 0; i < kq->kq_knlistsize; i++) {
while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) {
if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
kq->kq_state |= KQ_FLUXWAIT;
msleep(kq, &kq->kq_lock, PSOCK, "kqclo1", 0);
continue;
}
kn->kn_status |= KN_INFLUX;
KQ_UNLOCK(kq);
if (!(kn->kn_status & KN_DETACHED))
kn->kn_fop->f_detach(kn);
knote_drop(kn, td);
KQ_LOCK(kq);
}
}
if (kq->kq_knhashmask != 0) {
for (i = 0; i <= kq->kq_knhashmask; i++) {
while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) {
if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
kq->kq_state |= KQ_FLUXWAIT;
msleep(kq, &kq->kq_lock, PSOCK,
"kqclo2", 0);
continue;
}
kn->kn_status |= KN_INFLUX;
KQ_UNLOCK(kq);
if (!(kn->kn_status & KN_DETACHED))
kn->kn_fop->f_detach(kn);
knote_drop(kn, td);
KQ_LOCK(kq);
}
}
}
if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) {
kq->kq_state |= KQ_TASKDRAIN;
msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0);
}
if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
selwakeuppri(&kq->kq_sel, PSOCK);
if (!SEL_WAITING(&kq->kq_sel))
kq->kq_state &= ~KQ_SEL;
}
KQ_UNLOCK(kq);
FILEDESC_XLOCK(fdp);
SLIST_REMOVE(&fdp->fd_kqlist, kq, kqueue, kq_list);
FILEDESC_XUNLOCK(fdp);
seldrain(&kq->kq_sel);
knlist_destroy(&kq->kq_sel.si_note);
mtx_destroy(&kq->kq_lock);
kq->kq_fdp = NULL;
if (kq->kq_knhash != NULL)
free(kq->kq_knhash, M_KQUEUE);
if (kq->kq_knlist != NULL)
free(kq->kq_knlist, M_KQUEUE);
funsetown(&kq->kq_sigio);
free(kq, M_KQUEUE);
fp->f_data = NULL;
return (0);
}
static void
kqueue_wakeup(struct kqueue *kq)
{
KQ_OWNED(kq);
if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) {
kq->kq_state &= ~KQ_SLEEP;
wakeup(kq);
}
if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
selwakeuppri(&kq->kq_sel, PSOCK);
if (!SEL_WAITING(&kq->kq_sel))
kq->kq_state &= ~KQ_SEL;
}
if (!knlist_empty(&kq->kq_sel.si_note))
kqueue_schedtask(kq);
if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) {
pgsigio(&kq->kq_sigio, SIGIO, 0);
}
}
/*
* Walk down a list of knotes, activating them if their event has triggered.
*
* There is a possibility to optimize in the case of one kq watching another.
* Instead of scheduling a task to wake it up, you could pass enough state
* down the chain to make up the parent kqueue. Make this code functional
* first.
*/
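/*
 * Illustrative producer-side sketch (the softc, its selinfo member and the
 * mutex are hypothetical driver state):
 *
 *	mtx_lock(&sc->sc_mtx);
 *	... new data becomes available ...
 *	KNOTE_LOCKED(&sc->sc_rsel.si_note, 0);
 *	mtx_unlock(&sc->sc_mtx);
 *
 * KNOTE_LOCKED() and KNOTE_UNLOCKED() are thin wrappers that call knote()
 * with KNF_LISTLOCKED or 0, respectively.
 */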
void
knote(struct knlist *list, long hint, int lockflags)
{
struct kqueue *kq;
struct knote *kn;
int error;
if (list == NULL)
return;
KNL_ASSERT_LOCK(list, lockflags & KNF_LISTLOCKED);
if ((lockflags & KNF_LISTLOCKED) == 0)
list->kl_lock(list->kl_lockarg);
/*
* If we unlock the list lock (and set KN_INFLUX), we can eliminate
* the kqueue scheduling, but this will introduce four
* lock/unlock's for each knote to test. If we do, continue to use
* SLIST_FOREACH; SLIST_FOREACH_SAFE is not safe in our case, as it is
* only safe if you want to remove the current item, which we are
* not doing.
*/
SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
kq = kn->kn_kq;
if ((kn->kn_status & KN_INFLUX) != KN_INFLUX) {
KQ_LOCK(kq);
if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
KQ_UNLOCK(kq);
} else if ((lockflags & KNF_NOKQLOCK) != 0) {
kn->kn_status |= KN_INFLUX;
KQ_UNLOCK(kq);
error = kn->kn_fop->f_event(kn, hint);
KQ_LOCK(kq);
kn->kn_status &= ~KN_INFLUX;
if (error)
KNOTE_ACTIVATE(kn, 1);
KQ_UNLOCK_FLUX(kq);
} else {
kn->kn_status |= KN_HASKQLOCK;
if (kn->kn_fop->f_event(kn, hint))
KNOTE_ACTIVATE(kn, 1);
kn->kn_status &= ~KN_HASKQLOCK;
KQ_UNLOCK(kq);
}
}
kq = NULL;
}
if ((lockflags & KNF_LISTLOCKED) == 0)
list->kl_unlock(list->kl_lockarg);
}
/*
* add a knote to a knlist
*/
void
knlist_add(struct knlist *knl, struct knote *kn, int islocked)
{
KNL_ASSERT_LOCK(knl, islocked);
KQ_NOTOWNED(kn->kn_kq);
KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) ==
(KN_INFLUX|KN_DETACHED), ("knote not KN_INFLUX and KN_DETACHED"));
if (!islocked)
knl->kl_lock(knl->kl_lockarg);
SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext);
if (!islocked)
knl->kl_unlock(knl->kl_lockarg);
KQ_LOCK(kn->kn_kq);
kn->kn_knlist = knl;
kn->kn_status &= ~KN_DETACHED;
KQ_UNLOCK(kn->kn_kq);
}
static void
knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked, int kqislocked)
{
KASSERT(!(!!kqislocked && !knlislocked), ("kq locked w/o knl locked"));
KNL_ASSERT_LOCK(knl, knlislocked);
mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED);
if (!kqislocked)
KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) == KN_INFLUX,
("knlist_remove called w/o knote being KN_INFLUX or already removed"));
if (!knlislocked)
knl->kl_lock(knl->kl_lockarg);
SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext);
kn->kn_knlist = NULL;
if (!knlislocked)
knl->kl_unlock(knl->kl_lockarg);
if (!kqislocked)
KQ_LOCK(kn->kn_kq);
kn->kn_status |= KN_DETACHED;
if (!kqislocked)
KQ_UNLOCK(kn->kn_kq);
}
/*
* remove all knotes from a specified klist
*/
void
knlist_remove(struct knlist *knl, struct knote *kn, int islocked)
{
knlist_remove_kq(knl, kn, islocked, 0);
}
/*
* remove knote from a specified klist while in f_event handler.
*/
void
knlist_remove_inevent(struct knlist *knl, struct knote *kn)
{
knlist_remove_kq(knl, kn, 1,
(kn->kn_status & KN_HASKQLOCK) == KN_HASKQLOCK);
}
int
knlist_empty(struct knlist *knl)
{
KNL_ASSERT_LOCKED(knl);
return SLIST_EMPTY(&knl->kl_list);
}
static struct mtx knlist_lock;
MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects",
MTX_DEF);
static void knlist_mtx_lock(void *arg);
static void knlist_mtx_unlock(void *arg);
static void
knlist_mtx_lock(void *arg)
{
mtx_lock((struct mtx *)arg);
}
static void
knlist_mtx_unlock(void *arg)
{
mtx_unlock((struct mtx *)arg);
}
static void
knlist_mtx_assert_locked(void *arg)
{
mtx_assert((struct mtx *)arg, MA_OWNED);
}
static void
knlist_mtx_assert_unlocked(void *arg)
{
mtx_assert((struct mtx *)arg, MA_NOTOWNED);
}
void
knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *),
void (*kl_unlock)(void *),
void (*kl_assert_locked)(void *), void (*kl_assert_unlocked)(void *))
{
if (lock == NULL)
knl->kl_lockarg = &knlist_lock;
else
knl->kl_lockarg = lock;
if (kl_lock == NULL)
knl->kl_lock = knlist_mtx_lock;
else
knl->kl_lock = kl_lock;
if (kl_unlock == NULL)
knl->kl_unlock = knlist_mtx_unlock;
else
knl->kl_unlock = kl_unlock;
if (kl_assert_locked == NULL)
knl->kl_assert_locked = knlist_mtx_assert_locked;
else
knl->kl_assert_locked = kl_assert_locked;
if (kl_assert_unlocked == NULL)
knl->kl_assert_unlocked = knlist_mtx_assert_unlocked;
else
knl->kl_assert_unlocked = kl_assert_unlocked;
SLIST_INIT(&knl->kl_list);
}
void
knlist_init_mtx(struct knlist *knl, struct mtx *lock)
{
knlist_init(knl, lock, NULL, NULL, NULL, NULL);
}
void
knlist_destroy(struct knlist *knl)
{
#ifdef INVARIANTS
/*
* if we run across this error, we need to find the offending
* driver and have it call knlist_clear.
*/
if (!SLIST_EMPTY(&knl->kl_list))
printf("WARNING: destroying knlist w/ knotes on it!\n");
#endif
knl->kl_lockarg = knl->kl_lock = knl->kl_unlock = NULL;
SLIST_INIT(&knl->kl_list);
}
/*
* Even if we are locked, we may need to drop the lock to allow any influx
* knotes time to "settle".
*/
void
knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn)
{
struct knote *kn, *kn2;
struct kqueue *kq;
if (islocked)
KNL_ASSERT_LOCKED(knl);
else {
KNL_ASSERT_UNLOCKED(knl);
again: /* need to reacquire lock since we have dropped it */
knl->kl_lock(knl->kl_lockarg);
}
SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) {
kq = kn->kn_kq;
KQ_LOCK(kq);
if ((kn->kn_status & KN_INFLUX)) {
KQ_UNLOCK(kq);
continue;
}
knlist_remove_kq(knl, kn, 1, 1);
if (killkn) {
kn->kn_status |= KN_INFLUX | KN_DETACHED;
KQ_UNLOCK(kq);
knote_drop(kn, td);
} else {
/* Make sure cleared knotes disappear soon */
kn->kn_flags |= (EV_EOF | EV_ONESHOT);
KQ_UNLOCK(kq);
}
kq = NULL;
}
if (!SLIST_EMPTY(&knl->kl_list)) {
/* there are still KN_INFLUX remaining */
kn = SLIST_FIRST(&knl->kl_list);
kq = kn->kn_kq;
KQ_LOCK(kq);
KASSERT(kn->kn_status & KN_INFLUX,
("knote removed w/o list lock"));
knl->kl_unlock(knl->kl_lockarg);
kq->kq_state |= KQ_FLUXWAIT;
msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0);
kq = NULL;
goto again;
}
if (islocked)
KNL_ASSERT_LOCKED(knl);
else {
knl->kl_unlock(knl->kl_lockarg);
KNL_ASSERT_UNLOCKED(knl);
}
}
/*
* Remove all knotes referencing a specified fd. This must be called with
* the FILEDESC lock held, which prevents a race where a new fd comes along
* and occupies the entry and we attach a knote to the fd.
*/
void
knote_fdclose(struct thread *td, int fd)
{
struct filedesc *fdp = td->td_proc->p_fd;
struct kqueue *kq;
struct knote *kn;
int influx;
FILEDESC_XLOCK_ASSERT(fdp);
/*
* We shouldn't have to worry about new kevents appearing on fd
* since filedesc is locked.
*/
SLIST_FOREACH(kq, &fdp->fd_kqlist, kq_list) {
KQ_LOCK(kq);
again:
influx = 0;
while (kq->kq_knlistsize > fd &&
(kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) {
if (kn->kn_status & KN_INFLUX) {
/* someone else might be waiting on our knote */
if (influx)
wakeup(kq);
kq->kq_state |= KQ_FLUXWAIT;
msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0);
goto again;
}
kn->kn_status |= KN_INFLUX;
KQ_UNLOCK(kq);
if (!(kn->kn_status & KN_DETACHED))
kn->kn_fop->f_detach(kn);
knote_drop(kn, td);
influx = 1;
KQ_LOCK(kq);
}
KQ_UNLOCK_FLUX(kq);
}
}
static int
knote_attach(struct knote *kn, struct kqueue *kq)
{
struct klist *list;
KASSERT(kn->kn_status & KN_INFLUX, ("knote not marked INFLUX"));
KQ_OWNED(kq);
if (kn->kn_fop->f_isfd) {
if (kn->kn_id >= kq->kq_knlistsize)
return ENOMEM;
list = &kq->kq_knlist[kn->kn_id];
} else {
if (kq->kq_knhash == NULL)
return ENOMEM;
list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
}
SLIST_INSERT_HEAD(list, kn, kn_link);
return 0;
}
/*
* knote must already have been detached using the f_detach method.
* no lock needs to be held; it is assumed that the KN_INFLUX flag is set
* to prevent other removal.
*/
static void
knote_drop(struct knote *kn, struct thread *td)
{
struct kqueue *kq;
struct klist *list;
kq = kn->kn_kq;
KQ_NOTOWNED(kq);
KASSERT((kn->kn_status & KN_INFLUX) == KN_INFLUX,
("knote_drop called without KN_INFLUX set in kn_status"));
KQ_LOCK(kq);
if (kn->kn_fop->f_isfd)
list = &kq->kq_knlist[kn->kn_id];
else
list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
if (!SLIST_EMPTY(list))
SLIST_REMOVE(list, kn, knote, kn_link);
if (kn->kn_status & KN_QUEUED)
knote_dequeue(kn);
KQ_UNLOCK_FLUX(kq);
if (kn->kn_fop->f_isfd) {
fdrop(kn->kn_fp, td);
kn->kn_fp = NULL;
}
kqueue_fo_release(kn->kn_kevent.filter);
kn->kn_fop = NULL;
knote_free(kn);
}
static void
knote_enqueue(struct knote *kn)
{
struct kqueue *kq = kn->kn_kq;
KQ_OWNED(kn->kn_kq);
KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));
TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
kn->kn_status |= KN_QUEUED;
kq->kq_count++;
kqueue_wakeup(kq);
}
static void
knote_dequeue(struct knote *kn)
{
struct kqueue *kq = kn->kn_kq;
KQ_OWNED(kn->kn_kq);
KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));
TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
kn->kn_status &= ~KN_QUEUED;
kq->kq_count--;
}
static void
knote_init(void)
{
knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, 0);
}
SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);
static struct knote *
knote_alloc(int waitok)
{
return ((struct knote *)uma_zalloc(knote_zone,
(waitok ? M_WAITOK : M_NOWAIT)|M_ZERO));
}
static void
knote_free(struct knote *kn)
{
if (kn != NULL)
uma_zfree(knote_zone, kn);
}
/*
* Register the kev w/ the kq specified by fd.
*/
int
kqfd_register(int fd, struct kevent *kev, struct thread *td, int waitok)
{
struct kqueue *kq;
struct file *fp;
int error;
if ((error = fget(td, fd, CAP_POST_EVENT, &fp)) != 0)
return (error);
if ((error = kqueue_acquire(fp, &kq)) != 0)
goto noacquire;
error = kqueue_register(kq, kev, td, waitok);
kqueue_release(kq, 0);
noacquire:
fdrop(fp, td);
return error;
}
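
For context (illustrative only, not part of this diff): kqfd_register() above is the in-kernel counterpart of what userland reaches through kqueue(2) and kevent(2). A minimal consumer that waits for a descriptor to become readable might look roughly like this sketch:

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
    struct kevent change, event;
    int kq;

    if ((kq = kqueue()) == -1)
        return (1);
    /* EV_ADD installs a knote for stdin on the read filter. */
    EV_SET(&change, STDIN_FILENO, EVFILT_READ, EV_ADD | EV_ENABLE,
        0, 0, NULL);
    if (kevent(kq, &change, 1, NULL, 0, NULL) == -1)
        return (1);
    /* Block until the filter fires, then report the pending byte count. */
    if (kevent(kq, NULL, 0, &event, 1, NULL) == 1)
        printf("fd %d readable, %jd bytes pending\n",
            (int)event.ident, (intmax_t)event.data);
    close(kq);
    return (0);
}
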
Index: head/sys/kern/kern_exec.c
===================================================================
--- head/sys/kern/kern_exec.c (revision 225616)
+++ head/sys/kern/kern_exec.c (revision 225617)
@@ -1,1577 +1,1577 @@
/*-
* Copyright (c) 1993, David Greenman
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_capsicum.h"
#include "opt_hwpmc_hooks.h"
#include "opt_kdtrace.h"
#include "opt_ktrace.h"
#include "opt_vm.h"
#include <sys/param.h>
#include <sys/capability.h>
#include <sys/systm.h>
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/signalvar.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/filedesc.h>
#include <sys/fcntl.h>
#include <sys/acct.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/imgact_elf.h>
#include <sys/wait.h>
#include <sys/malloc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/pioctl.h>
#include <sys/namei.h>
#include <sys/resourcevar.h>
#include <sys/sdt.h>
#include <sys/sf_buf.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/shm.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif
#include <machine/reg.h>
#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>
#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
dtrace_execexit_func_t dtrace_fasttrap_exec;
#endif
SDT_PROVIDER_DECLARE(proc);
SDT_PROBE_DEFINE(proc, kernel, , exec, exec);
SDT_PROBE_ARGTYPE(proc, kernel, , exec, 0, "char *");
SDT_PROBE_DEFINE(proc, kernel, , exec_failure, exec-failure);
SDT_PROBE_ARGTYPE(proc, kernel, , exec_failure, 0, "int");
SDT_PROBE_DEFINE(proc, kernel, , exec_success, exec-success);
SDT_PROBE_ARGTYPE(proc, kernel, , exec_success, 0, "char *");
MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments");
static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS);
static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS);
static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS);
static int do_execve(struct thread *td, struct image_args *args,
struct mac *mac_p);
/* XXX This should be vm_size_t. */
SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD,
NULL, 0, sysctl_kern_ps_strings, "LU", "");
/* XXX This should be vm_size_t. */
SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD|
CTLFLAG_CAPRD, NULL, 0, sysctl_kern_usrstack, "LU", "");
SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD,
NULL, 0, sysctl_kern_stackprot, "I", "");
u_long ps_arg_cache_limit = PAGE_SIZE / 16;
SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW,
&ps_arg_cache_limit, 0, "");
static int map_at_zero = 0;
TUNABLE_INT("security.bsd.map_at_zero", &map_at_zero);
SYSCTL_INT(_security_bsd, OID_AUTO, map_at_zero, CTLFLAG_RW, &map_at_zero, 0,
"Permit processes to map an object at virtual address 0.");
static int
sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS)
{
struct proc *p;
int error;
p = curproc;
#ifdef SCTL_MASK32
if (req->flags & SCTL_MASK32) {
unsigned int val;
val = (unsigned int)p->p_sysent->sv_psstrings;
error = SYSCTL_OUT(req, &val, sizeof(val));
} else
#endif
error = SYSCTL_OUT(req, &p->p_sysent->sv_psstrings,
sizeof(p->p_sysent->sv_psstrings));
return error;
}
static int
sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS)
{
struct proc *p;
int error;
p = curproc;
#ifdef SCTL_MASK32
if (req->flags & SCTL_MASK32) {
unsigned int val;
val = (unsigned int)p->p_sysent->sv_usrstack;
error = SYSCTL_OUT(req, &val, sizeof(val));
} else
#endif
error = SYSCTL_OUT(req, &p->p_sysent->sv_usrstack,
sizeof(p->p_sysent->sv_usrstack));
return error;
}
static int
sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS)
{
struct proc *p;
p = curproc;
return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot,
sizeof(p->p_sysent->sv_stackprot)));
}
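
These three handlers just export per-ABI constants from the current process's sysentvec. Purely for illustration (not from the diff), a userland reader of kern.ps_strings and kern.usrstack could be:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
    unsigned long psstrings, usrstack;
    size_t len;
    int mib[2];

    mib[0] = CTL_KERN;
    mib[1] = KERN_PS_STRINGS;
    len = sizeof(psstrings);
    if (sysctl(mib, 2, &psstrings, &len, NULL, 0) == -1)
        return (1);
    mib[1] = KERN_USRSTACK;
    len = sizeof(usrstack);
    if (sysctl(mib, 2, &usrstack, &len, NULL, 0) == -1)
        return (1);
    printf("ps_strings at %#lx, user stack top at %#lx\n",
        psstrings, usrstack);
    return (0);
}
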
/*
* Each of the items is a pointer to a `const struct execsw', hence the
* double pointer here.
*/
static const struct execsw **execsw;
#ifndef _SYS_SYSPROTO_H_
struct execve_args {
char *fname;
char **argv;
char **envv;
};
#endif
int
-execve(td, uap)
+sys_execve(td, uap)
struct thread *td;
struct execve_args /* {
char *fname;
char **argv;
char **envv;
} */ *uap;
{
int error;
struct image_args args;
error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
uap->argv, uap->envv);
if (error == 0)
error = kern_execve(td, &args, NULL);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct fexecve_args {
int fd;
char **argv;
char **envv;
};
#endif
int
-fexecve(struct thread *td, struct fexecve_args *uap)
+sys_fexecve(struct thread *td, struct fexecve_args *uap)
{
int error;
struct image_args args;
error = exec_copyin_args(&args, NULL, UIO_SYSSPACE,
uap->argv, uap->envv);
if (error == 0) {
args.fd = uap->fd;
error = kern_execve(td, &args, NULL);
}
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct __mac_execve_args {
char *fname;
char **argv;
char **envv;
struct mac *mac_p;
};
#endif
int
-__mac_execve(td, uap)
+sys___mac_execve(td, uap)
struct thread *td;
struct __mac_execve_args /* {
char *fname;
char **argv;
char **envv;
struct mac *mac_p;
} */ *uap;
{
#ifdef MAC
int error;
struct image_args args;
error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
uap->argv, uap->envv);
if (error == 0)
error = kern_execve(td, &args, uap->mac_p);
return (error);
#else
return (ENOSYS);
#endif
}
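
The three wrappers above differ only in how the image and credentials are named; all of the work happens in kern_execve(). From userland the corresponding entry points are execve(2) and fexecve(2); a rough, illustrative sketch (not part of the change):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
    char *argv[] = { "ls", "-l", "/", NULL };
    char *envp[] = { "PATH=/bin:/usr/bin", NULL };
    int fd;

    /* Descriptor-based exec: the kernel path fills args->fd, not fname. */
    if ((fd = open("/bin/ls", O_RDONLY)) != -1) {
        fexecve(fd, argv, envp);
        perror("fexecve");      /* reached only on failure */
        close(fd);
    }
    /* Path-based exec, the common case resolved via namei() below. */
    execve("/bin/ls", argv, envp);
    perror("execve");
    return (1);
}
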
/*
* XXX: kern_execve has the astonishing property of not always returning to
* the caller. If sufficiently bad things happen during the call to
* do_execve(), it can end up calling exit1(); as a result, callers must
* avoid doing anything which they might need to undo (e.g., allocating
* memory).
*/
int
kern_execve(td, args, mac_p)
struct thread *td;
struct image_args *args;
struct mac *mac_p;
{
struct proc *p = td->td_proc;
int error;
AUDIT_ARG_ARGV(args->begin_argv, args->argc,
args->begin_envv - args->begin_argv);
AUDIT_ARG_ENVV(args->begin_envv, args->envc,
args->endp - args->begin_envv);
if (p->p_flag & P_HADTHREADS) {
PROC_LOCK(p);
if (thread_single(SINGLE_BOUNDARY)) {
PROC_UNLOCK(p);
exec_free_args(args);
return (ERESTART); /* Try again later. */
}
PROC_UNLOCK(p);
}
error = do_execve(td, args, mac_p);
if (p->p_flag & P_HADTHREADS) {
PROC_LOCK(p);
/*
* If success, we upgrade to SINGLE_EXIT state to
* force other threads to suicide.
*/
if (error == 0)
thread_single(SINGLE_EXIT);
else
thread_single_end();
PROC_UNLOCK(p);
}
return (error);
}
/*
* In-kernel implementation of execve(). All arguments are assumed to be
* userspace pointers from the passed thread.
*/
static int
do_execve(td, args, mac_p)
struct thread *td;
struct image_args *args;
struct mac *mac_p;
{
struct proc *p = td->td_proc;
struct nameidata nd;
struct ucred *newcred = NULL, *oldcred;
struct uidinfo *euip;
register_t *stack_base;
int error, i;
struct image_params image_params, *imgp;
struct vattr attr;
int (*img_first)(struct image_params *);
struct pargs *oldargs = NULL, *newargs = NULL;
struct sigacts *oldsigacts, *newsigacts;
#ifdef KTRACE
struct vnode *tracevp = NULL;
struct ucred *tracecred = NULL;
#endif
struct vnode *textvp = NULL, *binvp = NULL;
int credential_changing;
int vfslocked;
int textset;
#ifdef MAC
struct label *interpvplabel = NULL;
int will_transition;
#endif
#ifdef HWPMC_HOOKS
struct pmckern_procexec pe;
#endif
static const char fexecv_proc_title[] = "(fexecv)";
vfslocked = 0;
imgp = &image_params;
/*
* Lock the process and set the P_INEXEC flag to indicate that
* it should be left alone until we're done here. This is
* necessary to avoid race conditions - e.g. in ptrace() -
* that might allow a local user to illicitly obtain elevated
* privileges.
*/
PROC_LOCK(p);
KASSERT((p->p_flag & P_INEXEC) == 0,
("%s(): process already has P_INEXEC flag", __func__));
p->p_flag |= P_INEXEC;
PROC_UNLOCK(p);
/*
* Initialize part of the common data
*/
imgp->proc = p;
imgp->execlabel = NULL;
imgp->attr = &attr;
imgp->entry_addr = 0;
imgp->reloc_base = 0;
imgp->vmspace_destroyed = 0;
imgp->interpreted = 0;
imgp->opened = 0;
imgp->interpreter_name = NULL;
imgp->auxargs = NULL;
imgp->vp = NULL;
imgp->object = NULL;
imgp->firstpage = NULL;
imgp->ps_strings = 0;
imgp->auxarg_size = 0;
imgp->args = args;
imgp->execpath = imgp->freepath = NULL;
imgp->execpathp = 0;
imgp->canary = 0;
imgp->canarylen = 0;
imgp->pagesizes = 0;
imgp->pagesizeslen = 0;
imgp->stack_prot = 0;
#ifdef MAC
error = mac_execve_enter(imgp, mac_p);
if (error)
goto exec_fail;
#endif
imgp->image_header = NULL;
/*
* Translate the file name. namei() returns a vnode pointer
* in ni_vp among other things.
*
* XXXAUDIT: It would be desirable to also audit the name of the
* interpreter if this is an interpreted binary.
*/
if (args->fname != NULL) {
NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME
| MPSAFE | AUDITVNODE1, UIO_SYSSPACE, args->fname, td);
}
SDT_PROBE(proc, kernel, , exec, args->fname, 0, 0, 0, 0 );
interpret:
if (args->fname != NULL) {
#ifdef CAPABILITY_MODE
/*
* While capability mode can't reach this point via direct
* path arguments to execve(), we also don't allow
* interpreters to be used in capability mode (for now).
* Catch indirect lookups and return a permissions error.
*/
if (IN_CAPABILITY_MODE(td)) {
error = ECAPMODE;
goto exec_fail;
}
#endif
error = namei(&nd);
if (error)
goto exec_fail;
vfslocked = NDHASGIANT(&nd);
binvp = nd.ni_vp;
imgp->vp = binvp;
} else {
AUDIT_ARG_FD(args->fd);
/*
* Some might argue that CAP_READ and/or CAP_MMAP should also
* be required here; such arguments will be entertained.
*/
error = fgetvp_read(td, args->fd, CAP_FEXECVE, &binvp);
if (error)
goto exec_fail;
vfslocked = VFS_LOCK_GIANT(binvp->v_mount);
vn_lock(binvp, LK_EXCLUSIVE | LK_RETRY);
AUDIT_ARG_VNODE1(binvp);
imgp->vp = binvp;
}
/*
* Check file permissions (also 'opens' file)
*/
error = exec_check_permissions(imgp);
if (error)
goto exec_fail_dealloc;
imgp->object = imgp->vp->v_object;
if (imgp->object != NULL)
vm_object_reference(imgp->object);
/*
* Set VV_TEXT now so no one can write to the executable while we're
* activating it.
*
* Remember if this was set before and unset it in case this is not
* actually an executable image.
*/
textset = imgp->vp->v_vflag & VV_TEXT;
imgp->vp->v_vflag |= VV_TEXT;
error = exec_map_first_page(imgp);
if (error)
goto exec_fail_dealloc;
imgp->proc->p_osrel = 0;
/*
* If the current process has a special image activator it
* wants to try first, call it. For example, emulating shell
* scripts differently.
*/
error = -1;
if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL)
error = img_first(imgp);
/*
* Loop through the list of image activators, calling each one.
* An activator returns -1 if there is no match, 0 on success,
* and an error otherwise.
*/
for (i = 0; error == -1 && execsw[i]; ++i) {
if (execsw[i]->ex_imgact == NULL ||
execsw[i]->ex_imgact == img_first) {
continue;
}
error = (*execsw[i]->ex_imgact)(imgp);
}
if (error) {
if (error == -1) {
if (textset == 0)
imgp->vp->v_vflag &= ~VV_TEXT;
error = ENOEXEC;
}
goto exec_fail_dealloc;
}
/*
* Special interpreter operation: clean up and loop back to try to
* activate the interpreter.
*/
if (imgp->interpreted) {
exec_unmap_first_page(imgp);
/*
* VV_TEXT needs to be unset for scripts. There is a short
* period before we determine that something is a script where
* VV_TEXT will be set. The vnode lock is held over this
* entire period so nothing should illegitimately be blocked.
*/
imgp->vp->v_vflag &= ~VV_TEXT;
/* free name buffer and old vnode */
if (args->fname != NULL)
NDFREE(&nd, NDF_ONLY_PNBUF);
#ifdef MAC
mac_execve_interpreter_enter(binvp, &interpvplabel);
#endif
if (imgp->opened) {
VOP_CLOSE(binvp, FREAD, td->td_ucred, td);
imgp->opened = 0;
}
vput(binvp);
vm_object_deallocate(imgp->object);
imgp->object = NULL;
VFS_UNLOCK_GIANT(vfslocked);
vfslocked = 0;
/* set new name to that of the interpreter */
NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME | MPSAFE,
UIO_SYSSPACE, imgp->interpreter_name, td);
args->fname = imgp->interpreter_name;
goto interpret;
}
/*
* NB: We unlock the vnode here because it is believed that none
* of the sv_copyout_strings/sv_fixup operations require the vnode.
*/
VOP_UNLOCK(imgp->vp, 0);
/*
* Do our best to calculate the full path to the image file.
*/
if (imgp->auxargs != NULL &&
((args->fname != NULL && args->fname[0] == '/') ||
vn_fullpath(td, imgp->vp, &imgp->execpath, &imgp->freepath) != 0))
imgp->execpath = args->fname;
/*
* Copy out strings (args and env) and initialize stack base
*/
if (p->p_sysent->sv_copyout_strings)
stack_base = (*p->p_sysent->sv_copyout_strings)(imgp);
else
stack_base = exec_copyout_strings(imgp);
/*
* If a custom stack fixup routine is present for this process,
* let it do the stack setup.
* Otherwise, store the argument count as the first item on the stack.
*/
if (p->p_sysent->sv_fixup != NULL)
(*p->p_sysent->sv_fixup)(&stack_base, imgp);
else
suword(--stack_base, imgp->args->argc);
/*
* For security and other reasons, the file descriptor table cannot
* be shared after an exec.
*/
fdunshare(p, td);
/*
* Malloc things before we need locks.
*/
newcred = crget();
euip = uifind(attr.va_uid);
i = imgp->args->begin_envv - imgp->args->begin_argv;
/* Cache arguments if they fit inside our allowance */
if (ps_arg_cache_limit >= i + sizeof(struct pargs)) {
newargs = pargs_alloc(i);
bcopy(imgp->args->begin_argv, newargs->ar_args, i);
}
/* close files on exec */
fdcloseexec(td);
vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
/* Get a reference to the vnode prior to locking the proc */
VREF(binvp);
/*
* For security and other reasons, signal handlers cannot
* be shared after an exec. The new process gets a copy of the old
* handlers. In execsigs(), the new process will have its signals
* reset.
*/
PROC_LOCK(p);
oldcred = crcopysafe(p, newcred);
if (sigacts_shared(p->p_sigacts)) {
oldsigacts = p->p_sigacts;
PROC_UNLOCK(p);
newsigacts = sigacts_alloc();
sigacts_copy(newsigacts, oldsigacts);
PROC_LOCK(p);
p->p_sigacts = newsigacts;
} else
oldsigacts = NULL;
/* Stop profiling */
stopprofclock(p);
/* reset caught signals */
execsigs(p);
/* name this process - nameiexec(p, ndp) */
bzero(p->p_comm, sizeof(p->p_comm));
if (args->fname)
bcopy(nd.ni_cnd.cn_nameptr, p->p_comm,
min(nd.ni_cnd.cn_namelen, MAXCOMLEN));
else if (vn_commname(binvp, p->p_comm, sizeof(p->p_comm)) != 0)
bcopy(fexecv_proc_title, p->p_comm, sizeof(fexecv_proc_title));
bcopy(p->p_comm, td->td_name, sizeof(td->td_name));
/*
* mark as execed, wakeup the process that vforked (if any) and tell
* it that it now has its own resources back
*/
p->p_flag |= P_EXEC;
if (p->p_pptr && (p->p_flag & P_PPWAIT)) {
p->p_flag &= ~P_PPWAIT;
cv_broadcast(&p->p_pwait);
}
/*
* Implement image setuid/setgid.
*
* Don't honor setuid/setgid if the filesystem prohibits it or if
* the process is being traced.
*
* We disable setuid/setgid/etc in compatibility mode on the basis
* that most setugid applications are not written with that
* environment in mind, and will therefore almost certainly operate
* incorrectly. In principle there's no reason that setugid
* applications might not be useful in capability mode, so we may want
* to reconsider this conservative design choice in the future.
*
* XXXMAC: For the time being, use NOSUID to also prohibit
* transitions on the file system.
*/
credential_changing = 0;
credential_changing |= (attr.va_mode & S_ISUID) && oldcred->cr_uid !=
attr.va_uid;
credential_changing |= (attr.va_mode & S_ISGID) && oldcred->cr_gid !=
attr.va_gid;
#ifdef MAC
will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp,
interpvplabel, imgp);
credential_changing |= will_transition;
#endif
if (credential_changing &&
#ifdef CAPABILITY_MODE
((oldcred->cr_flags & CRED_FLAG_CAPMODE) == 0) &&
#endif
(imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 &&
(p->p_flag & P_TRACED) == 0) {
/*
* Turn off syscall tracing for set-id programs, except for
* root. Record any set-id flags first to make sure that
* we do not regain any tracing during a possible block.
*/
setsugid(p);
#ifdef KTRACE
if (priv_check_cred(oldcred, PRIV_DEBUG_DIFFCRED, 0))
ktrprocexec(p, &tracecred, &tracevp);
#endif
/*
* Close any file descriptors 0..2 that reference procfs,
* then make sure file descriptors 0..2 are in use.
*
* setugidsafety() may call closef() and then pfind()
* which may grab the process lock.
* fdcheckstd() may call falloc() which may block to
* allocate memory, so temporarily drop the process lock.
*/
PROC_UNLOCK(p);
VOP_UNLOCK(imgp->vp, 0);
setugidsafety(td);
error = fdcheckstd(td);
vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
if (error != 0)
goto done1;
PROC_LOCK(p);
/*
* Set the new credentials.
*/
if (attr.va_mode & S_ISUID)
change_euid(newcred, euip);
if (attr.va_mode & S_ISGID)
change_egid(newcred, attr.va_gid);
#ifdef MAC
if (will_transition) {
mac_vnode_execve_transition(oldcred, newcred, imgp->vp,
interpvplabel, imgp);
}
#endif
/*
* Implement correct POSIX saved-id behavior.
*
* XXXMAC: Note that the current logic will save the
* uid and gid if a MAC domain transition occurs, even
* though maybe it shouldn't.
*/
change_svuid(newcred, newcred->cr_uid);
change_svgid(newcred, newcred->cr_gid);
p->p_ucred = newcred;
newcred = NULL;
} else {
if (oldcred->cr_uid == oldcred->cr_ruid &&
oldcred->cr_gid == oldcred->cr_rgid)
p->p_flag &= ~P_SUGID;
/*
* Implement correct POSIX saved-id behavior.
*
* XXX: It's not clear that the existing behavior is
* POSIX-compliant. A number of sources indicate that the
* saved uid/gid should only be updated if the new ruid is
* not equal to the old ruid, or the new euid is not equal
* to the old euid and the new euid is not equal to the old
* ruid. The FreeBSD code always updates the saved uid/gid.
* Also, this code uses the new (replaced) euid and egid as
* the source, which may or may not be the right ones to use.
*/
if (oldcred->cr_svuid != oldcred->cr_uid ||
oldcred->cr_svgid != oldcred->cr_gid) {
change_svuid(newcred, newcred->cr_uid);
change_svgid(newcred, newcred->cr_gid);
p->p_ucred = newcred;
newcred = NULL;
}
}
/*
* Store the vp for use in procfs. This vnode was referenced prior
* to locking the proc lock.
*/
textvp = p->p_textvp;
p->p_textvp = binvp;
#ifdef KDTRACE_HOOKS
/*
* Tell the DTrace fasttrap provider about the exec if it
* has declared an interest.
*/
if (dtrace_fasttrap_exec)
dtrace_fasttrap_exec(p);
#endif
/*
* Notify others that we exec'd, and clear the P_INEXEC flag
* as we're now a bona fide freshly-execed process.
*/
KNOTE_LOCKED(&p->p_klist, NOTE_EXEC);
p->p_flag &= ~P_INEXEC;
/*
* If tracing the process, trap to the debugger so that
* breakpoints can be set before the program executes. We
* have to use tdsignal() to deliver the signal to the current
* thread since any other threads in this process will exit if
* execve() succeeds.
*/
if (p->p_flag & P_TRACED)
tdsignal(td, SIGTRAP);
/* clear "fork but no exec" flag, as we _are_ execing */
p->p_acflag &= ~AFORK;
/*
* Free any previous argument cache and replace it with
* the new argument cache, if any.
*/
oldargs = p->p_args;
p->p_args = newargs;
newargs = NULL;
#ifdef HWPMC_HOOKS
/*
* Check if system-wide sampling is in effect or if the
* current process is using PMCs. If so, do exec() time
* processing. This processing needs to happen AFTER the
* P_INEXEC flag is cleared.
*
* The proc lock needs to be released before taking the PMC
* SX.
*/
if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) {
PROC_UNLOCK(p);
VOP_UNLOCK(imgp->vp, 0);
pe.pm_credentialschanged = credential_changing;
pe.pm_entryaddr = imgp->entry_addr;
PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe);
vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
} else
PROC_UNLOCK(p);
#else /* !HWPMC_HOOKS */
PROC_UNLOCK(p);
#endif
/* Set values passed into the program in registers. */
if (p->p_sysent->sv_setregs)
(*p->p_sysent->sv_setregs)(td, imgp,
(u_long)(uintptr_t)stack_base);
else
exec_setregs(td, imgp, (u_long)(uintptr_t)stack_base);
vfs_mark_atime(imgp->vp, td->td_ucred);
SDT_PROBE(proc, kernel, , exec_success, args->fname, 0, 0, 0, 0);
done1:
/*
* Free any resources malloc'd earlier that we didn't use.
*/
uifree(euip);
if (newcred == NULL)
crfree(oldcred);
else
crfree(newcred);
VOP_UNLOCK(imgp->vp, 0);
/*
* Handle deferred decrement of ref counts.
*/
if (textvp != NULL) {
int tvfslocked;
tvfslocked = VFS_LOCK_GIANT(textvp->v_mount);
vrele(textvp);
VFS_UNLOCK_GIANT(tvfslocked);
}
if (binvp && error != 0)
vrele(binvp);
#ifdef KTRACE
if (tracevp != NULL) {
int tvfslocked;
tvfslocked = VFS_LOCK_GIANT(tracevp->v_mount);
vrele(tracevp);
VFS_UNLOCK_GIANT(tvfslocked);
}
if (tracecred != NULL)
crfree(tracecred);
#endif
vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
pargs_drop(oldargs);
pargs_drop(newargs);
if (oldsigacts != NULL)
sigacts_free(oldsigacts);
exec_fail_dealloc:
/*
* free various allocated resources
*/
if (imgp->firstpage != NULL)
exec_unmap_first_page(imgp);
if (imgp->vp != NULL) {
if (args->fname)
NDFREE(&nd, NDF_ONLY_PNBUF);
if (imgp->opened)
VOP_CLOSE(imgp->vp, FREAD, td->td_ucred, td);
vput(imgp->vp);
}
if (imgp->object != NULL)
vm_object_deallocate(imgp->object);
free(imgp->freepath, M_TEMP);
if (error == 0) {
PROC_LOCK(p);
td->td_dbgflags |= TDB_EXEC;
PROC_UNLOCK(p);
/*
* Stop the process here if its stop event mask has
* the S_EXEC bit set.
*/
STOPEVENT(p, S_EXEC, 0);
goto done2;
}
exec_fail:
/* we're done here, clear P_INEXEC */
PROC_LOCK(p);
p->p_flag &= ~P_INEXEC;
PROC_UNLOCK(p);
SDT_PROBE(proc, kernel, , exec_failure, error, 0, 0, 0, 0);
done2:
#ifdef MAC
mac_execve_exit(imgp);
mac_execve_interpreter_exit(interpvplabel);
#endif
VFS_UNLOCK_GIANT(vfslocked);
exec_free_args(args);
if (error && imgp->vmspace_destroyed) {
/* sorry, no more process anymore. exit gracefully */
exit1(td, W_EXITCODE(0, SIGABRT));
/* NOT REACHED */
}
#ifdef KTRACE
if (error == 0)
ktrprocctor(p);
#endif
return (error);
}
int
exec_map_first_page(imgp)
struct image_params *imgp;
{
int rv, i;
int initial_pagein;
vm_page_t ma[VM_INITIAL_PAGEIN];
vm_object_t object;
if (imgp->firstpage != NULL)
exec_unmap_first_page(imgp);
object = imgp->vp->v_object;
if (object == NULL)
return (EACCES);
VM_OBJECT_LOCK(object);
#if VM_NRESERVLEVEL > 0
if ((object->flags & OBJ_COLORED) == 0) {
object->flags |= OBJ_COLORED;
object->pg_color = 0;
}
#endif
ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
if (ma[0]->valid != VM_PAGE_BITS_ALL) {
initial_pagein = VM_INITIAL_PAGEIN;
if (initial_pagein > object->size)
initial_pagein = object->size;
for (i = 1; i < initial_pagein; i++) {
if ((ma[i] = vm_page_next(ma[i - 1])) != NULL) {
if (ma[i]->valid)
break;
if ((ma[i]->oflags & VPO_BUSY) || ma[i]->busy)
break;
vm_page_busy(ma[i]);
} else {
ma[i] = vm_page_alloc(object, i,
VM_ALLOC_NORMAL | VM_ALLOC_IFNOTCACHED);
if (ma[i] == NULL)
break;
}
}
initial_pagein = i;
rv = vm_pager_get_pages(object, ma, initial_pagein, 0);
ma[0] = vm_page_lookup(object, 0);
if ((rv != VM_PAGER_OK) || (ma[0] == NULL)) {
if (ma[0] != NULL) {
vm_page_lock(ma[0]);
vm_page_free(ma[0]);
vm_page_unlock(ma[0]);
}
VM_OBJECT_UNLOCK(object);
return (EIO);
}
}
vm_page_lock(ma[0]);
vm_page_hold(ma[0]);
vm_page_unlock(ma[0]);
vm_page_wakeup(ma[0]);
VM_OBJECT_UNLOCK(object);
imgp->firstpage = sf_buf_alloc(ma[0], 0);
imgp->image_header = (char *)sf_buf_kva(imgp->firstpage);
return (0);
}
void
exec_unmap_first_page(imgp)
struct image_params *imgp;
{
vm_page_t m;
if (imgp->firstpage != NULL) {
m = sf_buf_page(imgp->firstpage);
sf_buf_free(imgp->firstpage);
imgp->firstpage = NULL;
vm_page_lock(m);
vm_page_unhold(m);
vm_page_unlock(m);
}
}
/*
* Destroy the old address space, and allocate a new stack.
* The new stack is only SGROWSIZ large because it is grown
* automatically in trap.c.
*/
int
exec_new_vmspace(imgp, sv)
struct image_params *imgp;
struct sysentvec *sv;
{
int error;
struct proc *p = imgp->proc;
struct vmspace *vmspace = p->p_vmspace;
vm_object_t obj;
vm_offset_t sv_minuser, stack_addr;
vm_map_t map;
u_long ssiz;
imgp->vmspace_destroyed = 1;
imgp->sysent = sv;
/* May be called with Giant held */
EVENTHANDLER_INVOKE(process_exec, p, imgp);
/*
* Blow away the entire process VM if the address space is not
* shared; otherwise, create a new VM space so that other threads
* are not disrupted.
*/
map = &vmspace->vm_map;
if (map_at_zero)
sv_minuser = sv->sv_minuser;
else
sv_minuser = MAX(sv->sv_minuser, PAGE_SIZE);
if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv_minuser &&
vm_map_max(map) == sv->sv_maxuser) {
shmexit(vmspace);
pmap_remove_pages(vmspace_pmap(vmspace));
vm_map_remove(map, vm_map_min(map), vm_map_max(map));
} else {
error = vmspace_exec(p, sv_minuser, sv->sv_maxuser);
if (error)
return (error);
vmspace = p->p_vmspace;
map = &vmspace->vm_map;
}
/* Map a shared page */
obj = sv->sv_shared_page_obj;
if (obj != NULL) {
vm_object_reference(obj);
error = vm_map_fixed(map, obj, 0,
sv->sv_shared_page_base, sv->sv_shared_page_len,
VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_ALL,
MAP_COPY_ON_WRITE | MAP_ACC_NO_CHARGE);
if (error) {
vm_object_deallocate(obj);
return (error);
}
}
/* Allocate a new stack */
if (sv->sv_maxssiz != NULL)
ssiz = *sv->sv_maxssiz;
else
ssiz = maxssiz;
stack_addr = sv->sv_usrstack - ssiz;
error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz,
obj != NULL && imgp->stack_prot != 0 ? imgp->stack_prot :
sv->sv_stackprot,
VM_PROT_ALL, MAP_STACK_GROWS_DOWN);
if (error)
return (error);
#ifdef __ia64__
/* Allocate a new register stack */
stack_addr = IA64_BACKINGSTORE;
error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz,
sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_UP);
if (error)
return (error);
#endif
/* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the
* VM_STACK case, but they are still used to monitor the size of the
* process stack so we can check the stack rlimit.
*/
vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT;
vmspace->vm_maxsaddr = (char *)sv->sv_usrstack - ssiz;
return (0);
}
/*
* Copy out argument and environment strings from the old process address
* space into the temporary string buffer.
*/
int
exec_copyin_args(struct image_args *args, char *fname,
enum uio_seg segflg, char **argv, char **envv)
{
char *argp, *envp;
int error;
size_t length;
bzero(args, sizeof(*args));
if (argv == NULL)
return (EFAULT);
/*
* Allocate demand-paged memory for the file name, argument, and
* environment strings.
*/
error = exec_alloc_args(args);
if (error != 0)
return (error);
/*
* Copy the file name.
*/
if (fname != NULL) {
args->fname = args->buf;
error = (segflg == UIO_SYSSPACE) ?
copystr(fname, args->fname, PATH_MAX, &length) :
copyinstr(fname, args->fname, PATH_MAX, &length);
if (error != 0)
goto err_exit;
} else
length = 0;
args->begin_argv = args->buf + length;
args->endp = args->begin_argv;
args->stringspace = ARG_MAX;
/*
* extract arguments first
*/
while ((argp = (caddr_t) (intptr_t) fuword(argv++))) {
if (argp == (caddr_t) -1) {
error = EFAULT;
goto err_exit;
}
if ((error = copyinstr(argp, args->endp,
args->stringspace, &length))) {
if (error == ENAMETOOLONG)
error = E2BIG;
goto err_exit;
}
args->stringspace -= length;
args->endp += length;
args->argc++;
}
args->begin_envv = args->endp;
/*
* extract environment strings
*/
if (envv) {
while ((envp = (caddr_t)(intptr_t)fuword(envv++))) {
if (envp == (caddr_t)-1) {
error = EFAULT;
goto err_exit;
}
if ((error = copyinstr(envp, args->endp,
args->stringspace, &length))) {
if (error == ENAMETOOLONG)
error = E2BIG;
goto err_exit;
}
args->stringspace -= length;
args->endp += length;
args->envc++;
}
}
return (0);
err_exit:
exec_free_args(args);
return (error);
}
/*
* Allocate temporary demand-paged, zero-filled memory for the file name,
* argument, and environment strings. Returns zero if the allocation succeeds
* and ENOMEM otherwise.
*/
int
exec_alloc_args(struct image_args *args)
{
args->buf = (char *)kmem_alloc_wait(exec_map, PATH_MAX + ARG_MAX);
return (args->buf != NULL ? 0 : ENOMEM);
}
void
exec_free_args(struct image_args *args)
{
if (args->buf != NULL) {
kmem_free_wakeup(exec_map, (vm_offset_t)args->buf,
PATH_MAX + ARG_MAX);
args->buf = NULL;
}
if (args->fname_buf != NULL) {
free(args->fname_buf, M_TEMP);
args->fname_buf = NULL;
}
}
/*
* Copy strings out to the new process address space, constructing new arg
* and env vector tables. Return a pointer to the base so that it can be used
* as the initial stack pointer.
*/
register_t *
exec_copyout_strings(imgp)
struct image_params *imgp;
{
int argc, envc;
char **vectp;
char *stringp, *destp;
register_t *stack_base;
struct ps_strings *arginfo;
struct proc *p;
size_t execpath_len;
int szsigcode, szps;
char canary[sizeof(long) * 8];
szps = sizeof(pagesizes[0]) * MAXPAGESIZES;
/*
* Calculate string base and vector table pointers.
* Also deal with signal trampoline code for this exec type.
*/
if (imgp->execpath != NULL && imgp->auxargs != NULL)
execpath_len = strlen(imgp->execpath) + 1;
else
execpath_len = 0;
p = imgp->proc;
szsigcode = 0;
arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings;
if (p->p_sysent->sv_sigcode_base == 0) {
if (p->p_sysent->sv_szsigcode != NULL)
szsigcode = *(p->p_sysent->sv_szsigcode);
}
destp = (caddr_t)arginfo - szsigcode - SPARE_USRSPACE -
roundup(execpath_len, sizeof(char *)) -
roundup(sizeof(canary), sizeof(char *)) -
roundup(szps, sizeof(char *)) -
roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *));
/*
* install sigcode
*/
if (szsigcode != 0)
copyout(p->p_sysent->sv_sigcode, ((caddr_t)arginfo -
szsigcode), szsigcode);
/*
* Copy the image path for the rtld.
*/
if (execpath_len != 0) {
imgp->execpathp = (uintptr_t)arginfo - szsigcode - execpath_len;
copyout(imgp->execpath, (void *)imgp->execpathp,
execpath_len);
}
/*
* Prepare the canary for SSP.
*/
arc4rand(canary, sizeof(canary), 0);
imgp->canary = (uintptr_t)arginfo - szsigcode - execpath_len -
sizeof(canary);
copyout(canary, (void *)imgp->canary, sizeof(canary));
imgp->canarylen = sizeof(canary);
/*
* Prepare the pagesizes array.
*/
imgp->pagesizes = (uintptr_t)arginfo - szsigcode - execpath_len -
roundup(sizeof(canary), sizeof(char *)) - szps;
copyout(pagesizes, (void *)imgp->pagesizes, szps);
imgp->pagesizeslen = szps;
/*
* If we have a valid auxargs ptr, prepare some room
* on the stack.
*/
if (imgp->auxargs) {
/*
* 'AT_COUNT*2' is the size of the ELF Auxargs data. This is kept
* for backward compatibility.
*/
imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
(AT_COUNT * 2);
/*
* The '+ 2' is for the null pointers at the end of each of
* the arg and env vector sets, and imgp->auxarg_size is room
* for the arguments of the runtime loader.
*/
vectp = (char **)(destp - (imgp->args->argc +
imgp->args->envc + 2 + imgp->auxarg_size)
* sizeof(char *));
} else {
/*
* The '+ 2' is for the null pointers at the end of each of
* the arg and env vector sets
*/
vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2) *
sizeof(char *));
}
/*
* vectp also becomes our initial stack base
*/
stack_base = (register_t *)vectp;
stringp = imgp->args->begin_argv;
argc = imgp->args->argc;
envc = imgp->args->envc;
/*
* Copy out strings - arguments and environment.
*/
copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
/*
* Fill in "ps_strings" struct for ps, w, etc.
*/
suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp);
suword32(&arginfo->ps_nargvstr, argc);
/*
* Fill in argument portion of vector table.
*/
for (; argc > 0; --argc) {
suword(vectp++, (long)(intptr_t)destp);
while (*stringp++ != 0)
destp++;
destp++;
}
/* a null vector table pointer separates the argp's from the envp's */
suword(vectp++, 0);
suword(&arginfo->ps_envstr, (long)(intptr_t)vectp);
suword32(&arginfo->ps_nenvstr, envc);
/*
* Fill in environment portion of vector table.
*/
for (; envc > 0; --envc) {
suword(vectp++, (long)(intptr_t)destp);
while (*stringp++ != 0)
destp++;
destp++;
}
/* end of vector table is a null pointer */
suword(vectp, 0);
return (stack_base);
}
/*
* Check permissions of file to execute.
* Called with imgp->vp locked.
* Return 0 for success or error code on failure.
*/
int
exec_check_permissions(imgp)
struct image_params *imgp;
{
struct vnode *vp = imgp->vp;
struct vattr *attr = imgp->attr;
struct thread *td;
int error;
td = curthread;
/* Get file attributes */
error = VOP_GETATTR(vp, attr, td->td_ucred);
if (error)
return (error);
#ifdef MAC
error = mac_vnode_check_exec(td->td_ucred, imgp->vp, imgp);
if (error)
return (error);
#endif
/*
* 1) Check if file execution is disabled for the filesystem that
* this file resides on.
* 2) Ensure that at least one execute bit is on. Otherwise, a
* privileged user will always succeed, and we don't want this
* to happen unless the file really is executable.
* 3) Ensure that the file is a regular file.
*/
if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
(attr->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0 ||
(attr->va_type != VREG))
return (EACCES);
/*
* Zero length files can't be exec'd
*/
if (attr->va_size == 0)
return (ENOEXEC);
/*
* Check for execute permission to file based on current credentials.
*/
error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
if (error)
return (error);
/*
* Check number of open-for-writes on the file and deny execution
* if there are any.
*/
if (vp->v_writecount)
return (ETXTBSY);
/*
* Call filesystem specific open routine (which does nothing in the
* general case).
*/
error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL);
if (error == 0)
imgp->opened = 1;
return (error);
}
/*
* Exec handler registration
*/
int
exec_register(execsw_arg)
const struct execsw *execsw_arg;
{
const struct execsw **es, **xs, **newexecsw;
int count = 2; /* New slot and trailing NULL */
if (execsw)
for (es = execsw; *es; es++)
count++;
newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
if (newexecsw == NULL)
return (ENOMEM);
xs = newexecsw;
if (execsw)
for (es = execsw; *es; es++)
*xs++ = *es;
*xs++ = execsw_arg;
*xs = NULL;
if (execsw)
free(execsw, M_TEMP);
execsw = newexecsw;
return (0);
}
int
exec_unregister(execsw_arg)
const struct execsw *execsw_arg;
{
const struct execsw **es, **xs, **newexecsw;
int count = 1;
if (execsw == NULL)
panic("unregister with no handlers left?\n");
for (es = execsw; *es; es++) {
if (*es == execsw_arg)
break;
}
if (*es == NULL)
return (ENOENT);
for (es = execsw; *es; es++)
if (*es != execsw_arg)
count++;
newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
if (newexecsw == NULL)
return (ENOMEM);
xs = newexecsw;
for (es = execsw; *es; es++)
if (*es != execsw_arg)
*xs++ = *es;
*xs = NULL;
if (execsw)
free(execsw, M_TEMP);
execsw = newexecsw;
return (0);
}
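
exec_register() and exec_unregister() maintain the NULL-terminated execsw vector that do_execve() walks. Image activators normally do not call them directly; they use the EXEC_SET() helper from <sys/imgact.h>. A hedged sketch with a made-up activator name (struct layout assumed from <sys/imgact.h>; the logic is illustrative only):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/imgact.h>

/*
 * Example activator: recognize nothing, so the next execsw entry gets a
 * chance.  A real activator inspects imgp->image_header and returns 0 on
 * success or an errno value on a hard failure.
 */
static int
exec_example_imgact(struct image_params *imgp)
{

    return (-1);
}

static struct execsw example_execsw = {
    .ex_imgact = exec_example_imgact,
    .ex_name = "example"
};
EXEC_SET(example, example_execsw);
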
static vm_object_t shared_page_obj;
static int shared_page_free;
int
shared_page_fill(int size, int align, const char *data)
{
vm_page_t m;
struct sf_buf *s;
vm_offset_t sk;
int res;
VM_OBJECT_LOCK(shared_page_obj);
m = vm_page_grab(shared_page_obj, 0, VM_ALLOC_RETRY);
res = roundup(shared_page_free, align);
if (res + size >= IDX_TO_OFF(shared_page_obj->size))
res = -1;
else {
VM_OBJECT_UNLOCK(shared_page_obj);
s = sf_buf_alloc(m, SFB_DEFAULT);
sk = sf_buf_kva(s);
bcopy(data, (void *)(sk + res), size);
shared_page_free = res + size;
sf_buf_free(s);
VM_OBJECT_LOCK(shared_page_obj);
}
vm_page_wakeup(m);
VM_OBJECT_UNLOCK(shared_page_obj);
return (res);
}
static void
shared_page_init(void *dummy __unused)
{
vm_page_t m;
shared_page_obj = vm_pager_allocate(OBJT_PHYS, 0, PAGE_SIZE,
VM_PROT_DEFAULT, 0, NULL);
VM_OBJECT_LOCK(shared_page_obj);
m = vm_page_grab(shared_page_obj, 0, VM_ALLOC_RETRY | VM_ALLOC_NOBUSY |
VM_ALLOC_ZERO);
m->valid = VM_PAGE_BITS_ALL;
VM_OBJECT_UNLOCK(shared_page_obj);
}
SYSINIT(shp, SI_SUB_EXEC, SI_ORDER_FIRST, (sysinit_cfunc_t)shared_page_init,
NULL);
void
exec_sysvec_init(void *param)
{
struct sysentvec *sv;
sv = (struct sysentvec *)param;
if ((sv->sv_flags & SV_SHP) == 0)
return;
sv->sv_shared_page_obj = shared_page_obj;
sv->sv_sigcode_base = sv->sv_shared_page_base +
shared_page_fill(*(sv->sv_szsigcode), 16, sv->sv_sigcode);
}
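
A hedged note on usage (not part of this diff): an ABI opts into the shared page by setting SV_SHP in its sysentvec's sv_flags and arranging for exec_sysvec_init() to run once the page object exists, e.g. via a SYSINIT ordered after shared_page_init() above. The sysentvec name below is hypothetical:

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/sysent.h>

/* Defined by the ABI module, with SV_SHP set in sv_flags (hypothetical). */
extern struct sysentvec example_freebsd_sysvec;

SYSINIT(example_shp, SI_SUB_EXEC, SI_ORDER_ANY,
    (sysinit_cfunc_t)exec_sysvec_init, &example_freebsd_sysvec);
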
Index: head/sys/kern/kern_exit.c
===================================================================
--- head/sys/kern/kern_exit.c (revision 225616)
+++ head/sys/kern/kern_exit.c (revision 225617)
@@ -1,956 +1,956 @@
/*-
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_exit.c 8.7 (Berkeley) 2/12/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_kdtrace.h"
#include "opt_ktrace.h"
#include "opt_procdesc.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/capability.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/procdesc.h>
#include <sys/pioctl.h>
#include <sys/jail.h>
#include <sys/tty.h>
#include <sys/wait.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/sbuf.h>
#include <sys/signalvar.h>
#include <sys/sched.h>
#include <sys/sx.h>
#include <sys/syscallsubr.h>
#include <sys/syslog.h>
#include <sys/ptrace.h>
#include <sys/acct.h> /* for acct_process() function prototype */
#include <sys/filedesc.h>
#include <sys/sdt.h>
#include <sys/shm.h>
#include <sys/sem.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/uma.h>
#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
dtrace_execexit_func_t dtrace_fasttrap_exit;
#endif
SDT_PROVIDER_DECLARE(proc);
SDT_PROBE_DEFINE(proc, kernel, , exit, exit);
SDT_PROBE_ARGTYPE(proc, kernel, , exit, 0, "int");
/* Hook for NFS teardown procedure. */
void (*nlminfo_release_p)(struct proc *p);
/*
* exit -- death of process.
*/
void
-sys_exit(struct thread *td, struct sys_exit_args *uap)
+sys_sys_exit(struct thread *td, struct sys_exit_args *uap)
{
exit1(td, W_EXITCODE(uap->rval, 0));
/* NOTREACHED */
}
/*
* Exit: deallocate address space and other resources, change proc state to
* zombie, and unlink proc from allproc and parent's lists. Save exit status
* and rusage for wait(). Check for child processes and orphan them.
*/
void
exit1(struct thread *td, int rv)
{
struct proc *p, *nq, *q;
struct vnode *vtmp;
struct vnode *ttyvp = NULL;
struct plimit *plim;
int locked;
mtx_assert(&Giant, MA_NOTOWNED);
p = td->td_proc;
/*
* XXX in case we're rebooting we just let init die in order to
* work around an unsolved stack overflow seen very late during
* shutdown on sparc64 when the gmirror worker process exits.
*/
if (p == initproc && rebooting == 0) {
printf("init died (signal %d, exit %d)\n",
WTERMSIG(rv), WEXITSTATUS(rv));
panic("Going nowhere without my init!");
}
/*
* MUST abort all other threads before proceeding past here.
*/
PROC_LOCK(p);
while (p->p_flag & P_HADTHREADS) {
/*
* First check if some other thread got here before us;
* if so, act appropriately (exit or suspend).
*/
thread_suspend_check(0);
/*
* Kill off the other threads. This requires
* some co-operation from other parts of the kernel
* so it may not be instantaneous. With this state set
* any thread entering the kernel from userspace will
* thread_exit() in trap(). Any thread attempting to
* sleep will return immediately with EINTR or EWOULDBLOCK
* which will hopefully force them to back out to userland
* freeing resources as they go. Any thread attempting
* to return to userland will thread_exit() from userret().
* thread_exit() will unsuspend us when the last of the
* other threads exits.
* If there is already a thread singler after resumption,
* calling thread_single() will fail; in that case, we just
* re-check all suspension requests: the thread should
* either be suspended there or exit.
*/
if (! thread_single(SINGLE_EXIT))
break;
/*
* All other activity in this process is now stopped.
* Threading support has been turned off.
*/
}
KASSERT(p->p_numthreads == 1,
("exit1: proc %p exiting with %d threads", p, p->p_numthreads));
racct_sub(p, RACCT_NTHR, 1);
/*
* Wakeup anyone in procfs' PIOCWAIT. They should have a hold
* on our vmspace, so we should block below until they have
* released their reference to us. Note that if they have
* requested S_EXIT stops we will block here until they ack
* via PIOCCONT.
*/
_STOPEVENT(p, S_EXIT, rv);
/*
* Note that we are exiting and do another wakeup of anyone in
* PIOCWAIT in case they aren't listening for S_EXIT stops or
* decided to wait again after we told them we are exiting.
*/
p->p_flag |= P_WEXIT;
wakeup(&p->p_stype);
/*
* Wait for any processes that have a hold on our vmspace to
* release their reference.
*/
while (p->p_lock > 0)
msleep(&p->p_lock, &p->p_mtx, PWAIT, "exithold", 0);
p->p_xstat = rv; /* Let event handler change exit status */
PROC_UNLOCK(p);
/* Drain the limit callout while we don't have the proc locked */
callout_drain(&p->p_limco);
#ifdef AUDIT
/*
* The Sun BSM exit token contains two components: an exit status as
* passed to exit(), and a return value to indicate what sort of exit
* it was. The exit status is WEXITSTATUS(rv), but it's not clear
* what the return value is.
*/
AUDIT_ARG_EXIT(WEXITSTATUS(rv), 0);
AUDIT_SYSCALL_EXIT(0, td);
#endif
/* Are we a task leader? */
if (p == p->p_leader) {
mtx_lock(&ppeers_lock);
q = p->p_peers;
while (q != NULL) {
PROC_LOCK(q);
- psignal(q, SIGKILL);
+ kern_psignal(q, SIGKILL);
PROC_UNLOCK(q);
q = q->p_peers;
}
while (p->p_peers != NULL)
msleep(p, &ppeers_lock, PWAIT, "exit1", 0);
mtx_unlock(&ppeers_lock);
}
/*
* Check if any loadable modules need anything done at process exit.
* E.g. SYSV IPC stuff
* XXX what if one of these generates an error?
*/
EVENTHANDLER_INVOKE(process_exit, p);
/*
* If parent is waiting for us to exit or exec,
* P_PPWAIT is set; we will wakeup the parent below.
*/
PROC_LOCK(p);
rv = p->p_xstat; /* Event handler could change exit status */
stopprofclock(p);
p->p_flag &= ~(P_TRACED | P_PPWAIT);
/*
* Stop the real interval timer. If the handler is currently
* executing, prevent it from rearming itself and let it finish.
*/
if (timevalisset(&p->p_realtimer.it_value) &&
callout_stop(&p->p_itcallout) == 0) {
timevalclear(&p->p_realtimer.it_interval);
msleep(&p->p_itcallout, &p->p_mtx, PWAIT, "ritwait", 0);
KASSERT(!timevalisset(&p->p_realtimer.it_value),
("realtime timer is still armed"));
}
PROC_UNLOCK(p);
/*
* Reset any sigio structures pointing to us as a result of
* F_SETOWN with our pid.
*/
funsetownlst(&p->p_sigiolst);
/*
* If this process has an nlminfo data area (for lockd), release it
*/
if (nlminfo_release_p != NULL && p->p_nlminfo != NULL)
(*nlminfo_release_p)(p);
/*
* Close open files and release open-file table.
* This may block!
*/
fdfree(td);
/*
* If this thread tickled GEOM, we need to wait for the giggling to
* stop before we return to userland
*/
if (td->td_pflags & TDP_GEOM)
g_waitidle();
/*
* Remove ourself from our leader's peer list and wake our leader.
*/
mtx_lock(&ppeers_lock);
if (p->p_leader->p_peers) {
q = p->p_leader;
while (q->p_peers != p)
q = q->p_peers;
q->p_peers = p->p_peers;
wakeup(p->p_leader);
}
mtx_unlock(&ppeers_lock);
vmspace_exit(td);
sx_xlock(&proctree_lock);
if (SESS_LEADER(p)) {
struct session *sp = p->p_session;
struct tty *tp;
/*
* s_ttyp is not zero'd; we use this to indicate that
* the session once had a controlling terminal. (for
* logging and informational purposes)
*/
SESS_LOCK(sp);
ttyvp = sp->s_ttyvp;
tp = sp->s_ttyp;
sp->s_ttyvp = NULL;
sp->s_ttydp = NULL;
sp->s_leader = NULL;
SESS_UNLOCK(sp);
/*
* Signal foreground pgrp and revoke access to
* controlling terminal if it has not been revoked
* already.
*
* Because the TTY may have been revoked in the mean
* time and could already have a new session associated
* with it, make sure we don't send a SIGHUP to a
* foreground process group that does not belong to this
* session.
*/
if (tp != NULL) {
tty_lock(tp);
if (tp->t_session == sp)
tty_signal_pgrp(tp, SIGHUP);
tty_unlock(tp);
}
if (ttyvp != NULL) {
sx_xunlock(&proctree_lock);
if (vn_lock(ttyvp, LK_EXCLUSIVE) == 0) {
VOP_REVOKE(ttyvp, REVOKEALL);
VOP_UNLOCK(ttyvp, 0);
}
sx_xlock(&proctree_lock);
}
}
fixjobc(p, p->p_pgrp, 0);
sx_xunlock(&proctree_lock);
(void)acct_process(td);
/* Release the TTY now we've unlocked everything. */
if (ttyvp != NULL)
vrele(ttyvp);
#ifdef KTRACE
ktrprocexit(td);
#endif
/*
* Release reference to text vnode
*/
if ((vtmp = p->p_textvp) != NULL) {
p->p_textvp = NULL;
locked = VFS_LOCK_GIANT(vtmp->v_mount);
vrele(vtmp);
VFS_UNLOCK_GIANT(locked);
}
/*
* Release our limits structure.
*/
PROC_LOCK(p);
plim = p->p_limit;
p->p_limit = NULL;
PROC_UNLOCK(p);
lim_free(plim);
tidhash_remove(td);
/*
* Remove proc from allproc queue and pidhash chain.
* Place onto zombproc. Unlink from parent's child list.
*/
sx_xlock(&allproc_lock);
LIST_REMOVE(p, p_list);
LIST_INSERT_HEAD(&zombproc, p, p_list);
LIST_REMOVE(p, p_hash);
sx_xunlock(&allproc_lock);
/*
* Call machine-dependent code to release any
* machine-dependent resources other than the address space.
* The address space is released by "vmspace_exitfree(p)" in
* vm_waitproc().
*/
cpu_exit(td);
WITNESS_WARN(WARN_PANIC, NULL, "process (pid %d) exiting", p->p_pid);
/*
* Reparent all of our children to init.
*/
sx_xlock(&proctree_lock);
q = LIST_FIRST(&p->p_children);
if (q != NULL) /* only need this if any child is S_ZOMB */
wakeup(initproc);
for (; q != NULL; q = nq) {
nq = LIST_NEXT(q, p_sibling);
PROC_LOCK(q);
proc_reparent(q, initproc);
q->p_sigparent = SIGCHLD;
/*
* Traced processes are killed
* since their existence means someone is screwing up.
*/
if (q->p_flag & P_TRACED) {
struct thread *temp;
q->p_flag &= ~(P_TRACED | P_STOPPED_TRACE);
FOREACH_THREAD_IN_PROC(q, temp)
temp->td_dbgflags &= ~TDB_SUSPEND;
- psignal(q, SIGKILL);
+ kern_psignal(q, SIGKILL);
}
PROC_UNLOCK(q);
}
/* Save exit status. */
PROC_LOCK(p);
p->p_xthread = td;
/* Tell the prison that we are gone. */
prison_proc_free(p->p_ucred->cr_prison);
#ifdef KDTRACE_HOOKS
/*
* Tell the DTrace fasttrap provider about the exit if it
* has declared an interest.
*/
if (dtrace_fasttrap_exit)
dtrace_fasttrap_exit(p);
#endif
/*
* Notify interested parties of our demise.
*/
KNOTE_LOCKED(&p->p_klist, NOTE_EXIT);
#ifdef KDTRACE_HOOKS
int reason = CLD_EXITED;
if (WCOREDUMP(rv))
reason = CLD_DUMPED;
else if (WIFSIGNALED(rv))
reason = CLD_KILLED;
SDT_PROBE(proc, kernel, , exit, reason, 0, 0, 0, 0);
#endif
/*
* Just delete all entries in the p_klist. At this point we won't
* report any more events, and there are nasty race conditions that
* can beat us if we don't.
*/
knlist_clear(&p->p_klist, 1);
/*
* If this is a process with a descriptor, we may not need to deliver
* a signal to the parent. proctree_lock is held over
* procdesc_exit() to serialize concurrent calls to close() and
* exit().
*/
#ifdef PROCDESC
if (p->p_procdesc == NULL || procdesc_exit(p)) {
#endif
/*
* Notify parent that we're gone. If parent has the
* PS_NOCLDWAIT flag set, or if the handler is set to SIG_IGN,
* notify process 1 instead (and hope it will handle this
* situation).
*/
PROC_LOCK(p->p_pptr);
mtx_lock(&p->p_pptr->p_sigacts->ps_mtx);
if (p->p_pptr->p_sigacts->ps_flag &
(PS_NOCLDWAIT | PS_CLDSIGIGN)) {
struct proc *pp;
mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx);
pp = p->p_pptr;
PROC_UNLOCK(pp);
proc_reparent(p, initproc);
p->p_sigparent = SIGCHLD;
PROC_LOCK(p->p_pptr);
/*
* Notify parent, so in case he was wait(2)ing or
* executing waitpid(2) with our pid, he will
* continue.
*/
wakeup(pp);
} else
mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx);
if (p->p_pptr == initproc)
- psignal(p->p_pptr, SIGCHLD);
+ kern_psignal(p->p_pptr, SIGCHLD);
else if (p->p_sigparent != 0) {
if (p->p_sigparent == SIGCHLD)
childproc_exited(p);
else /* LINUX thread */
- psignal(p->p_pptr, p->p_sigparent);
+ kern_psignal(p->p_pptr, p->p_sigparent);
}
#ifdef PROCDESC
} else
PROC_LOCK(p->p_pptr);
#endif
sx_xunlock(&proctree_lock);
/*
* The state PRS_ZOMBIE prevents other processes from sending
* signals to the process. To avoid a memory leak, we free the
* memory for the signal queue at the time the state is set.
*/
sigqueue_flush(&p->p_sigqueue);
sigqueue_flush(&td->td_sigqueue);
/*
* We have to wait until after acquiring all locks before
* changing p_state. We need to avoid all possible context
* switches (including ones from blocking on a mutex) while
* marked as a zombie. We also have to set the zombie state
* before we release the parent process' proc lock to avoid
* a lost wakeup. So, we first call wakeup, then we grab the
* sched lock, update the state, and release the parent process'
* proc lock.
*/
wakeup(p->p_pptr);
cv_broadcast(&p->p_pwait);
sched_exit(p->p_pptr, td);
PROC_SLOCK(p);
p->p_state = PRS_ZOMBIE;
PROC_UNLOCK(p->p_pptr);
/*
* Hopefully no one will try to deliver a signal to the process this
* late in the game.
*/
knlist_destroy(&p->p_klist);
/*
* Save our children's rusage information in our exit rusage.
*/
ruadd(&p->p_ru, &p->p_rux, &p->p_stats->p_cru, &p->p_crux);
/*
* Make sure the scheduler takes this thread out of its tables etc.
* This will also release this thread's reference to the ucred.
* Other thread parts to release include pcb bits and such.
*/
thread_exit();
}
#ifndef _SYS_SYSPROTO_H_
struct abort2_args {
char *why;
int nargs;
void **args;
};
#endif
int
-abort2(struct thread *td, struct abort2_args *uap)
+sys_abort2(struct thread *td, struct abort2_args *uap)
{
struct proc *p = td->td_proc;
struct sbuf *sb;
void *uargs[16];
int error, i, sig;
/*
* Do it right now so we can log either a proper call of abort2() or
* note that an invalid argument was passed. 512 is big enough to
* handle 16 arguments' descriptions with additional comments.
*/
sb = sbuf_new(NULL, NULL, 512, SBUF_FIXEDLEN);
sbuf_clear(sb);
sbuf_printf(sb, "%s(pid %d uid %d) aborted: ",
p->p_comm, p->p_pid, td->td_ucred->cr_uid);
/*
* Since we can't return from abort2(), send SIGKILL in cases where
* abort2() was called improperly.
*/
sig = SIGKILL;
/* Prevent DoS from user-space. */
if (uap->nargs < 0 || uap->nargs > 16)
goto out;
if (uap->nargs > 0) {
if (uap->args == NULL)
goto out;
error = copyin(uap->args, uargs, uap->nargs * sizeof(void *));
if (error != 0)
goto out;
}
/*
* Limit size of 'reason' string to 128. Will fit even when
* maximal number of arguments was chosen to be logged.
*/
if (uap->why != NULL) {
error = sbuf_copyin(sb, uap->why, 128);
if (error < 0)
goto out;
} else {
sbuf_printf(sb, "(null)");
}
if (uap->nargs > 0) {
sbuf_printf(sb, "(");
for (i = 0; i < uap->nargs; i++)
sbuf_printf(sb, "%s%p", i == 0 ? "" : ", ", uargs[i]);
sbuf_printf(sb, ")");
}
/*
* Final stage: arguments were proper, the string has been
* successfully copied from userspace, and copying pointers
* from user-space succeeded.
*/
sig = SIGABRT;
out:
if (sig == SIGKILL) {
sbuf_trim(sb);
sbuf_printf(sb, " (Reason text inaccessible)");
}
sbuf_cat(sb, "\n");
sbuf_finish(sb);
log(LOG_INFO, "%s", sbuf_data(sb));
sbuf_delete(sb);
exit1(td, W_EXITCODE(0, sig));
return (0);
}
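
Userland reaches this through abort2(2). A small illustrative caller (not part of the change) that logs a reason plus two pointer-sized arguments before dying with SIGABRT:

#include <stdint.h>
#include <stdlib.h>

int
main(void)
{
    void *args[2];

    args[0] = (void *)(intptr_t)42;
    args[1] = (void *)(uintptr_t)0xdeadbeef;
    /*
     * Shows up in the system log roughly as
     * "a.out(pid N uid U) aborted: invariant violated(0x2a, 0xdeadbeef)"
     * and the process terminates with SIGABRT.
     */
    abort2("invariant violated", 2, args);
    /* NOTREACHED */
    return (0);
}
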
#ifdef COMPAT_43
/*
* The dirty work is handled by kern_wait().
*/
int
owait(struct thread *td, struct owait_args *uap __unused)
{
int error, status;
error = kern_wait(td, WAIT_ANY, &status, 0, NULL);
if (error == 0)
td->td_retval[1] = status;
return (error);
}
#endif /* COMPAT_43 */
/*
* The dirty work is handled by kern_wait().
*/
int
-wait4(struct thread *td, struct wait_args *uap)
+sys_wait4(struct thread *td, struct wait_args *uap)
{
struct rusage ru, *rup;
int error, status;
if (uap->rusage != NULL)
rup = &ru;
else
rup = NULL;
error = kern_wait(td, uap->pid, &status, uap->options, rup);
if (uap->status != NULL && error == 0)
error = copyout(&status, uap->status, sizeof(status));
if (uap->rusage != NULL && error == 0)
error = copyout(&ru, uap->rusage, sizeof(struct rusage));
return (error);
}
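
kern_wait() does the heavy lifting; the wrapper above only copies status and rusage back to userspace. For illustration only, a minimal userland pairing of fork(2) with wait4(2):

#include <sys/types.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/wait.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
    struct rusage ru;
    pid_t pid;
    int status;

    if ((pid = fork()) == 0)
        _exit(7);               /* child */
    if (wait4(pid, &status, 0, &ru) == pid && WIFEXITED(status))
        printf("child %d exited %d, user time %ld.%06ld s\n",
            (int)pid, WEXITSTATUS(status),
            (long)ru.ru_utime.tv_sec, (long)ru.ru_utime.tv_usec);
    return (0);
}
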
/*
* Reap the remains of a zombie process and optionally return status and
* rusage. Asserts and will release both the proctree_lock and the process
* lock as part of its work.
*/
void
proc_reap(struct thread *td, struct proc *p, int *status, int options,
struct rusage *rusage)
{
struct proc *q, *t;
sx_assert(&proctree_lock, SA_XLOCKED);
PROC_LOCK_ASSERT(p, MA_OWNED);
PROC_SLOCK_ASSERT(p, MA_OWNED);
KASSERT(p->p_state == PRS_ZOMBIE, ("proc_reap: !PRS_ZOMBIE"));
q = td->td_proc;
if (rusage) {
*rusage = p->p_ru;
calcru(p, &rusage->ru_utime, &rusage->ru_stime);
}
PROC_SUNLOCK(p);
td->td_retval[0] = p->p_pid;
if (status)
*status = p->p_xstat; /* convert to int */
if (options & WNOWAIT) {
/*
* Only poll, returning the status. Caller does not wish to
* release the proc struct just yet.
*/
PROC_UNLOCK(p);
sx_xunlock(&proctree_lock);
return;
}
PROC_LOCK(q);
sigqueue_take(p->p_ksi);
PROC_UNLOCK(q);
PROC_UNLOCK(p);
/*
* If we got the child via a ptrace 'attach', we need to give it back
* to the old parent.
*/
if (p->p_oppid && (t = pfind(p->p_oppid)) != NULL) {
PROC_LOCK(p);
proc_reparent(p, t);
p->p_pptr->p_dbg_child--;
p->p_oppid = 0;
PROC_UNLOCK(p);
pksignal(t, SIGCHLD, p->p_ksi);
wakeup(t);
cv_broadcast(&p->p_pwait);
PROC_UNLOCK(t);
sx_xunlock(&proctree_lock);
return;
}
/*
* Remove other references to this process to ensure we have an
* exclusive reference.
*/
sx_xlock(&allproc_lock);
LIST_REMOVE(p, p_list); /* off zombproc */
sx_xunlock(&allproc_lock);
LIST_REMOVE(p, p_sibling);
leavepgrp(p);
#ifdef PROCDESC
if (p->p_procdesc != NULL)
procdesc_reap(p);
#endif
sx_xunlock(&proctree_lock);
/*
* As a side effect of this lock, we know that all other writes to
* this proc are visible now, so no more locking is needed for p.
*/
PROC_LOCK(p);
p->p_xstat = 0; /* XXX: why? */
PROC_UNLOCK(p);
PROC_LOCK(q);
ruadd(&q->p_stats->p_cru, &q->p_crux, &p->p_ru, &p->p_rux);
PROC_UNLOCK(q);
/*
* Decrement the count of procs running with this uid.
*/
(void)chgproccnt(p->p_ucred->cr_ruidinfo, -1, 0);
/*
* Destroy resource accounting information associated with the process.
*/
racct_proc_exit(p);
#ifdef RACCT
PROC_LOCK(p->p_pptr);
racct_sub(p->p_pptr, RACCT_NPROC, 1);
PROC_UNLOCK(p->p_pptr);
#endif
/*
* Free credentials, arguments, and sigacts.
*/
crfree(p->p_ucred);
p->p_ucred = NULL;
pargs_drop(p->p_args);
p->p_args = NULL;
sigacts_free(p->p_sigacts);
p->p_sigacts = NULL;
/*
* Do any thread-system specific cleanups.
*/
thread_wait(p);
/*
* Give vm and machine-dependent layer a chance to free anything that
* cpu_exit couldn't release while still running in process context.
*/
vm_waitproc(p);
#ifdef MAC
mac_proc_destroy(p);
#endif
KASSERT(FIRST_THREAD_IN_PROC(p),
("proc_reap: no residual thread!"));
uma_zfree(proc_zone, p);
sx_xlock(&allproc_lock);
nprocs--;
sx_xunlock(&allproc_lock);
}
int
kern_wait(struct thread *td, pid_t pid, int *status, int options,
struct rusage *rusage)
{
struct proc *p, *q;
int error, nfound;
AUDIT_ARG_PID(pid);
AUDIT_ARG_VALUE(options);
q = td->td_proc;
if (pid == 0) {
PROC_LOCK(q);
pid = -q->p_pgid;
PROC_UNLOCK(q);
}
/* If we don't know the option, just return. */
if (options & ~(WUNTRACED|WNOHANG|WCONTINUED|WNOWAIT|WLINUXCLONE))
return (EINVAL);
loop:
if (q->p_flag & P_STATCHILD) {
PROC_LOCK(q);
q->p_flag &= ~P_STATCHILD;
PROC_UNLOCK(q);
}
nfound = 0;
sx_xlock(&proctree_lock);
LIST_FOREACH(p, &q->p_children, p_sibling) {
PROC_LOCK(p);
if (pid != WAIT_ANY &&
p->p_pid != pid && p->p_pgid != -pid) {
PROC_UNLOCK(p);
continue;
}
if (p_canwait(td, p)) {
PROC_UNLOCK(p);
continue;
}
/*
* This special case handles a kthread spawned by linux_clone
* (see linux_misc.c). The linux_wait4 and linux_waitpid
* functions need to be able to distinguish between waiting
* on a process and waiting on a thread. It is a thread if
* p_sigparent is not SIGCHLD, and the WLINUXCLONE option
* signifies we want to wait for threads and not processes.
*/
if ((p->p_sigparent != SIGCHLD) ^
((options & WLINUXCLONE) != 0)) {
PROC_UNLOCK(p);
continue;
}
nfound++;
PROC_SLOCK(p);
if (p->p_state == PRS_ZOMBIE) {
proc_reap(td, p, status, options, rusage);
return (0);
}
if ((p->p_flag & P_STOPPED_SIG) &&
(p->p_suspcount == p->p_numthreads) &&
(p->p_flag & P_WAITED) == 0 &&
(p->p_flag & P_TRACED || options & WUNTRACED)) {
PROC_SUNLOCK(p);
p->p_flag |= P_WAITED;
sx_xunlock(&proctree_lock);
td->td_retval[0] = p->p_pid;
if (status)
*status = W_STOPCODE(p->p_xstat);
PROC_LOCK(q);
sigqueue_take(p->p_ksi);
PROC_UNLOCK(q);
PROC_UNLOCK(p);
return (0);
}
PROC_SUNLOCK(p);
if (options & WCONTINUED && (p->p_flag & P_CONTINUED)) {
sx_xunlock(&proctree_lock);
td->td_retval[0] = p->p_pid;
p->p_flag &= ~P_CONTINUED;
PROC_LOCK(q);
sigqueue_take(p->p_ksi);
PROC_UNLOCK(q);
PROC_UNLOCK(p);
if (status)
*status = SIGCONT;
return (0);
}
PROC_UNLOCK(p);
}
if (nfound == 0) {
sx_xunlock(&proctree_lock);
if (td->td_proc->p_dbg_child)
return (0);
else
return (ECHILD);
}
if (options & WNOHANG) {
sx_xunlock(&proctree_lock);
td->td_retval[0] = 0;
return (0);
}
PROC_LOCK(q);
sx_xunlock(&proctree_lock);
if (q->p_flag & P_STATCHILD) {
q->p_flag &= ~P_STATCHILD;
error = 0;
} else
error = msleep(q, &q->p_mtx, PWAIT | PCATCH, "wait", 0);
PROC_UNLOCK(q);
if (error)
return (error);
goto loop;
}
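/*
 * Example: a userland sketch of the option handling above.  WUNTRACED
 * reports a stopped child, WCONTINUED reports one resumed by SIGCONT,
 * and WNOHANG (not shown) would return 0 instead of sleeping in the
 * msleep() above.  Standard interfaces only; nothing here is specific
 * to this change.
 */
#include <sys/types.h>
#include <sys/wait.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        pid_t pid;
        int status;

        if ((pid = fork()) == 0) {
                for (;;)
                        pause();                /* child: wait for signals */
        }
        kill(pid, SIGSTOP);
        if (waitpid(pid, &status, WUNTRACED) == pid && WIFSTOPPED(status))
                printf("stopped by signal %d\n", WSTOPSIG(status));
        kill(pid, SIGCONT);
        if (waitpid(pid, &status, WCONTINUED) == pid && WIFCONTINUED(status))
                printf("continued\n");
        kill(pid, SIGKILL);
        waitpid(pid, &status, 0);
        return (0);
}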
/*
* Make process 'parent' the new parent of process 'child'.
* Must be called with an exclusive hold of proctree lock.
*/
void
proc_reparent(struct proc *child, struct proc *parent)
{
#ifdef RACCT
int locked;
#endif
sx_assert(&proctree_lock, SX_XLOCKED);
PROC_LOCK_ASSERT(child, MA_OWNED);
if (child->p_pptr == parent)
return;
#ifdef RACCT
locked = PROC_LOCKED(parent);
if (!locked)
PROC_LOCK(parent);
racct_add_force(parent, RACCT_NPROC, 1);
if (!locked)
PROC_UNLOCK(parent);
#endif
PROC_LOCK(child->p_pptr);
racct_sub(child->p_pptr, RACCT_NPROC, 1);
sigqueue_take(child->p_ksi);
PROC_UNLOCK(child->p_pptr);
LIST_REMOVE(child, p_sibling);
LIST_INSERT_HEAD(&parent->p_children, child, p_sibling);
child->p_pptr = parent;
}
Index: head/sys/kern/kern_fork.c
===================================================================
--- head/sys/kern/kern_fork.c (revision 225616)
+++ head/sys/kern/kern_fork.c (revision 225617)
@@ -1,1087 +1,1087 @@
/*-
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_fork.c 8.6 (Berkeley) 4/8/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_kdtrace.h"
#include "opt_ktrace.h"
#include "opt_kstack_pages.h"
#include "opt_procdesc.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/eventhandler.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/sysctl.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/procdesc.h>
#include <sys/pioctl.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/syscall.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <sys/acct.h>
#include <sys/ktr.h>
#include <sys/ktrace.h>
#include <sys/unistd.h>
#include <sys/sdt.h>
#include <sys/sx.h>
#include <sys/sysent.h>
#include <sys/signalvar.h>
#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
dtrace_fork_func_t dtrace_fasttrap_fork;
#endif
SDT_PROVIDER_DECLARE(proc);
SDT_PROBE_DEFINE(proc, kernel, , create, create);
SDT_PROBE_ARGTYPE(proc, kernel, , create, 0, "struct proc *");
SDT_PROBE_ARGTYPE(proc, kernel, , create, 1, "struct proc *");
SDT_PROBE_ARGTYPE(proc, kernel, , create, 2, "int");
#ifndef _SYS_SYSPROTO_H_
struct fork_args {
int dummy;
};
#endif
/* ARGSUSED */
int
-fork(struct thread *td, struct fork_args *uap)
+sys_fork(struct thread *td, struct fork_args *uap)
{
int error;
struct proc *p2;
error = fork1(td, RFFDG | RFPROC, 0, &p2, NULL, 0);
if (error == 0) {
td->td_retval[0] = p2->p_pid;
td->td_retval[1] = 0;
}
return (error);
}
/* ARGSUSED */
int
-pdfork(td, uap)
+sys_pdfork(td, uap)
struct thread *td;
struct pdfork_args *uap;
{
#ifdef PROCDESC
int error, fd;
struct proc *p2;
/*
* It is necessary to return fd by reference because 0 is a valid file
* descriptor number, and the child needs to be able to distinguish
* itself from the parent using the return value.
*/
error = fork1(td, RFFDG | RFPROC | RFPROCDESC, 0, &p2,
&fd, uap->flags);
if (error == 0) {
td->td_retval[0] = p2->p_pid;
td->td_retval[1] = 0;
error = copyout(&fd, uap->fdp, sizeof(fd));
}
return (error);
#else
return (ENOSYS);
#endif
}
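/*
 * Example: a userland sketch of pdfork(2), assuming the FreeBSD prototype
 * pid_t pdfork(int *fdp, int flags) from <sys/procdesc.h> and a kernel
 * built with "options PROCDESC".  The parent receives a process
 * descriptor rather than relying on the pid; poll(2) reports POLLHUP
 * when the child exits, and closing the descriptor drops the last
 * reference.
 */
#include <sys/types.h>
#include <sys/procdesc.h>
#include <poll.h>
#include <unistd.h>

int
main(void)
{
        struct pollfd pfd;
        pid_t pid;
        int fd;

        pid = pdfork(&fd, 0);
        if (pid == 0)
                _exit(0);                       /* child: pid is 0 here */
        pfd.fd = fd;                            /* parent: fd names the child */
        pfd.events = POLLHUP;
        poll(&pfd, 1, -1);
        close(fd);
        return (0);
}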
/* ARGSUSED */
int
-vfork(struct thread *td, struct vfork_args *uap)
+sys_vfork(struct thread *td, struct vfork_args *uap)
{
int error, flags;
struct proc *p2;
#ifdef XEN
flags = RFFDG | RFPROC; /* validate that this is still an issue */
#else
flags = RFFDG | RFPROC | RFPPWAIT | RFMEM;
#endif
error = fork1(td, flags, 0, &p2, NULL, 0);
if (error == 0) {
td->td_retval[0] = p2->p_pid;
td->td_retval[1] = 0;
}
return (error);
}
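/*
 * Example: a userland sketch of the vfork(2) semantics set up above
 * (RFMEM shares the address space, RFPPWAIT suspends the parent until
 * the child execs or exits), using only standard interfaces.
 */
#include <sys/wait.h>
#include <unistd.h>

int
main(void)
{
        pid_t pid;
        int status;

        pid = vfork();
        if (pid == 0) {
                /* Only execve() or _exit() are safe in the vfork child. */
                execl("/bin/echo", "echo", "hello from vfork", (char *)NULL);
                _exit(127);
        }
        waitpid(pid, &status, 0);
        return (0);
}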
int
-rfork(struct thread *td, struct rfork_args *uap)
+sys_rfork(struct thread *td, struct rfork_args *uap)
{
struct proc *p2;
int error;
/* Don't allow kernel-only flags. */
if ((uap->flags & RFKERNELONLY) != 0)
return (EINVAL);
AUDIT_ARG_FFLAGS(uap->flags);
error = fork1(td, uap->flags, 0, &p2, NULL, 0);
if (error == 0) {
td->td_retval[0] = p2 ? p2->p_pid : 0;
td->td_retval[1] = 0;
}
return (error);
}
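/*
 * Example: a userland sketch of rfork(2) using a flag combination that
 * behaves like plain fork(): RFPROC creates a new process and RFFDG
 * copies (rather than shares) the file descriptor table.  Standard
 * <unistd.h> interfaces only.
 */
#include <sys/wait.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        pid_t pid;

        pid = rfork(RFPROC | RFFDG);
        if (pid == 0) {
                printf("child %d\n", (int)getpid());
                _exit(0);
        }
        wait(NULL);
        return (0);
}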
int nprocs = 1; /* process 0 */
int lastpid = 0;
SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0,
"Last used PID");
/*
* Random component to lastpid generation. We mix in a random factor to make
* it a little harder to predict. We sanity check the modulus value to avoid
* doing it in critical paths. Don't let it be too small or we pointlessly
* waste entropy, and don't let it be impossibly large. Using a
* modulus that is too big causes a LOT more process table scans and slows
* down fork processing as the pidchecked caching is defeated.
*/
static int randompid = 0;
static int
sysctl_kern_randompid(SYSCTL_HANDLER_ARGS)
{
int error, pid;
error = sysctl_wire_old_buffer(req, sizeof(int));
if (error != 0)
return(error);
sx_xlock(&allproc_lock);
pid = randompid;
error = sysctl_handle_int(oidp, &pid, 0, req);
if (error == 0 && req->newptr != NULL) {
if (pid < 0 || pid > PID_MAX - 100) /* out of range */
pid = PID_MAX - 100;
else if (pid < 2) /* NOP */
pid = 0;
else if (pid < 100) /* Make it reasonable */
pid = 100;
randompid = pid;
}
sx_xunlock(&allproc_lock);
return (error);
}
SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW,
0, 0, sysctl_kern_randompid, "I", "Random PID modulus");
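/*
 * Example: a sketch of enabling the PID randomization handled above,
 * equivalent to "sysctl kern.randompid=1000" and requiring root.  The
 * handler clamps out-of-range values as shown in sysctl_kern_randompid().
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
        int modulus = 1000;     /* lastpid += arc4random() % 1000 per fork */

        if (sysctlbyname("kern.randompid", NULL, NULL,
            &modulus, sizeof(modulus)) == -1) {
                perror("sysctl kern.randompid");
                return (1);
        }
        return (0);
}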
static int
fork_findpid(int flags)
{
struct proc *p;
int trypid;
static int pidchecked = 0;
/*
* Requires allproc_lock in order to iterate over the list
* of processes, and proctree_lock to access p_pgrp.
*/
sx_assert(&allproc_lock, SX_LOCKED);
sx_assert(&proctree_lock, SX_LOCKED);
/*
* Find an unused process ID. We remember a range of unused IDs
* ready to use (from lastpid+1 through pidchecked-1).
*
* If RFHIGHPID is set (used during system boot), do not allocate
* low-numbered pids.
*/
trypid = lastpid + 1;
if (flags & RFHIGHPID) {
if (trypid < 10)
trypid = 10;
} else {
if (randompid)
trypid += arc4random() % randompid;
}
retry:
/*
* If the process ID prototype has wrapped around,
* restart somewhat above 0, as the low-numbered procs
* tend to include daemons that don't exit.
*/
if (trypid >= PID_MAX) {
trypid = trypid % PID_MAX;
if (trypid < 100)
trypid += 100;
pidchecked = 0;
}
if (trypid >= pidchecked) {
int doingzomb = 0;
pidchecked = PID_MAX;
/*
* Scan the active and zombie procs to check whether this pid
* is in use. Remember the lowest pid that's greater
* than trypid, so we can avoid checking for a while.
*/
p = LIST_FIRST(&allproc);
again:
for (; p != NULL; p = LIST_NEXT(p, p_list)) {
while (p->p_pid == trypid ||
(p->p_pgrp != NULL &&
(p->p_pgrp->pg_id == trypid ||
(p->p_session != NULL &&
p->p_session->s_sid == trypid)))) {
trypid++;
if (trypid >= pidchecked)
goto retry;
}
if (p->p_pid > trypid && pidchecked > p->p_pid)
pidchecked = p->p_pid;
if (p->p_pgrp != NULL) {
if (p->p_pgrp->pg_id > trypid &&
pidchecked > p->p_pgrp->pg_id)
pidchecked = p->p_pgrp->pg_id;
if (p->p_session != NULL &&
p->p_session->s_sid > trypid &&
pidchecked > p->p_session->s_sid)
pidchecked = p->p_session->s_sid;
}
}
if (!doingzomb) {
doingzomb = 1;
p = LIST_FIRST(&zombproc);
goto again;
}
}
/*
* RFHIGHPID does not mess with the lastpid counter during boot.
*/
if (flags & RFHIGHPID)
pidchecked = 0;
else
lastpid = trypid;
return (trypid);
}
static int
fork_norfproc(struct thread *td, int flags)
{
int error;
struct proc *p1;
KASSERT((flags & RFPROC) == 0,
("fork_norfproc called with RFPROC set"));
p1 = td->td_proc;
if (((p1->p_flag & (P_HADTHREADS|P_SYSTEM)) == P_HADTHREADS) &&
(flags & (RFCFDG | RFFDG))) {
PROC_LOCK(p1);
if (thread_single(SINGLE_BOUNDARY)) {
PROC_UNLOCK(p1);
return (ERESTART);
}
PROC_UNLOCK(p1);
}
error = vm_forkproc(td, NULL, NULL, NULL, flags);
if (error)
goto fail;
/*
* Close all file descriptors.
*/
if (flags & RFCFDG) {
struct filedesc *fdtmp;
fdtmp = fdinit(td->td_proc->p_fd);
fdfree(td);
p1->p_fd = fdtmp;
}
/*
* Unshare file descriptors (from parent).
*/
if (flags & RFFDG)
fdunshare(p1, td);
fail:
if (((p1->p_flag & (P_HADTHREADS|P_SYSTEM)) == P_HADTHREADS) &&
(flags & (RFCFDG | RFFDG))) {
PROC_LOCK(p1);
thread_single_end();
PROC_UNLOCK(p1);
}
return (error);
}
static void
do_fork(struct thread *td, int flags, struct proc *p2, struct thread *td2,
struct vmspace *vm2, int pdflags)
{
struct proc *p1, *pptr;
int p2_held, trypid;
struct filedesc *fd;
struct filedesc_to_leader *fdtol;
struct sigacts *newsigacts;
sx_assert(&proctree_lock, SX_SLOCKED);
sx_assert(&allproc_lock, SX_XLOCKED);
p2_held = 0;
p1 = td->td_proc;
/*
* Increment the nprocs resource before blocking can occur. There
* are hard limits on the number of processes that can run.
*/
nprocs++;
trypid = fork_findpid(flags);
sx_sunlock(&proctree_lock);
p2->p_state = PRS_NEW; /* protect against others */
p2->p_pid = trypid;
AUDIT_ARG_PID(p2->p_pid);
LIST_INSERT_HEAD(&allproc, p2, p_list);
LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);
tidhash_add(td2);
PROC_LOCK(p2);
PROC_LOCK(p1);
sx_xunlock(&allproc_lock);
bcopy(&p1->p_startcopy, &p2->p_startcopy,
__rangeof(struct proc, p_startcopy, p_endcopy));
pargs_hold(p2->p_args);
PROC_UNLOCK(p1);
bzero(&p2->p_startzero,
__rangeof(struct proc, p_startzero, p_endzero));
p2->p_ucred = crhold(td->td_ucred);
/* Tell the prison that we exist. */
prison_proc_hold(p2->p_ucred->cr_prison);
PROC_UNLOCK(p2);
/*
* Malloc things while we don't hold any locks.
*/
if (flags & RFSIGSHARE)
newsigacts = NULL;
else
newsigacts = sigacts_alloc();
/*
* Copy filedesc.
*/
if (flags & RFCFDG) {
fd = fdinit(p1->p_fd);
fdtol = NULL;
} else if (flags & RFFDG) {
fd = fdcopy(p1->p_fd);
fdtol = NULL;
} else {
fd = fdshare(p1->p_fd);
if (p1->p_fdtol == NULL)
p1->p_fdtol = filedesc_to_leader_alloc(NULL, NULL,
p1->p_leader);
if ((flags & RFTHREAD) != 0) {
/*
* Shared file descriptor table, and shared
* process leaders.
*/
fdtol = p1->p_fdtol;
FILEDESC_XLOCK(p1->p_fd);
fdtol->fdl_refcount++;
FILEDESC_XUNLOCK(p1->p_fd);
} else {
/*
* Shared file descriptor table, and different
* process leaders.
*/
fdtol = filedesc_to_leader_alloc(p1->p_fdtol,
p1->p_fd, p2);
}
}
/*
* Make a proc table entry for the new process.
* Start by zeroing the section of proc that is zero-initialized,
* then copy the section that is copied directly from the parent.
*/
PROC_LOCK(p2);
PROC_LOCK(p1);
bzero(&td2->td_startzero,
__rangeof(struct thread, td_startzero, td_endzero));
bcopy(&td->td_startcopy, &td2->td_startcopy,
__rangeof(struct thread, td_startcopy, td_endcopy));
bcopy(&p2->p_comm, &td2->td_name, sizeof(td2->td_name));
td2->td_sigstk = td->td_sigstk;
td2->td_sigmask = td->td_sigmask;
td2->td_flags = TDF_INMEM;
td2->td_lend_user_pri = PRI_MAX;
#ifdef VIMAGE
td2->td_vnet = NULL;
td2->td_vnet_lpush = NULL;
#endif
/*
* Allow the scheduler to initialize the child.
*/
thread_lock(td);
sched_fork(td, td2);
thread_unlock(td);
/*
* Duplicate sub-structures as needed.
* Increase reference counts on shared objects.
*/
p2->p_flag = P_INMEM;
p2->p_swtick = ticks;
if (p1->p_flag & P_PROFIL)
startprofclock(p2);
td2->td_ucred = crhold(p2->p_ucred);
if (flags & RFSIGSHARE) {
p2->p_sigacts = sigacts_hold(p1->p_sigacts);
} else {
sigacts_copy(newsigacts, p1->p_sigacts);
p2->p_sigacts = newsigacts;
}
if (flags & RFTSIGZMB)
p2->p_sigparent = RFTSIGNUM(flags);
else if (flags & RFLINUXTHPN)
p2->p_sigparent = SIGUSR1;
else
p2->p_sigparent = SIGCHLD;
p2->p_textvp = p1->p_textvp;
p2->p_fd = fd;
p2->p_fdtol = fdtol;
/*
* p_limit is copy-on-write. Bump its refcount.
*/
lim_fork(p1, p2);
pstats_fork(p1->p_stats, p2->p_stats);
PROC_UNLOCK(p1);
PROC_UNLOCK(p2);
/* Bump references to the text vnode (for procfs). */
if (p2->p_textvp)
vref(p2->p_textvp);
/*
* Set up linkage for kernel based threading.
*/
if ((flags & RFTHREAD) != 0) {
mtx_lock(&ppeers_lock);
p2->p_peers = p1->p_peers;
p1->p_peers = p2;
p2->p_leader = p1->p_leader;
mtx_unlock(&ppeers_lock);
PROC_LOCK(p1->p_leader);
if ((p1->p_leader->p_flag & P_WEXIT) != 0) {
PROC_UNLOCK(p1->p_leader);
/*
* The task leader is exiting, so process p1 is
* going to be killed shortly. Since p1 obviously
* isn't dead yet, we know that the leader is either
* sending SIGKILL's to all the processes in this
* task or is sleeping waiting for all the peers to
* exit. We let p1 complete the fork, but we need
* to go ahead and kill the new process p2 since
* the task leader may not get a chance to send
* SIGKILL to it. We leave it on the list so that
* the task leader will wait for this new process
* to commit suicide.
*/
PROC_LOCK(p2);
- psignal(p2, SIGKILL);
+ kern_psignal(p2, SIGKILL);
PROC_UNLOCK(p2);
} else
PROC_UNLOCK(p1->p_leader);
} else {
p2->p_peers = NULL;
p2->p_leader = p2;
}
sx_xlock(&proctree_lock);
PGRP_LOCK(p1->p_pgrp);
PROC_LOCK(p2);
PROC_LOCK(p1);
/*
* Preserve some more flags in subprocess. P_PROFIL has already
* been preserved.
*/
p2->p_flag |= p1->p_flag & P_SUGID;
td2->td_pflags |= td->td_pflags & TDP_ALTSTACK;
SESS_LOCK(p1->p_session);
if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT)
p2->p_flag |= P_CONTROLT;
SESS_UNLOCK(p1->p_session);
if (flags & RFPPWAIT)
p2->p_flag |= P_PPWAIT;
p2->p_pgrp = p1->p_pgrp;
LIST_INSERT_AFTER(p1, p2, p_pglist);
PGRP_UNLOCK(p1->p_pgrp);
LIST_INIT(&p2->p_children);
callout_init(&p2->p_itcallout, CALLOUT_MPSAFE);
/*
* If PF_FORK is set, the child process inherits the
* procfs ioctl flags from its parent.
*/
if (p1->p_pfsflags & PF_FORK) {
p2->p_stops = p1->p_stops;
p2->p_pfsflags = p1->p_pfsflags;
}
/*
* This begins the section where we must prevent the parent
* from being swapped.
*/
_PHOLD(p1);
PROC_UNLOCK(p1);
/*
* Attach the new process to its parent.
*
* If RFNOWAIT is set, the newly created process becomes a child
* of init. This effectively disassociates the child from the
* parent.
*/
if (flags & RFNOWAIT)
pptr = initproc;
else
pptr = p1;
p2->p_pptr = pptr;
LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
sx_xunlock(&proctree_lock);
/* Inform accounting that we have forked. */
p2->p_acflag = AFORK;
PROC_UNLOCK(p2);
#ifdef KTRACE
ktrprocfork(p1, p2);
#endif
/*
* Finish creating the child process. It will return via a different
* execution path later (i.e., directly into user mode).
*/
vm_forkproc(td, p2, td2, vm2, flags);
if (flags == (RFFDG | RFPROC)) {
PCPU_INC(cnt.v_forks);
PCPU_ADD(cnt.v_forkpages, p2->p_vmspace->vm_dsize +
p2->p_vmspace->vm_ssize);
} else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) {
PCPU_INC(cnt.v_vforks);
PCPU_ADD(cnt.v_vforkpages, p2->p_vmspace->vm_dsize +
p2->p_vmspace->vm_ssize);
} else if (p1 == &proc0) {
PCPU_INC(cnt.v_kthreads);
PCPU_ADD(cnt.v_kthreadpages, p2->p_vmspace->vm_dsize +
p2->p_vmspace->vm_ssize);
} else {
PCPU_INC(cnt.v_rforks);
PCPU_ADD(cnt.v_rforkpages, p2->p_vmspace->vm_dsize +
p2->p_vmspace->vm_ssize);
}
#ifdef PROCDESC
/*
* Associate the process descriptor with the process before anything
* can happen that might cause that process to need the descriptor.
* However, don't do this until after fork(2) can no longer fail.
*/
if (flags & RFPROCDESC)
procdesc_new(p2, pdflags);
#endif
/*
* Both processes are set up, now check if any loadable modules want
* to adjust anything.
*/
EVENTHANDLER_INVOKE(process_fork, p1, p2, flags);
/*
* Set the child start time and mark the process as being complete.
*/
PROC_LOCK(p2);
PROC_LOCK(p1);
microuptime(&p2->p_stats->p_start);
PROC_SLOCK(p2);
p2->p_state = PRS_NORMAL;
PROC_SUNLOCK(p2);
#ifdef KDTRACE_HOOKS
/*
* Tell the DTrace fasttrap provider about the new process
* if it has registered an interest. We have to do this only after
* p_state is PRS_NORMAL since the fasttrap module will use pfind()
* later on.
*/
if (dtrace_fasttrap_fork)
dtrace_fasttrap_fork(p1, p2);
#endif
if ((p1->p_flag & (P_TRACED | P_FOLLOWFORK)) == (P_TRACED |
P_FOLLOWFORK)) {
/*
* Arrange for debugger to receive the fork event.
*
* We can report PL_FLAG_FORKED regardless of
* P_FOLLOWFORK settings, but it does not make sense
* for a runaway child.
*/
td->td_dbgflags |= TDB_FORK;
td->td_dbg_forked = p2->p_pid;
td2->td_dbgflags |= TDB_STOPATFORK;
_PHOLD(p2);
p2_held = 1;
}
PROC_UNLOCK(p2);
if ((flags & RFSTOPPED) == 0) {
/*
* If RFSTOPPED not requested, make child runnable and
* add to run queue.
*/
thread_lock(td2);
TD_SET_CAN_RUN(td2);
sched_add(td2, SRQ_BORING);
thread_unlock(td2);
}
/*
* Now can be swapped.
*/
_PRELE(p1);
PROC_UNLOCK(p1);
/*
* Tell any interested parties about the new process.
*/
knote_fork(&p1->p_klist, p2->p_pid);
SDT_PROBE(proc, kernel, , create, p2, p1, flags, 0, 0);
/*
* Wait until debugger is attached to child.
*/
PROC_LOCK(p2);
while ((td2->td_dbgflags & TDB_STOPATFORK) != 0)
cv_wait(&p2->p_dbgwait, &p2->p_mtx);
if (p2_held)
_PRELE(p2);
/*
* Preserve synchronization semantics of vfork. If waiting for
* child to exec or exit, set P_PPWAIT on child, and sleep on our
* proc (in case of exit).
*/
while (p2->p_flag & P_PPWAIT)
cv_wait(&p2->p_pwait, &p2->p_mtx);
PROC_UNLOCK(p2);
}
int
fork1(struct thread *td, int flags, int pages, struct proc **procp,
int *procdescp, int pdflags)
{
struct proc *p1;
struct proc *newproc;
int ok;
struct thread *td2;
struct vmspace *vm2;
vm_ooffset_t mem_charged;
int error;
static int curfail;
static struct timeval lastfail;
#ifdef PROCDESC
struct file *fp_procdesc = NULL;
#endif
/* Check for the undefined or unimplemented flags. */
if ((flags & ~(RFFLAGS | RFTSIGFLAGS(RFTSIGMASK))) != 0)
return (EINVAL);
/* Signal value requires RFTSIGZMB. */
if ((flags & RFTSIGFLAGS(RFTSIGMASK)) != 0 && (flags & RFTSIGZMB) == 0)
return (EINVAL);
/* Can't copy and clear. */
if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
return (EINVAL);
/* Check the validity of the signal number. */
if ((flags & RFTSIGZMB) != 0 && (u_int)RFTSIGNUM(flags) > _SIG_MAXSIG)
return (EINVAL);
#ifdef PROCDESC
if ((flags & RFPROCDESC) != 0) {
/* Can't get a process descriptor without also creating a process. */
if ((flags & RFPROC) == 0)
return (EINVAL);
/* Must provide a place to put a procdesc if creating one. */
if (procdescp == NULL)
return (EINVAL);
}
#endif
p1 = td->td_proc;
/*
* Here we don't create a new process, but we divorce
* certain parts of a process from itself.
*/
if ((flags & RFPROC) == 0) {
*procp = NULL;
return (fork_norfproc(td, flags));
}
#ifdef RACCT
PROC_LOCK(p1);
error = racct_add(p1, RACCT_NPROC, 1);
PROC_UNLOCK(p1);
if (error != 0)
return (EAGAIN);
#endif
#ifdef PROCDESC
/*
* If required, create a process descriptor in the parent first; we
* will abandon it if something goes wrong. We don't finit() until
* later.
*/
if (flags & RFPROCDESC) {
error = falloc(td, &fp_procdesc, procdescp, 0);
if (error != 0) {
#ifdef RACCT
PROC_LOCK(p1);
racct_sub(p1, RACCT_NPROC, 1);
PROC_UNLOCK(p1);
#endif
return (error);
}
}
#endif
mem_charged = 0;
vm2 = NULL;
if (pages == 0)
pages = KSTACK_PAGES;
/* Allocate new proc. */
newproc = uma_zalloc(proc_zone, M_WAITOK);
td2 = FIRST_THREAD_IN_PROC(newproc);
if (td2 == NULL) {
td2 = thread_alloc(pages);
if (td2 == NULL) {
error = ENOMEM;
goto fail1;
}
proc_linkup(newproc, td2);
} else {
if (td2->td_kstack == 0 || td2->td_kstack_pages != pages) {
if (td2->td_kstack != 0)
vm_thread_dispose(td2);
if (!thread_alloc_stack(td2, pages)) {
error = ENOMEM;
goto fail1;
}
}
}
if ((flags & RFMEM) == 0) {
vm2 = vmspace_fork(p1->p_vmspace, &mem_charged);
if (vm2 == NULL) {
error = ENOMEM;
goto fail1;
}
if (!swap_reserve(mem_charged)) {
/*
* The swap reservation failed. The accounting
* from the entries of the copied vm2 will be
* subtracted in vmspace_free(), so force the
* reservation there.
*/
swap_reserve_force(mem_charged);
error = ENOMEM;
goto fail1;
}
} else
vm2 = NULL;
#ifdef MAC
mac_proc_init(newproc);
#endif
knlist_init_mtx(&newproc->p_klist, &newproc->p_mtx);
STAILQ_INIT(&newproc->p_ktr);
/*
* XXX: This is ugly; when we copy resource usage, we need to bump
* per-cred resource counters.
*/
newproc->p_ucred = p1->p_ucred;
/*
* Initialize resource accounting for the child process.
*/
error = racct_proc_fork(p1, newproc);
if (error != 0) {
error = EAGAIN;
goto fail1;
}
/* We have to lock the process tree while we look for a pid. */
sx_slock(&proctree_lock);
/*
* Although process entries are dynamically created, we still keep
* a global limit on the maximum number we will create. Don't allow
* a nonprivileged user to use the last ten processes; don't let root
* exceed the limit. The variable nprocs is the current number of
* processes, maxproc is the limit.
*/
sx_xlock(&allproc_lock);
if ((nprocs >= maxproc - 10 && priv_check_cred(td->td_ucred,
PRIV_MAXPROC, 0) != 0) || nprocs >= maxproc) {
error = EAGAIN;
goto fail;
}
#ifdef RACCT
/*
* After fork, there is exactly one thread running.
*/
PROC_LOCK(newproc);
error = racct_set(newproc, RACCT_NTHR, 1);
PROC_UNLOCK(newproc);
if (error != 0) {
error = EAGAIN;
goto fail;
}
#endif
/*
* Increment the count of procs running with this uid. Don't allow
* a nonprivileged user to exceed their current limit.
*
* XXXRW: Can we avoid privilege here if it's not needed?
*/
error = priv_check_cred(td->td_ucred, PRIV_PROC_LIMIT, 0);
if (error == 0)
ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, 0);
else {
PROC_LOCK(p1);
ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1,
lim_cur(p1, RLIMIT_NPROC));
PROC_UNLOCK(p1);
}
if (ok) {
do_fork(td, flags, newproc, td2, vm2, pdflags);
/*
* Return child proc pointer to parent.
*/
*procp = newproc;
#ifdef PROCDESC
if (flags & RFPROCDESC)
procdesc_finit(newproc->p_procdesc, fp_procdesc);
#endif
return (0);
}
error = EAGAIN;
fail:
racct_proc_exit(newproc);
sx_sunlock(&proctree_lock);
if (ppsratecheck(&lastfail, &curfail, 1))
printf("maxproc limit exceeded by uid %i, please see tuning(7) and login.conf(5).\n",
td->td_ucred->cr_ruid);
sx_xunlock(&allproc_lock);
#ifdef MAC
mac_proc_destroy(newproc);
#endif
fail1:
if (vm2 != NULL)
vmspace_free(vm2);
uma_zfree(proc_zone, newproc);
#ifdef PROCDESC
if (((flags & RFPROCDESC) != 0) && (fp_procdesc != NULL))
fdrop(fp_procdesc, td);
#endif
pause("fork", hz / 2);
#ifdef RACCT
PROC_LOCK(p1);
racct_sub(p1, RACCT_NPROC, 1);
PROC_UNLOCK(p1);
#endif
return (error);
}
/*
* Handle the return of a child process from fork1(). This function
* is called from the MD fork_trampoline() entry point.
*/
void
fork_exit(void (*callout)(void *, struct trapframe *), void *arg,
struct trapframe *frame)
{
struct proc *p;
struct thread *td;
struct thread *dtd;
td = curthread;
p = td->td_proc;
KASSERT(p->p_state == PRS_NORMAL, ("executing process is still new"));
CTR4(KTR_PROC, "fork_exit: new thread %p (td_sched %p, pid %d, %s)",
td, td->td_sched, p->p_pid, td->td_name);
sched_fork_exit(td);
/*
* Processes normally resume in mi_switch() after being
* cpu_switch()'ed to, but when children start up they arrive here
* instead, so we must do much the same things as mi_switch() would.
*/
if ((dtd = PCPU_GET(deadthread))) {
PCPU_SET(deadthread, NULL);
thread_stash(dtd);
}
thread_unlock(td);
/*
* cpu_set_fork_handler intercepts this function call so that it instead
* calls a non-returning function, keeping the thread in kernel mode.
* initproc has its own fork handler, but that handler does return.
*/
KASSERT(callout != NULL, ("NULL callout in fork_exit"));
callout(arg, frame);
/*
* Check if a kernel thread misbehaved and returned from its main
* function.
*/
if (p->p_flag & P_KTHREAD) {
printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n",
td->td_name, p->p_pid);
kproc_exit(0);
}
mtx_assert(&Giant, MA_NOTOWNED);
if (p->p_sysent->sv_schedtail != NULL)
(p->p_sysent->sv_schedtail)(td);
}
/*
* Simplified back end of syscall(), used when returning from fork()
* directly into user mode. Giant is not held on entry, and must not
* be held on return. This function is passed in to fork_exit() as the
* first parameter and is called when returning to a new userland process.
*/
void
fork_return(struct thread *td, struct trapframe *frame)
{
struct proc *p, *dbg;
if (td->td_dbgflags & TDB_STOPATFORK) {
p = td->td_proc;
sx_xlock(&proctree_lock);
PROC_LOCK(p);
if ((p->p_pptr->p_flag & (P_TRACED | P_FOLLOWFORK)) ==
(P_TRACED | P_FOLLOWFORK)) {
/*
* If debugger still wants auto-attach for the
* parent's children, do it now.
*/
dbg = p->p_pptr->p_pptr;
p->p_flag |= P_TRACED;
p->p_oppid = p->p_pptr->p_pid;
proc_reparent(p, dbg);
sx_xunlock(&proctree_lock);
ptracestop(td, SIGSTOP);
} else {
/*
* ... otherwise clear the request.
*/
sx_xunlock(&proctree_lock);
td->td_dbgflags &= ~TDB_STOPATFORK;
cv_broadcast(&p->p_dbgwait);
}
PROC_UNLOCK(p);
}
userret(td, frame);
#ifdef KTRACE
if (KTRPOINT(td, KTR_SYSRET))
ktrsysret(SYS_fork, 0, 0);
#endif
mtx_assert(&Giant, MA_NOTOWNED);
}
Index: head/sys/kern/kern_jail.c
===================================================================
--- head/sys/kern/kern_jail.c (revision 225616)
+++ head/sys/kern/kern_jail.c (revision 225617)
@@ -1,4480 +1,4480 @@
/*-
* Copyright (c) 1999 Poul-Henning Kamp.
* Copyright (c) 2008 Bjoern A. Zeeb.
* Copyright (c) 2009 James Gritton.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/types.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/sysproto.h>
#include <sys/malloc.h>
#include <sys/osd.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/taskqueue.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/racct.h>
#include <sys/refcount.h>
#include <sys/sx.h>
#include <sys/sysent.h>
#include <sys/namei.h>
#include <sys/mount.h>
#include <sys/queue.h>
#include <sys/socket.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <net/if.h>
#include <net/vnet.h>
#include <netinet/in.h>
#ifdef DDB
#include <ddb/ddb.h>
#ifdef INET6
#include <netinet6/in6_var.h>
#endif /* INET6 */
#endif /* DDB */
#include <security/mac/mac_framework.h>
#define DEFAULT_HOSTUUID "00000000-0000-0000-0000-000000000000"
MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures");
/* Keep struct prison prison0 and some code in kern_jail_set() readable. */
#ifdef INET
#ifdef INET6
#define _PR_IP_SADDRSEL PR_IP4_SADDRSEL|PR_IP6_SADDRSEL
#else
#define _PR_IP_SADDRSEL PR_IP4_SADDRSEL
#endif
#else /* !INET */
#ifdef INET6
#define _PR_IP_SADDRSEL PR_IP6_SADDRSEL
#else
#define _PR_IP_SADDRSEL 0
#endif
#endif
/* prison0 describes what is "real" about the system. */
struct prison prison0 = {
.pr_id = 0,
.pr_name = "0",
.pr_ref = 1,
.pr_uref = 1,
.pr_path = "/",
.pr_securelevel = -1,
.pr_childmax = JAIL_MAX,
.pr_hostuuid = DEFAULT_HOSTUUID,
.pr_children = LIST_HEAD_INITIALIZER(prison0.pr_children),
#ifdef VIMAGE
.pr_flags = PR_HOST|PR_VNET|_PR_IP_SADDRSEL,
#else
.pr_flags = PR_HOST|_PR_IP_SADDRSEL,
#endif
.pr_allow = PR_ALLOW_ALL,
};
MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF);
/* allprison, allprison_racct and lastprid are protected by allprison_lock. */
struct sx allprison_lock;
SX_SYSINIT(allprison_lock, &allprison_lock, "allprison");
struct prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison);
LIST_HEAD(, prison_racct) allprison_racct;
int lastprid = 0;
static int do_jail_attach(struct thread *td, struct prison *pr);
static void prison_complete(void *context, int pending);
static void prison_deref(struct prison *pr, int flags);
static char *prison_path(struct prison *pr1, struct prison *pr2);
static void prison_remove_one(struct prison *pr);
#ifdef RACCT
static void prison_racct_attach(struct prison *pr);
static void prison_racct_detach(struct prison *pr);
#endif
#ifdef INET
static int _prison_check_ip4(struct prison *pr, struct in_addr *ia);
static int prison_restrict_ip4(struct prison *pr, struct in_addr *newip4);
#endif
#ifdef INET6
static int _prison_check_ip6(struct prison *pr, struct in6_addr *ia6);
static int prison_restrict_ip6(struct prison *pr, struct in6_addr *newip6);
#endif
/* Flags for prison_deref */
#define PD_DEREF 0x01
#define PD_DEUREF 0x02
#define PD_LOCKED 0x04
#define PD_LIST_SLOCKED 0x08
#define PD_LIST_XLOCKED 0x10
/*
* Parameter names corresponding to PR_* flag values. Size values are for kvm
* as we cannot figure out the size of a sparse array, or an array without a
* terminating entry.
*/
static char *pr_flag_names[] = {
[0] = "persist",
#ifdef INET
[7] = "ip4.saddrsel",
#endif
#ifdef INET6
[8] = "ip6.saddrsel",
#endif
};
const size_t pr_flag_names_size = sizeof(pr_flag_names);
static char *pr_flag_nonames[] = {
[0] = "nopersist",
#ifdef INET
[7] = "ip4.nosaddrsel",
#endif
#ifdef INET6
[8] = "ip6.nosaddrsel",
#endif
};
const size_t pr_flag_nonames_size = sizeof(pr_flag_nonames);
struct jailsys_flags {
const char *name;
unsigned disable;
unsigned new;
} pr_flag_jailsys[] = {
{ "host", 0, PR_HOST },
#ifdef VIMAGE
{ "vnet", 0, PR_VNET },
#endif
#ifdef INET
{ "ip4", PR_IP4_USER | PR_IP4_DISABLE, PR_IP4_USER },
#endif
#ifdef INET6
{ "ip6", PR_IP6_USER | PR_IP6_DISABLE, PR_IP6_USER },
#endif
};
const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys);
static char *pr_allow_names[] = {
"allow.set_hostname",
"allow.sysvipc",
"allow.raw_sockets",
"allow.chflags",
"allow.mount",
"allow.quotas",
"allow.socket_af",
};
const size_t pr_allow_names_size = sizeof(pr_allow_names);
static char *pr_allow_nonames[] = {
"allow.noset_hostname",
"allow.nosysvipc",
"allow.noraw_sockets",
"allow.nochflags",
"allow.nomount",
"allow.noquotas",
"allow.nosocket_af",
};
const size_t pr_allow_nonames_size = sizeof(pr_allow_nonames);
#define JAIL_DEFAULT_ALLOW PR_ALLOW_SET_HOSTNAME
#define JAIL_DEFAULT_ENFORCE_STATFS 2
static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW;
static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
#if defined(INET) || defined(INET6)
static unsigned jail_max_af_ips = 255;
#endif
#ifdef INET
static int
qcmp_v4(const void *ip1, const void *ip2)
{
in_addr_t iaa, iab;
/*
* We need to compare in host byte order (HBO) here so the list sorts
* the way the rest of the code expects. Sorting network byte order (NBO)
* addresses gives surprising results. If you do not understand, do not try.
*/
iaa = ntohl(((const struct in_addr *)ip1)->s_addr);
iab = ntohl(((const struct in_addr *)ip2)->s_addr);
/*
* Do not simply return the difference of the two numbers, the int is
* not wide enough.
*/
if (iaa > iab)
return (1);
else if (iaa < iab)
return (-1);
else
return (0);
}
#endif
#ifdef INET6
static int
qcmp_v6(const void *ip1, const void *ip2)
{
const struct in6_addr *ia6a, *ia6b;
int i, rc;
ia6a = (const struct in6_addr *)ip1;
ia6b = (const struct in6_addr *)ip2;
rc = 0;
for (i = 0; rc == 0 && i < sizeof(struct in6_addr); i++) {
if (ia6a->s6_addr[i] > ia6b->s6_addr[i])
rc = 1;
else if (ia6a->s6_addr[i] < ia6b->s6_addr[i])
rc = -1;
}
return (rc);
}
#endif
/*
* struct jail_args {
* struct jail *jail;
* };
*/
int
-jail(struct thread *td, struct jail_args *uap)
+sys_jail(struct thread *td, struct jail_args *uap)
{
uint32_t version;
int error;
struct jail j;
error = copyin(uap->jail, &version, sizeof(uint32_t));
if (error)
return (error);
switch (version) {
case 0:
{
struct jail_v0 j0;
/* FreeBSD single IPv4 jails. */
bzero(&j, sizeof(struct jail));
error = copyin(uap->jail, &j0, sizeof(struct jail_v0));
if (error)
return (error);
j.version = j0.version;
j.path = j0.path;
j.hostname = j0.hostname;
j.ip4s = j0.ip_number;
break;
}
case 1:
/*
* Version 1 was used by multi-IPv4 jail implementations
* that never made it into the official kernel.
*/
return (EINVAL);
case 2: /* JAIL_API_VERSION */
/* FreeBSD multi-IPv4/IPv6,noIP jails. */
error = copyin(uap->jail, &j, sizeof(struct jail));
if (error)
return (error);
break;
default:
/* Sci-Fi jails are not supported, sorry. */
return (EINVAL);
}
return (kern_jail(td, &j));
}
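/*
 * Example: a userland sketch of the version-2 jail(2) interface handled
 * above.  It assumes the struct jail layout from <sys/jail.h>
 * (version/path/hostname/jailname/ip4s/ip6s/ip4/ip6) and must run as
 * root; on success the calling process is created in and attached to
 * the new jail, whose ID is returned.
 */
#include <sys/param.h>
#include <sys/jail.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
        struct jail j;
        struct in_addr ip4;
        int jid;

        inet_pton(AF_INET, "192.0.2.1", &ip4);
        memset(&j, 0, sizeof(j));
        j.version = JAIL_API_VERSION;   /* multi-IPv4/IPv6, no-IP jails */
        j.path = "/";
        j.hostname = "demo.example.org";
        j.jailname = "demo";
        j.ip4s = 1;
        j.ip4 = &ip4;

        if ((jid = jail(&j)) == -1) {
                perror("jail");
                return (1);
        }
        printf("attached to jail %d\n", jid);
        return (0);
}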
int
kern_jail(struct thread *td, struct jail *j)
{
struct iovec optiov[2 * (4
+ sizeof(pr_allow_names) / sizeof(pr_allow_names[0])
#ifdef INET
+ 1
#endif
#ifdef INET6
+ 1
#endif
)];
struct uio opt;
char *u_path, *u_hostname, *u_name;
#ifdef INET
uint32_t ip4s;
struct in_addr *u_ip4;
#endif
#ifdef INET6
struct in6_addr *u_ip6;
#endif
size_t tmplen;
int error, enforce_statfs, fi;
bzero(&optiov, sizeof(optiov));
opt.uio_iov = optiov;
opt.uio_iovcnt = 0;
opt.uio_offset = -1;
opt.uio_resid = -1;
opt.uio_segflg = UIO_SYSSPACE;
opt.uio_rw = UIO_READ;
opt.uio_td = td;
/* Set permissions for top-level jails from sysctls. */
if (!jailed(td->td_ucred)) {
for (fi = 0; fi < sizeof(pr_allow_names) /
sizeof(pr_allow_names[0]); fi++) {
optiov[opt.uio_iovcnt].iov_base =
(jail_default_allow & (1 << fi))
? pr_allow_names[fi] : pr_allow_nonames[fi];
optiov[opt.uio_iovcnt].iov_len =
strlen(optiov[opt.uio_iovcnt].iov_base) + 1;
opt.uio_iovcnt += 2;
}
optiov[opt.uio_iovcnt].iov_base = "enforce_statfs";
optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs");
opt.uio_iovcnt++;
enforce_statfs = jail_default_enforce_statfs;
optiov[opt.uio_iovcnt].iov_base = &enforce_statfs;
optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs);
opt.uio_iovcnt++;
}
tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN;
#ifdef INET
ip4s = (j->version == 0) ? 1 : j->ip4s;
if (ip4s > jail_max_af_ips)
return (EINVAL);
tmplen += ip4s * sizeof(struct in_addr);
#else
if (j->ip4s > 0)
return (EINVAL);
#endif
#ifdef INET6
if (j->ip6s > jail_max_af_ips)
return (EINVAL);
tmplen += j->ip6s * sizeof(struct in6_addr);
#else
if (j->ip6s > 0)
return (EINVAL);
#endif
u_path = malloc(tmplen, M_TEMP, M_WAITOK);
u_hostname = u_path + MAXPATHLEN;
u_name = u_hostname + MAXHOSTNAMELEN;
#ifdef INET
u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN);
#endif
#ifdef INET6
#ifdef INET
u_ip6 = (struct in6_addr *)(u_ip4 + ip4s);
#else
u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN);
#endif
#endif
optiov[opt.uio_iovcnt].iov_base = "path";
optiov[opt.uio_iovcnt].iov_len = sizeof("path");
opt.uio_iovcnt++;
optiov[opt.uio_iovcnt].iov_base = u_path;
error = copyinstr(j->path, u_path, MAXPATHLEN,
&optiov[opt.uio_iovcnt].iov_len);
if (error) {
free(u_path, M_TEMP);
return (error);
}
opt.uio_iovcnt++;
optiov[opt.uio_iovcnt].iov_base = "host.hostname";
optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname");
opt.uio_iovcnt++;
optiov[opt.uio_iovcnt].iov_base = u_hostname;
error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN,
&optiov[opt.uio_iovcnt].iov_len);
if (error) {
free(u_path, M_TEMP);
return (error);
}
opt.uio_iovcnt++;
if (j->jailname != NULL) {
optiov[opt.uio_iovcnt].iov_base = "name";
optiov[opt.uio_iovcnt].iov_len = sizeof("name");
opt.uio_iovcnt++;
optiov[opt.uio_iovcnt].iov_base = u_name;
error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN,
&optiov[opt.uio_iovcnt].iov_len);
if (error) {
free(u_path, M_TEMP);
return (error);
}
opt.uio_iovcnt++;
}
#ifdef INET
optiov[opt.uio_iovcnt].iov_base = "ip4.addr";
optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr");
opt.uio_iovcnt++;
optiov[opt.uio_iovcnt].iov_base = u_ip4;
optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr);
if (j->version == 0)
u_ip4->s_addr = j->ip4s;
else {
error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len);
if (error) {
free(u_path, M_TEMP);
return (error);
}
}
opt.uio_iovcnt++;
#endif
#ifdef INET6
optiov[opt.uio_iovcnt].iov_base = "ip6.addr";
optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr");
opt.uio_iovcnt++;
optiov[opt.uio_iovcnt].iov_base = u_ip6;
optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr);
error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len);
if (error) {
free(u_path, M_TEMP);
return (error);
}
opt.uio_iovcnt++;
#endif
KASSERT(opt.uio_iovcnt <= sizeof(optiov) / sizeof(optiov[0]),
("kern_jail: too many iovecs (%d)", opt.uio_iovcnt));
error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH);
free(u_path, M_TEMP);
return (error);
}
/*
* struct jail_set_args {
* struct iovec *iovp;
* unsigned int iovcnt;
* int flags;
* };
*/
int
-jail_set(struct thread *td, struct jail_set_args *uap)
+sys_jail_set(struct thread *td, struct jail_set_args *uap)
{
struct uio *auio;
int error;
/* Check that we have an even number of iovecs. */
if (uap->iovcnt & 1)
return (EINVAL);
error = copyinuio(uap->iovp, uap->iovcnt, &auio);
if (error)
return (error);
error = kern_jail_set(td, auio, uap->flags);
free(auio, M_IOV);
return (error);
}
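/*
 * Example: a userland sketch of the name/value iovec convention that
 * kern_jail_set() parses.  Parameter names are nul-terminated strings,
 * string values include their terminating nul in iov_len, and boolean
 * parameters such as "persist" take a NULL value of zero length.
 * Requires root; JAIL_CREATE returns the new jail ID.
 */
#include <sys/param.h>
#include <sys/jail.h>
#include <sys/uio.h>
#include <stdio.h>

int
main(void)
{
        struct iovec iov[6];
        int jid;

        iov[0].iov_base = "name";       iov[0].iov_len = sizeof("name");
        iov[1].iov_base = "demo";       iov[1].iov_len = sizeof("demo");
        iov[2].iov_base = "path";       iov[2].iov_len = sizeof("path");
        iov[3].iov_base = "/";          iov[3].iov_len = sizeof("/");
        iov[4].iov_base = "persist";    iov[4].iov_len = sizeof("persist");
        iov[5].iov_base = NULL;         iov[5].iov_len = 0;

        if ((jid = jail_set(iov, 6, JAIL_CREATE)) == -1) {
                perror("jail_set");
                return (1);
        }
        printf("created jail %d\n", jid);
        return (0);
}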
int
kern_jail_set(struct thread *td, struct uio *optuio, int flags)
{
struct nameidata nd;
#ifdef INET
struct in_addr *ip4;
#endif
#ifdef INET6
struct in6_addr *ip6;
#endif
struct vfsopt *opt;
struct vfsoptlist *opts;
struct prison *pr, *deadpr, *mypr, *ppr, *tpr;
struct vnode *root;
char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid;
#if defined(INET) || defined(INET6)
struct prison *tppr;
void *op;
#endif
unsigned long hid;
size_t namelen, onamelen;
int created, cuflags, descend, enforce, error, errmsg_len, errmsg_pos;
int gotchildmax, gotenforce, gothid, gotslevel;
int fi, jid, jsys, len, level;
int childmax, slevel, vfslocked;
#if defined(INET) || defined(INET6)
int ii, ij;
#endif
#ifdef INET
int ip4s, redo_ip4;
#endif
#ifdef INET6
int ip6s, redo_ip6;
#endif
uint64_t pr_allow, ch_allow, pr_flags, ch_flags;
unsigned tallow;
char numbuf[12];
error = priv_check(td, PRIV_JAIL_SET);
if (!error && (flags & JAIL_ATTACH))
error = priv_check(td, PRIV_JAIL_ATTACH);
if (error)
return (error);
mypr = ppr = td->td_ucred->cr_prison;
if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0)
return (EPERM);
if (flags & ~JAIL_SET_MASK)
return (EINVAL);
/*
* Check all the parameters before committing to anything. Not all
* errors can be caught early, but we may as well try. Also, this
* takes care of some expensive stuff (path lookup) before getting
* the allprison lock.
*
* XXX Jails are not filesystems, and jail parameters are not mount
* options. But it makes more sense to re-use the vfsopt code
* than duplicate it under a different name.
*/
error = vfs_buildopts(optuio, &opts);
if (error)
return (error);
#ifdef INET
ip4 = NULL;
#endif
#ifdef INET6
ip6 = NULL;
#endif
error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
if (error == ENOENT)
jid = 0;
else if (error != 0)
goto done_free;
error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel));
if (error == ENOENT)
gotslevel = 0;
else if (error != 0)
goto done_free;
else
gotslevel = 1;
error =
vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax));
if (error == ENOENT)
gotchildmax = 0;
else if (error != 0)
goto done_free;
else
gotchildmax = 1;
error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce));
if (error == ENOENT)
gotenforce = 0;
else if (error != 0)
goto done_free;
else if (enforce < 0 || enforce > 2) {
error = EINVAL;
goto done_free;
} else
gotenforce = 1;
pr_flags = ch_flags = 0;
for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
fi++) {
if (pr_flag_names[fi] == NULL)
continue;
vfs_flagopt(opts, pr_flag_names[fi], &pr_flags, 1 << fi);
vfs_flagopt(opts, pr_flag_nonames[fi], &ch_flags, 1 << fi);
}
ch_flags |= pr_flags;
for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
fi++) {
error = vfs_copyopt(opts, pr_flag_jailsys[fi].name, &jsys,
sizeof(jsys));
if (error == ENOENT)
continue;
if (error != 0)
goto done_free;
switch (jsys) {
case JAIL_SYS_DISABLE:
if (!pr_flag_jailsys[fi].disable) {
error = EINVAL;
goto done_free;
}
pr_flags |= pr_flag_jailsys[fi].disable;
break;
case JAIL_SYS_NEW:
pr_flags |= pr_flag_jailsys[fi].new;
break;
case JAIL_SYS_INHERIT:
break;
default:
error = EINVAL;
goto done_free;
}
ch_flags |=
pr_flag_jailsys[fi].new | pr_flag_jailsys[fi].disable;
}
if ((flags & (JAIL_CREATE | JAIL_UPDATE | JAIL_ATTACH)) == JAIL_CREATE
&& !(pr_flags & PR_PERSIST)) {
error = EINVAL;
vfs_opterror(opts, "new jail must persist or attach");
goto done_errmsg;
}
#ifdef VIMAGE
if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) {
error = EINVAL;
vfs_opterror(opts, "vnet cannot be changed after creation");
goto done_errmsg;
}
#endif
#ifdef INET
if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) {
error = EINVAL;
vfs_opterror(opts, "ip4 cannot be changed after creation");
goto done_errmsg;
}
#endif
#ifdef INET6
if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) {
error = EINVAL;
vfs_opterror(opts, "ip6 cannot be changed after creation");
goto done_errmsg;
}
#endif
pr_allow = ch_allow = 0;
for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
fi++) {
vfs_flagopt(opts, pr_allow_names[fi], &pr_allow, 1 << fi);
vfs_flagopt(opts, pr_allow_nonames[fi], &ch_allow, 1 << fi);
}
ch_allow |= pr_allow;
error = vfs_getopt(opts, "name", (void **)&name, &len);
if (error == ENOENT)
name = NULL;
else if (error != 0)
goto done_free;
else {
if (len == 0 || name[len - 1] != '\0') {
error = EINVAL;
goto done_free;
}
if (len > MAXHOSTNAMELEN) {
error = ENAMETOOLONG;
goto done_free;
}
}
error = vfs_getopt(opts, "host.hostname", (void **)&host, &len);
if (error == ENOENT)
host = NULL;
else if (error != 0)
goto done_free;
else {
ch_flags |= PR_HOST;
pr_flags |= PR_HOST;
if (len == 0 || host[len - 1] != '\0') {
error = EINVAL;
goto done_free;
}
if (len > MAXHOSTNAMELEN) {
error = ENAMETOOLONG;
goto done_free;
}
}
error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len);
if (error == ENOENT)
domain = NULL;
else if (error != 0)
goto done_free;
else {
ch_flags |= PR_HOST;
pr_flags |= PR_HOST;
if (len == 0 || domain[len - 1] != '\0') {
error = EINVAL;
goto done_free;
}
if (len > MAXHOSTNAMELEN) {
error = ENAMETOOLONG;
goto done_free;
}
}
error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len);
if (error == ENOENT)
uuid = NULL;
else if (error != 0)
goto done_free;
else {
ch_flags |= PR_HOST;
pr_flags |= PR_HOST;
if (len == 0 || uuid[len - 1] != '\0') {
error = EINVAL;
goto done_free;
}
if (len > HOSTUUIDLEN) {
error = ENAMETOOLONG;
goto done_free;
}
}
#ifdef COMPAT_FREEBSD32
if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
uint32_t hid32;
error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32));
hid = hid32;
} else
#endif
error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid));
if (error == ENOENT)
gothid = 0;
else if (error != 0)
goto done_free;
else {
gothid = 1;
ch_flags |= PR_HOST;
pr_flags |= PR_HOST;
}
#ifdef INET
error = vfs_getopt(opts, "ip4.addr", &op, &ip4s);
if (error == ENOENT)
ip4s = (pr_flags & PR_IP4_DISABLE) ? 0 : -1;
else if (error != 0)
goto done_free;
else if (ip4s & (sizeof(*ip4) - 1)) {
error = EINVAL;
goto done_free;
} else {
ch_flags |= PR_IP4_USER | PR_IP4_DISABLE;
if (ip4s == 0)
pr_flags |= PR_IP4_USER | PR_IP4_DISABLE;
else {
pr_flags = (pr_flags & ~PR_IP4_DISABLE) | PR_IP4_USER;
ip4s /= sizeof(*ip4);
if (ip4s > jail_max_af_ips) {
error = EINVAL;
vfs_opterror(opts, "too many IPv4 addresses");
goto done_errmsg;
}
ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
bcopy(op, ip4, ip4s * sizeof(*ip4));
/*
* All IP addresses except ip[0] are sorted, preserving
* the primary IP address as given from userland.
* This special IP is used for unbound outgoing
* connections as well as for "loopback" traffic in case
* source address selection cannot find a more fitting
* address to connect from.
*/
if (ip4s > 1)
qsort(ip4 + 1, ip4s - 1, sizeof(*ip4), qcmp_v4);
/*
* Check for duplicate addresses and do some simple
* zero and broadcast checks. If users give other bogus
* addresses it is their problem.
*
* We do not have to care about byte order for these
* checks so we will do them in NBO.
*/
for (ii = 0; ii < ip4s; ii++) {
if (ip4[ii].s_addr == INADDR_ANY ||
ip4[ii].s_addr == INADDR_BROADCAST) {
error = EINVAL;
goto done_free;
}
if ((ii+1) < ip4s &&
(ip4[0].s_addr == ip4[ii+1].s_addr ||
ip4[ii].s_addr == ip4[ii+1].s_addr)) {
error = EINVAL;
goto done_free;
}
}
}
}
#endif
#ifdef INET6
error = vfs_getopt(opts, "ip6.addr", &op, &ip6s);
if (error == ENOENT)
ip6s = (pr_flags & PR_IP6_DISABLE) ? 0 : -1;
else if (error != 0)
goto done_free;
else if (ip6s & (sizeof(*ip6) - 1)) {
error = EINVAL;
goto done_free;
} else {
ch_flags |= PR_IP6_USER | PR_IP6_DISABLE;
if (ip6s == 0)
pr_flags |= PR_IP6_USER | PR_IP6_DISABLE;
else {
pr_flags = (pr_flags & ~PR_IP6_DISABLE) | PR_IP6_USER;
ip6s /= sizeof(*ip6);
if (ip6s > jail_max_af_ips) {
error = EINVAL;
vfs_opterror(opts, "too many IPv6 addresses");
goto done_errmsg;
}
ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
bcopy(op, ip6, ip6s * sizeof(*ip6));
if (ip6s > 1)
qsort(ip6 + 1, ip6s - 1, sizeof(*ip6), qcmp_v6);
for (ii = 0; ii < ip6s; ii++) {
if (IN6_IS_ADDR_UNSPECIFIED(&ip6[ii])) {
error = EINVAL;
goto done_free;
}
if ((ii+1) < ip6s &&
(IN6_ARE_ADDR_EQUAL(&ip6[0], &ip6[ii+1]) ||
IN6_ARE_ADDR_EQUAL(&ip6[ii], &ip6[ii+1])))
{
error = EINVAL;
goto done_free;
}
}
}
}
#endif
#if defined(VIMAGE) && (defined(INET) || defined(INET6))
if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
error = EINVAL;
vfs_opterror(opts,
"vnet jails cannot have IP address restrictions");
goto done_errmsg;
}
#endif
root = NULL;
error = vfs_getopt(opts, "path", (void **)&path, &len);
if (error == ENOENT)
path = NULL;
else if (error != 0)
goto done_free;
else {
if (flags & JAIL_UPDATE) {
error = EINVAL;
vfs_opterror(opts,
"path cannot be changed after creation");
goto done_errmsg;
}
if (len == 0 || path[len - 1] != '\0') {
error = EINVAL;
goto done_free;
}
if (len < 2 || (len == 2 && path[0] == '/'))
path = NULL;
else {
/* Leave room for a real-root full pathname. */
if (len + (path[0] == '/' && strcmp(mypr->pr_path, "/")
? strlen(mypr->pr_path) : 0) > MAXPATHLEN) {
error = ENAMETOOLONG;
goto done_free;
}
NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW, UIO_SYSSPACE,
path, td);
error = namei(&nd);
if (error)
goto done_free;
vfslocked = NDHASGIANT(&nd);
root = nd.ni_vp;
NDFREE(&nd, NDF_ONLY_PNBUF);
if (root->v_type != VDIR) {
error = ENOTDIR;
vrele(root);
VFS_UNLOCK_GIANT(vfslocked);
goto done_free;
}
VFS_UNLOCK_GIANT(vfslocked);
}
}
/*
* Grab the allprison lock before letting modules check their
* parameters. Once we have it, do not let go so we'll have a
* consistent view of the OSD list.
*/
sx_xlock(&allprison_lock);
error = osd_jail_call(NULL, PR_METHOD_CHECK, opts);
if (error)
goto done_unlock_list;
/* By now, all parameters should have been noted. */
TAILQ_FOREACH(opt, opts, link) {
if (!opt->seen && strcmp(opt->name, "errmsg")) {
error = EINVAL;
vfs_opterror(opts, "unknown parameter: %s", opt->name);
goto done_unlock_list;
}
}
/*
* See if we are creating a new record or updating an existing one.
* This abuses the file error codes ENOENT and EEXIST.
*/
cuflags = flags & (JAIL_CREATE | JAIL_UPDATE);
if (!cuflags) {
error = EINVAL;
vfs_opterror(opts, "no valid operation (create or update)");
goto done_unlock_list;
}
pr = NULL;
namelc = NULL;
if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) {
namelc = strrchr(name, '.');
jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10);
if (*p != '\0')
jid = 0;
}
if (jid != 0) {
/*
* See if a requested jid already exists. There is an
* information leak here if the jid exists but is not within
* the caller's jail hierarchy. Jail creators will get EEXIST
* even though they cannot see the jail, and CREATE | UPDATE
* will return ENOENT which is not normally a valid error.
*/
if (jid < 0) {
error = EINVAL;
vfs_opterror(opts, "negative jid");
goto done_unlock_list;
}
pr = prison_find(jid);
if (pr != NULL) {
ppr = pr->pr_parent;
/* Create: jid must not exist. */
if (cuflags == JAIL_CREATE) {
mtx_unlock(&pr->pr_mtx);
error = EEXIST;
vfs_opterror(opts, "jail %d already exists",
jid);
goto done_unlock_list;
}
if (!prison_ischild(mypr, pr)) {
mtx_unlock(&pr->pr_mtx);
pr = NULL;
} else if (pr->pr_uref == 0) {
if (!(flags & JAIL_DYING)) {
mtx_unlock(&pr->pr_mtx);
error = ENOENT;
vfs_opterror(opts, "jail %d is dying",
jid);
goto done_unlock_list;
} else if ((flags & JAIL_ATTACH) ||
(pr_flags & PR_PERSIST)) {
/*
* A dying jail might be resurrected
* (via attach or persist), but first
* it must determine if another jail
* has claimed its name. Accomplish
* this by implicitly re-setting the
* name.
*/
if (name == NULL)
name = prison_name(mypr, pr);
}
}
}
if (pr == NULL) {
/* Update: jid must exist. */
if (cuflags == JAIL_UPDATE) {
error = ENOENT;
vfs_opterror(opts, "jail %d not found", jid);
goto done_unlock_list;
}
}
}
/*
* If the caller provided a name, look for a jail by that name.
* This has different semantics for creates and updates keyed by jid
* (where the name must not already exist in a different jail),
* and updates keyed by the name itself (where the name must exist
* because that is the jail being updated).
*/
if (name != NULL) {
namelc = strrchr(name, '.');
if (namelc == NULL)
namelc = name;
else {
/*
* This is a hierarchical name. Split it into the
* parent and child names, and make sure the parent
* exists or matches an already found jail.
*/
*namelc = '\0';
if (pr != NULL) {
if (strncmp(name, ppr->pr_name, namelc - name)
|| ppr->pr_name[namelc - name] != '\0') {
mtx_unlock(&pr->pr_mtx);
error = EINVAL;
vfs_opterror(opts,
"cannot change jail's parent");
goto done_unlock_list;
}
} else {
ppr = prison_find_name(mypr, name);
if (ppr == NULL) {
error = ENOENT;
vfs_opterror(opts,
"jail \"%s\" not found", name);
goto done_unlock_list;
}
mtx_unlock(&ppr->pr_mtx);
}
name = ++namelc;
}
if (name[0] != '\0') {
namelen =
(ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
name_again:
deadpr = NULL;
FOREACH_PRISON_CHILD(ppr, tpr) {
if (tpr != pr && tpr->pr_ref > 0 &&
!strcmp(tpr->pr_name + namelen, name)) {
if (pr == NULL &&
cuflags != JAIL_CREATE) {
mtx_lock(&tpr->pr_mtx);
if (tpr->pr_ref > 0) {
/*
* Use this jail
* for updates.
*/
if (tpr->pr_uref > 0) {
pr = tpr;
break;
}
deadpr = tpr;
}
mtx_unlock(&tpr->pr_mtx);
} else if (tpr->pr_uref > 0) {
/*
* Create, or update(jid):
* name must not exist in an
* active sibling jail.
*/
error = EEXIST;
if (pr != NULL)
mtx_unlock(&pr->pr_mtx);
vfs_opterror(opts,
"jail \"%s\" already exists",
name);
goto done_unlock_list;
}
}
}
/* If no active jail is found, use a dying one. */
if (deadpr != NULL && pr == NULL) {
if (flags & JAIL_DYING) {
mtx_lock(&deadpr->pr_mtx);
if (deadpr->pr_ref == 0) {
mtx_unlock(&deadpr->pr_mtx);
goto name_again;
}
pr = deadpr;
} else if (cuflags == JAIL_UPDATE) {
error = ENOENT;
vfs_opterror(opts,
"jail \"%s\" is dying", name);
goto done_unlock_list;
}
}
/* Update: name must exist if no jid. */
else if (cuflags == JAIL_UPDATE && pr == NULL) {
error = ENOENT;
vfs_opterror(opts, "jail \"%s\" not found",
name);
goto done_unlock_list;
}
}
}
/* Update: must provide a jid or name. */
else if (cuflags == JAIL_UPDATE && pr == NULL) {
error = ENOENT;
vfs_opterror(opts, "update specified no jail");
goto done_unlock_list;
}
/* If there's no prison to update, create a new one and link it in. */
if (pr == NULL) {
for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent)
if (tpr->pr_childcount >= tpr->pr_childmax) {
error = EPERM;
vfs_opterror(opts, "prison limit exceeded");
goto done_unlock_list;
}
created = 1;
mtx_lock(&ppr->pr_mtx);
if (ppr->pr_ref == 0 || (ppr->pr_flags & PR_REMOVE)) {
mtx_unlock(&ppr->pr_mtx);
error = ENOENT;
vfs_opterror(opts, "parent jail went away!");
goto done_unlock_list;
}
ppr->pr_ref++;
ppr->pr_uref++;
mtx_unlock(&ppr->pr_mtx);
pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
if (jid == 0) {
/* Find the next free jid. */
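/*
* Walk the sorted allprison list starting at lastprid + 1, looking
* for a gap (or a slot held only by a dead prison); wrap around at
* JAIL_MAX and give up with EAGAIN once the search comes back to
* lastprid.
*/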
jid = lastprid + 1;
findnext:
if (jid == JAIL_MAX)
jid = 1;
TAILQ_FOREACH(tpr, &allprison, pr_list) {
if (tpr->pr_id < jid)
continue;
if (tpr->pr_id > jid || tpr->pr_ref == 0) {
TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
break;
}
if (jid == lastprid) {
error = EAGAIN;
vfs_opterror(opts,
"no available jail IDs");
free(pr, M_PRISON);
prison_deref(ppr, PD_DEREF |
PD_DEUREF | PD_LIST_XLOCKED);
goto done_releroot;
}
jid++;
goto findnext;
}
lastprid = jid;
} else {
/*
* A specific jid was requested and verified above not to
* exist yet, so just find where to insert it.
*/
TAILQ_FOREACH(tpr, &allprison, pr_list)
if (tpr->pr_id >= jid) {
TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
break;
}
}
if (tpr == NULL)
TAILQ_INSERT_TAIL(&allprison, pr, pr_list);
LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling);
for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
tpr->pr_childcount++;
pr->pr_parent = ppr;
pr->pr_id = jid;
/* Set some default values, and inherit some from the parent. */
if (name == NULL)
name = "";
if (path == NULL) {
path = "/";
root = mypr->pr_root;
vref(root);
}
strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN);
pr->pr_flags |= PR_HOST;
#if defined(INET) || defined(INET6)
#ifdef VIMAGE
if (!(pr_flags & PR_VNET))
#endif
{
#ifdef INET
if (!(ch_flags & PR_IP4_USER))
pr->pr_flags |=
PR_IP4 | PR_IP4_USER | PR_IP4_DISABLE;
else if (!(pr_flags & PR_IP4_USER)) {
pr->pr_flags |= ppr->pr_flags & PR_IP4;
if (ppr->pr_ip4 != NULL) {
pr->pr_ip4s = ppr->pr_ip4s;
pr->pr_ip4 = malloc(pr->pr_ip4s *
sizeof(struct in_addr), M_PRISON,
M_WAITOK);
bcopy(ppr->pr_ip4, pr->pr_ip4,
pr->pr_ip4s * sizeof(*pr->pr_ip4));
}
}
#endif
#ifdef INET6
if (!(ch_flags & PR_IP6_USER))
pr->pr_flags |=
PR_IP6 | PR_IP6_USER | PR_IP6_DISABLE;
else if (!(pr_flags & PR_IP6_USER)) {
pr->pr_flags |= ppr->pr_flags & PR_IP6;
if (ppr->pr_ip6 != NULL) {
pr->pr_ip6s = ppr->pr_ip6s;
pr->pr_ip6 = malloc(pr->pr_ip6s *
sizeof(struct in6_addr), M_PRISON,
M_WAITOK);
bcopy(ppr->pr_ip6, pr->pr_ip6,
pr->pr_ip6s * sizeof(*pr->pr_ip6));
}
}
#endif
}
#endif
/* Source address selection is always on by default. */
pr->pr_flags |= _PR_IP_SADDRSEL;
pr->pr_securelevel = ppr->pr_securelevel;
pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow;
pr->pr_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
LIST_INIT(&pr->pr_children);
mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK);
#ifdef VIMAGE
/* Allocate a new vnet if specified. */
pr->pr_vnet = (pr_flags & PR_VNET)
? vnet_alloc() : ppr->pr_vnet;
#endif
/*
* Allocate a dedicated cpuset for each jail.
* Unlike other initial settings, this may return an error.
*/
error = cpuset_create_root(ppr, &pr->pr_cpuset);
if (error) {
prison_deref(pr, PD_LIST_XLOCKED);
goto done_releroot;
}
mtx_lock(&pr->pr_mtx);
/*
* New prisons do not yet have a reference, because we do not
* want others to see the incomplete prison once the
* allprison_lock is downgraded.
*/
} else {
created = 0;
/*
* Grab a reference for existing prisons, to ensure they
* continue to exist for the duration of the call.
*/
pr->pr_ref++;
#if defined(VIMAGE) && (defined(INET) || defined(INET6))
if ((pr->pr_flags & PR_VNET) &&
(ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
error = EINVAL;
vfs_opterror(opts,
"vnet jails cannot have IP address restrictions");
goto done_deref_locked;
}
#endif
#ifdef INET
if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
error = EINVAL;
vfs_opterror(opts,
"ip4 cannot be changed after creation");
goto done_deref_locked;
}
#endif
#ifdef INET6
if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
error = EINVAL;
vfs_opterror(opts,
"ip6 cannot be changed after creation");
goto done_deref_locked;
}
#endif
}
/* Do final error checking before setting anything. */
if (gotslevel) {
if (slevel < ppr->pr_securelevel) {
error = EPERM;
goto done_deref_locked;
}
}
if (gotchildmax) {
if (childmax >= ppr->pr_childmax) {
error = EPERM;
goto done_deref_locked;
}
}
if (gotenforce) {
if (enforce < ppr->pr_enforce_statfs) {
error = EPERM;
goto done_deref_locked;
}
}
#ifdef INET
if (ip4s > 0) {
if (ppr->pr_flags & PR_IP4) {
/*
* Make sure the new set of IP addresses is a
* subset of the parent's list. Don't worry
* about the parent being unlocked, as any
* setting is done with allprison_lock held.
*/
for (ij = 0; ij < ppr->pr_ip4s; ij++)
if (ip4[0].s_addr == ppr->pr_ip4[ij].s_addr)
break;
if (ij == ppr->pr_ip4s) {
error = EPERM;
goto done_deref_locked;
}
if (ip4s > 1) {
for (ii = ij = 1; ii < ip4s; ii++) {
if (ip4[ii].s_addr ==
ppr->pr_ip4[0].s_addr)
continue;
for (; ij < ppr->pr_ip4s; ij++)
if (ip4[ii].s_addr ==
ppr->pr_ip4[ij].s_addr)
break;
if (ij == ppr->pr_ip4s)
break;
}
if (ij == ppr->pr_ip4s) {
error = EPERM;
goto done_deref_locked;
}
}
}
/*
* Check for conflicting IP addresses. We permit them
* if there is no more than one IP on each jail. If
* there is a duplicate on a jail with more than one
* IP, stop checking and return an error.
*/
tppr = ppr;
#ifdef VIMAGE
for (; tppr != &prison0; tppr = tppr->pr_parent)
if (tppr->pr_flags & PR_VNET)
break;
#endif
FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
if (tpr == pr ||
#ifdef VIMAGE
(tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
#endif
tpr->pr_uref == 0) {
descend = 0;
continue;
}
if (!(tpr->pr_flags & PR_IP4_USER))
continue;
descend = 0;
if (tpr->pr_ip4 == NULL ||
(ip4s == 1 && tpr->pr_ip4s == 1))
continue;
for (ii = 0; ii < ip4s; ii++) {
if (_prison_check_ip4(tpr, &ip4[ii]) == 0) {
error = EADDRINUSE;
vfs_opterror(opts,
"IPv4 addresses clash");
goto done_deref_locked;
}
}
}
}
#endif
#ifdef INET6
if (ip6s > 0) {
if (ppr->pr_flags & PR_IP6) {
/*
* Make sure the new set of IP addresses is a
* subset of the parent's list.
*/
for (ij = 0; ij < ppr->pr_ip6s; ij++)
if (IN6_ARE_ADDR_EQUAL(&ip6[0],
&ppr->pr_ip6[ij]))
break;
if (ij == ppr->pr_ip6s) {
error = EPERM;
goto done_deref_locked;
}
if (ip6s > 1) {
for (ii = ij = 1; ii < ip6s; ii++) {
if (IN6_ARE_ADDR_EQUAL(&ip6[ii],
&ppr->pr_ip6[0]))
continue;
for (; ij < ppr->pr_ip6s; ij++)
if (IN6_ARE_ADDR_EQUAL(
&ip6[ii], &ppr->pr_ip6[ij]))
break;
if (ij == ppr->pr_ip6s)
break;
}
if (ij == ppr->pr_ip6s) {
error = EPERM;
goto done_deref_locked;
}
}
}
/* Check for conflicting IP addresses. */
tppr = ppr;
#ifdef VIMAGE
for (; tppr != &prison0; tppr = tppr->pr_parent)
if (tppr->pr_flags & PR_VNET)
break;
#endif
FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
if (tpr == pr ||
#ifdef VIMAGE
(tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
#endif
tpr->pr_uref == 0) {
descend = 0;
continue;
}
if (!(tpr->pr_flags & PR_IP6_USER))
continue;
descend = 0;
if (tpr->pr_ip6 == NULL ||
(ip6s == 1 && tpr->pr_ip6s == 1))
continue;
for (ii = 0; ii < ip6s; ii++) {
if (_prison_check_ip6(tpr, &ip6[ii]) == 0) {
error = EADDRINUSE;
vfs_opterror(opts,
"IPv6 addresses clash");
goto done_deref_locked;
}
}
}
}
#endif
onamelen = namelen = 0;
if (name != NULL) {
/* Give a default name of the jid. */
if (name[0] == '\0')
snprintf(name = numbuf, sizeof(numbuf), "%d", jid);
else if (*namelc == '0' || (strtoul(namelc, &p, 10) != jid &&
*p == '\0')) {
error = EINVAL;
vfs_opterror(opts,
"name cannot be numeric (unless it is the jid)");
goto done_deref_locked;
}
/*
* Make sure the name isn't too long for the prison or its
* children.
*/
onamelen = strlen(pr->pr_name);
namelen = strlen(name);
if (strlen(ppr->pr_name) + namelen + 2 > sizeof(pr->pr_name)) {
error = ENAMETOOLONG;
goto done_deref_locked;
}
FOREACH_PRISON_DESCENDANT(pr, tpr, descend) {
if (strlen(tpr->pr_name) + (namelen - onamelen) >=
sizeof(pr->pr_name)) {
error = ENAMETOOLONG;
goto done_deref_locked;
}
}
}
if (pr_allow & ~ppr->pr_allow) {
error = EPERM;
goto done_deref_locked;
}
/* Set the parameters of the prison. */
#ifdef INET
redo_ip4 = 0;
if (pr_flags & PR_IP4_USER) {
pr->pr_flags |= PR_IP4;
free(pr->pr_ip4, M_PRISON);
pr->pr_ip4s = ip4s;
pr->pr_ip4 = ip4;
ip4 = NULL;
FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
#ifdef VIMAGE
if (tpr->pr_flags & PR_VNET) {
descend = 0;
continue;
}
#endif
if (prison_restrict_ip4(tpr, NULL)) {
redo_ip4 = 1;
descend = 0;
}
}
}
#endif
#ifdef INET6
redo_ip6 = 0;
if (pr_flags & PR_IP6_USER) {
pr->pr_flags |= PR_IP6;
free(pr->pr_ip6, M_PRISON);
pr->pr_ip6s = ip6s;
pr->pr_ip6 = ip6;
ip6 = NULL;
FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
#ifdef VIMAGE
if (tpr->pr_flags & PR_VNET) {
descend = 0;
continue;
}
#endif
if (prison_restrict_ip6(tpr, NULL)) {
redo_ip6 = 1;
descend = 0;
}
}
}
#endif
if (gotslevel) {
pr->pr_securelevel = slevel;
/* Set all child jails to be at least this level. */
FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
if (tpr->pr_securelevel < slevel)
tpr->pr_securelevel = slevel;
}
if (gotchildmax) {
pr->pr_childmax = childmax;
/* Set all child jails to under this limit. */
FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level)
if (tpr->pr_childmax > childmax - level)
tpr->pr_childmax = childmax > level
? childmax - level : 0;
}
if (gotenforce) {
pr->pr_enforce_statfs = enforce;
/* Pass this restriction on to the children. */
FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
if (tpr->pr_enforce_statfs < enforce)
tpr->pr_enforce_statfs = enforce;
}
if (name != NULL) {
if (ppr == &prison0)
strlcpy(pr->pr_name, name, sizeof(pr->pr_name));
else
snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s",
ppr->pr_name, name);
/* Change this component of child names. */
FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen,
strlen(tpr->pr_name + onamelen) + 1);
bcopy(pr->pr_name, tpr->pr_name, namelen);
}
}
if (path != NULL) {
/* Try to keep a real-rooted full pathname. */
if (path[0] == '/' && strcmp(mypr->pr_path, "/"))
snprintf(pr->pr_path, sizeof(pr->pr_path), "%s%s",
mypr->pr_path, path);
else
strlcpy(pr->pr_path, path, sizeof(pr->pr_path));
pr->pr_root = root;
}
if (PR_HOST & ch_flags & ~pr_flags) {
if (pr->pr_flags & PR_HOST) {
/*
* Copy the parent's host info. As with pr_ip4 above,
* the lack of a lock on the parent is not a problem;
* it is always set with allprison_lock at least
* shared, and is held exclusively here.
*/
strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname,
sizeof(pr->pr_hostname));
strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname,
sizeof(pr->pr_domainname));
strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid,
sizeof(pr->pr_hostuuid));
pr->pr_hostid = pr->pr_parent->pr_hostid;
}
} else if (host != NULL || domain != NULL || uuid != NULL || gothid) {
/* Set this prison, and any descendants without PR_HOST. */
if (host != NULL)
strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname));
if (domain != NULL)
strlcpy(pr->pr_domainname, domain,
sizeof(pr->pr_domainname));
if (uuid != NULL)
strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid));
if (gothid)
pr->pr_hostid = hid;
FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
if (tpr->pr_flags & PR_HOST)
descend = 0;
else {
if (host != NULL)
strlcpy(tpr->pr_hostname,
pr->pr_hostname,
sizeof(tpr->pr_hostname));
if (domain != NULL)
strlcpy(tpr->pr_domainname,
pr->pr_domainname,
sizeof(tpr->pr_domainname));
if (uuid != NULL)
strlcpy(tpr->pr_hostuuid,
pr->pr_hostuuid,
sizeof(tpr->pr_hostuuid));
if (gothid)
tpr->pr_hostid = hid;
}
}
}
if ((tallow = ch_allow & ~pr_allow)) {
/* Clear allow bits in all children. */
FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
tpr->pr_allow &= ~tallow;
}
pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow;
/*
* Persistent prisons get an extra reference, and prisons losing their
* persist flag lose that reference. Only do this for existing prisons
* for now, so new ones will remain unseen until after the module
* handlers have completed.
*/
if (!created && (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags))) {
if (pr_flags & PR_PERSIST) {
pr->pr_ref++;
pr->pr_uref++;
} else {
pr->pr_ref--;
pr->pr_uref--;
}
}
pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags;
mtx_unlock(&pr->pr_mtx);
#ifdef RACCT
if (created)
prison_racct_attach(pr);
#endif
/*
* Locks may have prevented a complete restriction of child IP
* addresses. If so, allocate some more memory and try again.
*/
#ifdef INET
while (redo_ip4) {
ip4s = pr->pr_ip4s;
ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
mtx_lock(&pr->pr_mtx);
redo_ip4 = 0;
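/*
* prison_restrict_ip4() consumes the preallocated buffer the first
* time it needs one; if a later descendant also needs a replacement
* list, redo_ip4 is set and the outer loop allocates another buffer.
*/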
FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
#ifdef VIMAGE
if (tpr->pr_flags & PR_VNET) {
descend = 0;
continue;
}
#endif
if (prison_restrict_ip4(tpr, ip4)) {
if (ip4 != NULL)
ip4 = NULL;
else
redo_ip4 = 1;
}
}
mtx_unlock(&pr->pr_mtx);
}
#endif
#ifdef INET6
while (redo_ip6) {
ip6s = pr->pr_ip6s;
ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
mtx_lock(&pr->pr_mtx);
redo_ip6 = 0;
FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
#ifdef VIMAGE
if (tpr->pr_flags & PR_VNET) {
descend = 0;
continue;
}
#endif
if (prison_restrict_ip6(tpr, ip6)) {
if (ip6 != NULL)
ip6 = NULL;
else
redo_ip6 = 1;
}
}
mtx_unlock(&pr->pr_mtx);
}
#endif
/* Let the modules do their work. */
sx_downgrade(&allprison_lock);
if (created) {
error = osd_jail_call(pr, PR_METHOD_CREATE, opts);
if (error) {
prison_deref(pr, PD_LIST_SLOCKED);
goto done_errmsg;
}
}
error = osd_jail_call(pr, PR_METHOD_SET, opts);
if (error) {
prison_deref(pr, created
? PD_LIST_SLOCKED
: PD_DEREF | PD_LIST_SLOCKED);
goto done_errmsg;
}
/* Attach this process to the prison if requested. */
if (flags & JAIL_ATTACH) {
mtx_lock(&pr->pr_mtx);
error = do_jail_attach(td, pr);
if (error) {
vfs_opterror(opts, "attach failed");
if (!created)
prison_deref(pr, PD_DEREF);
goto done_errmsg;
}
}
/*
* Now that it is all there, drop the temporary reference from existing
* prisons. Or add a reference to newly created persistent prisons
* (which was not done earlier so that the prison would not be publicly
* visible).
*/
if (!created) {
prison_deref(pr, (flags & JAIL_ATTACH)
? PD_DEREF
: PD_DEREF | PD_LIST_SLOCKED);
} else {
if (pr_flags & PR_PERSIST) {
mtx_lock(&pr->pr_mtx);
pr->pr_ref++;
pr->pr_uref++;
mtx_unlock(&pr->pr_mtx);
}
if (!(flags & JAIL_ATTACH))
sx_sunlock(&allprison_lock);
}
td->td_retval[0] = pr->pr_id;
goto done_errmsg;
done_deref_locked:
prison_deref(pr, created
? PD_LOCKED | PD_LIST_XLOCKED
: PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
goto done_releroot;
done_unlock_list:
sx_xunlock(&allprison_lock);
done_releroot:
if (root != NULL) {
vfslocked = VFS_LOCK_GIANT(root->v_mount);
vrele(root);
VFS_UNLOCK_GIANT(vfslocked);
}
done_errmsg:
if (error) {
vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
if (errmsg_len > 0) {
errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1;
if (errmsg_pos > 0) {
if (optuio->uio_segflg == UIO_SYSSPACE)
bcopy(errmsg,
optuio->uio_iov[errmsg_pos].iov_base,
errmsg_len);
else
copyout(errmsg,
optuio->uio_iov[errmsg_pos].iov_base,
errmsg_len);
}
}
}
done_free:
#ifdef INET
free(ip4, M_PRISON);
#endif
#ifdef INET6
free(ip6, M_PRISON);
#endif
vfs_freeopts(opts);
return (error);
}
/*
* struct jail_get_args {
* struct iovec *iovp;
* unsigned int iovcnt;
* int flags;
* };
*/
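/*
* Illustrative sketch only (not part of this file; buffer name and
* size are arbitrary, and casts from string literals are omitted for
* brevity): a userspace caller asking for a jail's name by jid would
* pass name/value iovec pairs, e.g.
*
*	int jid = 1;
*	char namebuf[256];
*	struct iovec iov[4] = {
*		{ "jid", sizeof("jid") },	{ &jid, sizeof(jid) },
*		{ "name", sizeof("name") },	{ namebuf, sizeof(namebuf) },
*	};
*	(void)jail_get(iov, 4, 0);
*
* Even-indexed iovecs name parameters and the following odd-indexed
* iovecs hold their value buffers, which is why iovcnt must be even
* and why values land at index 2 * pos + 1 below.
*/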
int
-jail_get(struct thread *td, struct jail_get_args *uap)
+sys_jail_get(struct thread *td, struct jail_get_args *uap)
{
struct uio *auio;
int error;
/* Check that we have an even number of iovecs. */
if (uap->iovcnt & 1)
return (EINVAL);
error = copyinuio(uap->iovp, uap->iovcnt, &auio);
if (error)
return (error);
error = kern_jail_get(td, auio, uap->flags);
if (error == 0)
error = copyout(auio->uio_iov, uap->iovp,
uap->iovcnt * sizeof (struct iovec));
free(auio, M_IOV);
return (error);
}
int
kern_jail_get(struct thread *td, struct uio *optuio, int flags)
{
struct prison *pr, *mypr;
struct vfsopt *opt;
struct vfsoptlist *opts;
char *errmsg, *name;
int error, errmsg_len, errmsg_pos, fi, i, jid, len, locked, pos;
if (flags & ~JAIL_GET_MASK)
return (EINVAL);
/* Get the parameter list. */
error = vfs_buildopts(optuio, &opts);
if (error)
return (error);
errmsg_pos = vfs_getopt_pos(opts, "errmsg");
mypr = td->td_ucred->cr_prison;
/*
* Find the prison specified by one of: lastjid, jid, name.
*/
sx_slock(&allprison_lock);
error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid));
if (error == 0) {
TAILQ_FOREACH(pr, &allprison, pr_list) {
if (pr->pr_id > jid && prison_ischild(mypr, pr)) {
mtx_lock(&pr->pr_mtx);
if (pr->pr_ref > 0 &&
(pr->pr_uref > 0 || (flags & JAIL_DYING)))
break;
mtx_unlock(&pr->pr_mtx);
}
}
if (pr != NULL)
goto found_prison;
error = ENOENT;
vfs_opterror(opts, "no jail after %d", jid);
goto done_unlock_list;
} else if (error != ENOENT)
goto done_unlock_list;
error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
if (error == 0) {
if (jid != 0) {
pr = prison_find_child(mypr, jid);
if (pr != NULL) {
if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
mtx_unlock(&pr->pr_mtx);
error = ENOENT;
vfs_opterror(opts, "jail %d is dying",
jid);
goto done_unlock_list;
}
goto found_prison;
}
error = ENOENT;
vfs_opterror(opts, "jail %d not found", jid);
goto done_unlock_list;
}
} else if (error != ENOENT)
goto done_unlock_list;
error = vfs_getopt(opts, "name", (void **)&name, &len);
if (error == 0) {
if (len == 0 || name[len - 1] != '\0') {
error = EINVAL;
goto done_unlock_list;
}
pr = prison_find_name(mypr, name);
if (pr != NULL) {
if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
mtx_unlock(&pr->pr_mtx);
error = ENOENT;
vfs_opterror(opts, "jail \"%s\" is dying",
name);
goto done_unlock_list;
}
goto found_prison;
}
error = ENOENT;
vfs_opterror(opts, "jail \"%s\" not found", name);
goto done_unlock_list;
} else if (error != ENOENT)
goto done_unlock_list;
vfs_opterror(opts, "no jail specified");
error = ENOENT;
goto done_unlock_list;
found_prison:
/* Get the parameters of the prison. */
pr->pr_ref++;
locked = PD_LOCKED;
td->td_retval[0] = pr->pr_id;
error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id));
if (error != 0 && error != ENOENT)
goto done_deref;
i = (pr->pr_parent == mypr) ? 0 : pr->pr_parent->pr_id;
error = vfs_setopt(opts, "parent", &i, sizeof(i));
if (error != 0 && error != ENOENT)
goto done_deref;
error = vfs_setopts(opts, "name", prison_name(mypr, pr));
if (error != 0 && error != ENOENT)
goto done_deref;
error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id,
sizeof(pr->pr_cpuset->cs_id));
if (error != 0 && error != ENOENT)
goto done_deref;
error = vfs_setopts(opts, "path", prison_path(mypr, pr));
if (error != 0 && error != ENOENT)
goto done_deref;
#ifdef INET
error = vfs_setopt_part(opts, "ip4.addr", pr->pr_ip4,
pr->pr_ip4s * sizeof(*pr->pr_ip4));
if (error != 0 && error != ENOENT)
goto done_deref;
#endif
#ifdef INET6
error = vfs_setopt_part(opts, "ip6.addr", pr->pr_ip6,
pr->pr_ip6s * sizeof(*pr->pr_ip6));
if (error != 0 && error != ENOENT)
goto done_deref;
#endif
error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel,
sizeof(pr->pr_securelevel));
if (error != 0 && error != ENOENT)
goto done_deref;
error = vfs_setopt(opts, "children.cur", &pr->pr_childcount,
sizeof(pr->pr_childcount));
if (error != 0 && error != ENOENT)
goto done_deref;
error = vfs_setopt(opts, "children.max", &pr->pr_childmax,
sizeof(pr->pr_childmax));
if (error != 0 && error != ENOENT)
goto done_deref;
error = vfs_setopts(opts, "host.hostname", pr->pr_hostname);
if (error != 0 && error != ENOENT)
goto done_deref;
error = vfs_setopts(opts, "host.domainname", pr->pr_domainname);
if (error != 0 && error != ENOENT)
goto done_deref;
error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid);
if (error != 0 && error != ENOENT)
goto done_deref;
#ifdef COMPAT_FREEBSD32
if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
uint32_t hid32 = pr->pr_hostid;
error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32));
} else
#endif
error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid,
sizeof(pr->pr_hostid));
if (error != 0 && error != ENOENT)
goto done_deref;
error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs,
sizeof(pr->pr_enforce_statfs));
if (error != 0 && error != ENOENT)
goto done_deref;
for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
fi++) {
if (pr_flag_names[fi] == NULL)
continue;
i = (pr->pr_flags & (1 << fi)) ? 1 : 0;
error = vfs_setopt(opts, pr_flag_names[fi], &i, sizeof(i));
if (error != 0 && error != ENOENT)
goto done_deref;
i = !i;
error = vfs_setopt(opts, pr_flag_nonames[fi], &i, sizeof(i));
if (error != 0 && error != ENOENT)
goto done_deref;
}
for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
fi++) {
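/*
* Map the disable/new flag bits of each jailsys-style parameter
* back to JAIL_SYS_DISABLE, JAIL_SYS_NEW, or JAIL_SYS_INHERIT for
* the caller.
*/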
i = pr->pr_flags &
(pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new);
i = pr_flag_jailsys[fi].disable &&
(i == pr_flag_jailsys[fi].disable) ? JAIL_SYS_DISABLE
: (i == pr_flag_jailsys[fi].new) ? JAIL_SYS_NEW
: JAIL_SYS_INHERIT;
error =
vfs_setopt(opts, pr_flag_jailsys[fi].name, &i, sizeof(i));
if (error != 0 && error != ENOENT)
goto done_deref;
}
for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
fi++) {
if (pr_allow_names[fi] == NULL)
continue;
i = (pr->pr_allow & (1 << fi)) ? 1 : 0;
error = vfs_setopt(opts, pr_allow_names[fi], &i, sizeof(i));
if (error != 0 && error != ENOENT)
goto done_deref;
i = !i;
error = vfs_setopt(opts, pr_allow_nonames[fi], &i, sizeof(i));
if (error != 0 && error != ENOENT)
goto done_deref;
}
i = (pr->pr_uref == 0);
error = vfs_setopt(opts, "dying", &i, sizeof(i));
if (error != 0 && error != ENOENT)
goto done_deref;
i = !i;
error = vfs_setopt(opts, "nodying", &i, sizeof(i));
if (error != 0 && error != ENOENT)
goto done_deref;
/* Get the module parameters. */
mtx_unlock(&pr->pr_mtx);
locked = 0;
error = osd_jail_call(pr, PR_METHOD_GET, opts);
if (error)
goto done_deref;
prison_deref(pr, PD_DEREF | PD_LIST_SLOCKED);
/* By now, all parameters should have been noted. */
TAILQ_FOREACH(opt, opts, link) {
if (!opt->seen && strcmp(opt->name, "errmsg")) {
error = EINVAL;
vfs_opterror(opts, "unknown parameter: %s", opt->name);
goto done_errmsg;
}
}
/* Write the fetched parameters back to userspace. */
error = 0;
TAILQ_FOREACH(opt, opts, link) {
if (opt->pos >= 0 && opt->pos != errmsg_pos) {
pos = 2 * opt->pos + 1;
optuio->uio_iov[pos].iov_len = opt->len;
if (opt->value != NULL) {
if (optuio->uio_segflg == UIO_SYSSPACE) {
bcopy(opt->value,
optuio->uio_iov[pos].iov_base,
opt->len);
} else {
error = copyout(opt->value,
optuio->uio_iov[pos].iov_base,
opt->len);
if (error)
break;
}
}
}
}
goto done_errmsg;
done_deref:
prison_deref(pr, locked | PD_DEREF | PD_LIST_SLOCKED);
goto done_errmsg;
done_unlock_list:
sx_sunlock(&allprison_lock);
done_errmsg:
if (error && errmsg_pos >= 0) {
vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
errmsg_pos = 2 * errmsg_pos + 1;
if (errmsg_len > 0) {
if (optuio->uio_segflg == UIO_SYSSPACE)
bcopy(errmsg,
optuio->uio_iov[errmsg_pos].iov_base,
errmsg_len);
else
copyout(errmsg,
optuio->uio_iov[errmsg_pos].iov_base,
errmsg_len);
}
}
vfs_freeopts(opts);
return (error);
}
/*
* struct jail_remove_args {
* int jid;
* };
*/
int
-jail_remove(struct thread *td, struct jail_remove_args *uap)
+sys_jail_remove(struct thread *td, struct jail_remove_args *uap)
{
struct prison *pr, *cpr, *lpr, *tpr;
int descend, error;
error = priv_check(td, PRIV_JAIL_REMOVE);
if (error)
return (error);
sx_xlock(&allprison_lock);
pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
if (pr == NULL) {
sx_xunlock(&allprison_lock);
return (EINVAL);
}
/* Remove all descendants of this prison, then remove this prison. */
pr->pr_ref++;
pr->pr_flags |= PR_REMOVE;
if (!LIST_EMPTY(&pr->pr_children)) {
mtx_unlock(&pr->pr_mtx);
lpr = NULL;
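/*
* Each child is marked and referenced as it is visited, but its
* actual removal is deferred one iteration (via lpr) so that
* prison_remove_one(), which drops allprison_lock and may free the
* prison, never runs on the node the traversal is standing on.
*/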
FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
mtx_lock(&cpr->pr_mtx);
if (cpr->pr_ref > 0) {
tpr = cpr;
cpr->pr_ref++;
cpr->pr_flags |= PR_REMOVE;
} else {
/* Already removed - do not do it again. */
tpr = NULL;
}
mtx_unlock(&cpr->pr_mtx);
if (lpr != NULL) {
mtx_lock(&lpr->pr_mtx);
prison_remove_one(lpr);
sx_xlock(&allprison_lock);
}
lpr = tpr;
}
if (lpr != NULL) {
mtx_lock(&lpr->pr_mtx);
prison_remove_one(lpr);
sx_xlock(&allprison_lock);
}
mtx_lock(&pr->pr_mtx);
}
prison_remove_one(pr);
return (0);
}
static void
prison_remove_one(struct prison *pr)
{
struct proc *p;
int deuref;
/* If the prison was persistent, it is not anymore. */
deuref = 0;
if (pr->pr_flags & PR_PERSIST) {
pr->pr_ref--;
deuref = PD_DEUREF;
pr->pr_flags &= ~PR_PERSIST;
}
/*
* jail_remove added a reference. If that's the only one, remove
* the prison now.
*/
KASSERT(pr->pr_ref > 0,
("prison_remove_one removing a dead prison (jid=%d)", pr->pr_id));
if (pr->pr_ref == 1) {
prison_deref(pr,
deuref | PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
return;
}
mtx_unlock(&pr->pr_mtx);
sx_xunlock(&allprison_lock);
/*
* Kill all processes unfortunate enough to be attached to this prison.
*/
sx_slock(&allproc_lock);
LIST_FOREACH(p, &allproc, p_list) {
PROC_LOCK(p);
if (p->p_state != PRS_NEW && p->p_ucred &&
p->p_ucred->cr_prison == pr)
- psignal(p, SIGKILL);
+ kern_psignal(p, SIGKILL);
PROC_UNLOCK(p);
}
sx_sunlock(&allproc_lock);
/* Remove the temporary reference added by jail_remove. */
prison_deref(pr, deuref | PD_DEREF);
}
/*
* struct jail_attach_args {
* int jid;
* };
*/
int
-jail_attach(struct thread *td, struct jail_attach_args *uap)
+sys_jail_attach(struct thread *td, struct jail_attach_args *uap)
{
struct prison *pr;
int error;
error = priv_check(td, PRIV_JAIL_ATTACH);
if (error)
return (error);
sx_slock(&allprison_lock);
pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
if (pr == NULL) {
sx_sunlock(&allprison_lock);
return (EINVAL);
}
/*
* Do not allow a process to attach to a prison that is not
* considered to be "alive".
*/
if (pr->pr_uref == 0) {
mtx_unlock(&pr->pr_mtx);
sx_sunlock(&allprison_lock);
return (EINVAL);
}
return (do_jail_attach(td, pr));
}
static int
do_jail_attach(struct thread *td, struct prison *pr)
{
struct prison *ppr;
struct proc *p;
struct ucred *newcred, *oldcred;
int vfslocked, error;
/*
* XXX: Note that there is a slight race here if two threads
* in the same privileged process attempt to attach to two
* different jails at the same time. It is important for
* user processes not to do this, or they might end up with
* a process root from one prison, but attached to the jail
* of another.
*/
pr->pr_ref++;
pr->pr_uref++;
mtx_unlock(&pr->pr_mtx);
/* Let modules do whatever they need to prepare for attaching. */
error = osd_jail_call(pr, PR_METHOD_ATTACH, td);
if (error) {
prison_deref(pr, PD_DEREF | PD_DEUREF | PD_LIST_SLOCKED);
return (error);
}
sx_sunlock(&allprison_lock);
/*
* Reparent the newly attached process to this jail.
*/
ppr = td->td_ucred->cr_prison;
p = td->td_proc;
error = cpuset_setproc_update_set(p, pr->pr_cpuset);
if (error)
goto e_revert_osd;
vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
if ((error = change_dir(pr->pr_root, td)) != 0)
goto e_unlock;
#ifdef MAC
if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
goto e_unlock;
#endif
VOP_UNLOCK(pr->pr_root, 0);
if ((error = change_root(pr->pr_root, td)))
goto e_unlock_giant;
VFS_UNLOCK_GIANT(vfslocked);
newcred = crget();
PROC_LOCK(p);
oldcred = p->p_ucred;
setsugid(p);
crcopy(newcred, oldcred);
newcred->cr_prison = pr;
p->p_ucred = newcred;
PROC_UNLOCK(p);
#ifdef RACCT
racct_proc_ucred_changed(p, oldcred, newcred);
#endif
crfree(oldcred);
prison_deref(ppr, PD_DEREF | PD_DEUREF);
return (0);
e_unlock:
VOP_UNLOCK(pr->pr_root, 0);
e_unlock_giant:
VFS_UNLOCK_GIANT(vfslocked);
e_revert_osd:
/* Tell modules this thread is still in its old jail after all. */
(void)osd_jail_call(ppr, PR_METHOD_ATTACH, td);
prison_deref(pr, PD_DEREF | PD_DEUREF);
return (error);
}
/*
* Returns a locked prison instance, or NULL on failure.
*/
struct prison *
prison_find(int prid)
{
struct prison *pr;
sx_assert(&allprison_lock, SX_LOCKED);
TAILQ_FOREACH(pr, &allprison, pr_list) {
if (pr->pr_id == prid) {
mtx_lock(&pr->pr_mtx);
if (pr->pr_ref > 0)
return (pr);
mtx_unlock(&pr->pr_mtx);
}
}
return (NULL);
}
/*
* Find a prison that is a descendant of mypr. Returns a locked prison or NULL.
*/
struct prison *
prison_find_child(struct prison *mypr, int prid)
{
struct prison *pr;
int descend;
sx_assert(&allprison_lock, SX_LOCKED);
FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
if (pr->pr_id == prid) {
mtx_lock(&pr->pr_mtx);
if (pr->pr_ref > 0)
return (pr);
mtx_unlock(&pr->pr_mtx);
}
}
return (NULL);
}
/*
* Look for the name relative to mypr. Returns a locked prison or NULL.
*/
struct prison *
prison_find_name(struct prison *mypr, const char *name)
{
struct prison *pr, *deadpr;
size_t mylen;
int descend;
sx_assert(&allprison_lock, SX_LOCKED);
mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1;
again:
deadpr = NULL;
FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
if (!strcmp(pr->pr_name + mylen, name)) {
mtx_lock(&pr->pr_mtx);
if (pr->pr_ref > 0) {
if (pr->pr_uref > 0)
return (pr);
deadpr = pr;
}
mtx_unlock(&pr->pr_mtx);
}
}
/* There was no valid prison - perhaps there was a dying one. */
if (deadpr != NULL) {
mtx_lock(&deadpr->pr_mtx);
if (deadpr->pr_ref == 0) {
mtx_unlock(&deadpr->pr_mtx);
goto again;
}
}
return (deadpr);
}
/*
* See if a prison has the specific flag set.
*/
int
prison_flag(struct ucred *cred, unsigned flag)
{
/* This is an atomic read, so no locking is necessary. */
return (cred->cr_prison->pr_flags & flag);
}
int
prison_allow(struct ucred *cred, unsigned flag)
{
/* This is an atomic read, so no locking is necessary. */
return (cred->cr_prison->pr_allow & flag);
}
/*
* Remove a prison reference. If that was the last reference, remove the
* prison itself - but not in this context in case there are locks held.
*/
void
prison_free_locked(struct prison *pr)
{
mtx_assert(&pr->pr_mtx, MA_OWNED);
pr->pr_ref--;
if (pr->pr_ref == 0) {
mtx_unlock(&pr->pr_mtx);
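/*
* Defer the final teardown to the thread taskqueue; prison_complete()
* calls prison_deref() from a context where it is safe to sleep and
* take allprison_lock.
*/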
TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
return;
}
mtx_unlock(&pr->pr_mtx);
}
void
prison_free(struct prison *pr)
{
mtx_lock(&pr->pr_mtx);
prison_free_locked(pr);
}
static void
prison_complete(void *context, int pending)
{
prison_deref((struct prison *)context, 0);
}
/*
* Remove a prison reference (usually). This internal version assumes no
* mutexes are held, except perhaps the prison itself. If there are no more
* references, release and delist the prison. On completion, the prison lock
* and the allprison lock are both unlocked.
*/
static void
prison_deref(struct prison *pr, int flags)
{
struct prison *ppr, *tpr;
int vfslocked;
if (!(flags & PD_LOCKED))
mtx_lock(&pr->pr_mtx);
for (;;) {
if (flags & PD_DEUREF) {
pr->pr_uref--;
KASSERT(prison0.pr_uref != 0, ("prison0 pr_uref=0"));
}
if (flags & PD_DEREF)
pr->pr_ref--;
/* If the prison still has references, nothing else to do. */
if (pr->pr_ref > 0) {
mtx_unlock(&pr->pr_mtx);
if (flags & PD_LIST_SLOCKED)
sx_sunlock(&allprison_lock);
else if (flags & PD_LIST_XLOCKED)
sx_xunlock(&allprison_lock);
return;
}
mtx_unlock(&pr->pr_mtx);
if (flags & PD_LIST_SLOCKED) {
if (!sx_try_upgrade(&allprison_lock)) {
sx_sunlock(&allprison_lock);
sx_xlock(&allprison_lock);
}
} else if (!(flags & PD_LIST_XLOCKED))
sx_xlock(&allprison_lock);
TAILQ_REMOVE(&allprison, pr, pr_list);
LIST_REMOVE(pr, pr_sibling);
ppr = pr->pr_parent;
for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
tpr->pr_childcount--;
sx_xunlock(&allprison_lock);
#ifdef VIMAGE
if (pr->pr_vnet != ppr->pr_vnet)
vnet_destroy(pr->pr_vnet);
#endif
if (pr->pr_root != NULL) {
vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
vrele(pr->pr_root);
VFS_UNLOCK_GIANT(vfslocked);
}
mtx_destroy(&pr->pr_mtx);
#ifdef INET
free(pr->pr_ip4, M_PRISON);
#endif
#ifdef INET6
free(pr->pr_ip6, M_PRISON);
#endif
if (pr->pr_cpuset != NULL)
cpuset_rel(pr->pr_cpuset);
osd_jail_exit(pr);
#ifdef RACCT
prison_racct_detach(pr);
#endif
free(pr, M_PRISON);
/* Removing a prison frees a reference on its parent. */
pr = ppr;
mtx_lock(&pr->pr_mtx);
flags = PD_DEREF | PD_DEUREF;
}
}
void
prison_hold_locked(struct prison *pr)
{
mtx_assert(&pr->pr_mtx, MA_OWNED);
KASSERT(pr->pr_ref > 0,
("Trying to hold dead prison (jid=%d).", pr->pr_id));
pr->pr_ref++;
}
void
prison_hold(struct prison *pr)
{
mtx_lock(&pr->pr_mtx);
prison_hold_locked(pr);
mtx_unlock(&pr->pr_mtx);
}
void
prison_proc_hold(struct prison *pr)
{
mtx_lock(&pr->pr_mtx);
KASSERT(pr->pr_uref > 0,
("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id));
pr->pr_uref++;
mtx_unlock(&pr->pr_mtx);
}
void
prison_proc_free(struct prison *pr)
{
mtx_lock(&pr->pr_mtx);
KASSERT(pr->pr_uref > 0,
("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id));
prison_deref(pr, PD_DEUREF | PD_LOCKED);
}
#ifdef INET
/*
* Restrict a prison's IP address list with its parent's, possibly replacing
* it. Return true if the replacement buffer was used (or would have been).
*/
static int
prison_restrict_ip4(struct prison *pr, struct in_addr *newip4)
{
int ii, ij, used;
struct prison *ppr;
ppr = pr->pr_parent;
if (!(pr->pr_flags & PR_IP4_USER)) {
/* This has no user settings, so just copy the parent's list. */
if (pr->pr_ip4s < ppr->pr_ip4s) {
/*
* There's no room for the parent's list. Use the
* new list buffer, which is assumed to be big enough
* (if it was passed). If there's no buffer, try to
* allocate one.
*/
used = 1;
if (newip4 == NULL) {
newip4 = malloc(ppr->pr_ip4s * sizeof(*newip4),
M_PRISON, M_NOWAIT);
if (newip4 != NULL)
used = 0;
}
if (newip4 != NULL) {
bcopy(ppr->pr_ip4, newip4,
ppr->pr_ip4s * sizeof(*newip4));
free(pr->pr_ip4, M_PRISON);
pr->pr_ip4 = newip4;
pr->pr_ip4s = ppr->pr_ip4s;
}
return (used);
}
pr->pr_ip4s = ppr->pr_ip4s;
if (pr->pr_ip4s > 0)
bcopy(ppr->pr_ip4, pr->pr_ip4,
pr->pr_ip4s * sizeof(*newip4));
else if (pr->pr_ip4 != NULL) {
free(pr->pr_ip4, M_PRISON);
pr->pr_ip4 = NULL;
}
} else if (pr->pr_ip4s > 0) {
/* Remove addresses that aren't in the parent. */
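/*
* Both lists are kept sorted after their primary entry, so a single
* merge-style pass with indices ii (child) and ij (parent) is enough.
*/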
for (ij = 0; ij < ppr->pr_ip4s; ij++)
if (pr->pr_ip4[0].s_addr == ppr->pr_ip4[ij].s_addr)
break;
if (ij < ppr->pr_ip4s)
ii = 1;
else {
bcopy(pr->pr_ip4 + 1, pr->pr_ip4,
--pr->pr_ip4s * sizeof(*pr->pr_ip4));
ii = 0;
}
for (ij = 1; ii < pr->pr_ip4s; ) {
if (pr->pr_ip4[ii].s_addr == ppr->pr_ip4[0].s_addr) {
ii++;
continue;
}
switch (ij >= ppr->pr_ip4s ? -1 :
qcmp_v4(&pr->pr_ip4[ii], &ppr->pr_ip4[ij])) {
case -1:
bcopy(pr->pr_ip4 + ii + 1, pr->pr_ip4 + ii,
(--pr->pr_ip4s - ii) * sizeof(*pr->pr_ip4));
break;
case 0:
ii++;
ij++;
break;
case 1:
ij++;
break;
}
}
if (pr->pr_ip4s == 0) {
pr->pr_flags |= PR_IP4_DISABLE;
free(pr->pr_ip4, M_PRISON);
pr->pr_ip4 = NULL;
}
}
return (0);
}
/*
* Pass back primary IPv4 address of this jail.
*
* If not restricted return success but do not alter the address. Caller has
* to make sure to initialize it correctly (e.g. INADDR_ANY).
*
* Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4.
* Address returned in NBO.
*/
int
prison_get_ip4(struct ucred *cred, struct in_addr *ia)
{
struct prison *pr;
KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
pr = cred->cr_prison;
if (!(pr->pr_flags & PR_IP4))
return (0);
mtx_lock(&pr->pr_mtx);
if (!(pr->pr_flags & PR_IP4)) {
mtx_unlock(&pr->pr_mtx);
return (0);
}
if (pr->pr_ip4 == NULL) {
mtx_unlock(&pr->pr_mtx);
return (EAFNOSUPPORT);
}
ia->s_addr = pr->pr_ip4[0].s_addr;
mtx_unlock(&pr->pr_mtx);
return (0);
}
/*
* Return 1 if we should do proper source address selection or are not jailed.
* We will return 0 if we should bypass source address selection in favour
* of the primary jail IPv4 address. Only in this case *ia will be updated and
* returned in NBO.
* Return EAFNOSUPPORT, in case this jail does not allow IPv4.
*/
int
prison_saddrsel_ip4(struct ucred *cred, struct in_addr *ia)
{
struct prison *pr;
struct in_addr lia;
int error;
KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
if (!jailed(cred))
return (1);
pr = cred->cr_prison;
if (pr->pr_flags & PR_IP4_SADDRSEL)
return (1);
lia.s_addr = INADDR_ANY;
error = prison_get_ip4(cred, &lia);
if (error)
return (error);
if (lia.s_addr == INADDR_ANY)
return (1);
ia->s_addr = lia.s_addr;
return (0);
}
/*
* Return true if pr1 and pr2 have the same IPv4 address restrictions.
*/
int
prison_equal_ip4(struct prison *pr1, struct prison *pr2)
{
if (pr1 == pr2)
return (1);
/*
* No need to lock since the PR_IP4_USER flag can't be altered for
* existing prisons.
*/
while (pr1 != &prison0 &&
#ifdef VIMAGE
!(pr1->pr_flags & PR_VNET) &&
#endif
!(pr1->pr_flags & PR_IP4_USER))
pr1 = pr1->pr_parent;
while (pr2 != &prison0 &&
#ifdef VIMAGE
!(pr2->pr_flags & PR_VNET) &&
#endif
!(pr2->pr_flags & PR_IP4_USER))
pr2 = pr2->pr_parent;
return (pr1 == pr2);
}
/*
* Make sure our (source) address is set to something meaningful to this
* jail.
*
* Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail,
* EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
* doesn't allow IPv4. Address passed in in NBO and returned in NBO.
*/
int
prison_local_ip4(struct ucred *cred, struct in_addr *ia)
{
struct prison *pr;
struct in_addr ia0;
int error;
KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
pr = cred->cr_prison;
if (!(pr->pr_flags & PR_IP4))
return (0);
mtx_lock(&pr->pr_mtx);
if (!(pr->pr_flags & PR_IP4)) {
mtx_unlock(&pr->pr_mtx);
return (0);
}
if (pr->pr_ip4 == NULL) {
mtx_unlock(&pr->pr_mtx);
return (EAFNOSUPPORT);
}
ia0.s_addr = ntohl(ia->s_addr);
if (ia0.s_addr == INADDR_LOOPBACK) {
ia->s_addr = pr->pr_ip4[0].s_addr;
mtx_unlock(&pr->pr_mtx);
return (0);
}
if (ia0.s_addr == INADDR_ANY) {
/*
* In case there is only 1 IPv4 address, bind directly.
*/
if (pr->pr_ip4s == 1)
ia->s_addr = pr->pr_ip4[0].s_addr;
mtx_unlock(&pr->pr_mtx);
return (0);
}
error = _prison_check_ip4(pr, ia);
mtx_unlock(&pr->pr_mtx);
return (error);
}
/*
* Rewrite destination address in case we will connect to loopback address.
*
* Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4.
* Address passed in in NBO and returned in NBO.
*/
int
prison_remote_ip4(struct ucred *cred, struct in_addr *ia)
{
struct prison *pr;
KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
pr = cred->cr_prison;
if (!(pr->pr_flags & PR_IP4))
return (0);
mtx_lock(&pr->pr_mtx);
if (!(pr->pr_flags & PR_IP4)) {
mtx_unlock(&pr->pr_mtx);
return (0);
}
if (pr->pr_ip4 == NULL) {
mtx_unlock(&pr->pr_mtx);
return (EAFNOSUPPORT);
}
if (ntohl(ia->s_addr) == INADDR_LOOPBACK) {
ia->s_addr = pr->pr_ip4[0].s_addr;
mtx_unlock(&pr->pr_mtx);
return (0);
}
/*
* Return success because nothing had to be changed.
*/
mtx_unlock(&pr->pr_mtx);
return (0);
}
/*
* Check if given address belongs to the jail referenced by cred/prison.
*
* Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail,
* EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
* doesn't allow IPv4. Address passed in in NBO.
*/
static int
_prison_check_ip4(struct prison *pr, struct in_addr *ia)
{
int i, a, z, d;
/*
* Check the primary IP.
*/
if (pr->pr_ip4[0].s_addr == ia->s_addr)
return (0);
/*
* All the other IPs are sorted so we can do a binary search.
*/
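/*
* The search covers pr_ip4[1 .. pr_ip4s - 1], hence z starts at
* pr_ip4s - 2 and each probe looks at index i + 1.
*/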
a = 0;
z = pr->pr_ip4s - 2;
while (a <= z) {
i = (a + z) / 2;
d = qcmp_v4(&pr->pr_ip4[i+1], ia);
if (d > 0)
z = i - 1;
else if (d < 0)
a = i + 1;
else
return (0);
}
return (EADDRNOTAVAIL);
}
int
prison_check_ip4(struct ucred *cred, struct in_addr *ia)
{
struct prison *pr;
int error;
KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
pr = cred->cr_prison;
if (!(pr->pr_flags & PR_IP4))
return (0);
mtx_lock(&pr->pr_mtx);
if (!(pr->pr_flags & PR_IP4)) {
mtx_unlock(&pr->pr_mtx);
return (0);
}
if (pr->pr_ip4 == NULL) {
mtx_unlock(&pr->pr_mtx);
return (EAFNOSUPPORT);
}
error = _prison_check_ip4(pr, ia);
mtx_unlock(&pr->pr_mtx);
return (error);
}
#endif
#ifdef INET6
static int
prison_restrict_ip6(struct prison *pr, struct in6_addr *newip6)
{
int ii, ij, used;
struct prison *ppr;
ppr = pr->pr_parent;
if (!(pr->pr_flags & PR_IP6_USER)) {
/* This has no user settings, so just copy the parent's list. */
if (pr->pr_ip6s < ppr->pr_ip6s) {
/*
* There's no room for the parent's list. Use the
* new list buffer, which is assumed to be big enough
* (if it was passed). If there's no buffer, try to
* allocate one.
*/
used = 1;
if (newip6 == NULL) {
newip6 = malloc(ppr->pr_ip6s * sizeof(*newip6),
M_PRISON, M_NOWAIT);
if (newip6 != NULL)
used = 0;
}
if (newip6 != NULL) {
bcopy(ppr->pr_ip6, newip6,
ppr->pr_ip6s * sizeof(*newip6));
free(pr->pr_ip6, M_PRISON);
pr->pr_ip6 = newip6;
pr->pr_ip6s = ppr->pr_ip6s;
}
return (used);
}
pr->pr_ip6s = ppr->pr_ip6s;
if (pr->pr_ip6s > 0)
bcopy(ppr->pr_ip6, pr->pr_ip6,
pr->pr_ip6s * sizeof(*newip6));
else if (pr->pr_ip6 != NULL) {
free(pr->pr_ip6, M_PRISON);
pr->pr_ip6 = NULL;
}
} else if (pr->pr_ip6s > 0) {
/* Remove addresses that aren't in the parent. */
for (ij = 0; ij < ppr->pr_ip6s; ij++)
if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0],
&ppr->pr_ip6[ij]))
break;
if (ij < ppr->pr_ip6s)
ii = 1;
else {
bcopy(pr->pr_ip6 + 1, pr->pr_ip6,
--pr->pr_ip6s * sizeof(*pr->pr_ip6));
ii = 0;
}
for (ij = 1; ii < pr->pr_ip6s; ) {
if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[ii],
&ppr->pr_ip6[0])) {
ii++;
continue;
}
switch (ij >= ppr->pr_ip6s ? -1 :
qcmp_v6(&pr->pr_ip6[ii], &ppr->pr_ip6[ij])) {
case -1:
bcopy(pr->pr_ip6 + ii + 1, pr->pr_ip6 + ii,
(--pr->pr_ip6s - ii) * sizeof(*pr->pr_ip6));
break;
case 0:
ii++;
ij++;
break;
case 1:
ij++;
break;
}
}
if (pr->pr_ip6s == 0) {
pr->pr_flags |= PR_IP6_DISABLE;
free(pr->pr_ip6, M_PRISON);
pr->pr_ip6 = NULL;
}
}
return (0);
}
/*
* Pass back primary IPv6 address for this jail.
*
* If not restricted return success but do not alter the address. Caller has
* to make sure to initialize it correctly (e.g. IN6ADDR_ANY_INIT).
*
* Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6.
*/
int
prison_get_ip6(struct ucred *cred, struct in6_addr *ia6)
{
struct prison *pr;
KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
pr = cred->cr_prison;
if (!(pr->pr_flags & PR_IP6))
return (0);
mtx_lock(&pr->pr_mtx);
if (!(pr->pr_flags & PR_IP6)) {
mtx_unlock(&pr->pr_mtx);
return (0);
}
if (pr->pr_ip6 == NULL) {
mtx_unlock(&pr->pr_mtx);
return (EAFNOSUPPORT);
}
bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
mtx_unlock(&pr->pr_mtx);
return (0);
}
/*
* Return 1 if we should do proper source address selection or are not jailed.
* We will return 0 if we should bypass source address selection in favour
* of the primary jail IPv6 address. Only in this case *ia6 will be updated and
* returned in NBO.
* Return EAFNOSUPPORT, in case this jail does not allow IPv6.
*/
int
prison_saddrsel_ip6(struct ucred *cred, struct in6_addr *ia6)
{
struct prison *pr;
struct in6_addr lia6;
int error;
KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
if (!jailed(cred))
return (1);
pr = cred->cr_prison;
if (pr->pr_flags & PR_IP6_SADDRSEL)
return (1);
lia6 = in6addr_any;
error = prison_get_ip6(cred, &lia6);
if (error)
return (error);
if (IN6_IS_ADDR_UNSPECIFIED(&lia6))
return (1);
bcopy(&lia6, ia6, sizeof(struct in6_addr));
return (0);
}
/*
* Return true if pr1 and pr2 have the same IPv6 address restrictions.
*/
int
prison_equal_ip6(struct prison *pr1, struct prison *pr2)
{
if (pr1 == pr2)
return (1);
while (pr1 != &prison0 &&
#ifdef VIMAGE
!(pr1->pr_flags & PR_VNET) &&
#endif
!(pr1->pr_flags & PR_IP6_USER))
pr1 = pr1->pr_parent;
while (pr2 != &prison0 &&
#ifdef VIMAGE
!(pr2->pr_flags & PR_VNET) &&
#endif
!(pr2->pr_flags & PR_IP6_USER))
pr2 = pr2->pr_parent;
return (pr1 == pr2);
}
/*
* Make sure our (source) address is set to something meaningful to this jail.
*
* v6only should be set based on (inp->inp_flags & IN6P_IPV6_V6ONLY != 0)
* when needed while binding.
*
* Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail,
* EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
* doesn't allow IPv6.
*/
int
prison_local_ip6(struct ucred *cred, struct in6_addr *ia6, int v6only)
{
struct prison *pr;
int error;
KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
pr = cred->cr_prison;
if (!(pr->pr_flags & PR_IP6))
return (0);
mtx_lock(&pr->pr_mtx);
if (!(pr->pr_flags & PR_IP6)) {
mtx_unlock(&pr->pr_mtx);
return (0);
}
if (pr->pr_ip6 == NULL) {
mtx_unlock(&pr->pr_mtx);
return (EAFNOSUPPORT);
}
if (IN6_IS_ADDR_LOOPBACK(ia6)) {
bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
mtx_unlock(&pr->pr_mtx);
return (0);
}
if (IN6_IS_ADDR_UNSPECIFIED(ia6)) {
/*
* In case there is only 1 IPv6 address, and v6only is true,
* then bind directly.
*/
if (v6only != 0 && pr->pr_ip6s == 1)
bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
mtx_unlock(&pr->pr_mtx);
return (0);
}
error = _prison_check_ip6(pr, ia6);
mtx_unlock(&pr->pr_mtx);
return (error);
}
/*
* Rewrite destination address in case we will connect to loopback address.
*
* Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6.
*/
int
prison_remote_ip6(struct ucred *cred, struct in6_addr *ia6)
{
struct prison *pr;
KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
pr = cred->cr_prison;
if (!(pr->pr_flags & PR_IP6))
return (0);
mtx_lock(&pr->pr_mtx);
if (!(pr->pr_flags & PR_IP6)) {
mtx_unlock(&pr->pr_mtx);
return (0);
}
if (pr->pr_ip6 == NULL) {
mtx_unlock(&pr->pr_mtx);
return (EAFNOSUPPORT);
}
if (IN6_IS_ADDR_LOOPBACK(ia6)) {
bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
mtx_unlock(&pr->pr_mtx);
return (0);
}
/*
* Return success because nothing had to be changed.
*/
mtx_unlock(&pr->pr_mtx);
return (0);
}
/*
* Check if given address belongs to the jail referenced by cred/prison.
*
* Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail,
* EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
* doesn't allow IPv6.
*/
static int
_prison_check_ip6(struct prison *pr, struct in6_addr *ia6)
{
int i, a, z, d;
/*
* Check the primary IP.
*/
if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0], ia6))
return (0);
/*
* All the other IPs are sorted so we can do a binary search.
*/
a = 0;
z = pr->pr_ip6s - 2;
while (a <= z) {
i = (a + z) / 2;
d = qcmp_v6(&pr->pr_ip6[i+1], ia6);
if (d > 0)
z = i - 1;
else if (d < 0)
a = i + 1;
else
return (0);
}
return (EADDRNOTAVAIL);
}
int
prison_check_ip6(struct ucred *cred, struct in6_addr *ia6)
{
struct prison *pr;
int error;
KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
pr = cred->cr_prison;
if (!(pr->pr_flags & PR_IP6))
return (0);
mtx_lock(&pr->pr_mtx);
if (!(pr->pr_flags & PR_IP6)) {
mtx_unlock(&pr->pr_mtx);
return (0);
}
if (pr->pr_ip6 == NULL) {
mtx_unlock(&pr->pr_mtx);
return (EAFNOSUPPORT);
}
error = _prison_check_ip6(pr, ia6);
mtx_unlock(&pr->pr_mtx);
return (error);
}
#endif
/*
* Check if a jail supports the given address family.
*
* Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT
* if not.
*/
int
prison_check_af(struct ucred *cred, int af)
{
struct prison *pr;
int error;
KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
pr = cred->cr_prison;
#ifdef VIMAGE
/* Prisons with their own network stack are not limited. */
if (prison_owns_vnet(cred))
return (0);
#endif
error = 0;
switch (af)
{
#ifdef INET
case AF_INET:
if (pr->pr_flags & PR_IP4)
{
mtx_lock(&pr->pr_mtx);
if ((pr->pr_flags & PR_IP4) && pr->pr_ip4 == NULL)
error = EAFNOSUPPORT;
mtx_unlock(&pr->pr_mtx);
}
break;
#endif
#ifdef INET6
case AF_INET6:
if (pr->pr_flags & PR_IP6)
{
mtx_lock(&pr->pr_mtx);
if ((pr->pr_flags & PR_IP6) && pr->pr_ip6 == NULL)
error = EAFNOSUPPORT;
mtx_unlock(&pr->pr_mtx);
}
break;
#endif
case AF_LOCAL:
case AF_ROUTE:
break;
default:
if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF))
error = EAFNOSUPPORT;
}
return (error);
}
/*
* Check if given address belongs to the jail referenced by cred (wrapper to
* prison_check_ip[46]).
*
* Returns 0 if jail doesn't restrict the address family or if address belongs
* to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if
* the jail doesn't allow the address family. IPv4 Address passed in in NBO.
*/
int
prison_if(struct ucred *cred, struct sockaddr *sa)
{
#ifdef INET
struct sockaddr_in *sai;
#endif
#ifdef INET6
struct sockaddr_in6 *sai6;
#endif
int error;
KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
KASSERT(sa != NULL, ("%s: sa is NULL", __func__));
#ifdef VIMAGE
if (prison_owns_vnet(cred))
return (0);
#endif
error = 0;
switch (sa->sa_family)
{
#ifdef INET
case AF_INET:
sai = (struct sockaddr_in *)sa;
error = prison_check_ip4(cred, &sai->sin_addr);
break;
#endif
#ifdef INET6
case AF_INET6:
sai6 = (struct sockaddr_in6 *)sa;
error = prison_check_ip6(cred, &sai6->sin6_addr);
break;
#endif
default:
if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF))
error = EAFNOSUPPORT;
}
return (error);
}
/*
* Return 0 if jails permit p1 to frob p2, otherwise ESRCH.
*/
int
prison_check(struct ucred *cred1, struct ucred *cred2)
{
return ((cred1->cr_prison == cred2->cr_prison ||
prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH);
}
/*
* Return 1 if p2 is a child of p1, otherwise 0.
*/
int
prison_ischild(struct prison *pr1, struct prison *pr2)
{
for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent)
if (pr1 == pr2)
return (1);
return (0);
}
/*
* Return 1 if the passed credential is in a jail, otherwise 0.
*/
int
jailed(struct ucred *cred)
{
return (cred->cr_prison != &prison0);
}
/*
* Return 1 if the passed credential is in a jail and that jail does not
* have its own virtual network stack, otherwise 0.
*/
int
jailed_without_vnet(struct ucred *cred)
{
if (!jailed(cred))
return (0);
#ifdef VIMAGE
if (prison_owns_vnet(cred))
return (0);
#endif
return (1);
}
/*
* Return the correct hostname (domainname, et al) for the passed credential.
*/
void
getcredhostname(struct ucred *cred, char *buf, size_t size)
{
struct prison *pr;
/*
* A NULL credential can be used to shortcut to the physical
* system's hostname.
*/
pr = (cred != NULL) ? cred->cr_prison : &prison0;
mtx_lock(&pr->pr_mtx);
strlcpy(buf, pr->pr_hostname, size);
mtx_unlock(&pr->pr_mtx);
}
void
getcreddomainname(struct ucred *cred, char *buf, size_t size)
{
mtx_lock(&cred->cr_prison->pr_mtx);
strlcpy(buf, cred->cr_prison->pr_domainname, size);
mtx_unlock(&cred->cr_prison->pr_mtx);
}
void
getcredhostuuid(struct ucred *cred, char *buf, size_t size)
{
mtx_lock(&cred->cr_prison->pr_mtx);
strlcpy(buf, cred->cr_prison->pr_hostuuid, size);
mtx_unlock(&cred->cr_prison->pr_mtx);
}
void
getcredhostid(struct ucred *cred, unsigned long *hostid)
{
mtx_lock(&cred->cr_prison->pr_mtx);
*hostid = cred->cr_prison->pr_hostid;
mtx_unlock(&cred->cr_prison->pr_mtx);
}
#ifdef VIMAGE
/*
* Determine whether the prison represented by cred owns
* its vnet rather than having it inherited.
*
* Returns 1 in case the prison owns the vnet, 0 otherwise.
*/
int
prison_owns_vnet(struct ucred *cred)
{
/*
* vnets cannot be added/removed after jail creation,
* so no need to lock here.
*/
return (cred->cr_prison->pr_flags & PR_VNET ? 1 : 0);
}
#endif
/*
* Determine whether the subject represented by cred can "see"
* status of a mount point.
* Returns: 0 for permitted, ENOENT otherwise.
* XXX: This function should be called cr_canseemount() and should be
* placed in kern_prot.c.
*/
int
prison_canseemount(struct ucred *cred, struct mount *mp)
{
struct prison *pr;
struct statfs *sp;
size_t len;
pr = cred->cr_prison;
if (pr->pr_enforce_statfs == 0)
return (0);
if (pr->pr_root->v_mount == mp)
return (0);
if (pr->pr_enforce_statfs == 2)
return (ENOENT);
/*
* If jail's chroot directory is set to "/" we should be able to see
* all mount-points from inside a jail.
* This is an ugly check, but it is the only situation in which a jail's
* directory ends with '/'.
*/
if (strcmp(pr->pr_path, "/") == 0)
return (0);
len = strlen(pr->pr_path);
sp = &mp->mnt_stat;
if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0)
return (ENOENT);
/*
* Be sure that we don't have a situation where the jail's root
* directory is "/some/path" and the mount point is "/some/pathpath".
*/
if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/')
return (ENOENT);
return (0);
}
void
prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
{
char jpath[MAXPATHLEN];
struct prison *pr;
size_t len;
pr = cred->cr_prison;
if (pr->pr_enforce_statfs == 0)
return;
if (prison_canseemount(cred, mp) != 0) {
bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
strlcpy(sp->f_mntonname, "[restricted]",
sizeof(sp->f_mntonname));
return;
}
if (pr->pr_root->v_mount == mp) {
/*
* Clear the current buffer data, so we are sure nothing from
* the valid path is left there.
*/
bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
*sp->f_mntonname = '/';
return;
}
/*
* If jail's chroot directory is set to "/" we should be able to see
* all mount-points from inside a jail.
*/
if (strcmp(pr->pr_path, "/") == 0)
return;
len = strlen(pr->pr_path);
strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath));
/*
* Clear the current buffer data, so we are sure nothing from
* the valid path is left there.
*/
bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
if (*jpath == '\0') {
/* Should never happen. */
*sp->f_mntonname = '/';
} else {
strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname));
}
}
/*
* Check whether a specific privilege is granted within a jail. We
* have a specific list of accepted privileges; the rest are denied.
*/
int
prison_priv_check(struct ucred *cred, int priv)
{
if (!jailed(cred))
return (0);
#ifdef VIMAGE
/*
* Privileges specific to prisons with a virtual network stack.
* There might be a duplicate entry here in case the privilege
* is only granted conditionally in the legacy jail case.
*/
switch (priv) {
#ifdef notyet
/*
* NFS-specific privileges.
*/
case PRIV_NFS_DAEMON:
case PRIV_NFS_LOCKD:
#endif
/*
* Network stack privileges.
*/
case PRIV_NET_BRIDGE:
case PRIV_NET_GRE:
case PRIV_NET_BPF:
case PRIV_NET_RAW: /* Dup, cond. in legacy jail case. */
case PRIV_NET_ROUTE:
case PRIV_NET_TAP:
case PRIV_NET_SETIFMTU:
case PRIV_NET_SETIFFLAGS:
case PRIV_NET_SETIFCAP:
case PRIV_NET_SETIFDESCR:
case PRIV_NET_SETIFNAME:
case PRIV_NET_SETIFMETRIC:
case PRIV_NET_SETIFPHYS:
case PRIV_NET_SETIFMAC:
case PRIV_NET_ADDMULTI:
case PRIV_NET_DELMULTI:
case PRIV_NET_HWIOCTL:
case PRIV_NET_SETLLADDR:
case PRIV_NET_ADDIFGROUP:
case PRIV_NET_DELIFGROUP:
case PRIV_NET_IFCREATE:
case PRIV_NET_IFDESTROY:
case PRIV_NET_ADDIFADDR:
case PRIV_NET_DELIFADDR:
case PRIV_NET_LAGG:
case PRIV_NET_GIF:
case PRIV_NET_SETIFVNET:
case PRIV_NET_SETIFFIB:
/*
* 802.11-related privileges.
*/
case PRIV_NET80211_GETKEY:
#ifdef notyet
case PRIV_NET80211_MANAGE: /* XXX-BZ discuss with sam@ */
#endif
#ifdef notyet
/*
* AppleTalk privileges.
*/
case PRIV_NETATALK_RESERVEDPORT:
/*
* ATM privileges.
*/
case PRIV_NETATM_CFG:
case PRIV_NETATM_ADD:
case PRIV_NETATM_DEL:
case PRIV_NETATM_SET:
/*
* Bluetooth privileges.
*/
case PRIV_NETBLUETOOTH_RAW:
#endif
/*
* Netgraph and netgraph module privileges.
*/
case PRIV_NETGRAPH_CONTROL:
#ifdef notyet
case PRIV_NETGRAPH_TTY:
#endif
/*
* IPv4 and IPv6 privileges.
*/
case PRIV_NETINET_IPFW:
case PRIV_NETINET_DIVERT:
case PRIV_NETINET_PF:
case PRIV_NETINET_DUMMYNET:
case PRIV_NETINET_CARP:
case PRIV_NETINET_MROUTE:
case PRIV_NETINET_RAW:
case PRIV_NETINET_ADDRCTRL6:
case PRIV_NETINET_ND6:
case PRIV_NETINET_SCOPE6:
case PRIV_NETINET_ALIFETIME6:
case PRIV_NETINET_IPSEC:
case PRIV_NETINET_BINDANY:
#ifdef notyet
/*
* IPX/SPX privileges.
*/
case PRIV_NETIPX_RESERVEDPORT:
case PRIV_NETIPX_RAW:
/*
* NCP privileges.
*/
case PRIV_NETNCP:
/*
* SMB privileges.
*/
case PRIV_NETSMB:
#endif
/*
* No default: or deny here.
* If the privilege is not permitted, fall through to the next switch().
*/
if (cred->cr_prison->pr_flags & PR_VNET)
return (0);
}
#endif /* VIMAGE */
switch (priv) {
/*
* Allow ktrace privileges for root in jail.
*/
case PRIV_KTRACE:
#if 0
/*
* Allow jailed processes to configure audit identity and
* submit audit records (login, etc). In the future we may
* want to further refine the relationship between audit and
* jail.
*/
case PRIV_AUDIT_GETAUDIT:
case PRIV_AUDIT_SETAUDIT:
case PRIV_AUDIT_SUBMIT:
#endif
/*
* Allow jailed processes to manipulate process UNIX
* credentials in any way they see fit.
*/
case PRIV_CRED_SETUID:
case PRIV_CRED_SETEUID:
case PRIV_CRED_SETGID:
case PRIV_CRED_SETEGID:
case PRIV_CRED_SETGROUPS:
case PRIV_CRED_SETREUID:
case PRIV_CRED_SETREGID:
case PRIV_CRED_SETRESUID:
case PRIV_CRED_SETRESGID:
/*
* Jail implements visibility constraints already, so allow
* jailed root to override uid/gid-based constraints.
*/
case PRIV_SEEOTHERGIDS:
case PRIV_SEEOTHERUIDS:
/*
* Jail implements inter-process debugging limits already, so
* allow jailed root various debugging privileges.
*/
case PRIV_DEBUG_DIFFCRED:
case PRIV_DEBUG_SUGID:
case PRIV_DEBUG_UNPRIV:
/*
* Allow jail to set various resource limits and login
* properties, and for now, exceed process resource limits.
*/
case PRIV_PROC_LIMIT:
case PRIV_PROC_SETLOGIN:
case PRIV_PROC_SETRLIMIT:
/*
* System V and POSIX IPC privileges are granted in jail.
*/
case PRIV_IPC_READ:
case PRIV_IPC_WRITE:
case PRIV_IPC_ADMIN:
case PRIV_IPC_MSGSIZE:
case PRIV_MQ_ADMIN:
/*
* Jail operations within a jail work on child jails.
*/
case PRIV_JAIL_ATTACH:
case PRIV_JAIL_SET:
case PRIV_JAIL_REMOVE:
/*
* Jail implements its own inter-process limits, so allow
* root processes in jail to change scheduling on other
* processes in the same jail. Likewise for signalling.
*/
case PRIV_SCHED_DIFFCRED:
case PRIV_SCHED_CPUSET:
case PRIV_SIGNAL_DIFFCRED:
case PRIV_SIGNAL_SUGID:
/*
* Allow jailed processes to write to sysctls marked as jail
* writable.
*/
case PRIV_SYSCTL_WRITEJAIL:
/*
* Allow root in jail to manage a variety of quota
* properties. These should likely be conditional on a
* configuration option.
*/
case PRIV_VFS_GETQUOTA:
case PRIV_VFS_SETQUOTA:
/*
* Since Jail relies on chroot() to implement file system
* protections, grant many VFS privileges to root in jail.
* Be careful to exclude mount-related and NFS-related
* privileges.
*/
case PRIV_VFS_READ:
case PRIV_VFS_WRITE:
case PRIV_VFS_ADMIN:
case PRIV_VFS_EXEC:
case PRIV_VFS_LOOKUP:
case PRIV_VFS_BLOCKRESERVE: /* XXXRW: Slightly surprising. */
case PRIV_VFS_CHFLAGS_DEV:
case PRIV_VFS_CHOWN:
case PRIV_VFS_CHROOT:
case PRIV_VFS_RETAINSUGID:
case PRIV_VFS_FCHROOT:
case PRIV_VFS_LINK:
case PRIV_VFS_SETGID:
case PRIV_VFS_STAT:
case PRIV_VFS_STICKYFILE:
return (0);
/*
* Depending on the global setting, allow the privilege of
* setting system flags.
*/
case PRIV_VFS_SYSFLAGS:
if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS)
return (0);
else
return (EPERM);
/*
* Depending on the global setting, allow the privilege of
* mounting/unmounting file systems.
*/
case PRIV_VFS_MOUNT:
case PRIV_VFS_UNMOUNT:
case PRIV_VFS_MOUNT_NONUSER:
case PRIV_VFS_MOUNT_OWNER:
if (cred->cr_prison->pr_allow & PR_ALLOW_MOUNT &&
cred->cr_prison->pr_enforce_statfs < 2)
return (0);
else
return (EPERM);
/*
* Allow jailed root to bind reserved ports and reuse in-use
* ports.
*/
case PRIV_NETINET_RESERVEDPORT:
case PRIV_NETINET_REUSEPORT:
return (0);
/*
* Allow jailed root to set certain IPv4/6 (option) headers.
*/
case PRIV_NETINET_SETHDROPTS:
return (0);
/*
* Conditionally allow creating raw sockets in jail.
*/
case PRIV_NETINET_RAW:
if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS)
return (0);
else
return (EPERM);
/*
* Since jail implements its own visibility limits on netstat
* sysctls, allow getcred. This allows identd to work in
* jail.
*/
case PRIV_NETINET_GETCRED:
return (0);
/*
* Allow jailed root to set loginclass.
*/
case PRIV_PROC_SETLOGINCLASS:
return (0);
default:
/*
* In all remaining cases, deny the privilege request. This
* includes almost all network privileges and many system
* configuration privileges.
*/
return (EPERM);
}
}
/*
* Return the part of pr2's name that is relative to pr1, or the whole name
* if it does not directly follow.
*/
char *
prison_name(struct prison *pr1, struct prison *pr2)
{
char *name;
/* Jails see themselves as "0" (if they see themselves at all). */
if (pr1 == pr2)
return "0";
name = pr2->pr_name;
if (prison_ischild(pr1, pr2)) {
/*
* pr1 isn't locked (and allprison_lock may not be either)
* so its length can't be counted on. But the number of dots
* can be counted on - and counted.
*/
for (; pr1 != &prison0; pr1 = pr1->pr_parent)
name = strchr(name, '.') + 1;
}
return (name);
}
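/*
 * A minimal sketch of the dot-skipping above (hypothetical standalone
 * helper, illustrative only): each ancestor level strips one dot-separated
 * leading component from the full name, so "foo.bar.baz" seen from one
 * level below the top becomes "bar.baz".
 *
 *	static const char *
 *	skip_components(const char *name, int levels)
 *	{
 *		const char *p;
 *
 *		while (levels-- > 0 && (p = strchr(name, '.')) != NULL)
 *			name = p + 1;
 *		return (name);
 *	}
 */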
/*
* Return the part of pr2's path that is relative to pr1, or the whole path
* if it does not directly follow.
*/
static char *
prison_path(struct prison *pr1, struct prison *pr2)
{
char *path1, *path2;
int len1;
path1 = pr1->pr_path;
path2 = pr2->pr_path;
if (!strcmp(path1, "/"))
return (path2);
len1 = strlen(path1);
if (strncmp(path1, path2, len1))
return (path2);
if (path2[len1] == '\0')
return "/";
if (path2[len1] == '/')
return (path2 + len1);
return (path2);
}
/*
* Jail-related sysctls.
*/
SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0,
"Jails");
static int
sysctl_jail_list(SYSCTL_HANDLER_ARGS)
{
struct xprison *xp;
struct prison *pr, *cpr;
#ifdef INET
struct in_addr *ip4 = NULL;
int ip4s = 0;
#endif
#ifdef INET6
struct in6_addr *ip6 = NULL;
int ip6s = 0;
#endif
int descend, error;
xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK);
pr = req->td->td_ucred->cr_prison;
error = 0;
sx_slock(&allprison_lock);
FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
#if defined(INET) || defined(INET6)
again:
#endif
mtx_lock(&cpr->pr_mtx);
#ifdef INET
if (cpr->pr_ip4s > 0) {
if (ip4s < cpr->pr_ip4s) {
ip4s = cpr->pr_ip4s;
mtx_unlock(&cpr->pr_mtx);
ip4 = realloc(ip4, ip4s *
sizeof(struct in_addr), M_TEMP, M_WAITOK);
goto again;
}
bcopy(cpr->pr_ip4, ip4,
cpr->pr_ip4s * sizeof(struct in_addr));
}
#endif
#ifdef INET6
if (cpr->pr_ip6s > 0) {
if (ip6s < cpr->pr_ip6s) {
ip6s = cpr->pr_ip6s;
mtx_unlock(&cpr->pr_mtx);
ip6 = realloc(ip6, ip6s *
sizeof(struct in6_addr), M_TEMP, M_WAITOK);
goto again;
}
bcopy(cpr->pr_ip6, ip6,
cpr->pr_ip6s * sizeof(struct in6_addr));
}
#endif
if (cpr->pr_ref == 0) {
mtx_unlock(&cpr->pr_mtx);
continue;
}
bzero(xp, sizeof(*xp));
xp->pr_version = XPRISON_VERSION;
xp->pr_id = cpr->pr_id;
xp->pr_state = cpr->pr_uref > 0
? PRISON_STATE_ALIVE : PRISON_STATE_DYING;
strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path));
strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host));
strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name));
#ifdef INET
xp->pr_ip4s = cpr->pr_ip4s;
#endif
#ifdef INET6
xp->pr_ip6s = cpr->pr_ip6s;
#endif
mtx_unlock(&cpr->pr_mtx);
error = SYSCTL_OUT(req, xp, sizeof(*xp));
if (error)
break;
#ifdef INET
if (xp->pr_ip4s > 0) {
error = SYSCTL_OUT(req, ip4,
xp->pr_ip4s * sizeof(struct in_addr));
if (error)
break;
}
#endif
#ifdef INET6
if (xp->pr_ip6s > 0) {
error = SYSCTL_OUT(req, ip6,
xp->pr_ip6s * sizeof(struct in6_addr));
if (error)
break;
}
#endif
}
sx_sunlock(&allprison_lock);
free(xp, M_TEMP);
#ifdef INET
free(ip4, M_TEMP);
#endif
#ifdef INET6
free(ip6, M_TEMP);
#endif
return (error);
}
SYSCTL_OID(_security_jail, OID_AUTO, list,
CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
sysctl_jail_list, "S", "List of active jails");
static int
sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
{
int error, injail;
injail = jailed(req->td->td_ucred);
error = SYSCTL_OUT(req, &injail, sizeof(injail));
return (error);
}
SYSCTL_PROC(_security_jail, OID_AUTO, jailed,
CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
sysctl_jail_jailed, "I", "Process in jail?");
#if defined(INET) || defined(INET6)
SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW,
&jail_max_af_ips, 0,
"Number of IP addresses a jail may have at most per address family");
#endif
/*
* Default parameters for jail(2) compatibility. For historical reasons,
* the sysctl names have varying similarity to the parameter names. Prisons
* just see their own parameters, and can't change them.
*/
static int
sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS)
{
struct prison *pr;
int allow, error, i;
pr = req->td->td_ucred->cr_prison;
allow = (pr == &prison0) ? jail_default_allow : pr->pr_allow;
/* Get the current flag value, and convert it to a boolean. */
i = (allow & arg2) ? 1 : 0;
if (arg1 != NULL)
i = !i;
error = sysctl_handle_int(oidp, &i, 0, req);
if (error || !req->newptr)
return (error);
i = i ? arg2 : 0;
if (arg1 != NULL)
i ^= arg2;
/*
* The sysctls don't have CTLFLAGS_PRISON, so assume prison0
* for writing.
*/
mtx_lock(&prison0.pr_mtx);
jail_default_allow = (jail_default_allow & ~arg2) | i;
mtx_unlock(&prison0.pr_mtx);
return (0);
}
SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed,
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I",
"Processes in jail can set their hostnames");
SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only,
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
(void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I",
"Processes in jail are limited to creating UNIX/IP/route sockets only");
SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed,
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I",
"Processes in jail can use System V IPC primitives");
SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets,
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I",
"Prison root can create raw sockets");
SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed,
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I",
"Processes in jail can alter system file flags");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed,
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I",
"Processes in jail can mount/unmount jail-friendly file systems");
static int
sysctl_jail_default_level(SYSCTL_HANDLER_ARGS)
{
struct prison *pr;
int level, error;
pr = req->td->td_ucred->cr_prison;
level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2);
error = sysctl_handle_int(oidp, &level, 0, req);
if (error || !req->newptr)
return (error);
*(int *)arg1 = level;
return (0);
}
SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs,
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
&jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs),
sysctl_jail_default_level, "I",
"Processes in jail cannot see all mounted file systems");
/*
* Nodes to describe jail parameters. Maximum length of string parameters
* is returned in the string itself, and the other parameters exist merely
* to make themselves and their types known.
*/
SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW, 0,
"Jail parameters");
int
sysctl_jail_param(SYSCTL_HANDLER_ARGS)
{
int i;
long l;
size_t s;
char numbuf[12];
switch (oidp->oid_kind & CTLTYPE)
{
case CTLTYPE_LONG:
case CTLTYPE_ULONG:
l = 0;
#ifdef SCTL_MASK32
if (!(req->flags & SCTL_MASK32))
#endif
return (SYSCTL_OUT(req, &l, sizeof(l)));
case CTLTYPE_INT:
case CTLTYPE_UINT:
i = 0;
return (SYSCTL_OUT(req, &i, sizeof(i)));
case CTLTYPE_STRING:
snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2);
return
(sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req));
case CTLTYPE_STRUCT:
s = (size_t)arg2;
return (SYSCTL_OUT(req, &s, sizeof(s)));
}
return (0);
}
SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID");
SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID");
SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name");
SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path");
SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW,
"I", "Jail secure level");
SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW,
"I", "Jail cannot see all mounted file systems");
SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW,
"B", "Jail persistence");
#ifdef VIMAGE
SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN,
"E,jailsys", "Virtual network stack");
#endif
SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD,
"B", "Jail is in the process of shutting down");
SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails");
SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD,
"I", "Current number of child jails");
SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW,
"I", "Maximum number of child jails");
SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info");
SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN,
"Jail hostname");
SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN,
"Jail NIS domainname");
SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN,
"Jail host UUID");
SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW,
"LU", "Jail host ID");
SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset");
SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID");
#ifdef INET
SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN,
"Jail IPv4 address virtualization");
SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr),
"S,in_addr,a", "Jail IPv4 addresses");
SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
"B", "Do (not) use IPv4 source address selection rather than the "
"primary jail IPv4 address.");
#endif
#ifdef INET6
SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN,
"Jail IPv6 address virtualization");
SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr),
"S,in6_addr,a", "Jail IPv6 addresses");
SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
"B", "Do (not) use IPv6 source address selection rather than the "
"primary jail IPv6 address.");
#endif
SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags");
SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW,
"B", "Jail may set hostname");
SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW,
"B", "Jail may use SYSV IPC");
SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW,
"B", "Jail may create raw sockets");
SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW,
"B", "Jail may alter system file flags");
SYSCTL_JAIL_PARAM(_allow, mount, CTLTYPE_INT | CTLFLAG_RW,
"B", "Jail may mount/unmount jail-friendly file systems");
SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW,
"B", "Jail may set file quotas");
SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW,
"B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route");
void
prison_racct_foreach(void (*callback)(struct racct *racct,
void *arg2, void *arg3), void *arg2, void *arg3)
{
struct prison_racct *prr;
sx_slock(&allprison_lock);
LIST_FOREACH(prr, &allprison_racct, prr_next)
(callback)(prr->prr_racct, arg2, arg3);
sx_sunlock(&allprison_lock);
}
static struct prison_racct *
prison_racct_find_locked(const char *name)
{
struct prison_racct *prr;
sx_assert(&allprison_lock, SA_XLOCKED);
if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN)
return (NULL);
LIST_FOREACH(prr, &allprison_racct, prr_next) {
if (strcmp(name, prr->prr_name) != 0)
continue;
/* Found a prison_racct with a matching name. */
prison_racct_hold(prr);
return (prr);
}
/* Add new prison_racct. */
prr = malloc(sizeof(*prr), M_PRISON_RACCT, M_ZERO | M_WAITOK);
racct_create(&prr->prr_racct);
strcpy(prr->prr_name, name);
refcount_init(&prr->prr_refcount, 1);
LIST_INSERT_HEAD(&allprison_racct, prr, prr_next);
return (prr);
}
struct prison_racct *
prison_racct_find(const char *name)
{
struct prison_racct *prr;
sx_xlock(&allprison_lock);
prr = prison_racct_find_locked(name);
sx_xunlock(&allprison_lock);
return (prr);
}
void
prison_racct_hold(struct prison_racct *prr)
{
refcount_acquire(&prr->prr_refcount);
}
void
prison_racct_free(struct prison_racct *prr)
{
int old;
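/*
 * Fast path: if this is not the last reference, try a lockless
 * compare-and-set decrement.  If the CAS races and fails, fall through
 * to the locked slow path below.
 */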
old = prr->prr_refcount;
if (old > 1 && atomic_cmpset_int(&prr->prr_refcount, old, old - 1))
return;
sx_xlock(&allprison_lock);
if (refcount_release(&prr->prr_refcount)) {
racct_destroy(&prr->prr_racct);
LIST_REMOVE(prr, prr_next);
sx_xunlock(&allprison_lock);
free(prr, M_PRISON_RACCT);
return;
}
sx_xunlock(&allprison_lock);
}
#ifdef RACCT
static void
prison_racct_attach(struct prison *pr)
{
struct prison_racct *prr;
prr = prison_racct_find_locked(pr->pr_name);
KASSERT(prr != NULL, ("cannot find prison_racct"));
pr->pr_prison_racct = prr;
}
static void
prison_racct_detach(struct prison *pr)
{
prison_racct_free(pr->pr_prison_racct);
pr->pr_prison_racct = NULL;
}
#endif /* RACCT */
#ifdef DDB
static void
db_show_prison(struct prison *pr)
{
int fi;
#if defined(INET) || defined(INET6)
int ii;
#endif
unsigned jsf;
#ifdef INET6
char ip6buf[INET6_ADDRSTRLEN];
#endif
db_printf("prison %p:\n", pr);
db_printf(" jid = %d\n", pr->pr_id);
db_printf(" name = %s\n", pr->pr_name);
db_printf(" parent = %p\n", pr->pr_parent);
db_printf(" ref = %d\n", pr->pr_ref);
db_printf(" uref = %d\n", pr->pr_uref);
db_printf(" path = %s\n", pr->pr_path);
db_printf(" cpuset = %d\n", pr->pr_cpuset
? pr->pr_cpuset->cs_id : -1);
#ifdef VIMAGE
db_printf(" vnet = %p\n", pr->pr_vnet);
#endif
db_printf(" root = %p\n", pr->pr_root);
db_printf(" securelevel = %d\n", pr->pr_securelevel);
db_printf(" children.max = %d\n", pr->pr_childmax);
db_printf(" children.cur = %d\n", pr->pr_childcount);
db_printf(" child = %p\n", LIST_FIRST(&pr->pr_children));
db_printf(" sibling = %p\n", LIST_NEXT(pr, pr_sibling));
db_printf(" flags = 0x%x", pr->pr_flags);
for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
fi++)
if (pr_flag_names[fi] != NULL && (pr->pr_flags & (1 << fi)))
db_printf(" %s", pr_flag_names[fi]);
for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
fi++) {
jsf = pr->pr_flags &
(pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new);
db_printf(" %-16s= %s\n", pr_flag_jailsys[fi].name,
pr_flag_jailsys[fi].disable &&
(jsf == pr_flag_jailsys[fi].disable) ? "disable"
: (jsf == pr_flag_jailsys[fi].new) ? "new"
: "inherit");
}
db_printf(" allow = 0x%x", pr->pr_allow);
for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
fi++)
if (pr_allow_names[fi] != NULL && (pr->pr_allow & (1 << fi)))
db_printf(" %s", pr_allow_names[fi]);
db_printf("\n");
db_printf(" enforce_statfs = %d\n", pr->pr_enforce_statfs);
db_printf(" host.hostname = %s\n", pr->pr_hostname);
db_printf(" host.domainname = %s\n", pr->pr_domainname);
db_printf(" host.hostuuid = %s\n", pr->pr_hostuuid);
db_printf(" host.hostid = %lu\n", pr->pr_hostid);
#ifdef INET
db_printf(" ip4s = %d\n", pr->pr_ip4s);
for (ii = 0; ii < pr->pr_ip4s; ii++)
db_printf(" %s %s\n",
ii == 0 ? "ip4.addr =" : " ",
inet_ntoa(pr->pr_ip4[ii]));
#endif
#ifdef INET6
db_printf(" ip6s = %d\n", pr->pr_ip6s);
for (ii = 0; ii < pr->pr_ip6s; ii++)
db_printf(" %s %s\n",
ii == 0 ? "ip6.addr =" : " ",
ip6_sprintf(ip6buf, &pr->pr_ip6[ii]));
#endif
}
DB_SHOW_COMMAND(prison, db_show_prison_command)
{
struct prison *pr;
if (!have_addr) {
/*
* Show all prisons in the list, and prison0 which is not
* listed.
*/
db_show_prison(&prison0);
if (!db_pager_quit) {
TAILQ_FOREACH(pr, &allprison, pr_list) {
db_show_prison(pr);
if (db_pager_quit)
break;
}
}
return;
}
if (addr == 0)
pr = &prison0;
else {
/* Look for a prison with the ID and with references. */
TAILQ_FOREACH(pr, &allprison, pr_list)
if (pr->pr_id == addr && pr->pr_ref > 0)
break;
if (pr == NULL)
/* Look again, without requiring a reference. */
TAILQ_FOREACH(pr, &allprison, pr_list)
if (pr->pr_id == addr)
break;
if (pr == NULL)
/* Assume address points to a valid prison. */
pr = (struct prison *)addr;
}
db_show_prison(pr);
}
#endif /* DDB */
Index: head/sys/kern/kern_ktrace.c
===================================================================
--- head/sys/kern/kern_ktrace.c (revision 225616)
+++ head/sys/kern/kern_ktrace.c (revision 225617)
@@ -1,1224 +1,1224 @@
/*-
* Copyright (c) 1989, 1993
* The Regents of the University of California.
* Copyright (c) 2005 Robert N. M. Watson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_ktrace.c 8.2 (Berkeley) 9/23/93
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ktrace.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/unistd.h>
#include <sys/vnode.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/ktrace.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/syslog.h>
#include <sys/sysproto.h>
#include <security/mac/mac_framework.h>
/*
* The ktrace facility allows the tracing of certain key events in user space
* processes, such as system calls, signal delivery, context switches, and
* user-generated events using utrace(2). It works by streaming event
* records and data to a vnode associated with the process using the
* ktrace(2) system call. In general, records can be written directly from
* the context that generates the event. One important exception to this is
* during a context switch, where sleeping is not permitted. To handle this
* case, trace events are generated using in-kernel ktr_request records, and
* then delivered to disk at a convenient moment -- either immediately, the
* next traceable event, at system call return, or at process exit.
*
* When dealing with multiple threads or processes writing to the same event
* log, ordering guarantees are weak: specifically, if an event has multiple
* records (i.e., system call enter and return), they may be interlaced with
* records from another event. Process and thread ID information is provided
* in the record, and user applications can de-interlace events if required.
*/
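/*
 * A minimal userland sketch of driving this facility (illustrative only;
 * ktrace(2) and utrace(2) are the authoritative interfaces):
 *
 *	#include <sys/param.h>
 *	#include <sys/time.h>
 *	#include <sys/uio.h>
 *	#include <sys/ktrace.h>
 *	#include <unistd.h>
 *
 *	// Trace system calls of the current process into ktrace.out,
 *	// emit one user record, then stop tracing.
 *	ktrace("ktrace.out", KTROP_SET, KTRFAC_SYSCALL | KTRFAC_SYSRET,
 *	    getpid());
 *	utrace("hello", 5);
 *	ktrace(NULL, KTROP_CLEAR, KTRFAC_SYSCALL | KTRFAC_SYSRET, getpid());
 */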
static MALLOC_DEFINE(M_KTRACE, "KTRACE", "KTRACE");
#ifdef KTRACE
FEATURE(ktrace, "Kernel support for system-call tracing");
#ifndef KTRACE_REQUEST_POOL
#define KTRACE_REQUEST_POOL 100
#endif
struct ktr_request {
struct ktr_header ktr_header;
void *ktr_buffer;
union {
struct ktr_proc_ctor ktr_proc_ctor;
struct ktr_syscall ktr_syscall;
struct ktr_sysret ktr_sysret;
struct ktr_genio ktr_genio;
struct ktr_psig ktr_psig;
struct ktr_csw ktr_csw;
} ktr_data;
STAILQ_ENTRY(ktr_request) ktr_list;
};
static int data_lengths[] = {
0, /* none */
offsetof(struct ktr_syscall, ktr_args), /* KTR_SYSCALL */
sizeof(struct ktr_sysret), /* KTR_SYSRET */
0, /* KTR_NAMEI */
sizeof(struct ktr_genio), /* KTR_GENIO */
sizeof(struct ktr_psig), /* KTR_PSIG */
sizeof(struct ktr_csw), /* KTR_CSW */
0, /* KTR_USER */
0, /* KTR_STRUCT */
0, /* KTR_SYSCTL */
sizeof(struct ktr_proc_ctor), /* KTR_PROCCTOR */
0, /* KTR_PROCDTOR */
};
static STAILQ_HEAD(, ktr_request) ktr_free;
static SYSCTL_NODE(_kern, OID_AUTO, ktrace, CTLFLAG_RD, 0, "KTRACE options");
static u_int ktr_requestpool = KTRACE_REQUEST_POOL;
TUNABLE_INT("kern.ktrace.request_pool", &ktr_requestpool);
static u_int ktr_geniosize = PAGE_SIZE;
TUNABLE_INT("kern.ktrace.genio_size", &ktr_geniosize);
SYSCTL_UINT(_kern_ktrace, OID_AUTO, genio_size, CTLFLAG_RW, &ktr_geniosize,
0, "Maximum size of genio event payload");
static int print_message = 1;
static struct mtx ktrace_mtx;
static struct sx ktrace_sx;
static void ktrace_init(void *dummy);
static int sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS);
static u_int ktrace_resize_pool(u_int oldsize, u_int newsize);
static struct ktr_request *ktr_getrequest_entered(struct thread *td, int type);
static struct ktr_request *ktr_getrequest(int type);
static void ktr_submitrequest(struct thread *td, struct ktr_request *req);
static void ktr_freeproc(struct proc *p, struct ucred **uc,
struct vnode **vp);
static void ktr_freerequest(struct ktr_request *req);
static void ktr_freerequest_locked(struct ktr_request *req);
static void ktr_writerequest(struct thread *td, struct ktr_request *req);
static int ktrcanset(struct thread *,struct proc *);
static int ktrsetchildren(struct thread *,struct proc *,int,int,struct vnode *);
static int ktrops(struct thread *,struct proc *,int,int,struct vnode *);
static void ktrprocctor_entered(struct thread *, struct proc *);
/*
* ktrace itself generates events, such as context switches, which we do not
* wish to trace. Maintain a flag, TDP_INKTRACE, on each thread to determine
* whether or not it is in a region where tracing of events should be
* suppressed.
*/
static void
ktrace_enter(struct thread *td)
{
KASSERT(!(td->td_pflags & TDP_INKTRACE), ("ktrace_enter: flag set"));
td->td_pflags |= TDP_INKTRACE;
}
static void
ktrace_exit(struct thread *td)
{
KASSERT(td->td_pflags & TDP_INKTRACE, ("ktrace_exit: flag not set"));
td->td_pflags &= ~TDP_INKTRACE;
}
static void
ktrace_assert(struct thread *td)
{
KASSERT(td->td_pflags & TDP_INKTRACE, ("ktrace_assert: flag not set"));
}
static void
ktrace_init(void *dummy)
{
struct ktr_request *req;
int i;
mtx_init(&ktrace_mtx, "ktrace", NULL, MTX_DEF | MTX_QUIET);
sx_init(&ktrace_sx, "ktrace_sx");
STAILQ_INIT(&ktr_free);
for (i = 0; i < ktr_requestpool; i++) {
req = malloc(sizeof(struct ktr_request), M_KTRACE, M_WAITOK);
STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
}
}
SYSINIT(ktrace_init, SI_SUB_KTRACE, SI_ORDER_ANY, ktrace_init, NULL);
static int
sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS)
{
struct thread *td;
u_int newsize, oldsize, wantsize;
int error;
/* Handle easy read-only case first to avoid warnings from GCC. */
if (!req->newptr) {
oldsize = ktr_requestpool;
return (SYSCTL_OUT(req, &oldsize, sizeof(u_int)));
}
error = SYSCTL_IN(req, &wantsize, sizeof(u_int));
if (error)
return (error);
td = curthread;
ktrace_enter(td);
oldsize = ktr_requestpool;
newsize = ktrace_resize_pool(oldsize, wantsize);
ktrace_exit(td);
error = SYSCTL_OUT(req, &oldsize, sizeof(u_int));
if (error)
return (error);
if (wantsize > oldsize && newsize < wantsize)
return (ENOSPC);
return (0);
}
SYSCTL_PROC(_kern_ktrace, OID_AUTO, request_pool, CTLTYPE_UINT|CTLFLAG_RW,
&ktr_requestpool, 0, sysctl_kern_ktrace_request_pool, "IU",
"Pool buffer size for ktrace(1)");
static u_int
ktrace_resize_pool(u_int oldsize, u_int newsize)
{
STAILQ_HEAD(, ktr_request) ktr_new;
struct ktr_request *req;
int bound;
print_message = 1;
bound = newsize - oldsize;
if (bound == 0)
return (ktr_requestpool);
if (bound < 0) {
mtx_lock(&ktrace_mtx);
/* Shrink pool down to newsize if possible. */
while (bound++ < 0) {
req = STAILQ_FIRST(&ktr_free);
if (req == NULL)
break;
STAILQ_REMOVE_HEAD(&ktr_free, ktr_list);
ktr_requestpool--;
free(req, M_KTRACE);
}
} else {
/* Grow pool up to newsize. */
STAILQ_INIT(&ktr_new);
while (bound-- > 0) {
req = malloc(sizeof(struct ktr_request), M_KTRACE,
M_WAITOK);
STAILQ_INSERT_HEAD(&ktr_new, req, ktr_list);
}
mtx_lock(&ktrace_mtx);
STAILQ_CONCAT(&ktr_free, &ktr_new);
ktr_requestpool += (newsize - oldsize);
}
mtx_unlock(&ktrace_mtx);
return (ktr_requestpool);
}
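/*
 * Illustrative way to resize the pool from userland (sketch only; the
 * tunable and sysctl are declared above):
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *
 *	u_int newsize = 200;
 *	sysctlbyname("kern.ktrace.request_pool", NULL, NULL,
 *	    &newsize, sizeof(newsize));
 */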
/* ktr_getrequest() assumes that ktr_comm[] is the same size as td_name[]. */
CTASSERT(sizeof(((struct ktr_header *)NULL)->ktr_comm) ==
(sizeof((struct thread *)NULL)->td_name));
static struct ktr_request *
ktr_getrequest_entered(struct thread *td, int type)
{
struct ktr_request *req;
struct proc *p = td->td_proc;
int pm;
mtx_lock(&ktrace_mtx);
if (!KTRCHECK(td, type)) {
mtx_unlock(&ktrace_mtx);
return (NULL);
}
req = STAILQ_FIRST(&ktr_free);
if (req != NULL) {
STAILQ_REMOVE_HEAD(&ktr_free, ktr_list);
req->ktr_header.ktr_type = type;
if (p->p_traceflag & KTRFAC_DROP) {
req->ktr_header.ktr_type |= KTR_DROP;
p->p_traceflag &= ~KTRFAC_DROP;
}
mtx_unlock(&ktrace_mtx);
microtime(&req->ktr_header.ktr_time);
req->ktr_header.ktr_pid = p->p_pid;
req->ktr_header.ktr_tid = td->td_tid;
bcopy(td->td_name, req->ktr_header.ktr_comm,
sizeof(req->ktr_header.ktr_comm));
req->ktr_buffer = NULL;
req->ktr_header.ktr_len = 0;
} else {
p->p_traceflag |= KTRFAC_DROP;
pm = print_message;
print_message = 0;
mtx_unlock(&ktrace_mtx);
if (pm)
printf("Out of ktrace request objects.\n");
}
return (req);
}
static struct ktr_request *
ktr_getrequest(int type)
{
struct thread *td = curthread;
struct ktr_request *req;
ktrace_enter(td);
req = ktr_getrequest_entered(td, type);
if (req == NULL)
ktrace_exit(td);
return (req);
}
/*
* Some trace generation environments don't permit direct access to VFS,
* such as during a context switch where sleeping is not allowed. Under these
* circumstances, queue a request to the thread to be written asynchronously
* later.
*/
static void
ktr_enqueuerequest(struct thread *td, struct ktr_request *req)
{
mtx_lock(&ktrace_mtx);
STAILQ_INSERT_TAIL(&td->td_proc->p_ktr, req, ktr_list);
mtx_unlock(&ktrace_mtx);
}
/*
* Drain any pending ktrace records from the per-thread queue to disk. This
* is used both internally before committing other records, and also on
* system call return. We drain all the ones we can find at the time when
* drain is requested, but don't keep draining after that as those events
* may be approximately "after" the current event.
*/
static void
ktr_drain(struct thread *td)
{
struct ktr_request *queued_req;
STAILQ_HEAD(, ktr_request) local_queue;
ktrace_assert(td);
sx_assert(&ktrace_sx, SX_XLOCKED);
STAILQ_INIT(&local_queue);
if (!STAILQ_EMPTY(&td->td_proc->p_ktr)) {
mtx_lock(&ktrace_mtx);
STAILQ_CONCAT(&local_queue, &td->td_proc->p_ktr);
mtx_unlock(&ktrace_mtx);
while ((queued_req = STAILQ_FIRST(&local_queue))) {
STAILQ_REMOVE_HEAD(&local_queue, ktr_list);
ktr_writerequest(td, queued_req);
ktr_freerequest(queued_req);
}
}
}
/*
* Submit a trace record for immediate commit to disk -- to be used only
* where entering VFS is OK. First drain any pending records that may have
* been cached in the thread.
*/
static void
ktr_submitrequest(struct thread *td, struct ktr_request *req)
{
ktrace_assert(td);
sx_xlock(&ktrace_sx);
ktr_drain(td);
ktr_writerequest(td, req);
ktr_freerequest(req);
sx_xunlock(&ktrace_sx);
ktrace_exit(td);
}
static void
ktr_freerequest(struct ktr_request *req)
{
mtx_lock(&ktrace_mtx);
ktr_freerequest_locked(req);
mtx_unlock(&ktrace_mtx);
}
static void
ktr_freerequest_locked(struct ktr_request *req)
{
mtx_assert(&ktrace_mtx, MA_OWNED);
if (req->ktr_buffer != NULL)
free(req->ktr_buffer, M_KTRACE);
STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
}
/*
* Disable tracing for a process and release all associated resources.
* The caller is responsible for releasing a reference on the returned
* vnode and credentials.
*/
static void
ktr_freeproc(struct proc *p, struct ucred **uc, struct vnode **vp)
{
struct ktr_request *req;
PROC_LOCK_ASSERT(p, MA_OWNED);
mtx_assert(&ktrace_mtx, MA_OWNED);
*uc = p->p_tracecred;
p->p_tracecred = NULL;
if (vp != NULL)
*vp = p->p_tracevp;
p->p_tracevp = NULL;
p->p_traceflag = 0;
while ((req = STAILQ_FIRST(&p->p_ktr)) != NULL) {
STAILQ_REMOVE_HEAD(&p->p_ktr, ktr_list);
ktr_freerequest_locked(req);
}
}
void
ktrsyscall(code, narg, args)
int code, narg;
register_t args[];
{
struct ktr_request *req;
struct ktr_syscall *ktp;
size_t buflen;
char *buf = NULL;
buflen = sizeof(register_t) * narg;
if (buflen > 0) {
buf = malloc(buflen, M_KTRACE, M_WAITOK);
bcopy(args, buf, buflen);
}
req = ktr_getrequest(KTR_SYSCALL);
if (req == NULL) {
if (buf != NULL)
free(buf, M_KTRACE);
return;
}
ktp = &req->ktr_data.ktr_syscall;
ktp->ktr_code = code;
ktp->ktr_narg = narg;
if (buflen > 0) {
req->ktr_header.ktr_len = buflen;
req->ktr_buffer = buf;
}
ktr_submitrequest(curthread, req);
}
void
ktrsysret(code, error, retval)
int code, error;
register_t retval;
{
struct ktr_request *req;
struct ktr_sysret *ktp;
req = ktr_getrequest(KTR_SYSRET);
if (req == NULL)
return;
ktp = &req->ktr_data.ktr_sysret;
ktp->ktr_code = code;
ktp->ktr_error = error;
ktp->ktr_retval = retval; /* what about val2 ? */
ktr_submitrequest(curthread, req);
}
/*
* When a setuid process execs, disable tracing.
*
* XXX: We toss any pending asynchronous records.
*/
void
ktrprocexec(struct proc *p, struct ucred **uc, struct vnode **vp)
{
PROC_LOCK_ASSERT(p, MA_OWNED);
mtx_lock(&ktrace_mtx);
ktr_freeproc(p, uc, vp);
mtx_unlock(&ktrace_mtx);
}
/*
* When a process exits, drain per-process asynchronous trace records
* and disable tracing.
*/
void
ktrprocexit(struct thread *td)
{
struct ktr_request *req;
struct proc *p;
struct ucred *cred;
struct vnode *vp;
int vfslocked;
p = td->td_proc;
if (p->p_traceflag == 0)
return;
ktrace_enter(td);
req = ktr_getrequest_entered(td, KTR_PROCDTOR);
if (req != NULL)
ktr_enqueuerequest(td, req);
sx_xlock(&ktrace_sx);
ktr_drain(td);
sx_xunlock(&ktrace_sx);
PROC_LOCK(p);
mtx_lock(&ktrace_mtx);
ktr_freeproc(p, &cred, &vp);
mtx_unlock(&ktrace_mtx);
PROC_UNLOCK(p);
if (vp != NULL) {
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
}
if (cred != NULL)
crfree(cred);
ktrace_exit(td);
}
static void
ktrprocctor_entered(struct thread *td, struct proc *p)
{
struct ktr_proc_ctor *ktp;
struct ktr_request *req;
struct thread *td2;
ktrace_assert(td);
td2 = FIRST_THREAD_IN_PROC(p);
req = ktr_getrequest_entered(td2, KTR_PROCCTOR);
if (req == NULL)
return;
ktp = &req->ktr_data.ktr_proc_ctor;
ktp->sv_flags = p->p_sysent->sv_flags;
ktr_enqueuerequest(td2, req);
}
void
ktrprocctor(struct proc *p)
{
struct thread *td = curthread;
if ((p->p_traceflag & KTRFAC_MASK) == 0)
return;
ktrace_enter(td);
ktrprocctor_entered(td, p);
ktrace_exit(td);
}
/*
* When a process forks, enable tracing in the new process if needed.
*/
void
ktrprocfork(struct proc *p1, struct proc *p2)
{
PROC_LOCK(p1);
mtx_lock(&ktrace_mtx);
KASSERT(p2->p_tracevp == NULL, ("new process has a ktrace vnode"));
if (p1->p_traceflag & KTRFAC_INHERIT) {
p2->p_traceflag = p1->p_traceflag;
if ((p2->p_tracevp = p1->p_tracevp) != NULL) {
VREF(p2->p_tracevp);
KASSERT(p1->p_tracecred != NULL,
("ktrace vnode with no cred"));
p2->p_tracecred = crhold(p1->p_tracecred);
}
}
mtx_unlock(&ktrace_mtx);
PROC_UNLOCK(p1);
ktrprocctor(p2);
}
/*
* When a thread returns, drain any asynchronous records generated by the
* system call.
*/
void
ktruserret(struct thread *td)
{
ktrace_enter(td);
sx_xlock(&ktrace_sx);
ktr_drain(td);
sx_xunlock(&ktrace_sx);
ktrace_exit(td);
}
void
ktrnamei(path)
char *path;
{
struct ktr_request *req;
int namelen;
char *buf = NULL;
namelen = strlen(path);
if (namelen > 0) {
buf = malloc(namelen, M_KTRACE, M_WAITOK);
bcopy(path, buf, namelen);
}
req = ktr_getrequest(KTR_NAMEI);
if (req == NULL) {
if (buf != NULL)
free(buf, M_KTRACE);
return;
}
if (namelen > 0) {
req->ktr_header.ktr_len = namelen;
req->ktr_buffer = buf;
}
ktr_submitrequest(curthread, req);
}
void
ktrsysctl(name, namelen)
int *name;
u_int namelen;
{
struct ktr_request *req;
u_int mib[CTL_MAXNAME + 2];
char *mibname;
size_t mibnamelen;
int error;
/* Lookup name of mib. */
KASSERT(namelen <= CTL_MAXNAME, ("sysctl MIB too long"));
mib[0] = 0;
mib[1] = 1;
bcopy(name, mib + 2, namelen * sizeof(*name));
mibnamelen = 128;
mibname = malloc(mibnamelen, M_KTRACE, M_WAITOK);
error = kernel_sysctl(curthread, mib, namelen + 2, mibname, &mibnamelen,
NULL, 0, &mibnamelen, 0);
if (error) {
free(mibname, M_KTRACE);
return;
}
req = ktr_getrequest(KTR_SYSCTL);
if (req == NULL) {
free(mibname, M_KTRACE);
return;
}
req->ktr_header.ktr_len = mibnamelen;
req->ktr_buffer = mibname;
ktr_submitrequest(curthread, req);
}
void
ktrgenio(fd, rw, uio, error)
int fd;
enum uio_rw rw;
struct uio *uio;
int error;
{
struct ktr_request *req;
struct ktr_genio *ktg;
int datalen;
char *buf;
if (error) {
free(uio, M_IOV);
return;
}
uio->uio_offset = 0;
uio->uio_rw = UIO_WRITE;
datalen = imin(uio->uio_resid, ktr_geniosize);
buf = malloc(datalen, M_KTRACE, M_WAITOK);
error = uiomove(buf, datalen, uio);
free(uio, M_IOV);
if (error) {
free(buf, M_KTRACE);
return;
}
req = ktr_getrequest(KTR_GENIO);
if (req == NULL) {
free(buf, M_KTRACE);
return;
}
ktg = &req->ktr_data.ktr_genio;
ktg->ktr_fd = fd;
ktg->ktr_rw = rw;
req->ktr_header.ktr_len = datalen;
req->ktr_buffer = buf;
ktr_submitrequest(curthread, req);
}
void
ktrpsig(sig, action, mask, code)
int sig;
sig_t action;
sigset_t *mask;
int code;
{
struct thread *td = curthread;
struct ktr_request *req;
struct ktr_psig *kp;
req = ktr_getrequest(KTR_PSIG);
if (req == NULL)
return;
kp = &req->ktr_data.ktr_psig;
kp->signo = (char)sig;
kp->action = action;
kp->mask = *mask;
kp->code = code;
ktr_enqueuerequest(td, req);
ktrace_exit(td);
}
void
ktrcsw(out, user)
int out, user;
{
struct thread *td = curthread;
struct ktr_request *req;
struct ktr_csw *kc;
req = ktr_getrequest(KTR_CSW);
if (req == NULL)
return;
kc = &req->ktr_data.ktr_csw;
kc->out = out;
kc->user = user;
ktr_enqueuerequest(td, req);
ktrace_exit(td);
}
void
ktrstruct(name, data, datalen)
const char *name;
void *data;
size_t datalen;
{
struct ktr_request *req;
char *buf = NULL;
size_t buflen;
if (!data)
datalen = 0;
buflen = strlen(name) + 1 + datalen;
buf = malloc(buflen, M_KTRACE, M_WAITOK);
strcpy(buf, name);
bcopy(data, buf + strlen(name) + 1, datalen);
if ((req = ktr_getrequest(KTR_STRUCT)) == NULL) {
free(buf, M_KTRACE);
return;
}
req->ktr_buffer = buf;
req->ktr_header.ktr_len = buflen;
ktr_submitrequest(curthread, req);
}
#endif /* KTRACE */
/* Interface and common routines */
#ifndef _SYS_SYSPROTO_H_
struct ktrace_args {
char *fname;
int ops;
int facs;
int pid;
};
#endif
/* ARGSUSED */
int
-ktrace(td, uap)
+sys_ktrace(td, uap)
struct thread *td;
register struct ktrace_args *uap;
{
#ifdef KTRACE
register struct vnode *vp = NULL;
register struct proc *p;
struct pgrp *pg;
int facs = uap->facs & ~KTRFAC_ROOT;
int ops = KTROP(uap->ops);
int descend = uap->ops & KTRFLAG_DESCEND;
int nfound, ret = 0;
int flags, error = 0, vfslocked;
struct nameidata nd;
struct ucred *cred;
/*
* Need something to (un)trace.
*/
if (ops != KTROP_CLEARFILE && facs == 0)
return (EINVAL);
ktrace_enter(td);
if (ops != KTROP_CLEAR) {
/*
* an operation which requires a file argument.
*/
NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_USERSPACE,
uap->fname, td);
flags = FREAD | FWRITE | O_NOFOLLOW;
error = vn_open(&nd, &flags, 0, NULL);
if (error) {
ktrace_exit(td);
return (error);
}
vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
vp = nd.ni_vp;
VOP_UNLOCK(vp, 0);
if (vp->v_type != VREG) {
(void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
VFS_UNLOCK_GIANT(vfslocked);
ktrace_exit(td);
return (EACCES);
}
VFS_UNLOCK_GIANT(vfslocked);
}
/*
* Clear all uses of the tracefile.
*/
if (ops == KTROP_CLEARFILE) {
int vrele_count;
vrele_count = 0;
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_tracevp == vp) {
if (ktrcanset(td, p)) {
mtx_lock(&ktrace_mtx);
ktr_freeproc(p, &cred, NULL);
mtx_unlock(&ktrace_mtx);
vrele_count++;
crfree(cred);
} else
error = EPERM;
}
PROC_UNLOCK(p);
}
sx_sunlock(&allproc_lock);
if (vrele_count > 0) {
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
while (vrele_count-- > 0)
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
}
goto done;
}
/*
* do it
*/
sx_slock(&proctree_lock);
if (uap->pid < 0) {
/*
* by process group
*/
pg = pgfind(-uap->pid);
if (pg == NULL) {
sx_sunlock(&proctree_lock);
error = ESRCH;
goto done;
}
/*
* ktrops() may call vrele(). Lock pg_members
* by the proctree_lock rather than pg_mtx.
*/
PGRP_UNLOCK(pg);
nfound = 0;
LIST_FOREACH(p, &pg->pg_members, p_pglist) {
PROC_LOCK(p);
if (p->p_state == PRS_NEW ||
p_cansee(td, p) != 0) {
PROC_UNLOCK(p);
continue;
}
nfound++;
if (descend)
ret |= ktrsetchildren(td, p, ops, facs, vp);
else
ret |= ktrops(td, p, ops, facs, vp);
}
if (nfound == 0) {
sx_sunlock(&proctree_lock);
error = ESRCH;
goto done;
}
} else {
/*
* by pid
*/
p = pfind(uap->pid);
if (p == NULL)
error = ESRCH;
else
error = p_cansee(td, p);
if (error) {
if (p != NULL)
PROC_UNLOCK(p);
sx_sunlock(&proctree_lock);
goto done;
}
if (descend)
ret |= ktrsetchildren(td, p, ops, facs, vp);
else
ret |= ktrops(td, p, ops, facs, vp);
}
sx_sunlock(&proctree_lock);
if (!ret)
error = EPERM;
done:
if (vp != NULL) {
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
(void) vn_close(vp, FWRITE, td->td_ucred, td);
VFS_UNLOCK_GIANT(vfslocked);
}
ktrace_exit(td);
return (error);
#else /* !KTRACE */
return (ENOSYS);
#endif /* KTRACE */
}
/* ARGSUSED */
int
-utrace(td, uap)
+sys_utrace(td, uap)
struct thread *td;
register struct utrace_args *uap;
{
#ifdef KTRACE
struct ktr_request *req;
void *cp;
int error;
if (!KTRPOINT(td, KTR_USER))
return (0);
if (uap->len > KTR_USER_MAXLEN)
return (EINVAL);
cp = malloc(uap->len, M_KTRACE, M_WAITOK);
error = copyin(uap->addr, cp, uap->len);
if (error) {
free(cp, M_KTRACE);
return (error);
}
req = ktr_getrequest(KTR_USER);
if (req == NULL) {
free(cp, M_KTRACE);
return (ENOMEM);
}
req->ktr_buffer = cp;
req->ktr_header.ktr_len = uap->len;
ktr_submitrequest(td, req);
return (0);
#else /* !KTRACE */
return (ENOSYS);
#endif /* KTRACE */
}
#ifdef KTRACE
static int
ktrops(td, p, ops, facs, vp)
struct thread *td;
struct proc *p;
int ops, facs;
struct vnode *vp;
{
struct vnode *tracevp = NULL;
struct ucred *tracecred = NULL;
PROC_LOCK_ASSERT(p, MA_OWNED);
if (!ktrcanset(td, p)) {
PROC_UNLOCK(p);
return (0);
}
if (p->p_flag & P_WEXIT) {
/* If the process is exiting, just ignore it. */
PROC_UNLOCK(p);
return (1);
}
mtx_lock(&ktrace_mtx);
if (ops == KTROP_SET) {
if (p->p_tracevp != vp) {
/*
* if trace file already in use, relinquish below
*/
tracevp = p->p_tracevp;
VREF(vp);
p->p_tracevp = vp;
}
if (p->p_tracecred != td->td_ucred) {
tracecred = p->p_tracecred;
p->p_tracecred = crhold(td->td_ucred);
}
p->p_traceflag |= facs;
if (priv_check(td, PRIV_KTRACE) == 0)
p->p_traceflag |= KTRFAC_ROOT;
} else {
/* KTROP_CLEAR */
if (((p->p_traceflag &= ~facs) & KTRFAC_MASK) == 0)
/* no more tracing */
ktr_freeproc(p, &tracecred, &tracevp);
}
mtx_unlock(&ktrace_mtx);
if ((p->p_traceflag & KTRFAC_MASK) != 0)
ktrprocctor_entered(td, p);
PROC_UNLOCK(p);
if (tracevp != NULL) {
int vfslocked;
vfslocked = VFS_LOCK_GIANT(tracevp->v_mount);
vrele(tracevp);
VFS_UNLOCK_GIANT(vfslocked);
}
if (tracecred != NULL)
crfree(tracecred);
return (1);
}
static int
ktrsetchildren(td, top, ops, facs, vp)
struct thread *td;
struct proc *top;
int ops, facs;
struct vnode *vp;
{
register struct proc *p;
register int ret = 0;
p = top;
PROC_LOCK_ASSERT(p, MA_OWNED);
sx_assert(&proctree_lock, SX_LOCKED);
for (;;) {
ret |= ktrops(td, p, ops, facs, vp);
/*
* If this process has children, descend to them next,
* otherwise do any siblings, and if done with this level,
* follow back up the tree (but not past top).
*/
if (!LIST_EMPTY(&p->p_children))
p = LIST_FIRST(&p->p_children);
else for (;;) {
if (p == top)
return (ret);
if (LIST_NEXT(p, p_sibling)) {
p = LIST_NEXT(p, p_sibling);
break;
}
p = p->p_pptr;
}
PROC_LOCK(p);
}
/*NOTREACHED*/
}
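/*
 * The loop above is a non-recursive pre-order walk of the process tree
 * rooted at "top".  A generic sketch of the same pattern (hypothetical
 * node type, illustrative only):
 *
 *	node = top;
 *	for (;;) {
 *		visit(node);
 *		if (node->first_child != NULL)
 *			node = node->first_child;
 *		else for (;;) {
 *			if (node == top)
 *				return;
 *			if (node->next_sibling != NULL) {
 *				node = node->next_sibling;
 *				break;
 *			}
 *			node = node->parent;
 *		}
 *	}
 */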
static void
ktr_writerequest(struct thread *td, struct ktr_request *req)
{
struct ktr_header *kth;
struct vnode *vp;
struct proc *p;
struct ucred *cred;
struct uio auio;
struct iovec aiov[3];
struct mount *mp;
int datalen, buflen, vrele_count;
int error, vfslocked;
/*
* We hold the vnode and credential for use in I/O in case ktrace is
* disabled on the process as we write out the request.
*
* XXXRW: This is not ideal: we could end up performing a write after
* the vnode has been closed.
*/
mtx_lock(&ktrace_mtx);
vp = td->td_proc->p_tracevp;
cred = td->td_proc->p_tracecred;
/*
* If vp is NULL, the vp has been cleared out from under this
* request, so just drop it. Make sure the credential and vnode are
* in sync: we should have both or neither.
*/
if (vp == NULL) {
KASSERT(cred == NULL, ("ktr_writerequest: cred != NULL"));
mtx_unlock(&ktrace_mtx);
return;
}
VREF(vp);
KASSERT(cred != NULL, ("ktr_writerequest: cred == NULL"));
crhold(cred);
mtx_unlock(&ktrace_mtx);
kth = &req->ktr_header;
KASSERT(((u_short)kth->ktr_type & ~KTR_DROP) <
sizeof(data_lengths) / sizeof(data_lengths[0]),
("data_lengths array overflow"));
datalen = data_lengths[(u_short)kth->ktr_type & ~KTR_DROP];
buflen = kth->ktr_len;
auio.uio_iov = &aiov[0];
auio.uio_offset = 0;
auio.uio_segflg = UIO_SYSSPACE;
auio.uio_rw = UIO_WRITE;
aiov[0].iov_base = (caddr_t)kth;
aiov[0].iov_len = sizeof(struct ktr_header);
auio.uio_resid = sizeof(struct ktr_header);
auio.uio_iovcnt = 1;
auio.uio_td = td;
if (datalen != 0) {
aiov[1].iov_base = (caddr_t)&req->ktr_data;
aiov[1].iov_len = datalen;
auio.uio_resid += datalen;
auio.uio_iovcnt++;
kth->ktr_len += datalen;
}
if (buflen != 0) {
KASSERT(req->ktr_buffer != NULL, ("ktrace: nothing to write"));
aiov[auio.uio_iovcnt].iov_base = req->ktr_buffer;
aiov[auio.uio_iovcnt].iov_len = buflen;
auio.uio_resid += buflen;
auio.uio_iovcnt++;
}
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_start_write(vp, &mp, V_WAIT);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
#ifdef MAC
error = mac_vnode_check_write(cred, NOCRED, vp);
if (error == 0)
#endif
error = VOP_WRITE(vp, &auio, IO_UNIT | IO_APPEND, cred);
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
crfree(cred);
if (!error) {
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
return;
}
VFS_UNLOCK_GIANT(vfslocked);
/*
* If error encountered, give up tracing on this vnode. We defer
* all the vrele()'s on the vnode until after we are finished walking
* the various lists to avoid needlessly holding locks.
* NB: at this point we still hold the vnode reference that must
* not go away as we need the valid vnode to compare with. Thus let
* vrele_count start at 1 and the reference will be freed
* by the loop at the end after our last use of vp.
*/
log(LOG_NOTICE, "ktrace write failed, errno %d, tracing stopped\n",
error);
vrele_count = 1;
/*
* First, clear this vnode from being used by any processes in the
* system.
* XXX - If one process gets an EPERM writing to the vnode, should
* we really do this? Other processes might have suitable
* credentials for the operation.
*/
cred = NULL;
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_tracevp == vp) {
mtx_lock(&ktrace_mtx);
ktr_freeproc(p, &cred, NULL);
mtx_unlock(&ktrace_mtx);
vrele_count++;
}
PROC_UNLOCK(p);
if (cred != NULL) {
crfree(cred);
cred = NULL;
}
}
sx_sunlock(&allproc_lock);
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
while (vrele_count-- > 0)
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
}
/*
* Return true if caller has permission to set the ktracing state
* of target. Essentially, the target can't possess any
* more permissions than the caller. KTRFAC_ROOT signifies that
* root previously set the tracing status on the target process, and
* so, only root may further change it.
*/
static int
ktrcanset(td, targetp)
struct thread *td;
struct proc *targetp;
{
PROC_LOCK_ASSERT(targetp, MA_OWNED);
if (targetp->p_traceflag & KTRFAC_ROOT &&
priv_check(td, PRIV_KTRACE))
return (0);
if (p_candebug(td, targetp) != 0)
return (0);
return (1);
}
#endif /* KTRACE */
Index: head/sys/kern/kern_linker.c
===================================================================
--- head/sys/kern/kern_linker.c (revision 225616)
+++ head/sys/kern/kern_linker.c (revision 225617)
@@ -1,2170 +1,2170 @@
/*-
* Copyright (c) 1997-2000 Doug Rabson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ddb.h"
#include "opt_hwpmc_hooks.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/sysproto.h>
#include <sys/sysent.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sx.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/linker.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
#include <sys/libkern.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <net/vnet.h>
#include <security/mac/mac_framework.h>
#include "linker_if.h"
#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif
#ifdef KLD_DEBUG
int kld_debug = 0;
SYSCTL_INT(_debug, OID_AUTO, kld_debug, CTLFLAG_RW,
&kld_debug, 0, "Set various levels of KLD debug");
#endif
#define KLD_LOCK() sx_xlock(&kld_sx)
#define KLD_UNLOCK() sx_xunlock(&kld_sx)
#define KLD_DOWNGRADE() sx_downgrade(&kld_sx)
#define KLD_LOCK_READ() sx_slock(&kld_sx)
#define KLD_UNLOCK_READ() sx_sunlock(&kld_sx)
#define KLD_LOCKED() sx_xlocked(&kld_sx)
#define KLD_LOCK_ASSERT() do { \
if (!cold) \
sx_assert(&kld_sx, SX_XLOCKED); \
} while (0)
/*
* static char *linker_search_path(const char *name, struct mod_depend
* *verinfo);
*/
static const char *linker_basename(const char *path);
/*
* Find a currently loaded file given its filename.
*/
static linker_file_t linker_find_file_by_name(const char* _filename);
/*
* Find a currently loaded file given its file id.
*/
static linker_file_t linker_find_file_by_id(int _fileid);
/* Metadata from the static kernel */
SET_DECLARE(modmetadata_set, struct mod_metadata);
MALLOC_DEFINE(M_LINKER, "linker", "kernel linker");
linker_file_t linker_kernel_file;
static struct sx kld_sx; /* kernel linker lock */
/*
* Load counter used by clients to determine if a linker file has been
* re-loaded. This counter is incremented for each file load.
*/
static int loadcnt;
static linker_class_list_t classes;
static linker_file_list_t linker_files;
static int next_file_id = 1;
static int linker_no_more_classes = 0;
#define LINKER_GET_NEXT_FILE_ID(a) do { \
linker_file_t lftmp; \
\
KLD_LOCK_ASSERT(); \
retry: \
TAILQ_FOREACH(lftmp, &linker_files, link) { \
if (next_file_id == lftmp->id) { \
next_file_id++; \
goto retry; \
} \
} \
(a) = next_file_id; \
} while(0)
/* XXX wrong name; we're looking at version provision tags here, not modules */
typedef TAILQ_HEAD(, modlist) modlisthead_t;
struct modlist {
TAILQ_ENTRY(modlist) link; /* chain together all modules */
linker_file_t container;
const char *name;
int version;
};
typedef struct modlist *modlist_t;
static modlisthead_t found_modules;
static int linker_file_add_dependency(linker_file_t file,
linker_file_t dep);
static caddr_t linker_file_lookup_symbol_internal(linker_file_t file,
const char* name, int deps);
static int linker_load_module(const char *kldname,
const char *modname, struct linker_file *parent,
struct mod_depend *verinfo, struct linker_file **lfpp);
static modlist_t modlist_lookup2(const char *name, struct mod_depend *verinfo);
static char *
linker_strdup(const char *str)
{
char *result;
if ((result = malloc((strlen(str) + 1), M_LINKER, M_WAITOK)) != NULL)
strcpy(result, str);
return (result);
}
static void
linker_init(void *arg)
{
sx_init(&kld_sx, "kernel linker");
TAILQ_INIT(&classes);
TAILQ_INIT(&linker_files);
}
SYSINIT(linker, SI_SUB_KLD, SI_ORDER_FIRST, linker_init, 0);
static void
linker_stop_class_add(void *arg)
{
linker_no_more_classes = 1;
}
SYSINIT(linker_class, SI_SUB_KLD, SI_ORDER_ANY, linker_stop_class_add, NULL);
int
linker_add_class(linker_class_t lc)
{
/*
* We disallow any class registration past SI_ORDER_ANY
* of SI_SUB_KLD. We bump the reference count to keep the
* ops from being freed.
*/
if (linker_no_more_classes == 1)
return (EPERM);
kobj_class_compile((kobj_class_t) lc);
((kobj_class_t)lc)->refs++; /* XXX: kobj_mtx */
TAILQ_INSERT_TAIL(&classes, lc, link);
return (0);
}
static void
linker_file_sysinit(linker_file_t lf)
{
struct sysinit **start, **stop, **sipp, **xipp, *save;
KLD_DPF(FILE, ("linker_file_sysinit: calling SYSINITs for %s\n",
lf->filename));
if (linker_file_lookup_set(lf, "sysinit_set", &start, &stop, NULL) != 0)
return;
/*
* Perform a bubble sort of the system initialization objects by
* their subsystem (primary key) and order (secondary key).
*
* Since some things care about execution order, this is the operation
* which ensures continued function.
*/
for (sipp = start; sipp < stop; sipp++) {
for (xipp = sipp + 1; xipp < stop; xipp++) {
if ((*sipp)->subsystem < (*xipp)->subsystem ||
((*sipp)->subsystem == (*xipp)->subsystem &&
(*sipp)->order <= (*xipp)->order))
continue; /* skip */
save = *sipp;
*sipp = *xipp;
*xipp = save;
}
}
/*
* Traverse the (now) ordered list of system initialization tasks.
* Perform each task, and continue on to the next task.
*/
mtx_lock(&Giant);
for (sipp = start; sipp < stop; sipp++) {
if ((*sipp)->subsystem == SI_SUB_DUMMY)
continue; /* skip dummy task(s) */
/* Call function */
(*((*sipp)->func)) ((*sipp)->udata);
}
mtx_unlock(&Giant);
}
static void
linker_file_sysuninit(linker_file_t lf)
{
struct sysinit **start, **stop, **sipp, **xipp, *save;
KLD_DPF(FILE, ("linker_file_sysuninit: calling SYSUNINITs for %s\n",
lf->filename));
if (linker_file_lookup_set(lf, "sysuninit_set", &start, &stop,
NULL) != 0)
return;
/*
* Perform a reverse bubble sort of the system initialization objects
* by their subsystem (primary key) and order (secondary key).
*
* Since some things care about execution order, this is the operation
* which ensures continued function.
*/
for (sipp = start; sipp < stop; sipp++) {
for (xipp = sipp + 1; xipp < stop; xipp++) {
if ((*sipp)->subsystem > (*xipp)->subsystem ||
((*sipp)->subsystem == (*xipp)->subsystem &&
(*sipp)->order >= (*xipp)->order))
continue; /* skip */
save = *sipp;
*sipp = *xipp;
*xipp = save;
}
}
/*
* Traverse the (now) ordered list of system initialization tasks.
* Perform each task, and continue on to the next task.
*/
mtx_lock(&Giant);
for (sipp = start; sipp < stop; sipp++) {
if ((*sipp)->subsystem == SI_SUB_DUMMY)
continue; /* skip dummy task(s) */
/* Call function */
(*((*sipp)->func)) ((*sipp)->udata);
}
mtx_unlock(&Giant);
}
static void
linker_file_register_sysctls(linker_file_t lf)
{
struct sysctl_oid **start, **stop, **oidp;
KLD_DPF(FILE,
("linker_file_register_sysctls: registering SYSCTLs for %s\n",
lf->filename));
if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0)
return;
sysctl_lock();
for (oidp = start; oidp < stop; oidp++)
sysctl_register_oid(*oidp);
sysctl_unlock();
}
static void
linker_file_unregister_sysctls(linker_file_t lf)
{
struct sysctl_oid **start, **stop, **oidp;
KLD_DPF(FILE, ("linker_file_unregister_sysctls: registering SYSCTLs"
" for %s\n", lf->filename));
if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0)
return;
sysctl_lock();
for (oidp = start; oidp < stop; oidp++)
sysctl_unregister_oid(*oidp);
sysctl_unlock();
}
static int
linker_file_register_modules(linker_file_t lf)
{
struct mod_metadata **start, **stop, **mdp;
const moduledata_t *moddata;
int first_error, error;
KLD_DPF(FILE, ("linker_file_register_modules: registering modules"
" in %s\n", lf->filename));
if (linker_file_lookup_set(lf, "modmetadata_set", &start,
&stop, NULL) != 0) {
/*
* This fallback should be unnecessary, but if we get booted
* from boot2 instead of loader and we are missing our
* metadata then we have to try the best we can.
*/
if (lf == linker_kernel_file) {
start = SET_BEGIN(modmetadata_set);
stop = SET_LIMIT(modmetadata_set);
} else
return (0);
}
first_error = 0;
for (mdp = start; mdp < stop; mdp++) {
if ((*mdp)->md_type != MDT_MODULE)
continue;
moddata = (*mdp)->md_data;
KLD_DPF(FILE, ("Registering module %s in %s\n",
moddata->name, lf->filename));
error = module_register(moddata, lf);
if (error) {
printf("Module %s failed to register: %d\n",
moddata->name, error);
if (first_error == 0)
first_error = error;
}
}
return (first_error);
}
static void
linker_init_kernel_modules(void)
{
linker_file_register_modules(linker_kernel_file);
}
SYSINIT(linker_kernel, SI_SUB_KLD, SI_ORDER_ANY, linker_init_kernel_modules,
0);
static int
linker_load_file(const char *filename, linker_file_t *result)
{
linker_class_t lc;
linker_file_t lf;
int foundfile, error;
/* Refuse to load modules if securelevel raised */
if (prison0.pr_securelevel > 0)
return (EPERM);
KLD_LOCK_ASSERT();
lf = linker_find_file_by_name(filename);
if (lf) {
KLD_DPF(FILE, ("linker_load_file: file %s is already loaded,"
" incrementing refs\n", filename));
*result = lf;
lf->refs++;
return (0);
}
foundfile = 0;
error = 0;
/*
* We do not need to protect (lock) classes here because there is
* no class registration past startup (SI_SUB_KLD, SI_ORDER_ANY)
* and there is no class deregistration mechanism at this time.
*/
TAILQ_FOREACH(lc, &classes, link) {
KLD_DPF(FILE, ("linker_load_file: trying to load %s\n",
filename));
error = LINKER_LOAD_FILE(lc, filename, &lf);
/*
* If we got something other than ENOENT, then it exists but
* we cannot load it for some other reason.
*/
if (error != ENOENT)
foundfile = 1;
if (lf) {
error = linker_file_register_modules(lf);
if (error == EEXIST) {
linker_file_unload(lf, LINKER_UNLOAD_FORCE);
return (error);
}
KLD_UNLOCK();
linker_file_register_sysctls(lf);
linker_file_sysinit(lf);
KLD_LOCK();
lf->flags |= LINKER_FILE_LINKED;
*result = lf;
return (0);
}
}
/*
* Less than ideal, but tells the user whether it failed to load or
* the module was not found.
*/
if (foundfile) {
/*
* If the file type has not been recognized by the last try
* printout a message before to fail.
*/
if (error == ENOSYS)
printf("linker_load_file: Unsupported file type\n");
/*
* Format not recognized or otherwise unloadable.
* When loading a module that is statically built into
* the kernel EEXIST percolates back up as the return
* value. Preserve this so that apps like sysinstall
* can recognize this special case and not post bogus
* dialog boxes.
*/
if (error != EEXIST)
error = ENOEXEC;
} else
error = ENOENT; /* Nothing found */
return (error);
}
int
linker_reference_module(const char *modname, struct mod_depend *verinfo,
linker_file_t *result)
{
modlist_t mod;
int error;
KLD_LOCK();
if ((mod = modlist_lookup2(modname, verinfo)) != NULL) {
*result = mod->container;
(*result)->refs++;
KLD_UNLOCK();
return (0);
}
error = linker_load_module(NULL, modname, NULL, verinfo, result);
KLD_UNLOCK();
return (error);
}
int
linker_release_module(const char *modname, struct mod_depend *verinfo,
linker_file_t lf)
{
modlist_t mod;
int error;
KLD_LOCK();
if (lf == NULL) {
KASSERT(modname != NULL,
("linker_release_module: no file or name"));
mod = modlist_lookup2(modname, verinfo);
if (mod == NULL) {
KLD_UNLOCK();
return (ESRCH);
}
lf = mod->container;
} else
KASSERT(modname == NULL && verinfo == NULL,
("linker_release_module: both file and name"));
error = linker_file_unload(lf, LINKER_UNLOAD_NORMAL);
KLD_UNLOCK();
return (error);
}
static linker_file_t
linker_find_file_by_name(const char *filename)
{
linker_file_t lf;
char *koname;
koname = malloc(strlen(filename) + 4, M_LINKER, M_WAITOK);
sprintf(koname, "%s.ko", filename);
KLD_LOCK_ASSERT();
TAILQ_FOREACH(lf, &linker_files, link) {
if (strcmp(lf->filename, koname) == 0)
break;
if (strcmp(lf->filename, filename) == 0)
break;
}
free(koname, M_LINKER);
return (lf);
}
static linker_file_t
linker_find_file_by_id(int fileid)
{
linker_file_t lf;
KLD_LOCK_ASSERT();
TAILQ_FOREACH(lf, &linker_files, link)
if (lf->id == fileid && lf->flags & LINKER_FILE_LINKED)
break;
return (lf);
}
int
linker_file_foreach(linker_predicate_t *predicate, void *context)
{
linker_file_t lf;
int retval = 0;
KLD_LOCK();
TAILQ_FOREACH(lf, &linker_files, link) {
retval = predicate(lf, context);
if (retval != 0)
break;
}
KLD_UNLOCK();
return (retval);
}
linker_file_t
linker_make_file(const char *pathname, linker_class_t lc)
{
linker_file_t lf;
const char *filename;
KLD_LOCK_ASSERT();
filename = linker_basename(pathname);
KLD_DPF(FILE, ("linker_make_file: new file, filename='%s' for pathname='%s'\n", filename, pathname));
lf = (linker_file_t)kobj_create((kobj_class_t)lc, M_LINKER, M_WAITOK);
if (lf == NULL)
return (NULL);
lf->refs = 1;
lf->userrefs = 0;
lf->flags = 0;
lf->filename = linker_strdup(filename);
lf->pathname = linker_strdup(pathname);
LINKER_GET_NEXT_FILE_ID(lf->id);
lf->ndeps = 0;
lf->deps = NULL;
lf->loadcnt = ++loadcnt;
lf->sdt_probes = NULL;
lf->sdt_nprobes = 0;
STAILQ_INIT(&lf->common);
TAILQ_INIT(&lf->modules);
TAILQ_INSERT_TAIL(&linker_files, lf, link);
return (lf);
}
int
linker_file_unload(linker_file_t file, int flags)
{
module_t mod, next;
modlist_t ml, nextml;
struct common_symbol *cp;
int error, i;
/* Refuse to unload modules if securelevel raised. */
if (prison0.pr_securelevel > 0)
return (EPERM);
KLD_LOCK_ASSERT();
KLD_DPF(FILE, ("linker_file_unload: lf->refs=%d\n", file->refs));
/* Easy case of just dropping a reference. */
if (file->refs > 1) {
file->refs--;
return (0);
}
KLD_DPF(FILE, ("linker_file_unload: file is unloading,"
" informing modules\n"));
/*
* Quiesce all the modules to give them a chance to veto the unload.
*/
MOD_SLOCK;
for (mod = TAILQ_FIRST(&file->modules); mod;
mod = module_getfnext(mod)) {
error = module_quiesce(mod);
if (error != 0 && flags != LINKER_UNLOAD_FORCE) {
KLD_DPF(FILE, ("linker_file_unload: module %s"
" vetoed unload\n", module_getname(mod)));
/*
* XXX: Do we need to tell all the quiesced modules
* that they can resume work now via a new module
* event?
*/
MOD_SUNLOCK;
return (error);
}
}
MOD_SUNLOCK;
/*
* Inform any modules associated with this file that they are
* being unloaded.
*/
MOD_XLOCK;
for (mod = TAILQ_FIRST(&file->modules); mod; mod = next) {
next = module_getfnext(mod);
MOD_XUNLOCK;
/*
* Give the module a chance to veto the unload.
*/
if ((error = module_unload(mod)) != 0) {
KLD_DPF(FILE, ("linker_file_unload: module %s"
" failed unload\n", module_getname(mod)));
return (error);
}
MOD_XLOCK;
module_release(mod);
}
MOD_XUNLOCK;
TAILQ_FOREACH_SAFE(ml, &found_modules, link, nextml) {
if (ml->container == file) {
TAILQ_REMOVE(&found_modules, ml, link);
free(ml, M_LINKER);
}
}
/*
* Don't try to run SYSUNINITs if we are unloaded due to a
* link error.
*/
if (file->flags & LINKER_FILE_LINKED) {
file->flags &= ~LINKER_FILE_LINKED;
KLD_UNLOCK();
linker_file_sysuninit(file);
linker_file_unregister_sysctls(file);
KLD_LOCK();
}
TAILQ_REMOVE(&linker_files, file, link);
if (file->deps) {
for (i = 0; i < file->ndeps; i++)
linker_file_unload(file->deps[i], flags);
free(file->deps, M_LINKER);
file->deps = NULL;
}
while ((cp = STAILQ_FIRST(&file->common)) != NULL) {
STAILQ_REMOVE_HEAD(&file->common, link);
free(cp, M_LINKER);
}
LINKER_UNLOAD(file);
if (file->filename) {
free(file->filename, M_LINKER);
file->filename = NULL;
}
if (file->pathname) {
free(file->pathname, M_LINKER);
file->pathname = NULL;
}
kobj_delete((kobj_t) file, M_LINKER);
return (0);
}
int
linker_ctf_get(linker_file_t file, linker_ctf_t *lc)
{
return (LINKER_CTF_GET(file, lc));
}
static int
linker_file_add_dependency(linker_file_t file, linker_file_t dep)
{
linker_file_t *newdeps;
KLD_LOCK_ASSERT();
newdeps = malloc((file->ndeps + 1) * sizeof(linker_file_t *),
M_LINKER, M_WAITOK | M_ZERO);
if (newdeps == NULL)
return (ENOMEM);
if (file->deps) {
bcopy(file->deps, newdeps,
file->ndeps * sizeof(linker_file_t *));
free(file->deps, M_LINKER);
}
file->deps = newdeps;
file->deps[file->ndeps] = dep;
file->ndeps++;
KLD_DPF(FILE, ("linker_file_add_dependency:"
" adding %s as dependency for %s\n",
dep->filename, file->filename));
return (0);
}
/*
* Locate a linker set and its contents. This is a helper function to avoid
* linker_if.h exposure elsewhere. Note: firstp and lastp are really void **.
* This function is used in this file so we can avoid having lots of (void **)
* casts.
*/
int
linker_file_lookup_set(linker_file_t file, const char *name,
void *firstp, void *lastp, int *countp)
{
int error, locked;
locked = KLD_LOCKED();
if (!locked)
KLD_LOCK();
error = LINKER_LOOKUP_SET(file, name, firstp, lastp, countp);
if (!locked)
KLD_UNLOCK();
return (error);
}
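/*
 * A hedged usage sketch for linker_file_lookup_set() above (not part of
 * this change); "foo_set", struct foo_desc and do_something() are
 * hypothetical names, but the pattern mirrors the sysinit_set/sysctl_set
 * lookups elsewhere in this file:
 *
 *	struct foo_desc **start, **stop, **iter;
 *
 *	if (linker_file_lookup_set(lf, "foo_set", &start, &stop, NULL) == 0)
 *		for (iter = start; iter < stop; iter++)
 *			do_something(*iter);
 */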
/*
* List all functions in a file.
*/
int
linker_file_function_listall(linker_file_t lf,
linker_function_nameval_callback_t callback_func, void *arg)
{
return (LINKER_EACH_FUNCTION_NAMEVAL(lf, callback_func, arg));
}
caddr_t
linker_file_lookup_symbol(linker_file_t file, const char *name, int deps)
{
caddr_t sym;
int locked;
locked = KLD_LOCKED();
if (!locked)
KLD_LOCK();
sym = linker_file_lookup_symbol_internal(file, name, deps);
if (!locked)
KLD_UNLOCK();
return (sym);
}
static caddr_t
linker_file_lookup_symbol_internal(linker_file_t file, const char *name,
int deps)
{
c_linker_sym_t sym;
linker_symval_t symval;
caddr_t address;
size_t common_size = 0;
int i;
KLD_LOCK_ASSERT();
KLD_DPF(SYM, ("linker_file_lookup_symbol: file=%p, name=%s, deps=%d\n",
file, name, deps));
if (LINKER_LOOKUP_SYMBOL(file, name, &sym) == 0) {
LINKER_SYMBOL_VALUES(file, sym, &symval);
if (symval.value == 0)
/*
* For commons, first look them up in the
* dependencies and only allocate space if not found
* there.
*/
common_size = symval.size;
else {
KLD_DPF(SYM, ("linker_file_lookup_symbol: symbol"
".value=%p\n", symval.value));
return (symval.value);
}
}
if (deps) {
for (i = 0; i < file->ndeps; i++) {
address = linker_file_lookup_symbol_internal(
file->deps[i], name, 0);
if (address) {
KLD_DPF(SYM, ("linker_file_lookup_symbol:"
" deps value=%p\n", address));
return (address);
}
}
}
if (common_size > 0) {
/*
* This is a common symbol which was not found in the
* dependencies. We maintain a simple common symbol table in
* the file object.
*/
struct common_symbol *cp;
STAILQ_FOREACH(cp, &file->common, link) {
if (strcmp(cp->name, name) == 0) {
KLD_DPF(SYM, ("linker_file_lookup_symbol:"
" old common value=%p\n", cp->address));
return (cp->address);
}
}
/*
* Round the symbol size up to align.
*/
common_size = (common_size + sizeof(int) - 1) & -sizeof(int);
cp = malloc(sizeof(struct common_symbol)
+ common_size + strlen(name) + 1, M_LINKER,
M_WAITOK | M_ZERO);
cp->address = (caddr_t)(cp + 1);
cp->name = cp->address + common_size;
strcpy(cp->name, name);
bzero(cp->address, common_size);
STAILQ_INSERT_TAIL(&file->common, cp, link);
KLD_DPF(SYM, ("linker_file_lookup_symbol: new common"
" value=%p\n", cp->address));
return (cp->address);
}
KLD_DPF(SYM, ("linker_file_lookup_symbol: fail\n"));
return (0);
}
/*
* Both DDB and stack(9) rely on the kernel linker to provide forward and
* backward lookup of symbols. However, DDB and sometimes stack(9) need to
* do this in a lockfree manner. We provide a set of internal helper
* routines to perform these operations without locks, and then wrappers that
* optionally lock.
*
* linker_debug_lookup() is ifdef DDB as currently it's only used by DDB.
*/
#ifdef DDB
static int
linker_debug_lookup(const char *symstr, c_linker_sym_t *sym)
{
linker_file_t lf;
TAILQ_FOREACH(lf, &linker_files, link) {
if (LINKER_LOOKUP_SYMBOL(lf, symstr, sym) == 0)
return (0);
}
return (ENOENT);
}
#endif
static int
linker_debug_search_symbol(caddr_t value, c_linker_sym_t *sym, long *diffp)
{
linker_file_t lf;
c_linker_sym_t best, es;
u_long diff, bestdiff, off;
best = 0;
off = (uintptr_t)value;
bestdiff = off;
TAILQ_FOREACH(lf, &linker_files, link) {
if (LINKER_SEARCH_SYMBOL(lf, value, &es, &diff) != 0)
continue;
if (es != 0 && diff < bestdiff) {
best = es;
bestdiff = diff;
}
if (bestdiff == 0)
break;
}
if (best) {
*sym = best;
*diffp = bestdiff;
return (0);
} else {
*sym = 0;
*diffp = off;
return (ENOENT);
}
}
static int
linker_debug_symbol_values(c_linker_sym_t sym, linker_symval_t *symval)
{
linker_file_t lf;
TAILQ_FOREACH(lf, &linker_files, link) {
if (LINKER_SYMBOL_VALUES(lf, sym, symval) == 0)
return (0);
}
return (ENOENT);
}
static int
linker_debug_search_symbol_name(caddr_t value, char *buf, u_int buflen,
long *offset)
{
linker_symval_t symval;
c_linker_sym_t sym;
int error;
*offset = 0;
error = linker_debug_search_symbol(value, &sym, offset);
if (error)
return (error);
error = linker_debug_symbol_values(sym, &symval);
if (error)
return (error);
strlcpy(buf, symval.name, buflen);
return (0);
}
/*
* DDB Helpers. DDB has to look across multiple files with their own symbol
* tables and string tables.
*
* Note that we do not obey list locking protocols here. We really don't need
* DDB to hang because somebody's got the lock held. We'll take the chance
* that the files list is inconsistent instead.
*/
#ifdef DDB
int
linker_ddb_lookup(const char *symstr, c_linker_sym_t *sym)
{
return (linker_debug_lookup(symstr, sym));
}
#endif
int
linker_ddb_search_symbol(caddr_t value, c_linker_sym_t *sym, long *diffp)
{
return (linker_debug_search_symbol(value, sym, diffp));
}
int
linker_ddb_symbol_values(c_linker_sym_t sym, linker_symval_t *symval)
{
return (linker_debug_symbol_values(sym, symval));
}
int
linker_ddb_search_symbol_name(caddr_t value, char *buf, u_int buflen,
long *offset)
{
return (linker_debug_search_symbol_name(value, buf, buflen, offset));
}
/*
* stack(9) helper for non-debugging environments. Unlike DDB helpers, we do
* obey locking protocols, and offer a significantly less complex interface.
*/
int
linker_search_symbol_name(caddr_t value, char *buf, u_int buflen,
long *offset)
{
int error;
KLD_LOCK();
error = linker_debug_search_symbol_name(value, buf, buflen, offset);
KLD_UNLOCK();
return (error);
}
/*
* Syscalls.
*/
int
kern_kldload(struct thread *td, const char *file, int *fileid)
{
#ifdef HWPMC_HOOKS
struct pmckern_map_in pkm;
#endif
const char *kldname, *modname;
linker_file_t lf;
int error;
if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
return (error);
if ((error = priv_check(td, PRIV_KLD_LOAD)) != 0)
return (error);
/*
* It is possible that a kldloaded module will attach a new ifnet,
* so the vnet context must be set when this occurs.
*/
CURVNET_SET(TD_TO_VNET(td));
/*
* If the file name does not contain a path or any dot in it
* (as in kldname.ko or kldname.ver.ko), treat it as an interface
* (module) name.
*/
if (index(file, '/') || index(file, '.')) {
kldname = file;
modname = NULL;
} else {
kldname = NULL;
modname = file;
}
KLD_LOCK();
error = linker_load_module(kldname, modname, NULL, NULL, &lf);
if (error) {
KLD_UNLOCK();
goto done;
}
lf->userrefs++;
if (fileid != NULL)
*fileid = lf->id;
#ifdef HWPMC_HOOKS
KLD_DOWNGRADE();
pkm.pm_file = lf->filename;
pkm.pm_address = (uintptr_t) lf->address;
PMC_CALL_HOOK(td, PMC_FN_KLD_LOAD, (void *) &pkm);
KLD_UNLOCK_READ();
#else
KLD_UNLOCK();
#endif
done:
CURVNET_RESTORE();
return (error);
}
int
-kldload(struct thread *td, struct kldload_args *uap)
+sys_kldload(struct thread *td, struct kldload_args *uap)
{
char *pathname = NULL;
int error, fileid;
td->td_retval[0] = -1;
pathname = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
error = copyinstr(uap->file, pathname, MAXPATHLEN, NULL);
if (error == 0) {
error = kern_kldload(td, pathname, &fileid);
if (error == 0)
td->td_retval[0] = fileid;
}
free(pathname, M_TEMP);
return (error);
}
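/*
 * A hedged userland sketch (not part of this change): kldload(2) and
 * kldunload(2), declared in <sys/param.h> and <sys/linker.h> (plus
 * <err.h> for err()), accept either a module name or a path, mirroring
 * the name/path split in kern_kldload() above. "ipfw" is only an example
 * module name:
 *
 *	int fileid;
 *
 *	if ((fileid = kldload("ipfw")) == -1)
 *		err(1, "kldload");
 *	if (kldunload(fileid) == -1)
 *		err(1, "kldunload");
 */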
int
kern_kldunload(struct thread *td, int fileid, int flags)
{
#ifdef HWPMC_HOOKS
struct pmckern_map_out pkm;
#endif
linker_file_t lf;
int error = 0;
if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
return (error);
if ((error = priv_check(td, PRIV_KLD_UNLOAD)) != 0)
return (error);
CURVNET_SET(TD_TO_VNET(td));
KLD_LOCK();
lf = linker_find_file_by_id(fileid);
if (lf) {
KLD_DPF(FILE, ("kldunload: lf->userrefs=%d\n", lf->userrefs));
/* Check if there are DTrace probes enabled on this file. */
if (lf->nenabled > 0) {
printf("kldunload: attempt to unload file that has"
" DTrace probes enabled\n");
error = EBUSY;
} else if (lf->userrefs == 0) {
/*
* XXX: maybe LINKER_UNLOAD_FORCE should override ?
*/
printf("kldunload: attempt to unload file that was"
" loaded by the kernel\n");
error = EBUSY;
} else {
#ifdef HWPMC_HOOKS
/* Save data needed by hwpmc(4) before unloading. */
pkm.pm_address = (uintptr_t) lf->address;
pkm.pm_size = lf->size;
#endif
lf->userrefs--;
error = linker_file_unload(lf, flags);
if (error)
lf->userrefs++;
}
} else
error = ENOENT;
#ifdef HWPMC_HOOKS
if (error == 0) {
KLD_DOWNGRADE();
PMC_CALL_HOOK(td, PMC_FN_KLD_UNLOAD, (void *) &pkm);
KLD_UNLOCK_READ();
} else
KLD_UNLOCK();
#else
KLD_UNLOCK();
#endif
CURVNET_RESTORE();
return (error);
}
int
-kldunload(struct thread *td, struct kldunload_args *uap)
+sys_kldunload(struct thread *td, struct kldunload_args *uap)
{
return (kern_kldunload(td, uap->fileid, LINKER_UNLOAD_NORMAL));
}
int
-kldunloadf(struct thread *td, struct kldunloadf_args *uap)
+sys_kldunloadf(struct thread *td, struct kldunloadf_args *uap)
{
if (uap->flags != LINKER_UNLOAD_NORMAL &&
uap->flags != LINKER_UNLOAD_FORCE)
return (EINVAL);
return (kern_kldunload(td, uap->fileid, uap->flags));
}
int
-kldfind(struct thread *td, struct kldfind_args *uap)
+sys_kldfind(struct thread *td, struct kldfind_args *uap)
{
char *pathname;
const char *filename;
linker_file_t lf;
int error;
#ifdef MAC
error = mac_kld_check_stat(td->td_ucred);
if (error)
return (error);
#endif
td->td_retval[0] = -1;
pathname = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
if ((error = copyinstr(uap->file, pathname, MAXPATHLEN, NULL)) != 0)
goto out;
filename = linker_basename(pathname);
KLD_LOCK();
lf = linker_find_file_by_name(filename);
if (lf)
td->td_retval[0] = lf->id;
else
error = ENOENT;
KLD_UNLOCK();
out:
free(pathname, M_TEMP);
return (error);
}
int
-kldnext(struct thread *td, struct kldnext_args *uap)
+sys_kldnext(struct thread *td, struct kldnext_args *uap)
{
linker_file_t lf;
int error = 0;
#ifdef MAC
error = mac_kld_check_stat(td->td_ucred);
if (error)
return (error);
#endif
KLD_LOCK();
if (uap->fileid == 0)
lf = TAILQ_FIRST(&linker_files);
else {
lf = linker_find_file_by_id(uap->fileid);
if (lf == NULL) {
error = ENOENT;
goto out;
}
lf = TAILQ_NEXT(lf, link);
}
/* Skip partially loaded files. */
while (lf != NULL && !(lf->flags & LINKER_FILE_LINKED))
lf = TAILQ_NEXT(lf, link);
if (lf)
td->td_retval[0] = lf->id;
else
td->td_retval[0] = 0;
out:
KLD_UNLOCK();
return (error);
}
int
-kldstat(struct thread *td, struct kldstat_args *uap)
+sys_kldstat(struct thread *td, struct kldstat_args *uap)
{
struct kld_file_stat stat;
int error, version;
/*
* Check the version of the user's structure.
*/
if ((error = copyin(&uap->stat->version, &version, sizeof(version)))
!= 0)
return (error);
if (version != sizeof(struct kld_file_stat_1) &&
version != sizeof(struct kld_file_stat))
return (EINVAL);
error = kern_kldstat(td, uap->fileid, &stat);
if (error != 0)
return (error);
return (copyout(&stat, uap->stat, version));
}
int
kern_kldstat(struct thread *td, int fileid, struct kld_file_stat *stat)
{
linker_file_t lf;
int namelen;
#ifdef MAC
int error;
error = mac_kld_check_stat(td->td_ucred);
if (error)
return (error);
#endif
KLD_LOCK();
lf = linker_find_file_by_id(fileid);
if (lf == NULL) {
KLD_UNLOCK();
return (ENOENT);
}
/* Version 1 fields: */
namelen = strlen(lf->filename) + 1;
if (namelen > MAXPATHLEN)
namelen = MAXPATHLEN;
bcopy(lf->filename, &stat->name[0], namelen);
stat->refs = lf->refs;
stat->id = lf->id;
stat->address = lf->address;
stat->size = lf->size;
/* Version 2 fields: */
namelen = strlen(lf->pathname) + 1;
if (namelen > MAXPATHLEN)
namelen = MAXPATHLEN;
bcopy(lf->pathname, &stat->pathname[0], namelen);
KLD_UNLOCK();
td->td_retval[0] = 0;
return (0);
}
int
-kldfirstmod(struct thread *td, struct kldfirstmod_args *uap)
+sys_kldfirstmod(struct thread *td, struct kldfirstmod_args *uap)
{
linker_file_t lf;
module_t mp;
int error = 0;
#ifdef MAC
error = mac_kld_check_stat(td->td_ucred);
if (error)
return (error);
#endif
KLD_LOCK();
lf = linker_find_file_by_id(uap->fileid);
if (lf) {
MOD_SLOCK;
mp = TAILQ_FIRST(&lf->modules);
if (mp != NULL)
td->td_retval[0] = module_getid(mp);
else
td->td_retval[0] = 0;
MOD_SUNLOCK;
} else
error = ENOENT;
KLD_UNLOCK();
return (error);
}
int
-kldsym(struct thread *td, struct kldsym_args *uap)
+sys_kldsym(struct thread *td, struct kldsym_args *uap)
{
char *symstr = NULL;
c_linker_sym_t sym;
linker_symval_t symval;
linker_file_t lf;
struct kld_sym_lookup lookup;
int error = 0;
#ifdef MAC
error = mac_kld_check_stat(td->td_ucred);
if (error)
return (error);
#endif
if ((error = copyin(uap->data, &lookup, sizeof(lookup))) != 0)
return (error);
if (lookup.version != sizeof(lookup) ||
uap->cmd != KLDSYM_LOOKUP)
return (EINVAL);
symstr = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
if ((error = copyinstr(lookup.symname, symstr, MAXPATHLEN, NULL)) != 0)
goto out;
KLD_LOCK();
if (uap->fileid != 0) {
lf = linker_find_file_by_id(uap->fileid);
if (lf == NULL)
error = ENOENT;
else if (LINKER_LOOKUP_SYMBOL(lf, symstr, &sym) == 0 &&
LINKER_SYMBOL_VALUES(lf, sym, &symval) == 0) {
lookup.symvalue = (uintptr_t) symval.value;
lookup.symsize = symval.size;
error = copyout(&lookup, uap->data, sizeof(lookup));
} else
error = ENOENT;
} else {
TAILQ_FOREACH(lf, &linker_files, link) {
if (LINKER_LOOKUP_SYMBOL(lf, symstr, &sym) == 0 &&
LINKER_SYMBOL_VALUES(lf, sym, &symval) == 0) {
lookup.symvalue = (uintptr_t)symval.value;
lookup.symsize = symval.size;
error = copyout(&lookup, uap->data,
sizeof(lookup));
break;
}
}
if (lf == NULL)
error = ENOENT;
}
KLD_UNLOCK();
out:
free(symstr, M_TEMP);
return (error);
}
/*
* Preloaded module support
*/
static modlist_t
modlist_lookup(const char *name, int ver)
{
modlist_t mod;
TAILQ_FOREACH(mod, &found_modules, link) {
if (strcmp(mod->name, name) == 0 &&
(ver == 0 || mod->version == ver))
return (mod);
}
return (NULL);
}
static modlist_t
modlist_lookup2(const char *name, struct mod_depend *verinfo)
{
modlist_t mod, bestmod;
int ver;
if (verinfo == NULL)
return (modlist_lookup(name, 0));
bestmod = NULL;
TAILQ_FOREACH(mod, &found_modules, link) {
if (strcmp(mod->name, name) != 0)
continue;
ver = mod->version;
if (ver == verinfo->md_ver_preferred)
return (mod);
if (ver >= verinfo->md_ver_minimum &&
ver <= verinfo->md_ver_maximum &&
(bestmod == NULL || ver > bestmod->version))
bestmod = mod;
}
return (bestmod);
}
static modlist_t
modlist_newmodule(const char *modname, int version, linker_file_t container)
{
modlist_t mod;
mod = malloc(sizeof(struct modlist), M_LINKER, M_NOWAIT | M_ZERO);
if (mod == NULL)
panic("no memory for module list");
mod->container = container;
mod->name = modname;
mod->version = version;
TAILQ_INSERT_TAIL(&found_modules, mod, link);
return (mod);
}
static void
linker_addmodules(linker_file_t lf, struct mod_metadata **start,
struct mod_metadata **stop, int preload)
{
struct mod_metadata *mp, **mdp;
const char *modname;
int ver;
for (mdp = start; mdp < stop; mdp++) {
mp = *mdp;
if (mp->md_type != MDT_VERSION)
continue;
modname = mp->md_cval;
ver = ((struct mod_version *)mp->md_data)->mv_version;
if (modlist_lookup(modname, ver) != NULL) {
printf("module %s already present!\n", modname);
/* XXX what can we do? this is a build error. :-( */
continue;
}
modlist_newmodule(modname, ver, lf);
}
}
static void
linker_preload(void *arg)
{
caddr_t modptr;
const char *modname, *nmodname;
char *modtype;
linker_file_t lf, nlf;
linker_class_t lc;
int error;
linker_file_list_t loaded_files;
linker_file_list_t depended_files;
struct mod_metadata *mp, *nmp;
struct mod_metadata **start, **stop, **mdp, **nmdp;
struct mod_depend *verinfo;
int nver;
int resolves;
modlist_t mod;
struct sysinit **si_start, **si_stop;
TAILQ_INIT(&loaded_files);
TAILQ_INIT(&depended_files);
TAILQ_INIT(&found_modules);
error = 0;
modptr = NULL;
while ((modptr = preload_search_next_name(modptr)) != NULL) {
modname = (char *)preload_search_info(modptr, MODINFO_NAME);
modtype = (char *)preload_search_info(modptr, MODINFO_TYPE);
if (modname == NULL) {
printf("Preloaded module at %p does not have a"
" name!\n", modptr);
continue;
}
if (modtype == NULL) {
printf("Preloaded module at %p does not have a type!\n",
modptr);
continue;
}
if (bootverbose)
printf("Preloaded %s \"%s\" at %p.\n", modtype, modname,
modptr);
lf = NULL;
TAILQ_FOREACH(lc, &classes, link) {
error = LINKER_LINK_PRELOAD(lc, modname, &lf);
if (!error)
break;
lf = NULL;
}
if (lf)
TAILQ_INSERT_TAIL(&loaded_files, lf, loaded);
}
/*
* First get a list of stuff in the kernel.
*/
if (linker_file_lookup_set(linker_kernel_file, MDT_SETNAME, &start,
&stop, NULL) == 0)
linker_addmodules(linker_kernel_file, start, stop, 1);
/*
* This is a once-off kinky bubble sort to resolve relocation
* dependency requirements.
*/
restart:
TAILQ_FOREACH(lf, &loaded_files, loaded) {
error = linker_file_lookup_set(lf, MDT_SETNAME, &start,
&stop, NULL);
/*
* First, look to see if we would successfully link with this
* stuff.
*/
resolves = 1; /* unless we know otherwise */
if (!error) {
for (mdp = start; mdp < stop; mdp++) {
mp = *mdp;
if (mp->md_type != MDT_DEPEND)
continue;
modname = mp->md_cval;
verinfo = mp->md_data;
for (nmdp = start; nmdp < stop; nmdp++) {
nmp = *nmdp;
if (nmp->md_type != MDT_VERSION)
continue;
nmodname = nmp->md_cval;
if (strcmp(modname, nmodname) == 0)
break;
}
if (nmdp < stop) /* it's a self reference */
continue;
/*
* ok, the module isn't here yet, we
* are not finished
*/
if (modlist_lookup2(modname, verinfo) == NULL)
resolves = 0;
}
}
/*
* OK, if we found our modules, we can link. So, "provide"
* the modules inside and add it to the end of the link order
* list.
*/
if (resolves) {
if (!error) {
for (mdp = start; mdp < stop; mdp++) {
mp = *mdp;
if (mp->md_type != MDT_VERSION)
continue;
modname = mp->md_cval;
nver = ((struct mod_version *)
mp->md_data)->mv_version;
if (modlist_lookup(modname,
nver) != NULL) {
printf("module %s already"
" present!\n", modname);
TAILQ_REMOVE(&loaded_files,
lf, loaded);
linker_file_unload(lf,
LINKER_UNLOAD_FORCE);
/* we changed tailq next ptr */
goto restart;
}
modlist_newmodule(modname, nver, lf);
}
}
TAILQ_REMOVE(&loaded_files, lf, loaded);
TAILQ_INSERT_TAIL(&depended_files, lf, loaded);
/*
* Since we provided modules, we need to restart the
* sort so that the previous files that depend on us
* have a chance. Also, we've busted the tailq next
* pointer with the REMOVE.
*/
goto restart;
}
}
/*
* At this point, we check to see what could not be resolved.
*/
while ((lf = TAILQ_FIRST(&loaded_files)) != NULL) {
TAILQ_REMOVE(&loaded_files, lf, loaded);
printf("KLD file %s is missing dependencies\n", lf->filename);
linker_file_unload(lf, LINKER_UNLOAD_FORCE);
}
/*
* We made it. Finish off the linking in the order we determined.
*/
TAILQ_FOREACH_SAFE(lf, &depended_files, loaded, nlf) {
if (linker_kernel_file) {
linker_kernel_file->refs++;
error = linker_file_add_dependency(lf,
linker_kernel_file);
if (error)
panic("cannot add dependency");
}
lf->userrefs++; /* so we can (try to) kldunload it */
error = linker_file_lookup_set(lf, MDT_SETNAME, &start,
&stop, NULL);
if (!error) {
for (mdp = start; mdp < stop; mdp++) {
mp = *mdp;
if (mp->md_type != MDT_DEPEND)
continue;
modname = mp->md_cval;
verinfo = mp->md_data;
mod = modlist_lookup2(modname, verinfo);
if (mod == NULL) {
printf("KLD file %s - cannot find "
"dependency \"%s\"\n",
lf->filename, modname);
goto fail;
}
/* Don't count self-dependencies */
if (lf == mod->container)
continue;
mod->container->refs++;
error = linker_file_add_dependency(lf,
mod->container);
if (error)
panic("cannot add dependency");
}
}
/*
* Now do relocation etc using the symbol search paths
* established by the dependencies
*/
error = LINKER_LINK_PRELOAD_FINISH(lf);
if (error) {
printf("KLD file %s - could not finalize loading\n",
lf->filename);
goto fail;
}
linker_file_register_modules(lf);
if (linker_file_lookup_set(lf, "sysinit_set", &si_start,
&si_stop, NULL) == 0)
sysinit_add(si_start, si_stop);
linker_file_register_sysctls(lf);
lf->flags |= LINKER_FILE_LINKED;
continue;
fail:
TAILQ_REMOVE(&depended_files, lf, loaded);
linker_file_unload(lf, LINKER_UNLOAD_FORCE);
}
/* woohoo! we made it! */
}
SYSINIT(preload, SI_SUB_KLD, SI_ORDER_MIDDLE, linker_preload, 0);
/*
* Search for a not-loaded module by name.
*
* Modules may be found in the following locations:
*
* - preloaded (result is just the module name)
* - on disk (result is full path to module)
*
* If the module name is qualified in any way (contains a path, etc.), then we
* simply return a copy of it.
*
* The search path can be manipulated via sysctl. Note that we use the ';'
* character as a separator to be consistent with the bootloader.
*/
static char linker_hintfile[] = "linker.hints";
static char linker_path[MAXPATHLEN] = "/boot/kernel;/boot/modules";
SYSCTL_STRING(_kern, OID_AUTO, module_path, CTLFLAG_RW, linker_path,
sizeof(linker_path), "module load search path");
TUNABLE_STR("module_path", linker_path, sizeof(linker_path));
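/*
 * A minimal configuration sketch (not part of this change): the path above
 * is exposed as the kern.module_path sysctl and as the "module_path"
 * loader tunable, so it can be preset in loader.conf(5) or adjusted at
 * runtime, e.g.:
 *
 *	module_path="/boot/kernel;/boot/modules"	(loader.conf)
 *	# sysctl kern.module_path="/boot/kernel;/boot/modules"
 */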
static char *linker_ext_list[] = {
"",
".ko",
NULL
};
/*
* Check if the file actually exists, either with or without an extension
* listed in the linker_ext_list. (This should probably be generic for the
* rest of the kernel.)
*/
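/*
 * Illustrative example only: with the list above, a lookup of the
 * hypothetical name "if_foo" under "/boot/kernel" tries
 * "/boot/kernel/if_foo" first and then "/boot/kernel/if_foo.ko".
 */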
static char *
linker_lookup_file(const char *path, int pathlen, const char *name,
int namelen, struct vattr *vap)
{
struct nameidata nd;
struct thread *td = curthread; /* XXX */
char *result, **cpp, *sep;
int error, len, extlen, reclen, flags, vfslocked;
enum vtype type;
extlen = 0;
for (cpp = linker_ext_list; *cpp; cpp++) {
len = strlen(*cpp);
if (len > extlen)
extlen = len;
}
extlen++; /* trailing '\0' */
sep = (path[pathlen - 1] != '/') ? "/" : "";
reclen = pathlen + strlen(sep) + namelen + extlen + 1;
result = malloc(reclen, M_LINKER, M_WAITOK);
for (cpp = linker_ext_list; *cpp; cpp++) {
snprintf(result, reclen, "%.*s%s%.*s%s", pathlen, path, sep,
namelen, name, *cpp);
/*
* Attempt to open the file, and return the path if
* we succeed and it's a regular file.
*/
NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, result, td);
flags = FREAD;
error = vn_open(&nd, &flags, 0, NULL);
if (error == 0) {
vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
type = nd.ni_vp->v_type;
if (vap)
VOP_GETATTR(nd.ni_vp, vap, td->td_ucred);
VOP_UNLOCK(nd.ni_vp, 0);
vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
VFS_UNLOCK_GIANT(vfslocked);
if (type == VREG)
return (result);
}
}
free(result, M_LINKER);
return (NULL);
}
#define INT_ALIGN(base, ptr) ptr = \
(base) + (((ptr) - (base) + sizeof(int) - 1) & ~(sizeof(int) - 1))
/*
* Look up the KLD which contains the requested module in the "linker.hints"
* file. If a version specification is available, try to find the best KLD.
* Otherwise just find the latest one.
*/
static char *
linker_hints_lookup(const char *path, int pathlen, const char *modname,
int modnamelen, struct mod_depend *verinfo)
{
struct thread *td = curthread; /* XXX */
struct ucred *cred = td ? td->td_ucred : NULL;
struct nameidata nd;
struct vattr vattr, mattr;
u_char *hints = NULL;
u_char *cp, *recptr, *bufend, *result, *best, *pathbuf, *sep;
int error, ival, bestver, *intp, reclen, found, flags, clen, blen;
int vfslocked = 0;
result = NULL;
bestver = found = 0;
sep = (path[pathlen - 1] != '/') ? "/" : "";
reclen = imax(modnamelen, strlen(linker_hintfile)) + pathlen +
strlen(sep) + 1;
pathbuf = malloc(reclen, M_LINKER, M_WAITOK);
snprintf(pathbuf, reclen, "%.*s%s%s", pathlen, path, sep,
linker_hintfile);
NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, pathbuf, td);
flags = FREAD;
error = vn_open(&nd, &flags, 0, NULL);
if (error)
goto bad;
vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
if (nd.ni_vp->v_type != VREG)
goto bad;
best = cp = NULL;
error = VOP_GETATTR(nd.ni_vp, &vattr, cred);
if (error)
goto bad;
/*
* XXX: we need to limit this number to some reasonable value
*/
if (vattr.va_size > 100 * 1024) {
printf("hints file too large %ld\n", (long)vattr.va_size);
goto bad;
}
hints = malloc(vattr.va_size, M_TEMP, M_WAITOK);
if (hints == NULL)
goto bad;
error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)hints, vattr.va_size, 0,
UIO_SYSSPACE, IO_NODELOCKED, cred, NOCRED, &reclen, td);
if (error)
goto bad;
VOP_UNLOCK(nd.ni_vp, 0);
vn_close(nd.ni_vp, FREAD, cred, td);
VFS_UNLOCK_GIANT(vfslocked);
nd.ni_vp = NULL;
if (reclen != 0) {
printf("can't read %d\n", reclen);
goto bad;
}
intp = (int *)hints;
ival = *intp++;
if (ival != LINKER_HINTS_VERSION) {
printf("hints file version mismatch %d\n", ival);
goto bad;
}
bufend = hints + vattr.va_size;
recptr = (u_char *)intp;
clen = blen = 0;
while (recptr < bufend && !found) {
intp = (int *)recptr;
reclen = *intp++;
ival = *intp++;
cp = (char *)intp;
switch (ival) {
case MDT_VERSION:
clen = *cp++;
if (clen != modnamelen || bcmp(cp, modname, clen) != 0)
break;
cp += clen;
INT_ALIGN(hints, cp);
ival = *(int *)cp;
cp += sizeof(int);
clen = *cp++;
if (verinfo == NULL ||
ival == verinfo->md_ver_preferred) {
found = 1;
break;
}
if (ival >= verinfo->md_ver_minimum &&
ival <= verinfo->md_ver_maximum &&
ival > bestver) {
bestver = ival;
best = cp;
blen = clen;
}
break;
default:
break;
}
recptr += reclen + sizeof(int);
}
/*
* Finally, check whether the KLD is actually in place.
*/
if (found)
result = linker_lookup_file(path, pathlen, cp, clen, &mattr);
else if (best)
result = linker_lookup_file(path, pathlen, best, blen, &mattr);
/*
* The KLD is newer than the hints file. What should we do now?
*/
if (result && timespeccmp(&mattr.va_mtime, &vattr.va_mtime, >))
printf("warning: KLD '%s' is newer than the linker.hints"
" file\n", result);
bad:
free(pathbuf, M_LINKER);
if (hints)
free(hints, M_TEMP);
if (nd.ni_vp != NULL) {
VOP_UNLOCK(nd.ni_vp, 0);
vn_close(nd.ni_vp, FREAD, cred, td);
VFS_UNLOCK_GIANT(vfslocked);
}
/*
* If nothing was found or the hints file is absent, fall back to the
* old way of using "kldname[.ko]" as the module name.
*/
if (!found && !bestver && result == NULL)
result = linker_lookup_file(path, pathlen, modname,
modnamelen, NULL);
return (result);
}
/*
* Look up the KLD which contains the requested module in all directories.
*/
static char *
linker_search_module(const char *modname, int modnamelen,
struct mod_depend *verinfo)
{
char *cp, *ep, *result;
/*
* traverse the linker path
*/
for (cp = linker_path; *cp; cp = ep + 1) {
/* find the end of this component */
for (ep = cp; (*ep != 0) && (*ep != ';'); ep++);
result = linker_hints_lookup(cp, ep - cp, modname,
modnamelen, verinfo);
if (result != NULL)
return (result);
if (*ep == 0)
break;
}
return (NULL);
}
/*
* Search for module in all directories listed in the linker_path.
*/
static char *
linker_search_kld(const char *name)
{
char *cp, *ep, *result;
int len;
/* qualified at all? */
if (index(name, '/'))
return (linker_strdup(name));
/* traverse the linker path */
len = strlen(name);
for (ep = linker_path; *ep; ep++) {
cp = ep;
/* find the end of this component */
for (; *ep != 0 && *ep != ';'; ep++);
result = linker_lookup_file(cp, ep - cp, name, len, NULL);
if (result != NULL)
return (result);
}
return (NULL);
}
static const char *
linker_basename(const char *path)
{
const char *filename;
filename = rindex(path, '/');
if (filename == NULL)
return path;
if (filename[1])
filename++;
return (filename);
}
#ifdef HWPMC_HOOKS
/*
* Inform hwpmc about the set of kernel modules currently loaded.
*/
void *
linker_hwpmc_list_objects(void)
{
linker_file_t lf;
struct pmckern_map_in *kobase;
int i, nmappings;
nmappings = 0;
KLD_LOCK_READ();
TAILQ_FOREACH(lf, &linker_files, link)
nmappings++;
/* Allocate nmappings + 1 entries. */
kobase = malloc((nmappings + 1) * sizeof(struct pmckern_map_in),
M_LINKER, M_WAITOK | M_ZERO);
i = 0;
TAILQ_FOREACH(lf, &linker_files, link) {
/* Save the info for this linker file. */
kobase[i].pm_file = lf->filename;
kobase[i].pm_address = (uintptr_t)lf->address;
i++;
}
KLD_UNLOCK_READ();
KASSERT(i > 0, ("linker_hpwmc_list_objects: no kernel objects?"));
/* The last entry of the malloc'ed area consists of all zeros. */
KASSERT(kobase[i].pm_file == NULL,
("linker_hwpmc_list_objects: last object not NULL"));
return ((void *)kobase);
}
#endif
/*
* Find a file which contains the given module and load it; if "parent" is not
* NULL, register a reference to it.
*/
static int
linker_load_module(const char *kldname, const char *modname,
struct linker_file *parent, struct mod_depend *verinfo,
struct linker_file **lfpp)
{
linker_file_t lfdep;
const char *filename;
char *pathname;
int error;
KLD_LOCK_ASSERT();
if (modname == NULL) {
/*
* We have to load KLD
*/
KASSERT(verinfo == NULL, ("linker_load_module: verinfo"
" is not NULL"));
pathname = linker_search_kld(kldname);
} else {
if (modlist_lookup2(modname, verinfo) != NULL)
return (EEXIST);
if (kldname != NULL)
pathname = linker_strdup(kldname);
else if (rootvnode == NULL)
pathname = NULL;
else
/*
* Need to find a KLD with required module
*/
pathname = linker_search_module(modname,
strlen(modname), verinfo);
}
if (pathname == NULL)
return (ENOENT);
/*
* Can't load more than one file with the same basename. XXX:
* Actually it should be possible to have multiple KLDs with
* the same basename but different path because they can
* provide different versions of the same modules.
*/
filename = linker_basename(pathname);
if (linker_find_file_by_name(filename))
error = EEXIST;
else do {
error = linker_load_file(pathname, &lfdep);
if (error)
break;
if (modname && verinfo &&
modlist_lookup2(modname, verinfo) == NULL) {
linker_file_unload(lfdep, LINKER_UNLOAD_FORCE);
error = ENOENT;
break;
}
if (parent) {
error = linker_file_add_dependency(parent, lfdep);
if (error)
break;
}
if (lfpp)
*lfpp = lfdep;
} while (0);
free(pathname, M_LINKER);
return (error);
}
/*
* This routine is responsible for finding dependencies of userland initiated
* kldload(2)'s of files.
*/
int
linker_load_dependencies(linker_file_t lf)
{
linker_file_t lfdep;
struct mod_metadata **start, **stop, **mdp, **nmdp;
struct mod_metadata *mp, *nmp;
struct mod_depend *verinfo;
modlist_t mod;
const char *modname, *nmodname;
int ver, error = 0, count;
/*
* All files are dependent on /kernel.
*/
KLD_LOCK_ASSERT();
if (linker_kernel_file) {
linker_kernel_file->refs++;
error = linker_file_add_dependency(lf, linker_kernel_file);
if (error)
return (error);
}
if (linker_file_lookup_set(lf, MDT_SETNAME, &start, &stop,
&count) != 0)
return (0);
for (mdp = start; mdp < stop; mdp++) {
mp = *mdp;
if (mp->md_type != MDT_VERSION)
continue;
modname = mp->md_cval;
ver = ((struct mod_version *)mp->md_data)->mv_version;
mod = modlist_lookup(modname, ver);
if (mod != NULL) {
printf("interface %s.%d already present in the KLD"
" '%s'!\n", modname, ver,
mod->container->filename);
return (EEXIST);
}
}
for (mdp = start; mdp < stop; mdp++) {
mp = *mdp;
if (mp->md_type != MDT_DEPEND)
continue;
modname = mp->md_cval;
verinfo = mp->md_data;
nmodname = NULL;
for (nmdp = start; nmdp < stop; nmdp++) {
nmp = *nmdp;
if (nmp->md_type != MDT_VERSION)
continue;
nmodname = nmp->md_cval;
if (strcmp(modname, nmodname) == 0)
break;
}
if (nmdp < stop)/* early exit, it's a self reference */
continue;
mod = modlist_lookup2(modname, verinfo);
if (mod) { /* woohoo, it's loaded already */
lfdep = mod->container;
lfdep->refs++;
error = linker_file_add_dependency(lf, lfdep);
if (error)
break;
continue;
}
error = linker_load_module(NULL, modname, lf, verinfo, NULL);
if (error) {
printf("KLD %s: depends on %s - not available or"
" version mismatch\n", lf->filename, modname);
break;
}
}
if (error)
return (error);
linker_addmodules(lf, start, stop, 0);
return (error);
}
static int
sysctl_kern_function_list_iterate(const char *name, void *opaque)
{
struct sysctl_req *req;
req = opaque;
return (SYSCTL_OUT(req, name, strlen(name) + 1));
}
/*
* Export a nul-separated, double-nul-terminated list of all function names
* in the kernel.
*/
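/*
 * A hedged userland sketch (not part of this change) of consuming this
 * sysctl with sysctl(3) (<sys/types.h>, <sys/sysctl.h>); names are
 * NUL-separated and the list ends with an empty string:
 *
 *	size_t len = 0;
 *	char *buf;
 *
 *	if (sysctlbyname("kern.function_list", NULL, &len, NULL, 0) == -1)
 *		err(1, "sysctlbyname");
 *	buf = malloc(len);
 *	if (sysctlbyname("kern.function_list", buf, &len, NULL, 0) == -1)
 *		err(1, "sysctlbyname");
 */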
static int
sysctl_kern_function_list(SYSCTL_HANDLER_ARGS)
{
linker_file_t lf;
int error;
#ifdef MAC
error = mac_kld_check_stat(req->td->td_ucred);
if (error)
return (error);
#endif
error = sysctl_wire_old_buffer(req, 0);
if (error != 0)
return (error);
KLD_LOCK();
TAILQ_FOREACH(lf, &linker_files, link) {
error = LINKER_EACH_FUNCTION_NAME(lf,
sysctl_kern_function_list_iterate, req);
if (error) {
KLD_UNLOCK();
return (error);
}
}
KLD_UNLOCK();
return (SYSCTL_OUT(req, "", 1));
}
SYSCTL_PROC(_kern, OID_AUTO, function_list, CTLTYPE_OPAQUE | CTLFLAG_RD,
NULL, 0, sysctl_kern_function_list, "", "kernel function list");
Index: head/sys/kern/kern_loginclass.c
===================================================================
--- head/sys/kern/kern_loginclass.c (revision 225616)
+++ head/sys/kern/kern_loginclass.c (revision 225617)
@@ -1,238 +1,238 @@
/*-
* Copyright (c) 2011 The FreeBSD Foundation
* All rights reserved.
*
* This software was developed by Edward Tomasz Napierala under sponsorship
* from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
/*
* Processes may set their login class name using setloginclass(2). This
* is usually done through a call to setusercontext(3), by programs
* such as login(1), based on information from master.passwd(5). The
* kernel uses this information to enforce per-class resource limits.
* The current login class can be determined using id(1). The login
* class is inherited from the parent process during fork(2). If not
* set, it defaults to "default".
*
* Code in this file implements setloginclass(2) and getloginclass(2)
* system calls, and maintains class name storage and retrieval.
*/
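/*
 * A hedged userland sketch (not part of this change): the class is
 * normally set indirectly via setusercontext(3), but the raw system
 * calls, declared in <unistd.h>, can also be used directly; "daemon" is
 * just an example class from login.conf(5):
 *
 *	char lc[MAXLOGNAME];
 *
 *	if (setloginclass("daemon") != 0)
 *		err(1, "setloginclass");
 *	if (getloginclass(lc, sizeof(lc)) != 0)
 *		err(1, "getloginclass");
 */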
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/loginclass.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/types.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/racct.h>
#include <sys/refcount.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
static MALLOC_DEFINE(M_LOGINCLASS, "loginclass", "loginclass structures");
LIST_HEAD(, loginclass) loginclasses;
/*
* Lock protecting loginclasses list.
*/
static struct mtx loginclasses_lock;
static void lc_init(void);
SYSINIT(loginclass, SI_SUB_CPU, SI_ORDER_FIRST, lc_init, NULL);
void
loginclass_hold(struct loginclass *lc)
{
refcount_acquire(&lc->lc_refcount);
}
void
loginclass_free(struct loginclass *lc)
{
int old;
old = lc->lc_refcount;
if (old > 1 && atomic_cmpset_int(&lc->lc_refcount, old, old - 1))
return;
mtx_lock(&loginclasses_lock);
if (refcount_release(&lc->lc_refcount)) {
racct_destroy(&lc->lc_racct);
LIST_REMOVE(lc, lc_next);
mtx_unlock(&loginclasses_lock);
free(lc, M_LOGINCLASS);
return;
}
mtx_unlock(&loginclasses_lock);
}
/*
* Return loginclass structure with a corresponding name. Not
* performance critical, as it's used mainly by setloginclass(2),
* which happens once per login session. Caller has to use
* loginclass_free() on the returned value when it's no longer
* needed.
*/
struct loginclass *
loginclass_find(const char *name)
{
struct loginclass *lc, *newlc;
if (name[0] == '\0' || strlen(name) >= MAXLOGNAME)
return (NULL);
newlc = malloc(sizeof(*newlc), M_LOGINCLASS, M_ZERO | M_WAITOK);
racct_create(&newlc->lc_racct);
mtx_lock(&loginclasses_lock);
LIST_FOREACH(lc, &loginclasses, lc_next) {
if (strcmp(name, lc->lc_name) != 0)
continue;
/* Found loginclass with a matching name? */
loginclass_hold(lc);
mtx_unlock(&loginclasses_lock);
racct_destroy(&newlc->lc_racct);
free(newlc, M_LOGINCLASS);
return (lc);
}
/* Add new loginclass. */
strcpy(newlc->lc_name, name);
refcount_init(&newlc->lc_refcount, 1);
LIST_INSERT_HEAD(&loginclasses, newlc, lc_next);
mtx_unlock(&loginclasses_lock);
return (newlc);
}
/*
* Get login class name.
*/
#ifndef _SYS_SYSPROTO_H_
struct getloginclass_args {
char *namebuf;
size_t namelen;
};
#endif
/* ARGSUSED */
int
-getloginclass(struct thread *td, struct getloginclass_args *uap)
+sys_getloginclass(struct thread *td, struct getloginclass_args *uap)
{
int error = 0;
size_t lcnamelen;
struct proc *p;
struct loginclass *lc;
p = td->td_proc;
PROC_LOCK(p);
lc = p->p_ucred->cr_loginclass;
loginclass_hold(lc);
PROC_UNLOCK(p);
lcnamelen = strlen(lc->lc_name) + 1;
if (lcnamelen > uap->namelen)
error = ERANGE;
if (error == 0)
error = copyout(lc->lc_name, uap->namebuf, lcnamelen);
loginclass_free(lc);
return (error);
}
/*
* Set login class name.
*/
#ifndef _SYS_SYSPROTO_H_
struct setloginclass_args {
const char *namebuf;
};
#endif
/* ARGSUSED */
int
-setloginclass(struct thread *td, struct setloginclass_args *uap)
+sys_setloginclass(struct thread *td, struct setloginclass_args *uap)
{
struct proc *p = td->td_proc;
int error;
char lcname[MAXLOGNAME];
struct loginclass *newlc;
struct ucred *newcred, *oldcred;
error = priv_check(td, PRIV_PROC_SETLOGINCLASS);
if (error != 0)
return (error);
error = copyinstr(uap->namebuf, lcname, sizeof(lcname), NULL);
if (error != 0)
return (error);
newlc = loginclass_find(lcname);
if (newlc == NULL)
return (EINVAL);
newcred = crget();
PROC_LOCK(p);
oldcred = crcopysafe(p, newcred);
newcred->cr_loginclass = newlc;
p->p_ucred = newcred;
PROC_UNLOCK(p);
#ifdef RACCT
racct_proc_ucred_changed(p, oldcred, newcred);
#endif
loginclass_free(oldcred->cr_loginclass);
crfree(oldcred);
return (0);
}
void
loginclass_racct_foreach(void (*callback)(struct racct *racct,
void *arg2, void *arg3), void *arg2, void *arg3)
{
struct loginclass *lc;
mtx_lock(&loginclasses_lock);
LIST_FOREACH(lc, &loginclasses, lc_next)
(callback)(lc->lc_racct, arg2, arg3);
mtx_unlock(&loginclasses_lock);
}
static void
lc_init(void)
{
mtx_init(&loginclasses_lock, "loginclasses lock", NULL, MTX_DEF);
}
Index: head/sys/kern/kern_module.c
===================================================================
--- head/sys/kern/kern_module.c (revision 225616)
+++ head/sys/kern/kern_module.c (revision 225617)
@@ -1,523 +1,523 @@
/*-
* Copyright (c) 1997 Doug Rabson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "opt_compat.h"
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/eventhandler.h>
#include <sys/malloc.h>
#include <sys/sysproto.h>
#include <sys/sysent.h>
#include <sys/proc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/reboot.h>
#include <sys/sx.h>
#include <sys/module.h>
#include <sys/linker.h>
static MALLOC_DEFINE(M_MODULE, "module", "module data structures");
struct module {
TAILQ_ENTRY(module) link; /* chain together all modules */
TAILQ_ENTRY(module) flink; /* all modules in a file */
struct linker_file *file; /* file which contains this module */
int refs; /* reference count */
int id; /* unique id number */
char *name; /* module name */
modeventhand_t handler; /* event handler */
void *arg; /* argument for handler */
modspecific_t data; /* module specific data */
};
#define MOD_EVENT(mod, type) (mod)->handler((mod), (type), (mod)->arg)
static TAILQ_HEAD(modulelist, module) modules;
struct sx modules_sx;
static int nextid = 1;
static void module_shutdown(void *, int);
static int
modevent_nop(module_t mod, int what, void *arg)
{
switch(what) {
case MOD_LOAD:
return (0);
case MOD_UNLOAD:
return (EBUSY);
default:
return (EOPNOTSUPP);
}
}
static void
module_init(void *arg)
{
sx_init(&modules_sx, "module subsystem sx lock");
TAILQ_INIT(&modules);
EVENTHANDLER_REGISTER(shutdown_final, module_shutdown, NULL,
SHUTDOWN_PRI_DEFAULT);
}
SYSINIT(module, SI_SUB_KLD, SI_ORDER_FIRST, module_init, 0);
static void
module_shutdown(void *arg1, int arg2)
{
module_t mod;
if (arg2 & RB_NOSYNC)
return;
mtx_lock(&Giant);
MOD_SLOCK;
TAILQ_FOREACH_REVERSE(mod, &modules, modulelist, link)
MOD_EVENT(mod, MOD_SHUTDOWN);
MOD_SUNLOCK;
mtx_unlock(&Giant);
}
void
module_register_init(const void *arg)
{
const moduledata_t *data = (const moduledata_t *)arg;
int error;
module_t mod;
mtx_lock(&Giant);
MOD_SLOCK;
mod = module_lookupbyname(data->name);
if (mod == NULL)
panic("module_register_init: module named %s not found\n",
data->name);
MOD_SUNLOCK;
error = MOD_EVENT(mod, MOD_LOAD);
if (error) {
MOD_EVENT(mod, MOD_UNLOAD);
MOD_XLOCK;
module_release(mod);
MOD_XUNLOCK;
printf("module_register_init: MOD_LOAD (%s, %p, %p) error"
" %d\n", data->name, (void *)data->evhand, data->priv,
error);
} else {
MOD_XLOCK;
if (mod->file) {
/*
* Once a module is successfully loaded, move
* it to the head of the module list for this
* linker file. This resorts the list so that
* when the kernel linker iterates over the
* modules to unload them, it will unload them
* in the reverse order they were loaded.
*/
TAILQ_REMOVE(&mod->file->modules, mod, flink);
TAILQ_INSERT_HEAD(&mod->file->modules, mod, flink);
}
MOD_XUNLOCK;
}
mtx_unlock(&Giant);
}
int
module_register(const moduledata_t *data, linker_file_t container)
{
size_t namelen;
module_t newmod;
MOD_XLOCK;
newmod = module_lookupbyname(data->name);
if (newmod != NULL) {
MOD_XUNLOCK;
printf("module_register: module %s already exists!\n",
data->name);
return (EEXIST);
}
namelen = strlen(data->name) + 1;
newmod = malloc(sizeof(struct module) + namelen, M_MODULE, M_WAITOK);
if (newmod == NULL) {
MOD_XUNLOCK;
return (ENOMEM);
}
newmod->refs = 1;
newmod->id = nextid++;
newmod->name = (char *)(newmod + 1);
strcpy(newmod->name, data->name);
newmod->handler = data->evhand ? data->evhand : modevent_nop;
newmod->arg = data->priv;
bzero(&newmod->data, sizeof(newmod->data));
TAILQ_INSERT_TAIL(&modules, newmod, link);
if (container)
TAILQ_INSERT_TAIL(&container->modules, newmod, flink);
newmod->file = container;
MOD_XUNLOCK;
return (0);
}
void
module_reference(module_t mod)
{
MOD_XLOCK_ASSERT;
MOD_DPF(REFS, ("module_reference: before, refs=%d\n", mod->refs));
mod->refs++;
}
void
module_release(module_t mod)
{
MOD_XLOCK_ASSERT;
if (mod->refs <= 0)
panic("module_release: bad reference count");
MOD_DPF(REFS, ("module_release: before, refs=%d\n", mod->refs));
mod->refs--;
if (mod->refs == 0) {
TAILQ_REMOVE(&modules, mod, link);
if (mod->file)
TAILQ_REMOVE(&mod->file->modules, mod, flink);
free(mod, M_MODULE);
}
}
module_t
module_lookupbyname(const char *name)
{
module_t mod;
int err;
MOD_LOCK_ASSERT;
TAILQ_FOREACH(mod, &modules, link) {
err = strcmp(mod->name, name);
if (err == 0)
return (mod);
}
return (NULL);
}
module_t
module_lookupbyid(int modid)
{
module_t mod;
MOD_LOCK_ASSERT;
TAILQ_FOREACH(mod, &modules, link)
if (mod->id == modid)
return(mod);
return (NULL);
}
int
module_quiesce(module_t mod)
{
int error;
mtx_lock(&Giant);
error = MOD_EVENT(mod, MOD_QUIESCE);
mtx_unlock(&Giant);
if (error == EOPNOTSUPP || error == EINVAL)
error = 0;
return (error);
}
int
module_unload(module_t mod)
{
int error;
mtx_lock(&Giant);
error = MOD_EVENT(mod, MOD_UNLOAD);
mtx_unlock(&Giant);
return (error);
}
int
module_getid(module_t mod)
{
MOD_LOCK_ASSERT;
return (mod->id);
}
module_t
module_getfnext(module_t mod)
{
MOD_LOCK_ASSERT;
return (TAILQ_NEXT(mod, flink));
}
const char *
module_getname(module_t mod)
{
MOD_LOCK_ASSERT;
return (mod->name);
}
void
module_setspecific(module_t mod, modspecific_t *datap)
{
MOD_XLOCK_ASSERT;
mod->data = *datap;
}
linker_file_t
module_file(module_t mod)
{
return (mod->file);
}
/*
* Syscalls.
*/
int
-modnext(struct thread *td, struct modnext_args *uap)
+sys_modnext(struct thread *td, struct modnext_args *uap)
{
module_t mod;
int error = 0;
td->td_retval[0] = -1;
MOD_SLOCK;
if (uap->modid == 0) {
mod = TAILQ_FIRST(&modules);
if (mod)
td->td_retval[0] = mod->id;
else
error = ENOENT;
goto done2;
}
mod = module_lookupbyid(uap->modid);
if (mod == NULL) {
error = ENOENT;
goto done2;
}
if (TAILQ_NEXT(mod, link))
td->td_retval[0] = TAILQ_NEXT(mod, link)->id;
else
td->td_retval[0] = 0;
done2:
MOD_SUNLOCK;
return (error);
}
int
-modfnext(struct thread *td, struct modfnext_args *uap)
+sys_modfnext(struct thread *td, struct modfnext_args *uap)
{
module_t mod;
int error;
td->td_retval[0] = -1;
MOD_SLOCK;
mod = module_lookupbyid(uap->modid);
if (mod == NULL) {
error = ENOENT;
} else {
error = 0;
if (TAILQ_NEXT(mod, flink))
td->td_retval[0] = TAILQ_NEXT(mod, flink)->id;
else
td->td_retval[0] = 0;
}
MOD_SUNLOCK;
return (error);
}
struct module_stat_v1 {
int version; /* set to sizeof(struct module_stat) */
char name[MAXMODNAME];
int refs;
int id;
};
int
-modstat(struct thread *td, struct modstat_args *uap)
+sys_modstat(struct thread *td, struct modstat_args *uap)
{
module_t mod;
modspecific_t data;
int error = 0;
int id, namelen, refs, version;
struct module_stat *stat;
char *name;
MOD_SLOCK;
mod = module_lookupbyid(uap->modid);
if (mod == NULL) {
MOD_SUNLOCK;
return (ENOENT);
}
id = mod->id;
refs = mod->refs;
name = mod->name;
data = mod->data;
MOD_SUNLOCK;
stat = uap->stat;
/*
* Check the version of the user's structure.
*/
if ((error = copyin(&stat->version, &version, sizeof(version))) != 0)
return (error);
if (version != sizeof(struct module_stat_v1)
&& version != sizeof(struct module_stat))
return (EINVAL);
namelen = strlen(mod->name) + 1;
if (namelen > MAXMODNAME)
namelen = MAXMODNAME;
if ((error = copyout(name, &stat->name[0], namelen)) != 0)
return (error);
if ((error = copyout(&refs, &stat->refs, sizeof(int))) != 0)
return (error);
if ((error = copyout(&id, &stat->id, sizeof(int))) != 0)
return (error);
/*
* >v1 stat includes module data.
*/
if (version == sizeof(struct module_stat))
if ((error = copyout(&data, &stat->data,
sizeof(data))) != 0)
return (error);
td->td_retval[0] = 0;
return (error);
}
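/*
 * A hedged userland sketch (not part of this change): these syscalls are
 * reached through the wrappers declared in <sys/module.h>, e.g. walking
 * every loaded module much as kldstat(8) does:
 *
 *	struct module_stat ms;
 *	int modid;
 *
 *	ms.version = sizeof(ms);
 *	for (modid = modnext(0); modid > 0; modid = modnext(modid))
 *		if (modstat(modid, &ms) == 0)
 *			printf("%3d %s\n", ms.id, ms.name);
 */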
int
-modfind(struct thread *td, struct modfind_args *uap)
+sys_modfind(struct thread *td, struct modfind_args *uap)
{
int error = 0;
char name[MAXMODNAME];
module_t mod;
if ((error = copyinstr(uap->name, name, sizeof name, 0)) != 0)
return (error);
MOD_SLOCK;
mod = module_lookupbyname(name);
if (mod == NULL)
error = ENOENT;
else
td->td_retval[0] = module_getid(mod);
MOD_SUNLOCK;
return (error);
}
MODULE_VERSION(kernel, __FreeBSD_version);
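/*
 * Editor's illustrative sketch, not part of the original source: a minimal
 * userland walk over the module list using the modnext(2)/modstat(2)
 * syscalls renamed above, assuming the wrappers declared in <sys/module.h>.
 * Error handling is abbreviated.
 *
 *	#include <sys/param.h>
 *	#include <sys/module.h>
 *	#include <stdio.h>
 *
 *	struct module_stat ms;
 *	int id;
 *
 *	ms.version = sizeof(ms);	// version is keyed on structure size
 *	for (id = modnext(0); id > 0; id = modnext(id)) {
 *		if (modstat(id, &ms) == 0)
 *			printf("%d: %s (refs %d)\n", ms.id, ms.name, ms.refs);
 *	}
 */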
#ifdef COMPAT_FREEBSD32
#include <sys/mount.h>
#include <sys/socket.h>
#include <compat/freebsd32/freebsd32_util.h>
#include <compat/freebsd32/freebsd32.h>
#include <compat/freebsd32/freebsd32_proto.h>
typedef union modspecific32 {
int intval;
uint32_t uintval;
int longval;
uint32_t ulongval;
} modspecific32_t;
struct module_stat32 {
int version;
char name[MAXMODNAME];
int refs;
int id;
modspecific32_t data;
};
int
freebsd32_modstat(struct thread *td, struct freebsd32_modstat_args *uap)
{
module_t mod;
modspecific32_t data32;
int error = 0;
int id, namelen, refs, version;
struct module_stat32 *stat32;
char *name;
MOD_SLOCK;
mod = module_lookupbyid(uap->modid);
if (mod == NULL) {
MOD_SUNLOCK;
return (ENOENT);
}
id = mod->id;
refs = mod->refs;
name = mod->name;
CP(mod->data, data32, intval);
CP(mod->data, data32, uintval);
CP(mod->data, data32, longval);
CP(mod->data, data32, ulongval);
MOD_SUNLOCK;
stat32 = uap->stat;
if ((error = copyin(&stat32->version, &version, sizeof(version))) != 0)
return (error);
if (version != sizeof(struct module_stat_v1)
&& version != sizeof(struct module_stat32))
return (EINVAL);
namelen = strlen(mod->name) + 1;
if (namelen > MAXMODNAME)
namelen = MAXMODNAME;
if ((error = copyout(name, &stat32->name[0], namelen)) != 0)
return (error);
if ((error = copyout(&refs, &stat32->refs, sizeof(int))) != 0)
return (error);
if ((error = copyout(&id, &stat32->id, sizeof(int))) != 0)
return (error);
/*
* >v1 stat includes module data.
*/
if (version == sizeof(struct module_stat32))
if ((error = copyout(&data32, &stat32->data,
sizeof(data32))) != 0)
return (error);
td->td_retval[0] = 0;
return (error);
}
#endif
Index: head/sys/kern/kern_ntptime.c
===================================================================
--- head/sys/kern/kern_ntptime.c (revision 225616)
+++ head/sys/kern/kern_ntptime.c (revision 225617)
@@ -1,1044 +1,1044 @@
/*-
***********************************************************************
* *
* Copyright (c) David L. Mills 1993-2001 *
* *
* Permission to use, copy, modify, and distribute this software and *
* its documentation for any purpose and without fee is hereby *
* granted, provided that the above copyright notice appears in all *
* copies and that both the copyright notice and this permission *
* notice appear in supporting documentation, and that the name *
* University of Delaware not be used in advertising or publicity *
* pertaining to distribution of the software without specific, *
* written prior permission. The University of Delaware makes no *
* representations about the suitability of this software for any *
* purpose. It is provided "as is" without express or implied *
* warranty. *
* *
**********************************************************************/
/*
* Adapted from the original sources for FreeBSD and timecounters by:
* Poul-Henning Kamp <phk@FreeBSD.org>.
*
* The 32bit version of the "LP" macros seems a bit past its "sell by"
* date so I have retained only the 64bit version and included it directly
* in this file.
*
* Only minor changes done to interface with the timecounters over in
* sys/kern/kern_clock.c. Some of the comments below may be (even more)
* confusing and/or plain wrong in that context.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ntp.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/time.h>
#include <sys/timex.h>
#include <sys/timetc.h>
#include <sys/timepps.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#ifdef PPS_SYNC
FEATURE(pps_sync, "Support usage of external PPS signal by kernel PLL");
#endif
/*
* Single-precision macros for 64-bit machines
*/
typedef int64_t l_fp;
#define L_ADD(v, u) ((v) += (u))
#define L_SUB(v, u) ((v) -= (u))
#define L_ADDHI(v, a) ((v) += (int64_t)(a) << 32)
#define L_NEG(v) ((v) = -(v))
#define L_RSHIFT(v, n) \
do { \
if ((v) < 0) \
(v) = -(-(v) >> (n)); \
else \
(v) = (v) >> (n); \
} while (0)
#define L_MPY(v, a) ((v) *= (a))
#define L_CLR(v) ((v) = 0)
#define L_ISNEG(v) ((v) < 0)
#define L_LINT(v, a) ((v) = (int64_t)(a) << 32)
#define L_GINT(v) ((v) < 0 ? -(-(v) >> 32) : (v) >> 32)
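/*
 * Editor's note, illustrative only: a minimal worked example of the
 * fixed-point format the macros above implement.  The integer part lives
 * in the upper 32 bits and the fraction in the lower 32 bits, so:
 *
 *	l_fp v;
 *	L_LINT(v, 3);		// v = 3 << 32, i.e. exactly 3 (ns or ns/s)
 *	L_ADDHI(v, 2);		// integer part is now 5
 *	L_RSHIFT(v, 1);		// halve: 2.5 in fixed point
 *	(void)L_GINT(v);	// recovers the integer part, here 2
 *
 * L_GINT and L_RSHIFT both negate before shifting a negative value, so
 * results round toward zero instead of toward minus infinity.
 */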
/*
* Generic NTP kernel interface
*
* These routines constitute the Network Time Protocol (NTP) interfaces
* for user and daemon application programs. The ntp_gettime() routine
* provides the time, maximum error (synch distance) and estimated error
* (dispersion) to client user application programs. The ntp_adjtime()
* routine is used by the NTP daemon to adjust the system clock to an
* externally derived time. The time offset and related variables set by
* this routine are used by other routines in this module to adjust the
* phase and frequency of the clock discipline loop which controls the
* system clock.
*
* When the kernel time is reckoned directly in nanoseconds (NTP_NANO
* defined), the time at each tick interrupt is derived directly from
* the kernel time variable. When the kernel time is reckoned in
* microseconds, (NTP_NANO undefined), the time is derived from the
* kernel time variable together with a variable representing the
* leftover nanoseconds at the last tick interrupt. In either case, the
* current nanosecond time is reckoned from these values plus an
* interpolated value derived by the clock routines in another
* architecture-specific module. The interpolation can use either a
* dedicated counter or a processor cycle counter (PCC) implemented in
* some architectures.
*
* Note that all routines must run at priority splclock or higher.
*/
/*
* Phase/frequency-lock loop (PLL/FLL) definitions
*
* The nanosecond clock discipline uses two variable types, time
* variables and frequency variables. Both types are represented as 64-
* bit fixed-point quantities with the decimal point between two 32-bit
* halves. On a 32-bit machine, each half is represented as a single
* word and mathematical operations are done using multiple-precision
* arithmetic. On a 64-bit machine, ordinary computer arithmetic is
* used.
*
* A time variable is a signed 64-bit fixed-point number in ns and
* fraction. It represents the remaining time offset to be amortized
* over succeeding tick interrupts. The maximum time offset is about
* 0.5 s and the resolution is about 2.3e-10 ns.
*
* 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3
* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* |s s s| ns |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* | fraction |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
* A frequency variable is a signed 64-bit fixed-point number in ns/s
* and fraction. It represents the ns and fraction to be added to the
* kernel time variable at each second. The maximum frequency offset is
* about +-500000 ns/s and the resolution is about 2.3e-10 ns/s.
*
* 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3
* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* |s s s s s s s s s s s s s| ns/s |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* | fraction |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*/
/*
* The following variables establish the state of the PLL/FLL and the
* residual time and frequency offset of the local clock.
*/
#define SHIFT_PLL 4 /* PLL loop gain (shift) */
#define SHIFT_FLL 2 /* FLL loop gain (shift) */
static int time_state = TIME_OK; /* clock state */
static int time_status = STA_UNSYNC; /* clock status bits */
static long time_tai; /* TAI offset (s) */
static long time_monitor; /* last time offset scaled (ns) */
static long time_constant; /* poll interval (shift) (s) */
static long time_precision = 1; /* clock precision (ns) */
static long time_maxerror = MAXPHASE / 1000; /* maximum error (us) */
static long time_esterror = MAXPHASE / 1000; /* estimated error (us) */
static long time_reftime; /* time at last adjustment (s) */
static l_fp time_offset; /* time offset (ns) */
static l_fp time_freq; /* frequency offset (ns/s) */
static l_fp time_adj; /* tick adjust (ns/s) */
static int64_t time_adjtime; /* correction from adjtime(2) (usec) */
#ifdef PPS_SYNC
/*
* The following variables are used when a pulse-per-second (PPS) signal
* is available and connected via a modem control lead. They establish
* the engineering parameters of the clock discipline loop when
* controlled by the PPS signal.
*/
#define PPS_FAVG 2 /* min freq avg interval (s) (shift) */
#define PPS_FAVGDEF 8 /* default freq avg int (s) (shift) */
#define PPS_FAVGMAX 15 /* max freq avg interval (s) (shift) */
#define PPS_PAVG 4 /* phase avg interval (s) (shift) */
#define PPS_VALID 120 /* PPS signal watchdog max (s) */
#define PPS_MAXWANDER 100000 /* max PPS wander (ns/s) */
#define PPS_POPCORN 2 /* popcorn spike threshold (shift) */
static struct timespec pps_tf[3]; /* phase median filter */
static l_fp pps_freq; /* scaled frequency offset (ns/s) */
static long pps_fcount; /* frequency accumulator */
static long pps_jitter; /* nominal jitter (ns) */
static long pps_stabil; /* nominal stability (scaled ns/s) */
static long pps_lastsec; /* time at last calibration (s) */
static int pps_valid; /* signal watchdog counter */
static int pps_shift = PPS_FAVG; /* interval duration (s) (shift) */
static int pps_shiftmax = PPS_FAVGDEF; /* max interval duration (s) (shift) */
static int pps_intcnt; /* wander counter */
/*
* PPS signal quality monitors
*/
static long pps_calcnt; /* calibration intervals */
static long pps_jitcnt; /* jitter limit exceeded */
static long pps_stbcnt; /* stability limit exceeded */
static long pps_errcnt; /* calibration errors */
#endif /* PPS_SYNC */
/*
* End of phase/frequency-lock loop (PLL/FLL) definitions
*/
static void ntp_init(void);
static void hardupdate(long offset);
static void ntp_gettime1(struct ntptimeval *ntvp);
static int ntp_is_time_error(void);
static int
ntp_is_time_error(void)
{
/*
* Status word error decode. If any of these conditions occur,
* an error is returned, instead of the status word. Most
* applications will care only about the fact that the system clock
* may not be trusted, not about the details.
*
* Hardware or software error
*/
if ((time_status & (STA_UNSYNC | STA_CLOCKERR)) ||
/*
* PPS signal lost when either time or frequency synchronization
* requested
*/
(time_status & (STA_PPSFREQ | STA_PPSTIME) &&
!(time_status & STA_PPSSIGNAL)) ||
/*
* PPS jitter exceeded when time synchronization requested
*/
(time_status & STA_PPSTIME &&
time_status & STA_PPSJITTER) ||
/*
* PPS wander exceeded or calibration error when frequency
* synchronization requested
*/
(time_status & STA_PPSFREQ &&
time_status & (STA_PPSWANDER | STA_PPSERROR)))
return (1);
return (0);
}
static void
ntp_gettime1(struct ntptimeval *ntvp)
{
struct timespec atv; /* nanosecond time */
GIANT_REQUIRED;
nanotime(&atv);
ntvp->time.tv_sec = atv.tv_sec;
ntvp->time.tv_nsec = atv.tv_nsec;
ntvp->maxerror = time_maxerror;
ntvp->esterror = time_esterror;
ntvp->tai = time_tai;
ntvp->time_state = time_state;
if (ntp_is_time_error())
ntvp->time_state = TIME_ERROR;
}
/*
* ntp_gettime() - NTP user application interface
*
* See the timex.h header file for synopsis and API description. Note that
* the TAI offset is returned in the ntptimeval.tai structure member.
*/
#ifndef _SYS_SYSPROTO_H_
struct ntp_gettime_args {
struct ntptimeval *ntvp;
};
#endif
/* ARGSUSED */
int
-ntp_gettime(struct thread *td, struct ntp_gettime_args *uap)
+sys_ntp_gettime(struct thread *td, struct ntp_gettime_args *uap)
{
struct ntptimeval ntv;
mtx_lock(&Giant);
ntp_gettime1(&ntv);
mtx_unlock(&Giant);
td->td_retval[0] = ntv.time_state;
return (copyout(&ntv, uap->ntvp, sizeof(ntv)));
}
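/*
 * Editor's illustrative sketch (not part of the original source): the
 * matching userland call, assuming the ntp_gettime() declaration from
 * <sys/timex.h>.  The return value is the clock state, mirroring
 * td_retval[0] above.
 *
 *	#include <sys/timex.h>
 *
 *	struct ntptimeval ntv;
 *	int state = ntp_gettime(&ntv);
 *	// ntv.maxerror and ntv.esterror are in microseconds;
 *	// ntv.tai is the current TAI-UTC offset in seconds.
 */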
static int
ntp_sysctl(SYSCTL_HANDLER_ARGS)
{
struct ntptimeval ntv; /* temporary structure */
ntp_gettime1(&ntv);
return (sysctl_handle_opaque(oidp, &ntv, sizeof(ntv), req));
}
SYSCTL_NODE(_kern, OID_AUTO, ntp_pll, CTLFLAG_RW, 0, "");
SYSCTL_PROC(_kern_ntp_pll, OID_AUTO, gettime, CTLTYPE_OPAQUE|CTLFLAG_RD,
0, sizeof(struct ntptimeval), ntp_sysctl, "S,ntptimeval", "");
#ifdef PPS_SYNC
SYSCTL_INT(_kern_ntp_pll, OID_AUTO, pps_shiftmax, CTLFLAG_RW, &pps_shiftmax, 0, "");
SYSCTL_INT(_kern_ntp_pll, OID_AUTO, pps_shift, CTLFLAG_RW, &pps_shift, 0, "");
SYSCTL_LONG(_kern_ntp_pll, OID_AUTO, time_monitor, CTLFLAG_RD,
&time_monitor, 0, "");
SYSCTL_OPAQUE(_kern_ntp_pll, OID_AUTO, pps_freq, CTLFLAG_RD, &pps_freq, sizeof(pps_freq), "I", "");
SYSCTL_OPAQUE(_kern_ntp_pll, OID_AUTO, time_freq, CTLFLAG_RD, &time_freq, sizeof(time_freq), "I", "");
#endif
/*
* ntp_adjtime() - NTP daemon application interface
*
* See the timex.h header file for synopsis and API description. Note that
* the timex.constant structure member has a dual purpose to set the time
* constant and to set the TAI offset.
*/
#ifndef _SYS_SYSPROTO_H_
struct ntp_adjtime_args {
struct timex *tp;
};
#endif
int
-ntp_adjtime(struct thread *td, struct ntp_adjtime_args *uap)
+sys_ntp_adjtime(struct thread *td, struct ntp_adjtime_args *uap)
{
struct timex ntv; /* temporary structure */
long freq; /* frequency (ns/s) */
int modes; /* mode bits from structure */
int s; /* caller priority */
int error;
error = copyin((caddr_t)uap->tp, (caddr_t)&ntv, sizeof(ntv));
if (error)
return(error);
/*
* Update selected clock variables - only the superuser can
* change anything. Note that there is no error checking here on
* the assumption the superuser should know what it is doing.
* Note that either the time constant or TAI offset are loaded
* from the ntv.constant member, depending on the mode bits. If
* the STA_PLL bit in the status word is cleared, the state and
* status words are reset to the initial values at boot.
*/
mtx_lock(&Giant);
modes = ntv.modes;
if (modes)
error = priv_check(td, PRIV_NTP_ADJTIME);
if (error)
goto done2;
s = splclock();
if (modes & MOD_MAXERROR)
time_maxerror = ntv.maxerror;
if (modes & MOD_ESTERROR)
time_esterror = ntv.esterror;
if (modes & MOD_STATUS) {
if (time_status & STA_PLL && !(ntv.status & STA_PLL)) {
time_state = TIME_OK;
time_status = STA_UNSYNC;
#ifdef PPS_SYNC
pps_shift = PPS_FAVG;
#endif /* PPS_SYNC */
}
time_status &= STA_RONLY;
time_status |= ntv.status & ~STA_RONLY;
}
if (modes & MOD_TIMECONST) {
if (ntv.constant < 0)
time_constant = 0;
else if (ntv.constant > MAXTC)
time_constant = MAXTC;
else
time_constant = ntv.constant;
}
if (modes & MOD_TAI) {
if (ntv.constant > 0) /* XXX zero & negative numbers ? */
time_tai = ntv.constant;
}
#ifdef PPS_SYNC
if (modes & MOD_PPSMAX) {
if (ntv.shift < PPS_FAVG)
pps_shiftmax = PPS_FAVG;
else if (ntv.shift > PPS_FAVGMAX)
pps_shiftmax = PPS_FAVGMAX;
else
pps_shiftmax = ntv.shift;
}
#endif /* PPS_SYNC */
if (modes & MOD_NANO)
time_status |= STA_NANO;
if (modes & MOD_MICRO)
time_status &= ~STA_NANO;
if (modes & MOD_CLKB)
time_status |= STA_CLK;
if (modes & MOD_CLKA)
time_status &= ~STA_CLK;
if (modes & MOD_FREQUENCY) {
freq = (ntv.freq * 1000LL) >> 16;
if (freq > MAXFREQ)
L_LINT(time_freq, MAXFREQ);
else if (freq < -MAXFREQ)
L_LINT(time_freq, -MAXFREQ);
else {
/*
* ntv.freq is [PPM * 2^16] = [us/s * 2^16]
* time_freq is [ns/s * 2^32]
*/
time_freq = ntv.freq * 1000LL * 65536LL;
}
#ifdef PPS_SYNC
pps_freq = time_freq;
#endif /* PPS_SYNC */
}
if (modes & MOD_OFFSET) {
if (time_status & STA_NANO)
hardupdate(ntv.offset);
else
hardupdate(ntv.offset * 1000);
}
/*
* Retrieve all clock variables. Note that the TAI offset is
* returned only by ntp_gettime().
*/
if (time_status & STA_NANO)
ntv.offset = L_GINT(time_offset);
else
ntv.offset = L_GINT(time_offset) / 1000; /* XXX rounding ? */
ntv.freq = L_GINT((time_freq / 1000LL) << 16);
ntv.maxerror = time_maxerror;
ntv.esterror = time_esterror;
ntv.status = time_status;
ntv.constant = time_constant;
if (time_status & STA_NANO)
ntv.precision = time_precision;
else
ntv.precision = time_precision / 1000;
ntv.tolerance = MAXFREQ * SCALE_PPM;
#ifdef PPS_SYNC
ntv.shift = pps_shift;
ntv.ppsfreq = L_GINT((pps_freq / 1000LL) << 16);
if (time_status & STA_NANO)
ntv.jitter = pps_jitter;
else
ntv.jitter = pps_jitter / 1000;
ntv.stabil = pps_stabil;
ntv.calcnt = pps_calcnt;
ntv.errcnt = pps_errcnt;
ntv.jitcnt = pps_jitcnt;
ntv.stbcnt = pps_stbcnt;
#endif /* PPS_SYNC */
splx(s);
error = copyout((caddr_t)&ntv, (caddr_t)uap->tp, sizeof(ntv));
if (error)
goto done2;
if (ntp_is_time_error())
td->td_retval[0] = TIME_ERROR;
else
td->td_retval[0] = time_state;
done2:
mtx_unlock(&Giant);
return (error);
}
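/*
 * Editor's worked example of the MOD_FREQUENCY scaling above, illustrative
 * only.  ntv.freq arrives as PPM scaled by 2^16 (us/s * 2^16) while
 * time_freq is kept as ns/s * 2^32:
 *
 *	requested drift: +10 PPM
 *	ntv.freq  = 10 * 65536            = 655360
 *	time_freq = 655360 * 1000 * 65536 = 10000 ns/s * 2^32
 *
 * Multiplying by 1000 converts us to ns and the extra 65536 completes the
 * move from 2^16 to 2^32 scaling; reading the value back reverses both
 * steps via L_GINT((time_freq / 1000LL) << 16).
 */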
/*
* second_overflow() - called after ntp_tick_adjust()
*
* This routine is ordinarily called immediately following the above
* routine ntp_tick_adjust(). While these two routines are normally
* combined, they are separated here only for the purposes of
* simulation.
*/
void
ntp_update_second(int64_t *adjustment, time_t *newsec)
{
int tickrate;
l_fp ftemp; /* 32/64-bit temporary */
/*
* On rollover of the second both the nanosecond and microsecond
* clocks are updated and the state machine cranked as
* necessary. The phase adjustment to be used for the next
* second is calculated and the maximum error is increased by
* the tolerance.
*/
time_maxerror += MAXFREQ / 1000;
/*
* Leap second processing. If in leap-insert state at
* the end of the day, the system clock is set back one
* second; if in leap-delete state, the system clock is
* set ahead one second. The nano_time() routine or
* external clock driver will ensure that reported time
* is always monotonic.
*/
switch (time_state) {
/*
* No warning.
*/
case TIME_OK:
if (time_status & STA_INS)
time_state = TIME_INS;
else if (time_status & STA_DEL)
time_state = TIME_DEL;
break;
/*
* Insert second 23:59:60 following second
* 23:59:59.
*/
case TIME_INS:
if (!(time_status & STA_INS))
time_state = TIME_OK;
else if ((*newsec) % 86400 == 0) {
(*newsec)--;
time_state = TIME_OOP;
time_tai++;
}
break;
/*
* Delete second 23:59:59.
*/
case TIME_DEL:
if (!(time_status & STA_DEL))
time_state = TIME_OK;
else if (((*newsec) + 1) % 86400 == 0) {
(*newsec)++;
time_tai--;
time_state = TIME_WAIT;
}
break;
/*
* Insert second in progress.
*/
case TIME_OOP:
time_state = TIME_WAIT;
break;
/*
* Wait for status bits to clear.
*/
case TIME_WAIT:
if (!(time_status & (STA_INS | STA_DEL)))
time_state = TIME_OK;
}
/*
* Compute the total time adjustment for the next second
* in ns. The offset is reduced by a factor depending on
* whether the PPS signal is operating. Note that the
* value is in effect scaled by the clock frequency,
* since the adjustment is added at each tick interrupt.
*/
ftemp = time_offset;
#ifdef PPS_SYNC
/* XXX even if PPS signal dies we should finish adjustment ? */
if (time_status & STA_PPSTIME && time_status &
STA_PPSSIGNAL)
L_RSHIFT(ftemp, pps_shift);
else
L_RSHIFT(ftemp, SHIFT_PLL + time_constant);
#else
L_RSHIFT(ftemp, SHIFT_PLL + time_constant);
#endif /* PPS_SYNC */
time_adj = ftemp;
L_SUB(time_offset, ftemp);
L_ADD(time_adj, time_freq);
/*
* Apply any correction from adjtime(2).  If we are more than one second
* off, slew at a rate of 5ms/s (5000 PPM); otherwise at 500us/s (500 PPM),
* until the last second is slewed the final < 500 usecs.
*/
if (time_adjtime != 0) {
if (time_adjtime > 1000000)
tickrate = 5000;
else if (time_adjtime < -1000000)
tickrate = -5000;
else if (time_adjtime > 500)
tickrate = 500;
else if (time_adjtime < -500)
tickrate = -500;
else
tickrate = time_adjtime;
time_adjtime -= tickrate;
L_LINT(ftemp, tickrate * 1000);
L_ADD(time_adj, ftemp);
}
*adjustment = time_adj;
#ifdef PPS_SYNC
if (pps_valid > 0)
pps_valid--;
else
time_status &= ~STA_PPSSIGNAL;
#endif /* PPS_SYNC */
}
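/*
 * Editor's worked example of the adjtime(2) slewing above, illustrative
 * only.  With a pending correction of +2.5 s:
 *
 *	time_adjtime = 2500000 us -> tickrate 5000 us this second (5000 PPM)
 *	... (repeated until the residue drops under one second) ...
 *	time_adjtime =  800000 us -> tickrate 500 us per second (500 PPM)
 *	time_adjtime =     300 us -> the final second slews the last 300 us
 *
 * Each second subtracts tickrate from time_adjtime and folds
 * tickrate * 1000 ns into time_adj, so the correction is amortized without
 * ever stepping the clock.
 */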
/*
* ntp_init() - initialize variables and structures
*
* This routine must be called after the kernel variables hz and tick
* are set or changed and before the next tick interrupt. In this
* particular implementation, these values are assumed set elsewhere in
* the kernel. The design allows the clock frequency and tick interval
* to be changed while the system is running. So, this routine should
* probably be integrated with the code that does that.
*/
static void
ntp_init()
{
/*
* The following variables are initialized only at startup. Only
* those structures not cleared by the compiler need to be
* initialized, and these only in the simulator. In the actual
* kernel, any nonzero values here will quickly evaporate.
*/
L_CLR(time_offset);
L_CLR(time_freq);
#ifdef PPS_SYNC
pps_tf[0].tv_sec = pps_tf[0].tv_nsec = 0;
pps_tf[1].tv_sec = pps_tf[1].tv_nsec = 0;
pps_tf[2].tv_sec = pps_tf[2].tv_nsec = 0;
pps_fcount = 0;
L_CLR(pps_freq);
#endif /* PPS_SYNC */
}
SYSINIT(ntpclocks, SI_SUB_CLOCKS, SI_ORDER_MIDDLE, ntp_init, NULL);
/*
* hardupdate() - local clock update
*
* This routine is called by ntp_adjtime() to update the local clock
* phase and frequency. The implementation is of an adaptive-parameter,
* hybrid phase/frequency-lock loop (PLL/FLL). The routine computes new
* time and frequency offset estimates for each call. If the kernel PPS
* discipline code is configured (PPS_SYNC), the PPS signal itself
* determines the new time offset, instead of the calling argument.
* Presumably, calls to ntp_adjtime() occur only when the caller
* believes the local clock is valid within some bound (+-128 ms with
* NTP). If the caller's time is far different than the PPS time, an
* argument will ensue, and it's not clear who will lose.
*
* For uncompensated quartz crystal oscillators and nominal update
* intervals less than 256 s, operation should be in phase-lock mode,
* where the loop is disciplined to phase. For update intervals greater
* than 1024 s, operation should be in frequency-lock mode, where the
* loop is disciplined to frequency. Between 256 s and 1024 s, the mode
* is selected by the STA_MODE status bit.
*/
static void
hardupdate(offset)
long offset; /* clock offset (ns) */
{
long mtemp;
l_fp ftemp;
/*
* Select how the phase is to be controlled and from which
* source. If the PPS signal is present and enabled to
* discipline the time, the PPS offset is used; otherwise, the
* argument offset is used.
*/
if (!(time_status & STA_PLL))
return;
if (!(time_status & STA_PPSTIME && time_status &
STA_PPSSIGNAL)) {
if (offset > MAXPHASE)
time_monitor = MAXPHASE;
else if (offset < -MAXPHASE)
time_monitor = -MAXPHASE;
else
time_monitor = offset;
L_LINT(time_offset, time_monitor);
}
/*
* Select how the frequency is to be controlled and in which
* mode (PLL or FLL). If the PPS signal is present and enabled
* to discipline the frequency, the PPS frequency is used;
* otherwise, the argument offset is used to compute it.
*/
if (time_status & STA_PPSFREQ && time_status & STA_PPSSIGNAL) {
time_reftime = time_second;
return;
}
if (time_status & STA_FREQHOLD || time_reftime == 0)
time_reftime = time_second;
mtemp = time_second - time_reftime;
L_LINT(ftemp, time_monitor);
L_RSHIFT(ftemp, (SHIFT_PLL + 2 + time_constant) << 1);
L_MPY(ftemp, mtemp);
L_ADD(time_freq, ftemp);
time_status &= ~STA_MODE;
if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp >
MAXSEC)) {
L_LINT(ftemp, (time_monitor << 4) / mtemp);
L_RSHIFT(ftemp, SHIFT_FLL + 4);
L_ADD(time_freq, ftemp);
time_status |= STA_MODE;
}
time_reftime = time_second;
if (L_GINT(time_freq) > MAXFREQ)
L_LINT(time_freq, MAXFREQ);
else if (L_GINT(time_freq) < -MAXFREQ)
L_LINT(time_freq, -MAXFREQ);
}
#ifdef PPS_SYNC
/*
* hardpps() - discipline CPU clock oscillator to external PPS signal
*
* This routine is called at each PPS interrupt in order to discipline
* the CPU clock oscillator to the PPS signal. There are two independent
* first-order feedback loops, one for the phase, the other for the
* frequency. The phase loop measures and grooms the PPS phase offset
* and leaves it in a handy spot for the seconds overflow routine. The
* frequency loop averages successive PPS phase differences and
* calculates the PPS frequency offset, which is also processed by the
* seconds overflow routine. The code requires the caller to capture the
* time and architecture-dependent hardware counter values in
* nanoseconds at the on-time PPS signal transition.
*
* Note that, on some Unix systems this routine runs at an interrupt
* priority level higher than the timer interrupt routine hardclock().
* Therefore, the variables used are distinct from the hardclock()
* variables, except for the actual time and frequency variables, which
* are determined by this routine and updated atomically.
*/
void
hardpps(tsp, nsec)
struct timespec *tsp; /* time at PPS */
long nsec; /* hardware counter at PPS */
{
long u_sec, u_nsec, v_nsec; /* temps */
l_fp ftemp;
/*
* The signal is first processed by a range gate and frequency
* discriminator. The range gate rejects noise spikes outside
* the range +-500 us. The frequency discriminator rejects input
* signals with apparent frequency outside the range 1 +-500
* PPM. If two hits occur in the same second, we ignore the
* later hit; if not and a hit occurs outside the range gate,
* keep the later hit for later comparison, but do not process
* it.
*/
time_status |= STA_PPSSIGNAL | STA_PPSJITTER;
time_status &= ~(STA_PPSWANDER | STA_PPSERROR);
pps_valid = PPS_VALID;
u_sec = tsp->tv_sec;
u_nsec = tsp->tv_nsec;
if (u_nsec >= (NANOSECOND >> 1)) {
u_nsec -= NANOSECOND;
u_sec++;
}
v_nsec = u_nsec - pps_tf[0].tv_nsec;
if (u_sec == pps_tf[0].tv_sec && v_nsec < NANOSECOND -
MAXFREQ)
return;
pps_tf[2] = pps_tf[1];
pps_tf[1] = pps_tf[0];
pps_tf[0].tv_sec = u_sec;
pps_tf[0].tv_nsec = u_nsec;
/*
* Compute the difference between the current and previous
* counter values. If the difference exceeds 0.5 s, assume it
* has wrapped around, so correct 1.0 s. If the result exceeds
* the tick interval, the sample point has crossed a tick
* boundary during the last second, so correct the tick. Very
* intricate.
*/
u_nsec = nsec;
if (u_nsec > (NANOSECOND >> 1))
u_nsec -= NANOSECOND;
else if (u_nsec < -(NANOSECOND >> 1))
u_nsec += NANOSECOND;
pps_fcount += u_nsec;
if (v_nsec > MAXFREQ || v_nsec < -MAXFREQ)
return;
time_status &= ~STA_PPSJITTER;
/*
* A three-stage median filter is used to help denoise the PPS
* time. The median sample becomes the time offset estimate; the
* difference between the other two samples becomes the time
* dispersion (jitter) estimate.
*/
if (pps_tf[0].tv_nsec > pps_tf[1].tv_nsec) {
if (pps_tf[1].tv_nsec > pps_tf[2].tv_nsec) {
v_nsec = pps_tf[1].tv_nsec; /* 0 1 2 */
u_nsec = pps_tf[0].tv_nsec - pps_tf[2].tv_nsec;
} else if (pps_tf[2].tv_nsec > pps_tf[0].tv_nsec) {
v_nsec = pps_tf[0].tv_nsec; /* 2 0 1 */
u_nsec = pps_tf[2].tv_nsec - pps_tf[1].tv_nsec;
} else {
v_nsec = pps_tf[2].tv_nsec; /* 0 2 1 */
u_nsec = pps_tf[0].tv_nsec - pps_tf[1].tv_nsec;
}
} else {
if (pps_tf[1].tv_nsec < pps_tf[2].tv_nsec) {
v_nsec = pps_tf[1].tv_nsec; /* 2 1 0 */
u_nsec = pps_tf[2].tv_nsec - pps_tf[0].tv_nsec;
} else if (pps_tf[2].tv_nsec < pps_tf[0].tv_nsec) {
v_nsec = pps_tf[0].tv_nsec; /* 1 0 2 */
u_nsec = pps_tf[1].tv_nsec - pps_tf[2].tv_nsec;
} else {
v_nsec = pps_tf[2].tv_nsec; /* 1 2 0 */
u_nsec = pps_tf[1].tv_nsec - pps_tf[0].tv_nsec;
}
}
/*
* Nominal jitter is due to PPS signal noise and interrupt
* latency. If it exceeds the popcorn threshold, the sample is
* discarded; otherwise, if so enabled, the time offset is
* updated. We can tolerate a modest loss of data here without
* much degrading time accuracy.
*/
if (u_nsec > (pps_jitter << PPS_POPCORN)) {
time_status |= STA_PPSJITTER;
pps_jitcnt++;
} else if (time_status & STA_PPSTIME) {
time_monitor = -v_nsec;
L_LINT(time_offset, time_monitor);
}
pps_jitter += (u_nsec - pps_jitter) >> PPS_FAVG;
u_sec = pps_tf[0].tv_sec - pps_lastsec;
if (u_sec < (1 << pps_shift))
return;
/*
* At the end of the calibration interval the difference between
* the first and last counter values becomes the scaled
* frequency. It will later be divided by the length of the
* interval to determine the frequency update. If the frequency
* exceeds a sanity threshold, or if the actual calibration
* interval is not equal to the expected length, the data are
* discarded. We can tolerate a modest loss of data here without
* much degrading frequency accuracy.
*/
pps_calcnt++;
v_nsec = -pps_fcount;
pps_lastsec = pps_tf[0].tv_sec;
pps_fcount = 0;
u_nsec = MAXFREQ << pps_shift;
if (v_nsec > u_nsec || v_nsec < -u_nsec || u_sec != (1 <<
pps_shift)) {
time_status |= STA_PPSERROR;
pps_errcnt++;
return;
}
/*
* Here the raw frequency offset and wander (stability) are
* calculated. If the wander is less than the wander threshold
* for four consecutive averaging intervals, the interval is
* doubled; if it is greater than the threshold for four
* consecutive intervals, the interval is halved. The scaled
* frequency offset is converted to frequency offset. The
* stability metric is calculated as the average of recent
* frequency changes, but is used only for performance
* monitoring.
*/
L_LINT(ftemp, v_nsec);
L_RSHIFT(ftemp, pps_shift);
L_SUB(ftemp, pps_freq);
u_nsec = L_GINT(ftemp);
if (u_nsec > PPS_MAXWANDER) {
L_LINT(ftemp, PPS_MAXWANDER);
pps_intcnt--;
time_status |= STA_PPSWANDER;
pps_stbcnt++;
} else if (u_nsec < -PPS_MAXWANDER) {
L_LINT(ftemp, -PPS_MAXWANDER);
pps_intcnt--;
time_status |= STA_PPSWANDER;
pps_stbcnt++;
} else {
pps_intcnt++;
}
if (pps_intcnt >= 4) {
pps_intcnt = 4;
if (pps_shift < pps_shiftmax) {
pps_shift++;
pps_intcnt = 0;
}
} else if (pps_intcnt <= -4 || pps_shift > pps_shiftmax) {
pps_intcnt = -4;
if (pps_shift > PPS_FAVG) {
pps_shift--;
pps_intcnt = 0;
}
}
if (u_nsec < 0)
u_nsec = -u_nsec;
pps_stabil += (u_nsec * SCALE_PPM - pps_stabil) >> PPS_FAVG;
/*
* The PPS frequency is recalculated and clamped to the maximum
* MAXFREQ. If enabled, the system clock frequency is updated as
* well.
*/
L_ADD(pps_freq, ftemp);
u_nsec = L_GINT(pps_freq);
if (u_nsec > MAXFREQ)
L_LINT(pps_freq, MAXFREQ);
else if (u_nsec < -MAXFREQ)
L_LINT(pps_freq, -MAXFREQ);
if (time_status & STA_PPSFREQ)
time_freq = pps_freq;
}
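/*
 * Editor's worked example of the three-stage median filter above,
 * illustrative only.  Suppose the last three PPS phase samples are
 *
 *	pps_tf[0].tv_nsec = 120 (newest)
 *	pps_tf[1].tv_nsec =  80
 *	pps_tf[2].tv_nsec = 100
 *
 * pps_tf[0] > pps_tf[1], but neither pps_tf[1] > pps_tf[2] nor
 * pps_tf[2] > pps_tf[0] holds, so the "0 2 1" branch selects
 * v_nsec = 100 (the median) and u_nsec = 120 - 80 = 40 (the spread that
 * feeds the jitter estimate and the popcorn-spike test).
 */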
#endif /* PPS_SYNC */
#ifndef _SYS_SYSPROTO_H_
struct adjtime_args {
struct timeval *delta;
struct timeval *olddelta;
};
#endif
/* ARGSUSED */
int
-adjtime(struct thread *td, struct adjtime_args *uap)
+sys_adjtime(struct thread *td, struct adjtime_args *uap)
{
struct timeval delta, olddelta, *deltap;
int error;
if (uap->delta) {
error = copyin(uap->delta, &delta, sizeof(delta));
if (error)
return (error);
deltap = &delta;
} else
deltap = NULL;
error = kern_adjtime(td, deltap, &olddelta);
if (uap->olddelta && error == 0)
error = copyout(&olddelta, uap->olddelta, sizeof(olddelta));
return (error);
}
int
kern_adjtime(struct thread *td, struct timeval *delta, struct timeval *olddelta)
{
struct timeval atv;
int error;
mtx_lock(&Giant);
if (olddelta) {
atv.tv_sec = time_adjtime / 1000000;
atv.tv_usec = time_adjtime % 1000000;
if (atv.tv_usec < 0) {
atv.tv_usec += 1000000;
atv.tv_sec--;
}
*olddelta = atv;
}
if (delta) {
if ((error = priv_check(td, PRIV_ADJTIME))) {
mtx_unlock(&Giant);
return (error);
}
time_adjtime = (int64_t)delta->tv_sec * 1000000 +
delta->tv_usec;
}
mtx_unlock(&Giant);
return (0);
}
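/*
 * Editor's illustrative sketch (not part of the original source): the
 * userland side of the path above, assuming the adjtime(2) wrapper from
 * <sys/time.h> and sufficient privilege for the PRIV_ADJTIME check.
 *
 *	#include <sys/time.h>
 *	#include <err.h>
 *
 *	struct timeval delta = { 0, 500000 };	// +0.5 s, slewed not stepped
 *	struct timeval olddelta;
 *
 *	if (adjtime(&delta, &olddelta) != 0)
 *		err(1, "adjtime");
 *
 * olddelta reports whatever correction was still pending; the new delta is
 * stored in time_adjtime as (int64_t)tv_sec * 1000000 + tv_usec.
 */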
static struct callout resettodr_callout;
static int resettodr_period = 1800;
static void
periodic_resettodr(void *arg __unused)
{
if (!ntp_is_time_error()) {
mtx_lock(&Giant);
resettodr();
mtx_unlock(&Giant);
}
if (resettodr_period > 0)
callout_schedule(&resettodr_callout, resettodr_period * hz);
}
static void
shutdown_resettodr(void *arg __unused, int howto __unused)
{
callout_drain(&resettodr_callout);
if (resettodr_period > 0 && !ntp_is_time_error()) {
mtx_lock(&Giant);
resettodr();
mtx_unlock(&Giant);
}
}
static int
sysctl_resettodr_period(SYSCTL_HANDLER_ARGS)
{
int error;
error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
if (error || !req->newptr)
return (error);
if (resettodr_period == 0)
callout_stop(&resettodr_callout);
else
callout_reset(&resettodr_callout, resettodr_period * hz,
periodic_resettodr, NULL);
return (0);
}
SYSCTL_PROC(_machdep, OID_AUTO, rtc_save_period, CTLTYPE_INT|CTLFLAG_RW,
&resettodr_period, 1800, sysctl_resettodr_period, "I",
"Save system time to RTC with this period (in seconds)");
TUNABLE_INT("machdep.rtc_save_period", &resettodr_period);
static void
start_periodic_resettodr(void *arg __unused)
{
EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_resettodr, NULL,
SHUTDOWN_PRI_FIRST);
callout_init(&resettodr_callout, 1);
if (resettodr_period == 0)
return;
callout_reset(&resettodr_callout, resettodr_period * hz,
periodic_resettodr, NULL);
}
SYSINIT(periodic_resettodr, SI_SUB_RUN_SCHEDULER, SI_ORDER_MIDDLE,
start_periodic_resettodr, NULL);
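/*
 * Editor's usage note, illustrative only: the RTC save period above is
 * exposed both as a loader tunable and as a read/write sysctl, so it can
 * be set at boot or at run time, e.g.
 *
 *	machdep.rtc_save_period="900"		(in /boot/loader.conf)
 *	sysctl machdep.rtc_save_period=900	(at run time)
 *
 * Setting it to 0 stops the periodic callout via sysctl_resettodr_period().
 */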
Index: head/sys/kern/kern_proc.c
===================================================================
--- head/sys/kern/kern_proc.c (revision 225616)
+++ head/sys/kern/kern_proc.c (revision 225617)
@@ -1,2078 +1,2078 @@
/*-
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_proc.c 8.7 (Berkeley) 2/14/95
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_ddb.h"
#include "opt_kdtrace.h"
#include "opt_ktrace.h"
#include "opt_kstack_pages.h"
#include "opt_stack.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/loginclass.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/sbuf.h>
#include <sys/sysent.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/stack.h>
#include <sys/sysctl.h>
#include <sys/filedesc.h>
#include <sys/tty.h>
#include <sys/signalvar.h>
#include <sys/sdt.h>
#include <sys/sx.h>
#include <sys/user.h>
#include <sys/jail.h>
#include <sys/vnode.h>
#include <sys/eventhandler.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/uma.h>
#ifdef COMPAT_FREEBSD32
#include <compat/freebsd32/freebsd32.h>
#include <compat/freebsd32/freebsd32_util.h>
#endif
SDT_PROVIDER_DEFINE(proc);
SDT_PROBE_DEFINE(proc, kernel, ctor, entry, entry);
SDT_PROBE_ARGTYPE(proc, kernel, ctor, entry, 0, "struct proc *");
SDT_PROBE_ARGTYPE(proc, kernel, ctor, entry, 1, "int");
SDT_PROBE_ARGTYPE(proc, kernel, ctor, entry, 2, "void *");
SDT_PROBE_ARGTYPE(proc, kernel, ctor, entry, 3, "int");
SDT_PROBE_DEFINE(proc, kernel, ctor, return, return);
SDT_PROBE_ARGTYPE(proc, kernel, ctor, return, 0, "struct proc *");
SDT_PROBE_ARGTYPE(proc, kernel, ctor, return, 1, "int");
SDT_PROBE_ARGTYPE(proc, kernel, ctor, return, 2, "void *");
SDT_PROBE_ARGTYPE(proc, kernel, ctor, return, 3, "int");
SDT_PROBE_DEFINE(proc, kernel, dtor, entry, entry);
SDT_PROBE_ARGTYPE(proc, kernel, dtor, entry, 0, "struct proc *");
SDT_PROBE_ARGTYPE(proc, kernel, dtor, entry, 1, "int");
SDT_PROBE_ARGTYPE(proc, kernel, dtor, entry, 2, "void *");
SDT_PROBE_ARGTYPE(proc, kernel, dtor, entry, 3, "struct thread *");
SDT_PROBE_DEFINE(proc, kernel, dtor, return, return);
SDT_PROBE_ARGTYPE(proc, kernel, dtor, return, 0, "struct proc *");
SDT_PROBE_ARGTYPE(proc, kernel, dtor, return, 1, "int");
SDT_PROBE_ARGTYPE(proc, kernel, dtor, return, 2, "void *");
SDT_PROBE_DEFINE(proc, kernel, init, entry, entry);
SDT_PROBE_ARGTYPE(proc, kernel, init, entry, 0, "struct proc *");
SDT_PROBE_ARGTYPE(proc, kernel, init, entry, 1, "int");
SDT_PROBE_ARGTYPE(proc, kernel, init, entry, 2, "int");
SDT_PROBE_DEFINE(proc, kernel, init, return, return);
SDT_PROBE_ARGTYPE(proc, kernel, init, return, 0, "struct proc *");
SDT_PROBE_ARGTYPE(proc, kernel, init, return, 1, "int");
SDT_PROBE_ARGTYPE(proc, kernel, init, return, 2, "int");
MALLOC_DEFINE(M_PGRP, "pgrp", "process group header");
MALLOC_DEFINE(M_SESSION, "session", "session header");
static MALLOC_DEFINE(M_PROC, "proc", "Proc structures");
MALLOC_DEFINE(M_SUBPROC, "subproc", "Proc sub-structures");
static void doenterpgrp(struct proc *, struct pgrp *);
static void orphanpg(struct pgrp *pg);
static void fill_kinfo_aggregate(struct proc *p, struct kinfo_proc *kp);
static void fill_kinfo_proc_only(struct proc *p, struct kinfo_proc *kp);
static void fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp,
int preferthread);
static void pgadjustjobc(struct pgrp *pgrp, int entering);
static void pgdelete(struct pgrp *);
static int proc_ctor(void *mem, int size, void *arg, int flags);
static void proc_dtor(void *mem, int size, void *arg);
static int proc_init(void *mem, int size, int flags);
static void proc_fini(void *mem, int size);
static void pargs_free(struct pargs *pa);
/*
* Other process lists
*/
struct pidhashhead *pidhashtbl;
u_long pidhash;
struct pgrphashhead *pgrphashtbl;
u_long pgrphash;
struct proclist allproc;
struct proclist zombproc;
struct sx allproc_lock;
struct sx proctree_lock;
struct mtx ppeers_lock;
uma_zone_t proc_zone;
int kstack_pages = KSTACK_PAGES;
SYSCTL_INT(_kern, OID_AUTO, kstack_pages, CTLFLAG_RD, &kstack_pages, 0,
"Kernel stack size in pages");
CTASSERT(sizeof(struct kinfo_proc) == KINFO_PROC_SIZE);
#ifdef COMPAT_FREEBSD32
CTASSERT(sizeof(struct kinfo_proc32) == KINFO_PROC32_SIZE);
#endif
/*
* Initialize global process hashing structures.
*/
void
procinit()
{
sx_init(&allproc_lock, "allproc");
sx_init(&proctree_lock, "proctree");
mtx_init(&ppeers_lock, "p_peers", NULL, MTX_DEF);
LIST_INIT(&allproc);
LIST_INIT(&zombproc);
pidhashtbl = hashinit(maxproc / 4, M_PROC, &pidhash);
pgrphashtbl = hashinit(maxproc / 4, M_PROC, &pgrphash);
proc_zone = uma_zcreate("PROC", sched_sizeof_proc(),
proc_ctor, proc_dtor, proc_init, proc_fini,
UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
uihashinit();
}
/*
* Prepare a proc for use.
*/
static int
proc_ctor(void *mem, int size, void *arg, int flags)
{
struct proc *p;
p = (struct proc *)mem;
SDT_PROBE(proc, kernel, ctor , entry, p, size, arg, flags, 0);
EVENTHANDLER_INVOKE(process_ctor, p);
SDT_PROBE(proc, kernel, ctor , return, p, size, arg, flags, 0);
return (0);
}
/*
* Reclaim a proc after use.
*/
static void
proc_dtor(void *mem, int size, void *arg)
{
struct proc *p;
struct thread *td;
/* INVARIANTS checks go here */
p = (struct proc *)mem;
td = FIRST_THREAD_IN_PROC(p);
SDT_PROBE(proc, kernel, dtor, entry, p, size, arg, td, 0);
if (td != NULL) {
#ifdef INVARIANTS
KASSERT((p->p_numthreads == 1),
("bad number of threads in exiting process"));
KASSERT(STAILQ_EMPTY(&p->p_ktr), ("proc_dtor: non-empty p_ktr"));
#endif
/* Free all OSD associated to this thread. */
osd_thread_exit(td);
}
EVENTHANDLER_INVOKE(process_dtor, p);
if (p->p_ksi != NULL)
KASSERT(! KSI_ONQ(p->p_ksi), ("SIGCHLD queue"));
SDT_PROBE(proc, kernel, dtor, return, p, size, arg, 0, 0);
}
/*
* Initialize type-stable parts of a proc (when newly created).
*/
static int
proc_init(void *mem, int size, int flags)
{
struct proc *p;
p = (struct proc *)mem;
SDT_PROBE(proc, kernel, init, entry, p, size, flags, 0, 0);
p->p_sched = (struct p_sched *)&p[1];
bzero(&p->p_mtx, sizeof(struct mtx));
mtx_init(&p->p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK);
mtx_init(&p->p_slock, "process slock", NULL, MTX_SPIN | MTX_RECURSE);
cv_init(&p->p_pwait, "ppwait");
cv_init(&p->p_dbgwait, "dbgwait");
TAILQ_INIT(&p->p_threads); /* all threads in proc */
EVENTHANDLER_INVOKE(process_init, p);
p->p_stats = pstats_alloc();
SDT_PROBE(proc, kernel, init, return, p, size, flags, 0, 0);
return (0);
}
/*
* UMA should ensure that this function is never called.
* Freeing a proc structure would violate type stability.
*/
static void
proc_fini(void *mem, int size)
{
#ifdef notnow
struct proc *p;
p = (struct proc *)mem;
EVENTHANDLER_INVOKE(process_fini, p);
pstats_free(p->p_stats);
thread_free(FIRST_THREAD_IN_PROC(p));
mtx_destroy(&p->p_mtx);
if (p->p_ksi != NULL)
ksiginfo_free(p->p_ksi);
#else
panic("proc reclaimed");
#endif
}
/*
* Is p an inferior of the current process?
*/
int
inferior(p)
register struct proc *p;
{
sx_assert(&proctree_lock, SX_LOCKED);
for (; p != curproc; p = p->p_pptr)
if (p->p_pid == 0)
return (0);
return (1);
}
/*
* Locate a process by number; return only "live" processes -- i.e., neither
* zombies nor newly born but incompletely initialized processes. By not
* returning processes in the PRS_NEW state, we allow callers to avoid
* testing for that condition to avoid dereferencing p_ucred, et al.
*/
struct proc *
pfind(pid)
register pid_t pid;
{
register struct proc *p;
sx_slock(&allproc_lock);
LIST_FOREACH(p, PIDHASH(pid), p_hash)
if (p->p_pid == pid) {
PROC_LOCK(p);
if (p->p_state == PRS_NEW) {
PROC_UNLOCK(p);
p = NULL;
}
break;
}
sx_sunlock(&allproc_lock);
return (p);
}
/*
* Locate a process group by number.
* The caller must hold proctree_lock.
*/
struct pgrp *
pgfind(pgid)
register pid_t pgid;
{
register struct pgrp *pgrp;
sx_assert(&proctree_lock, SX_LOCKED);
LIST_FOREACH(pgrp, PGRPHASH(pgid), pg_hash) {
if (pgrp->pg_id == pgid) {
PGRP_LOCK(pgrp);
return (pgrp);
}
}
return (NULL);
}
/*
* Create a new process group.
* pgid must be equal to the pid of p.
* Begin a new session if required.
*/
int
enterpgrp(p, pgid, pgrp, sess)
register struct proc *p;
pid_t pgid;
struct pgrp *pgrp;
struct session *sess;
{
struct pgrp *pgrp2;
sx_assert(&proctree_lock, SX_XLOCKED);
KASSERT(pgrp != NULL, ("enterpgrp: pgrp == NULL"));
KASSERT(p->p_pid == pgid,
("enterpgrp: new pgrp and pid != pgid"));
pgrp2 = pgfind(pgid);
KASSERT(pgrp2 == NULL,
("enterpgrp: pgrp with pgid exists"));
KASSERT(!SESS_LEADER(p),
("enterpgrp: session leader attempted setpgrp"));
mtx_init(&pgrp->pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK);
if (sess != NULL) {
/*
* new session
*/
mtx_init(&sess->s_mtx, "session", NULL, MTX_DEF);
PROC_LOCK(p);
p->p_flag &= ~P_CONTROLT;
PROC_UNLOCK(p);
PGRP_LOCK(pgrp);
sess->s_leader = p;
sess->s_sid = p->p_pid;
refcount_init(&sess->s_count, 1);
sess->s_ttyvp = NULL;
sess->s_ttydp = NULL;
sess->s_ttyp = NULL;
bcopy(p->p_session->s_login, sess->s_login,
sizeof(sess->s_login));
pgrp->pg_session = sess;
KASSERT(p == curproc,
("enterpgrp: mksession and p != curproc"));
} else {
pgrp->pg_session = p->p_session;
sess_hold(pgrp->pg_session);
PGRP_LOCK(pgrp);
}
pgrp->pg_id = pgid;
LIST_INIT(&pgrp->pg_members);
/*
* As we have an exclusive lock of proctree_lock,
* this should not deadlock.
*/
LIST_INSERT_HEAD(PGRPHASH(pgid), pgrp, pg_hash);
pgrp->pg_jobc = 0;
SLIST_INIT(&pgrp->pg_sigiolst);
PGRP_UNLOCK(pgrp);
doenterpgrp(p, pgrp);
return (0);
}
/*
* Move p to an existing process group
*/
int
enterthispgrp(p, pgrp)
register struct proc *p;
struct pgrp *pgrp;
{
sx_assert(&proctree_lock, SX_XLOCKED);
PROC_LOCK_ASSERT(p, MA_NOTOWNED);
PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED);
SESS_LOCK_ASSERT(p->p_session, MA_NOTOWNED);
KASSERT(pgrp->pg_session == p->p_session,
("%s: pgrp's session %p, p->p_session %p.\n",
__func__,
pgrp->pg_session,
p->p_session));
KASSERT(pgrp != p->p_pgrp,
("%s: p belongs to pgrp.", __func__));
doenterpgrp(p, pgrp);
return (0);
}
/*
* Move p to a process group
*/
static void
doenterpgrp(p, pgrp)
struct proc *p;
struct pgrp *pgrp;
{
struct pgrp *savepgrp;
sx_assert(&proctree_lock, SX_XLOCKED);
PROC_LOCK_ASSERT(p, MA_NOTOWNED);
PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED);
SESS_LOCK_ASSERT(p->p_session, MA_NOTOWNED);
savepgrp = p->p_pgrp;
/*
* Adjust eligibility of affected pgrps to participate in job control.
* Increment eligibility counts before decrementing, otherwise we
* could reach 0 spuriously during the first call.
*/
fixjobc(p, pgrp, 1);
fixjobc(p, p->p_pgrp, 0);
PGRP_LOCK(pgrp);
PGRP_LOCK(savepgrp);
PROC_LOCK(p);
LIST_REMOVE(p, p_pglist);
p->p_pgrp = pgrp;
PROC_UNLOCK(p);
LIST_INSERT_HEAD(&pgrp->pg_members, p, p_pglist);
PGRP_UNLOCK(savepgrp);
PGRP_UNLOCK(pgrp);
if (LIST_EMPTY(&savepgrp->pg_members))
pgdelete(savepgrp);
}
/*
* remove process from process group
*/
int
leavepgrp(p)
register struct proc *p;
{
struct pgrp *savepgrp;
sx_assert(&proctree_lock, SX_XLOCKED);
savepgrp = p->p_pgrp;
PGRP_LOCK(savepgrp);
PROC_LOCK(p);
LIST_REMOVE(p, p_pglist);
p->p_pgrp = NULL;
PROC_UNLOCK(p);
PGRP_UNLOCK(savepgrp);
if (LIST_EMPTY(&savepgrp->pg_members))
pgdelete(savepgrp);
return (0);
}
/*
* delete a process group
*/
static void
pgdelete(pgrp)
register struct pgrp *pgrp;
{
struct session *savesess;
struct tty *tp;
sx_assert(&proctree_lock, SX_XLOCKED);
PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED);
/*
* Reset any sigio structures pointing to us as a result of
* F_SETOWN with our pgid.
*/
funsetownlst(&pgrp->pg_sigiolst);
PGRP_LOCK(pgrp);
tp = pgrp->pg_session->s_ttyp;
LIST_REMOVE(pgrp, pg_hash);
savesess = pgrp->pg_session;
PGRP_UNLOCK(pgrp);
/* Remove the reference to the pgrp before deallocating it. */
if (tp != NULL) {
tty_lock(tp);
tty_rel_pgrp(tp, pgrp);
}
mtx_destroy(&pgrp->pg_mtx);
free(pgrp, M_PGRP);
sess_release(savesess);
}
static void
pgadjustjobc(pgrp, entering)
struct pgrp *pgrp;
int entering;
{
PGRP_LOCK(pgrp);
if (entering)
pgrp->pg_jobc++;
else {
--pgrp->pg_jobc;
if (pgrp->pg_jobc == 0)
orphanpg(pgrp);
}
PGRP_UNLOCK(pgrp);
}
/*
* Adjust pgrp jobc counters when specified process changes process group.
* We count the number of processes in each process group that "qualify"
* the group for terminal job control (those with a parent in a different
* process group of the same session). If that count reaches zero, the
* process group becomes orphaned. Check both the specified process'
* process group and that of its children.
* entering == 0 => p is leaving specified group.
* entering == 1 => p is entering specified group.
*/
void
fixjobc(p, pgrp, entering)
register struct proc *p;
register struct pgrp *pgrp;
int entering;
{
register struct pgrp *hispgrp;
register struct session *mysession;
sx_assert(&proctree_lock, SX_LOCKED);
PROC_LOCK_ASSERT(p, MA_NOTOWNED);
PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED);
/*
* Check p's parent to see whether p qualifies its own process
* group; if so, adjust count for p's process group.
*/
mysession = pgrp->pg_session;
if ((hispgrp = p->p_pptr->p_pgrp) != pgrp &&
hispgrp->pg_session == mysession)
pgadjustjobc(pgrp, entering);
/*
* Check this process' children to see whether they qualify
* their process groups; if so, adjust counts for children's
* process groups.
*/
LIST_FOREACH(p, &p->p_children, p_sibling) {
hispgrp = p->p_pgrp;
if (hispgrp == pgrp ||
hispgrp->pg_session != mysession)
continue;
PROC_LOCK(p);
if (p->p_state == PRS_ZOMBIE) {
PROC_UNLOCK(p);
continue;
}
PROC_UNLOCK(p);
pgadjustjobc(hispgrp, entering);
}
}
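/*
 * Editor's illustrative note on the jobc accounting above.  Example: shell
 * S in pgrp A forks child C into a new pgrp B within the same session.  On
 * entry, fixjobc(C, B, 1) sees C's parent in pgrp A of the same session,
 * so B's pg_jobc becomes 1 and B stays eligible for job control.  If S
 * later exits, the corresponding entering == 0 pass drops B's count to 0
 * and, if any member of B is stopped, orphanpg() sends SIGHUP and SIGCONT
 * to the whole group.
 */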
/*
* A process group has become orphaned;
* if there are any stopped processes in the group,
* hang up all processes in that group.
*/
static void
orphanpg(pg)
struct pgrp *pg;
{
register struct proc *p;
PGRP_LOCK_ASSERT(pg, MA_OWNED);
LIST_FOREACH(p, &pg->pg_members, p_pglist) {
PROC_LOCK(p);
if (P_SHOULDSTOP(p)) {
PROC_UNLOCK(p);
LIST_FOREACH(p, &pg->pg_members, p_pglist) {
PROC_LOCK(p);
- psignal(p, SIGHUP);
- psignal(p, SIGCONT);
+ kern_psignal(p, SIGHUP);
+ kern_psignal(p, SIGCONT);
PROC_UNLOCK(p);
}
return;
}
PROC_UNLOCK(p);
}
}
void
sess_hold(struct session *s)
{
refcount_acquire(&s->s_count);
}
void
sess_release(struct session *s)
{
if (refcount_release(&s->s_count)) {
if (s->s_ttyp != NULL) {
tty_lock(s->s_ttyp);
tty_rel_sess(s->s_ttyp, s);
}
mtx_destroy(&s->s_mtx);
free(s, M_SESSION);
}
}
#include "opt_ddb.h"
#ifdef DDB
#include <ddb/ddb.h>
DB_SHOW_COMMAND(pgrpdump, pgrpdump)
{
register struct pgrp *pgrp;
register struct proc *p;
register int i;
for (i = 0; i <= pgrphash; i++) {
if (!LIST_EMPTY(&pgrphashtbl[i])) {
printf("\tindx %d\n", i);
LIST_FOREACH(pgrp, &pgrphashtbl[i], pg_hash) {
printf(
"\tpgrp %p, pgid %ld, sess %p, sesscnt %d, mem %p\n",
(void *)pgrp, (long)pgrp->pg_id,
(void *)pgrp->pg_session,
pgrp->pg_session->s_count,
(void *)LIST_FIRST(&pgrp->pg_members));
LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
printf("\t\tpid %ld addr %p pgrp %p\n",
(long)p->p_pid, (void *)p,
(void *)p->p_pgrp);
}
}
}
}
}
#endif /* DDB */
/*
* Calculate the kinfo_proc members which contain process-wide
* information.
* Must be called with the target process locked.
*/
static void
fill_kinfo_aggregate(struct proc *p, struct kinfo_proc *kp)
{
struct thread *td;
PROC_LOCK_ASSERT(p, MA_OWNED);
kp->ki_estcpu = 0;
kp->ki_pctcpu = 0;
FOREACH_THREAD_IN_PROC(p, td) {
thread_lock(td);
kp->ki_pctcpu += sched_pctcpu(td);
kp->ki_estcpu += td->td_estcpu;
thread_unlock(td);
}
}
/*
* Clear kinfo_proc and fill in any information that is common
* to all threads in the process.
* Must be called with the target process locked.
*/
static void
fill_kinfo_proc_only(struct proc *p, struct kinfo_proc *kp)
{
struct thread *td0;
struct tty *tp;
struct session *sp;
struct ucred *cred;
struct sigacts *ps;
PROC_LOCK_ASSERT(p, MA_OWNED);
bzero(kp, sizeof(*kp));
kp->ki_structsize = sizeof(*kp);
kp->ki_paddr = p;
kp->ki_addr =/* p->p_addr; */0; /* XXX */
kp->ki_args = p->p_args;
kp->ki_textvp = p->p_textvp;
#ifdef KTRACE
kp->ki_tracep = p->p_tracevp;
kp->ki_traceflag = p->p_traceflag;
#endif
kp->ki_fd = p->p_fd;
kp->ki_vmspace = p->p_vmspace;
kp->ki_flag = p->p_flag;
cred = p->p_ucred;
if (cred) {
kp->ki_uid = cred->cr_uid;
kp->ki_ruid = cred->cr_ruid;
kp->ki_svuid = cred->cr_svuid;
kp->ki_cr_flags = 0;
if (cred->cr_flags & CRED_FLAG_CAPMODE)
kp->ki_cr_flags |= KI_CRF_CAPABILITY_MODE;
/* XXX bde doesn't like KI_NGROUPS */
if (cred->cr_ngroups > KI_NGROUPS) {
kp->ki_ngroups = KI_NGROUPS;
kp->ki_cr_flags |= KI_CRF_GRP_OVERFLOW;
} else
kp->ki_ngroups = cred->cr_ngroups;
bcopy(cred->cr_groups, kp->ki_groups,
kp->ki_ngroups * sizeof(gid_t));
kp->ki_rgid = cred->cr_rgid;
kp->ki_svgid = cred->cr_svgid;
/* If jailed(cred), emulate the old P_JAILED flag. */
if (jailed(cred)) {
kp->ki_flag |= P_JAILED;
/* If inside the jail, use 0 as a jail ID. */
if (cred->cr_prison != curthread->td_ucred->cr_prison)
kp->ki_jid = cred->cr_prison->pr_id;
}
strlcpy(kp->ki_loginclass, cred->cr_loginclass->lc_name,
sizeof(kp->ki_loginclass));
}
ps = p->p_sigacts;
if (ps) {
mtx_lock(&ps->ps_mtx);
kp->ki_sigignore = ps->ps_sigignore;
kp->ki_sigcatch = ps->ps_sigcatch;
mtx_unlock(&ps->ps_mtx);
}
if (p->p_state != PRS_NEW &&
p->p_state != PRS_ZOMBIE &&
p->p_vmspace != NULL) {
struct vmspace *vm = p->p_vmspace;
kp->ki_size = vm->vm_map.size;
kp->ki_rssize = vmspace_resident_count(vm); /*XXX*/
FOREACH_THREAD_IN_PROC(p, td0) {
if (!TD_IS_SWAPPED(td0))
kp->ki_rssize += td0->td_kstack_pages;
}
kp->ki_swrss = vm->vm_swrss;
kp->ki_tsize = vm->vm_tsize;
kp->ki_dsize = vm->vm_dsize;
kp->ki_ssize = vm->vm_ssize;
} else if (p->p_state == PRS_ZOMBIE)
kp->ki_stat = SZOMB;
if (kp->ki_flag & P_INMEM)
kp->ki_sflag = PS_INMEM;
else
kp->ki_sflag = 0;
/* Calculate legacy swtime as seconds since 'swtick'. */
kp->ki_swtime = (ticks - p->p_swtick) / hz;
kp->ki_pid = p->p_pid;
kp->ki_nice = p->p_nice;
kp->ki_start = p->p_stats->p_start;
timevaladd(&kp->ki_start, &boottime);
PROC_SLOCK(p);
rufetch(p, &kp->ki_rusage);
kp->ki_runtime = cputick2usec(p->p_rux.rux_runtime);
calcru(p, &kp->ki_rusage.ru_utime, &kp->ki_rusage.ru_stime);
PROC_SUNLOCK(p);
calccru(p, &kp->ki_childutime, &kp->ki_childstime);
/* Some callers want child times in a single value. */
kp->ki_childtime = kp->ki_childstime;
timevaladd(&kp->ki_childtime, &kp->ki_childutime);
tp = NULL;
if (p->p_pgrp) {
kp->ki_pgid = p->p_pgrp->pg_id;
kp->ki_jobc = p->p_pgrp->pg_jobc;
sp = p->p_pgrp->pg_session;
if (sp != NULL) {
kp->ki_sid = sp->s_sid;
SESS_LOCK(sp);
strlcpy(kp->ki_login, sp->s_login,
sizeof(kp->ki_login));
if (sp->s_ttyvp)
kp->ki_kiflag |= KI_CTTY;
if (SESS_LEADER(p))
kp->ki_kiflag |= KI_SLEADER;
/* XXX proctree_lock */
tp = sp->s_ttyp;
SESS_UNLOCK(sp);
}
}
if ((p->p_flag & P_CONTROLT) && tp != NULL) {
kp->ki_tdev = tty_udev(tp);
kp->ki_tpgid = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PID;
if (tp->t_session)
kp->ki_tsid = tp->t_session->s_sid;
} else
kp->ki_tdev = NODEV;
if (p->p_comm[0] != '\0')
strlcpy(kp->ki_comm, p->p_comm, sizeof(kp->ki_comm));
if (p->p_sysent && p->p_sysent->sv_name != NULL &&
p->p_sysent->sv_name[0] != '\0')
strlcpy(kp->ki_emul, p->p_sysent->sv_name, sizeof(kp->ki_emul));
kp->ki_siglist = p->p_siglist;
kp->ki_xstat = p->p_xstat;
kp->ki_acflag = p->p_acflag;
kp->ki_lock = p->p_lock;
if (p->p_pptr)
kp->ki_ppid = p->p_pptr->p_pid;
}
/*
* Fill in information that is thread specific. Must be called with
* target process locked. If 'preferthread' is set, overwrite certain
* process-related fields that are maintained for both threads and
* processes.
*/
static void
fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp, int preferthread)
{
struct proc *p;
p = td->td_proc;
kp->ki_tdaddr = td;
PROC_LOCK_ASSERT(p, MA_OWNED);
if (preferthread)
PROC_SLOCK(p);
thread_lock(td);
if (td->td_wmesg != NULL)
strlcpy(kp->ki_wmesg, td->td_wmesg, sizeof(kp->ki_wmesg));
else
bzero(kp->ki_wmesg, sizeof(kp->ki_wmesg));
strlcpy(kp->ki_tdname, td->td_name, sizeof(kp->ki_tdname));
if (TD_ON_LOCK(td)) {
kp->ki_kiflag |= KI_LOCKBLOCK;
strlcpy(kp->ki_lockname, td->td_lockname,
sizeof(kp->ki_lockname));
} else {
kp->ki_kiflag &= ~KI_LOCKBLOCK;
bzero(kp->ki_lockname, sizeof(kp->ki_lockname));
}
if (p->p_state == PRS_NORMAL) { /* approximate. */
if (TD_ON_RUNQ(td) ||
TD_CAN_RUN(td) ||
TD_IS_RUNNING(td)) {
kp->ki_stat = SRUN;
} else if (P_SHOULDSTOP(p)) {
kp->ki_stat = SSTOP;
} else if (TD_IS_SLEEPING(td)) {
kp->ki_stat = SSLEEP;
} else if (TD_ON_LOCK(td)) {
kp->ki_stat = SLOCK;
} else {
kp->ki_stat = SWAIT;
}
} else if (p->p_state == PRS_ZOMBIE) {
kp->ki_stat = SZOMB;
} else {
kp->ki_stat = SIDL;
}
/* Things in the thread */
kp->ki_wchan = td->td_wchan;
kp->ki_pri.pri_level = td->td_priority;
kp->ki_pri.pri_native = td->td_base_pri;
kp->ki_lastcpu = td->td_lastcpu;
kp->ki_oncpu = td->td_oncpu;
kp->ki_tdflags = td->td_flags;
kp->ki_tid = td->td_tid;
kp->ki_numthreads = p->p_numthreads;
kp->ki_pcb = td->td_pcb;
kp->ki_kstack = (void *)td->td_kstack;
kp->ki_slptime = (ticks - td->td_slptick) / hz;
kp->ki_pri.pri_class = td->td_pri_class;
kp->ki_pri.pri_user = td->td_user_pri;
if (preferthread) {
rufetchtd(td, &kp->ki_rusage);
kp->ki_runtime = cputick2usec(td->td_rux.rux_runtime);
kp->ki_pctcpu = sched_pctcpu(td);
kp->ki_estcpu = td->td_estcpu;
}
/* We can't get this anymore but ps etc never used it anyway. */
kp->ki_rqindex = 0;
if (preferthread)
kp->ki_siglist = td->td_siglist;
kp->ki_sigmask = td->td_sigmask;
thread_unlock(td);
if (preferthread)
PROC_SUNLOCK(p);
}
/*
* Fill in a kinfo_proc structure for the specified process.
* Must be called with the target process locked.
*/
void
fill_kinfo_proc(struct proc *p, struct kinfo_proc *kp)
{
MPASS(FIRST_THREAD_IN_PROC(p) != NULL);
fill_kinfo_proc_only(p, kp);
fill_kinfo_thread(FIRST_THREAD_IN_PROC(p), kp, 0);
fill_kinfo_aggregate(p, kp);
}
struct pstats *
pstats_alloc(void)
{
return (malloc(sizeof(struct pstats), M_SUBPROC, M_ZERO|M_WAITOK));
}
/*
* Copy parts of p_stats; zero the rest of p_stats (statistics).
*/
void
pstats_fork(struct pstats *src, struct pstats *dst)
{
bzero(&dst->pstat_startzero,
__rangeof(struct pstats, pstat_startzero, pstat_endzero));
bcopy(&src->pstat_startcopy, &dst->pstat_startcopy,
__rangeof(struct pstats, pstat_startcopy, pstat_endcopy));
}
void
pstats_free(struct pstats *ps)
{
free(ps, M_SUBPROC);
}
/*
* Locate a zombie process by number
*/
struct proc *
zpfind(pid_t pid)
{
struct proc *p;
sx_slock(&allproc_lock);
LIST_FOREACH(p, &zombproc, p_list)
if (p->p_pid == pid) {
PROC_LOCK(p);
break;
}
sx_sunlock(&allproc_lock);
return (p);
}
#define KERN_PROC_ZOMBMASK 0x3
#define KERN_PROC_NOTHREADS 0x4
#ifdef COMPAT_FREEBSD32
/*
* This function is typically used to copy out the kernel address, so
* it can be replaced by assignment of zero.
*/
static inline uint32_t
ptr32_trim(void *ptr)
{
uintptr_t uptr;
uptr = (uintptr_t)ptr;
return ((uptr > UINT_MAX) ? 0 : uptr);
}
#define PTRTRIM_CP(src,dst,fld) \
do { (dst).fld = ptr32_trim((src).fld); } while (0)
static void
freebsd32_kinfo_proc_out(const struct kinfo_proc *ki, struct kinfo_proc32 *ki32)
{
int i;
bzero(ki32, sizeof(struct kinfo_proc32));
ki32->ki_structsize = sizeof(struct kinfo_proc32);
CP(*ki, *ki32, ki_layout);
PTRTRIM_CP(*ki, *ki32, ki_args);
PTRTRIM_CP(*ki, *ki32, ki_paddr);
PTRTRIM_CP(*ki, *ki32, ki_addr);
PTRTRIM_CP(*ki, *ki32, ki_tracep);
PTRTRIM_CP(*ki, *ki32, ki_textvp);
PTRTRIM_CP(*ki, *ki32, ki_fd);
PTRTRIM_CP(*ki, *ki32, ki_vmspace);
PTRTRIM_CP(*ki, *ki32, ki_wchan);
CP(*ki, *ki32, ki_pid);
CP(*ki, *ki32, ki_ppid);
CP(*ki, *ki32, ki_pgid);
CP(*ki, *ki32, ki_tpgid);
CP(*ki, *ki32, ki_sid);
CP(*ki, *ki32, ki_tsid);
CP(*ki, *ki32, ki_jobc);
CP(*ki, *ki32, ki_tdev);
CP(*ki, *ki32, ki_siglist);
CP(*ki, *ki32, ki_sigmask);
CP(*ki, *ki32, ki_sigignore);
CP(*ki, *ki32, ki_sigcatch);
CP(*ki, *ki32, ki_uid);
CP(*ki, *ki32, ki_ruid);
CP(*ki, *ki32, ki_svuid);
CP(*ki, *ki32, ki_rgid);
CP(*ki, *ki32, ki_svgid);
CP(*ki, *ki32, ki_ngroups);
for (i = 0; i < KI_NGROUPS; i++)
CP(*ki, *ki32, ki_groups[i]);
CP(*ki, *ki32, ki_size);
CP(*ki, *ki32, ki_rssize);
CP(*ki, *ki32, ki_swrss);
CP(*ki, *ki32, ki_tsize);
CP(*ki, *ki32, ki_dsize);
CP(*ki, *ki32, ki_ssize);
CP(*ki, *ki32, ki_xstat);
CP(*ki, *ki32, ki_acflag);
CP(*ki, *ki32, ki_pctcpu);
CP(*ki, *ki32, ki_estcpu);
CP(*ki, *ki32, ki_slptime);
CP(*ki, *ki32, ki_swtime);
CP(*ki, *ki32, ki_runtime);
TV_CP(*ki, *ki32, ki_start);
TV_CP(*ki, *ki32, ki_childtime);
CP(*ki, *ki32, ki_flag);
CP(*ki, *ki32, ki_kiflag);
CP(*ki, *ki32, ki_traceflag);
CP(*ki, *ki32, ki_stat);
CP(*ki, *ki32, ki_nice);
CP(*ki, *ki32, ki_lock);
CP(*ki, *ki32, ki_rqindex);
CP(*ki, *ki32, ki_oncpu);
CP(*ki, *ki32, ki_lastcpu);
bcopy(ki->ki_tdname, ki32->ki_tdname, TDNAMLEN + 1);
bcopy(ki->ki_wmesg, ki32->ki_wmesg, WMESGLEN + 1);
bcopy(ki->ki_login, ki32->ki_login, LOGNAMELEN + 1);
bcopy(ki->ki_lockname, ki32->ki_lockname, LOCKNAMELEN + 1);
bcopy(ki->ki_comm, ki32->ki_comm, COMMLEN + 1);
bcopy(ki->ki_emul, ki32->ki_emul, KI_EMULNAMELEN + 1);
bcopy(ki->ki_loginclass, ki32->ki_loginclass, LOGINCLASSLEN + 1);
CP(*ki, *ki32, ki_cr_flags);
CP(*ki, *ki32, ki_jid);
CP(*ki, *ki32, ki_numthreads);
CP(*ki, *ki32, ki_tid);
CP(*ki, *ki32, ki_pri);
freebsd32_rusage_out(&ki->ki_rusage, &ki32->ki_rusage);
freebsd32_rusage_out(&ki->ki_rusage_ch, &ki32->ki_rusage_ch);
PTRTRIM_CP(*ki, *ki32, ki_pcb);
PTRTRIM_CP(*ki, *ki32, ki_kstack);
PTRTRIM_CP(*ki, *ki32, ki_udata);
CP(*ki, *ki32, ki_sflag);
CP(*ki, *ki32, ki_tdflags);
}
static int
sysctl_out_proc_copyout(struct kinfo_proc *ki, struct sysctl_req *req)
{
struct kinfo_proc32 ki32;
int error;
if (req->flags & SCTL_MASK32) {
freebsd32_kinfo_proc_out(ki, &ki32);
error = SYSCTL_OUT(req, &ki32, sizeof(struct kinfo_proc32));
} else
error = SYSCTL_OUT(req, ki, sizeof(struct kinfo_proc));
return (error);
}
#else
static int
sysctl_out_proc_copyout(struct kinfo_proc *ki, struct sysctl_req *req)
{
return (SYSCTL_OUT(req, ki, sizeof(struct kinfo_proc)));
}
#endif
/*
* Must be called with the process locked and will return with it unlocked.
*/
static int
sysctl_out_proc(struct proc *p, struct sysctl_req *req, int flags)
{
struct thread *td;
struct kinfo_proc kinfo_proc;
int error = 0;
struct proc *np;
pid_t pid = p->p_pid;
PROC_LOCK_ASSERT(p, MA_OWNED);
MPASS(FIRST_THREAD_IN_PROC(p) != NULL);
fill_kinfo_proc(p, &kinfo_proc);
if (flags & KERN_PROC_NOTHREADS)
error = sysctl_out_proc_copyout(&kinfo_proc, req);
else {
FOREACH_THREAD_IN_PROC(p, td) {
fill_kinfo_thread(td, &kinfo_proc, 1);
error = sysctl_out_proc_copyout(&kinfo_proc, req);
if (error)
break;
}
}
PROC_UNLOCK(p);
if (error)
return (error);
if (flags & KERN_PROC_ZOMBMASK)
np = zpfind(pid);
else {
if (pid == 0)
return (0);
np = pfind(pid);
}
if (np == NULL)
return (ESRCH);
if (np != p) {
PROC_UNLOCK(np);
return (ESRCH);
}
PROC_UNLOCK(np);
return (0);
}
static int
sysctl_kern_proc(SYSCTL_HANDLER_ARGS)
{
int *name = (int*) arg1;
u_int namelen = arg2;
struct proc *p;
int flags, doingzomb, oid_number;
int error = 0;
oid_number = oidp->oid_number;
if (oid_number != KERN_PROC_ALL &&
(oid_number & KERN_PROC_INC_THREAD) == 0)
flags = KERN_PROC_NOTHREADS;
else {
flags = 0;
oid_number &= ~KERN_PROC_INC_THREAD;
}
if (oid_number == KERN_PROC_PID) {
if (namelen != 1)
return (EINVAL);
error = sysctl_wire_old_buffer(req, 0);
if (error)
return (error);
p = pfind((pid_t)name[0]);
if (!p)
return (ESRCH);
if ((error = p_cansee(curthread, p))) {
PROC_UNLOCK(p);
return (error);
}
error = sysctl_out_proc(p, req, flags);
return (error);
}
switch (oid_number) {
case KERN_PROC_ALL:
if (namelen != 0)
return (EINVAL);
break;
case KERN_PROC_PROC:
if (namelen != 0 && namelen != 1)
return (EINVAL);
break;
default:
if (namelen != 1)
return (EINVAL);
break;
}
if (!req->oldptr) {
/* overestimate by 5 procs */
error = SYSCTL_OUT(req, 0, sizeof (struct kinfo_proc) * 5);
if (error)
return (error);
}
error = sysctl_wire_old_buffer(req, 0);
if (error != 0)
return (error);
sx_slock(&allproc_lock);
for (doingzomb=0 ; doingzomb < 2 ; doingzomb++) {
if (!doingzomb)
p = LIST_FIRST(&allproc);
else
p = LIST_FIRST(&zombproc);
for (; p != 0; p = LIST_NEXT(p, p_list)) {
/*
* Skip embryonic processes.
*/
PROC_LOCK(p);
if (p->p_state == PRS_NEW) {
PROC_UNLOCK(p);
continue;
}
KASSERT(p->p_ucred != NULL,
("process credential is NULL for non-NEW proc"));
/*
* Show a user only appropriate processes.
*/
if (p_cansee(curthread, p)) {
PROC_UNLOCK(p);
continue;
}
/*
* TODO - make more efficient (see notes below).
* do by session.
*/
switch (oid_number) {
case KERN_PROC_GID:
if (p->p_ucred->cr_gid != (gid_t)name[0]) {
PROC_UNLOCK(p);
continue;
}
break;
case KERN_PROC_PGRP:
/* could do this by traversing pgrp */
if (p->p_pgrp == NULL ||
p->p_pgrp->pg_id != (pid_t)name[0]) {
PROC_UNLOCK(p);
continue;
}
break;
case KERN_PROC_RGID:
if (p->p_ucred->cr_rgid != (gid_t)name[0]) {
PROC_UNLOCK(p);
continue;
}
break;
case KERN_PROC_SESSION:
if (p->p_session == NULL ||
p->p_session->s_sid != (pid_t)name[0]) {
PROC_UNLOCK(p);
continue;
}
break;
case KERN_PROC_TTY:
if ((p->p_flag & P_CONTROLT) == 0 ||
p->p_session == NULL) {
PROC_UNLOCK(p);
continue;
}
/* XXX proctree_lock */
SESS_LOCK(p->p_session);
if (p->p_session->s_ttyp == NULL ||
tty_udev(p->p_session->s_ttyp) !=
(dev_t)name[0]) {
SESS_UNLOCK(p->p_session);
PROC_UNLOCK(p);
continue;
}
SESS_UNLOCK(p->p_session);
break;
case KERN_PROC_UID:
if (p->p_ucred->cr_uid != (uid_t)name[0]) {
PROC_UNLOCK(p);
continue;
}
break;
case KERN_PROC_RUID:
if (p->p_ucred->cr_ruid != (uid_t)name[0]) {
PROC_UNLOCK(p);
continue;
}
break;
case KERN_PROC_PROC:
break;
default:
break;
}
error = sysctl_out_proc(p, req, flags | doingzomb);
if (error) {
sx_sunlock(&allproc_lock);
return (error);
}
}
}
sx_sunlock(&allproc_lock);
return (0);
}
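/*
 * Illustrative userland sketch (editor's addition, not part of the original
 * source): reading a single kinfo_proc record through the handler above.
 * Assumes <sys/types.h>, <sys/sysctl.h> and <sys/user.h> are included and
 * "pid" is the target process ID.
 *
 *	int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_PID, pid };
 *	struct kinfo_proc kp;
 *	size_t len = sizeof(kp);
 *
 *	if (sysctl(mib, 4, &kp, &len, NULL, 0) == 0)
 *		printf("%s: state %d\n", kp.ki_comm, kp.ki_stat);
 *
 * The *_td variants (KERN_PROC_* | KERN_PROC_INC_THREAD) return one record
 * per thread instead of one per process.
 */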
struct pargs *
pargs_alloc(int len)
{
struct pargs *pa;
pa = malloc(sizeof(struct pargs) + len, M_PARGS,
M_WAITOK);
refcount_init(&pa->ar_ref, 1);
pa->ar_length = len;
return (pa);
}
static void
pargs_free(struct pargs *pa)
{
free(pa, M_PARGS);
}
void
pargs_hold(struct pargs *pa)
{
if (pa == NULL)
return;
refcount_acquire(&pa->ar_ref);
}
void
pargs_drop(struct pargs *pa)
{
if (pa == NULL)
return;
if (refcount_release(&pa->ar_ref))
pargs_free(pa);
}
/*
* This sysctl allows a process to retrieve the argument list or process
* title for another process without groping around in the address space
* of the other process. It also allows a process to set its own "process
* title" to a string of its own choice.
*/
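/*
 * Illustrative userland sketch (editor's addition, not part of the original
 * source): this is essentially what setproctitle(3) does with the MIB served
 * by the handler below; "title" is an arbitrary example string.
 *
 *	int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_ARGS, getpid() };
 *	char title[] = "myprog: idle";
 *
 *	(void)sysctl(mib, 4, NULL, NULL, title, sizeof(title));
 *
 * Reading another process's arguments uses the same MIB with an output
 * buffer in place of the new data.
 */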
static int
sysctl_kern_proc_args(SYSCTL_HANDLER_ARGS)
{
int *name = (int*) arg1;
u_int namelen = arg2;
struct pargs *newpa, *pa;
struct proc *p;
int error = 0;
if (namelen != 1)
return (EINVAL);
p = pfind((pid_t)name[0]);
if (!p)
return (ESRCH);
if ((error = p_cansee(curthread, p)) != 0) {
PROC_UNLOCK(p);
return (error);
}
if (req->newptr && curproc != p) {
PROC_UNLOCK(p);
return (EPERM);
}
pa = p->p_args;
pargs_hold(pa);
PROC_UNLOCK(p);
if (pa != NULL)
error = SYSCTL_OUT(req, pa->ar_args, pa->ar_length);
pargs_drop(pa);
if (error != 0 || req->newptr == NULL)
return (error);
if (req->newlen + sizeof(struct pargs) > ps_arg_cache_limit)
return (ENOMEM);
newpa = pargs_alloc(req->newlen);
error = SYSCTL_IN(req, newpa->ar_args, req->newlen);
if (error != 0) {
pargs_free(newpa);
return (error);
}
PROC_LOCK(p);
pa = p->p_args;
p->p_args = newpa;
PROC_UNLOCK(p);
pargs_drop(pa);
return (0);
}
/*
* This sysctl allows a process to retrieve the path of the executable for
* itself or another process.
*/
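/*
 * Illustrative userland sketch (editor's addition, not part of the original
 * source): resolving the calling process's own executable path; -1 selects
 * the current process, as handled below.
 *
 *	int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1 };
 *	char path[PATH_MAX];
 *	size_t len = sizeof(path);
 *
 *	if (sysctl(mib, 4, path, &len, NULL, 0) == 0)
 *		printf("%s\n", path);
 *
 * Note the handler returns success with no output when p_textvp is NULL.
 */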
static int
sysctl_kern_proc_pathname(SYSCTL_HANDLER_ARGS)
{
pid_t *pidp = (pid_t *)arg1;
unsigned int arglen = arg2;
struct proc *p;
struct vnode *vp;
char *retbuf, *freebuf;
int error, vfslocked;
if (arglen != 1)
return (EINVAL);
if (*pidp == -1) { /* -1 means this process */
p = req->td->td_proc;
} else {
p = pfind(*pidp);
if (p == NULL)
return (ESRCH);
if ((error = p_cansee(curthread, p)) != 0) {
PROC_UNLOCK(p);
return (error);
}
}
vp = p->p_textvp;
if (vp == NULL) {
if (*pidp != -1)
PROC_UNLOCK(p);
return (0);
}
vref(vp);
if (*pidp != -1)
PROC_UNLOCK(p);
error = vn_fullpath(req->td, vp, &retbuf, &freebuf);
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
if (error)
return (error);
error = SYSCTL_OUT(req, retbuf, strlen(retbuf) + 1);
free(freebuf, M_TEMP);
return (error);
}
static int
sysctl_kern_proc_sv_name(SYSCTL_HANDLER_ARGS)
{
struct proc *p;
char *sv_name;
int *name;
int namelen;
int error;
namelen = arg2;
if (namelen != 1)
return (EINVAL);
name = (int *)arg1;
if ((p = pfind((pid_t)name[0])) == NULL)
return (ESRCH);
if ((error = p_cansee(curthread, p))) {
PROC_UNLOCK(p);
return (error);
}
sv_name = p->p_sysent->sv_name;
PROC_UNLOCK(p);
return (sysctl_handle_string(oidp, sv_name, 0, req));
}
#ifdef KINFO_OVMENTRY_SIZE
CTASSERT(sizeof(struct kinfo_ovmentry) == KINFO_OVMENTRY_SIZE);
#endif
#ifdef COMPAT_FREEBSD7
static int
sysctl_kern_proc_ovmmap(SYSCTL_HANDLER_ARGS)
{
vm_map_entry_t entry, tmp_entry;
unsigned int last_timestamp;
char *fullpath, *freepath;
struct kinfo_ovmentry *kve;
struct vattr va;
struct ucred *cred;
int error, *name;
struct vnode *vp;
struct proc *p;
vm_map_t map;
struct vmspace *vm;
name = (int *)arg1;
if ((p = pfind((pid_t)name[0])) == NULL)
return (ESRCH);
if (p->p_flag & P_WEXIT) {
PROC_UNLOCK(p);
return (ESRCH);
}
if ((error = p_candebug(curthread, p))) {
PROC_UNLOCK(p);
return (error);
}
_PHOLD(p);
PROC_UNLOCK(p);
vm = vmspace_acquire_ref(p);
if (vm == NULL) {
PRELE(p);
return (ESRCH);
}
kve = malloc(sizeof(*kve), M_TEMP, M_WAITOK);
map = &p->p_vmspace->vm_map; /* XXXRW: More locking required? */
vm_map_lock_read(map);
for (entry = map->header.next; entry != &map->header;
entry = entry->next) {
vm_object_t obj, tobj, lobj;
vm_offset_t addr;
int vfslocked;
if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
continue;
bzero(kve, sizeof(*kve));
kve->kve_structsize = sizeof(*kve);
kve->kve_private_resident = 0;
obj = entry->object.vm_object;
if (obj != NULL) {
VM_OBJECT_LOCK(obj);
if (obj->shadow_count == 1)
kve->kve_private_resident =
obj->resident_page_count;
}
kve->kve_resident = 0;
addr = entry->start;
while (addr < entry->end) {
if (pmap_extract(map->pmap, addr))
kve->kve_resident++;
addr += PAGE_SIZE;
}
for (lobj = tobj = obj; tobj; tobj = tobj->backing_object) {
if (tobj != obj)
VM_OBJECT_LOCK(tobj);
if (lobj != obj)
VM_OBJECT_UNLOCK(lobj);
lobj = tobj;
}
kve->kve_start = (void*)entry->start;
kve->kve_end = (void*)entry->end;
kve->kve_offset = (off_t)entry->offset;
if (entry->protection & VM_PROT_READ)
kve->kve_protection |= KVME_PROT_READ;
if (entry->protection & VM_PROT_WRITE)
kve->kve_protection |= KVME_PROT_WRITE;
if (entry->protection & VM_PROT_EXECUTE)
kve->kve_protection |= KVME_PROT_EXEC;
if (entry->eflags & MAP_ENTRY_COW)
kve->kve_flags |= KVME_FLAG_COW;
if (entry->eflags & MAP_ENTRY_NEEDS_COPY)
kve->kve_flags |= KVME_FLAG_NEEDS_COPY;
if (entry->eflags & MAP_ENTRY_NOCOREDUMP)
kve->kve_flags |= KVME_FLAG_NOCOREDUMP;
last_timestamp = map->timestamp;
vm_map_unlock_read(map);
kve->kve_fileid = 0;
kve->kve_fsid = 0;
freepath = NULL;
fullpath = "";
if (lobj) {
vp = NULL;
switch (lobj->type) {
case OBJT_DEFAULT:
kve->kve_type = KVME_TYPE_DEFAULT;
break;
case OBJT_VNODE:
kve->kve_type = KVME_TYPE_VNODE;
vp = lobj->handle;
vref(vp);
break;
case OBJT_SWAP:
kve->kve_type = KVME_TYPE_SWAP;
break;
case OBJT_DEVICE:
kve->kve_type = KVME_TYPE_DEVICE;
break;
case OBJT_PHYS:
kve->kve_type = KVME_TYPE_PHYS;
break;
case OBJT_DEAD:
kve->kve_type = KVME_TYPE_DEAD;
break;
case OBJT_SG:
kve->kve_type = KVME_TYPE_SG;
break;
default:
kve->kve_type = KVME_TYPE_UNKNOWN;
break;
}
if (lobj != obj)
VM_OBJECT_UNLOCK(lobj);
kve->kve_ref_count = obj->ref_count;
kve->kve_shadow_count = obj->shadow_count;
VM_OBJECT_UNLOCK(obj);
if (vp != NULL) {
vn_fullpath(curthread, vp, &fullpath,
&freepath);
cred = curthread->td_ucred;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_lock(vp, LK_SHARED | LK_RETRY);
if (VOP_GETATTR(vp, &va, cred) == 0) {
kve->kve_fileid = va.va_fileid;
kve->kve_fsid = va.va_fsid;
}
vput(vp);
VFS_UNLOCK_GIANT(vfslocked);
}
} else {
kve->kve_type = KVME_TYPE_NONE;
kve->kve_ref_count = 0;
kve->kve_shadow_count = 0;
}
strlcpy(kve->kve_path, fullpath, sizeof(kve->kve_path));
if (freepath != NULL)
free(freepath, M_TEMP);
error = SYSCTL_OUT(req, kve, sizeof(*kve));
vm_map_lock_read(map);
if (error)
break;
if (last_timestamp != map->timestamp) {
vm_map_lookup_entry(map, addr - 1, &tmp_entry);
entry = tmp_entry;
}
}
vm_map_unlock_read(map);
vmspace_free(vm);
PRELE(p);
free(kve, M_TEMP);
return (error);
}
#endif /* COMPAT_FREEBSD7 */
#ifdef KINFO_VMENTRY_SIZE
CTASSERT(sizeof(struct kinfo_vmentry) == KINFO_VMENTRY_SIZE);
#endif
static int
sysctl_kern_proc_vmmap(SYSCTL_HANDLER_ARGS)
{
vm_map_entry_t entry, tmp_entry;
unsigned int last_timestamp;
char *fullpath, *freepath;
struct kinfo_vmentry *kve;
struct vattr va;
struct ucred *cred;
int error, *name;
struct vnode *vp;
struct proc *p;
struct vmspace *vm;
vm_map_t map;
name = (int *)arg1;
if ((p = pfind((pid_t)name[0])) == NULL)
return (ESRCH);
if (p->p_flag & P_WEXIT) {
PROC_UNLOCK(p);
return (ESRCH);
}
if ((error = p_candebug(curthread, p))) {
PROC_UNLOCK(p);
return (error);
}
_PHOLD(p);
PROC_UNLOCK(p);
vm = vmspace_acquire_ref(p);
if (vm == NULL) {
PRELE(p);
return (ESRCH);
}
kve = malloc(sizeof(*kve), M_TEMP, M_WAITOK);
map = &vm->vm_map; /* XXXRW: More locking required? */
vm_map_lock_read(map);
for (entry = map->header.next; entry != &map->header;
entry = entry->next) {
vm_object_t obj, tobj, lobj;
vm_offset_t addr;
int vfslocked;
if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
continue;
bzero(kve, sizeof(*kve));
kve->kve_private_resident = 0;
obj = entry->object.vm_object;
if (obj != NULL) {
VM_OBJECT_LOCK(obj);
if (obj->shadow_count == 1)
kve->kve_private_resident =
obj->resident_page_count;
}
kve->kve_resident = 0;
addr = entry->start;
while (addr < entry->end) {
if (pmap_extract(map->pmap, addr))
kve->kve_resident++;
addr += PAGE_SIZE;
}
for (lobj = tobj = obj; tobj; tobj = tobj->backing_object) {
if (tobj != obj)
VM_OBJECT_LOCK(tobj);
if (lobj != obj)
VM_OBJECT_UNLOCK(lobj);
lobj = tobj;
}
kve->kve_start = entry->start;
kve->kve_end = entry->end;
kve->kve_offset = entry->offset;
if (entry->protection & VM_PROT_READ)
kve->kve_protection |= KVME_PROT_READ;
if (entry->protection & VM_PROT_WRITE)
kve->kve_protection |= KVME_PROT_WRITE;
if (entry->protection & VM_PROT_EXECUTE)
kve->kve_protection |= KVME_PROT_EXEC;
if (entry->eflags & MAP_ENTRY_COW)
kve->kve_flags |= KVME_FLAG_COW;
if (entry->eflags & MAP_ENTRY_NEEDS_COPY)
kve->kve_flags |= KVME_FLAG_NEEDS_COPY;
if (entry->eflags & MAP_ENTRY_NOCOREDUMP)
kve->kve_flags |= KVME_FLAG_NOCOREDUMP;
last_timestamp = map->timestamp;
vm_map_unlock_read(map);
freepath = NULL;
fullpath = "";
if (lobj) {
vp = NULL;
switch (lobj->type) {
case OBJT_DEFAULT:
kve->kve_type = KVME_TYPE_DEFAULT;
break;
case OBJT_VNODE:
kve->kve_type = KVME_TYPE_VNODE;
vp = lobj->handle;
vref(vp);
break;
case OBJT_SWAP:
kve->kve_type = KVME_TYPE_SWAP;
break;
case OBJT_DEVICE:
kve->kve_type = KVME_TYPE_DEVICE;
break;
case OBJT_PHYS:
kve->kve_type = KVME_TYPE_PHYS;
break;
case OBJT_DEAD:
kve->kve_type = KVME_TYPE_DEAD;
break;
case OBJT_SG:
kve->kve_type = KVME_TYPE_SG;
break;
default:
kve->kve_type = KVME_TYPE_UNKNOWN;
break;
}
if (lobj != obj)
VM_OBJECT_UNLOCK(lobj);
kve->kve_ref_count = obj->ref_count;
kve->kve_shadow_count = obj->shadow_count;
VM_OBJECT_UNLOCK(obj);
if (vp != NULL) {
vn_fullpath(curthread, vp, &fullpath,
&freepath);
kve->kve_vn_type = vntype_to_kinfo(vp->v_type);
cred = curthread->td_ucred;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_lock(vp, LK_SHARED | LK_RETRY);
if (VOP_GETATTR(vp, &va, cred) == 0) {
kve->kve_vn_fileid = va.va_fileid;
kve->kve_vn_fsid = va.va_fsid;
kve->kve_vn_mode =
MAKEIMODE(va.va_type, va.va_mode);
kve->kve_vn_size = va.va_size;
kve->kve_vn_rdev = va.va_rdev;
kve->kve_status = KF_ATTR_VALID;
}
vput(vp);
VFS_UNLOCK_GIANT(vfslocked);
}
} else {
kve->kve_type = KVME_TYPE_NONE;
kve->kve_ref_count = 0;
kve->kve_shadow_count = 0;
}
strlcpy(kve->kve_path, fullpath, sizeof(kve->kve_path));
if (freepath != NULL)
free(freepath, M_TEMP);
/* Pack record size down */
kve->kve_structsize = offsetof(struct kinfo_vmentry, kve_path) +
strlen(kve->kve_path) + 1;
kve->kve_structsize = roundup(kve->kve_structsize,
sizeof(uint64_t));
error = SYSCTL_OUT(req, kve, kve->kve_structsize);
vm_map_lock_read(map);
if (error)
break;
if (last_timestamp != map->timestamp) {
vm_map_lookup_entry(map, addr - 1, &tmp_entry);
entry = tmp_entry;
}
}
vm_map_unlock_read(map);
vmspace_free(vm);
PRELE(p);
free(kve, M_TEMP);
return (error);
}
#if defined(STACK) || defined(DDB)
static int
sysctl_kern_proc_kstack(SYSCTL_HANDLER_ARGS)
{
struct kinfo_kstack *kkstp;
int error, i, *name, numthreads;
lwpid_t *lwpidarray;
struct thread *td;
struct stack *st;
struct sbuf sb;
struct proc *p;
name = (int *)arg1;
if ((p = pfind((pid_t)name[0])) == NULL)
return (ESRCH);
/* XXXRW: Not clear ESRCH is the right error during proc execve(). */
if (p->p_flag & P_WEXIT || p->p_flag & P_INEXEC) {
PROC_UNLOCK(p);
return (ESRCH);
}
if ((error = p_candebug(curthread, p))) {
PROC_UNLOCK(p);
return (error);
}
_PHOLD(p);
PROC_UNLOCK(p);
kkstp = malloc(sizeof(*kkstp), M_TEMP, M_WAITOK);
st = stack_create();
lwpidarray = NULL;
numthreads = 0;
PROC_LOCK(p);
repeat:
if (numthreads < p->p_numthreads) {
if (lwpidarray != NULL) {
free(lwpidarray, M_TEMP);
lwpidarray = NULL;
}
numthreads = p->p_numthreads;
PROC_UNLOCK(p);
lwpidarray = malloc(sizeof(*lwpidarray) * numthreads, M_TEMP,
M_WAITOK | M_ZERO);
PROC_LOCK(p);
goto repeat;
}
i = 0;
/*
* XXXRW: During the below loop, execve(2) and countless other sorts
* of changes could have taken place. Should we check to see if the
* vmspace has been replaced, or the like, in order to prevent
* giving a snapshot that spans, say, execve(2), with some threads
* before and some after? Among other things, the credentials could
* have changed, in which case the right to extract debug info might
* no longer be assured.
*/
FOREACH_THREAD_IN_PROC(p, td) {
KASSERT(i < numthreads,
("sysctl_kern_proc_kstack: numthreads"));
lwpidarray[i] = td->td_tid;
i++;
}
numthreads = i;
for (i = 0; i < numthreads; i++) {
td = thread_find(p, lwpidarray[i]);
if (td == NULL) {
continue;
}
bzero(kkstp, sizeof(*kkstp));
(void)sbuf_new(&sb, kkstp->kkst_trace,
sizeof(kkstp->kkst_trace), SBUF_FIXEDLEN);
thread_lock(td);
kkstp->kkst_tid = td->td_tid;
if (TD_IS_SWAPPED(td))
kkstp->kkst_state = KKST_STATE_SWAPPED;
else if (TD_IS_RUNNING(td))
kkstp->kkst_state = KKST_STATE_RUNNING;
else {
kkstp->kkst_state = KKST_STATE_STACKOK;
stack_save_td(st, td);
}
thread_unlock(td);
PROC_UNLOCK(p);
stack_sbuf_print(&sb, st);
sbuf_finish(&sb);
sbuf_delete(&sb);
error = SYSCTL_OUT(req, kkstp, sizeof(*kkstp));
PROC_LOCK(p);
if (error)
break;
}
_PRELE(p);
PROC_UNLOCK(p);
if (lwpidarray != NULL)
free(lwpidarray, M_TEMP);
stack_destroy(st);
free(kkstp, M_TEMP);
return (error);
}
#endif
/*
* This sysctl allows a process to retrieve the full list of groups from
* itself or another process.
*/
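/*
 * Illustrative userland sketch (editor's addition, not part of the original
 * source): fetching another process's group set; "pid" is the target process
 * ID and the array size is an arbitrary upper bound.
 *
 *	int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_GROUPS, pid };
 *	gid_t gids[NGROUPS_MAX];
 *	size_t len = sizeof(gids);
 *
 *	if (sysctl(mib, 4, gids, &len, NULL, 0) == 0)
 *		ngroups = len / sizeof(gid_t);
 */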
static int
sysctl_kern_proc_groups(SYSCTL_HANDLER_ARGS)
{
pid_t *pidp = (pid_t *)arg1;
unsigned int arglen = arg2;
struct proc *p;
struct ucred *cred;
int error;
if (arglen != 1)
return (EINVAL);
if (*pidp == -1) { /* -1 means this process */
p = req->td->td_proc;
} else {
p = pfind(*pidp);
if (p == NULL)
return (ESRCH);
if ((error = p_cansee(curthread, p)) != 0) {
PROC_UNLOCK(p);
return (error);
}
}
cred = crhold(p->p_ucred);
if (*pidp != -1)
PROC_UNLOCK(p);
error = SYSCTL_OUT(req, cred->cr_groups,
cred->cr_ngroups * sizeof(gid_t));
crfree(cred);
return (error);
}
SYSCTL_NODE(_kern, KERN_PROC, proc, CTLFLAG_RD, 0, "Process table");
SYSCTL_PROC(_kern_proc, KERN_PROC_ALL, all, CTLFLAG_RD|CTLTYPE_STRUCT|
CTLFLAG_MPSAFE, 0, 0, sysctl_kern_proc, "S,proc",
"Return entire process table");
static SYSCTL_NODE(_kern_proc, KERN_PROC_GID, gid, CTLFLAG_RD | CTLFLAG_MPSAFE,
sysctl_kern_proc, "Process table");
static SYSCTL_NODE(_kern_proc, KERN_PROC_PGRP, pgrp, CTLFLAG_RD | CTLFLAG_MPSAFE,
sysctl_kern_proc, "Process table");
static SYSCTL_NODE(_kern_proc, KERN_PROC_RGID, rgid, CTLFLAG_RD | CTLFLAG_MPSAFE,
sysctl_kern_proc, "Process table");
static SYSCTL_NODE(_kern_proc, KERN_PROC_SESSION, sid, CTLFLAG_RD |
CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
static SYSCTL_NODE(_kern_proc, KERN_PROC_TTY, tty, CTLFLAG_RD | CTLFLAG_MPSAFE,
sysctl_kern_proc, "Process table");
static SYSCTL_NODE(_kern_proc, KERN_PROC_UID, uid, CTLFLAG_RD | CTLFLAG_MPSAFE,
sysctl_kern_proc, "Process table");
static SYSCTL_NODE(_kern_proc, KERN_PROC_RUID, ruid, CTLFLAG_RD | CTLFLAG_MPSAFE,
sysctl_kern_proc, "Process table");
static SYSCTL_NODE(_kern_proc, KERN_PROC_PID, pid, CTLFLAG_RD | CTLFLAG_MPSAFE,
sysctl_kern_proc, "Process table");
static SYSCTL_NODE(_kern_proc, KERN_PROC_PROC, proc, CTLFLAG_RD | CTLFLAG_MPSAFE,
sysctl_kern_proc, "Return process table, no threads");
static SYSCTL_NODE(_kern_proc, KERN_PROC_ARGS, args,
CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MPSAFE,
sysctl_kern_proc_args, "Process argument list");
static SYSCTL_NODE(_kern_proc, KERN_PROC_PATHNAME, pathname, CTLFLAG_RD |
CTLFLAG_MPSAFE, sysctl_kern_proc_pathname, "Process executable path");
static SYSCTL_NODE(_kern_proc, KERN_PROC_SV_NAME, sv_name, CTLFLAG_RD |
CTLFLAG_MPSAFE, sysctl_kern_proc_sv_name,
"Process syscall vector name (ABI type)");
static SYSCTL_NODE(_kern_proc, (KERN_PROC_GID | KERN_PROC_INC_THREAD), gid_td,
CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
static SYSCTL_NODE(_kern_proc, (KERN_PROC_PGRP | KERN_PROC_INC_THREAD), pgrp_td,
CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
static SYSCTL_NODE(_kern_proc, (KERN_PROC_RGID | KERN_PROC_INC_THREAD), rgid_td,
CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
static SYSCTL_NODE(_kern_proc, (KERN_PROC_SESSION | KERN_PROC_INC_THREAD),
sid_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
static SYSCTL_NODE(_kern_proc, (KERN_PROC_TTY | KERN_PROC_INC_THREAD), tty_td,
CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
static SYSCTL_NODE(_kern_proc, (KERN_PROC_UID | KERN_PROC_INC_THREAD), uid_td,
CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
static SYSCTL_NODE(_kern_proc, (KERN_PROC_RUID | KERN_PROC_INC_THREAD), ruid_td,
CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
static SYSCTL_NODE(_kern_proc, (KERN_PROC_PID | KERN_PROC_INC_THREAD), pid_td,
CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
static SYSCTL_NODE(_kern_proc, (KERN_PROC_PROC | KERN_PROC_INC_THREAD), proc_td,
CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc,
"Return process table, no threads");
#ifdef COMPAT_FREEBSD7
static SYSCTL_NODE(_kern_proc, KERN_PROC_OVMMAP, ovmmap, CTLFLAG_RD |
CTLFLAG_MPSAFE, sysctl_kern_proc_ovmmap, "Old Process vm map entries");
#endif
static SYSCTL_NODE(_kern_proc, KERN_PROC_VMMAP, vmmap, CTLFLAG_RD |
CTLFLAG_MPSAFE, sysctl_kern_proc_vmmap, "Process vm map entries");
#if defined(STACK) || defined(DDB)
static SYSCTL_NODE(_kern_proc, KERN_PROC_KSTACK, kstack, CTLFLAG_RD |
CTLFLAG_MPSAFE, sysctl_kern_proc_kstack, "Process kernel stacks");
#endif
static SYSCTL_NODE(_kern_proc, KERN_PROC_GROUPS, groups, CTLFLAG_RD |
CTLFLAG_MPSAFE, sysctl_kern_proc_groups, "Process groups");
Index: head/sys/kern/kern_prot.c
===================================================================
--- head/sys/kern/kern_prot.c (revision 225616)
+++ head/sys/kern/kern_prot.c (revision 225617)
@@ -1,2220 +1,2220 @@
/*-
* Copyright (c) 1982, 1986, 1989, 1990, 1991, 1993
* The Regents of the University of California.
* (c) UNIX System Laboratories, Inc.
* Copyright (c) 2000-2001 Robert N. M. Watson.
* All rights reserved.
*
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_prot.c 8.6 (Berkeley) 1/21/94
*/
/*
* System calls related to processes and protection
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/acct.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/loginclass.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/refcount.h>
#include <sys/sx.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/sysproto.h>
#include <sys/jail.h>
#include <sys/pioctl.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#ifdef REGRESSION
FEATURE(regression,
"Kernel support for interfaces nessesary for regression testing (SECURITY RISK!)");
#endif
#if defined(INET) || defined(INET6)
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#endif
#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>
static MALLOC_DEFINE(M_CRED, "cred", "credentials");
SYSCTL_NODE(_security, OID_AUTO, bsd, CTLFLAG_RW, 0, "BSD security policy");
static void crextend(struct ucred *cr, int n);
static void crsetgroups_locked(struct ucred *cr, int ngrp,
gid_t *groups);
#ifndef _SYS_SYSPROTO_H_
struct getpid_args {
int dummy;
};
#endif
/* ARGSUSED */
int
-getpid(struct thread *td, struct getpid_args *uap)
+sys_getpid(struct thread *td, struct getpid_args *uap)
{
struct proc *p = td->td_proc;
td->td_retval[0] = p->p_pid;
#if defined(COMPAT_43)
PROC_LOCK(p);
td->td_retval[1] = p->p_pptr->p_pid;
PROC_UNLOCK(p);
#endif
return (0);
}
#ifndef _SYS_SYSPROTO_H_
struct getppid_args {
int dummy;
};
#endif
/* ARGSUSED */
int
-getppid(struct thread *td, struct getppid_args *uap)
+sys_getppid(struct thread *td, struct getppid_args *uap)
{
struct proc *p = td->td_proc;
PROC_LOCK(p);
td->td_retval[0] = p->p_pptr->p_pid;
PROC_UNLOCK(p);
return (0);
}
/*
* Get process group ID; note that POSIX getpgrp takes no parameter.
*/
#ifndef _SYS_SYSPROTO_H_
struct getpgrp_args {
int dummy;
};
#endif
int
-getpgrp(struct thread *td, struct getpgrp_args *uap)
+sys_getpgrp(struct thread *td, struct getpgrp_args *uap)
{
struct proc *p = td->td_proc;
PROC_LOCK(p);
td->td_retval[0] = p->p_pgrp->pg_id;
PROC_UNLOCK(p);
return (0);
}
/* Get an arbitrary pid's process group id */
#ifndef _SYS_SYSPROTO_H_
struct getpgid_args {
pid_t pid;
};
#endif
int
-getpgid(struct thread *td, struct getpgid_args *uap)
+sys_getpgid(struct thread *td, struct getpgid_args *uap)
{
struct proc *p;
int error;
if (uap->pid == 0) {
p = td->td_proc;
PROC_LOCK(p);
} else {
p = pfind(uap->pid);
if (p == NULL)
return (ESRCH);
error = p_cansee(td, p);
if (error) {
PROC_UNLOCK(p);
return (error);
}
}
td->td_retval[0] = p->p_pgrp->pg_id;
PROC_UNLOCK(p);
return (0);
}
/*
* Get an arbitrary pid's session id.
*/
#ifndef _SYS_SYSPROTO_H_
struct getsid_args {
pid_t pid;
};
#endif
int
-getsid(struct thread *td, struct getsid_args *uap)
+sys_getsid(struct thread *td, struct getsid_args *uap)
{
struct proc *p;
int error;
if (uap->pid == 0) {
p = td->td_proc;
PROC_LOCK(p);
} else {
p = pfind(uap->pid);
if (p == NULL)
return (ESRCH);
error = p_cansee(td, p);
if (error) {
PROC_UNLOCK(p);
return (error);
}
}
td->td_retval[0] = p->p_session->s_sid;
PROC_UNLOCK(p);
return (0);
}
#ifndef _SYS_SYSPROTO_H_
struct getuid_args {
int dummy;
};
#endif
/* ARGSUSED */
int
-getuid(struct thread *td, struct getuid_args *uap)
+sys_getuid(struct thread *td, struct getuid_args *uap)
{
td->td_retval[0] = td->td_ucred->cr_ruid;
#if defined(COMPAT_43)
td->td_retval[1] = td->td_ucred->cr_uid;
#endif
return (0);
}
#ifndef _SYS_SYSPROTO_H_
struct geteuid_args {
int dummy;
};
#endif
/* ARGSUSED */
int
-geteuid(struct thread *td, struct geteuid_args *uap)
+sys_geteuid(struct thread *td, struct geteuid_args *uap)
{
td->td_retval[0] = td->td_ucred->cr_uid;
return (0);
}
#ifndef _SYS_SYSPROTO_H_
struct getgid_args {
int dummy;
};
#endif
/* ARGSUSED */
int
-getgid(struct thread *td, struct getgid_args *uap)
+sys_getgid(struct thread *td, struct getgid_args *uap)
{
td->td_retval[0] = td->td_ucred->cr_rgid;
#if defined(COMPAT_43)
td->td_retval[1] = td->td_ucred->cr_groups[0];
#endif
return (0);
}
/*
* Get effective group ID. The "egid" is groups[0], and could be obtained
* via getgroups. This syscall exists because it is somewhat painful to do
* correctly in a library function.
*/
#ifndef _SYS_SYSPROTO_H_
struct getegid_args {
int dummy;
};
#endif
/* ARGSUSED */
int
-getegid(struct thread *td, struct getegid_args *uap)
+sys_getegid(struct thread *td, struct getegid_args *uap)
{
td->td_retval[0] = td->td_ucred->cr_groups[0];
return (0);
}
#ifndef _SYS_SYSPROTO_H_
struct getgroups_args {
u_int gidsetsize;
gid_t *gidset;
};
#endif
int
-getgroups(struct thread *td, register struct getgroups_args *uap)
+sys_getgroups(struct thread *td, register struct getgroups_args *uap)
{
gid_t *groups;
u_int ngrp;
int error;
if (uap->gidsetsize < td->td_ucred->cr_ngroups) {
if (uap->gidsetsize == 0)
ngrp = 0;
else
return (EINVAL);
} else
ngrp = td->td_ucred->cr_ngroups;
groups = malloc(ngrp * sizeof(*groups), M_TEMP, M_WAITOK);
error = kern_getgroups(td, &ngrp, groups);
if (error)
goto out;
if (uap->gidsetsize > 0)
error = copyout(groups, uap->gidset, ngrp * sizeof(gid_t));
if (error == 0)
td->td_retval[0] = ngrp;
out:
free(groups, M_TEMP);
return (error);
}
int
kern_getgroups(struct thread *td, u_int *ngrp, gid_t *groups)
{
struct ucred *cred;
cred = td->td_ucred;
if (*ngrp == 0) {
*ngrp = cred->cr_ngroups;
return (0);
}
if (*ngrp < cred->cr_ngroups)
return (EINVAL);
*ngrp = cred->cr_ngroups;
bcopy(cred->cr_groups, groups, *ngrp * sizeof(gid_t));
return (0);
}
#ifndef _SYS_SYSPROTO_H_
struct setsid_args {
int dummy;
};
#endif
/* ARGSUSED */
int
-setsid(register struct thread *td, struct setsid_args *uap)
+sys_setsid(register struct thread *td, struct setsid_args *uap)
{
struct pgrp *pgrp;
int error;
struct proc *p = td->td_proc;
struct pgrp *newpgrp;
struct session *newsess;
error = 0;
pgrp = NULL;
newpgrp = malloc(sizeof(struct pgrp), M_PGRP, M_WAITOK | M_ZERO);
newsess = malloc(sizeof(struct session), M_SESSION, M_WAITOK | M_ZERO);
sx_xlock(&proctree_lock);
if (p->p_pgid == p->p_pid || (pgrp = pgfind(p->p_pid)) != NULL) {
if (pgrp != NULL)
PGRP_UNLOCK(pgrp);
error = EPERM;
} else {
(void)enterpgrp(p, p->p_pid, newpgrp, newsess);
td->td_retval[0] = p->p_pid;
newpgrp = NULL;
newsess = NULL;
}
sx_xunlock(&proctree_lock);
if (newpgrp != NULL)
free(newpgrp, M_PGRP);
if (newsess != NULL)
free(newsess, M_SESSION);
return (error);
}
/*
* set process group (setpgid/old setpgrp)
*
* caller does setpgid(targpid, targpgid)
*
* pid must be caller or child of caller (ESRCH)
* if a child
* pid must be in same session (EPERM)
* pid can't have done an exec (EACCES)
* if pgid != pid
* there must exist some pid in same session having pgid (EPERM)
* pid must not be session leader (EPERM)
*/
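/*
 * Illustrative example (editor's addition, not part of the original source):
 * a job-control shell typically calls setpgid(child, child) from both the
 * parent and the child when launching a pipeline, so the first process in
 * the job becomes the group leader no matter which side runs first;
 * setpgid(0, 0) is the shorthand a child may use for itself.
 */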
#ifndef _SYS_SYSPROTO_H_
struct setpgid_args {
int pid; /* target process id */
int pgid; /* target pgrp id */
};
#endif
/* ARGSUSED */
int
-setpgid(struct thread *td, register struct setpgid_args *uap)
+sys_setpgid(struct thread *td, register struct setpgid_args *uap)
{
struct proc *curp = td->td_proc;
register struct proc *targp; /* target process */
register struct pgrp *pgrp; /* target pgrp */
int error;
struct pgrp *newpgrp;
if (uap->pgid < 0)
return (EINVAL);
error = 0;
newpgrp = malloc(sizeof(struct pgrp), M_PGRP, M_WAITOK | M_ZERO);
sx_xlock(&proctree_lock);
if (uap->pid != 0 && uap->pid != curp->p_pid) {
if ((targp = pfind(uap->pid)) == NULL) {
error = ESRCH;
goto done;
}
if (!inferior(targp)) {
PROC_UNLOCK(targp);
error = ESRCH;
goto done;
}
if ((error = p_cansee(td, targp))) {
PROC_UNLOCK(targp);
goto done;
}
if (targp->p_pgrp == NULL ||
targp->p_session != curp->p_session) {
PROC_UNLOCK(targp);
error = EPERM;
goto done;
}
if (targp->p_flag & P_EXEC) {
PROC_UNLOCK(targp);
error = EACCES;
goto done;
}
PROC_UNLOCK(targp);
} else
targp = curp;
if (SESS_LEADER(targp)) {
error = EPERM;
goto done;
}
if (uap->pgid == 0)
uap->pgid = targp->p_pid;
if ((pgrp = pgfind(uap->pgid)) == NULL) {
if (uap->pgid == targp->p_pid) {
error = enterpgrp(targp, uap->pgid, newpgrp,
NULL);
if (error == 0)
newpgrp = NULL;
} else
error = EPERM;
} else {
if (pgrp == targp->p_pgrp) {
PGRP_UNLOCK(pgrp);
goto done;
}
if (pgrp->pg_id != targp->p_pid &&
pgrp->pg_session != curp->p_session) {
PGRP_UNLOCK(pgrp);
error = EPERM;
goto done;
}
PGRP_UNLOCK(pgrp);
error = enterthispgrp(targp, pgrp);
}
done:
sx_xunlock(&proctree_lock);
KASSERT((error == 0) || (newpgrp != NULL),
("setpgid failed and newpgrp is NULL"));
if (newpgrp != NULL)
free(newpgrp, M_PGRP);
return (error);
}
/*
* Use the clause in B.4.2.2 that allows setuid/setgid to be 4.2/4.3BSD
* compatible. It says that setting the uid/gid to euid/egid is a special
* case of "appropriate privilege". Once the rules are expanded out, this
* basically means that setuid(nnn) sets all three id's, in all permitted
* cases unless _POSIX_SAVED_IDS is enabled. In that case, setuid(getuid())
* does not set the saved id - this is dangerous for traditional BSD
* programs. For this reason, we *really* do not want to set
* _POSIX_SAVED_IDS and do not want to clear POSIX_APPENDIX_B_4_2_2.
*/
#define POSIX_APPENDIX_B_4_2_2
#ifndef _SYS_SYSPROTO_H_
struct setuid_args {
uid_t uid;
};
#endif
/* ARGSUSED */
int
-setuid(struct thread *td, struct setuid_args *uap)
+sys_setuid(struct thread *td, struct setuid_args *uap)
{
struct proc *p = td->td_proc;
struct ucred *newcred, *oldcred;
uid_t uid;
struct uidinfo *uip;
int error;
uid = uap->uid;
AUDIT_ARG_UID(uid);
newcred = crget();
uip = uifind(uid);
PROC_LOCK(p);
/*
* Copy credentials so other references do not see our changes.
*/
oldcred = crcopysafe(p, newcred);
#ifdef MAC
error = mac_cred_check_setuid(oldcred, uid);
if (error)
goto fail;
#endif
/*
* See if we have "permission" by POSIX 1003.1 rules.
*
* Note that setuid(geteuid()) is a special case of
* "appropriate privileges" in appendix B.4.2.2. We need
* to use this clause to be compatible with traditional BSD
* semantics. Basically, it means that "setuid(xx)" sets all
* three id's (assuming you have privs).
*
* Notes on the logic. We do things in three steps.
* 1: We determine if the euid is going to change, and do EPERM
* right away. We unconditionally change the euid later if this
* test is satisfied, simplifying that part of the logic.
* 2: We determine if the real and/or saved uids are going to
* change. Determined by compile options.
* 3: Change euid last. (after tests in #2 for "appropriate privs")
*/
if (uid != oldcred->cr_ruid && /* allow setuid(getuid()) */
#ifdef _POSIX_SAVED_IDS
uid != oldcred->cr_svuid && /* allow setuid(saved uid) */
#endif
#ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */
uid != oldcred->cr_uid && /* allow setuid(geteuid()) */
#endif
(error = priv_check_cred(oldcred, PRIV_CRED_SETUID, 0)) != 0)
goto fail;
#ifdef _POSIX_SAVED_IDS
/*
* Do we have "appropriate privileges" (are we root or uid == euid)
* If so, we are changing the real uid and/or saved uid.
*/
if (
#ifdef POSIX_APPENDIX_B_4_2_2 /* Use the clause from B.4.2.2 */
uid == oldcred->cr_uid ||
#endif
/* We are using privs. */
priv_check_cred(oldcred, PRIV_CRED_SETUID, 0) == 0)
#endif
{
/*
* Set the real uid and transfer proc count to new user.
*/
if (uid != oldcred->cr_ruid) {
change_ruid(newcred, uip);
setsugid(p);
}
/*
* Set saved uid
*
* XXX always set saved uid even if not _POSIX_SAVED_IDS, as
* the security of seteuid() depends on it. B.4.2.2 says it
* is important that we should do this.
*/
if (uid != oldcred->cr_svuid) {
change_svuid(newcred, uid);
setsugid(p);
}
}
/*
* In all permitted cases, we are changing the euid.
*/
if (uid != oldcred->cr_uid) {
change_euid(newcred, uip);
setsugid(p);
}
p->p_ucred = newcred;
PROC_UNLOCK(p);
#ifdef RACCT
racct_proc_ucred_changed(p, oldcred, newcred);
#endif
uifree(uip);
crfree(oldcred);
return (0);
fail:
PROC_UNLOCK(p);
uifree(uip);
crfree(newcred);
return (error);
}
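/*
 * Illustrative example (editor's addition, not part of the original source):
 * for a set-user-ID root program running with ruid=1001, euid=0, svuid=0,
 * a successful setuid(1001) passes the B.4.2.2 check (uid == cr_ruid) and,
 * per the code above, sets all three IDs to 1001; a later seteuid(0) then
 * fails because neither the real nor the saved uid is 0 any more.
 */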
#ifndef _SYS_SYSPROTO_H_
struct seteuid_args {
uid_t euid;
};
#endif
/* ARGSUSED */
int
-seteuid(struct thread *td, struct seteuid_args *uap)
+sys_seteuid(struct thread *td, struct seteuid_args *uap)
{
struct proc *p = td->td_proc;
struct ucred *newcred, *oldcred;
uid_t euid;
struct uidinfo *euip;
int error;
euid = uap->euid;
AUDIT_ARG_EUID(euid);
newcred = crget();
euip = uifind(euid);
PROC_LOCK(p);
/*
* Copy credentials so other references do not see our changes.
*/
oldcred = crcopysafe(p, newcred);
#ifdef MAC
error = mac_cred_check_seteuid(oldcred, euid);
if (error)
goto fail;
#endif
if (euid != oldcred->cr_ruid && /* allow seteuid(getuid()) */
euid != oldcred->cr_svuid && /* allow seteuid(saved uid) */
(error = priv_check_cred(oldcred, PRIV_CRED_SETEUID, 0)) != 0)
goto fail;
/*
* Everything's okay, do it.
*/
if (oldcred->cr_uid != euid) {
change_euid(newcred, euip);
setsugid(p);
}
p->p_ucred = newcred;
PROC_UNLOCK(p);
uifree(euip);
crfree(oldcred);
return (0);
fail:
PROC_UNLOCK(p);
uifree(euip);
crfree(newcred);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct setgid_args {
gid_t gid;
};
#endif
/* ARGSUSED */
int
-setgid(struct thread *td, struct setgid_args *uap)
+sys_setgid(struct thread *td, struct setgid_args *uap)
{
struct proc *p = td->td_proc;
struct ucred *newcred, *oldcred;
gid_t gid;
int error;
gid = uap->gid;
AUDIT_ARG_GID(gid);
newcred = crget();
PROC_LOCK(p);
oldcred = crcopysafe(p, newcred);
#ifdef MAC
error = mac_cred_check_setgid(oldcred, gid);
if (error)
goto fail;
#endif
/*
* See if we have "permission" by POSIX 1003.1 rules.
*
* Note that setgid(getegid()) is a special case of
* "appropriate privileges" in appendix B.4.2.2. We need
* to use this clause to be compatible with traditional BSD
* semantics. Basically, it means that "setgid(xx)" sets all
* three id's (assuming you have privs).
*
* For notes on the logic here, see setuid() above.
*/
if (gid != oldcred->cr_rgid && /* allow setgid(getgid()) */
#ifdef _POSIX_SAVED_IDS
gid != oldcred->cr_svgid && /* allow setgid(saved gid) */
#endif
#ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */
gid != oldcred->cr_groups[0] && /* allow setgid(getegid()) */
#endif
(error = priv_check_cred(oldcred, PRIV_CRED_SETGID, 0)) != 0)
goto fail;
#ifdef _POSIX_SAVED_IDS
/*
* Do we have "appropriate privileges" (are we root or gid == egid)
* If so, we are changing the real gid and saved gid.
*/
if (
#ifdef POSIX_APPENDIX_B_4_2_2 /* use the clause from B.4.2.2 */
gid == oldcred->cr_groups[0] ||
#endif
/* We are using privs. */
priv_check_cred(oldcred, PRIV_CRED_SETGID, 0) == 0)
#endif
{
/*
* Set real gid
*/
if (oldcred->cr_rgid != gid) {
change_rgid(newcred, gid);
setsugid(p);
}
/*
* Set saved gid
*
* XXX always set saved gid even if not _POSIX_SAVED_IDS, as
* the security of setegid() depends on it. B.4.2.2 says it
* is important that we should do this.
*/
if (oldcred->cr_svgid != gid) {
change_svgid(newcred, gid);
setsugid(p);
}
}
/*
* In all permitted cases, we are changing the egid.
* Copy credentials so other references do not see our changes.
*/
if (oldcred->cr_groups[0] != gid) {
change_egid(newcred, gid);
setsugid(p);
}
p->p_ucred = newcred;
PROC_UNLOCK(p);
crfree(oldcred);
return (0);
fail:
PROC_UNLOCK(p);
crfree(newcred);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct setegid_args {
gid_t egid;
};
#endif
/* ARGSUSED */
int
-setegid(struct thread *td, struct setegid_args *uap)
+sys_setegid(struct thread *td, struct setegid_args *uap)
{
struct proc *p = td->td_proc;
struct ucred *newcred, *oldcred;
gid_t egid;
int error;
egid = uap->egid;
AUDIT_ARG_EGID(egid);
newcred = crget();
PROC_LOCK(p);
oldcred = crcopysafe(p, newcred);
#ifdef MAC
error = mac_cred_check_setegid(oldcred, egid);
if (error)
goto fail;
#endif
if (egid != oldcred->cr_rgid && /* allow setegid(getgid()) */
egid != oldcred->cr_svgid && /* allow setegid(saved gid) */
(error = priv_check_cred(oldcred, PRIV_CRED_SETEGID, 0)) != 0)
goto fail;
if (oldcred->cr_groups[0] != egid) {
change_egid(newcred, egid);
setsugid(p);
}
p->p_ucred = newcred;
PROC_UNLOCK(p);
crfree(oldcred);
return (0);
fail:
PROC_UNLOCK(p);
crfree(newcred);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct setgroups_args {
u_int gidsetsize;
gid_t *gidset;
};
#endif
/* ARGSUSED */
int
-setgroups(struct thread *td, struct setgroups_args *uap)
+sys_setgroups(struct thread *td, struct setgroups_args *uap)
{
gid_t *groups = NULL;
int error;
if (uap->gidsetsize > ngroups_max + 1)
return (EINVAL);
groups = malloc(uap->gidsetsize * sizeof(gid_t), M_TEMP, M_WAITOK);
error = copyin(uap->gidset, groups, uap->gidsetsize * sizeof(gid_t));
if (error)
goto out;
error = kern_setgroups(td, uap->gidsetsize, groups);
out:
free(groups, M_TEMP);
return (error);
}
int
kern_setgroups(struct thread *td, u_int ngrp, gid_t *groups)
{
struct proc *p = td->td_proc;
struct ucred *newcred, *oldcred;
int error;
if (ngrp > ngroups_max + 1)
return (EINVAL);
AUDIT_ARG_GROUPSET(groups, ngrp);
newcred = crget();
crextend(newcred, ngrp);
PROC_LOCK(p);
oldcred = crcopysafe(p, newcred);
#ifdef MAC
error = mac_cred_check_setgroups(oldcred, ngrp, groups);
if (error)
goto fail;
#endif
error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS, 0);
if (error)
goto fail;
if (ngrp < 1) {
/*
* setgroups(0, NULL) is a legitimate way of clearing the
* groups vector on non-BSD systems (which generally do not
* have the egid in the groups[0]). We risk security holes
* when running non-BSD software if we do not do the same.
*/
newcred->cr_ngroups = 1;
} else {
crsetgroups_locked(newcred, ngrp, groups);
}
setsugid(p);
p->p_ucred = newcred;
PROC_UNLOCK(p);
crfree(oldcred);
return (0);
fail:
PROC_UNLOCK(p);
crfree(newcred);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct setreuid_args {
uid_t ruid;
uid_t euid;
};
#endif
/* ARGSUSED */
int
-setreuid(register struct thread *td, struct setreuid_args *uap)
+sys_setreuid(register struct thread *td, struct setreuid_args *uap)
{
struct proc *p = td->td_proc;
struct ucred *newcred, *oldcred;
uid_t euid, ruid;
struct uidinfo *euip, *ruip;
int error;
euid = uap->euid;
ruid = uap->ruid;
AUDIT_ARG_EUID(euid);
AUDIT_ARG_RUID(ruid);
newcred = crget();
euip = uifind(euid);
ruip = uifind(ruid);
PROC_LOCK(p);
oldcred = crcopysafe(p, newcred);
#ifdef MAC
error = mac_cred_check_setreuid(oldcred, ruid, euid);
if (error)
goto fail;
#endif
if (((ruid != (uid_t)-1 && ruid != oldcred->cr_ruid &&
ruid != oldcred->cr_svuid) ||
(euid != (uid_t)-1 && euid != oldcred->cr_uid &&
euid != oldcred->cr_ruid && euid != oldcred->cr_svuid)) &&
(error = priv_check_cred(oldcred, PRIV_CRED_SETREUID, 0)) != 0)
goto fail;
if (euid != (uid_t)-1 && oldcred->cr_uid != euid) {
change_euid(newcred, euip);
setsugid(p);
}
if (ruid != (uid_t)-1 && oldcred->cr_ruid != ruid) {
change_ruid(newcred, ruip);
setsugid(p);
}
if ((ruid != (uid_t)-1 || newcred->cr_uid != newcred->cr_ruid) &&
newcred->cr_svuid != newcred->cr_uid) {
change_svuid(newcred, newcred->cr_uid);
setsugid(p);
}
p->p_ucred = newcred;
PROC_UNLOCK(p);
#ifdef RACCT
racct_proc_ucred_changed(p, oldcred, newcred);
#endif
uifree(ruip);
uifree(euip);
crfree(oldcred);
return (0);
fail:
PROC_UNLOCK(p);
uifree(ruip);
uifree(euip);
crfree(newcred);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct setregid_args {
gid_t rgid;
gid_t egid;
};
#endif
/* ARGSUSED */
int
-setregid(register struct thread *td, struct setregid_args *uap)
+sys_setregid(register struct thread *td, struct setregid_args *uap)
{
struct proc *p = td->td_proc;
struct ucred *newcred, *oldcred;
gid_t egid, rgid;
int error;
egid = uap->egid;
rgid = uap->rgid;
AUDIT_ARG_EGID(egid);
AUDIT_ARG_RGID(rgid);
newcred = crget();
PROC_LOCK(p);
oldcred = crcopysafe(p, newcred);
#ifdef MAC
error = mac_cred_check_setregid(oldcred, rgid, egid);
if (error)
goto fail;
#endif
if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid &&
rgid != oldcred->cr_svgid) ||
(egid != (gid_t)-1 && egid != oldcred->cr_groups[0] &&
egid != oldcred->cr_rgid && egid != oldcred->cr_svgid)) &&
(error = priv_check_cred(oldcred, PRIV_CRED_SETREGID, 0)) != 0)
goto fail;
if (egid != (gid_t)-1 && oldcred->cr_groups[0] != egid) {
change_egid(newcred, egid);
setsugid(p);
}
if (rgid != (gid_t)-1 && oldcred->cr_rgid != rgid) {
change_rgid(newcred, rgid);
setsugid(p);
}
if ((rgid != (gid_t)-1 || newcred->cr_groups[0] != newcred->cr_rgid) &&
newcred->cr_svgid != newcred->cr_groups[0]) {
change_svgid(newcred, newcred->cr_groups[0]);
setsugid(p);
}
p->p_ucred = newcred;
PROC_UNLOCK(p);
crfree(oldcred);
return (0);
fail:
PROC_UNLOCK(p);
crfree(newcred);
return (error);
}
/*
* setresuid(ruid, euid, suid) is like setreuid except control over the saved
* uid is explicit.
*/
#ifndef _SYS_SYSPROTO_H_
struct setresuid_args {
uid_t ruid;
uid_t euid;
uid_t suid;
};
#endif
/* ARGSUSED */
int
-setresuid(register struct thread *td, struct setresuid_args *uap)
+sys_setresuid(register struct thread *td, struct setresuid_args *uap)
{
struct proc *p = td->td_proc;
struct ucred *newcred, *oldcred;
uid_t euid, ruid, suid;
struct uidinfo *euip, *ruip;
int error;
euid = uap->euid;
ruid = uap->ruid;
suid = uap->suid;
AUDIT_ARG_EUID(euid);
AUDIT_ARG_RUID(ruid);
AUDIT_ARG_SUID(suid);
newcred = crget();
euip = uifind(euid);
ruip = uifind(ruid);
PROC_LOCK(p);
oldcred = crcopysafe(p, newcred);
#ifdef MAC
error = mac_cred_check_setresuid(oldcred, ruid, euid, suid);
if (error)
goto fail;
#endif
if (((ruid != (uid_t)-1 && ruid != oldcred->cr_ruid &&
ruid != oldcred->cr_svuid &&
ruid != oldcred->cr_uid) ||
(euid != (uid_t)-1 && euid != oldcred->cr_ruid &&
euid != oldcred->cr_svuid &&
euid != oldcred->cr_uid) ||
(suid != (uid_t)-1 && suid != oldcred->cr_ruid &&
suid != oldcred->cr_svuid &&
suid != oldcred->cr_uid)) &&
(error = priv_check_cred(oldcred, PRIV_CRED_SETRESUID, 0)) != 0)
goto fail;
if (euid != (uid_t)-1 && oldcred->cr_uid != euid) {
change_euid(newcred, euip);
setsugid(p);
}
if (ruid != (uid_t)-1 && oldcred->cr_ruid != ruid) {
change_ruid(newcred, ruip);
setsugid(p);
}
if (suid != (uid_t)-1 && oldcred->cr_svuid != suid) {
change_svuid(newcred, suid);
setsugid(p);
}
p->p_ucred = newcred;
PROC_UNLOCK(p);
#ifdef RACCT
racct_proc_ucred_changed(p, oldcred, newcred);
#endif
uifree(ruip);
uifree(euip);
crfree(oldcred);
return (0);
fail:
PROC_UNLOCK(p);
uifree(ruip);
uifree(euip);
crfree(newcred);
return (error);
}
/*
* setresgid(rgid, egid, sgid) is like setregid except control over the saved
* gid is explicit.
*/
#ifndef _SYS_SYSPROTO_H_
struct setresgid_args {
gid_t rgid;
gid_t egid;
gid_t sgid;
};
#endif
/* ARGSUSED */
int
-setresgid(register struct thread *td, struct setresgid_args *uap)
+sys_setresgid(register struct thread *td, struct setresgid_args *uap)
{
struct proc *p = td->td_proc;
struct ucred *newcred, *oldcred;
gid_t egid, rgid, sgid;
int error;
egid = uap->egid;
rgid = uap->rgid;
sgid = uap->sgid;
AUDIT_ARG_EGID(egid);
AUDIT_ARG_RGID(rgid);
AUDIT_ARG_SGID(sgid);
newcred = crget();
PROC_LOCK(p);
oldcred = crcopysafe(p, newcred);
#ifdef MAC
error = mac_cred_check_setresgid(oldcred, rgid, egid, sgid);
if (error)
goto fail;
#endif
if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid &&
rgid != oldcred->cr_svgid &&
rgid != oldcred->cr_groups[0]) ||
(egid != (gid_t)-1 && egid != oldcred->cr_rgid &&
egid != oldcred->cr_svgid &&
egid != oldcred->cr_groups[0]) ||
(sgid != (gid_t)-1 && sgid != oldcred->cr_rgid &&
sgid != oldcred->cr_svgid &&
sgid != oldcred->cr_groups[0])) &&
(error = priv_check_cred(oldcred, PRIV_CRED_SETRESGID, 0)) != 0)
goto fail;
if (egid != (gid_t)-1 && oldcred->cr_groups[0] != egid) {
change_egid(newcred, egid);
setsugid(p);
}
if (rgid != (gid_t)-1 && oldcred->cr_rgid != rgid) {
change_rgid(newcred, rgid);
setsugid(p);
}
if (sgid != (gid_t)-1 && oldcred->cr_svgid != sgid) {
change_svgid(newcred, sgid);
setsugid(p);
}
p->p_ucred = newcred;
PROC_UNLOCK(p);
crfree(oldcred);
return (0);
fail:
PROC_UNLOCK(p);
crfree(newcred);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct getresuid_args {
uid_t *ruid;
uid_t *euid;
uid_t *suid;
};
#endif
/* ARGSUSED */
int
-getresuid(register struct thread *td, struct getresuid_args *uap)
+sys_getresuid(register struct thread *td, struct getresuid_args *uap)
{
struct ucred *cred;
int error1 = 0, error2 = 0, error3 = 0;
cred = td->td_ucred;
if (uap->ruid)
error1 = copyout(&cred->cr_ruid,
uap->ruid, sizeof(cred->cr_ruid));
if (uap->euid)
error2 = copyout(&cred->cr_uid,
uap->euid, sizeof(cred->cr_uid));
if (uap->suid)
error3 = copyout(&cred->cr_svuid,
uap->suid, sizeof(cred->cr_svuid));
return (error1 ? error1 : error2 ? error2 : error3);
}
#ifndef _SYS_SYSPROTO_H_
struct getresgid_args {
gid_t *rgid;
gid_t *egid;
gid_t *sgid;
};
#endif
/* ARGSUSED */
int
-getresgid(register struct thread *td, struct getresgid_args *uap)
+sys_getresgid(register struct thread *td, struct getresgid_args *uap)
{
struct ucred *cred;
int error1 = 0, error2 = 0, error3 = 0;
cred = td->td_ucred;
if (uap->rgid)
error1 = copyout(&cred->cr_rgid,
uap->rgid, sizeof(cred->cr_rgid));
if (uap->egid)
error2 = copyout(&cred->cr_groups[0],
uap->egid, sizeof(cred->cr_groups[0]));
if (uap->sgid)
error3 = copyout(&cred->cr_svgid,
uap->sgid, sizeof(cred->cr_svgid));
return (error1 ? error1 : error2 ? error2 : error3);
}
#ifndef _SYS_SYSPROTO_H_
struct issetugid_args {
int dummy;
};
#endif
/* ARGSUSED */
int
-issetugid(register struct thread *td, struct issetugid_args *uap)
+sys_issetugid(register struct thread *td, struct issetugid_args *uap)
{
struct proc *p = td->td_proc;
/*
* Note: OpenBSD sets a P_SUGIDEXEC flag at execve() time; we use
* P_SUGID because we consider changing the owners to be "tainting"
* as well.
* This is significant for procs that start as root and "become"
* a user without an exec - programs cannot know *everything*
* that libc *might* have put in their data segment.
*/
PROC_LOCK(p);
td->td_retval[0] = (p->p_flag & P_SUGID) ? 1 : 0;
PROC_UNLOCK(p);
return (0);
}
int
-__setugid(struct thread *td, struct __setugid_args *uap)
+sys___setugid(struct thread *td, struct __setugid_args *uap)
{
#ifdef REGRESSION
struct proc *p;
p = td->td_proc;
switch (uap->flag) {
case 0:
PROC_LOCK(p);
p->p_flag &= ~P_SUGID;
PROC_UNLOCK(p);
return (0);
case 1:
PROC_LOCK(p);
p->p_flag |= P_SUGID;
PROC_UNLOCK(p);
return (0);
default:
return (EINVAL);
}
#else /* !REGRESSION */
return (ENOSYS);
#endif /* REGRESSION */
}
/*
* Check if gid is a member of the group set.
*/
int
groupmember(gid_t gid, struct ucred *cred)
{
int l;
int h;
int m;
if (cred->cr_groups[0] == gid)
return(1);
/*
* If gid was not our primary group, perform a binary search
* of the supplemental groups. This is possible because we
* sort the groups in crsetgroups().
*/
l = 1;
h = cred->cr_ngroups;
while (l < h) {
m = l + ((h - l) / 2);
if (cred->cr_groups[m] < gid)
l = m + 1;
else
h = m;
}
if ((l < cred->cr_ngroups) && (cred->cr_groups[l] == gid))
return (1);
return (0);
}
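/*
 * Illustrative example (editor's addition, not part of the original source):
 * with cr_groups = { 5 (egid), 0, 20, 100 } and cr_ngroups = 4, the
 * supplemental entries at indices 1..3 are kept sorted by crsetgroups(), so
 * groupmember(20, cred) narrows [1, 4) to index 2 and returns 1, while
 * groupmember(7, cred) converges on index 2 as well, finds 20 != 7, and
 * returns 0.
 */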
/*
* Test the active securelevel against a given level. securelevel_gt()
* implements (securelevel > level). securelevel_ge() implements
* (securelevel >= level). Note that the logic is inverted -- these
* functions return EPERM on "success" and 0 on "failure".
*
* Due to care taken when setting the securelevel, we know that no jail will
* be less secure than its parent (or the physical system), so it is sufficient
* to test the current jail only.
*
* XXXRW: Possibly since this has to do with privilege, it should move to
* kern_priv.c.
*/
int
securelevel_gt(struct ucred *cr, int level)
{
return (cr->cr_prison->pr_securelevel > level ? EPERM : 0);
}
int
securelevel_ge(struct ucred *cr, int level)
{
return (cr->cr_prison->pr_securelevel >= level ? EPERM : 0);
}
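/*
 * Illustrative example (editor's addition, not part of the original source):
 * with the prison's pr_securelevel at 1, securelevel_gt(cr, 0) and
 * securelevel_ge(cr, 1) both return EPERM, while securelevel_gt(cr, 1)
 * returns 0 and the caller may proceed.
 */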
/*
* 'see_other_uids' determines whether or not visibility of processes
* and sockets with credentials holding different real uids is possible
* using a variety of system MIBs.
* XXX: data declarations should be together near the beginning of the file.
*/
static int see_other_uids = 1;
SYSCTL_INT(_security_bsd, OID_AUTO, see_other_uids, CTLFLAG_RW,
&see_other_uids, 0,
"Unprivileged processes may see subjects/objects with different real uid");
/*-
* Determine if u1 "can see" the subject specified by u2, according to the
* 'see_other_uids' policy.
* Returns: 0 for permitted, ESRCH otherwise
* Locks: none
* References: *u1 and *u2 must not change during the call
* u1 may equal u2, in which case only one reference is required
*/
static int
cr_seeotheruids(struct ucred *u1, struct ucred *u2)
{
if (!see_other_uids && u1->cr_ruid != u2->cr_ruid) {
if (priv_check_cred(u1, PRIV_SEEOTHERUIDS, 0) != 0)
return (ESRCH);
}
return (0);
}
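/*
 * Illustrative example (editor's addition, not part of the original source):
 * with the security.bsd.see_other_uids sysctl set to 0, an unprivileged
 * user's ps(1) listing shrinks to processes sharing that user's real uid,
 * because cr_seeotheruids() returns ESRCH for everything else unless the
 * caller holds PRIV_SEEOTHERUIDS.
 */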
/*
* 'see_other_gids' determines whether or not visibility of processes
* and sockets with credentials holding different real gids is possible
* using a variety of system MIBs.
* XXX: data declarations should be together near the beginning of the file.
*/
static int see_other_gids = 1;
SYSCTL_INT(_security_bsd, OID_AUTO, see_other_gids, CTLFLAG_RW,
&see_other_gids, 0,
"Unprivileged processes may see subjects/objects with different real gid");
/*
* Determine if u1 can "see" the subject specified by u2, according to the
* 'see_other_gids' policy.
* Returns: 0 for permitted, ESRCH otherwise
* Locks: none
* References: *u1 and *u2 must not change during the call
* u1 may equal u2, in which case only one reference is required
*/
static int
cr_seeothergids(struct ucred *u1, struct ucred *u2)
{
int i, match;
if (!see_other_gids) {
match = 0;
for (i = 0; i < u1->cr_ngroups; i++) {
if (groupmember(u1->cr_groups[i], u2))
match = 1;
if (match)
break;
}
if (!match) {
if (priv_check_cred(u1, PRIV_SEEOTHERGIDS, 0) != 0)
return (ESRCH);
}
}
return (0);
}
/*-
* Determine if u1 "can see" the subject specified by u2.
* Returns: 0 for permitted, an errno value otherwise
* Locks: none
* References: *u1 and *u2 must not change during the call
* u1 may equal u2, in which case only one reference is required
*/
int
cr_cansee(struct ucred *u1, struct ucred *u2)
{
int error;
if ((error = prison_check(u1, u2)))
return (error);
#ifdef MAC
if ((error = mac_cred_check_visible(u1, u2)))
return (error);
#endif
if ((error = cr_seeotheruids(u1, u2)))
return (error);
if ((error = cr_seeothergids(u1, u2)))
return (error);
return (0);
}
/*-
* Determine if td "can see" the subject specified by p.
* Returns: 0 for permitted, an errno value otherwise
* Locks: Sufficient locks to protect p->p_ucred must be held. td really
* should be curthread.
* References: td and p must be valid for the lifetime of the call
*/
int
p_cansee(struct thread *td, struct proc *p)
{
/* Wrap cr_cansee() for all functionality. */
KASSERT(td == curthread, ("%s: td not curthread", __func__));
PROC_LOCK_ASSERT(p, MA_OWNED);
return (cr_cansee(td->td_ucred, p->p_ucred));
}
/*
* 'conservative_signals' prevents the delivery of a broad class of
* signals by unprivileged processes to processes that have changed their
* credentials since the last invocation of execve(). This can prevent
* the leakage of cached information or retained privileges as a result
* of a common class of signal-related vulnerabilities. However, this
* may interfere with some applications that expect to be able to
* deliver these signals to peer processes after having given up
* privilege.
*/
static int conservative_signals = 1;
SYSCTL_INT(_security_bsd, OID_AUTO, conservative_signals, CTLFLAG_RW,
&conservative_signals, 0, "Unprivileged processes prevented from "
"sending certain signals to processes whose credentials have changed");
/*-
* Determine whether cred may deliver the specified signal to proc.
* Returns: 0 for permitted, an errno value otherwise.
* Locks: A lock must be held for proc.
* References: cred and proc must be valid for the lifetime of the call.
*/
int
cr_cansignal(struct ucred *cred, struct proc *proc, int signum)
{
int error;
PROC_LOCK_ASSERT(proc, MA_OWNED);
/*
* Jail semantics limit the scope of signalling to proc in the
* same jail as cred, if cred is in jail.
*/
error = prison_check(cred, proc->p_ucred);
if (error)
return (error);
#ifdef MAC
if ((error = mac_proc_check_signal(cred, proc, signum)))
return (error);
#endif
if ((error = cr_seeotheruids(cred, proc->p_ucred)))
return (error);
if ((error = cr_seeothergids(cred, proc->p_ucred)))
return (error);
/*
* UNIX signal semantics depend on the status of the P_SUGID
* bit on the target process. If the bit is set, then additional
* restrictions are placed on the set of available signals.
*/
if (conservative_signals && (proc->p_flag & P_SUGID)) {
switch (signum) {
case 0:
case SIGKILL:
case SIGINT:
case SIGTERM:
case SIGALRM:
case SIGSTOP:
case SIGTTIN:
case SIGTTOU:
case SIGTSTP:
case SIGHUP:
case SIGUSR1:
case SIGUSR2:
/*
* Generally, permit job and terminal control
* signals.
*/
break;
default:
/* Not permitted without privilege. */
error = priv_check_cred(cred, PRIV_SIGNAL_SUGID, 0);
if (error)
return (error);
}
}
/*
* Generally, the target credential's ruid or svuid must match the
* subject credential's ruid or euid.
*/
if (cred->cr_ruid != proc->p_ucred->cr_ruid &&
cred->cr_ruid != proc->p_ucred->cr_svuid &&
cred->cr_uid != proc->p_ucred->cr_ruid &&
cred->cr_uid != proc->p_ucred->cr_svuid) {
error = priv_check_cred(cred, PRIV_SIGNAL_DIFFCRED, 0);
if (error)
return (error);
}
return (0);
}
/*-
* Determine whether td may deliver the specified signal to p.
* Returns: 0 for permitted, an errno value otherwise
* Locks: Sufficient locks to protect various components of td and p
* must be held. td must be curthread, and a lock must be
* held for p.
* References: td and p must be valid for the lifetime of the call
*/
int
p_cansignal(struct thread *td, struct proc *p, int signum)
{
KASSERT(td == curthread, ("%s: td not curthread", __func__));
PROC_LOCK_ASSERT(p, MA_OWNED);
if (td->td_proc == p)
return (0);
/*
* UNIX signalling semantics require that processes in the same
* session always be able to deliver SIGCONT to one another,
* overriding the remaining protections.
*/
/* XXX: This will require an additional lock of some sort. */
if (signum == SIGCONT && td->td_proc->p_session == p->p_session)
return (0);
/*
* Some compat layers use SIGTHR and higher signals for
* communication between different kernel threads of the same
* process, and expect that delivering them is always possible,
* even for suid applications where cr_cansignal() would deny it
* for security reasons. Allowing this should be safe, since the
* only way to create two processes with the same p_leader is via
* rfork(2).
*/
if (td->td_proc->p_leader != NULL && signum >= SIGTHR &&
signum < SIGTHR + 4 && td->td_proc->p_leader == p->p_leader)
return (0);
return (cr_cansignal(td->td_ucred, p, signum));
}
/*-
* Determine whether td may reschedule p.
* Returns: 0 for permitted, an errno value otherwise
* Locks: Sufficient locks to protect various components of td and p
* must be held. td must be curthread, and a lock must
* be held for p.
* References: td and p must be valid for the lifetime of the call
*/
int
p_cansched(struct thread *td, struct proc *p)
{
int error;
KASSERT(td == curthread, ("%s: td not curthread", __func__));
PROC_LOCK_ASSERT(p, MA_OWNED);
if (td->td_proc == p)
return (0);
if ((error = prison_check(td->td_ucred, p->p_ucred)))
return (error);
#ifdef MAC
if ((error = mac_proc_check_sched(td->td_ucred, p)))
return (error);
#endif
if ((error = cr_seeotheruids(td->td_ucred, p->p_ucred)))
return (error);
if ((error = cr_seeothergids(td->td_ucred, p->p_ucred)))
return (error);
if (td->td_ucred->cr_ruid != p->p_ucred->cr_ruid &&
td->td_ucred->cr_uid != p->p_ucred->cr_ruid) {
error = priv_check(td, PRIV_SCHED_DIFFCRED);
if (error)
return (error);
}
return (0);
}
/*
* The 'unprivileged_proc_debug' flag may be used to disable a variety of
* unprivileged inter-process debugging services, including some procfs
* functionality, ptrace(), and ktrace(). In the past, inter-process
* debugging has been involved in a variety of security problems, and sites
* not requiring the service might choose to disable it when hardening
* systems.
*
* XXX: Should modifying and reading this variable require locking?
* XXX: data declarations should be together near the beginning of the file.
*/
static int unprivileged_proc_debug = 1;
SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_proc_debug, CTLFLAG_RW,
&unprivileged_proc_debug, 0,
"Unprivileged processes may use process debugging facilities");
/*-
* Determine whether td may debug p.
* Returns: 0 for permitted, an errno value otherwise
* Locks: Sufficient locks to protect various components of td and p
* must be held. td must be curthread, and a lock must
* be held for p.
* References: td and p must be valid for the lifetime of the call
*/
int
p_candebug(struct thread *td, struct proc *p)
{
int credentialchanged, error, grpsubset, i, uidsubset;
KASSERT(td == curthread, ("%s: td not curthread", __func__));
PROC_LOCK_ASSERT(p, MA_OWNED);
if (!unprivileged_proc_debug) {
error = priv_check(td, PRIV_DEBUG_UNPRIV);
if (error)
return (error);
}
if (td->td_proc == p)
return (0);
if ((error = prison_check(td->td_ucred, p->p_ucred)))
return (error);
#ifdef MAC
if ((error = mac_proc_check_debug(td->td_ucred, p)))
return (error);
#endif
if ((error = cr_seeotheruids(td->td_ucred, p->p_ucred)))
return (error);
if ((error = cr_seeothergids(td->td_ucred, p->p_ucred)))
return (error);
/*
* Is p's group set a subset of td's effective group set? This
* includes p's egid, group access list, rgid, and svgid.
*/
grpsubset = 1;
for (i = 0; i < p->p_ucred->cr_ngroups; i++) {
if (!groupmember(p->p_ucred->cr_groups[i], td->td_ucred)) {
grpsubset = 0;
break;
}
}
grpsubset = grpsubset &&
groupmember(p->p_ucred->cr_rgid, td->td_ucred) &&
groupmember(p->p_ucred->cr_svgid, td->td_ucred);
/*
* Are the uids present in p's credential equal to td's
* effective uid? This includes p's euid, svuid, and ruid.
*/
uidsubset = (td->td_ucred->cr_uid == p->p_ucred->cr_uid &&
td->td_ucred->cr_uid == p->p_ucred->cr_svuid &&
td->td_ucred->cr_uid == p->p_ucred->cr_ruid);
/*
* Has the credential of the process changed since the last exec()?
*/
credentialchanged = (p->p_flag & P_SUGID);
/*
* If p's gids aren't a subset, or the uids aren't a subset,
* or the credential has changed, require appropriate privilege
* for td to debug p.
*/
if (!grpsubset || !uidsubset) {
error = priv_check(td, PRIV_DEBUG_DIFFCRED);
if (error)
return (error);
}
if (credentialchanged) {
error = priv_check(td, PRIV_DEBUG_SUGID);
if (error)
return (error);
}
/* Can't trace init when securelevel > 0. */
if (p == initproc) {
error = securelevel_gt(td->td_ucred, 0);
if (error)
return (error);
}
/*
* Can't trace a process that's currently exec'ing.
*
* XXX: Note, this is not a security policy decision, it's a
* basic correctness/functionality decision. Therefore, this check
* should be moved to the callers of p_candebug().
*/
if ((p->p_flag & P_INEXEC) != 0)
return (EBUSY);
return (0);
}
/*-
* Determine whether the subject represented by cred can "see" a socket.
* Returns: 0 for permitted, ENOENT otherwise.
*/
int
cr_canseesocket(struct ucred *cred, struct socket *so)
{
int error;
error = prison_check(cred, so->so_cred);
if (error)
return (ENOENT);
#ifdef MAC
error = mac_socket_check_visible(cred, so);
if (error)
return (error);
#endif
if (cr_seeotheruids(cred, so->so_cred))
return (ENOENT);
if (cr_seeothergids(cred, so->so_cred))
return (ENOENT);
return (0);
}
#if defined(INET) || defined(INET6)
/*-
* Determine whether the subject represented by cred can "see" a socket.
* Returns: 0 for permitted, ENOENT otherwise.
*/
int
cr_canseeinpcb(struct ucred *cred, struct inpcb *inp)
{
int error;
error = prison_check(cred, inp->inp_cred);
if (error)
return (ENOENT);
#ifdef MAC
INP_LOCK_ASSERT(inp);
error = mac_inpcb_check_visible(cred, inp);
if (error)
return (error);
#endif
if (cr_seeotheruids(cred, inp->inp_cred))
return (ENOENT);
if (cr_seeothergids(cred, inp->inp_cred))
return (ENOENT);
return (0);
}
#endif
/*-
* Determine whether td can wait for the exit of p.
* Returns: 0 for permitted, an errno value otherwise
* Locks: Sufficient locks to protect various components of td and p
* must be held. td must be curthread, and a lock must
* be held for p.
* References: td and p must be valid for the lifetime of the call
*/
int
p_canwait(struct thread *td, struct proc *p)
{
int error;
KASSERT(td == curthread, ("%s: td not curthread", __func__));
PROC_LOCK_ASSERT(p, MA_OWNED);
if ((error = prison_check(td->td_ucred, p->p_ucred)))
return (error);
#ifdef MAC
if ((error = mac_proc_check_wait(td->td_ucred, p)))
return (error);
#endif
#if 0
/* XXXMAC: This could have odd effects on some shells. */
if ((error = cr_seeotheruids(td->td_ucred, p->p_ucred)))
return (error);
#endif
return (0);
}
/*
* Allocate a zeroed cred structure.
*/
struct ucred *
crget(void)
{
register struct ucred *cr;
cr = malloc(sizeof(*cr), M_CRED, M_WAITOK | M_ZERO);
refcount_init(&cr->cr_ref, 1);
#ifdef AUDIT
audit_cred_init(cr);
#endif
#ifdef MAC
mac_cred_init(cr);
#endif
crextend(cr, XU_NGROUPS);
return (cr);
}
/*
* Claim another reference to a ucred structure.
*/
struct ucred *
crhold(struct ucred *cr)
{
refcount_acquire(&cr->cr_ref);
return (cr);
}
/*
* Free a cred structure. Throws away space when ref count gets to 0.
*/
void
crfree(struct ucred *cr)
{
KASSERT(cr->cr_ref > 0, ("bad ucred refcount: %d", cr->cr_ref));
KASSERT(cr->cr_ref != 0xdeadc0de, ("dangling reference to ucred"));
if (refcount_release(&cr->cr_ref)) {
/*
* Some callers of crget(), such as nfs_statfs(),
* allocate a temporary credential, but don't
* allocate a uidinfo structure.
*/
if (cr->cr_uidinfo != NULL)
uifree(cr->cr_uidinfo);
if (cr->cr_ruidinfo != NULL)
uifree(cr->cr_ruidinfo);
/*
* Free a prison, if any.
*/
if (cr->cr_prison != NULL)
prison_free(cr->cr_prison);
if (cr->cr_loginclass != NULL)
loginclass_free(cr->cr_loginclass);
#ifdef AUDIT
audit_cred_destroy(cr);
#endif
#ifdef MAC
mac_cred_destroy(cr);
#endif
free(cr->cr_groups, M_CRED);
free(cr, M_CRED);
}
}
/*
* Check to see if this ucred is shared.
*/
int
crshared(struct ucred *cr)
{
return (cr->cr_ref > 1);
}
/*
* Copy a ucred's contents from a template. Does not block.
*/
void
crcopy(struct ucred *dest, struct ucred *src)
{
KASSERT(crshared(dest) == 0, ("crcopy of shared ucred"));
bcopy(&src->cr_startcopy, &dest->cr_startcopy,
(unsigned)((caddr_t)&src->cr_endcopy -
(caddr_t)&src->cr_startcopy));
crsetgroups(dest, src->cr_ngroups, src->cr_groups);
uihold(dest->cr_uidinfo);
uihold(dest->cr_ruidinfo);
prison_hold(dest->cr_prison);
loginclass_hold(dest->cr_loginclass);
#ifdef AUDIT
audit_cred_copy(src, dest);
#endif
#ifdef MAC
mac_cred_copy(src, dest);
#endif
}
/*
* Dup cred struct to a new held one.
*/
struct ucred *
crdup(struct ucred *cr)
{
struct ucred *newcr;
newcr = crget();
crcopy(newcr, cr);
return (newcr);
}
/*
* Fill in a struct xucred based on a struct ucred.
*/
void
cru2x(struct ucred *cr, struct xucred *xcr)
{
int ngroups;
bzero(xcr, sizeof(*xcr));
xcr->cr_version = XUCRED_VERSION;
xcr->cr_uid = cr->cr_uid;
ngroups = MIN(cr->cr_ngroups, XU_NGROUPS);
xcr->cr_ngroups = ngroups;
bcopy(cr->cr_groups, xcr->cr_groups,
ngroups * sizeof(*cr->cr_groups));
}
/*
* Small routine to swap a thread's current ucred for the correct one taken
* from the process.
*/
void
cred_update_thread(struct thread *td)
{
struct proc *p;
struct ucred *cred;
p = td->td_proc;
cred = td->td_ucred;
PROC_LOCK(p);
td->td_ucred = crhold(p->p_ucred);
PROC_UNLOCK(p);
if (cred != NULL)
crfree(cred);
}
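/*
* Copy the process credential into a preallocated one, growing the
* preallocated credential's group array first if required. The process
* lock is dropped around crextend(), which may sleep, and p_ucred is
* re-read afterwards, so the old credential returned is the one that
* was current when the copy was finally made.
*/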
struct ucred *
crcopysafe(struct proc *p, struct ucred *cr)
{
struct ucred *oldcred;
int groups;
PROC_LOCK_ASSERT(p, MA_OWNED);
oldcred = p->p_ucred;
while (cr->cr_agroups < oldcred->cr_agroups) {
groups = oldcred->cr_agroups;
PROC_UNLOCK(p);
crextend(cr, groups);
PROC_LOCK(p);
oldcred = p->p_ucred;
}
crcopy(cr, oldcred);
return (oldcred);
}
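/*
* Illustrative sketch, not part of this revision: the usual shape of a
* credential update built from the helpers above and below. A fresh
* ucred is allocated while unlocked, filled from the current one with
* crcopysafe(), modified, installed, and the old reference dropped.
* "example_uip" stands in for a uidinfo looked up by the caller and is
* hypothetical.
*/
#if 0	/* example only, never compiled */
static int
example_set_euid(struct proc *p, struct uidinfo *example_uip)
{
struct ucred *newcred, *oldcred;
newcred = crget();
PROC_LOCK(p);
oldcred = crcopysafe(p, newcred);
change_euid(newcred, example_uip);	/* hypothetical uidinfo */
setsugid(p);
p->p_ucred = newcred;
PROC_UNLOCK(p);
crfree(oldcred);
return (0);
}
#endif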
/*
* Extend the passed in credential to hold n items.
*/
static void
crextend(struct ucred *cr, int n)
{
int cnt;
/* Truncate? */
if (n <= cr->cr_agroups)
return;
/*
* We double the allocation each time, since we're using a power
* of two allocator, until we need enough groups to fill a page.
* Once we're allocating multiple pages, only allocate as many
* as we actually need. The case of processes needing a
* non-power-of-two number of pages seems more likely than a
* real-world process adding thousands of groups one at a time.
*/
if (n < PAGE_SIZE / sizeof(gid_t)) {
if (cr->cr_agroups == 0)
cnt = MINALLOCSIZE / sizeof(gid_t);
else
cnt = cr->cr_agroups * 2;
while (cnt < n)
cnt *= 2;
} else
cnt = roundup2(n, PAGE_SIZE / sizeof(gid_t));
/* Free the old array. */
if (cr->cr_groups)
free(cr->cr_groups, M_CRED);
cr->cr_groups = malloc(cnt * sizeof(gid_t), M_CRED, M_WAITOK | M_ZERO);
cr->cr_agroups = cnt;
}
/*
* Copy groups into a credential, preserving any necessary invariants.
* Currently this includes the sorting of all supplemental gids.
* crextend() must have been called beforehand to ensure sufficient
* space is available.
*/
static void
crsetgroups_locked(struct ucred *cr, int ngrp, gid_t *groups)
{
int i;
int j;
gid_t g;
KASSERT(cr->cr_agroups >= ngrp, ("cr_ngroups is too small"));
bcopy(groups, cr->cr_groups, ngrp * sizeof(gid_t));
cr->cr_ngroups = ngrp;
/*
* Sort all groups except cr_groups[0] to allow groupmember to
* perform a binary search.
*
* XXX: If large numbers of groups become common this should
* be replaced with shell sort like linux uses or possibly
* heap sort.
*/
for (i = 2; i < ngrp; i++) {
g = cr->cr_groups[i];
for (j = i-1; j >= 1 && g < cr->cr_groups[j]; j--)
cr->cr_groups[j + 1] = cr->cr_groups[j];
cr->cr_groups[j + 1] = g;
}
}
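/*
* Illustrative example, not part of this revision: given groups passed
* in as { 20, 5, 100, 3 }, the result is { 20, 3, 5, 100 } -- index 0
* (the effective gid) is left in place and only the supplemental gids
* behind it are sorted, which is what groupmember() relies on.
*/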
/*
* Copy groups into a credential after expanding it if required.
* Truncate the list to (ngroups_max + 1) if it is too large.
*/
void
crsetgroups(struct ucred *cr, int ngrp, gid_t *groups)
{
if (ngrp > ngroups_max + 1)
ngrp = ngroups_max + 1;
crextend(cr, ngrp);
crsetgroups_locked(cr, ngrp, groups);
}
/*
* Get login name, if available.
*/
#ifndef _SYS_SYSPROTO_H_
struct getlogin_args {
char *namebuf;
u_int namelen;
};
#endif
/* ARGSUSED */
int
-getlogin(struct thread *td, struct getlogin_args *uap)
+sys_getlogin(struct thread *td, struct getlogin_args *uap)
{
int error;
char login[MAXLOGNAME];
struct proc *p = td->td_proc;
if (uap->namelen > MAXLOGNAME)
uap->namelen = MAXLOGNAME;
PROC_LOCK(p);
SESS_LOCK(p->p_session);
bcopy(p->p_session->s_login, login, uap->namelen);
SESS_UNLOCK(p->p_session);
PROC_UNLOCK(p);
error = copyout(login, uap->namebuf, uap->namelen);
return(error);
}
/*
* Set login name.
*/
#ifndef _SYS_SYSPROTO_H_
struct setlogin_args {
char *namebuf;
};
#endif
/* ARGSUSED */
int
-setlogin(struct thread *td, struct setlogin_args *uap)
+sys_setlogin(struct thread *td, struct setlogin_args *uap)
{
struct proc *p = td->td_proc;
int error;
char logintmp[MAXLOGNAME];
error = priv_check(td, PRIV_PROC_SETLOGIN);
if (error)
return (error);
error = copyinstr(uap->namebuf, logintmp, sizeof(logintmp), NULL);
if (error == ENAMETOOLONG)
error = EINVAL;
else if (!error) {
PROC_LOCK(p);
SESS_LOCK(p->p_session);
(void) memcpy(p->p_session->s_login, logintmp,
sizeof(logintmp));
SESS_UNLOCK(p->p_session);
PROC_UNLOCK(p);
}
return (error);
}
void
setsugid(struct proc *p)
{
PROC_LOCK_ASSERT(p, MA_OWNED);
p->p_flag |= P_SUGID;
if (!(p->p_pfsflags & PF_ISUGID))
p->p_stops = 0;
}
/*-
* Change a process's effective uid.
* Side effects: newcred->cr_uid and newcred->cr_uidinfo will be modified.
* References: newcred must be an exclusive credential reference for the
* duration of the call.
*/
void
change_euid(struct ucred *newcred, struct uidinfo *euip)
{
newcred->cr_uid = euip->ui_uid;
uihold(euip);
uifree(newcred->cr_uidinfo);
newcred->cr_uidinfo = euip;
}
/*-
* Change a process's effective gid.
* Side effects: newcred->cr_gid will be modified.
* References: newcred must be an exclusive credential reference for the
* duration of the call.
*/
void
change_egid(struct ucred *newcred, gid_t egid)
{
newcred->cr_groups[0] = egid;
}
/*-
* Change a process's real uid.
* Side effects: newcred->cr_ruid will be updated, newcred->cr_ruidinfo
* will be updated, and the old and new cr_ruidinfo proc
* counts will be updated.
* References: newcred must be an exclusive credential reference for the
* duration of the call.
*/
void
change_ruid(struct ucred *newcred, struct uidinfo *ruip)
{
(void)chgproccnt(newcred->cr_ruidinfo, -1, 0);
newcred->cr_ruid = ruip->ui_uid;
uihold(ruip);
uifree(newcred->cr_ruidinfo);
newcred->cr_ruidinfo = ruip;
(void)chgproccnt(newcred->cr_ruidinfo, 1, 0);
}
/*-
* Change a process's real gid.
* Side effects: newcred->cr_rgid will be updated.
* References: newcred must be an exclusive credential reference for the
* duration of the call.
*/
void
change_rgid(struct ucred *newcred, gid_t rgid)
{
newcred->cr_rgid = rgid;
}
/*-
* Change a process's saved uid.
* Side effects: newcred->cr_svuid will be updated.
* References: newcred must be an exclusive credential reference for the
* duration of the call.
*/
void
change_svuid(struct ucred *newcred, uid_t svuid)
{
newcred->cr_svuid = svuid;
}
/*-
* Change a process's saved gid.
* Side effects: newcred->cr_svgid will be updated.
* References: newcred must be an exclusive credential reference for the
* duration of the call.
*/
void
change_svgid(struct ucred *newcred, gid_t svgid)
{
newcred->cr_svgid = svgid;
}
Index: head/sys/kern/kern_rctl.c
===================================================================
--- head/sys/kern/kern_rctl.c (revision 225616)
+++ head/sys/kern/kern_rctl.c (revision 225617)
@@ -1,1838 +1,1838 @@
/*-
* Copyright (c) 2010 The FreeBSD Foundation
* All rights reserved.
*
* This software was developed by Edward Tomasz Napierala under sponsorship
* from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/malloc.h>
#include <sys/queue.h>
#include <sys/refcount.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/loginclass.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/racct.h>
#include <sys/rctl.h>
#include <sys/resourcevar.h>
#include <sys/sx.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/taskqueue.h>
#include <sys/tree.h>
#include <vm/uma.h>
#ifdef RCTL
#ifndef RACCT
#error "The RCTL option requires the RACCT option"
#endif
FEATURE(rctl, "Resource Limits");
#define HRF_DEFAULT 0
#define HRF_DONT_INHERIT 1
#define HRF_DONT_ACCUMULATE 2
/* Default buffer size for rctl_get_rules(2). */
#define RCTL_DEFAULT_BUFSIZE 4096
#define RCTL_LOG_BUFSIZE 128
/*
* 'rctl_rule_link' connects a rule with every racct it's related to.
* For example, rule 'user:X:openfiles:deny=N/process' is linked
* with uidinfo for user X, and to each process of that user.
*/
struct rctl_rule_link {
LIST_ENTRY(rctl_rule_link) rrl_next;
struct rctl_rule *rrl_rule;
int rrl_exceeded;
};
struct dict {
const char *d_name;
int d_value;
};
static struct dict subjectnames[] = {
{ "process", RCTL_SUBJECT_TYPE_PROCESS },
{ "user", RCTL_SUBJECT_TYPE_USER },
{ "loginclass", RCTL_SUBJECT_TYPE_LOGINCLASS },
{ "jail", RCTL_SUBJECT_TYPE_JAIL },
{ NULL, -1 }};
static struct dict resourcenames[] = {
{ "cputime", RACCT_CPU },
{ "datasize", RACCT_DATA },
{ "stacksize", RACCT_STACK },
{ "coredumpsize", RACCT_CORE },
{ "memoryuse", RACCT_RSS },
{ "memorylocked", RACCT_MEMLOCK },
{ "maxproc", RACCT_NPROC },
{ "openfiles", RACCT_NOFILE },
{ "vmemoryuse", RACCT_VMEM },
{ "pseudoterminals", RACCT_NPTS },
{ "swapuse", RACCT_SWAP },
{ "nthr", RACCT_NTHR },
{ "msgqqueued", RACCT_MSGQQUEUED },
{ "msgqsize", RACCT_MSGQSIZE },
{ "nmsgq", RACCT_NMSGQ },
{ "nsem", RACCT_NSEM },
{ "nsemop", RACCT_NSEMOP },
{ "nshm", RACCT_NSHM },
{ "shmsize", RACCT_SHMSIZE },
{ "wallclock", RACCT_WALLCLOCK },
{ NULL, -1 }};
static struct dict actionnames[] = {
{ "sighup", RCTL_ACTION_SIGHUP },
{ "sigint", RCTL_ACTION_SIGINT },
{ "sigquit", RCTL_ACTION_SIGQUIT },
{ "sigill", RCTL_ACTION_SIGILL },
{ "sigtrap", RCTL_ACTION_SIGTRAP },
{ "sigabrt", RCTL_ACTION_SIGABRT },
{ "sigemt", RCTL_ACTION_SIGEMT },
{ "sigfpe", RCTL_ACTION_SIGFPE },
{ "sigkill", RCTL_ACTION_SIGKILL },
{ "sigbus", RCTL_ACTION_SIGBUS },
{ "sigsegv", RCTL_ACTION_SIGSEGV },
{ "sigsys", RCTL_ACTION_SIGSYS },
{ "sigpipe", RCTL_ACTION_SIGPIPE },
{ "sigalrm", RCTL_ACTION_SIGALRM },
{ "sigterm", RCTL_ACTION_SIGTERM },
{ "sigurg", RCTL_ACTION_SIGURG },
{ "sigstop", RCTL_ACTION_SIGSTOP },
{ "sigtstp", RCTL_ACTION_SIGTSTP },
{ "sigchld", RCTL_ACTION_SIGCHLD },
{ "sigttin", RCTL_ACTION_SIGTTIN },
{ "sigttou", RCTL_ACTION_SIGTTOU },
{ "sigio", RCTL_ACTION_SIGIO },
{ "sigxcpu", RCTL_ACTION_SIGXCPU },
{ "sigxfsz", RCTL_ACTION_SIGXFSZ },
{ "sigvtalrm", RCTL_ACTION_SIGVTALRM },
{ "sigprof", RCTL_ACTION_SIGPROF },
{ "sigwinch", RCTL_ACTION_SIGWINCH },
{ "siginfo", RCTL_ACTION_SIGINFO },
{ "sigusr1", RCTL_ACTION_SIGUSR1 },
{ "sigusr2", RCTL_ACTION_SIGUSR2 },
{ "sigthr", RCTL_ACTION_SIGTHR },
{ "deny", RCTL_ACTION_DENY },
{ "log", RCTL_ACTION_LOG },
{ "devctl", RCTL_ACTION_DEVCTL },
{ NULL, -1 }};
static void rctl_init(void);
SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL);
static uma_zone_t rctl_rule_link_zone;
static uma_zone_t rctl_rule_zone;
static struct rwlock rctl_lock;
RW_SYSINIT(rctl_lock, &rctl_lock, "RCTL lock");
static int rctl_rule_fully_specified(const struct rctl_rule *rule);
static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule);
MALLOC_DEFINE(M_RCTL, "rctl", "Resource Limits");
static const char *
rctl_subject_type_name(int subject)
{
int i;
for (i = 0; subjectnames[i].d_name != NULL; i++) {
if (subjectnames[i].d_value == subject)
return (subjectnames[i].d_name);
}
panic("rctl_subject_type_name: unknown subject type %d", subject);
}
static const char *
rctl_action_name(int action)
{
int i;
for (i = 0; actionnames[i].d_name != NULL; i++) {
if (actionnames[i].d_value == action)
return (actionnames[i].d_name);
}
panic("rctl_action_name: unknown action %d", action);
}
const char *
rctl_resource_name(int resource)
{
int i;
for (i = 0; resourcenames[i].d_name != NULL; i++) {
if (resourcenames[i].d_value == resource)
return (resourcenames[i].d_name);
}
panic("rctl_resource_name: unknown resource %d", resource);
}
/*
* Return the amount of resource that can be allocated by 'p' before
* hitting 'rule'.
*/
static int64_t
rctl_available_resource(const struct proc *p, const struct rctl_rule *rule)
{
int resource;
int64_t available = INT64_MAX;
struct ucred *cred = p->p_ucred;
rw_assert(&rctl_lock, RA_LOCKED);
resource = rule->rr_resource;
switch (rule->rr_per) {
case RCTL_SUBJECT_TYPE_PROCESS:
available = rule->rr_amount -
p->p_racct->r_resources[resource];
break;
case RCTL_SUBJECT_TYPE_USER:
available = rule->rr_amount -
cred->cr_ruidinfo->ui_racct->r_resources[resource];
break;
case RCTL_SUBJECT_TYPE_LOGINCLASS:
available = rule->rr_amount -
cred->cr_loginclass->lc_racct->r_resources[resource];
break;
case RCTL_SUBJECT_TYPE_JAIL:
available = rule->rr_amount -
cred->cr_prison->pr_prison_racct->prr_racct->
r_resources[resource];
break;
default:
panic("rctl_compute_available: unknown per %d",
rule->rr_per);
}
return (available);
}
/*
* Return non-zero if allocating 'amount' by proc 'p' would exceed
* resource limit specified by 'rule'.
*/
static int
rctl_would_exceed(const struct proc *p, const struct rctl_rule *rule,
int64_t amount)
{
int64_t available;
rw_assert(&rctl_lock, RA_LOCKED);
available = rctl_available_resource(p, rule);
if (available >= amount)
return (0);
return (1);
}
/*
* Check whether the proc 'p' can allocate 'amount' of 'resource' in addition
* to what it currently has allocated. Returns non-zero if the allocation should
* be denied, 0 otherwise.
*/
int
rctl_enforce(struct proc *p, int resource, uint64_t amount)
{
struct rctl_rule *rule;
struct rctl_rule_link *link;
struct sbuf sb;
int should_deny = 0;
char *buf;
static int curtime = 0;
static struct timeval lasttime;
rw_rlock(&rctl_lock);
/*
* There may be more than one matching rule; go through all of them.
* Denial should be done last, after logging and sending signals.
*/
LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
rule = link->rrl_rule;
if (rule->rr_resource != resource)
continue;
if (!rctl_would_exceed(p, rule, amount)) {
link->rrl_exceeded = 0;
continue;
}
switch (rule->rr_action) {
case RCTL_ACTION_DENY:
should_deny = 1;
continue;
case RCTL_ACTION_LOG:
/*
* If rrl_exceeded != 0, it means we've already
* logged a warning for this process.
*/
if (link->rrl_exceeded != 0)
continue;
if (!ppsratecheck(&lasttime, &curtime, 10))
continue;
buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
if (buf == NULL) {
printf("rctl_enforce: out of memory\n");
continue;
}
sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
rctl_rule_to_sbuf(&sb, rule);
sbuf_finish(&sb);
printf("rctl: rule \"%s\" matched by pid %d "
"(%s), uid %d, jail %s\n", sbuf_data(&sb),
p->p_pid, p->p_comm, p->p_ucred->cr_uid,
p->p_ucred->cr_prison->pr_prison_racct->prr_name);
sbuf_delete(&sb);
free(buf, M_RCTL);
link->rrl_exceeded = 1;
continue;
case RCTL_ACTION_DEVCTL:
if (link->rrl_exceeded != 0)
continue;
buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
if (buf == NULL) {
printf("rctl_enforce: out of memory\n");
continue;
}
sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
sbuf_printf(&sb, "rule=");
rctl_rule_to_sbuf(&sb, rule);
sbuf_printf(&sb, " pid=%d ruid=%d jail=%s",
p->p_pid, p->p_ucred->cr_ruid,
p->p_ucred->cr_prison->pr_prison_racct->prr_name);
sbuf_finish(&sb);
devctl_notify_f("RCTL", "rule", "matched",
sbuf_data(&sb), M_NOWAIT);
sbuf_delete(&sb);
free(buf, M_RCTL);
link->rrl_exceeded = 1;
continue;
default:
if (link->rrl_exceeded != 0)
continue;
KASSERT(rule->rr_action > 0 &&
rule->rr_action <= RCTL_ACTION_SIGNAL_MAX,
("rctl_enforce: unknown action %d",
rule->rr_action));
/*
* We're supposed to send a signal, but the process
* is not fully initialized yet, probably because we
* got called from fork1(). For now just deny the
* allocation instead.
*/
if (p->p_state != PRS_NORMAL) {
should_deny = 1;
continue;
}
/*
* We're using the fact that RCTL_ACTION_SIG* values
* are equal to their counterparts from sys/signal.h.
*/
- psignal(p, rule->rr_action);
+ kern_psignal(p, rule->rr_action);
link->rrl_exceeded = 1;
continue;
}
}
rw_runlock(&rctl_lock);
if (should_deny) {
/*
* Return a fake error code; the caller should change it
* into one proper for the situation -- EFBIG, ENOMEM, etc.
*/
return (EDOOFUS);
}
return (0);
}
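/*
* Illustrative sketch, not part of this revision: how a hypothetical
* caller might consume rctl_enforce(), translating the EDOOFUS
* placeholder into an errno appropriate for the operation being
* limited. The RACCT_NOFILE / EMFILE pairing is only an example.
*/
#if 0	/* example only, never compiled */
static int
example_openfiles_check(struct proc *p)
{
if (rctl_enforce(p, RACCT_NOFILE, 1) != 0)
return (EMFILE);	/* pick an errno that fits the caller */
return (0);
}
#endif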
uint64_t
rctl_get_limit(struct proc *p, int resource)
{
struct rctl_rule *rule;
struct rctl_rule_link *link;
uint64_t amount = UINT64_MAX;
rw_rlock(&rctl_lock);
/*
* There may be more than one matching rule; go through all of them.
* Denial should be done last, after logging and sending signals.
*/
LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
rule = link->rrl_rule;
if (rule->rr_resource != resource)
continue;
if (rule->rr_action != RCTL_ACTION_DENY)
continue;
if (rule->rr_amount < amount)
amount = rule->rr_amount;
}
rw_runlock(&rctl_lock);
return (amount);
}
uint64_t
rctl_get_available(struct proc *p, int resource)
{
struct rctl_rule *rule;
struct rctl_rule_link *link;
int64_t available, minavailable, allocated;
minavailable = INT64_MAX;
rw_rlock(&rctl_lock);
/*
* There may be more than one matching rule; go through all of them.
* Denial should be done last, after logging and sending signals.
*/
LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
rule = link->rrl_rule;
if (rule->rr_resource != resource)
continue;
if (rule->rr_action != RCTL_ACTION_DENY)
continue;
available = rctl_available_resource(p, rule);
if (available < minavailable)
minavailable = available;
}
rw_runlock(&rctl_lock);
/*
* XXX: Think about this _hard_.
*/
allocated = p->p_racct->r_resources[resource];
if (minavailable < INT64_MAX - allocated)
minavailable += allocated;
if (minavailable < 0)
minavailable = 0;
return (minavailable);
}
static int
rctl_rule_matches(const struct rctl_rule *rule, const struct rctl_rule *filter)
{
if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) {
if (rule->rr_subject_type != filter->rr_subject_type)
return (0);
switch (filter->rr_subject_type) {
case RCTL_SUBJECT_TYPE_PROCESS:
if (filter->rr_subject.rs_proc != NULL &&
rule->rr_subject.rs_proc !=
filter->rr_subject.rs_proc)
return (0);
break;
case RCTL_SUBJECT_TYPE_USER:
if (filter->rr_subject.rs_uip != NULL &&
rule->rr_subject.rs_uip !=
filter->rr_subject.rs_uip)
return (0);
break;
case RCTL_SUBJECT_TYPE_LOGINCLASS:
if (filter->rr_subject.rs_loginclass != NULL &&
rule->rr_subject.rs_loginclass !=
filter->rr_subject.rs_loginclass)
return (0);
break;
case RCTL_SUBJECT_TYPE_JAIL:
if (filter->rr_subject.rs_prison_racct != NULL &&
rule->rr_subject.rs_prison_racct !=
filter->rr_subject.rs_prison_racct)
return (0);
break;
default:
panic("rctl_rule_matches: unknown subject type %d",
filter->rr_subject_type);
}
}
if (filter->rr_resource != RACCT_UNDEFINED) {
if (rule->rr_resource != filter->rr_resource)
return (0);
}
if (filter->rr_action != RCTL_ACTION_UNDEFINED) {
if (rule->rr_action != filter->rr_action)
return (0);
}
if (filter->rr_amount != RCTL_AMOUNT_UNDEFINED) {
if (rule->rr_amount != filter->rr_amount)
return (0);
}
if (filter->rr_per != RCTL_SUBJECT_TYPE_UNDEFINED) {
if (rule->rr_per != filter->rr_per)
return (0);
}
return (1);
}
static int
str2value(const char *str, int *value, struct dict *table)
{
int i;
if (value == NULL)
return (EINVAL);
for (i = 0; table[i].d_name != NULL; i++) {
if (strcasecmp(table[i].d_name, str) == 0) {
*value = table[i].d_value;
return (0);
}
}
return (EINVAL);
}
static int
str2id(const char *str, id_t *value)
{
char *end;
if (str == NULL)
return (EINVAL);
*value = strtoul(str, &end, 10);
if ((size_t)(end - str) != strlen(str))
return (EINVAL);
return (0);
}
static int
str2int64(const char *str, int64_t *value)
{
char *end;
if (str == NULL)
return (EINVAL);
*value = strtoul(str, &end, 10);
if ((size_t)(end - str) != strlen(str))
return (EINVAL);
return (0);
}
/*
* Connect the rule to the racct, increasing refcount for the rule.
*/
static void
rctl_racct_add_rule(struct racct *racct, struct rctl_rule *rule)
{
struct rctl_rule_link *link;
KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
rctl_rule_acquire(rule);
link = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
link->rrl_rule = rule;
link->rrl_exceeded = 0;
rw_wlock(&rctl_lock);
LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
rw_wunlock(&rctl_lock);
}
static int
rctl_racct_add_rule_locked(struct racct *racct, struct rctl_rule *rule)
{
struct rctl_rule_link *link;
KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
rw_assert(&rctl_lock, RA_WLOCKED);
link = uma_zalloc(rctl_rule_link_zone, M_NOWAIT);
if (link == NULL)
return (ENOMEM);
rctl_rule_acquire(rule);
link->rrl_rule = rule;
link->rrl_exceeded = 0;
LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
return (0);
}
/*
* Remove limits for all rules matching the filter and release
* the refcounts for the rules, possibly freeing them. Returns
* the number of limit structures removed.
*/
static int
rctl_racct_remove_rules(struct racct *racct,
const struct rctl_rule *filter)
{
int removed = 0;
struct rctl_rule_link *link, *linktmp;
rw_assert(&rctl_lock, RA_WLOCKED);
LIST_FOREACH_SAFE(link, &racct->r_rule_links, rrl_next, linktmp) {
if (!rctl_rule_matches(link->rrl_rule, filter))
continue;
LIST_REMOVE(link, rrl_next);
rctl_rule_release(link->rrl_rule);
uma_zfree(rctl_rule_link_zone, link);
removed++;
}
return (removed);
}
static void
rctl_rule_acquire_subject(struct rctl_rule *rule)
{
switch (rule->rr_subject_type) {
case RCTL_SUBJECT_TYPE_UNDEFINED:
case RCTL_SUBJECT_TYPE_PROCESS:
break;
case RCTL_SUBJECT_TYPE_JAIL:
if (rule->rr_subject.rs_prison_racct != NULL)
prison_racct_hold(rule->rr_subject.rs_prison_racct);
break;
case RCTL_SUBJECT_TYPE_USER:
if (rule->rr_subject.rs_uip != NULL)
uihold(rule->rr_subject.rs_uip);
break;
case RCTL_SUBJECT_TYPE_LOGINCLASS:
if (rule->rr_subject.rs_loginclass != NULL)
loginclass_hold(rule->rr_subject.rs_loginclass);
break;
default:
panic("rctl_rule_acquire_subject: unknown subject type %d",
rule->rr_subject_type);
}
}
static void
rctl_rule_release_subject(struct rctl_rule *rule)
{
switch (rule->rr_subject_type) {
case RCTL_SUBJECT_TYPE_UNDEFINED:
case RCTL_SUBJECT_TYPE_PROCESS:
break;
case RCTL_SUBJECT_TYPE_JAIL:
if (rule->rr_subject.rs_prison_racct != NULL)
prison_racct_free(rule->rr_subject.rs_prison_racct);
break;
case RCTL_SUBJECT_TYPE_USER:
if (rule->rr_subject.rs_uip != NULL)
uifree(rule->rr_subject.rs_uip);
break;
case RCTL_SUBJECT_TYPE_LOGINCLASS:
if (rule->rr_subject.rs_loginclass != NULL)
loginclass_free(rule->rr_subject.rs_loginclass);
break;
default:
panic("rctl_rule_release_subject: unknown subject type %d",
rule->rr_subject_type);
}
}
struct rctl_rule *
rctl_rule_alloc(int flags)
{
struct rctl_rule *rule;
rule = uma_zalloc(rctl_rule_zone, flags);
if (rule == NULL)
return (NULL);
rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
rule->rr_subject.rs_proc = NULL;
rule->rr_subject.rs_uip = NULL;
rule->rr_subject.rs_loginclass = NULL;
rule->rr_subject.rs_prison_racct = NULL;
rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
rule->rr_resource = RACCT_UNDEFINED;
rule->rr_action = RCTL_ACTION_UNDEFINED;
rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
refcount_init(&rule->rr_refcount, 1);
return (rule);
}
struct rctl_rule *
rctl_rule_duplicate(const struct rctl_rule *rule, int flags)
{
struct rctl_rule *copy;
copy = uma_zalloc(rctl_rule_zone, flags);
if (copy == NULL)
return (NULL);
copy->rr_subject_type = rule->rr_subject_type;
copy->rr_subject.rs_proc = rule->rr_subject.rs_proc;
copy->rr_subject.rs_uip = rule->rr_subject.rs_uip;
copy->rr_subject.rs_loginclass = rule->rr_subject.rs_loginclass;
copy->rr_subject.rs_prison_racct = rule->rr_subject.rs_prison_racct;
copy->rr_per = rule->rr_per;
copy->rr_resource = rule->rr_resource;
copy->rr_action = rule->rr_action;
copy->rr_amount = rule->rr_amount;
refcount_init(&copy->rr_refcount, 1);
rctl_rule_acquire_subject(copy);
return (copy);
}
void
rctl_rule_acquire(struct rctl_rule *rule)
{
KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
refcount_acquire(&rule->rr_refcount);
}
static void
rctl_rule_free(void *context, int pending)
{
struct rctl_rule *rule;
rule = (struct rctl_rule *)context;
KASSERT(rule->rr_refcount == 0, ("rule->rr_refcount != 0"));
/*
* We don't need locking here; rule is guaranteed to be inaccessible.
*/
rctl_rule_release_subject(rule);
uma_zfree(rctl_rule_zone, rule);
}
void
rctl_rule_release(struct rctl_rule *rule)
{
KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
if (refcount_release(&rule->rr_refcount)) {
/*
* rctl_rule_release() is often called when iterating
* over all the uidinfo structures in the system,
* holding uihashtbl_lock. Since rctl_rule_free()
* might end up calling uifree(), this would lead
* to lock recursion. Use taskqueue to avoid this.
*/
TASK_INIT(&rule->rr_task, 0, rctl_rule_free, rule);
taskqueue_enqueue(taskqueue_thread, &rule->rr_task);
}
}
static int
rctl_rule_fully_specified(const struct rctl_rule *rule)
{
switch (rule->rr_subject_type) {
case RCTL_SUBJECT_TYPE_UNDEFINED:
return (0);
case RCTL_SUBJECT_TYPE_PROCESS:
if (rule->rr_subject.rs_proc == NULL)
return (0);
break;
case RCTL_SUBJECT_TYPE_USER:
if (rule->rr_subject.rs_uip == NULL)
return (0);
break;
case RCTL_SUBJECT_TYPE_LOGINCLASS:
if (rule->rr_subject.rs_loginclass == NULL)
return (0);
break;
case RCTL_SUBJECT_TYPE_JAIL:
if (rule->rr_subject.rs_prison_racct == NULL)
return (0);
break;
default:
panic("rctl_rule_fully_specified: unknown subject type %d",
rule->rr_subject_type);
}
if (rule->rr_resource == RACCT_UNDEFINED)
return (0);
if (rule->rr_action == RCTL_ACTION_UNDEFINED)
return (0);
if (rule->rr_amount == RCTL_AMOUNT_UNDEFINED)
return (0);
if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED)
return (0);
return (1);
}
static int
rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep)
{
int error = 0;
char *subjectstr, *subject_idstr, *resourcestr, *actionstr,
*amountstr, *perstr;
struct rctl_rule *rule;
id_t id;
rule = rctl_rule_alloc(M_WAITOK);
subjectstr = strsep(&rulestr, ":");
subject_idstr = strsep(&rulestr, ":");
resourcestr = strsep(&rulestr, ":");
actionstr = strsep(&rulestr, "=/");
amountstr = strsep(&rulestr, "/");
perstr = rulestr;
if (subjectstr == NULL || subjectstr[0] == '\0')
rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
else {
error = str2value(subjectstr, &rule->rr_subject_type, subjectnames);
if (error != 0)
goto out;
}
if (subject_idstr == NULL || subject_idstr[0] == '\0') {
rule->rr_subject.rs_proc = NULL;
rule->rr_subject.rs_uip = NULL;
rule->rr_subject.rs_loginclass = NULL;
rule->rr_subject.rs_prison_racct = NULL;
} else {
switch (rule->rr_subject_type) {
case RCTL_SUBJECT_TYPE_UNDEFINED:
error = EINVAL;
goto out;
case RCTL_SUBJECT_TYPE_PROCESS:
error = str2id(subject_idstr, &id);
if (error != 0)
goto out;
sx_assert(&allproc_lock, SA_LOCKED);
rule->rr_subject.rs_proc = pfind(id);
if (rule->rr_subject.rs_proc == NULL) {
error = ESRCH;
goto out;
}
PROC_UNLOCK(rule->rr_subject.rs_proc);
break;
case RCTL_SUBJECT_TYPE_USER:
error = str2id(subject_idstr, &id);
if (error != 0)
goto out;
rule->rr_subject.rs_uip = uifind(id);
break;
case RCTL_SUBJECT_TYPE_LOGINCLASS:
rule->rr_subject.rs_loginclass =
loginclass_find(subject_idstr);
if (rule->rr_subject.rs_loginclass == NULL) {
error = ENAMETOOLONG;
goto out;
}
break;
case RCTL_SUBJECT_TYPE_JAIL:
rule->rr_subject.rs_prison_racct =
prison_racct_find(subject_idstr);
if (rule->rr_subject.rs_prison_racct == NULL) {
error = ENAMETOOLONG;
goto out;
}
break;
default:
panic("rctl_string_to_rule: unknown subject type %d",
rule->rr_subject_type);
}
}
if (resourcestr == NULL || resourcestr[0] == '\0')
rule->rr_resource = RACCT_UNDEFINED;
else {
error = str2value(resourcestr, &rule->rr_resource,
resourcenames);
if (error != 0)
goto out;
}
if (actionstr == NULL || actionstr[0] == '\0')
rule->rr_action = RCTL_ACTION_UNDEFINED;
else {
error = str2value(actionstr, &rule->rr_action, actionnames);
if (error != 0)
goto out;
}
if (amountstr == NULL || amountstr[0] == '\0')
rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
else {
error = str2int64(amountstr, &rule->rr_amount);
if (error != 0)
goto out;
if (RACCT_IS_IN_MILLIONS(rule->rr_resource))
rule->rr_amount *= 1000000;
}
if (perstr == NULL || perstr[0] == '\0')
rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
else {
error = str2value(perstr, &rule->rr_per, subjectnames);
if (error != 0)
goto out;
}
out:
if (error == 0)
*rulep = rule;
else
rctl_rule_release(rule);
return (error);
}
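/*
* For reference (illustrative, not part of this revision): the string
* parsed above has the form "subject:subject-id:resource:action=amount/per",
* for example:
*
*	user:1001:vmemoryuse:deny=1073741824/user
*	process:712:openfiles:deny=128
*	loginclass:users:maxproc:log=200/loginclass
*
* Empty components are left undefined, which is how partial filters for
* rctl_rule_remove() and the "get" syscalls below are expressed.
*/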
/*
* Link a rule with all the subjects it applies to.
*/
int
rctl_rule_add(struct rctl_rule *rule)
{
struct proc *p;
struct ucred *cred;
struct uidinfo *uip;
struct prison *pr;
struct prison_racct *prr;
struct loginclass *lc;
struct rctl_rule *rule2;
int match;
KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
/*
* Some rules just don't make sense. Note that the one below
* cannot be rewritten using RACCT_IS_DENIABLE(); RACCT_PCTCPU,
* for example, is not deniable in the racct sense, but its
* limit is enforced in a different way, so "deny" rules for %CPU
* do make sense.
*/
if (rule->rr_action == RCTL_ACTION_DENY &&
(rule->rr_resource == RACCT_CPU ||
rule->rr_resource == RACCT_WALLCLOCK))
return (EOPNOTSUPP);
if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS &&
RACCT_IS_SLOPPY(rule->rr_resource))
return (EOPNOTSUPP);
/*
* Make sure there are no duplicated rules. Also, for the "deny"
* rules, remove ones differing only by "amount".
*/
if (rule->rr_action == RCTL_ACTION_DENY) {
rule2 = rctl_rule_duplicate(rule, M_WAITOK);
rule2->rr_amount = RCTL_AMOUNT_UNDEFINED;
rctl_rule_remove(rule2);
rctl_rule_release(rule2);
} else
rctl_rule_remove(rule);
switch (rule->rr_subject_type) {
case RCTL_SUBJECT_TYPE_PROCESS:
p = rule->rr_subject.rs_proc;
KASSERT(p != NULL, ("rctl_rule_add: NULL proc"));
/*
* No resource limits for system processes.
*/
if (p->p_flag & P_SYSTEM)
return (EPERM);
rctl_racct_add_rule(p->p_racct, rule);
/*
* In case of per-process rule, we don't have anything more
* to do.
*/
return (0);
case RCTL_SUBJECT_TYPE_USER:
uip = rule->rr_subject.rs_uip;
KASSERT(uip != NULL, ("rctl_rule_add: NULL uip"));
rctl_racct_add_rule(uip->ui_racct, rule);
break;
case RCTL_SUBJECT_TYPE_LOGINCLASS:
lc = rule->rr_subject.rs_loginclass;
KASSERT(lc != NULL, ("rctl_rule_add: NULL loginclass"));
rctl_racct_add_rule(lc->lc_racct, rule);
break;
case RCTL_SUBJECT_TYPE_JAIL:
prr = rule->rr_subject.rs_prison_racct;
KASSERT(prr != NULL, ("rctl_rule_add: NULL pr"));
rctl_racct_add_rule(prr->prr_racct, rule);
break;
default:
panic("rctl_rule_add: unknown subject type %d",
rule->rr_subject_type);
}
/*
* Now go through all the processes and add the new rule to the ones
* it applies to.
*/
sx_assert(&allproc_lock, SA_LOCKED);
FOREACH_PROC_IN_SYSTEM(p) {
if (p->p_flag & P_SYSTEM)
continue;
cred = p->p_ucred;
switch (rule->rr_subject_type) {
case RCTL_SUBJECT_TYPE_USER:
if (cred->cr_uidinfo == rule->rr_subject.rs_uip ||
cred->cr_ruidinfo == rule->rr_subject.rs_uip)
break;
continue;
case RCTL_SUBJECT_TYPE_LOGINCLASS:
if (cred->cr_loginclass == rule->rr_subject.rs_loginclass)
break;
continue;
case RCTL_SUBJECT_TYPE_JAIL:
match = 0;
for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) {
if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) {
match = 1;
break;
}
}
if (match)
break;
continue;
default:
panic("rctl_rule_add: unknown subject type %d",
rule->rr_subject_type);
}
rctl_racct_add_rule(p->p_racct, rule);
}
return (0);
}
static void
rctl_rule_remove_callback(struct racct *racct, void *arg2, void *arg3)
{
struct rctl_rule *filter = (struct rctl_rule *)arg2;
int found = 0;
rw_wlock(&rctl_lock);
found += rctl_racct_remove_rules(racct, filter);
rw_wunlock(&rctl_lock);
*((int *)arg3) += found;
}
/*
* Remove all rules that match the filter.
*/
int
rctl_rule_remove(struct rctl_rule *filter)
{
int found = 0;
struct proc *p;
if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS &&
filter->rr_subject.rs_proc != NULL) {
p = filter->rr_subject.rs_proc;
rw_wlock(&rctl_lock);
found = rctl_racct_remove_rules(p->p_racct, filter);
rw_wunlock(&rctl_lock);
if (found)
return (0);
return (ESRCH);
}
loginclass_racct_foreach(rctl_rule_remove_callback, filter,
(void *)&found);
ui_racct_foreach(rctl_rule_remove_callback, filter,
(void *)&found);
prison_racct_foreach(rctl_rule_remove_callback, filter,
(void *)&found);
sx_assert(&allproc_lock, SA_LOCKED);
rw_wlock(&rctl_lock);
FOREACH_PROC_IN_SYSTEM(p) {
found += rctl_racct_remove_rules(p->p_racct, filter);
}
rw_wunlock(&rctl_lock);
if (found)
return (0);
return (ESRCH);
}
/*
* Appends a rule to the sbuf.
*/
static void
rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule)
{
int64_t amount;
sbuf_printf(sb, "%s:", rctl_subject_type_name(rule->rr_subject_type));
switch (rule->rr_subject_type) {
case RCTL_SUBJECT_TYPE_PROCESS:
if (rule->rr_subject.rs_proc == NULL)
sbuf_printf(sb, ":");
else
sbuf_printf(sb, "%d:",
rule->rr_subject.rs_proc->p_pid);
break;
case RCTL_SUBJECT_TYPE_USER:
if (rule->rr_subject.rs_uip == NULL)
sbuf_printf(sb, ":");
else
sbuf_printf(sb, "%d:",
rule->rr_subject.rs_uip->ui_uid);
break;
case RCTL_SUBJECT_TYPE_LOGINCLASS:
if (rule->rr_subject.rs_loginclass == NULL)
sbuf_printf(sb, ":");
else
sbuf_printf(sb, "%s:",
rule->rr_subject.rs_loginclass->lc_name);
break;
case RCTL_SUBJECT_TYPE_JAIL:
if (rule->rr_subject.rs_prison_racct == NULL)
sbuf_printf(sb, ":");
else
sbuf_printf(sb, "%s:",
rule->rr_subject.rs_prison_racct->prr_name);
break;
default:
panic("rctl_rule_to_sbuf: unknown subject type %d",
rule->rr_subject_type);
}
amount = rule->rr_amount;
if (amount != RCTL_AMOUNT_UNDEFINED &&
RACCT_IS_IN_MILLIONS(rule->rr_resource))
amount /= 1000000;
sbuf_printf(sb, "%s:%s=%jd",
rctl_resource_name(rule->rr_resource),
rctl_action_name(rule->rr_action),
amount);
if (rule->rr_per != rule->rr_subject_type)
sbuf_printf(sb, "/%s", rctl_subject_type_name(rule->rr_per));
}
/*
* Routine used by RCTL syscalls to read in input string.
*/
static int
rctl_read_inbuf(char **inputstr, const char *inbufp, size_t inbuflen)
{
int error;
char *str;
if (inbuflen <= 0)
return (EINVAL);
str = malloc(inbuflen + 1, M_RCTL, M_WAITOK);
error = copyinstr(inbufp, str, inbuflen, NULL);
if (error != 0) {
free(str, M_RCTL);
return (error);
}
*inputstr = str;
return (0);
}
/*
* Routine used by RCTL syscalls to write out output string.
*/
static int
rctl_write_outbuf(struct sbuf *outputsbuf, char *outbufp, size_t outbuflen)
{
int error;
if (outputsbuf == NULL)
return (0);
sbuf_finish(outputsbuf);
if (outbuflen < sbuf_len(outputsbuf) + 1) {
sbuf_delete(outputsbuf);
return (ERANGE);
}
error = copyout(sbuf_data(outputsbuf), outbufp,
sbuf_len(outputsbuf) + 1);
sbuf_delete(outputsbuf);
return (error);
}
static struct sbuf *
rctl_racct_to_sbuf(struct racct *racct, int sloppy)
{
int i;
int64_t amount;
struct sbuf *sb;
sb = sbuf_new_auto();
for (i = 0; i <= RACCT_MAX; i++) {
if (sloppy == 0 && RACCT_IS_SLOPPY(i))
continue;
amount = racct->r_resources[i];
if (RACCT_IS_IN_MILLIONS(i))
amount /= 1000000;
sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount);
}
sbuf_setpos(sb, sbuf_len(sb) - 1);
return (sb);
}
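/*
* Illustrative output of the routine above (not from this revision):
* a single comma-separated line of "resource=amount" pairs, e.g.
*
*	cputime=0,datasize=2891776,stacksize=131072,coredumpsize=0,...
*
* with per-million resources scaled back down and the trailing comma
* dropped by the sbuf_setpos() call.
*/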
int
-rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
+sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
{
int error;
char *inputstr;
struct rctl_rule *filter;
struct sbuf *outputsbuf = NULL;
struct proc *p;
struct uidinfo *uip;
struct loginclass *lc;
struct prison_racct *prr;
error = priv_check(td, PRIV_RCTL_GET_RACCT);
if (error != 0)
return (error);
error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
if (error != 0)
return (error);
sx_slock(&allproc_lock);
error = rctl_string_to_rule(inputstr, &filter);
free(inputstr, M_RCTL);
if (error != 0) {
sx_sunlock(&allproc_lock);
return (error);
}
switch (filter->rr_subject_type) {
case RCTL_SUBJECT_TYPE_PROCESS:
p = filter->rr_subject.rs_proc;
if (p == NULL) {
error = EINVAL;
goto out;
}
if (p->p_flag & P_SYSTEM) {
error = EINVAL;
goto out;
}
outputsbuf = rctl_racct_to_sbuf(p->p_racct, 0);
break;
case RCTL_SUBJECT_TYPE_USER:
uip = filter->rr_subject.rs_uip;
if (uip == NULL) {
error = EINVAL;
goto out;
}
outputsbuf = rctl_racct_to_sbuf(uip->ui_racct, 1);
break;
case RCTL_SUBJECT_TYPE_LOGINCLASS:
lc = filter->rr_subject.rs_loginclass;
if (lc == NULL) {
error = EINVAL;
goto out;
}
outputsbuf = rctl_racct_to_sbuf(lc->lc_racct, 1);
break;
case RCTL_SUBJECT_TYPE_JAIL:
prr = filter->rr_subject.rs_prison_racct;
if (prr == NULL) {
error = EINVAL;
goto out;
}
outputsbuf = rctl_racct_to_sbuf(prr->prr_racct, 1);
break;
default:
error = EINVAL;
}
out:
rctl_rule_release(filter);
sx_sunlock(&allproc_lock);
if (error != 0)
return (error);
error = rctl_write_outbuf(outputsbuf, uap->outbufp, uap->outbuflen);
return (error);
}
static void
rctl_get_rules_callback(struct racct *racct, void *arg2, void *arg3)
{
struct rctl_rule *filter = (struct rctl_rule *)arg2;
struct rctl_rule_link *link;
struct sbuf *sb = (struct sbuf *)arg3;
rw_rlock(&rctl_lock);
LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
if (!rctl_rule_matches(link->rrl_rule, filter))
continue;
rctl_rule_to_sbuf(sb, link->rrl_rule);
sbuf_printf(sb, ",");
}
rw_runlock(&rctl_lock);
}
int
-rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
+sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
{
int error;
size_t bufsize = RCTL_DEFAULT_BUFSIZE;
char *inputstr, *buf;
struct sbuf *sb;
struct rctl_rule *filter;
struct rctl_rule_link *link;
struct proc *p;
error = priv_check(td, PRIV_RCTL_GET_RULES);
if (error != 0)
return (error);
error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
if (error != 0)
return (error);
sx_slock(&allproc_lock);
error = rctl_string_to_rule(inputstr, &filter);
free(inputstr, M_RCTL);
if (error != 0) {
sx_sunlock(&allproc_lock);
return (error);
}
again:
buf = malloc(bufsize, M_RCTL, M_WAITOK);
sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
KASSERT(sb != NULL, ("sbuf_new failed"));
sx_assert(&allproc_lock, SA_LOCKED);
FOREACH_PROC_IN_SYSTEM(p) {
rw_rlock(&rctl_lock);
LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
/*
* Non-process rules will be added to the buffer later.
* Adding them here would result in duplicated output.
*/
if (link->rrl_rule->rr_subject_type !=
RCTL_SUBJECT_TYPE_PROCESS)
continue;
if (!rctl_rule_matches(link->rrl_rule, filter))
continue;
rctl_rule_to_sbuf(sb, link->rrl_rule);
sbuf_printf(sb, ",");
}
rw_runlock(&rctl_lock);
}
loginclass_racct_foreach(rctl_get_rules_callback, filter, sb);
ui_racct_foreach(rctl_get_rules_callback, filter, sb);
prison_racct_foreach(rctl_get_rules_callback, filter, sb);
if (sbuf_error(sb) == ENOMEM) {
sbuf_delete(sb);
free(buf, M_RCTL);
bufsize *= 4;
goto again;
}
/*
* Remove trailing ",".
*/
if (sbuf_len(sb) > 0)
sbuf_setpos(sb, sbuf_len(sb) - 1);
error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
rctl_rule_release(filter);
sx_sunlock(&allproc_lock);
free(buf, M_RCTL);
return (error);
}
int
-rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
+sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
{
int error;
size_t bufsize = RCTL_DEFAULT_BUFSIZE;
char *inputstr, *buf;
struct sbuf *sb;
struct rctl_rule *filter;
struct rctl_rule_link *link;
error = priv_check(td, PRIV_RCTL_GET_LIMITS);
if (error != 0)
return (error);
error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
if (error != 0)
return (error);
sx_slock(&allproc_lock);
error = rctl_string_to_rule(inputstr, &filter);
free(inputstr, M_RCTL);
if (error != 0) {
sx_sunlock(&allproc_lock);
return (error);
}
if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) {
rctl_rule_release(filter);
sx_sunlock(&allproc_lock);
return (EINVAL);
}
if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) {
rctl_rule_release(filter);
sx_sunlock(&allproc_lock);
return (EOPNOTSUPP);
}
if (filter->rr_subject.rs_proc == NULL) {
rctl_rule_release(filter);
sx_sunlock(&allproc_lock);
return (EINVAL);
}
again:
buf = malloc(bufsize, M_RCTL, M_WAITOK);
sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
KASSERT(sb != NULL, ("sbuf_new failed"));
rw_rlock(&rctl_lock);
LIST_FOREACH(link, &filter->rr_subject.rs_proc->p_racct->r_rule_links,
rrl_next) {
rctl_rule_to_sbuf(sb, link->rrl_rule);
sbuf_printf(sb, ",");
}
rw_runlock(&rctl_lock);
if (sbuf_error(sb) == ENOMEM) {
sbuf_delete(sb);
free(buf, M_RCTL);
bufsize *= 4;
goto again;
}
/*
* Remove trailing ",".
*/
if (sbuf_len(sb) > 0)
sbuf_setpos(sb, sbuf_len(sb) - 1);
error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
rctl_rule_release(filter);
sx_sunlock(&allproc_lock);
free(buf, M_RCTL);
return (error);
}
int
-rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
+sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
{
int error;
struct rctl_rule *rule;
char *inputstr;
error = priv_check(td, PRIV_RCTL_ADD_RULE);
if (error != 0)
return (error);
error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
if (error != 0)
return (error);
sx_slock(&allproc_lock);
error = rctl_string_to_rule(inputstr, &rule);
free(inputstr, M_RCTL);
if (error != 0) {
sx_sunlock(&allproc_lock);
return (error);
}
/*
* The 'per' part of a rule is optional.
*/
if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED &&
rule->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED)
rule->rr_per = rule->rr_subject_type;
if (!rctl_rule_fully_specified(rule)) {
error = EINVAL;
goto out;
}
error = rctl_rule_add(rule);
out:
rctl_rule_release(rule);
sx_sunlock(&allproc_lock);
return (error);
}
int
-rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
+sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
{
int error;
struct rctl_rule *filter;
char *inputstr;
error = priv_check(td, PRIV_RCTL_REMOVE_RULE);
if (error != 0)
return (error);
error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
if (error != 0)
return (error);
sx_slock(&allproc_lock);
error = rctl_string_to_rule(inputstr, &filter);
free(inputstr, M_RCTL);
if (error != 0) {
sx_sunlock(&allproc_lock);
return (error);
}
error = rctl_rule_remove(filter);
rctl_rule_release(filter);
sx_sunlock(&allproc_lock);
return (error);
}
/*
* Update RCTL rule list after credential change.
*/
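/*
* The update follows a count / allocate / verify pattern: under the
* read lock, count how many rule links the process will need with the
* new credentials; drop the lock and allocate that many empty links
* with M_WAITOK; then retake the lock for writing and fill the links
* in, starting over from the count if the rule set changed meanwhile.
*/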
void
rctl_proc_ucred_changed(struct proc *p, struct ucred *newcred)
{
int rulecnt, i;
struct rctl_rule_link *link, *newlink;
struct uidinfo *newuip;
struct loginclass *newlc;
struct prison_racct *newprr;
LIST_HEAD(, rctl_rule_link) newrules;
newuip = newcred->cr_ruidinfo;
newlc = newcred->cr_loginclass;
newprr = newcred->cr_prison->pr_prison_racct;
LIST_INIT(&newrules);
again:
/*
* First, count the rules that apply to the process with new
* credentials.
*/
rulecnt = 0;
rw_rlock(&rctl_lock);
LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
if (link->rrl_rule->rr_subject_type ==
RCTL_SUBJECT_TYPE_PROCESS)
rulecnt++;
}
LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next)
rulecnt++;
LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next)
rulecnt++;
LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next)
rulecnt++;
rw_runlock(&rctl_lock);
/*
* Create temporary list. We've dropped the rctl_lock in order
* to use M_WAITOK.
*/
for (i = 0; i < rulecnt; i++) {
newlink = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
newlink->rrl_rule = NULL;
LIST_INSERT_HEAD(&newrules, newlink, rrl_next);
}
newlink = LIST_FIRST(&newrules);
/*
* Assign rules to the newly allocated list entries.
*/
rw_wlock(&rctl_lock);
LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
if (link->rrl_rule->rr_subject_type ==
RCTL_SUBJECT_TYPE_PROCESS) {
if (newlink == NULL)
goto goaround;
rctl_rule_acquire(link->rrl_rule);
newlink->rrl_rule = link->rrl_rule;
newlink = LIST_NEXT(newlink, rrl_next);
rulecnt--;
}
}
LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) {
if (newlink == NULL)
goto goaround;
rctl_rule_acquire(link->rrl_rule);
newlink->rrl_rule = link->rrl_rule;
newlink = LIST_NEXT(newlink, rrl_next);
rulecnt--;
}
LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) {
if (newlink == NULL)
goto goaround;
rctl_rule_acquire(link->rrl_rule);
newlink->rrl_rule = link->rrl_rule;
newlink = LIST_NEXT(newlink, rrl_next);
rulecnt--;
}
LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) {
if (newlink == NULL)
goto goaround;
rctl_rule_acquire(link->rrl_rule);
newlink->rrl_rule = link->rrl_rule;
newlink = LIST_NEXT(newlink, rrl_next);
rulecnt--;
}
if (rulecnt == 0) {
/*
* Free the old rule list.
*/
while (!LIST_EMPTY(&p->p_racct->r_rule_links)) {
link = LIST_FIRST(&p->p_racct->r_rule_links);
LIST_REMOVE(link, rrl_next);
rctl_rule_release(link->rrl_rule);
uma_zfree(rctl_rule_link_zone, link);
}
/*
* Replace lists and we're done.
*
* XXX: Is there any way to switch list heads instead
* of iterating here?
*/
while (!LIST_EMPTY(&newrules)) {
newlink = LIST_FIRST(&newrules);
LIST_REMOVE(newlink, rrl_next);
LIST_INSERT_HEAD(&p->p_racct->r_rule_links,
newlink, rrl_next);
}
rw_wunlock(&rctl_lock);
return;
}
goaround:
rw_wunlock(&rctl_lock);
/*
* Rule list changed while we were not holding the rctl_lock.
* Free the new list and try again.
*/
while (!LIST_EMPTY(&newrules)) {
newlink = LIST_FIRST(&newrules);
LIST_REMOVE(newlink, rrl_next);
if (newlink->rrl_rule != NULL)
rctl_rule_release(newlink->rrl_rule);
uma_zfree(rctl_rule_link_zone, newlink);
}
goto again;
}
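/*
 * Illustrative sketch (not part of this file): the shape of the retry loop in
 * rctl_proc_ucred_changed() above -- count under the lock, allocate with
 * M_WAITOK after dropping it, then refill under the lock and start over if
 * the rule list changed in the meantime.  count_items(), preallocate(),
 * fill_items(), commit() and free_preallocated() are all hypothetical
 * stand-ins for the rule-link walks and list handling.
 */
static void
count_alloc_retry_sketch(void)
{
        int cnt;

        for (;;) {
                rw_rlock(&rctl_lock);
                cnt = count_items();            /* pass 1: count under a read lock */
                rw_runlock(&rctl_lock);

                preallocate(cnt);               /* may sleep; the lock is dropped */

                rw_wlock(&rctl_lock);
                if (fill_items() == cnt) {      /* pass 2: refill, recounting */
                        commit();               /* counts match: swap the lists */
                        rw_wunlock(&rctl_lock);
                        return;
                }
                rw_wunlock(&rctl_lock);
                free_preallocated();            /* list changed meanwhile: retry */
        }
}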
/*
* Assign RCTL rules to the newly created process.
*/
int
rctl_proc_fork(struct proc *parent, struct proc *child)
{
int error;
struct rctl_rule_link *link;
struct rctl_rule *rule;
LIST_INIT(&child->p_racct->r_rule_links);
/*
* No limits for kernel processes.
*/
if (child->p_flag & P_SYSTEM)
return (0);
/*
* Nothing to inherit from P_SYSTEM parents.
*/
if (parent->p_racct == NULL) {
KASSERT(parent->p_flag & P_SYSTEM,
("non-system process without racct; p = %p", parent));
return (0);
}
rw_wlock(&rctl_lock);
/*
* Go through limits applicable to the parent and assign them
* to the child. Rules with 'process' subject have to be duplicated
* in order to make their rr_subject point to the new process.
*/
LIST_FOREACH(link, &parent->p_racct->r_rule_links, rrl_next) {
if (link->rrl_rule->rr_subject_type ==
RCTL_SUBJECT_TYPE_PROCESS) {
rule = rctl_rule_duplicate(link->rrl_rule, M_NOWAIT);
if (rule == NULL)
goto fail;
KASSERT(rule->rr_subject.rs_proc == parent,
("rule->rr_subject.rs_proc != parent"));
rule->rr_subject.rs_proc = child;
error = rctl_racct_add_rule_locked(child->p_racct,
rule);
rctl_rule_release(rule);
if (error != 0)
goto fail;
} else {
error = rctl_racct_add_rule_locked(child->p_racct,
link->rrl_rule);
if (error != 0)
goto fail;
}
}
rw_wunlock(&rctl_lock);
return (0);
fail:
while (!LIST_EMPTY(&child->p_racct->r_rule_links)) {
link = LIST_FIRST(&child->p_racct->r_rule_links);
LIST_REMOVE(link, rrl_next);
rctl_rule_release(link->rrl_rule);
uma_zfree(rctl_rule_link_zone, link);
}
rw_wunlock(&rctl_lock);
return (EAGAIN);
}
/*
* Release rules attached to the racct.
*/
void
rctl_racct_release(struct racct *racct)
{
struct rctl_rule_link *link;
rw_wlock(&rctl_lock);
while (!LIST_EMPTY(&racct->r_rule_links)) {
link = LIST_FIRST(&racct->r_rule_links);
LIST_REMOVE(link, rrl_next);
rctl_rule_release(link->rrl_rule);
uma_zfree(rctl_rule_link_zone, link);
}
rw_wunlock(&rctl_lock);
}
static void
rctl_init(void)
{
rctl_rule_link_zone = uma_zcreate("rctl_rule_link",
sizeof(struct rctl_rule_link), NULL, NULL, NULL, NULL,
UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
}
#else /* !RCTL */
int
-rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
+sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
{
return (ENOSYS);
}
int
-rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
+sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
{
return (ENOSYS);
}
int
-rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
+sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
{
return (ENOSYS);
}
int
-rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
+sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
{
return (ENOSYS);
}
int
-rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
+sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
{
return (ENOSYS);
}
#endif /* !RCTL */
Index: head/sys/kern/kern_resource.c
===================================================================
--- head/sys/kern/kern_resource.c (revision 225616)
+++ head/sys/kern/kern_resource.c (revision 225617)
@@ -1,1415 +1,1415 @@
/*-
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_resource.c 8.5 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/file.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sx.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/time.h>
#include <sys/umtx.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
static MALLOC_DEFINE(M_PLIMIT, "plimit", "plimit structures");
static MALLOC_DEFINE(M_UIDINFO, "uidinfo", "uidinfo structures");
#define UIHASH(uid) (&uihashtbl[(uid) & uihash])
static struct rwlock uihashtbl_lock;
static LIST_HEAD(uihashhead, uidinfo) *uihashtbl;
static u_long uihash; /* size of hash table - 1 */
static void calcru1(struct proc *p, struct rusage_ext *ruxp,
struct timeval *up, struct timeval *sp);
static int donice(struct thread *td, struct proc *chgp, int n);
static struct uidinfo *uilookup(uid_t uid);
static void ruxagg_locked(struct rusage_ext *rux, struct thread *td);
/*
* Resource controls and accounting.
*/
#ifndef _SYS_SYSPROTO_H_
struct getpriority_args {
int which;
int who;
};
#endif
int
-getpriority(td, uap)
+sys_getpriority(td, uap)
struct thread *td;
register struct getpriority_args *uap;
{
struct proc *p;
struct pgrp *pg;
int error, low;
error = 0;
low = PRIO_MAX + 1;
switch (uap->which) {
case PRIO_PROCESS:
if (uap->who == 0)
low = td->td_proc->p_nice;
else {
p = pfind(uap->who);
if (p == NULL)
break;
if (p_cansee(td, p) == 0)
low = p->p_nice;
PROC_UNLOCK(p);
}
break;
case PRIO_PGRP:
sx_slock(&proctree_lock);
if (uap->who == 0) {
pg = td->td_proc->p_pgrp;
PGRP_LOCK(pg);
} else {
pg = pgfind(uap->who);
if (pg == NULL) {
sx_sunlock(&proctree_lock);
break;
}
}
sx_sunlock(&proctree_lock);
LIST_FOREACH(p, &pg->pg_members, p_pglist) {
PROC_LOCK(p);
if (p->p_state == PRS_NORMAL &&
p_cansee(td, p) == 0) {
if (p->p_nice < low)
low = p->p_nice;
}
PROC_UNLOCK(p);
}
PGRP_UNLOCK(pg);
break;
case PRIO_USER:
if (uap->who == 0)
uap->who = td->td_ucred->cr_uid;
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_state == PRS_NORMAL &&
p_cansee(td, p) == 0 &&
p->p_ucred->cr_uid == uap->who) {
if (p->p_nice < low)
low = p->p_nice;
}
PROC_UNLOCK(p);
}
sx_sunlock(&allproc_lock);
break;
default:
error = EINVAL;
break;
}
if (low == PRIO_MAX + 1 && error == 0)
error = ESRCH;
td->td_retval[0] = low;
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct setpriority_args {
int which;
int who;
int prio;
};
#endif
int
-setpriority(td, uap)
+sys_setpriority(td, uap)
struct thread *td;
struct setpriority_args *uap;
{
struct proc *curp, *p;
struct pgrp *pg;
int found = 0, error = 0;
curp = td->td_proc;
switch (uap->which) {
case PRIO_PROCESS:
if (uap->who == 0) {
PROC_LOCK(curp);
error = donice(td, curp, uap->prio);
PROC_UNLOCK(curp);
} else {
p = pfind(uap->who);
if (p == NULL)
break;
error = p_cansee(td, p);
if (error == 0)
error = donice(td, p, uap->prio);
PROC_UNLOCK(p);
}
found++;
break;
case PRIO_PGRP:
sx_slock(&proctree_lock);
if (uap->who == 0) {
pg = curp->p_pgrp;
PGRP_LOCK(pg);
} else {
pg = pgfind(uap->who);
if (pg == NULL) {
sx_sunlock(&proctree_lock);
break;
}
}
sx_sunlock(&proctree_lock);
LIST_FOREACH(p, &pg->pg_members, p_pglist) {
PROC_LOCK(p);
if (p->p_state == PRS_NORMAL &&
p_cansee(td, p) == 0) {
error = donice(td, p, uap->prio);
found++;
}
PROC_UNLOCK(p);
}
PGRP_UNLOCK(pg);
break;
case PRIO_USER:
if (uap->who == 0)
uap->who = td->td_ucred->cr_uid;
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_state == PRS_NORMAL &&
p->p_ucred->cr_uid == uap->who &&
p_cansee(td, p) == 0) {
error = donice(td, p, uap->prio);
found++;
}
PROC_UNLOCK(p);
}
sx_sunlock(&allproc_lock);
break;
default:
error = EINVAL;
break;
}
if (found == 0 && error == 0)
error = ESRCH;
return (error);
}
/*
* Set "nice" for a (whole) process.
*/
static int
donice(struct thread *td, struct proc *p, int n)
{
int error;
PROC_LOCK_ASSERT(p, MA_OWNED);
if ((error = p_cansched(td, p)))
return (error);
if (n > PRIO_MAX)
n = PRIO_MAX;
if (n < PRIO_MIN)
n = PRIO_MIN;
if (n < p->p_nice && priv_check(td, PRIV_SCHED_SETPRIORITY) != 0)
return (EACCES);
sched_nice(p, n);
return (0);
}
/*
* Set realtime priority for LWP.
*/
#ifndef _SYS_SYSPROTO_H_
struct rtprio_thread_args {
int function;
lwpid_t lwpid;
struct rtprio *rtp;
};
#endif
int
-rtprio_thread(struct thread *td, struct rtprio_thread_args *uap)
+sys_rtprio_thread(struct thread *td, struct rtprio_thread_args *uap)
{
struct proc *p;
struct rtprio rtp;
struct thread *td1;
int cierror, error;
/* Perform copyin before acquiring locks if needed. */
if (uap->function == RTP_SET)
cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio));
else
cierror = 0;
if (uap->lwpid == 0 || uap->lwpid == td->td_tid) {
p = td->td_proc;
td1 = td;
PROC_LOCK(p);
} else {
/* Only look up thread in current process */
td1 = tdfind(uap->lwpid, curproc->p_pid);
if (td1 == NULL)
return (ESRCH);
p = td1->td_proc;
}
switch (uap->function) {
case RTP_LOOKUP:
if ((error = p_cansee(td, p)))
break;
pri_to_rtp(td1, &rtp);
PROC_UNLOCK(p);
return (copyout(&rtp, uap->rtp, sizeof(struct rtprio)));
case RTP_SET:
if ((error = p_cansched(td, p)) || (error = cierror))
break;
/* Disallow setting rtprio in most cases if not superuser. */
/*
* Realtime priority has to be restricted for reasons which should be
* obvious. However, for idle priority, there is a potential for
* system deadlock if an idleprio process gains a lock on a resource
* that other processes need (and the idleprio process can't run
* due to a CPU-bound normal process). Fix me! XXX
*/
#if 0
if (RTP_PRIO_IS_REALTIME(rtp.type)) {
#else
if (rtp.type != RTP_PRIO_NORMAL) {
#endif
error = priv_check(td, PRIV_SCHED_RTPRIO);
if (error)
break;
}
error = rtp_to_pri(&rtp, td1);
break;
default:
error = EINVAL;
break;
}
PROC_UNLOCK(p);
return (error);
}
/*
* Set realtime priority.
*/
#ifndef _SYS_SYSPROTO_H_
struct rtprio_args {
int function;
pid_t pid;
struct rtprio *rtp;
};
#endif
int
-rtprio(td, uap)
+sys_rtprio(td, uap)
struct thread *td; /* curthread */
register struct rtprio_args *uap;
{
struct proc *p;
struct thread *tdp;
struct rtprio rtp;
int cierror, error;
/* Perform copyin before acquiring locks if needed. */
if (uap->function == RTP_SET)
cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio));
else
cierror = 0;
if (uap->pid == 0) {
p = td->td_proc;
PROC_LOCK(p);
} else {
p = pfind(uap->pid);
if (p == NULL)
return (ESRCH);
}
switch (uap->function) {
case RTP_LOOKUP:
if ((error = p_cansee(td, p)))
break;
/*
* Return OUR priority if no pid specified,
* or if one is, report the highest priority
* in the process. There isn't much more you can do as
* there is only room to return a single priority.
* Note: specifying our own pid is not the same
* as leaving it zero.
*/
if (uap->pid == 0) {
pri_to_rtp(td, &rtp);
} else {
struct rtprio rtp2;
rtp.type = RTP_PRIO_IDLE;
rtp.prio = RTP_PRIO_MAX;
FOREACH_THREAD_IN_PROC(p, tdp) {
pri_to_rtp(tdp, &rtp2);
if (rtp2.type < rtp.type ||
(rtp2.type == rtp.type &&
rtp2.prio < rtp.prio)) {
rtp.type = rtp2.type;
rtp.prio = rtp2.prio;
}
}
}
PROC_UNLOCK(p);
return (copyout(&rtp, uap->rtp, sizeof(struct rtprio)));
case RTP_SET:
if ((error = p_cansched(td, p)) || (error = cierror))
break;
/* Disallow setting rtprio in most cases if not superuser. */
/*
* Realtime priority has to be restricted for reasons which should be
* obvious. However, for idle priority, there is a potential for
* system deadlock if an idleprio process gains a lock on a resource
* that other processes need (and the idleprio process can't run
* due to a CPU-bound normal process). Fix me! XXX
*/
#if 0
if (RTP_PRIO_IS_REALTIME(rtp.type)) {
#else
if (rtp.type != RTP_PRIO_NORMAL) {
#endif
error = priv_check(td, PRIV_SCHED_RTPRIO);
if (error)
break;
}
/*
* If we are setting our own priority, set just our
* thread; if we are acting on another process,
* do all the threads in that process. If we
* specify our own pid, we do the latter.
*/
if (uap->pid == 0) {
error = rtp_to_pri(&rtp, td);
} else {
FOREACH_THREAD_IN_PROC(p, td) {
if ((error = rtp_to_pri(&rtp, td)) != 0)
break;
}
}
break;
default:
error = EINVAL;
break;
}
PROC_UNLOCK(p);
return (error);
}
int
rtp_to_pri(struct rtprio *rtp, struct thread *td)
{
u_char newpri;
u_char oldpri;
switch (RTP_PRIO_BASE(rtp->type)) {
case RTP_PRIO_REALTIME:
if (rtp->prio > RTP_PRIO_MAX)
return (EINVAL);
newpri = PRI_MIN_REALTIME + rtp->prio;
break;
case RTP_PRIO_NORMAL:
if (rtp->prio > (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE))
return (EINVAL);
newpri = PRI_MIN_TIMESHARE + rtp->prio;
break;
case RTP_PRIO_IDLE:
if (rtp->prio > RTP_PRIO_MAX)
return (EINVAL);
newpri = PRI_MIN_IDLE + rtp->prio;
break;
default:
return (EINVAL);
}
thread_lock(td);
sched_class(td, rtp->type); /* XXX fix */
oldpri = td->td_user_pri;
sched_user_prio(td, newpri);
if (curthread == td)
sched_prio(curthread, td->td_user_pri); /* XXX dubious */
if (TD_ON_UPILOCK(td) && oldpri != newpri) {
critical_enter();
thread_unlock(td);
umtx_pi_adjust(td, oldpri);
critical_exit();
} else
thread_unlock(td);
return (0);
}
void
pri_to_rtp(struct thread *td, struct rtprio *rtp)
{
thread_lock(td);
switch (PRI_BASE(td->td_pri_class)) {
case PRI_REALTIME:
rtp->prio = td->td_base_user_pri - PRI_MIN_REALTIME;
break;
case PRI_TIMESHARE:
rtp->prio = td->td_base_user_pri - PRI_MIN_TIMESHARE;
break;
case PRI_IDLE:
rtp->prio = td->td_base_user_pri - PRI_MIN_IDLE;
break;
default:
break;
}
rtp->type = td->td_pri_class;
thread_unlock(td);
}
#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct osetrlimit_args {
u_int which;
struct orlimit *rlp;
};
#endif
int
osetrlimit(td, uap)
struct thread *td;
register struct osetrlimit_args *uap;
{
struct orlimit olim;
struct rlimit lim;
int error;
if ((error = copyin(uap->rlp, &olim, sizeof(struct orlimit))))
return (error);
lim.rlim_cur = olim.rlim_cur;
lim.rlim_max = olim.rlim_max;
error = kern_setrlimit(td, uap->which, &lim);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct ogetrlimit_args {
u_int which;
struct orlimit *rlp;
};
#endif
int
ogetrlimit(td, uap)
struct thread *td;
register struct ogetrlimit_args *uap;
{
struct orlimit olim;
struct rlimit rl;
struct proc *p;
int error;
if (uap->which >= RLIM_NLIMITS)
return (EINVAL);
p = td->td_proc;
PROC_LOCK(p);
lim_rlimit(p, uap->which, &rl);
PROC_UNLOCK(p);
/*
* XXX would be more correct to convert only RLIM_INFINITY to the
* old RLIM_INFINITY and fail with EOVERFLOW for other larger
* values. Most 64->32 and 32->16 conversions, including not
* unimportant ones of uids are even more broken than what we
* do here (they blindly truncate). We don't do this correctly
* here since we have little experience with EOVERFLOW yet.
* Elsewhere, getuid() can't fail...
*/
olim.rlim_cur = rl.rlim_cur > 0x7fffffff ? 0x7fffffff : rl.rlim_cur;
olim.rlim_max = rl.rlim_max > 0x7fffffff ? 0x7fffffff : rl.rlim_max;
error = copyout(&olim, uap->rlp, sizeof(olim));
return (error);
}
#endif /* COMPAT_43 */
#ifndef _SYS_SYSPROTO_H_
struct __setrlimit_args {
u_int which;
struct rlimit *rlp;
};
#endif
int
-setrlimit(td, uap)
+sys_setrlimit(td, uap)
struct thread *td;
register struct __setrlimit_args *uap;
{
struct rlimit alim;
int error;
if ((error = copyin(uap->rlp, &alim, sizeof(struct rlimit))))
return (error);
error = kern_setrlimit(td, uap->which, &alim);
return (error);
}
static void
lim_cb(void *arg)
{
struct rlimit rlim;
struct thread *td;
struct proc *p;
p = arg;
PROC_LOCK_ASSERT(p, MA_OWNED);
/*
* Check if the process exceeds its cpu resource allocation. If
* it reaches the max, arrange to kill the process in ast().
*/
if (p->p_cpulimit == RLIM_INFINITY)
return;
PROC_SLOCK(p);
FOREACH_THREAD_IN_PROC(p, td) {
ruxagg(p, td);
}
PROC_SUNLOCK(p);
if (p->p_rux.rux_runtime > p->p_cpulimit * cpu_tickrate()) {
lim_rlimit(p, RLIMIT_CPU, &rlim);
if (p->p_rux.rux_runtime >= rlim.rlim_max * cpu_tickrate()) {
killproc(p, "exceeded maximum CPU limit");
} else {
if (p->p_cpulimit < rlim.rlim_max)
p->p_cpulimit += 5;
- psignal(p, SIGXCPU);
+ kern_psignal(p, SIGXCPU);
}
}
if ((p->p_flag & P_WEXIT) == 0)
callout_reset(&p->p_limco, hz, lim_cb, p);
}
int
kern_setrlimit(td, which, limp)
struct thread *td;
u_int which;
struct rlimit *limp;
{
struct plimit *newlim, *oldlim;
struct proc *p;
register struct rlimit *alimp;
struct rlimit oldssiz;
int error;
if (which >= RLIM_NLIMITS)
return (EINVAL);
/*
* Preserve historical bugs by treating negative limits as unsigned.
*/
if (limp->rlim_cur < 0)
limp->rlim_cur = RLIM_INFINITY;
if (limp->rlim_max < 0)
limp->rlim_max = RLIM_INFINITY;
oldssiz.rlim_cur = 0;
p = td->td_proc;
newlim = lim_alloc();
PROC_LOCK(p);
oldlim = p->p_limit;
alimp = &oldlim->pl_rlimit[which];
if (limp->rlim_cur > alimp->rlim_max ||
limp->rlim_max > alimp->rlim_max)
if ((error = priv_check(td, PRIV_PROC_SETRLIMIT))) {
PROC_UNLOCK(p);
lim_free(newlim);
return (error);
}
if (limp->rlim_cur > limp->rlim_max)
limp->rlim_cur = limp->rlim_max;
lim_copy(newlim, oldlim);
alimp = &newlim->pl_rlimit[which];
switch (which) {
case RLIMIT_CPU:
if (limp->rlim_cur != RLIM_INFINITY &&
p->p_cpulimit == RLIM_INFINITY)
callout_reset(&p->p_limco, hz, lim_cb, p);
p->p_cpulimit = limp->rlim_cur;
break;
case RLIMIT_DATA:
if (limp->rlim_cur > maxdsiz)
limp->rlim_cur = maxdsiz;
if (limp->rlim_max > maxdsiz)
limp->rlim_max = maxdsiz;
break;
case RLIMIT_STACK:
if (limp->rlim_cur > maxssiz)
limp->rlim_cur = maxssiz;
if (limp->rlim_max > maxssiz)
limp->rlim_max = maxssiz;
oldssiz = *alimp;
if (p->p_sysent->sv_fixlimit != NULL)
p->p_sysent->sv_fixlimit(&oldssiz,
RLIMIT_STACK);
break;
case RLIMIT_NOFILE:
if (limp->rlim_cur > maxfilesperproc)
limp->rlim_cur = maxfilesperproc;
if (limp->rlim_max > maxfilesperproc)
limp->rlim_max = maxfilesperproc;
break;
case RLIMIT_NPROC:
if (limp->rlim_cur > maxprocperuid)
limp->rlim_cur = maxprocperuid;
if (limp->rlim_max > maxprocperuid)
limp->rlim_max = maxprocperuid;
if (limp->rlim_cur < 1)
limp->rlim_cur = 1;
if (limp->rlim_max < 1)
limp->rlim_max = 1;
break;
}
if (p->p_sysent->sv_fixlimit != NULL)
p->p_sysent->sv_fixlimit(limp, which);
*alimp = *limp;
p->p_limit = newlim;
PROC_UNLOCK(p);
lim_free(oldlim);
if (which == RLIMIT_STACK) {
/*
* Stack is allocated to the max at exec time with only
* "rlim_cur" bytes accessible. If the stack limit is going
* up, make more of it accessible; if going down, make it inaccessible.
*/
if (limp->rlim_cur != oldssiz.rlim_cur) {
vm_offset_t addr;
vm_size_t size;
vm_prot_t prot;
if (limp->rlim_cur > oldssiz.rlim_cur) {
prot = p->p_sysent->sv_stackprot;
size = limp->rlim_cur - oldssiz.rlim_cur;
addr = p->p_sysent->sv_usrstack -
limp->rlim_cur;
} else {
prot = VM_PROT_NONE;
size = oldssiz.rlim_cur - limp->rlim_cur;
addr = p->p_sysent->sv_usrstack -
oldssiz.rlim_cur;
}
addr = trunc_page(addr);
size = round_page(size);
(void)vm_map_protect(&p->p_vmspace->vm_map,
addr, addr + size, prot, FALSE);
}
}
return (0);
}
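/*
 * Illustrative sketch (not part of this file): how the RLIMIT_STACK case
 * above turns a change of the soft limit into a protection change on the
 * stack region.  'usrstack' stands in for p->p_sysent->sv_usrstack and
 * 'stackprot' for p->p_sysent->sv_stackprot; the function name is
 * hypothetical and the caller is assumed to have checked oldcur != newcur.
 */
static __inline void
stack_limit_adjust_sketch(vm_offset_t usrstack, rlim_t oldcur, rlim_t newcur,
    vm_prot_t stackprot, vm_offset_t *addr, vm_size_t *size, vm_prot_t *prot)
{

        if (newcur > oldcur) {
                /* Limit raised: expose the newly usable pages. */
                *prot = stackprot;
                *size = newcur - oldcur;
                *addr = usrstack - newcur;
        } else {
                /* Limit lowered: take the now-forbidden pages away. */
                *prot = VM_PROT_NONE;
                *size = oldcur - newcur;
                *addr = usrstack - oldcur;
        }
        *addr = trunc_page(*addr);
        *size = round_page(*size);
}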
#ifndef _SYS_SYSPROTO_H_
struct __getrlimit_args {
u_int which;
struct rlimit *rlp;
};
#endif
/* ARGSUSED */
int
-getrlimit(td, uap)
+sys_getrlimit(td, uap)
struct thread *td;
register struct __getrlimit_args *uap;
{
struct rlimit rlim;
struct proc *p;
int error;
if (uap->which >= RLIM_NLIMITS)
return (EINVAL);
p = td->td_proc;
PROC_LOCK(p);
lim_rlimit(p, uap->which, &rlim);
PROC_UNLOCK(p);
error = copyout(&rlim, uap->rlp, sizeof(struct rlimit));
return (error);
}
/*
* Transform the running time and tick information for children of proc p
* into user and system time usage.
*/
void
calccru(p, up, sp)
struct proc *p;
struct timeval *up;
struct timeval *sp;
{
PROC_LOCK_ASSERT(p, MA_OWNED);
calcru1(p, &p->p_crux, up, sp);
}
/*
* Transform the running time and tick information in proc p into user
* and system time usage. If appropriate, include the current time slice
* on this CPU.
*/
void
calcru(struct proc *p, struct timeval *up, struct timeval *sp)
{
struct thread *td;
uint64_t runtime, u;
PROC_LOCK_ASSERT(p, MA_OWNED);
PROC_SLOCK_ASSERT(p, MA_OWNED);
/*
* If we are getting stats for the current process, then add in the
* stats that this thread has accumulated in its current time slice.
* We reset the thread and CPU state as if we had performed a context
* switch right here.
*/
td = curthread;
if (td->td_proc == p) {
u = cpu_ticks();
runtime = u - PCPU_GET(switchtime);
td->td_runtime += runtime;
td->td_incruntime += runtime;
PCPU_SET(switchtime, u);
}
/* Make sure the per-thread stats are current. */
FOREACH_THREAD_IN_PROC(p, td) {
if (td->td_incruntime == 0)
continue;
ruxagg(p, td);
}
calcru1(p, &p->p_rux, up, sp);
}
/* Collect resource usage for a single thread. */
void
rufetchtd(struct thread *td, struct rusage *ru)
{
struct proc *p;
uint64_t runtime, u;
p = td->td_proc;
PROC_SLOCK_ASSERT(p, MA_OWNED);
THREAD_LOCK_ASSERT(td, MA_OWNED);
/*
* If we are getting stats for the current thread, then add in the
* stats that this thread has accumulated in its current time slice.
* We reset the thread and CPU state as if we had performed a context
* switch right here.
*/
if (td == curthread) {
u = cpu_ticks();
runtime = u - PCPU_GET(switchtime);
td->td_runtime += runtime;
td->td_incruntime += runtime;
PCPU_SET(switchtime, u);
}
ruxagg(p, td);
*ru = td->td_ru;
calcru1(p, &td->td_rux, &ru->ru_utime, &ru->ru_stime);
}
static void
calcru1(struct proc *p, struct rusage_ext *ruxp, struct timeval *up,
struct timeval *sp)
{
/* {user, system, interrupt, total} {ticks, usec}: */
uint64_t ut, uu, st, su, it, tt, tu;
ut = ruxp->rux_uticks;
st = ruxp->rux_sticks;
it = ruxp->rux_iticks;
tt = ut + st + it;
if (tt == 0) {
/* Avoid divide by zero */
st = 1;
tt = 1;
}
tu = cputick2usec(ruxp->rux_runtime);
if ((int64_t)tu < 0) {
/* XXX: this should be an assert /phk */
printf("calcru: negative runtime of %jd usec for pid %d (%s)\n",
(intmax_t)tu, p->p_pid, p->p_comm);
tu = ruxp->rux_tu;
}
if (tu >= ruxp->rux_tu) {
/*
* The normal case, time increased.
* Enforce monotonicity of bucketed numbers.
*/
uu = (tu * ut) / tt;
if (uu < ruxp->rux_uu)
uu = ruxp->rux_uu;
su = (tu * st) / tt;
if (su < ruxp->rux_su)
su = ruxp->rux_su;
} else if (tu + 3 > ruxp->rux_tu || 101 * tu > 100 * ruxp->rux_tu) {
/*
* When we calibrate the cputicker, it is not uncommon to
* see the presumably fixed frequency increase slightly over
* time as a result of thermal stabilization and NTP
* discipline (of the reference clock). We therefore ignore
* a bit of backwards slop because we expect to catch up
* shortly. We use a 3 microsecond limit to catch low
* counts and a 1% limit for high counts.
*/
uu = ruxp->rux_uu;
su = ruxp->rux_su;
tu = ruxp->rux_tu;
} else { /* tu < ruxp->rux_tu */
/*
* What happened here was likely that a laptop, which ran at
* a reduced clock frequency at boot, kicked into high gear.
* The wisdom of spamming this message in that case is
* dubious, but it might also be indicative of something
* serious, so let's keep it and hope laptops can be made
* more truthful about their CPU speed via ACPI.
*/
printf("calcru: runtime went backwards from %ju usec "
"to %ju usec for pid %d (%s)\n",
(uintmax_t)ruxp->rux_tu, (uintmax_t)tu,
p->p_pid, p->p_comm);
uu = (tu * ut) / tt;
su = (tu * st) / tt;
}
ruxp->rux_uu = uu;
ruxp->rux_su = su;
ruxp->rux_tu = tu;
up->tv_sec = uu / 1000000;
up->tv_usec = uu % 1000000;
sp->tv_sec = su / 1000000;
sp->tv_usec = su % 1000000;
}
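/*
 * Illustrative sketch (not part of this file): the acceptance rule calcru1()
 * applies above to a new total-runtime sample 'tu' against the previously
 * reported value 'prev_tu', pulled out into a tiny pure helper.  The 3 us and
 * 1% tolerances are the ones described in the comment; the function name is
 * hypothetical.
 */
static __inline uint64_t
calcru_accept_sketch(uint64_t tu, uint64_t prev_tu)
{

        if (tu >= prev_tu)
                return (tu);            /* normal case: time moved forward */
        if (tu + 3 > prev_tu || 101 * tu > 100 * prev_tu)
                return (prev_tu);       /* small backwards slop: keep old value */
        return (tu);                    /* large step backwards: warn and accept */
}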
#ifndef _SYS_SYSPROTO_H_
struct getrusage_args {
int who;
struct rusage *rusage;
};
#endif
int
-getrusage(td, uap)
+sys_getrusage(td, uap)
register struct thread *td;
register struct getrusage_args *uap;
{
struct rusage ru;
int error;
error = kern_getrusage(td, uap->who, &ru);
if (error == 0)
error = copyout(&ru, uap->rusage, sizeof(struct rusage));
return (error);
}
int
kern_getrusage(struct thread *td, int who, struct rusage *rup)
{
struct proc *p;
int error;
error = 0;
p = td->td_proc;
PROC_LOCK(p);
switch (who) {
case RUSAGE_SELF:
rufetchcalc(p, rup, &rup->ru_utime,
&rup->ru_stime);
break;
case RUSAGE_CHILDREN:
*rup = p->p_stats->p_cru;
calccru(p, &rup->ru_utime, &rup->ru_stime);
break;
case RUSAGE_THREAD:
PROC_SLOCK(p);
thread_lock(td);
rufetchtd(td, rup);
thread_unlock(td);
PROC_SUNLOCK(p);
break;
default:
error = EINVAL;
}
PROC_UNLOCK(p);
return (error);
}
void
rucollect(struct rusage *ru, struct rusage *ru2)
{
long *ip, *ip2;
int i;
if (ru->ru_maxrss < ru2->ru_maxrss)
ru->ru_maxrss = ru2->ru_maxrss;
ip = &ru->ru_first;
ip2 = &ru2->ru_first;
for (i = &ru->ru_last - &ru->ru_first; i >= 0; i--)
*ip++ += *ip2++;
}
void
ruadd(struct rusage *ru, struct rusage_ext *rux, struct rusage *ru2,
struct rusage_ext *rux2)
{
rux->rux_runtime += rux2->rux_runtime;
rux->rux_uticks += rux2->rux_uticks;
rux->rux_sticks += rux2->rux_sticks;
rux->rux_iticks += rux2->rux_iticks;
rux->rux_uu += rux2->rux_uu;
rux->rux_su += rux2->rux_su;
rux->rux_tu += rux2->rux_tu;
rucollect(ru, ru2);
}
/*
* Aggregate tick counts into the proc's rusage_ext.
*/
static void
ruxagg_locked(struct rusage_ext *rux, struct thread *td)
{
THREAD_LOCK_ASSERT(td, MA_OWNED);
PROC_SLOCK_ASSERT(td->td_proc, MA_OWNED);
rux->rux_runtime += td->td_incruntime;
rux->rux_uticks += td->td_uticks;
rux->rux_sticks += td->td_sticks;
rux->rux_iticks += td->td_iticks;
}
void
ruxagg(struct proc *p, struct thread *td)
{
thread_lock(td);
ruxagg_locked(&p->p_rux, td);
ruxagg_locked(&td->td_rux, td);
td->td_incruntime = 0;
td->td_uticks = 0;
td->td_iticks = 0;
td->td_sticks = 0;
thread_unlock(td);
}
/*
* Update the rusage_ext structure and fetch a valid aggregate rusage
* for proc p if storage for one is supplied.
*/
void
rufetch(struct proc *p, struct rusage *ru)
{
struct thread *td;
PROC_SLOCK_ASSERT(p, MA_OWNED);
*ru = p->p_ru;
if (p->p_numthreads > 0) {
FOREACH_THREAD_IN_PROC(p, td) {
ruxagg(p, td);
rucollect(ru, &td->td_ru);
}
}
}
/*
* Atomically perform a rufetch and a calcru together.
* Consumers can safely assume that calcru is executed only once
* rufetch has completed.
*/
void
rufetchcalc(struct proc *p, struct rusage *ru, struct timeval *up,
struct timeval *sp)
{
PROC_SLOCK(p);
rufetch(p, ru);
calcru(p, up, sp);
PROC_SUNLOCK(p);
}
/*
* Allocate a new resource limits structure and initialize its
* reference count and mutex pointer.
*/
struct plimit *
lim_alloc()
{
struct plimit *limp;
limp = malloc(sizeof(struct plimit), M_PLIMIT, M_WAITOK);
refcount_init(&limp->pl_refcnt, 1);
return (limp);
}
struct plimit *
lim_hold(limp)
struct plimit *limp;
{
refcount_acquire(&limp->pl_refcnt);
return (limp);
}
void
lim_fork(struct proc *p1, struct proc *p2)
{
p2->p_limit = lim_hold(p1->p_limit);
callout_init_mtx(&p2->p_limco, &p2->p_mtx, 0);
if (p1->p_cpulimit != RLIM_INFINITY)
callout_reset(&p2->p_limco, hz, lim_cb, p2);
}
void
lim_free(limp)
struct plimit *limp;
{
KASSERT(limp->pl_refcnt > 0, ("plimit refcnt underflow"));
if (refcount_release(&limp->pl_refcnt))
free((void *)limp, M_PLIMIT);
}
/*
* Make a copy of the plimit structure.
* We share these structures copy-on-write after fork.
*/
void
lim_copy(dst, src)
struct plimit *dst, *src;
{
KASSERT(dst->pl_refcnt == 1, ("lim_copy to shared limit"));
bcopy(src->pl_rlimit, dst->pl_rlimit, sizeof(src->pl_rlimit));
}
/*
* Return the hard limit for a particular system resource. The
* which parameter specifies the index into the rlimit array.
*/
rlim_t
lim_max(struct proc *p, int which)
{
struct rlimit rl;
lim_rlimit(p, which, &rl);
return (rl.rlim_max);
}
/*
* Return the current (soft) limit for a particular system resource.
* The which parameter specifies the index into the rlimit array.
*/
rlim_t
lim_cur(struct proc *p, int which)
{
struct rlimit rl;
lim_rlimit(p, which, &rl);
return (rl.rlim_cur);
}
/*
* Return a copy of the entire rlimit structure for the system limit
* specified by 'which' in the rlimit structure pointed to by 'rlp'.
*/
void
lim_rlimit(struct proc *p, int which, struct rlimit *rlp)
{
PROC_LOCK_ASSERT(p, MA_OWNED);
KASSERT(which >= 0 && which < RLIM_NLIMITS,
("request for invalid resource limit"));
*rlp = p->p_limit->pl_rlimit[which];
if (p->p_sysent->sv_fixlimit != NULL)
p->p_sysent->sv_fixlimit(rlp, which);
}
void
uihashinit()
{
uihashtbl = hashinit(maxproc / 16, M_UIDINFO, &uihash);
rw_init(&uihashtbl_lock, "uidinfo hash");
}
/*
* Look up a uidinfo struct for the parameter uid.
* uihashtbl_lock must be locked.
*/
static struct uidinfo *
uilookup(uid)
uid_t uid;
{
struct uihashhead *uipp;
struct uidinfo *uip;
rw_assert(&uihashtbl_lock, RA_LOCKED);
uipp = UIHASH(uid);
LIST_FOREACH(uip, uipp, ui_hash)
if (uip->ui_uid == uid)
break;
return (uip);
}
/*
* Find or allocate a struct uidinfo for a particular uid.
* Increase the refcount on the uidinfo struct returned.
* uifree() should be called on it when the reference is released.
*/
struct uidinfo *
uifind(uid)
uid_t uid;
{
struct uidinfo *old_uip, *uip;
rw_rlock(&uihashtbl_lock);
uip = uilookup(uid);
if (uip == NULL) {
rw_runlock(&uihashtbl_lock);
uip = malloc(sizeof(*uip), M_UIDINFO, M_WAITOK | M_ZERO);
racct_create(&uip->ui_racct);
rw_wlock(&uihashtbl_lock);
/*
* There's a chance someone created our uidinfo while we
* were in malloc and not holding the lock, so we have to
* make sure we don't insert a duplicate uidinfo.
*/
if ((old_uip = uilookup(uid)) != NULL) {
/* Someone else beat us to it. */
racct_destroy(&uip->ui_racct);
free(uip, M_UIDINFO);
uip = old_uip;
} else {
refcount_init(&uip->ui_ref, 0);
uip->ui_uid = uid;
mtx_init(&uip->ui_vmsize_mtx, "ui_vmsize", NULL,
MTX_DEF);
LIST_INSERT_HEAD(UIHASH(uid), uip, ui_hash);
}
}
uihold(uip);
rw_unlock(&uihashtbl_lock);
return (uip);
}
/*
* Place another refcount on a uidinfo struct.
*/
void
uihold(uip)
struct uidinfo *uip;
{
refcount_acquire(&uip->ui_ref);
}
/*-
* Since uidinfo structs have a long lifetime, we use an
* opportunistic refcounting scheme to avoid locking the lookup hash
* for each release.
*
* If the refcount hits 0, we need to free the structure,
* which means we need to lock the hash.
* Optimal case:
* After locking the struct and lowering the refcount, if we find
* that we don't need to free, simply unlock and return.
* Suboptimal case:
* If refcount lowering results in need to free, bump the count
* back up, lose the lock and acquire the locks in the proper
* order to try again.
*/
void
uifree(uip)
struct uidinfo *uip;
{
int old;
/* Prepare for optimal case. */
old = uip->ui_ref;
if (old > 1 && atomic_cmpset_int(&uip->ui_ref, old, old - 1))
return;
/* Prepare for suboptimal case. */
rw_wlock(&uihashtbl_lock);
if (refcount_release(&uip->ui_ref)) {
racct_destroy(&uip->ui_racct);
LIST_REMOVE(uip, ui_hash);
rw_wunlock(&uihashtbl_lock);
if (uip->ui_sbsize != 0)
printf("freeing uidinfo: uid = %d, sbsize = %ld\n",
uip->ui_uid, uip->ui_sbsize);
if (uip->ui_proccnt != 0)
printf("freeing uidinfo: uid = %d, proccnt = %ld\n",
uip->ui_uid, uip->ui_proccnt);
if (uip->ui_vmsize != 0)
printf("freeing uidinfo: uid = %d, swapuse = %lld\n",
uip->ui_uid, (unsigned long long)uip->ui_vmsize);
mtx_destroy(&uip->ui_vmsize_mtx);
free(uip, M_UIDINFO);
return;
}
/*
* Someone added a reference between atomic_cmpset_int() and
* rw_wlock(&uihashtbl_lock).
*/
rw_wunlock(&uihashtbl_lock);
}
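/*
 * Illustrative sketch (not part of this file): the "optimal case" release in
 * uifree() above -- drop a reference without touching uihashtbl_lock as long
 * as the count cannot reach zero.  The function name is hypothetical.
 */
static __inline int
opportunistic_release_sketch(volatile u_int *refcnt)
{
        u_int old;

        old = *refcnt;
        if (old > 1 && atomic_cmpset_int(refcnt, old, old - 1))
                return (1);             /* fast path: no hash lock needed */
        return (0);                     /* count may hit 0: lock the hash and retry */
}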
void
ui_racct_foreach(void (*callback)(struct racct *racct,
void *arg2, void *arg3), void *arg2, void *arg3)
{
struct uidinfo *uip;
struct uihashhead *uih;
rw_rlock(&uihashtbl_lock);
for (uih = &uihashtbl[uihash]; uih >= uihashtbl; uih--) {
LIST_FOREACH(uip, uih, ui_hash) {
(callback)(uip->ui_racct, arg2, arg3);
}
}
rw_runlock(&uihashtbl_lock);
}
/*
* Change the count associated with the number of processes
* a given user is using. When 'max' is 0, don't enforce a limit.
*/
int
chgproccnt(uip, diff, max)
struct uidinfo *uip;
int diff;
rlim_t max;
{
/* Don't allow them to exceed max, but allow subtraction. */
if (diff > 0 && max != 0) {
if (atomic_fetchadd_long(&uip->ui_proccnt, (long)diff) + diff > max) {
atomic_subtract_long(&uip->ui_proccnt, (long)diff);
return (0);
}
} else {
atomic_add_long(&uip->ui_proccnt, (long)diff);
if (uip->ui_proccnt < 0)
printf("negative proccnt for uid = %d\n", uip->ui_uid);
}
return (1);
}
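/*
 * Illustrative sketch (not part of this file): the add-then-undo limit check
 * shared by chgproccnt() above and chgsbsize()/chgptscnt() below, written as
 * a generic helper.  The function name is hypothetical.
 */
static __inline int
chglimit_sketch(volatile long *cnt, int diff, rlim_t max)
{

        /* Don't allow the count to exceed max, but always allow subtraction. */
        if (diff > 0 && max != 0) {
                if (atomic_fetchadd_long(cnt, (long)diff) + diff > max) {
                        atomic_subtract_long(cnt, (long)diff);
                        return (0);     /* over the limit: change undone */
                }
        } else
                atomic_add_long(cnt, (long)diff);
        return (1);
}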
/*
* Change the total socket buffer size a user has used.
*/
int
chgsbsize(uip, hiwat, to, max)
struct uidinfo *uip;
u_int *hiwat;
u_int to;
rlim_t max;
{
int diff;
diff = to - *hiwat;
if (diff > 0) {
if (atomic_fetchadd_long(&uip->ui_sbsize, (long)diff) + diff > max) {
atomic_subtract_long(&uip->ui_sbsize, (long)diff);
return (0);
}
} else {
atomic_add_long(&uip->ui_sbsize, (long)diff);
if (uip->ui_sbsize < 0)
printf("negative sbsize for uid = %d\n", uip->ui_uid);
}
*hiwat = to;
return (1);
}
/*
* Change the count associated with the number of pseudo-terminals
* a given user is using. When 'max' is 0, don't enforce a limit.
*/
int
chgptscnt(uip, diff, max)
struct uidinfo *uip;
int diff;
rlim_t max;
{
/* Don't allow them to exceed max, but allow subtraction. */
if (diff > 0 && max != 0) {
if (atomic_fetchadd_long(&uip->ui_ptscnt, (long)diff) + diff > max) {
atomic_subtract_long(&uip->ui_ptscnt, (long)diff);
return (0);
}
} else {
atomic_add_long(&uip->ui_ptscnt, (long)diff);
if (uip->ui_ptscnt < 0)
printf("negative ptscnt for uid = %d\n", uip->ui_uid);
}
return (1);
}
Index: head/sys/kern/kern_shutdown.c
===================================================================
--- head/sys/kern/kern_shutdown.c (revision 225616)
+++ head/sys/kern/kern_shutdown.c (revision 225617)
@@ -1,735 +1,735 @@
/*-
* Copyright (c) 1986, 1988, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_shutdown.c 8.3 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ddb.h"
#include "opt_kdb.h"
#include "opt_panic.h"
#include "opt_sched.h"
#include "opt_watchdog.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/cons.h>
#include <sys/eventhandler.h>
#include <sys/jail.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/kerneldump.h>
#include <sys/kthread.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/reboot.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vnode.h>
#ifdef SW_WATCHDOG
#include <sys/watchdog.h>
#endif
#include <ddb/ddb.h>
#include <machine/cpu.h>
#include <machine/pcb.h>
#include <machine/smp.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <sys/signalvar.h>
#ifndef PANIC_REBOOT_WAIT_TIME
#define PANIC_REBOOT_WAIT_TIME 15 /* default to 15 seconds */
#endif
/*
* Note that stdarg.h and the ANSI-style va_start macro are used for both
* ANSI and traditional C compilers.
*/
#include <machine/stdarg.h>
#ifdef KDB
#ifdef KDB_UNATTENDED
int debugger_on_panic = 0;
#else
int debugger_on_panic = 1;
#endif
SYSCTL_INT(_debug, OID_AUTO, debugger_on_panic, CTLFLAG_RW | CTLFLAG_TUN,
&debugger_on_panic, 0, "Run debugger on kernel panic");
TUNABLE_INT("debug.debugger_on_panic", &debugger_on_panic);
#ifdef KDB_TRACE
static int trace_on_panic = 1;
#else
static int trace_on_panic = 0;
#endif
SYSCTL_INT(_debug, OID_AUTO, trace_on_panic, CTLFLAG_RW | CTLFLAG_TUN,
&trace_on_panic, 0, "Print stack trace on kernel panic");
TUNABLE_INT("debug.trace_on_panic", &trace_on_panic);
#endif /* KDB */
static int sync_on_panic = 0;
SYSCTL_INT(_kern, OID_AUTO, sync_on_panic, CTLFLAG_RW | CTLFLAG_TUN,
&sync_on_panic, 0, "Do a sync before rebooting from a panic");
TUNABLE_INT("kern.sync_on_panic", &sync_on_panic);
SYSCTL_NODE(_kern, OID_AUTO, shutdown, CTLFLAG_RW, 0, "Shutdown environment");
#ifndef DIAGNOSTIC
static int show_busybufs;
#else
static int show_busybufs = 1;
#endif
SYSCTL_INT(_kern_shutdown, OID_AUTO, show_busybufs, CTLFLAG_RW,
&show_busybufs, 0, "");
/*
* Variable panicstr contains argument to first call to panic; used as flag
* to indicate that the kernel has already called panic.
*/
const char *panicstr;
int dumping; /* system is dumping */
int rebooting; /* system is rebooting */
static struct dumperinfo dumper; /* our selected dumper */
/* Context information for dump-debuggers. */
static struct pcb dumppcb; /* Registers. */
static lwpid_t dumptid; /* Thread ID. */
static void poweroff_wait(void *, int);
static void shutdown_halt(void *junk, int howto);
static void shutdown_panic(void *junk, int howto);
static void shutdown_reset(void *junk, int howto);
/* register various local shutdown events */
static void
shutdown_conf(void *unused)
{
EVENTHANDLER_REGISTER(shutdown_final, poweroff_wait, NULL,
SHUTDOWN_PRI_FIRST);
EVENTHANDLER_REGISTER(shutdown_final, shutdown_halt, NULL,
SHUTDOWN_PRI_LAST + 100);
EVENTHANDLER_REGISTER(shutdown_final, shutdown_panic, NULL,
SHUTDOWN_PRI_LAST + 100);
EVENTHANDLER_REGISTER(shutdown_final, shutdown_reset, NULL,
SHUTDOWN_PRI_LAST + 200);
}
SYSINIT(shutdown_conf, SI_SUB_INTRINSIC, SI_ORDER_ANY, shutdown_conf, NULL);
/*
* The system call that results in a reboot.
*/
/* ARGSUSED */
int
-reboot(struct thread *td, struct reboot_args *uap)
+sys_reboot(struct thread *td, struct reboot_args *uap)
{
int error;
error = 0;
#ifdef MAC
error = mac_system_check_reboot(td->td_ucred, uap->opt);
#endif
if (error == 0)
error = priv_check(td, PRIV_REBOOT);
if (error == 0) {
mtx_lock(&Giant);
kern_reboot(uap->opt);
mtx_unlock(&Giant);
}
return (error);
}
/*
* Called by events that want to shut down, e.g. <CTL><ALT><DEL> on a PC.
*/
static int shutdown_howto = 0;
void
shutdown_nice(int howto)
{
shutdown_howto = howto;
/* Send a signal to init(8) and have it shutdown the world */
if (initproc != NULL) {
PROC_LOCK(initproc);
- psignal(initproc, SIGINT);
+ kern_psignal(initproc, SIGINT);
PROC_UNLOCK(initproc);
} else {
/* No init(8) running, so simply reboot */
kern_reboot(RB_NOSYNC);
}
return;
}
static int waittime = -1;
static void
print_uptime(void)
{
int f;
struct timespec ts;
getnanouptime(&ts);
printf("Uptime: ");
f = 0;
if (ts.tv_sec >= 86400) {
printf("%ldd", (long)ts.tv_sec / 86400);
ts.tv_sec %= 86400;
f = 1;
}
if (f || ts.tv_sec >= 3600) {
printf("%ldh", (long)ts.tv_sec / 3600);
ts.tv_sec %= 3600;
f = 1;
}
if (f || ts.tv_sec >= 60) {
printf("%ldm", (long)ts.tv_sec / 60);
ts.tv_sec %= 60;
f = 1;
}
printf("%lds\n", (long)ts.tv_sec);
}
int
doadump(boolean_t textdump)
{
boolean_t coredump;
if (dumping)
return (EBUSY);
if (dumper.dumper == NULL)
return (ENXIO);
savectx(&dumppcb);
dumptid = curthread->td_tid;
dumping++;
coredump = TRUE;
#ifdef DDB
if (textdump && textdump_pending) {
coredump = FALSE;
textdump_dumpsys(&dumper);
}
#endif
if (coredump)
dumpsys(&dumper);
dumping--;
return (0);
}
static int
isbufbusy(struct buf *bp)
{
if (((bp->b_flags & (B_INVAL | B_PERSISTENT)) == 0 &&
BUF_ISLOCKED(bp)) ||
((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI))
return (1);
return (0);
}
/*
* Shutdown the system cleanly to prepare for reboot, halt, or power off.
*/
void
kern_reboot(int howto)
{
static int first_buf_printf = 1;
#if defined(SMP)
/*
* Bind us to CPU 0 so that all shutdown code runs there. Some
* systems don't shutdown properly (i.e., ACPI power off) if we
* run on another processor.
*/
thread_lock(curthread);
sched_bind(curthread, 0);
thread_unlock(curthread);
KASSERT(PCPU_GET(cpuid) == 0, ("%s: not running on cpu 0", __func__));
#endif
/* We're in the process of rebooting. */
rebooting = 1;
/* collect extra flags that shutdown_nice might have set */
howto |= shutdown_howto;
/* We are out of the debugger now. */
kdb_active = 0;
/*
* Do any callouts that should be done BEFORE syncing the filesystems.
*/
EVENTHANDLER_INVOKE(shutdown_pre_sync, howto);
/*
* Now sync filesystems
*/
if (!cold && (howto & RB_NOSYNC) == 0 && waittime < 0) {
register struct buf *bp;
int iter, nbusy, pbusy;
#ifndef PREEMPTION
int subiter;
#endif
waittime = 0;
#ifdef SW_WATCHDOG
wdog_kern_pat(WD_LASTVAL);
#endif
- sync(curthread, NULL);
+ sys_sync(curthread, NULL);
/*
* With soft updates, some buffers that are
* written will be remarked as dirty until other
* buffers are written.
*/
for (iter = pbusy = 0; iter < 20; iter++) {
nbusy = 0;
for (bp = &buf[nbuf]; --bp >= buf; )
if (isbufbusy(bp))
nbusy++;
if (nbusy == 0) {
if (first_buf_printf)
printf("All buffers synced.");
break;
}
if (first_buf_printf) {
printf("Syncing disks, buffers remaining... ");
first_buf_printf = 0;
}
printf("%d ", nbusy);
if (nbusy < pbusy)
iter = 0;
pbusy = nbusy;
#ifdef SW_WATCHDOG
wdog_kern_pat(WD_LASTVAL);
#endif
- sync(curthread, NULL);
+ sys_sync(curthread, NULL);
#ifdef PREEMPTION
/*
* Drop Giant and spin for a while to allow
* interrupt threads to run.
*/
DROP_GIANT();
DELAY(50000 * iter);
PICKUP_GIANT();
#else
/*
* Drop Giant and context switch several times to
* allow interrupt threads to run.
*/
DROP_GIANT();
for (subiter = 0; subiter < 50 * iter; subiter++) {
thread_lock(curthread);
mi_switch(SW_VOL, NULL);
thread_unlock(curthread);
DELAY(1000);
}
PICKUP_GIANT();
#endif
}
printf("\n");
/*
* Count only busy local buffers to prevent forcing
* a fsck if we're just a client of a wedged NFS server
*/
nbusy = 0;
for (bp = &buf[nbuf]; --bp >= buf; ) {
if (isbufbusy(bp)) {
#if 0
/* XXX: This is bogus. We should probably have a BO_REMOTE flag instead */
if (bp->b_dev == NULL) {
TAILQ_REMOVE(&mountlist,
bp->b_vp->v_mount, mnt_list);
continue;
}
#endif
nbusy++;
if (show_busybufs > 0) {
printf(
"%d: buf:%p, vnode:%p, flags:%0x, blkno:%jd, lblkno:%jd, buflock:",
nbusy, bp, bp->b_vp, bp->b_flags,
(intmax_t)bp->b_blkno,
(intmax_t)bp->b_lblkno);
BUF_LOCKPRINTINFO(bp);
if (show_busybufs > 1)
vn_printf(bp->b_vp,
"vnode content: ");
}
}
}
if (nbusy) {
/*
* Failed to sync all blocks. Indicate this and don't
* unmount filesystems (thus forcing an fsck on reboot).
*/
printf("Giving up on %d buffers\n", nbusy);
DELAY(5000000); /* 5 seconds */
} else {
if (!first_buf_printf)
printf("Final sync complete\n");
/*
* Unmount filesystems
*/
if (panicstr == 0)
vfs_unmountall();
}
swapoff_all();
DELAY(100000); /* wait for console output to finish */
}
print_uptime();
/*
* Ok, now do things that assume all filesystem activity has
* been completed.
*/
EVENTHANDLER_INVOKE(shutdown_post_sync, howto);
if ((howto & (RB_HALT|RB_DUMP)) == RB_DUMP && !cold && !dumping)
doadump(TRUE);
/* Now that we're going to really halt the system... */
EVENTHANDLER_INVOKE(shutdown_final, howto);
for(;;) ; /* safety against shutdown_reset not working */
/* NOTREACHED */
}
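/*
 * Illustrative sketch (not part of this file): the progress rule used by the
 * buffer-sync loop in kern_reboot() above -- the iteration counter is reset
 * whenever the number of busy buffers still drops, so the loop only gives up
 * after 20 rounds without forward progress.  count_busy_buffers() is a
 * hypothetical stand-in for the scan over buf[]; the sync call and watchdog
 * pat are omitted.
 */
static int
sync_progress_sketch(void)
{
        int iter, nbusy, pbusy;

        nbusy = 0;
        for (iter = pbusy = 0; iter < 20; iter++) {
                nbusy = count_busy_buffers();   /* hypothetical */
                if (nbusy == 0)
                        return (0);             /* everything flushed */
                if (nbusy < pbusy)
                        iter = 0;               /* still making progress */
                pbusy = nbusy;
        }
        return (nbusy);                         /* gave up with this many busy */
}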
/*
* If the shutdown was a clean halt, behave accordingly.
*/
static void
shutdown_halt(void *junk, int howto)
{
if (howto & RB_HALT) {
printf("\n");
printf("The operating system has halted.\n");
printf("Please press any key to reboot.\n\n");
switch (cngetc()) {
case -1: /* No console, just die */
cpu_halt();
/* NOTREACHED */
default:
howto &= ~RB_HALT;
break;
}
}
}
/*
* Check to see if the system panicked, pause and then reboot
* according to the specified delay.
*/
static void
shutdown_panic(void *junk, int howto)
{
int loop;
if (howto & RB_DUMP) {
if (PANIC_REBOOT_WAIT_TIME != 0) {
if (PANIC_REBOOT_WAIT_TIME != -1) {
printf("Automatic reboot in %d seconds - "
"press a key on the console to abort\n",
PANIC_REBOOT_WAIT_TIME);
for (loop = PANIC_REBOOT_WAIT_TIME * 10;
loop > 0; --loop) {
DELAY(1000 * 100); /* 1/10th second */
/* Did user type a key? */
if (cncheckc() != -1)
break;
}
if (!loop)
return;
}
} else { /* zero time specified - reboot NOW */
return;
}
printf("--> Press a key on the console to reboot,\n");
printf("--> or switch off the system now.\n");
cngetc();
}
}
/*
* Everything done, now reset
*/
static void
shutdown_reset(void *junk, int howto)
{
printf("Rebooting...\n");
DELAY(1000000); /* wait 1 sec for printf's to complete and be read */
/*
* Acquiring smp_ipi_mtx here has a double effect:
* - it disables interrupts avoiding CPU0 preemption
* by fast handlers (thus deadlocking against other CPUs)
* - it avoids deadlocks against smp_rendezvous() or, more
* generally, threads busy-waiting, with this spinlock held,
* and waiting for responses by threads on other CPUs
* (ie. smp_tlb_shootdown()).
*
* For the !SMP case it just needs to handle the former problem.
*/
#ifdef SMP
mtx_lock_spin(&smp_ipi_mtx);
#else
spinlock_enter();
#endif
/* cpu_boot(howto); */ /* doesn't do anything at the moment */
cpu_reset();
/* NOTREACHED */ /* assuming reset worked */
}
/*
* Panic is called on unresolvable fatal errors. It prints "panic: mesg",
* and then reboots. If we are called twice, then we avoid trying to sync
* the disks as this often leads to recursive panics.
*/
void
panic(const char *fmt, ...)
{
#ifdef SMP
static volatile u_int panic_cpu = NOCPU;
#endif
struct thread *td = curthread;
int bootopt, newpanic;
va_list ap;
static char buf[256];
critical_enter();
#ifdef SMP
/*
* We don't want multiple CPU's to panic at the same time, so we
* use panic_cpu as a simple spinlock. We have to keep checking
* panic_cpu if we are spinning in case the panic on the first
* CPU is canceled.
*/
if (panic_cpu != PCPU_GET(cpuid))
while (atomic_cmpset_int(&panic_cpu, NOCPU,
PCPU_GET(cpuid)) == 0)
while (panic_cpu != NOCPU)
; /* nothing */
#endif
bootopt = RB_AUTOBOOT;
newpanic = 0;
if (panicstr)
bootopt |= RB_NOSYNC;
else {
bootopt |= RB_DUMP;
panicstr = fmt;
newpanic = 1;
}
va_start(ap, fmt);
if (newpanic) {
(void)vsnprintf(buf, sizeof(buf), fmt, ap);
panicstr = buf;
printf("panic: %s\n", buf);
} else {
printf("panic: ");
vprintf(fmt, ap);
printf("\n");
}
va_end(ap);
#ifdef SMP
printf("cpuid = %d\n", PCPU_GET(cpuid));
#endif
#ifdef KDB
if (newpanic && trace_on_panic)
kdb_backtrace();
if (debugger_on_panic)
kdb_enter(KDB_WHY_PANIC, "panic");
#endif
/*thread_lock(td); */
td->td_flags |= TDF_INPANIC;
/* thread_unlock(td); */
if (!sync_on_panic)
bootopt |= RB_NOSYNC;
critical_exit();
kern_reboot(bootopt);
}
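/*
 * Illustrative sketch (not part of this file): the panic_cpu "simple
 * spinlock" described in the comment above, pulled out into a standalone
 * helper.  The function name is hypothetical; 'panic_cpu' is passed in so
 * the sketch stays self-contained.
 */
#ifdef SMP
static __inline void
panic_cpu_lock_sketch(volatile u_int *panic_cpu)
{

        if (*panic_cpu != PCPU_GET(cpuid))
                while (atomic_cmpset_int(panic_cpu, NOCPU,
                    PCPU_GET(cpuid)) == 0)
                        while (*panic_cpu != NOCPU)
                                ;       /* spin: another CPU is already panicking */
}
#endif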
/*
* Support for poweroff delay.
*
* Please note that setting this delay too short might power off your machine
* before the write cache on your hard disk has been flushed, leading to
* soft-updates inconsistencies.
*/
#ifndef POWEROFF_DELAY
# define POWEROFF_DELAY 5000
#endif
static int poweroff_delay = POWEROFF_DELAY;
SYSCTL_INT(_kern_shutdown, OID_AUTO, poweroff_delay, CTLFLAG_RW,
&poweroff_delay, 0, "");
static void
poweroff_wait(void *junk, int howto)
{
if (!(howto & RB_POWEROFF) || poweroff_delay <= 0)
return;
DELAY(poweroff_delay * 1000);
}
/*
* Some system processes (e.g. syncer) need to be stopped at appropriate
* points in their main loops prior to a system shutdown, so that they
* won't interfere with the shutdown process (e.g. by holding a disk buf
* to cause sync to fail). For each of these system processes, register
* shutdown_kproc() as a handler for one of the shutdown events.
*/
static int kproc_shutdown_wait = 60;
SYSCTL_INT(_kern_shutdown, OID_AUTO, kproc_shutdown_wait, CTLFLAG_RW,
&kproc_shutdown_wait, 0, "");
void
kproc_shutdown(void *arg, int howto)
{
struct proc *p;
int error;
if (panicstr)
return;
p = (struct proc *)arg;
printf("Waiting (max %d seconds) for system process `%s' to stop...",
kproc_shutdown_wait, p->p_comm);
error = kproc_suspend(p, kproc_shutdown_wait * hz);
if (error == EWOULDBLOCK)
printf("timed out\n");
else
printf("done\n");
}
void
kthread_shutdown(void *arg, int howto)
{
struct thread *td;
int error;
if (panicstr)
return;
td = (struct thread *)arg;
printf("Waiting (max %d seconds) for system thread `%s' to stop...",
kproc_shutdown_wait, td->td_name);
error = kthread_suspend(td, kproc_shutdown_wait * hz);
if (error == EWOULDBLOCK)
printf("timed out\n");
else
printf("done\n");
}
/* Registration of dumpers */
int
set_dumper(struct dumperinfo *di)
{
if (di == NULL) {
bzero(&dumper, sizeof dumper);
return (0);
}
if (dumper.dumper != NULL)
return (EBUSY);
dumper = *di;
return (0);
}
/* Call dumper with bounds checking. */
int
dump_write(struct dumperinfo *di, void *virtual, vm_offset_t physical,
off_t offset, size_t length)
{
if (length != 0 && (offset < di->mediaoffset ||
offset - di->mediaoffset + length > di->mediasize)) {
printf("Attempt to write outside dump device boundaries.\n"
"offset(%jd), mediaoffset(%jd), length(%ju), mediasize(%jd).\n",
(intmax_t)offset, (intmax_t)di->mediaoffset,
(uintmax_t)length, (intmax_t)di->mediasize);
return (ENOSPC);
}
return (di->dumper(di->priv, virtual, physical, offset, length));
}
void
mkdumpheader(struct kerneldumpheader *kdh, char *magic, uint32_t archver,
uint64_t dumplen, uint32_t blksz)
{
bzero(kdh, sizeof(*kdh));
strncpy(kdh->magic, magic, sizeof(kdh->magic));
strncpy(kdh->architecture, MACHINE_ARCH, sizeof(kdh->architecture));
kdh->version = htod32(KERNELDUMPVERSION);
kdh->architectureversion = htod32(archver);
kdh->dumplength = htod64(dumplen);
kdh->dumptime = htod64(time_second);
kdh->blocksize = htod32(blksz);
strncpy(kdh->hostname, prison0.pr_hostname, sizeof(kdh->hostname));
strncpy(kdh->versionstring, version, sizeof(kdh->versionstring));
if (panicstr != NULL)
strncpy(kdh->panicstring, panicstr, sizeof(kdh->panicstring));
kdh->parity = kerneldump_parity(kdh);
}
Index: head/sys/kern/kern_sig.c
===================================================================
--- head/sys/kern/kern_sig.c (revision 225616)
+++ head/sys/kern/kern_sig.c (revision 225617)
@@ -1,3453 +1,3453 @@
/*-
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_sig.c 8.7 (Berkeley) 4/18/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_kdtrace.h"
#include "opt_ktrace.h"
#include "opt_core.h"
#include "opt_procdesc.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/acct.h>
#include <sys/capability.h>
#include <sys/condvar.h>
#include <sys/event.h>
#include <sys/fcntl.h>
#include <sys/imgact.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/ktrace.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/procdesc.h>
#include <sys/posix4.h>
#include <sys/pioctl.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/sdt.h>
#include <sys/sbuf.h>
#include <sys/sleepqueue.h>
#include <sys/smp.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/syslog.h>
#include <sys/sysproto.h>
#include <sys/timers.h>
#include <sys/unistd.h>
#include <sys/wait.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
#include <sys/jail.h>
#include <machine/cpu.h>
#include <security/audit/audit.h>
#define ONSIG 32 /* NSIG for osig* syscalls. XXX. */
SDT_PROVIDER_DECLARE(proc);
SDT_PROBE_DEFINE(proc, kernel, , signal_send, signal-send);
SDT_PROBE_ARGTYPE(proc, kernel, , signal_send, 0, "struct thread *");
SDT_PROBE_ARGTYPE(proc, kernel, , signal_send, 1, "struct proc *");
SDT_PROBE_ARGTYPE(proc, kernel, , signal_send, 2, "int");
SDT_PROBE_DEFINE(proc, kernel, , signal_clear, signal-clear);
SDT_PROBE_ARGTYPE(proc, kernel, , signal_clear, 0, "int");
SDT_PROBE_ARGTYPE(proc, kernel, , signal_clear, 1, "ksiginfo_t *");
SDT_PROBE_DEFINE(proc, kernel, , signal_discard, signal-discard);
SDT_PROBE_ARGTYPE(proc, kernel, , signal_discard, 0, "struct thread *");
SDT_PROBE_ARGTYPE(proc, kernel, , signal_discard, 1, "struct proc *");
SDT_PROBE_ARGTYPE(proc, kernel, , signal_discard, 2, "int");
static int coredump(struct thread *);
static char *expand_name(const char *, uid_t, pid_t, struct thread *, int);
static int killpg1(struct thread *td, int sig, int pgid, int all,
ksiginfo_t *ksi);
static int issignal(struct thread *td, int stop_allowed);
static int sigprop(int sig);
static void tdsigwakeup(struct thread *, int, sig_t, int);
static void sig_suspend_threads(struct thread *, struct proc *, int);
static int filt_sigattach(struct knote *kn);
static void filt_sigdetach(struct knote *kn);
static int filt_signal(struct knote *kn, long hint);
static struct thread *sigtd(struct proc *p, int sig, int prop);
static void sigqueue_start(void);
static uma_zone_t ksiginfo_zone = NULL;
struct filterops sig_filtops = {
.f_isfd = 0,
.f_attach = filt_sigattach,
.f_detach = filt_sigdetach,
.f_event = filt_signal,
};
static int kern_logsigexit = 1;
SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW,
&kern_logsigexit, 0,
"Log processes quitting on abnormal signals to syslog(3)");
static int kern_forcesigexit = 1;
SYSCTL_INT(_kern, OID_AUTO, forcesigexit, CTLFLAG_RW,
&kern_forcesigexit, 0, "Force trap signal to be handled");
SYSCTL_NODE(_kern, OID_AUTO, sigqueue, CTLFLAG_RW, 0, "POSIX real time signal");
static int max_pending_per_proc = 128;
SYSCTL_INT(_kern_sigqueue, OID_AUTO, max_pending_per_proc, CTLFLAG_RW,
&max_pending_per_proc, 0, "Max pending signals per proc");
static int preallocate_siginfo = 1024;
TUNABLE_INT("kern.sigqueue.preallocate", &preallocate_siginfo);
SYSCTL_INT(_kern_sigqueue, OID_AUTO, preallocate, CTLFLAG_RD,
&preallocate_siginfo, 0, "Preallocated signal memory size");
static int signal_overflow = 0;
SYSCTL_INT(_kern_sigqueue, OID_AUTO, overflow, CTLFLAG_RD,
&signal_overflow, 0, "Number of signals that overflowed");
static int signal_alloc_fail = 0;
SYSCTL_INT(_kern_sigqueue, OID_AUTO, alloc_fail, CTLFLAG_RD,
&signal_alloc_fail, 0, "Number of signals that failed to be allocated");
SYSINIT(signal, SI_SUB_P1003_1B, SI_ORDER_FIRST+3, sigqueue_start, NULL);
/*
* Policy -- Can ucred cr1 send SIGIO to process cr2?
* Should use cr_cansignal() once cr_cansignal() allows SIGIO and SIGURG
* in the right situations.
*/
#define CANSIGIO(cr1, cr2) \
((cr1)->cr_uid == 0 || \
(cr1)->cr_ruid == (cr2)->cr_ruid || \
(cr1)->cr_uid == (cr2)->cr_ruid || \
(cr1)->cr_ruid == (cr2)->cr_uid || \
(cr1)->cr_uid == (cr2)->cr_uid)
static int sugid_coredump;
SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RW,
&sugid_coredump, 0, "Allow setuid and setgid processes to dump core");
static int do_coredump = 1;
SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW,
&do_coredump, 0, "Enable/Disable coredumps");
static int set_core_nodump_flag = 0;
SYSCTL_INT(_kern, OID_AUTO, nodump_coredump, CTLFLAG_RW, &set_core_nodump_flag,
0, "Enable setting the NODUMP flag on coredump files");
/*
* Signal properties and actions.
* The array below categorizes the signals and their default actions
* according to the following properties:
*/
#define SA_KILL 0x01 /* terminates process by default */
#define SA_CORE 0x02 /* ditto and coredumps */
#define SA_STOP 0x04 /* suspend process */
#define SA_TTYSTOP 0x08 /* ditto, from tty */
#define SA_IGNORE 0x10 /* ignore by default */
#define SA_CONT 0x20 /* continue if suspended */
#define SA_CANTMASK 0x40 /* non-maskable, catchable */
#define SA_PROC 0x80 /* deliverable to any thread */
static int sigproptbl[NSIG] = {
SA_KILL|SA_PROC, /* SIGHUP */
SA_KILL|SA_PROC, /* SIGINT */
SA_KILL|SA_CORE|SA_PROC, /* SIGQUIT */
SA_KILL|SA_CORE, /* SIGILL */
SA_KILL|SA_CORE, /* SIGTRAP */
SA_KILL|SA_CORE, /* SIGABRT */
SA_KILL|SA_CORE|SA_PROC, /* SIGEMT */
SA_KILL|SA_CORE, /* SIGFPE */
SA_KILL|SA_PROC, /* SIGKILL */
SA_KILL|SA_CORE, /* SIGBUS */
SA_KILL|SA_CORE, /* SIGSEGV */
SA_KILL|SA_CORE, /* SIGSYS */
SA_KILL|SA_PROC, /* SIGPIPE */
SA_KILL|SA_PROC, /* SIGALRM */
SA_KILL|SA_PROC, /* SIGTERM */
SA_IGNORE|SA_PROC, /* SIGURG */
SA_STOP|SA_PROC, /* SIGSTOP */
SA_STOP|SA_TTYSTOP|SA_PROC, /* SIGTSTP */
SA_IGNORE|SA_CONT|SA_PROC, /* SIGCONT */
SA_IGNORE|SA_PROC, /* SIGCHLD */
SA_STOP|SA_TTYSTOP|SA_PROC, /* SIGTTIN */
SA_STOP|SA_TTYSTOP|SA_PROC, /* SIGTTOU */
SA_IGNORE|SA_PROC, /* SIGIO */
SA_KILL, /* SIGXCPU */
SA_KILL, /* SIGXFSZ */
SA_KILL|SA_PROC, /* SIGVTALRM */
SA_KILL|SA_PROC, /* SIGPROF */
SA_IGNORE|SA_PROC, /* SIGWINCH */
SA_IGNORE|SA_PROC, /* SIGINFO */
SA_KILL|SA_PROC, /* SIGUSR1 */
SA_KILL|SA_PROC, /* SIGUSR2 */
};
static void reschedule_signals(struct proc *p, sigset_t block, int flags);
static void
sigqueue_start(void)
{
ksiginfo_zone = uma_zcreate("ksiginfo", sizeof(ksiginfo_t),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
uma_prealloc(ksiginfo_zone, preallocate_siginfo);
p31b_setcfg(CTL_P1003_1B_REALTIME_SIGNALS, _POSIX_REALTIME_SIGNALS);
p31b_setcfg(CTL_P1003_1B_RTSIG_MAX, SIGRTMAX - SIGRTMIN + 1);
p31b_setcfg(CTL_P1003_1B_SIGQUEUE_MAX, max_pending_per_proc);
}
ksiginfo_t *
ksiginfo_alloc(int wait)
{
int flags;
flags = M_ZERO;
if (! wait)
flags |= M_NOWAIT;
if (ksiginfo_zone != NULL)
return ((ksiginfo_t *)uma_zalloc(ksiginfo_zone, flags));
return (NULL);
}
void
ksiginfo_free(ksiginfo_t *ksi)
{
uma_zfree(ksiginfo_zone, ksi);
}
static __inline int
ksiginfo_tryfree(ksiginfo_t *ksi)
{
if (!(ksi->ksi_flags & KSI_EXT)) {
uma_zfree(ksiginfo_zone, ksi);
return (1);
}
return (0);
}
void
sigqueue_init(sigqueue_t *list, struct proc *p)
{
SIGEMPTYSET(list->sq_signals);
SIGEMPTYSET(list->sq_kill);
TAILQ_INIT(&list->sq_list);
list->sq_proc = p;
list->sq_flags = SQ_INIT;
}
/*
* Get a signal's ksiginfo.
* Return:
* 0 - signal not found
* others - signal number
*/
static int
sigqueue_get(sigqueue_t *sq, int signo, ksiginfo_t *si)
{
struct proc *p = sq->sq_proc;
struct ksiginfo *ksi, *next;
int count = 0;
KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
if (!SIGISMEMBER(sq->sq_signals, signo))
return (0);
if (SIGISMEMBER(sq->sq_kill, signo)) {
count++;
SIGDELSET(sq->sq_kill, signo);
}
TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) {
if (ksi->ksi_signo == signo) {
if (count == 0) {
TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
ksi->ksi_sigq = NULL;
ksiginfo_copy(ksi, si);
if (ksiginfo_tryfree(ksi) && p != NULL)
p->p_pendingcnt--;
}
if (++count > 1)
break;
}
}
if (count <= 1)
SIGDELSET(sq->sq_signals, signo);
si->ksi_signo = signo;
return (signo);
}
void
sigqueue_take(ksiginfo_t *ksi)
{
struct ksiginfo *kp;
struct proc *p;
sigqueue_t *sq;
if (ksi == NULL || (sq = ksi->ksi_sigq) == NULL)
return;
p = sq->sq_proc;
TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
ksi->ksi_sigq = NULL;
if (!(ksi->ksi_flags & KSI_EXT) && p != NULL)
p->p_pendingcnt--;
for (kp = TAILQ_FIRST(&sq->sq_list); kp != NULL;
kp = TAILQ_NEXT(kp, ksi_link)) {
if (kp->ksi_signo == ksi->ksi_signo)
break;
}
if (kp == NULL && !SIGISMEMBER(sq->sq_kill, ksi->ksi_signo))
SIGDELSET(sq->sq_signals, ksi->ksi_signo);
}
static int
sigqueue_add(sigqueue_t *sq, int signo, ksiginfo_t *si)
{
struct proc *p = sq->sq_proc;
struct ksiginfo *ksi;
int ret = 0;
KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
if (signo == SIGKILL || signo == SIGSTOP || si == NULL) {
SIGADDSET(sq->sq_kill, signo);
goto out_set_bit;
}
/* directly insert the ksi, don't copy it */
if (si->ksi_flags & KSI_INS) {
if (si->ksi_flags & KSI_HEAD)
TAILQ_INSERT_HEAD(&sq->sq_list, si, ksi_link);
else
TAILQ_INSERT_TAIL(&sq->sq_list, si, ksi_link);
si->ksi_sigq = sq;
goto out_set_bit;
}
if (__predict_false(ksiginfo_zone == NULL)) {
SIGADDSET(sq->sq_kill, signo);
goto out_set_bit;
}
if (p != NULL && p->p_pendingcnt >= max_pending_per_proc) {
signal_overflow++;
ret = EAGAIN;
} else if ((ksi = ksiginfo_alloc(0)) == NULL) {
signal_alloc_fail++;
ret = EAGAIN;
} else {
if (p != NULL)
p->p_pendingcnt++;
ksiginfo_copy(si, ksi);
ksi->ksi_signo = signo;
if (si->ksi_flags & KSI_HEAD)
TAILQ_INSERT_HEAD(&sq->sq_list, ksi, ksi_link);
else
TAILQ_INSERT_TAIL(&sq->sq_list, ksi, ksi_link);
ksi->ksi_sigq = sq;
}
if ((si->ksi_flags & KSI_TRAP) != 0 ||
(si->ksi_flags & KSI_SIGQ) == 0) {
if (ret != 0)
SIGADDSET(sq->sq_kill, signo);
ret = 0;
goto out_set_bit;
}
if (ret != 0)
return (ret);
out_set_bit:
SIGADDSET(sq->sq_signals, signo);
return (ret);
}
void
sigqueue_flush(sigqueue_t *sq)
{
struct proc *p = sq->sq_proc;
ksiginfo_t *ksi;
KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
if (p != NULL)
PROC_LOCK_ASSERT(p, MA_OWNED);
while ((ksi = TAILQ_FIRST(&sq->sq_list)) != NULL) {
TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
ksi->ksi_sigq = NULL;
if (ksiginfo_tryfree(ksi) && p != NULL)
p->p_pendingcnt--;
}
SIGEMPTYSET(sq->sq_signals);
SIGEMPTYSET(sq->sq_kill);
}
static void
sigqueue_move_set(sigqueue_t *src, sigqueue_t *dst, const sigset_t *set)
{
sigset_t tmp;
struct proc *p1, *p2;
ksiginfo_t *ksi, *next;
KASSERT(src->sq_flags & SQ_INIT, ("src sigqueue not inited"));
KASSERT(dst->sq_flags & SQ_INIT, ("dst sigqueue not inited"));
p1 = src->sq_proc;
p2 = dst->sq_proc;
/* Move siginfo to target list */
TAILQ_FOREACH_SAFE(ksi, &src->sq_list, ksi_link, next) {
if (SIGISMEMBER(*set, ksi->ksi_signo)) {
TAILQ_REMOVE(&src->sq_list, ksi, ksi_link);
if (p1 != NULL)
p1->p_pendingcnt--;
TAILQ_INSERT_TAIL(&dst->sq_list, ksi, ksi_link);
ksi->ksi_sigq = dst;
if (p2 != NULL)
p2->p_pendingcnt++;
}
}
/* Move pending bits to target list */
tmp = src->sq_kill;
SIGSETAND(tmp, *set);
SIGSETOR(dst->sq_kill, tmp);
SIGSETNAND(src->sq_kill, tmp);
tmp = src->sq_signals;
SIGSETAND(tmp, *set);
SIGSETOR(dst->sq_signals, tmp);
SIGSETNAND(src->sq_signals, tmp);
}
#if 0
static void
sigqueue_move(sigqueue_t *src, sigqueue_t *dst, int signo)
{
sigset_t set;
SIGEMPTYSET(set);
SIGADDSET(set, signo);
sigqueue_move_set(src, dst, &set);
}
#endif
static void
sigqueue_delete_set(sigqueue_t *sq, const sigset_t *set)
{
struct proc *p = sq->sq_proc;
ksiginfo_t *ksi, *next;
KASSERT(sq->sq_flags & SQ_INIT, ("src sigqueue not inited"));
/* Remove siginfo queue */
TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) {
if (SIGISMEMBER(*set, ksi->ksi_signo)) {
TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
ksi->ksi_sigq = NULL;
if (ksiginfo_tryfree(ksi) && p != NULL)
p->p_pendingcnt--;
}
}
SIGSETNAND(sq->sq_kill, *set);
SIGSETNAND(sq->sq_signals, *set);
}
void
sigqueue_delete(sigqueue_t *sq, int signo)
{
sigset_t set;
SIGEMPTYSET(set);
SIGADDSET(set, signo);
sigqueue_delete_set(sq, &set);
}
/* Remove a set of signals for a process */
static void
sigqueue_delete_set_proc(struct proc *p, const sigset_t *set)
{
sigqueue_t worklist;
struct thread *td0;
PROC_LOCK_ASSERT(p, MA_OWNED);
sigqueue_init(&worklist, NULL);
sigqueue_move_set(&p->p_sigqueue, &worklist, set);
FOREACH_THREAD_IN_PROC(p, td0)
sigqueue_move_set(&td0->td_sigqueue, &worklist, set);
sigqueue_flush(&worklist);
}
void
sigqueue_delete_proc(struct proc *p, int signo)
{
sigset_t set;
SIGEMPTYSET(set);
SIGADDSET(set, signo);
sigqueue_delete_set_proc(p, &set);
}
static void
sigqueue_delete_stopmask_proc(struct proc *p)
{
sigset_t set;
SIGEMPTYSET(set);
SIGADDSET(set, SIGSTOP);
SIGADDSET(set, SIGTSTP);
SIGADDSET(set, SIGTTIN);
SIGADDSET(set, SIGTTOU);
sigqueue_delete_set_proc(p, &set);
}
/*
* Determine the signal that should be delivered to process p, the current
* process; return 0 if none. If there is a pending stop signal with default
* action, the process stops in issignal().
*/
int
cursig(struct thread *td, int stop_allowed)
{
PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
KASSERT(stop_allowed == SIG_STOP_ALLOWED ||
stop_allowed == SIG_STOP_NOT_ALLOWED, ("cursig: stop_allowed"));
mtx_assert(&td->td_proc->p_sigacts->ps_mtx, MA_OWNED);
THREAD_LOCK_ASSERT(td, MA_NOTOWNED);
return (SIGPENDING(td) ? issignal(td, stop_allowed) : 0);
}
/*
* Arrange for ast() to handle unmasked pending signals on return to user
* mode. This must be called whenever a signal is added to td_sigqueue or
* unmasked in td_sigmask.
*/
void
signotify(struct thread *td)
{
struct proc *p;
p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
if (SIGPENDING(td)) {
thread_lock(td);
td->td_flags |= TDF_NEEDSIGCHK | TDF_ASTPENDING;
thread_unlock(td);
}
}
int
sigonstack(size_t sp)
{
struct thread *td = curthread;
return ((td->td_pflags & TDP_ALTSTACK) ?
#if defined(COMPAT_43)
((td->td_sigstk.ss_size == 0) ?
(td->td_sigstk.ss_flags & SS_ONSTACK) :
((sp - (size_t)td->td_sigstk.ss_sp) < td->td_sigstk.ss_size))
#else
((sp - (size_t)td->td_sigstk.ss_sp) < td->td_sigstk.ss_size)
#endif
: 0);
}
static __inline int
sigprop(int sig)
{
if (sig > 0 && sig < NSIG)
return (sigproptbl[_SIG_IDX(sig)]);
return (0);
}
int
sig_ffs(sigset_t *set)
{
int i;
for (i = 0; i < _SIG_WORDS; i++)
if (set->__bits[i])
return (ffs(set->__bits[i]) + (i * 32));
return (0);
}
/*
* kern_sigaction
* sigaction
* freebsd4_sigaction
* osigaction
*/
int
kern_sigaction(td, sig, act, oact, flags)
struct thread *td;
register int sig;
struct sigaction *act, *oact;
int flags;
{
struct sigacts *ps;
struct proc *p = td->td_proc;
if (!_SIG_VALID(sig))
return (EINVAL);
PROC_LOCK(p);
ps = p->p_sigacts;
mtx_lock(&ps->ps_mtx);
if (oact) {
oact->sa_mask = ps->ps_catchmask[_SIG_IDX(sig)];
oact->sa_flags = 0;
if (SIGISMEMBER(ps->ps_sigonstack, sig))
oact->sa_flags |= SA_ONSTACK;
if (!SIGISMEMBER(ps->ps_sigintr, sig))
oact->sa_flags |= SA_RESTART;
if (SIGISMEMBER(ps->ps_sigreset, sig))
oact->sa_flags |= SA_RESETHAND;
if (SIGISMEMBER(ps->ps_signodefer, sig))
oact->sa_flags |= SA_NODEFER;
if (SIGISMEMBER(ps->ps_siginfo, sig)) {
oact->sa_flags |= SA_SIGINFO;
oact->sa_sigaction =
(__siginfohandler_t *)ps->ps_sigact[_SIG_IDX(sig)];
} else
oact->sa_handler = ps->ps_sigact[_SIG_IDX(sig)];
if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDSTOP)
oact->sa_flags |= SA_NOCLDSTOP;
if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDWAIT)
oact->sa_flags |= SA_NOCLDWAIT;
}
if (act) {
if ((sig == SIGKILL || sig == SIGSTOP) &&
act->sa_handler != SIG_DFL) {
mtx_unlock(&ps->ps_mtx);
PROC_UNLOCK(p);
return (EINVAL);
}
/*
* Change setting atomically.
*/
ps->ps_catchmask[_SIG_IDX(sig)] = act->sa_mask;
SIG_CANTMASK(ps->ps_catchmask[_SIG_IDX(sig)]);
if (act->sa_flags & SA_SIGINFO) {
ps->ps_sigact[_SIG_IDX(sig)] =
(__sighandler_t *)act->sa_sigaction;
SIGADDSET(ps->ps_siginfo, sig);
} else {
ps->ps_sigact[_SIG_IDX(sig)] = act->sa_handler;
SIGDELSET(ps->ps_siginfo, sig);
}
if (!(act->sa_flags & SA_RESTART))
SIGADDSET(ps->ps_sigintr, sig);
else
SIGDELSET(ps->ps_sigintr, sig);
if (act->sa_flags & SA_ONSTACK)
SIGADDSET(ps->ps_sigonstack, sig);
else
SIGDELSET(ps->ps_sigonstack, sig);
if (act->sa_flags & SA_RESETHAND)
SIGADDSET(ps->ps_sigreset, sig);
else
SIGDELSET(ps->ps_sigreset, sig);
if (act->sa_flags & SA_NODEFER)
SIGADDSET(ps->ps_signodefer, sig);
else
SIGDELSET(ps->ps_signodefer, sig);
if (sig == SIGCHLD) {
if (act->sa_flags & SA_NOCLDSTOP)
ps->ps_flag |= PS_NOCLDSTOP;
else
ps->ps_flag &= ~PS_NOCLDSTOP;
if (act->sa_flags & SA_NOCLDWAIT) {
/*
* Paranoia: since SA_NOCLDWAIT is implemented
* by reparenting the dying child to PID 1 (and
* trust it to reap the zombie), PID 1 itself
* is forbidden to set SA_NOCLDWAIT.
*/
if (p->p_pid == 1)
ps->ps_flag &= ~PS_NOCLDWAIT;
else
ps->ps_flag |= PS_NOCLDWAIT;
} else
ps->ps_flag &= ~PS_NOCLDWAIT;
if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN)
ps->ps_flag |= PS_CLDSIGIGN;
else
ps->ps_flag &= ~PS_CLDSIGIGN;
}
/*
* Set bit in ps_sigignore for signals that are set to SIG_IGN,
* and for signals set to SIG_DFL where the default is to
* ignore. However, don't put SIGCONT in ps_sigignore, as we
* have to restart the process.
*/
if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN ||
(sigprop(sig) & SA_IGNORE &&
ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)) {
/* never to be seen again */
sigqueue_delete_proc(p, sig);
if (sig != SIGCONT)
/* easier in psignal */
SIGADDSET(ps->ps_sigignore, sig);
SIGDELSET(ps->ps_sigcatch, sig);
} else {
SIGDELSET(ps->ps_sigignore, sig);
if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)
SIGDELSET(ps->ps_sigcatch, sig);
else
SIGADDSET(ps->ps_sigcatch, sig);
}
#ifdef COMPAT_FREEBSD4
if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN ||
ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL ||
(flags & KSA_FREEBSD4) == 0)
SIGDELSET(ps->ps_freebsd4, sig);
else
SIGADDSET(ps->ps_freebsd4, sig);
#endif
#ifdef COMPAT_43
if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN ||
ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL ||
(flags & KSA_OSIGSET) == 0)
SIGDELSET(ps->ps_osigset, sig);
else
SIGADDSET(ps->ps_osigset, sig);
#endif
}
mtx_unlock(&ps->ps_mtx);
PROC_UNLOCK(p);
return (0);
}
#ifndef _SYS_SYSPROTO_H_
struct sigaction_args {
int sig;
struct sigaction *act;
struct sigaction *oact;
};
#endif
int
-sigaction(td, uap)
+sys_sigaction(td, uap)
struct thread *td;
register struct sigaction_args *uap;
{
struct sigaction act, oact;
register struct sigaction *actp, *oactp;
int error;
actp = (uap->act != NULL) ? &act : NULL;
oactp = (uap->oact != NULL) ? &oact : NULL;
if (actp) {
error = copyin(uap->act, actp, sizeof(act));
if (error)
return (error);
}
error = kern_sigaction(td, uap->sig, actp, oactp, 0);
if (oactp && !error)
error = copyout(oactp, uap->oact, sizeof(oact));
return (error);
}
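Illustration only (not part of this revision): a minimal userland sketch of how the sigaction(2) entry point above is typically exercised, assuming a standard POSIX environment; the handler name on_usr1 is invented for the example.

#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static volatile sig_atomic_t last_pid;

static void
on_usr1(int sig, siginfo_t *si, void *uc)
{

	last_pid = si->si_pid;	/* async-signal-safe: just record the sender */
}

int
main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = on_usr1;
	sa.sa_flags = SA_SIGINFO | SA_RESTART;	/* SA_SIGINFO selects sa_sigaction */
	sigemptyset(&sa.sa_mask);
	if (sigaction(SIGUSR1, &sa, NULL) == -1)
		return (1);
	pause();			/* send SIGUSR1 from another terminal */
	printf("SIGUSR1 from pid %ld\n", (long)last_pid);
	return (0);
}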
#ifdef COMPAT_FREEBSD4
#ifndef _SYS_SYSPROTO_H_
struct freebsd4_sigaction_args {
int sig;
struct sigaction *act;
struct sigaction *oact;
};
#endif
int
freebsd4_sigaction(td, uap)
struct thread *td;
register struct freebsd4_sigaction_args *uap;
{
struct sigaction act, oact;
register struct sigaction *actp, *oactp;
int error;
actp = (uap->act != NULL) ? &act : NULL;
oactp = (uap->oact != NULL) ? &oact : NULL;
if (actp) {
error = copyin(uap->act, actp, sizeof(act));
if (error)
return (error);
}
error = kern_sigaction(td, uap->sig, actp, oactp, KSA_FREEBSD4);
if (oactp && !error)
error = copyout(oactp, uap->oact, sizeof(oact));
return (error);
}
#endif /* COMPAT_FREEBSD4 */
#ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */
#ifndef _SYS_SYSPROTO_H_
struct osigaction_args {
int signum;
struct osigaction *nsa;
struct osigaction *osa;
};
#endif
int
osigaction(td, uap)
struct thread *td;
register struct osigaction_args *uap;
{
struct osigaction sa;
struct sigaction nsa, osa;
register struct sigaction *nsap, *osap;
int error;
if (uap->signum <= 0 || uap->signum >= ONSIG)
return (EINVAL);
nsap = (uap->nsa != NULL) ? &nsa : NULL;
osap = (uap->osa != NULL) ? &osa : NULL;
if (nsap) {
error = copyin(uap->nsa, &sa, sizeof(sa));
if (error)
return (error);
nsap->sa_handler = sa.sa_handler;
nsap->sa_flags = sa.sa_flags;
OSIG2SIG(sa.sa_mask, nsap->sa_mask);
}
error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET);
if (osap && !error) {
sa.sa_handler = osap->sa_handler;
sa.sa_flags = osap->sa_flags;
SIG2OSIG(osap->sa_mask, sa.sa_mask);
error = copyout(&sa, uap->osa, sizeof(sa));
}
return (error);
}
#if !defined(__i386__)
/* Avoid replicating the same stub everywhere */
int
osigreturn(td, uap)
struct thread *td;
struct osigreturn_args *uap;
{
return (nosys(td, (struct nosys_args *)uap));
}
#endif
#endif /* COMPAT_43 */
/*
* Initialize signal state for process 0;
* set to ignore signals that are ignored by default.
*/
void
siginit(p)
struct proc *p;
{
register int i;
struct sigacts *ps;
PROC_LOCK(p);
ps = p->p_sigacts;
mtx_lock(&ps->ps_mtx);
for (i = 1; i <= NSIG; i++)
if (sigprop(i) & SA_IGNORE && i != SIGCONT)
SIGADDSET(ps->ps_sigignore, i);
mtx_unlock(&ps->ps_mtx);
PROC_UNLOCK(p);
}
/*
* Reset signals for an exec of the specified process.
*/
void
execsigs(struct proc *p)
{
struct sigacts *ps;
int sig;
struct thread *td;
/*
* Reset caught signals. Held signals remain held
* through td_sigmask (unless they were caught,
* and are now ignored by default).
*/
PROC_LOCK_ASSERT(p, MA_OWNED);
td = FIRST_THREAD_IN_PROC(p);
ps = p->p_sigacts;
mtx_lock(&ps->ps_mtx);
while (SIGNOTEMPTY(ps->ps_sigcatch)) {
sig = sig_ffs(&ps->ps_sigcatch);
SIGDELSET(ps->ps_sigcatch, sig);
if (sigprop(sig) & SA_IGNORE) {
if (sig != SIGCONT)
SIGADDSET(ps->ps_sigignore, sig);
sigqueue_delete_proc(p, sig);
}
ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
}
/*
* Reset stack state to the user stack.
* Clear set of signals caught on the signal stack.
*/
td->td_sigstk.ss_flags = SS_DISABLE;
td->td_sigstk.ss_size = 0;
td->td_sigstk.ss_sp = 0;
td->td_pflags &= ~TDP_ALTSTACK;
/*
* Reset the "no zombies if child dies" flag, as Solaris does.
*/
ps->ps_flag &= ~(PS_NOCLDWAIT | PS_CLDSIGIGN);
if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN)
ps->ps_sigact[_SIG_IDX(SIGCHLD)] = SIG_DFL;
mtx_unlock(&ps->ps_mtx);
}
/*
* kern_sigprocmask()
*
* Manipulate signal mask.
*/
int
kern_sigprocmask(struct thread *td, int how, sigset_t *set, sigset_t *oset,
int flags)
{
sigset_t new_block, oset1;
struct proc *p;
int error;
p = td->td_proc;
if (!(flags & SIGPROCMASK_PROC_LOCKED))
PROC_LOCK(p);
if (oset != NULL)
*oset = td->td_sigmask;
error = 0;
if (set != NULL) {
switch (how) {
case SIG_BLOCK:
SIG_CANTMASK(*set);
oset1 = td->td_sigmask;
SIGSETOR(td->td_sigmask, *set);
new_block = td->td_sigmask;
SIGSETNAND(new_block, oset1);
break;
case SIG_UNBLOCK:
SIGSETNAND(td->td_sigmask, *set);
signotify(td);
goto out;
case SIG_SETMASK:
SIG_CANTMASK(*set);
oset1 = td->td_sigmask;
if (flags & SIGPROCMASK_OLD)
SIGSETLO(td->td_sigmask, *set);
else
td->td_sigmask = *set;
new_block = td->td_sigmask;
SIGSETNAND(new_block, oset1);
signotify(td);
break;
default:
error = EINVAL;
goto out;
}
/*
* The new_block set contains signals that were not previously
* blocked, but are blocked now.
*
* If we block any signal that was not previously blocked
* for td, and the process has that signal pending, try to
* schedule delivery of the signal to some thread that does
* not block it, possibly waking that thread up.
*/
if (p->p_numthreads != 1)
reschedule_signals(p, new_block, flags);
}
out:
if (!(flags & SIGPROCMASK_PROC_LOCKED))
PROC_UNLOCK(p);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct sigprocmask_args {
int how;
const sigset_t *set;
sigset_t *oset;
};
#endif
int
-sigprocmask(td, uap)
+sys_sigprocmask(td, uap)
register struct thread *td;
struct sigprocmask_args *uap;
{
sigset_t set, oset;
sigset_t *setp, *osetp;
int error;
setp = (uap->set != NULL) ? &set : NULL;
osetp = (uap->oset != NULL) ? &oset : NULL;
if (setp) {
error = copyin(uap->set, setp, sizeof(set));
if (error)
return (error);
}
error = kern_sigprocmask(td, uap->how, setp, osetp, 0);
if (osetp && !error) {
error = copyout(osetp, uap->oset, sizeof(oset));
}
return (error);
}
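Illustration only (not part of this revision): a short userland sketch of the sigprocmask(2)/sigpending(2) pair implemented above, showing a signal held pending while blocked.

#include <signal.h>
#include <stdio.h>

int
main(void)
{
	sigset_t blk, old, pend;

	sigemptyset(&blk);
	sigaddset(&blk, SIGINT);
	sigprocmask(SIG_BLOCK, &blk, &old);	/* SIG_BLOCK ORs the set into the mask */

	/* ...critical section: a SIGINT arriving now stays pending... */

	sigpending(&pend);
	if (sigismember(&pend, SIGINT))
		printf("SIGINT is pending\n");

	/* Restore the old mask; any pending SIGINT is then delivered. */
	sigprocmask(SIG_SETMASK, &old, NULL);
	return (0);
}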
#ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */
#ifndef _SYS_SYSPROTO_H_
struct osigprocmask_args {
int how;
osigset_t mask;
};
#endif
int
osigprocmask(td, uap)
register struct thread *td;
struct osigprocmask_args *uap;
{
sigset_t set, oset;
int error;
OSIG2SIG(uap->mask, set);
error = kern_sigprocmask(td, uap->how, &set, &oset, 1);
SIG2OSIG(oset, td->td_retval[0]);
return (error);
}
#endif /* COMPAT_43 */
int
-sigwait(struct thread *td, struct sigwait_args *uap)
+sys_sigwait(struct thread *td, struct sigwait_args *uap)
{
ksiginfo_t ksi;
sigset_t set;
int error;
error = copyin(uap->set, &set, sizeof(set));
if (error) {
td->td_retval[0] = error;
return (0);
}
error = kern_sigtimedwait(td, set, &ksi, NULL);
if (error) {
if (error == ERESTART)
return (error);
td->td_retval[0] = error;
return (0);
}
error = copyout(&ksi.ksi_signo, uap->sig, sizeof(ksi.ksi_signo));
td->td_retval[0] = error;
return (0);
}
int
-sigtimedwait(struct thread *td, struct sigtimedwait_args *uap)
+sys_sigtimedwait(struct thread *td, struct sigtimedwait_args *uap)
{
struct timespec ts;
struct timespec *timeout;
sigset_t set;
ksiginfo_t ksi;
int error;
if (uap->timeout) {
error = copyin(uap->timeout, &ts, sizeof(ts));
if (error)
return (error);
timeout = &ts;
} else
timeout = NULL;
error = copyin(uap->set, &set, sizeof(set));
if (error)
return (error);
error = kern_sigtimedwait(td, set, &ksi, timeout);
if (error)
return (error);
if (uap->info)
error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t));
if (error == 0)
td->td_retval[0] = ksi.ksi_signo;
return (error);
}
int
-sigwaitinfo(struct thread *td, struct sigwaitinfo_args *uap)
+sys_sigwaitinfo(struct thread *td, struct sigwaitinfo_args *uap)
{
ksiginfo_t ksi;
sigset_t set;
int error;
error = copyin(uap->set, &set, sizeof(set));
if (error)
return (error);
error = kern_sigtimedwait(td, set, &ksi, NULL);
if (error)
return (error);
if (uap->info)
error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t));
if (error == 0)
td->td_retval[0] = ksi.ksi_signo;
return (error);
}
int
kern_sigtimedwait(struct thread *td, sigset_t waitset, ksiginfo_t *ksi,
struct timespec *timeout)
{
struct sigacts *ps;
sigset_t saved_mask, new_block;
struct proc *p;
int error, sig, timo, timevalid = 0;
struct timespec rts, ets, ts;
struct timeval tv;
p = td->td_proc;
error = 0;
ets.tv_sec = 0;
ets.tv_nsec = 0;
if (timeout != NULL) {
if (timeout->tv_nsec >= 0 && timeout->tv_nsec < 1000000000) {
timevalid = 1;
getnanouptime(&rts);
ets = rts;
timespecadd(&ets, timeout);
}
}
ksiginfo_init(ksi);
/* Some signals cannot be waited for. */
SIG_CANTMASK(waitset);
ps = p->p_sigacts;
PROC_LOCK(p);
saved_mask = td->td_sigmask;
SIGSETNAND(td->td_sigmask, waitset);
for (;;) {
mtx_lock(&ps->ps_mtx);
sig = cursig(td, SIG_STOP_ALLOWED);
mtx_unlock(&ps->ps_mtx);
if (sig != 0 && SIGISMEMBER(waitset, sig)) {
if (sigqueue_get(&td->td_sigqueue, sig, ksi) != 0 ||
sigqueue_get(&p->p_sigqueue, sig, ksi) != 0) {
error = 0;
break;
}
}
if (error != 0)
break;
/*
* POSIX says this must be checked after looking for pending
* signals.
*/
if (timeout != NULL) {
if (!timevalid) {
error = EINVAL;
break;
}
getnanouptime(&rts);
if (timespeccmp(&rts, &ets, >=)) {
error = EAGAIN;
break;
}
ts = ets;
timespecsub(&ts, &rts);
TIMESPEC_TO_TIMEVAL(&tv, &ts);
timo = tvtohz(&tv);
} else {
timo = 0;
}
error = msleep(ps, &p->p_mtx, PPAUSE|PCATCH, "sigwait", timo);
if (timeout != NULL) {
if (error == ERESTART) {
/* The timeout cannot be restarted. */
error = EINTR;
} else if (error == EAGAIN) {
/* We will recalculate the timeout ourselves. */
error = 0;
}
}
}
new_block = saved_mask;
SIGSETNAND(new_block, td->td_sigmask);
td->td_sigmask = saved_mask;
/*
* Fewer signals can be delivered to us, reschedule signal
* notification.
*/
if (p->p_numthreads != 1)
reschedule_signals(p, new_block, 0);
if (error == 0) {
SDT_PROBE(proc, kernel, , signal_clear, sig, ksi, 0, 0, 0);
if (ksi->ksi_code == SI_TIMER)
itimer_accept(p, ksi->ksi_timerid, ksi);
#ifdef KTRACE
if (KTRPOINT(td, KTR_PSIG)) {
sig_t action;
mtx_lock(&ps->ps_mtx);
action = ps->ps_sigact[_SIG_IDX(sig)];
mtx_unlock(&ps->ps_mtx);
ktrpsig(sig, action, &td->td_sigmask, ksi->ksi_code);
}
#endif
if (sig == SIGKILL)
sigexit(td, sig);
}
PROC_UNLOCK(p);
return (error);
}
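Illustration only (not part of this revision): a userland sketch of sigtimedwait(2); the EAGAIN-on-timeout behaviour corresponds to the timeout handling in kern_sigtimedwait() above.

#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <time.h>

int
main(void)
{
	sigset_t set;
	siginfo_t info;
	struct timespec ts = { 2, 0 };	/* wait at most two seconds */
	int sig;

	sigemptyset(&set);
	sigaddset(&set, SIGUSR1);
	sigprocmask(SIG_BLOCK, &set, NULL);	/* keep SIGUSR1 pending for the wait */

	sig = sigtimedwait(&set, &info, &ts);
	if (sig == -1 && errno == EAGAIN)
		printf("timed out\n");
	else if (sig > 0)
		printf("got signal %d (si_code %d)\n", sig, info.si_code);
	return (0);
}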
#ifndef _SYS_SYSPROTO_H_
struct sigpending_args {
sigset_t *set;
};
#endif
int
-sigpending(td, uap)
+sys_sigpending(td, uap)
struct thread *td;
struct sigpending_args *uap;
{
struct proc *p = td->td_proc;
sigset_t pending;
PROC_LOCK(p);
pending = p->p_sigqueue.sq_signals;
SIGSETOR(pending, td->td_sigqueue.sq_signals);
PROC_UNLOCK(p);
return (copyout(&pending, uap->set, sizeof(sigset_t)));
}
#ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */
#ifndef _SYS_SYSPROTO_H_
struct osigpending_args {
int dummy;
};
#endif
int
osigpending(td, uap)
struct thread *td;
struct osigpending_args *uap;
{
struct proc *p = td->td_proc;
sigset_t pending;
PROC_LOCK(p);
pending = p->p_sigqueue.sq_signals;
SIGSETOR(pending, td->td_sigqueue.sq_signals);
PROC_UNLOCK(p);
SIG2OSIG(pending, td->td_retval[0]);
return (0);
}
#endif /* COMPAT_43 */
#if defined(COMPAT_43)
/*
* Generalized interface signal handler, 4.3-compatible.
*/
#ifndef _SYS_SYSPROTO_H_
struct osigvec_args {
int signum;
struct sigvec *nsv;
struct sigvec *osv;
};
#endif
/* ARGSUSED */
int
osigvec(td, uap)
struct thread *td;
register struct osigvec_args *uap;
{
struct sigvec vec;
struct sigaction nsa, osa;
register struct sigaction *nsap, *osap;
int error;
if (uap->signum <= 0 || uap->signum >= ONSIG)
return (EINVAL);
nsap = (uap->nsv != NULL) ? &nsa : NULL;
osap = (uap->osv != NULL) ? &osa : NULL;
if (nsap) {
error = copyin(uap->nsv, &vec, sizeof(vec));
if (error)
return (error);
nsap->sa_handler = vec.sv_handler;
OSIG2SIG(vec.sv_mask, nsap->sa_mask);
nsap->sa_flags = vec.sv_flags;
nsap->sa_flags ^= SA_RESTART; /* opposite of SV_INTERRUPT */
}
error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET);
if (osap && !error) {
vec.sv_handler = osap->sa_handler;
SIG2OSIG(osap->sa_mask, vec.sv_mask);
vec.sv_flags = osap->sa_flags;
vec.sv_flags &= ~SA_NOCLDWAIT;
vec.sv_flags ^= SA_RESTART;
error = copyout(&vec, uap->osv, sizeof(vec));
}
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct osigblock_args {
int mask;
};
#endif
int
osigblock(td, uap)
register struct thread *td;
struct osigblock_args *uap;
{
sigset_t set, oset;
OSIG2SIG(uap->mask, set);
kern_sigprocmask(td, SIG_BLOCK, &set, &oset, 0);
SIG2OSIG(oset, td->td_retval[0]);
return (0);
}
#ifndef _SYS_SYSPROTO_H_
struct osigsetmask_args {
int mask;
};
#endif
int
osigsetmask(td, uap)
struct thread *td;
struct osigsetmask_args *uap;
{
sigset_t set, oset;
OSIG2SIG(uap->mask, set);
kern_sigprocmask(td, SIG_SETMASK, &set, &oset, 0);
SIG2OSIG(oset, td->td_retval[0]);
return (0);
}
#endif /* COMPAT_43 */
/*
* Suspend calling thread until signal, providing mask to be set in the
* meantime.
*/
#ifndef _SYS_SYSPROTO_H_
struct sigsuspend_args {
const sigset_t *sigmask;
};
#endif
/* ARGSUSED */
int
-sigsuspend(td, uap)
+sys_sigsuspend(td, uap)
struct thread *td;
struct sigsuspend_args *uap;
{
sigset_t mask;
int error;
error = copyin(uap->sigmask, &mask, sizeof(mask));
if (error)
return (error);
return (kern_sigsuspend(td, mask));
}
int
kern_sigsuspend(struct thread *td, sigset_t mask)
{
struct proc *p = td->td_proc;
int has_sig, sig;
/*
* When returning from sigsuspend, we want
* the old mask to be restored after the
* signal handler has finished. Thus, we
* save it here and mark the sigacts structure
* to indicate this.
*/
PROC_LOCK(p);
kern_sigprocmask(td, SIG_SETMASK, &mask, &td->td_oldsigmask,
SIGPROCMASK_PROC_LOCKED);
td->td_pflags |= TDP_OLDMASK;
/*
* Process signals now. Otherwise, we can get a spurious wakeup
* when a signal enters the process queue but is delivered to
* another thread; sigsuspend should return only on delivery of
* a signal to this thread.
*/
(p->p_sysent->sv_set_syscall_retval)(td, EINTR);
for (has_sig = 0; !has_sig;) {
while (msleep(&p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH, "pause",
0) == 0)
/* void */;
thread_suspend_check(0);
mtx_lock(&p->p_sigacts->ps_mtx);
while ((sig = cursig(td, SIG_STOP_ALLOWED)) != 0)
has_sig += postsig(sig);
mtx_unlock(&p->p_sigacts->ps_mtx);
}
PROC_UNLOCK(p);
return (EJUSTRETURN);
}
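Illustration only (not part of this revision): the classic race-free wait pattern that kern_sigsuspend() above supports, with the old mask restored via td_oldsigmask after the handler runs.

#include <signal.h>

static volatile sig_atomic_t done;

static void
on_usr1(int sig)
{

	done = 1;
}

int
main(void)
{
	sigset_t blk, suspmask;

	signal(SIGUSR1, on_usr1);

	sigemptyset(&blk);
	sigaddset(&blk, SIGUSR1);
	sigprocmask(SIG_BLOCK, &blk, &suspmask);	/* save the previous mask */
	sigdelset(&suspmask, SIGUSR1);			/* unblock it only while suspended */

	while (!done)
		sigsuspend(&suspmask);	/* atomically swaps the mask and sleeps;
					   always returns -1 with errno == EINTR */

	sigprocmask(SIG_UNBLOCK, &blk, NULL);
	return (0);
}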
#ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */
/*
* Compatibility sigsuspend call for old binaries. Note nonstandard calling
* convention: libc stub passes mask, not pointer, to save a copyin.
*/
#ifndef _SYS_SYSPROTO_H_
struct osigsuspend_args {
osigset_t mask;
};
#endif
/* ARGSUSED */
int
osigsuspend(td, uap)
struct thread *td;
struct osigsuspend_args *uap;
{
sigset_t mask;
OSIG2SIG(uap->mask, mask);
return (kern_sigsuspend(td, mask));
}
#endif /* COMPAT_43 */
#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct osigstack_args {
struct sigstack *nss;
struct sigstack *oss;
};
#endif
/* ARGSUSED */
int
osigstack(td, uap)
struct thread *td;
register struct osigstack_args *uap;
{
struct sigstack nss, oss;
int error = 0;
if (uap->nss != NULL) {
error = copyin(uap->nss, &nss, sizeof(nss));
if (error)
return (error);
}
oss.ss_sp = td->td_sigstk.ss_sp;
oss.ss_onstack = sigonstack(cpu_getstack(td));
if (uap->nss != NULL) {
td->td_sigstk.ss_sp = nss.ss_sp;
td->td_sigstk.ss_size = 0;
td->td_sigstk.ss_flags |= nss.ss_onstack & SS_ONSTACK;
td->td_pflags |= TDP_ALTSTACK;
}
if (uap->oss != NULL)
error = copyout(&oss, uap->oss, sizeof(oss));
return (error);
}
#endif /* COMPAT_43 */
#ifndef _SYS_SYSPROTO_H_
struct sigaltstack_args {
stack_t *ss;
stack_t *oss;
};
#endif
/* ARGSUSED */
int
-sigaltstack(td, uap)
+sys_sigaltstack(td, uap)
struct thread *td;
register struct sigaltstack_args *uap;
{
stack_t ss, oss;
int error;
if (uap->ss != NULL) {
error = copyin(uap->ss, &ss, sizeof(ss));
if (error)
return (error);
}
error = kern_sigaltstack(td, (uap->ss != NULL) ? &ss : NULL,
(uap->oss != NULL) ? &oss : NULL);
if (error)
return (error);
if (uap->oss != NULL)
error = copyout(&oss, uap->oss, sizeof(stack_t));
return (error);
}
int
kern_sigaltstack(struct thread *td, stack_t *ss, stack_t *oss)
{
struct proc *p = td->td_proc;
int oonstack;
oonstack = sigonstack(cpu_getstack(td));
if (oss != NULL) {
*oss = td->td_sigstk;
oss->ss_flags = (td->td_pflags & TDP_ALTSTACK)
? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
}
if (ss != NULL) {
if (oonstack)
return (EPERM);
if ((ss->ss_flags & ~SS_DISABLE) != 0)
return (EINVAL);
if (!(ss->ss_flags & SS_DISABLE)) {
if (ss->ss_size < p->p_sysent->sv_minsigstksz)
return (ENOMEM);
td->td_sigstk = *ss;
td->td_pflags |= TDP_ALTSTACK;
} else {
td->td_pflags &= ~TDP_ALTSTACK;
}
}
return (0);
}
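Illustration only (not part of this revision): a userland sketch of sigaltstack(2) plus SA_ONSTACK, which exercises the sv_minsigstksz check and TDP_ALTSTACK handling in kern_sigaltstack() above.

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void
on_segv(int sig)
{

	/* Runs on the alternate stack even if the normal stack is exhausted. */
	_Exit(1);
}

int
main(void)
{
	stack_t ss;
	struct sigaction sa;

	ss.ss_sp = malloc(SIGSTKSZ);
	ss.ss_size = SIGSTKSZ;		/* sizes below sv_minsigstksz get ENOMEM */
	ss.ss_flags = 0;
	if (sigaltstack(&ss, NULL) == -1)
		perror("sigaltstack");

	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = on_segv;
	sa.sa_flags = SA_ONSTACK;	/* marks the signal in ps_sigonstack */
	sigemptyset(&sa.sa_mask);
	sigaction(SIGSEGV, &sa, NULL);
	/* ...a runaway recursion would now be caught on the alternate stack... */
	return (0);
}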
/*
* Common code for kill process group/broadcast kill.
* cp is calling process.
*/
static int
killpg1(struct thread *td, int sig, int pgid, int all, ksiginfo_t *ksi)
{
struct proc *p;
struct pgrp *pgrp;
int nfound = 0;
if (all) {
/*
* broadcast
*/
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_pid <= 1 || p->p_flag & P_SYSTEM ||
p == td->td_proc || p->p_state == PRS_NEW) {
PROC_UNLOCK(p);
continue;
}
if (p_cansignal(td, p, sig) == 0) {
nfound++;
if (sig)
pksignal(p, sig, ksi);
}
PROC_UNLOCK(p);
}
sx_sunlock(&allproc_lock);
} else {
sx_slock(&proctree_lock);
if (pgid == 0) {
/*
* zero pgid means send to my process group.
*/
pgrp = td->td_proc->p_pgrp;
PGRP_LOCK(pgrp);
} else {
pgrp = pgfind(pgid);
if (pgrp == NULL) {
sx_sunlock(&proctree_lock);
return (ESRCH);
}
}
sx_sunlock(&proctree_lock);
LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
PROC_LOCK(p);
if (p->p_pid <= 1 || p->p_flag & P_SYSTEM ||
p->p_state == PRS_NEW) {
PROC_UNLOCK(p);
continue;
}
if (p_cansignal(td, p, sig) == 0) {
nfound++;
if (sig)
pksignal(p, sig, ksi);
}
PROC_UNLOCK(p);
}
PGRP_UNLOCK(pgrp);
}
return (nfound ? 0 : ESRCH);
}
#ifndef _SYS_SYSPROTO_H_
struct kill_args {
int pid;
int signum;
};
#endif
/* ARGSUSED */
int
-kill(struct thread *td, struct kill_args *uap)
+sys_kill(struct thread *td, struct kill_args *uap)
{
ksiginfo_t ksi;
struct proc *p;
int error;
AUDIT_ARG_SIGNUM(uap->signum);
AUDIT_ARG_PID(uap->pid);
if ((u_int)uap->signum > _SIG_MAXSIG)
return (EINVAL);
ksiginfo_init(&ksi);
ksi.ksi_signo = uap->signum;
ksi.ksi_code = SI_USER;
ksi.ksi_pid = td->td_proc->p_pid;
ksi.ksi_uid = td->td_ucred->cr_ruid;
if (uap->pid > 0) {
/* kill single process */
if ((p = pfind(uap->pid)) == NULL) {
if ((p = zpfind(uap->pid)) == NULL)
return (ESRCH);
}
AUDIT_ARG_PROCESS(p);
error = p_cansignal(td, p, uap->signum);
if (error == 0 && uap->signum)
pksignal(p, uap->signum, &ksi);
PROC_UNLOCK(p);
return (error);
}
switch (uap->pid) {
case -1: /* broadcast signal */
return (killpg1(td, uap->signum, 0, 1, &ksi));
case 0: /* signal own process group */
return (killpg1(td, uap->signum, 0, 0, &ksi));
default: /* negative explicit process group */
return (killpg1(td, uap->signum, -uap->pid, 0, &ksi));
}
/* NOTREACHED */
}
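Illustration only (not part of this revision): a userland sketch of kill(2) covering the signal-0 permission probe and the pid conventions that are dispatched to killpg1() above.

#include <sys/types.h>
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	pid_t target = getppid();

	/* Signal number 0: permission and existence check only, nothing sent. */
	if (kill(target, 0) == 0)
		printf("pid %ld exists and may be signalled\n", (long)target);
	else if (errno == ESRCH)
		printf("pid %ld: no such process\n", (long)target);

	/*
	 * pid == -1 broadcasts, pid == 0 signals the caller's process
	 * group, and a negative pid names an explicit process group.
	 */
	return (0);
}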
int
-pdkill(td, uap)
+sys_pdkill(td, uap)
struct thread *td;
struct pdkill_args *uap;
{
#ifdef PROCDESC
struct proc *p;
int error;
AUDIT_ARG_SIGNUM(uap->signum);
AUDIT_ARG_FD(uap->fd);
if ((u_int)uap->signum > _SIG_MAXSIG)
return (EINVAL);
error = procdesc_find(td, uap->fd, CAP_PDKILL, &p);
if (error)
return (error);
AUDIT_ARG_PROCESS(p);
error = p_cansignal(td, p, uap->signum);
if (error == 0 && uap->signum)
- psignal(p, uap->signum);
+ kern_psignal(p, uap->signum);
PROC_UNLOCK(p);
return (error);
#else
return (ENOSYS);
#endif
}
#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct okillpg_args {
int pgid;
int signum;
};
#endif
/* ARGSUSED */
int
okillpg(struct thread *td, struct okillpg_args *uap)
{
ksiginfo_t ksi;
AUDIT_ARG_SIGNUM(uap->signum);
AUDIT_ARG_PID(uap->pgid);
if ((u_int)uap->signum > _SIG_MAXSIG)
return (EINVAL);
ksiginfo_init(&ksi);
ksi.ksi_signo = uap->signum;
ksi.ksi_code = SI_USER;
ksi.ksi_pid = td->td_proc->p_pid;
ksi.ksi_uid = td->td_ucred->cr_ruid;
return (killpg1(td, uap->signum, uap->pgid, 0, &ksi));
}
#endif /* COMPAT_43 */
#ifndef _SYS_SYSPROTO_H_
struct sigqueue_args {
pid_t pid;
int signum;
/* union sigval */ void *value;
};
#endif
int
-sigqueue(struct thread *td, struct sigqueue_args *uap)
+sys_sigqueue(struct thread *td, struct sigqueue_args *uap)
{
ksiginfo_t ksi;
struct proc *p;
int error;
if ((u_int)uap->signum > _SIG_MAXSIG)
return (EINVAL);
/*
* The specification says sigqueue can only send a signal to
* a single process.
*/
if (uap->pid <= 0)
return (EINVAL);
if ((p = pfind(uap->pid)) == NULL) {
if ((p = zpfind(uap->pid)) == NULL)
return (ESRCH);
}
error = p_cansignal(td, p, uap->signum);
if (error == 0 && uap->signum != 0) {
ksiginfo_init(&ksi);
ksi.ksi_flags = KSI_SIGQ;
ksi.ksi_signo = uap->signum;
ksi.ksi_code = SI_QUEUE;
ksi.ksi_pid = td->td_proc->p_pid;
ksi.ksi_uid = td->td_ucred->cr_ruid;
ksi.ksi_value.sival_ptr = uap->value;
error = pksignal(p, ksi.ksi_signo, &ksi);
}
PROC_UNLOCK(p);
return (error);
}
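Illustration only (not part of this revision): a userland sketch of sigqueue(2) sending a value to the caller itself and reading it back with sigwaitinfo(2).

#include <signal.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	sigset_t set;
	siginfo_t info;
	union sigval val;

	sigemptyset(&set);
	sigaddset(&set, SIGUSR2);
	sigprocmask(SIG_BLOCK, &set, NULL);	/* keep the queued signal pending */

	val.sival_int = 42;
	if (sigqueue(getpid(), SIGUSR2, val) == -1)
		perror("sigqueue");

	sigwaitinfo(&set, &info);		/* si_code will be SI_QUEUE */
	printf("signal %d, si_code %d, value %d\n",
	    info.si_signo, info.si_code, info.si_value.sival_int);
	return (0);
}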
/*
* Send a signal to a process group.
*/
void
gsignal(int pgid, int sig, ksiginfo_t *ksi)
{
struct pgrp *pgrp;
if (pgid != 0) {
sx_slock(&proctree_lock);
pgrp = pgfind(pgid);
sx_sunlock(&proctree_lock);
if (pgrp != NULL) {
pgsignal(pgrp, sig, 0, ksi);
PGRP_UNLOCK(pgrp);
}
}
}
/*
* Send a signal to a process group. If checkctty is 1,
* limit to members which have a controlling terminal.
*/
void
pgsignal(struct pgrp *pgrp, int sig, int checkctty, ksiginfo_t *ksi)
{
struct proc *p;
if (pgrp) {
PGRP_LOCK_ASSERT(pgrp, MA_OWNED);
LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
PROC_LOCK(p);
if (p->p_state == PRS_NORMAL &&
(checkctty == 0 || p->p_flag & P_CONTROLT))
pksignal(p, sig, ksi);
PROC_UNLOCK(p);
}
}
}
/*
* Send a signal caused by a trap to the current thread. If it will be
* caught immediately, deliver it with correct code. Otherwise, post it
* normally.
*/
void
trapsignal(struct thread *td, ksiginfo_t *ksi)
{
struct sigacts *ps;
sigset_t mask;
struct proc *p;
int sig;
int code;
p = td->td_proc;
sig = ksi->ksi_signo;
code = ksi->ksi_code;
KASSERT(_SIG_VALID(sig), ("invalid signal"));
PROC_LOCK(p);
ps = p->p_sigacts;
mtx_lock(&ps->ps_mtx);
if ((p->p_flag & P_TRACED) == 0 && SIGISMEMBER(ps->ps_sigcatch, sig) &&
!SIGISMEMBER(td->td_sigmask, sig)) {
td->td_ru.ru_nsignals++;
#ifdef KTRACE
if (KTRPOINT(curthread, KTR_PSIG))
ktrpsig(sig, ps->ps_sigact[_SIG_IDX(sig)],
&td->td_sigmask, code);
#endif
(*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)],
ksi, &td->td_sigmask);
mask = ps->ps_catchmask[_SIG_IDX(sig)];
if (!SIGISMEMBER(ps->ps_signodefer, sig))
SIGADDSET(mask, sig);
kern_sigprocmask(td, SIG_BLOCK, &mask, NULL,
SIGPROCMASK_PROC_LOCKED | SIGPROCMASK_PS_LOCKED);
if (SIGISMEMBER(ps->ps_sigreset, sig)) {
/*
* See kern_sigaction() for origin of this code.
*/
SIGDELSET(ps->ps_sigcatch, sig);
if (sig != SIGCONT &&
sigprop(sig) & SA_IGNORE)
SIGADDSET(ps->ps_sigignore, sig);
ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
}
mtx_unlock(&ps->ps_mtx);
} else {
/*
* Avoid a possible infinite loop if the thread is
* masking the signal or the process is ignoring it.
*/
if (kern_forcesigexit &&
(SIGISMEMBER(td->td_sigmask, sig) ||
ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN)) {
SIGDELSET(td->td_sigmask, sig);
SIGDELSET(ps->ps_sigcatch, sig);
SIGDELSET(ps->ps_sigignore, sig);
ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
}
mtx_unlock(&ps->ps_mtx);
p->p_code = code; /* XXX for core dump/debugger */
p->p_sig = sig; /* XXX to verify code */
tdsendsignal(p, td, sig, ksi);
}
PROC_UNLOCK(p);
}
static struct thread *
sigtd(struct proc *p, int sig, int prop)
{
struct thread *td, *signal_td;
PROC_LOCK_ASSERT(p, MA_OWNED);
/*
* Check if current thread can handle the signal without
* switching context to another thread.
*/
if (curproc == p && !SIGISMEMBER(curthread->td_sigmask, sig))
return (curthread);
signal_td = NULL;
FOREACH_THREAD_IN_PROC(p, td) {
if (!SIGISMEMBER(td->td_sigmask, sig)) {
signal_td = td;
break;
}
}
if (signal_td == NULL)
signal_td = FIRST_THREAD_IN_PROC(p);
return (signal_td);
}
/*
* Send the signal to the process. If the signal has an action, the action
* is usually performed by the target process rather than the caller; we add
* the signal to the set of pending signals for the process.
*
* Exceptions:
* o When a stop signal is sent to a sleeping process that takes the
* default action, the process is stopped without awakening it.
* o SIGCONT restarts stopped processes (or puts them back to sleep)
* regardless of the signal action (eg, blocked or ignored).
*
* Other ignored signals are discarded immediately.
*
* NB: This function may be entered from the debugger via the "kill" DDB
* command. There is little that can be done to mitigate the possibly messy
* side effects of this unwise possibility.
*/
void
-psignal(struct proc *p, int sig)
+kern_psignal(struct proc *p, int sig)
{
ksiginfo_t ksi;
ksiginfo_init(&ksi);
ksi.ksi_signo = sig;
ksi.ksi_code = SI_KERNEL;
(void) tdsendsignal(p, NULL, sig, &ksi);
}
int
pksignal(struct proc *p, int sig, ksiginfo_t *ksi)
{
return (tdsendsignal(p, NULL, sig, ksi));
}
/* Utility function for finding a thread to send signal event to. */
int
sigev_findtd(struct proc *p, struct sigevent *sigev, struct thread **ttd)
{
struct thread *td;
if (sigev->sigev_notify == SIGEV_THREAD_ID) {
td = tdfind(sigev->sigev_notify_thread_id, p->p_pid);
if (td == NULL)
return (ESRCH);
*ttd = td;
} else {
*ttd = NULL;
PROC_LOCK(p);
}
return (0);
}
void
tdsignal(struct thread *td, int sig)
{
ksiginfo_t ksi;
ksiginfo_init(&ksi);
ksi.ksi_signo = sig;
ksi.ksi_code = SI_KERNEL;
(void) tdsendsignal(td->td_proc, td, sig, &ksi);
}
void
tdksignal(struct thread *td, int sig, ksiginfo_t *ksi)
{
(void) tdsendsignal(td->td_proc, td, sig, ksi);
}
int
tdsendsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi)
{
sig_t action;
sigqueue_t *sigqueue;
int prop;
struct sigacts *ps;
int intrval;
int ret = 0;
int wakeup_swapper;
MPASS(td == NULL || p == td->td_proc);
PROC_LOCK_ASSERT(p, MA_OWNED);
if (!_SIG_VALID(sig))
panic("%s(): invalid signal %d", __func__, sig);
KASSERT(ksi == NULL || !KSI_ONQ(ksi), ("%s: ksi on queue", __func__));
/*
* IEEE Std 1003.1-2001: return success when killing a zombie.
*/
if (p->p_state == PRS_ZOMBIE) {
if (ksi && (ksi->ksi_flags & KSI_INS))
ksiginfo_tryfree(ksi);
return (ret);
}
ps = p->p_sigacts;
KNOTE_LOCKED(&p->p_klist, NOTE_SIGNAL | sig);
prop = sigprop(sig);
if (td == NULL) {
td = sigtd(p, sig, prop);
sigqueue = &p->p_sigqueue;
} else {
KASSERT(td->td_proc == p, ("invalid thread"));
sigqueue = &td->td_sigqueue;
}
SDT_PROBE(proc, kernel, , signal_send, td, p, sig, 0, 0 );
/*
* If the signal is being ignored,
* then we forget about it immediately.
* (Note: we don't set SIGCONT in ps_sigignore,
* and if it is set to SIG_IGN,
* action will be SIG_DFL here.)
*/
mtx_lock(&ps->ps_mtx);
if (SIGISMEMBER(ps->ps_sigignore, sig)) {
SDT_PROBE(proc, kernel, , signal_discard, ps, td, sig, 0, 0 );
mtx_unlock(&ps->ps_mtx);
if (ksi && (ksi->ksi_flags & KSI_INS))
ksiginfo_tryfree(ksi);
return (ret);
}
if (SIGISMEMBER(td->td_sigmask, sig))
action = SIG_HOLD;
else if (SIGISMEMBER(ps->ps_sigcatch, sig))
action = SIG_CATCH;
else
action = SIG_DFL;
if (SIGISMEMBER(ps->ps_sigintr, sig))
intrval = EINTR;
else
intrval = ERESTART;
mtx_unlock(&ps->ps_mtx);
if (prop & SA_CONT)
sigqueue_delete_stopmask_proc(p);
else if (prop & SA_STOP) {
/*
* If sending a tty stop signal to a member of an orphaned
* process group, discard the signal here if the action
* is default; don't stop the process below if sleeping,
* and don't clear any pending SIGCONT.
*/
if ((prop & SA_TTYSTOP) &&
(p->p_pgrp->pg_jobc == 0) &&
(action == SIG_DFL)) {
if (ksi && (ksi->ksi_flags & KSI_INS))
ksiginfo_tryfree(ksi);
return (ret);
}
sigqueue_delete_proc(p, SIGCONT);
if (p->p_flag & P_CONTINUED) {
p->p_flag &= ~P_CONTINUED;
PROC_LOCK(p->p_pptr);
sigqueue_take(p->p_ksi);
PROC_UNLOCK(p->p_pptr);
}
}
ret = sigqueue_add(sigqueue, sig, ksi);
if (ret != 0)
return (ret);
signotify(td);
/*
* Defer further processing for signals which are held,
* except that stopped processes must be continued by SIGCONT.
*/
if (action == SIG_HOLD &&
!((prop & SA_CONT) && (p->p_flag & P_STOPPED_SIG)))
return (ret);
/*
* SIGKILL: Remove procfs STOPEVENTs.
*/
if (sig == SIGKILL) {
/* from procfs_ioctl.c: PIOCBIC */
p->p_stops = 0;
/* from procfs_ioctl.c: PIOCCONT */
p->p_step = 0;
wakeup(&p->p_step);
}
/*
* Some signals have a process-wide effect and a per-thread
* component. Most processing occurs when the process next
* tries to cross the user boundary; however, there are some
* times when processing needs to be done immediately, such as
* waking up threads so that they can cross the user boundary.
* We try to do the per-process part here.
*/
if (P_SHOULDSTOP(p)) {
if (sig == SIGKILL) {
/*
* If traced process is already stopped,
* then no further action is necessary.
*/
if (p->p_flag & P_TRACED)
goto out;
/*
* SIGKILL sets process running.
* It will die elsewhere.
* All threads must be restarted.
*/
p->p_flag &= ~P_STOPPED_SIG;
goto runfast;
}
if (prop & SA_CONT) {
/*
* If traced process is already stopped,
* then no further action is necessary.
*/
if (p->p_flag & P_TRACED)
goto out;
/*
* If SIGCONT is default (or ignored), we continue the
* process but don't leave the signal in sigqueue as
* it has no further action. If SIGCONT is held, we
* continue the process and leave the signal in
* sigqueue. If the process catches SIGCONT, let it
* handle the signal itself. If it isn't waiting on
* an event, it goes back to run state.
* Otherwise, process goes back to sleep state.
*/
p->p_flag &= ~P_STOPPED_SIG;
PROC_SLOCK(p);
if (p->p_numthreads == p->p_suspcount) {
PROC_SUNLOCK(p);
p->p_flag |= P_CONTINUED;
p->p_xstat = SIGCONT;
PROC_LOCK(p->p_pptr);
childproc_continued(p);
PROC_UNLOCK(p->p_pptr);
PROC_SLOCK(p);
}
if (action == SIG_DFL) {
thread_unsuspend(p);
PROC_SUNLOCK(p);
sigqueue_delete(sigqueue, sig);
goto out;
}
if (action == SIG_CATCH) {
/*
* The process wants to catch it so it needs
* to run at least one thread, but which one?
*/
PROC_SUNLOCK(p);
goto runfast;
}
/*
* The signal is not ignored or caught.
*/
thread_unsuspend(p);
PROC_SUNLOCK(p);
goto out;
}
if (prop & SA_STOP) {
/*
* If traced process is already stopped,
* then no further action is necessary.
*/
if (p->p_flag & P_TRACED)
goto out;
/*
* Already stopped, don't need to stop again
* (if we did, the shell could get confused).
* Just make sure the signal STOP bit is set.
*/
p->p_flag |= P_STOPPED_SIG;
sigqueue_delete(sigqueue, sig);
goto out;
}
/*
* All other kinds of signals:
* If a thread is sleeping interruptibly, simulate a
* wakeup so that when it is continued it will be made
* runnable and can look at the signal. However, don't make
* the PROCESS runnable, leave it stopped.
* It may run a bit until it hits a thread_suspend_check().
*/
wakeup_swapper = 0;
PROC_SLOCK(p);
thread_lock(td);
if (TD_ON_SLEEPQ(td) && (td->td_flags & TDF_SINTR))
wakeup_swapper = sleepq_abort(td, intrval);
thread_unlock(td);
PROC_SUNLOCK(p);
if (wakeup_swapper)
kick_proc0();
goto out;
/*
* Mutexes are short lived. Threads waiting on them will
* hit thread_suspend_check() soon.
*/
} else if (p->p_state == PRS_NORMAL) {
if (p->p_flag & P_TRACED || action == SIG_CATCH) {
tdsigwakeup(td, sig, action, intrval);
goto out;
}
MPASS(action == SIG_DFL);
if (prop & SA_STOP) {
if (p->p_flag & P_PPWAIT)
goto out;
p->p_flag |= P_STOPPED_SIG;
p->p_xstat = sig;
PROC_SLOCK(p);
sig_suspend_threads(td, p, 1);
if (p->p_numthreads == p->p_suspcount) {
/*
* Only a thread sending a signal to another process
* can reach here; if a thread is sending a signal to
* its own process, p_numthreads should never equal
* p_suspcount, because the thread does not suspend
* itself here.
*/
thread_stopped(p);
PROC_SUNLOCK(p);
sigqueue_delete_proc(p, p->p_xstat);
} else
PROC_SUNLOCK(p);
goto out;
}
} else {
/* Not in "NORMAL" state. Discard the signal. */
sigqueue_delete(sigqueue, sig);
goto out;
}
/*
* The process is not stopped so we need to apply the signal to all the
* running threads.
*/
runfast:
tdsigwakeup(td, sig, action, intrval);
PROC_SLOCK(p);
thread_unsuspend(p);
PROC_SUNLOCK(p);
out:
/* If we jump here, proc slock should not be owned. */
PROC_SLOCK_ASSERT(p, MA_NOTOWNED);
return (ret);
}
/*
* The force of a signal has been directed against a single
* thread. We need to see what we can do about knocking it
* out of any sleep it may be in etc.
*/
static void
tdsigwakeup(struct thread *td, int sig, sig_t action, int intrval)
{
struct proc *p = td->td_proc;
register int prop;
int wakeup_swapper;
wakeup_swapper = 0;
PROC_LOCK_ASSERT(p, MA_OWNED);
prop = sigprop(sig);
PROC_SLOCK(p);
thread_lock(td);
/*
* Bring the priority of a thread up if we want it to get
* killed in this lifetime.
*/
if (action == SIG_DFL && (prop & SA_KILL) && td->td_priority > PUSER)
sched_prio(td, PUSER);
if (TD_ON_SLEEPQ(td)) {
/*
* If thread is sleeping uninterruptibly
* we can't interrupt the sleep... the signal will
* be noticed when the process returns through
* trap() or syscall().
*/
if ((td->td_flags & TDF_SINTR) == 0)
goto out;
/*
* If SIGCONT is default (or ignored) and process is
* asleep, we are finished; the process should not
* be awakened.
*/
if ((prop & SA_CONT) && action == SIG_DFL) {
thread_unlock(td);
PROC_SUNLOCK(p);
sigqueue_delete(&p->p_sigqueue, sig);
/*
* It may be on either list in this state.
* Remove from both for now.
*/
sigqueue_delete(&td->td_sigqueue, sig);
return;
}
/*
* Give low priority threads a better chance to run.
*/
if (td->td_priority > PUSER)
sched_prio(td, PUSER);
wakeup_swapper = sleepq_abort(td, intrval);
} else {
/*
* Other states do nothing with the signal immediately,
* other than kicking ourselves if we are running.
* It will either never be noticed, or noticed very soon.
*/
#ifdef SMP
if (TD_IS_RUNNING(td) && td != curthread)
forward_signal(td);
#endif
}
out:
PROC_SUNLOCK(p);
thread_unlock(td);
if (wakeup_swapper)
kick_proc0();
}
static void
sig_suspend_threads(struct thread *td, struct proc *p, int sending)
{
struct thread *td2;
int wakeup_swapper;
PROC_LOCK_ASSERT(p, MA_OWNED);
PROC_SLOCK_ASSERT(p, MA_OWNED);
wakeup_swapper = 0;
FOREACH_THREAD_IN_PROC(p, td2) {
thread_lock(td2);
td2->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK;
if ((TD_IS_SLEEPING(td2) || TD_IS_SWAPPED(td2)) &&
(td2->td_flags & TDF_SINTR)) {
if (td2->td_flags & TDF_SBDRY) {
if (TD_IS_SUSPENDED(td2))
wakeup_swapper |=
thread_unsuspend_one(td2);
if (TD_ON_SLEEPQ(td2))
wakeup_swapper |=
sleepq_abort(td2, ERESTART);
} else if (!TD_IS_SUSPENDED(td2)) {
thread_suspend_one(td2);
}
} else if (!TD_IS_SUSPENDED(td2)) {
if (sending || td != td2)
td2->td_flags |= TDF_ASTPENDING;
#ifdef SMP
if (TD_IS_RUNNING(td2) && td2 != td)
forward_signal(td2);
#endif
}
thread_unlock(td2);
}
if (wakeup_swapper)
kick_proc0();
}
int
ptracestop(struct thread *td, int sig)
{
struct proc *p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK,
&p->p_mtx.lock_object, "Stopping for traced signal");
td->td_dbgflags |= TDB_XSIG;
td->td_xsig = sig;
PROC_SLOCK(p);
while ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_XSIG)) {
if (p->p_flag & P_SINGLE_EXIT) {
td->td_dbgflags &= ~TDB_XSIG;
PROC_SUNLOCK(p);
return (sig);
}
/*
* Just make wait() work; the last stopped thread
* will win.
*/
p->p_xstat = sig;
p->p_xthread = td;
p->p_flag |= (P_STOPPED_SIG|P_STOPPED_TRACE);
sig_suspend_threads(td, p, 0);
if ((td->td_dbgflags & TDB_STOPATFORK) != 0) {
td->td_dbgflags &= ~TDB_STOPATFORK;
cv_broadcast(&p->p_dbgwait);
}
stopme:
thread_suspend_switch(td);
if (!(p->p_flag & P_TRACED)) {
break;
}
if (td->td_dbgflags & TDB_SUSPEND) {
if (p->p_flag & P_SINGLE_EXIT)
break;
goto stopme;
}
}
PROC_SUNLOCK(p);
return (td->td_xsig);
}
static void
reschedule_signals(struct proc *p, sigset_t block, int flags)
{
struct sigacts *ps;
struct thread *td;
int sig;
PROC_LOCK_ASSERT(p, MA_OWNED);
if (SIGISEMPTY(p->p_siglist))
return;
ps = p->p_sigacts;
SIGSETAND(block, p->p_siglist);
while ((sig = sig_ffs(&block)) != 0) {
SIGDELSET(block, sig);
td = sigtd(p, sig, 0);
signotify(td);
if (!(flags & SIGPROCMASK_PS_LOCKED))
mtx_lock(&ps->ps_mtx);
if (p->p_flag & P_TRACED || SIGISMEMBER(ps->ps_sigcatch, sig))
tdsigwakeup(td, sig, SIG_CATCH,
(SIGISMEMBER(ps->ps_sigintr, sig) ? EINTR :
ERESTART));
if (!(flags & SIGPROCMASK_PS_LOCKED))
mtx_unlock(&ps->ps_mtx);
}
}
void
tdsigcleanup(struct thread *td)
{
struct proc *p;
sigset_t unblocked;
p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
sigqueue_flush(&td->td_sigqueue);
if (p->p_numthreads == 1)
return;
/*
* Since we cannot handle signals, notify signal post code
* about this by filling the sigmask.
*
* Also, if needed, wake up thread(s) that do not block the
* same signals as the exiting thread, since the thread might
* have been selected for delivery and woken up.
*/
SIGFILLSET(unblocked);
SIGSETNAND(unblocked, td->td_sigmask);
SIGFILLSET(td->td_sigmask);
reschedule_signals(p, unblocked, 0);
}
/*
* If the current process has received a signal (should be caught or cause
* termination, should interrupt current syscall), return the signal number.
* Stop signals with default action are processed immediately, then cleared;
* they aren't returned. This is checked after each entry to the system for
* a syscall or trap (though this can usually be done without calling issignal
* by checking the pending signal masks in cursig.) The normal call
* sequence is
*
* while (sig = cursig(curthread))
* postsig(sig);
*/
static int
issignal(struct thread *td, int stop_allowed)
{
struct proc *p;
struct sigacts *ps;
struct sigqueue *queue;
sigset_t sigpending;
int sig, prop, newsig;
p = td->td_proc;
ps = p->p_sigacts;
mtx_assert(&ps->ps_mtx, MA_OWNED);
PROC_LOCK_ASSERT(p, MA_OWNED);
for (;;) {
int traced = (p->p_flag & P_TRACED) || (p->p_stops & S_SIG);
sigpending = td->td_sigqueue.sq_signals;
SIGSETOR(sigpending, p->p_sigqueue.sq_signals);
SIGSETNAND(sigpending, td->td_sigmask);
if (p->p_flag & P_PPWAIT)
SIG_STOPSIGMASK(sigpending);
if (SIGISEMPTY(sigpending)) /* no signal to send */
return (0);
sig = sig_ffs(&sigpending);
if (p->p_stops & S_SIG) {
mtx_unlock(&ps->ps_mtx);
stopevent(p, S_SIG, sig);
mtx_lock(&ps->ps_mtx);
}
/*
* We should see pending but ignored signals
* only if P_TRACED was on when they were posted.
*/
if (SIGISMEMBER(ps->ps_sigignore, sig) && (traced == 0)) {
sigqueue_delete(&td->td_sigqueue, sig);
sigqueue_delete(&p->p_sigqueue, sig);
continue;
}
if (p->p_flag & P_TRACED && (p->p_flag & P_PPWAIT) == 0) {
/*
* If traced, always stop.
* Remove old signal from queue before the stop.
* XXX shrug off debugger, it causes siginfo to
* be thrown away.
*/
queue = &td->td_sigqueue;
td->td_dbgksi.ksi_signo = 0;
if (sigqueue_get(queue, sig, &td->td_dbgksi) == 0) {
queue = &p->p_sigqueue;
sigqueue_get(queue, sig, &td->td_dbgksi);
}
mtx_unlock(&ps->ps_mtx);
newsig = ptracestop(td, sig);
mtx_lock(&ps->ps_mtx);
if (sig != newsig) {
/*
* If parent wants us to take the signal,
* then it will leave it in p->p_xstat;
* otherwise we just look for signals again.
*/
if (newsig == 0)
continue;
sig = newsig;
/*
* Put the new signal into td_sigqueue. If the
* signal is being masked, look for other signals.
*/
sigqueue_add(queue, sig, NULL);
if (SIGISMEMBER(td->td_sigmask, sig))
continue;
signotify(td);
} else {
if (td->td_dbgksi.ksi_signo != 0) {
td->td_dbgksi.ksi_flags |= KSI_HEAD;
if (sigqueue_add(&td->td_sigqueue, sig,
&td->td_dbgksi) != 0)
td->td_dbgksi.ksi_signo = 0;
}
if (td->td_dbgksi.ksi_signo == 0)
sigqueue_add(&td->td_sigqueue, sig,
NULL);
}
/*
* If the traced bit got turned off, go back up
* to the top to rescan signals. This ensures
* that p_sig* and p_sigact are consistent.
*/
if ((p->p_flag & P_TRACED) == 0)
continue;
}
prop = sigprop(sig);
/*
* Decide whether the signal should be returned.
* Return the signal's number, or fall through
* to clear it from the pending mask.
*/
switch ((intptr_t)p->p_sigacts->ps_sigact[_SIG_IDX(sig)]) {
case (intptr_t)SIG_DFL:
/*
* Don't take default actions on system processes.
*/
if (p->p_pid <= 1) {
#ifdef DIAGNOSTIC
/*
* Are you sure you want to ignore SIGSEGV
* in init? XXX
*/
printf("Process (pid %lu) got signal %d\n",
(u_long)p->p_pid, sig);
#endif
break; /* == ignore */
}
/*
* If there is a pending stop signal to process
* with default action, stop here,
* then clear the signal. However,
* if process is member of an orphaned
* process group, ignore tty stop signals.
*/
if (prop & SA_STOP) {
if (p->p_flag & P_TRACED ||
(p->p_pgrp->pg_jobc == 0 &&
prop & SA_TTYSTOP))
break; /* == ignore */
/* Ignore, but do not drop the stop signal. */
if (stop_allowed != SIG_STOP_ALLOWED)
return (sig);
mtx_unlock(&ps->ps_mtx);
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK,
&p->p_mtx.lock_object, "Catching SIGSTOP");
p->p_flag |= P_STOPPED_SIG;
p->p_xstat = sig;
PROC_SLOCK(p);
sig_suspend_threads(td, p, 0);
thread_suspend_switch(td);
PROC_SUNLOCK(p);
mtx_lock(&ps->ps_mtx);
break;
} else if (prop & SA_IGNORE) {
/*
* Except for SIGCONT, shouldn't get here.
* Default action is to ignore; drop it.
*/
break; /* == ignore */
} else
return (sig);
/*NOTREACHED*/
case (intptr_t)SIG_IGN:
/*
* Masking above should prevent us from ever trying
* to take action on an ignored signal other
* than SIGCONT, unless the process is traced.
*/
if ((prop & SA_CONT) == 0 &&
(p->p_flag & P_TRACED) == 0)
printf("issignal\n");
break; /* == ignore */
default:
/*
* This signal has an action, let
* postsig() process it.
*/
return (sig);
}
sigqueue_delete(&td->td_sigqueue, sig); /* take the signal! */
sigqueue_delete(&p->p_sigqueue, sig);
}
/* NOTREACHED */
}
void
thread_stopped(struct proc *p)
{
int n;
PROC_LOCK_ASSERT(p, MA_OWNED);
PROC_SLOCK_ASSERT(p, MA_OWNED);
n = p->p_suspcount;
if (p == curproc)
n++;
if ((p->p_flag & P_STOPPED_SIG) && (n == p->p_numthreads)) {
PROC_SUNLOCK(p);
p->p_flag &= ~P_WAITED;
PROC_LOCK(p->p_pptr);
childproc_stopped(p, (p->p_flag & P_TRACED) ?
CLD_TRAPPED : CLD_STOPPED);
PROC_UNLOCK(p->p_pptr);
PROC_SLOCK(p);
}
}
/*
* Take the action for the specified signal
* from the current set of pending signals.
*/
int
postsig(sig)
register int sig;
{
struct thread *td = curthread;
register struct proc *p = td->td_proc;
struct sigacts *ps;
sig_t action;
ksiginfo_t ksi;
sigset_t returnmask, mask;
KASSERT(sig != 0, ("postsig"));
PROC_LOCK_ASSERT(p, MA_OWNED);
ps = p->p_sigacts;
mtx_assert(&ps->ps_mtx, MA_OWNED);
ksiginfo_init(&ksi);
if (sigqueue_get(&td->td_sigqueue, sig, &ksi) == 0 &&
sigqueue_get(&p->p_sigqueue, sig, &ksi) == 0)
return (0);
ksi.ksi_signo = sig;
if (ksi.ksi_code == SI_TIMER)
itimer_accept(p, ksi.ksi_timerid, &ksi);
action = ps->ps_sigact[_SIG_IDX(sig)];
#ifdef KTRACE
if (KTRPOINT(td, KTR_PSIG))
ktrpsig(sig, action, td->td_pflags & TDP_OLDMASK ?
&td->td_oldsigmask : &td->td_sigmask, ksi.ksi_code);
#endif
if (p->p_stops & S_SIG) {
mtx_unlock(&ps->ps_mtx);
stopevent(p, S_SIG, sig);
mtx_lock(&ps->ps_mtx);
}
if (action == SIG_DFL) {
/*
* Default action, where the default is to kill
* the process. (Other cases were ignored above.)
*/
mtx_unlock(&ps->ps_mtx);
sigexit(td, sig);
/* NOTREACHED */
} else {
/*
* If we get here, the signal must be caught.
*/
KASSERT(action != SIG_IGN && !SIGISMEMBER(td->td_sigmask, sig),
("postsig action"));
/*
* Set the new mask value and also defer further
* occurrences of this signal.
*
* Special case: user has done a sigsuspend. Here the
* current mask is not of interest, but rather the
* mask from before the sigsuspend is what we want
* restored after the signal processing is completed.
*/
if (td->td_pflags & TDP_OLDMASK) {
returnmask = td->td_oldsigmask;
td->td_pflags &= ~TDP_OLDMASK;
} else
returnmask = td->td_sigmask;
mask = ps->ps_catchmask[_SIG_IDX(sig)];
if (!SIGISMEMBER(ps->ps_signodefer, sig))
SIGADDSET(mask, sig);
kern_sigprocmask(td, SIG_BLOCK, &mask, NULL,
SIGPROCMASK_PROC_LOCKED | SIGPROCMASK_PS_LOCKED);
if (SIGISMEMBER(ps->ps_sigreset, sig)) {
/*
* See kern_sigaction() for origin of this code.
*/
SIGDELSET(ps->ps_sigcatch, sig);
if (sig != SIGCONT &&
sigprop(sig) & SA_IGNORE)
SIGADDSET(ps->ps_sigignore, sig);
ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
}
td->td_ru.ru_nsignals++;
if (p->p_sig == sig) {
p->p_code = 0;
p->p_sig = 0;
}
(*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask);
}
return (1);
}
/*
* Kill the current process for stated reason.
*/
void
killproc(p, why)
struct proc *p;
char *why;
{
PROC_LOCK_ASSERT(p, MA_OWNED);
CTR3(KTR_PROC, "killproc: proc %p (pid %d, %s)",
p, p->p_pid, p->p_comm);
log(LOG_ERR, "pid %d (%s), uid %d, was killed: %s\n", p->p_pid, p->p_comm,
p->p_ucred ? p->p_ucred->cr_uid : -1, why);
p->p_flag |= P_WKILLED;
- psignal(p, SIGKILL);
+ kern_psignal(p, SIGKILL);
}
/*
* Force the current process to exit with the specified signal, dumping core
* if appropriate. We bypass the normal tests for masked and caught signals,
* allowing unrecoverable failures to terminate the process without changing
* signal state. Mark the accounting record with the signal termination.
* If dumping core, save the signal number for the debugger. Calls exit and
* does not return.
*/
void
sigexit(td, sig)
struct thread *td;
int sig;
{
struct proc *p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
p->p_acflag |= AXSIG;
/*
* We must be single-threading to generate a core dump. This
* ensures that the registers in the core file are up-to-date.
* Also, the ELF dump handler assumes that the thread list doesn't
* change out from under it.
*
* XXX If another thread attempts to single-thread before us
* (e.g. via fork()), we won't get a dump at all.
*/
if ((sigprop(sig) & SA_CORE) && (thread_single(SINGLE_NO_EXIT) == 0)) {
p->p_sig = sig;
/*
* Log signals which would cause core dumps
* (Log as LOG_INFO to appease those who don't want
* these messages.)
* XXX : Todo, as well as euid, write out ruid too
* Note that coredump() drops proc lock.
*/
if (coredump(td) == 0)
sig |= WCOREFLAG;
if (kern_logsigexit)
log(LOG_INFO,
"pid %d (%s), uid %d: exited on signal %d%s\n",
p->p_pid, p->p_comm,
td->td_ucred ? td->td_ucred->cr_uid : -1,
sig &~ WCOREFLAG,
sig & WCOREFLAG ? " (core dumped)" : "");
} else
PROC_UNLOCK(p);
exit1(td, W_EXITCODE(0, sig));
/* NOTREACHED */
}
/*
* Send queued SIGCHLD to parent when child process's state
* is changed.
*/
static void
sigparent(struct proc *p, int reason, int status)
{
PROC_LOCK_ASSERT(p, MA_OWNED);
PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED);
if (p->p_ksi != NULL) {
p->p_ksi->ksi_signo = SIGCHLD;
p->p_ksi->ksi_code = reason;
p->p_ksi->ksi_status = status;
p->p_ksi->ksi_pid = p->p_pid;
p->p_ksi->ksi_uid = p->p_ucred->cr_ruid;
if (KSI_ONQ(p->p_ksi))
return;
}
pksignal(p->p_pptr, SIGCHLD, p->p_ksi);
}
static void
childproc_jobstate(struct proc *p, int reason, int status)
{
struct sigacts *ps;
PROC_LOCK_ASSERT(p, MA_OWNED);
PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED);
/*
* Wake up parent sleeping in kern_wait(), also send
* SIGCHLD to the parent, but SIGCHLD does not guarantee
* that the parent will awake, because the parent may have
* masked the signal.
*/
p->p_pptr->p_flag |= P_STATCHILD;
wakeup(p->p_pptr);
ps = p->p_pptr->p_sigacts;
mtx_lock(&ps->ps_mtx);
if ((ps->ps_flag & PS_NOCLDSTOP) == 0) {
mtx_unlock(&ps->ps_mtx);
sigparent(p, reason, status);
} else
mtx_unlock(&ps->ps_mtx);
}
void
childproc_stopped(struct proc *p, int reason)
{
childproc_jobstate(p, reason, p->p_xstat);
}
void
childproc_continued(struct proc *p)
{
childproc_jobstate(p, CLD_CONTINUED, SIGCONT);
}
void
childproc_exited(struct proc *p)
{
int reason;
int status = p->p_xstat; /* convert to int */
reason = CLD_EXITED;
if (WCOREDUMP(status))
reason = CLD_DUMPED;
else if (WIFSIGNALED(status))
reason = CLD_KILLED;
/*
* XXX avoid calling wakeup(p->p_pptr), the work is
* done in exit1().
*/
sigparent(p, reason, status);
}
/*
* We only have 1 character for the core count in the format
* string, so the range will be 0-9
*/
#define MAX_NUM_CORES 10
static int num_cores = 5;
static int
sysctl_debug_num_cores_check (SYSCTL_HANDLER_ARGS)
{
int error;
int new_val;
new_val = num_cores;
error = sysctl_handle_int(oidp, &new_val, 0, req);
if (error != 0 || req->newptr == NULL)
return (error);
if (new_val > MAX_NUM_CORES)
new_val = MAX_NUM_CORES;
if (new_val < 0)
new_val = 0;
num_cores = new_val;
return (0);
}
SYSCTL_PROC(_debug, OID_AUTO, ncores, CTLTYPE_INT|CTLFLAG_RW,
0, sizeof(int), sysctl_debug_num_cores_check, "I", "");
#if defined(COMPRESS_USER_CORES)
int compress_user_cores = 1;
SYSCTL_INT(_kern, OID_AUTO, compress_user_cores, CTLFLAG_RW,
&compress_user_cores, 0, "");
int compress_user_cores_gzlevel = -1; /* default level */
SYSCTL_INT(_kern, OID_AUTO, compress_user_cores_gzlevel, CTLFLAG_RW,
&compress_user_cores_gzlevel, -1, "user core gz compression level");
#define GZ_SUFFIX ".gz"
#define GZ_SUFFIX_LEN 3
#endif
static char corefilename[MAXPATHLEN] = {"%N.core"};
SYSCTL_STRING(_kern, OID_AUTO, corefile, CTLFLAG_RW, corefilename,
sizeof(corefilename), "process corefile name format string");
/*
* expand_name(name, uid, pid, td, compress)
* Expand the name described in corefilename, using name, uid, and pid.
* corefilename is a printf-like string, with three format specifiers:
* %N name of process ("name")
* %P process id (pid)
* %U user id (uid)
* For example, "%N.core" is the default; core dumps can be disabled completely
* by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P".
* This is controlled by the sysctl variable kern.corefile (see above).
*/
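/*
 * Editor's note: an illustrative expansion of the format above (the
 * values are hypothetical).  With
 *
 *	sysctl kern.corefile="/var/coredumps/%U/%N.%P.core"
 *
 * a crash of pid 1234 running as uid 1001 in a process named "mydaemon"
 * would be written to "/var/coredumps/1001/mydaemon.1234.core".  The
 * additional '%H' and '%I' specifiers handled below add the hostname
 * and a single-digit rotation index (bounded by the debug.ncores
 * sysctl), respectively.
 */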
static char *
expand_name(const char *name, uid_t uid, pid_t pid, struct thread *td,
int compress)
{
struct sbuf sb;
const char *format;
char *temp;
size_t i;
int indexpos;
char *hostname;
hostname = NULL;
format = corefilename;
temp = malloc(MAXPATHLEN, M_TEMP, M_NOWAIT | M_ZERO);
if (temp == NULL)
return (NULL);
indexpos = -1;
(void)sbuf_new(&sb, temp, MAXPATHLEN, SBUF_FIXEDLEN);
for (i = 0; format[i]; i++) {
switch (format[i]) {
case '%': /* Format character */
i++;
switch (format[i]) {
case '%':
sbuf_putc(&sb, '%');
break;
case 'H': /* hostname */
if (hostname == NULL) {
hostname = malloc(MAXHOSTNAMELEN,
M_TEMP, M_NOWAIT);
if (hostname == NULL) {
log(LOG_ERR,
"pid %ld (%s), uid (%lu): "
"unable to alloc memory "
"for corefile hostname\n",
(long)pid, name,
(u_long)uid);
goto nomem;
}
}
getcredhostname(td->td_ucred, hostname,
MAXHOSTNAMELEN);
sbuf_printf(&sb, "%s", hostname);
break;
case 'I': /* autoincrementing index */
sbuf_printf(&sb, "0");
indexpos = sbuf_len(&sb) - 1;
break;
case 'N': /* process name */
sbuf_printf(&sb, "%s", name);
break;
case 'P': /* process id */
sbuf_printf(&sb, "%u", pid);
break;
case 'U': /* user id */
sbuf_printf(&sb, "%u", uid);
break;
default:
log(LOG_ERR,
"Unknown format character %c in "
"corename `%s'\n", format[i], format);
}
break;
default:
sbuf_putc(&sb, format[i]);
}
}
free(hostname, M_TEMP);
#ifdef COMPRESS_USER_CORES
if (compress) {
sbuf_printf(&sb, GZ_SUFFIX);
}
#endif
if (sbuf_error(&sb) != 0) {
log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too "
"long\n", (long)pid, name, (u_long)uid);
nomem:
sbuf_delete(&sb);
free(temp, M_TEMP);
return (NULL);
}
sbuf_finish(&sb);
sbuf_delete(&sb);
/*
* If the core format has a %I in it, then we need to check
* for existing corefiles before returning a name.
* To do this we iterate over 0..num_cores to find a
* non-existing core file name to use.
*/
if (indexpos != -1) {
struct nameidata nd;
int error, n;
int flags = O_CREAT | O_EXCL | FWRITE | O_NOFOLLOW;
int cmode = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
int vfslocked;
for (n = 0; n < num_cores; n++) {
temp[indexpos] = '0' + n;
NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE,
temp, td);
error = vn_open(&nd, &flags, cmode, NULL);
if (error) {
if (error == EEXIST) {
continue;
}
log(LOG_ERR,
"pid %d (%s), uid (%u): Path `%s' failed "
"on initial open test, error = %d\n",
pid, name, uid, temp, error);
free(temp, M_TEMP);
return (NULL);
}
vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
VOP_UNLOCK(nd.ni_vp, 0);
error = vn_close(nd.ni_vp, FWRITE, td->td_ucred, td);
VFS_UNLOCK_GIANT(vfslocked);
if (error) {
log(LOG_ERR,
"pid %d (%s), uid (%u): Path `%s' failed "
"on close after initial open test, "
"error = %d\n",
pid, name, uid, temp, error);
free(temp, M_TEMP);
return (NULL);
}
break;
}
}
return (temp);
}
/*
* Dump a process' core. The main routine does some
* policy checking, and creates the name of the coredump;
* then it passes on a vnode and a size limit to the process-specific
* coredump routine if there is one; if there _is not_ one, it returns
* ENOSYS; otherwise it returns the error from the process-specific routine.
*/
static int
coredump(struct thread *td)
{
struct proc *p = td->td_proc;
register struct vnode *vp;
register struct ucred *cred = td->td_ucred;
struct flock lf;
struct nameidata nd;
struct vattr vattr;
int error, error1, flags, locked;
struct mount *mp;
char *name; /* name of corefile */
off_t limit;
int vfslocked;
int compress;
#ifdef COMPRESS_USER_CORES
compress = compress_user_cores;
#else
compress = 0;
#endif
PROC_LOCK_ASSERT(p, MA_OWNED);
MPASS((p->p_flag & P_HADTHREADS) == 0 || p->p_singlethread == td);
_STOPEVENT(p, S_CORE, 0);
name = expand_name(p->p_comm, td->td_ucred->cr_uid, p->p_pid, td,
compress);
if (name == NULL) {
PROC_UNLOCK(p);
#ifdef AUDIT
audit_proc_coredump(td, NULL, EINVAL);
#endif
return (EINVAL);
}
if (((sugid_coredump == 0) && p->p_flag & P_SUGID) || do_coredump == 0) {
PROC_UNLOCK(p);
#ifdef AUDIT
audit_proc_coredump(td, name, EFAULT);
#endif
free(name, M_TEMP);
return (EFAULT);
}
/*
* Note that the bulk of limit checking is done after
* the corefile is created. The exception is if the limit
* for corefiles is 0, in which case we don't bother
* creating the corefile at all. This layout means that
* a corefile is truncated instead of not being created,
* if it is larger than the limit.
*/
limit = (off_t)lim_cur(p, RLIMIT_CORE);
if (limit == 0 || racct_get_available(p, RACCT_CORE) == 0) {
PROC_UNLOCK(p);
#ifdef AUDIT
audit_proc_coredump(td, name, EFBIG);
#endif
free(name, M_TEMP);
return (EFBIG);
}
PROC_UNLOCK(p);
restart:
NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, name, td);
flags = O_CREAT | FWRITE | O_NOFOLLOW;
error = vn_open_cred(&nd, &flags, S_IRUSR | S_IWUSR, VN_OPEN_NOAUDIT,
cred, NULL);
if (error) {
#ifdef AUDIT
audit_proc_coredump(td, name, error);
#endif
free(name, M_TEMP);
return (error);
}
vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
vp = nd.ni_vp;
/* Don't dump to non-regular files or files with links. */
if (vp->v_type != VREG ||
VOP_GETATTR(vp, &vattr, cred) || vattr.va_nlink != 1) {
VOP_UNLOCK(vp, 0);
error = EFAULT;
goto close;
}
VOP_UNLOCK(vp, 0);
lf.l_whence = SEEK_SET;
lf.l_start = 0;
lf.l_len = 0;
lf.l_type = F_WRLCK;
locked = (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK) == 0);
if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
lf.l_type = F_UNLCK;
if (locked)
VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK);
if ((error = vn_close(vp, FWRITE, cred, td)) != 0)
goto out;
if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
goto out;
VFS_UNLOCK_GIANT(vfslocked);
goto restart;
}
VATTR_NULL(&vattr);
vattr.va_size = 0;
if (set_core_nodump_flag)
vattr.va_flags = UF_NODUMP;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
VOP_SETATTR(vp, &vattr, cred);
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
PROC_LOCK(p);
p->p_acflag |= ACORE;
PROC_UNLOCK(p);
error = p->p_sysent->sv_coredump ?
p->p_sysent->sv_coredump(td, vp, limit, compress ? IMGACT_CORE_COMPRESS : 0) :
ENOSYS;
if (locked) {
lf.l_type = F_UNLCK;
VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK);
}
close:
error1 = vn_close(vp, FWRITE, cred, td);
if (error == 0)
error = error1;
out:
#ifdef AUDIT
audit_proc_coredump(td, name, error);
#endif
free(name, M_TEMP);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Nonexistent system call-- signal process (may want to handle it). Flag
* error in case process won't see signal immediately (blocked or ignored).
*/
#ifndef _SYS_SYSPROTO_H_
struct nosys_args {
int dummy;
};
#endif
/* ARGSUSED */
int
nosys(td, args)
struct thread *td;
struct nosys_args *args;
{
struct proc *p = td->td_proc;
PROC_LOCK(p);
- psignal(p, SIGSYS);
+ kern_psignal(p, SIGSYS);
PROC_UNLOCK(p);
return (ENOSYS);
}
/*
* Send a SIGIO or SIGURG signal to a process or process group using stored
* credentials rather than those of the current process.
*/
void
pgsigio(sigiop, sig, checkctty)
struct sigio **sigiop;
int sig, checkctty;
{
ksiginfo_t ksi;
struct sigio *sigio;
ksiginfo_init(&ksi);
ksi.ksi_signo = sig;
ksi.ksi_code = SI_KERNEL;
SIGIO_LOCK();
sigio = *sigiop;
if (sigio == NULL) {
SIGIO_UNLOCK();
return;
}
if (sigio->sio_pgid > 0) {
PROC_LOCK(sigio->sio_proc);
if (CANSIGIO(sigio->sio_ucred, sigio->sio_proc->p_ucred))
- psignal(sigio->sio_proc, sig);
+ kern_psignal(sigio->sio_proc, sig);
PROC_UNLOCK(sigio->sio_proc);
} else if (sigio->sio_pgid < 0) {
struct proc *p;
PGRP_LOCK(sigio->sio_pgrp);
LIST_FOREACH(p, &sigio->sio_pgrp->pg_members, p_pglist) {
PROC_LOCK(p);
if (p->p_state == PRS_NORMAL &&
CANSIGIO(sigio->sio_ucred, p->p_ucred) &&
(checkctty == 0 || (p->p_flag & P_CONTROLT)))
- psignal(p, sig);
+ kern_psignal(p, sig);
PROC_UNLOCK(p);
}
PGRP_UNLOCK(sigio->sio_pgrp);
}
SIGIO_UNLOCK();
}
static int
filt_sigattach(struct knote *kn)
{
struct proc *p = curproc;
kn->kn_ptr.p_proc = p;
kn->kn_flags |= EV_CLEAR; /* automatically set */
knlist_add(&p->p_klist, kn, 0);
return (0);
}
static void
filt_sigdetach(struct knote *kn)
{
struct proc *p = kn->kn_ptr.p_proc;
knlist_remove(&p->p_klist, kn, 0);
}
/*
* signal knotes are shared with proc knotes, so we apply a mask to
* the hint in order to differentiate them from process hints. This
* could be avoided by using a signal-specific knote list, but probably
* isn't worth the trouble.
*/
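/*
 * Editor's note: a minimal userland sketch of the consumer side of this
 * filter (illustrative only; not part of the kernel source):
 *
 *	struct kevent kev;
 *	int kq = kqueue();
 *
 *	signal(SIGHUP, SIG_IGN);		// kevent fires even for ignored signals
 *	EV_SET(&kev, SIGHUP, EVFILT_SIGNAL, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);	// register the knote
 *	kevent(kq, NULL, 0, &kev, 1, NULL);	// block; kev.data counts deliveries
 */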
static int
filt_signal(struct knote *kn, long hint)
{
if (hint & NOTE_SIGNAL) {
hint &= ~NOTE_SIGNAL;
if (kn->kn_id == hint)
kn->kn_data++;
}
return (kn->kn_data != 0);
}
struct sigacts *
sigacts_alloc(void)
{
struct sigacts *ps;
ps = malloc(sizeof(struct sigacts), M_SUBPROC, M_WAITOK | M_ZERO);
ps->ps_refcnt = 1;
mtx_init(&ps->ps_mtx, "sigacts", NULL, MTX_DEF);
return (ps);
}
void
sigacts_free(struct sigacts *ps)
{
mtx_lock(&ps->ps_mtx);
ps->ps_refcnt--;
if (ps->ps_refcnt == 0) {
mtx_destroy(&ps->ps_mtx);
free(ps, M_SUBPROC);
} else
mtx_unlock(&ps->ps_mtx);
}
struct sigacts *
sigacts_hold(struct sigacts *ps)
{
mtx_lock(&ps->ps_mtx);
ps->ps_refcnt++;
mtx_unlock(&ps->ps_mtx);
return (ps);
}
void
sigacts_copy(struct sigacts *dest, struct sigacts *src)
{
KASSERT(dest->ps_refcnt == 1, ("sigacts_copy to shared dest"));
mtx_lock(&src->ps_mtx);
bcopy(src, dest, offsetof(struct sigacts, ps_refcnt));
mtx_unlock(&src->ps_mtx);
}
int
sigacts_shared(struct sigacts *ps)
{
int shared;
mtx_lock(&ps->ps_mtx);
shared = ps->ps_refcnt > 1;
mtx_unlock(&ps->ps_mtx);
return (shared);
}
Index: head/sys/kern/kern_synch.c
===================================================================
--- head/sys/kern/kern_synch.c (revision 225616)
+++ head/sys/kern/kern_synch.c (revision 225617)
@@ -1,586 +1,586 @@
/*-
* Copyright (c) 1982, 1986, 1990, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_synch.c 8.9 (Berkeley) 5/19/95
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ktrace.h"
#include "opt_sched.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/condvar.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/sleepqueue.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vmmeter.h>
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
#endif
#include <machine/cpu.h>
#ifdef XEN
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#endif
#define KTDSTATE(td) \
(((td)->td_inhibitors & TDI_SLEEPING) != 0 ? "sleep" : \
((td)->td_inhibitors & TDI_SUSPENDED) != 0 ? "suspended" : \
((td)->td_inhibitors & TDI_SWAPPED) != 0 ? "swapped" : \
((td)->td_inhibitors & TDI_LOCK) != 0 ? "blocked" : \
((td)->td_inhibitors & TDI_IWAIT) != 0 ? "iwait" : "yielding")
static void synch_setup(void *dummy);
SYSINIT(synch_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, synch_setup,
NULL);
int hogticks;
static int pause_wchan;
static struct callout loadav_callout;
struct loadavg averunnable =
{ {0, 0, 0}, FSCALE }; /* load average, of runnable procs */
/*
* Constants for averages over 1, 5, and 15 minutes
* when sampling at 5 second intervals.
*/
static fixpt_t cexp[3] = {
0.9200444146293232 * FSCALE, /* exp(-1/12) */
0.9834714538216174 * FSCALE, /* exp(-1/60) */
0.9944598480048967 * FSCALE, /* exp(-1/180) */
};
/* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */
static int fscale __unused = FSCALE;
SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, "");
static void loadav(void *arg);
void
sleepinit(void)
{
hogticks = (hz / 10) * 2; /* Default only. */
init_sleepqueues();
}
/*
* General sleep call. Suspends the current thread until a wakeup is
* performed on the specified identifier. The thread will then be made
* runnable with the specified priority. Sleeps at most timo/hz seconds
* (0 means no timeout). If pri includes PCATCH flag, signals are checked
* before and after sleeping, else signals are not checked. Returns 0 if
* awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a
* signal needs to be delivered, ERESTART is returned if the current system
* call should be restarted if possible, and EINTR is returned if the system
* call should be interrupted by the signal (return EINTR).
*
* The lock argument is unlocked before the caller is suspended, and
* re-locked before _sleep() returns. If priority includes the PDROP
* flag the lock is not re-locked before returning.
*/
int
_sleep(void *ident, struct lock_object *lock, int priority,
const char *wmesg, int timo)
{
struct thread *td;
struct proc *p;
struct lock_class *class;
int catch, flags, lock_state, pri, rval;
WITNESS_SAVE_DECL(lock_witness);
td = curthread;
p = td->td_proc;
#ifdef KTRACE
if (KTRPOINT(td, KTR_CSW))
ktrcsw(1, 0);
#endif
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock,
"Sleeping on \"%s\"", wmesg);
KASSERT(timo != 0 || mtx_owned(&Giant) || lock != NULL,
("sleeping without a lock"));
KASSERT(p != NULL, ("msleep1"));
KASSERT(ident != NULL && TD_IS_RUNNING(td), ("msleep"));
if (priority & PDROP)
KASSERT(lock != NULL && lock != &Giant.lock_object,
("PDROP requires a non-Giant lock"));
if (lock != NULL)
class = LOCK_CLASS(lock);
else
class = NULL;
if (cold) {
/*
* During autoconfiguration, just return;
* don't run any other threads or panic below,
* in case this is the idle thread and already asleep.
* XXX: this used to do "s = splhigh(); splx(safepri);
* splx(s);" to give interrupts a chance, but there is
* no way to give interrupts a chance now.
*/
if (lock != NULL && priority & PDROP)
class->lc_unlock(lock);
return (0);
}
catch = priority & PCATCH;
pri = priority & PRIMASK;
/*
* If we are already on a sleep queue, then remove us from that
* sleep queue first. We have to do this to handle recursive
* sleeps.
*/
if (TD_ON_SLEEPQ(td))
sleepq_remove(td, td->td_wchan);
if (ident == &pause_wchan)
flags = SLEEPQ_PAUSE;
else
flags = SLEEPQ_SLEEP;
if (catch)
flags |= SLEEPQ_INTERRUPTIBLE;
if (priority & PBDRY)
flags |= SLEEPQ_STOP_ON_BDRY;
sleepq_lock(ident);
CTR5(KTR_PROC, "sleep: thread %ld (pid %ld, %s) on %s (%p)",
td->td_tid, p->p_pid, td->td_name, wmesg, ident);
if (lock == &Giant.lock_object)
mtx_assert(&Giant, MA_OWNED);
DROP_GIANT();
if (lock != NULL && lock != &Giant.lock_object &&
!(class->lc_flags & LC_SLEEPABLE)) {
WITNESS_SAVE(lock, lock_witness);
lock_state = class->lc_unlock(lock);
} else
/* GCC needs to follow the Yellow Brick Road */
lock_state = -1;
/*
* We put ourselves on the sleep queue and start our timeout
* before calling thread_suspend_check, as we could stop there,
* and a wakeup or a SIGCONT (or both) could occur while we were
* stopped without resuming us. Thus, we must be ready for sleep
* when cursig() is called. If the wakeup happens while we're
* stopped, then td will no longer be on a sleep queue upon
* return from cursig().
*/
sleepq_add(ident, lock, wmesg, flags, 0);
if (timo)
sleepq_set_timeout(ident, timo);
if (lock != NULL && class->lc_flags & LC_SLEEPABLE) {
sleepq_release(ident);
WITNESS_SAVE(lock, lock_witness);
lock_state = class->lc_unlock(lock);
sleepq_lock(ident);
}
if (timo && catch)
rval = sleepq_timedwait_sig(ident, pri);
else if (timo)
rval = sleepq_timedwait(ident, pri);
else if (catch)
rval = sleepq_wait_sig(ident, pri);
else {
sleepq_wait(ident, pri);
rval = 0;
}
#ifdef KTRACE
if (KTRPOINT(td, KTR_CSW))
ktrcsw(0, 0);
#endif
PICKUP_GIANT();
if (lock != NULL && lock != &Giant.lock_object && !(priority & PDROP)) {
class->lc_lock(lock, lock_state);
WITNESS_RESTORE(lock, lock_witness);
}
return (rval);
}
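/*
 * Editor's note: an illustrative caller-side pattern for the primitives
 * above (sc, sc_mtx and sc_ready are hypothetical names).  msleep() is
 * the usual mutex-protected front end to _sleep():
 *
 *	mtx_lock(&sc->sc_mtx);
 *	while (!sc->sc_ready) {
 *		error = msleep(&sc->sc_ready, &sc->sc_mtx, PCATCH,
 *		    "scwait", hz);
 *		if (error != 0)
 *			break;		// EWOULDBLOCK, EINTR or ERESTART
 *	}
 *	mtx_unlock(&sc->sc_mtx);
 */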
int
msleep_spin(void *ident, struct mtx *mtx, const char *wmesg, int timo)
{
struct thread *td;
struct proc *p;
int rval;
WITNESS_SAVE_DECL(mtx);
td = curthread;
p = td->td_proc;
KASSERT(mtx != NULL, ("sleeping without a mutex"));
KASSERT(p != NULL, ("msleep1"));
KASSERT(ident != NULL && TD_IS_RUNNING(td), ("msleep"));
if (cold) {
/*
* During autoconfiguration, just return;
* don't run any other threads or panic below,
* in case this is the idle thread and already asleep.
* XXX: this used to do "s = splhigh(); splx(safepri);
* splx(s);" to give interrupts a chance, but there is
* no way to give interrupts a chance now.
*/
return (0);
}
sleepq_lock(ident);
CTR5(KTR_PROC, "msleep_spin: thread %ld (pid %ld, %s) on %s (%p)",
td->td_tid, p->p_pid, td->td_name, wmesg, ident);
DROP_GIANT();
mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED);
WITNESS_SAVE(&mtx->lock_object, mtx);
mtx_unlock_spin(mtx);
/*
* We put ourselves on the sleep queue and start our timeout.
*/
sleepq_add(ident, &mtx->lock_object, wmesg, SLEEPQ_SLEEP, 0);
if (timo)
sleepq_set_timeout(ident, timo);
/*
* Can't call ktrace with any spin locks held so it can lock the
* ktrace_mtx lock, and WITNESS_WARN considers it an error to hold
* any spin lock. Thus, we have to drop the sleepq spin lock while
* we handle those requests. This is safe since we have placed our
* thread on the sleep queue already.
*/
#ifdef KTRACE
if (KTRPOINT(td, KTR_CSW)) {
sleepq_release(ident);
ktrcsw(1, 0);
sleepq_lock(ident);
}
#endif
#ifdef WITNESS
sleepq_release(ident);
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "Sleeping on \"%s\"",
wmesg);
sleepq_lock(ident);
#endif
if (timo)
rval = sleepq_timedwait(ident, 0);
else {
sleepq_wait(ident, 0);
rval = 0;
}
#ifdef KTRACE
if (KTRPOINT(td, KTR_CSW))
ktrcsw(0, 0);
#endif
PICKUP_GIANT();
mtx_lock_spin(mtx);
WITNESS_RESTORE(&mtx->lock_object, mtx);
return (rval);
}
/*
* pause() is like tsleep() except that the intention is to not be
* explicitly woken up by another thread. Instead, the current thread
* simply wishes to sleep until the timeout expires. It is
* implemented using a dummy wait channel.
*/
int
pause(const char *wmesg, int timo)
{
KASSERT(timo != 0, ("pause: timeout required"));
return (tsleep(&pause_wchan, 0, wmesg, timo));
}
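/*
 * Editor's note: illustrative use only.  A caller that merely needs to
 * wait for a while, with nothing ever calling wakeup() on its behalf,
 * can do:
 *
 *	pause("delay", hz / 10);	// roughly 100 ms at the default hz
 */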
/*
* Make all threads sleeping on the specified identifier runnable.
*/
void
wakeup(void *ident)
{
int wakeup_swapper;
sleepq_lock(ident);
wakeup_swapper = sleepq_broadcast(ident, SLEEPQ_SLEEP, 0, 0);
sleepq_release(ident);
if (wakeup_swapper) {
KASSERT(ident != &proc0,
("wakeup and wakeup_swapper and proc0"));
kick_proc0();
}
}
/*
* Make a thread sleeping on the specified identifier runnable.
* May wake more than one thread if a target thread is currently
* swapped out.
*/
void
wakeup_one(void *ident)
{
int wakeup_swapper;
sleepq_lock(ident);
wakeup_swapper = sleepq_signal(ident, SLEEPQ_SLEEP, 0, 0);
sleepq_release(ident);
if (wakeup_swapper)
kick_proc0();
}
static void
kdb_switch(void)
{
thread_unlock(curthread);
kdb_backtrace();
kdb_reenter();
panic("%s: did not reenter debugger", __func__);
}
/*
* The machine independent parts of context switching.
*/
void
mi_switch(int flags, struct thread *newtd)
{
uint64_t runtime, new_switchtime;
struct thread *td;
struct proc *p;
td = curthread; /* XXX */
THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED);
p = td->td_proc; /* XXX */
KASSERT(!TD_ON_RUNQ(td), ("mi_switch: called by old code"));
#ifdef INVARIANTS
if (!TD_ON_LOCK(td) && !TD_IS_RUNNING(td))
mtx_assert(&Giant, MA_NOTOWNED);
#endif
KASSERT(td->td_critnest == 1 || panicstr,
("mi_switch: switch in a critical section"));
KASSERT((flags & (SW_INVOL | SW_VOL)) != 0,
("mi_switch: switch must be voluntary or involuntary"));
KASSERT(newtd != curthread, ("mi_switch: preempting back to ourself"));
/*
* Don't perform context switches from the debugger.
*/
if (kdb_active)
kdb_switch();
if (flags & SW_VOL) {
td->td_ru.ru_nvcsw++;
td->td_swvoltick = ticks;
} else
td->td_ru.ru_nivcsw++;
#ifdef SCHED_STATS
SCHED_STAT_INC(sched_switch_stats[flags & SW_TYPE_MASK]);
#endif
/*
* Compute the amount of time during which the current
* thread was running, and add that to its total so far.
*/
new_switchtime = cpu_ticks();
runtime = new_switchtime - PCPU_GET(switchtime);
td->td_runtime += runtime;
td->td_incruntime += runtime;
PCPU_SET(switchtime, new_switchtime);
td->td_generation++; /* bump preempt-detect counter */
PCPU_INC(cnt.v_swtch);
PCPU_SET(switchticks, ticks);
CTR4(KTR_PROC, "mi_switch: old thread %ld (td_sched %p, pid %ld, %s)",
td->td_tid, td->td_sched, p->p_pid, td->td_name);
#if (KTR_COMPILE & KTR_SCHED) != 0
if (TD_IS_IDLETHREAD(td))
KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "idle",
"prio:%d", td->td_priority);
else
KTR_STATE3(KTR_SCHED, "thread", sched_tdname(td), KTDSTATE(td),
"prio:%d", td->td_priority, "wmesg:\"%s\"", td->td_wmesg,
"lockname:\"%s\"", td->td_lockname);
#endif
#ifdef XEN
PT_UPDATES_FLUSH();
#endif
sched_switch(td, newtd, flags);
KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running",
"prio:%d", td->td_priority);
CTR4(KTR_PROC, "mi_switch: new thread %ld (td_sched %p, pid %ld, %s)",
td->td_tid, td->td_sched, p->p_pid, td->td_name);
/*
* If the last thread was exiting, finish cleaning it up.
*/
if ((td = PCPU_GET(deadthread))) {
PCPU_SET(deadthread, NULL);
thread_stash(td);
}
}
/*
* Change thread state to be runnable, placing it on the run queue if
* it is in memory. If it is swapped out, return true so our caller
* will know to awaken the swapper.
*/
int
setrunnable(struct thread *td)
{
THREAD_LOCK_ASSERT(td, MA_OWNED);
KASSERT(td->td_proc->p_state != PRS_ZOMBIE,
("setrunnable: pid %d is a zombie", td->td_proc->p_pid));
switch (td->td_state) {
case TDS_RUNNING:
case TDS_RUNQ:
return (0);
case TDS_INHIBITED:
/*
* If we are only inhibited because we are swapped out
* then arrange to swap in this process. Otherwise just return.
*/
if (td->td_inhibitors != TDI_SWAPPED)
return (0);
/* FALLTHROUGH */
case TDS_CAN_RUN:
break;
default:
printf("state is 0x%x", td->td_state);
panic("setrunnable(2)");
}
if ((td->td_flags & TDF_INMEM) == 0) {
if ((td->td_flags & TDF_SWAPINREQ) == 0) {
td->td_flags |= TDF_SWAPINREQ;
return (1);
}
} else
sched_wakeup(td);
return (0);
}
/*
* Compute a tenex style load average of a quantity on
* 1, 5 and 15 minute intervals.
*/
static void
loadav(void *arg)
{
int i, nrun;
struct loadavg *avg;
nrun = sched_load();
avg = &averunnable;
for (i = 0; i < 3; i++)
avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
/*
* Schedule the next update to occur after 5 seconds, but add a
* random variation to avoid synchronisation with processes that
* run at regular intervals.
*/
callout_reset(&loadav_callout, hz * 4 + (int)(random() % (hz * 2 + 1)),
loadav, NULL);
}
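/*
 * Editor's note: a worked instance of the fixed-point update above,
 * assuming the usual FSHIFT of 11 (so FSCALE == 2048) and the 1-minute
 * constant cexp[0] ~= 0.92 * FSCALE == 1884:
 *
 *	old ldavg[0] = 2048				(a load of 1.00)
 *	nrun         = 3
 *	new ldavg[0] = (1884 * 2048 + 3 * 2048 * (2048 - 1884)) >> 11
 *	             = 2376				(a load of ~1.16)
 *
 * which matches the continuous form 0.92 * 1.00 + (1 - 0.92) * 3.
 */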
/* ARGSUSED */
static void
synch_setup(void *dummy)
{
callout_init(&loadav_callout, CALLOUT_MPSAFE);
/* Kick off timeout driven events by calling first time. */
loadav(NULL);
}
int
should_yield(void)
{
return (ticks - curthread->td_swvoltick >= hogticks);
}
void
maybe_yield(void)
{
if (should_yield())
kern_yield(PRI_USER);
}
void
kern_yield(int prio)
{
struct thread *td;
td = curthread;
DROP_GIANT();
thread_lock(td);
if (prio == PRI_USER)
prio = td->td_user_pri;
if (prio >= 0)
sched_prio(td, prio);
mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
thread_unlock(td);
PICKUP_GIANT();
}
/*
* General purpose yield system call.
*/
int
-yield(struct thread *td, struct yield_args *uap)
+sys_yield(struct thread *td, struct yield_args *uap)
{
thread_lock(td);
if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
sched_prio(td, PRI_MAX_TIMESHARE);
mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
thread_unlock(td);
td->td_retval[0] = 0;
return (0);
}
Index: head/sys/kern/kern_sysctl.c
===================================================================
--- head/sys/kern/kern_sysctl.c (revision 225616)
+++ head/sys/kern/kern_sysctl.c (revision 225617)
@@ -1,1668 +1,1668 @@
/*-
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Mike Karels at Berkeley Software Design, Inc.
*
* Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD
* project, to make these variables more userfriendly.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_capsicum.h"
#include "opt_compat.h"
#include "opt_ktrace.h"
#include <sys/param.h>
#include <sys/fail.h>
#include <sys/systm.h>
#include <sys/capability.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/jail.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sbuf.h>
#include <sys/sx.h>
#include <sys/sysproto.h>
#include <sys/uio.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <net/vnet.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
static MALLOC_DEFINE(M_SYSCTL, "sysctl", "sysctl internal magic");
static MALLOC_DEFINE(M_SYSCTLOID, "sysctloid", "sysctl dynamic oids");
static MALLOC_DEFINE(M_SYSCTLTMP, "sysctltmp", "sysctl temp output buffer");
/*
* The sysctllock protects the MIB tree. It also protects sysctl
* contexts used with dynamic sysctls. The sysctl_register_oid() and
* sysctl_unregister_oid() routines require the sysctllock to already
* be held, so the sysctl_lock() and sysctl_unlock() routines are
* provided for the few places in the kernel which need to use that
* API rather than using the dynamic API. Use of the dynamic API is
* strongly encouraged for most code.
*
* The sysctlmemlock is used to limit the amount of user memory wired for
* sysctl requests. This is implemented by serializing any userland
* sysctl requests larger than a single page via an exclusive lock.
*/
static struct sx sysctllock;
static struct sx sysctlmemlock;
#define SYSCTL_XLOCK() sx_xlock(&sysctllock)
#define SYSCTL_XUNLOCK() sx_xunlock(&sysctllock)
#define SYSCTL_ASSERT_XLOCKED() sx_assert(&sysctllock, SA_XLOCKED)
#define SYSCTL_INIT() sx_init(&sysctllock, "sysctl lock")
#define SYSCTL_SLEEP(ch, wmesg, timo) \
sx_sleep(ch, &sysctllock, 0, wmesg, timo)
static int sysctl_root(SYSCTL_HANDLER_ARGS);
struct sysctl_oid_list sysctl__children; /* root list */
static int sysctl_remove_oid_locked(struct sysctl_oid *oidp, int del,
int recurse);
static struct sysctl_oid *
sysctl_find_oidname(const char *name, struct sysctl_oid_list *list)
{
struct sysctl_oid *oidp;
SYSCTL_ASSERT_XLOCKED();
SLIST_FOREACH(oidp, list, oid_link) {
if (strcmp(oidp->oid_name, name) == 0) {
return (oidp);
}
}
return (NULL);
}
/*
* Initialization of the MIB tree.
*
* Order by number in each list.
*/
void
sysctl_lock(void)
{
SYSCTL_XLOCK();
}
void
sysctl_unlock(void)
{
SYSCTL_XUNLOCK();
}
void
sysctl_register_oid(struct sysctl_oid *oidp)
{
struct sysctl_oid_list *parent = oidp->oid_parent;
struct sysctl_oid *p;
struct sysctl_oid *q;
/*
* First check if another oid with the same name already
* exists in the parent's list.
*/
SYSCTL_ASSERT_XLOCKED();
p = sysctl_find_oidname(oidp->oid_name, parent);
if (p != NULL) {
if ((p->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
p->oid_refcnt++;
return;
} else {
printf("can't re-use a leaf (%s)!\n", p->oid_name);
return;
}
}
/*
* If this oid has a number OID_AUTO, give it a number which
* is greater than any current oid.
* NOTE: DO NOT change the starting value here, change it in
* <sys/sysctl.h>, and make sure it is at least 256 to
* accommodate e.g. net.inet.raw as a static sysctl node.
*/
if (oidp->oid_number == OID_AUTO) {
static int newoid = CTL_AUTO_START;
oidp->oid_number = newoid++;
if (newoid == 0x7fffffff)
panic("out of oids");
}
#if 0
else if (oidp->oid_number >= CTL_AUTO_START) {
/* do not panic; this happens when unregistering sysctl sets */
printf("static sysctl oid too high: %d", oidp->oid_number);
}
#endif
/*
* Insert the oid into the parent's list in order.
*/
q = NULL;
SLIST_FOREACH(p, parent, oid_link) {
if (oidp->oid_number < p->oid_number)
break;
q = p;
}
if (q)
SLIST_INSERT_AFTER(q, oidp, oid_link);
else
SLIST_INSERT_HEAD(parent, oidp, oid_link);
}
void
sysctl_unregister_oid(struct sysctl_oid *oidp)
{
struct sysctl_oid *p;
int error;
SYSCTL_ASSERT_XLOCKED();
error = ENOENT;
if (oidp->oid_number == OID_AUTO) {
error = EINVAL;
} else {
SLIST_FOREACH(p, oidp->oid_parent, oid_link) {
if (p == oidp) {
SLIST_REMOVE(oidp->oid_parent, oidp,
sysctl_oid, oid_link);
error = 0;
break;
}
}
}
/*
* This can happen when a module fails to register and is
* being unloaded afterwards. It should not be a panic()
* for normal use.
*/
if (error)
printf("%s: failed to unregister sysctl\n", __func__);
}
/* Initialize a new context to keep track of dynamically added sysctls. */
int
sysctl_ctx_init(struct sysctl_ctx_list *c)
{
if (c == NULL) {
return (EINVAL);
}
/*
* No locking here, the caller is responsible for not adding
* new nodes to a context until after this function has
* returned.
*/
TAILQ_INIT(c);
return (0);
}
/* Free the context, and destroy all dynamic oids registered in this context */
int
sysctl_ctx_free(struct sysctl_ctx_list *clist)
{
struct sysctl_ctx_entry *e, *e1;
int error;
error = 0;
/*
* First perform a "dry run" to check if it's ok to remove oids.
* XXX FIXME
* XXX This algorithm is a hack. But I don't know any
* XXX better solution for now...
*/
SYSCTL_XLOCK();
TAILQ_FOREACH(e, clist, link) {
error = sysctl_remove_oid_locked(e->entry, 0, 0);
if (error)
break;
}
/*
* Restore deregistered entries, either from the end,
* or from the place where the error occurred.
* e contains the entry that was not unregistered.
*/
if (error)
e1 = TAILQ_PREV(e, sysctl_ctx_list, link);
else
e1 = TAILQ_LAST(clist, sysctl_ctx_list);
while (e1 != NULL) {
sysctl_register_oid(e1->entry);
e1 = TAILQ_PREV(e1, sysctl_ctx_list, link);
}
if (error) {
SYSCTL_XUNLOCK();
return(EBUSY);
}
/* Now really delete the entries */
e = TAILQ_FIRST(clist);
while (e != NULL) {
e1 = TAILQ_NEXT(e, link);
error = sysctl_remove_oid_locked(e->entry, 1, 0);
if (error)
panic("sysctl_remove_oid: corrupt tree, entry: %s",
e->entry->oid_name);
free(e, M_SYSCTLOID);
e = e1;
}
SYSCTL_XUNLOCK();
return (error);
}
/* Add an entry to the context */
struct sysctl_ctx_entry *
sysctl_ctx_entry_add(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp)
{
struct sysctl_ctx_entry *e;
SYSCTL_ASSERT_XLOCKED();
if (clist == NULL || oidp == NULL)
return(NULL);
e = malloc(sizeof(struct sysctl_ctx_entry), M_SYSCTLOID, M_WAITOK);
e->entry = oidp;
TAILQ_INSERT_HEAD(clist, e, link);
return (e);
}
/* Find an entry in the context */
struct sysctl_ctx_entry *
sysctl_ctx_entry_find(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp)
{
struct sysctl_ctx_entry *e;
SYSCTL_ASSERT_XLOCKED();
if (clist == NULL || oidp == NULL)
return(NULL);
TAILQ_FOREACH(e, clist, link) {
if(e->entry == oidp)
return(e);
}
return (e);
}
/*
* Delete an entry from the context.
* NOTE: this function doesn't free oidp! You have to remove it
* with sysctl_remove_oid().
*/
int
sysctl_ctx_entry_del(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp)
{
struct sysctl_ctx_entry *e;
if (clist == NULL || oidp == NULL)
return (EINVAL);
SYSCTL_XLOCK();
e = sysctl_ctx_entry_find(clist, oidp);
if (e != NULL) {
TAILQ_REMOVE(clist, e, link);
SYSCTL_XUNLOCK();
free(e, M_SYSCTLOID);
return (0);
} else {
SYSCTL_XUNLOCK();
return (ENOENT);
}
}
/*
* Remove dynamically created sysctl trees.
* oidp - top of the tree to be removed
* del - if 0 - just deregister, otherwise free up entries as well
* recurse - if != 0 traverse the subtree to be deleted
*/
int
sysctl_remove_oid(struct sysctl_oid *oidp, int del, int recurse)
{
int error;
SYSCTL_XLOCK();
error = sysctl_remove_oid_locked(oidp, del, recurse);
SYSCTL_XUNLOCK();
return (error);
}
int
sysctl_remove_name(struct sysctl_oid *parent, const char *name,
int del, int recurse)
{
struct sysctl_oid *p, *tmp;
int error;
error = ENOENT;
SYSCTL_XLOCK();
SLIST_FOREACH_SAFE(p, SYSCTL_CHILDREN(parent), oid_link, tmp) {
if (strcmp(p->oid_name, name) == 0) {
error = sysctl_remove_oid_locked(p, del, recurse);
break;
}
}
SYSCTL_XUNLOCK();
return (error);
}
static int
sysctl_remove_oid_locked(struct sysctl_oid *oidp, int del, int recurse)
{
struct sysctl_oid *p, *tmp;
int error;
SYSCTL_ASSERT_XLOCKED();
if (oidp == NULL)
return(EINVAL);
if ((oidp->oid_kind & CTLFLAG_DYN) == 0) {
printf("can't remove non-dynamic nodes!\n");
return (EINVAL);
}
/*
* WARNING: normal method to do this should be through
* sysctl_ctx_free(). Use recursion only as a last-resort
* method to purge your sysctl tree of leftovers...
* However, if some other code still references these nodes,
* it will panic.
*/
if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
if (oidp->oid_refcnt == 1) {
SLIST_FOREACH_SAFE(p,
SYSCTL_CHILDREN(oidp), oid_link, tmp) {
if (!recurse)
return (ENOTEMPTY);
error = sysctl_remove_oid_locked(p, del,
recurse);
if (error)
return (error);
}
if (del)
free(SYSCTL_CHILDREN(oidp), M_SYSCTLOID);
}
}
if (oidp->oid_refcnt > 1 ) {
oidp->oid_refcnt--;
} else {
if (oidp->oid_refcnt == 0) {
printf("Warning: bad oid_refcnt=%u (%s)!\n",
oidp->oid_refcnt, oidp->oid_name);
return (EINVAL);
}
sysctl_unregister_oid(oidp);
if (del) {
/*
* Wait for all threads running the handler to drain.
* This preserves the previous behavior when the
* sysctl lock was held across a handler invocation,
* and is necessary for module unload correctness.
*/
while (oidp->oid_running > 0) {
oidp->oid_kind |= CTLFLAG_DYING;
SYSCTL_SLEEP(&oidp->oid_running, "oidrm", 0);
}
if (oidp->oid_descr)
free((void *)(uintptr_t)(const void *)oidp->oid_descr, M_SYSCTLOID);
free((void *)(uintptr_t)(const void *)oidp->oid_name,
M_SYSCTLOID);
free(oidp, M_SYSCTLOID);
}
}
return (0);
}
/*
* Create new sysctls at run time.
* clist may point to a valid context initialized with sysctl_ctx_init().
*/
struct sysctl_oid *
sysctl_add_oid(struct sysctl_ctx_list *clist, struct sysctl_oid_list *parent,
int number, const char *name, int kind, void *arg1, intptr_t arg2,
int (*handler)(SYSCTL_HANDLER_ARGS), const char *fmt, const char *descr)
{
struct sysctl_oid *oidp;
ssize_t len;
char *newname;
/* You have to hook up somewhere.. */
if (parent == NULL)
return(NULL);
/* Check if the node already exists, otherwise create it */
SYSCTL_XLOCK();
oidp = sysctl_find_oidname(name, parent);
if (oidp != NULL) {
if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
oidp->oid_refcnt++;
/* Update the context */
if (clist != NULL)
sysctl_ctx_entry_add(clist, oidp);
SYSCTL_XUNLOCK();
return (oidp);
} else {
SYSCTL_XUNLOCK();
printf("can't re-use a leaf (%s)!\n", name);
return (NULL);
}
}
oidp = malloc(sizeof(struct sysctl_oid), M_SYSCTLOID, M_WAITOK|M_ZERO);
oidp->oid_parent = parent;
SLIST_NEXT(oidp, oid_link) = NULL;
oidp->oid_number = number;
oidp->oid_refcnt = 1;
len = strlen(name);
newname = malloc(len + 1, M_SYSCTLOID, M_WAITOK);
bcopy(name, newname, len + 1);
newname[len] = '\0';
oidp->oid_name = newname;
oidp->oid_handler = handler;
oidp->oid_kind = CTLFLAG_DYN | kind;
if ((kind & CTLTYPE) == CTLTYPE_NODE) {
/* Allocate space for children */
SYSCTL_CHILDREN_SET(oidp, malloc(sizeof(struct sysctl_oid_list),
M_SYSCTLOID, M_WAITOK));
SLIST_INIT(SYSCTL_CHILDREN(oidp));
oidp->oid_arg2 = arg2;
} else {
oidp->oid_arg1 = arg1;
oidp->oid_arg2 = arg2;
}
oidp->oid_fmt = fmt;
if (descr) {
int len = strlen(descr) + 1;
oidp->oid_descr = malloc(len, M_SYSCTLOID, M_WAITOK);
if (oidp->oid_descr)
strcpy((char *)(uintptr_t)(const void *)oidp->oid_descr, descr);
}
/* Update the context, if used */
if (clist != NULL)
sysctl_ctx_entry_add(clist, oidp);
/* Register this oid */
sysctl_register_oid(oidp);
SYSCTL_XUNLOCK();
return (oidp);
}
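/*
 * Editor's note: an illustrative dynamic-sysctl consumer (sc, sc_ctx,
 * sc_debug and the oid name are hypothetical).  The SYSCTL_ADD_* macros
 * are thin wrappers around sysctl_add_oid():
 *
 *	sysctl_ctx_init(&sc->sc_ctx);
 *	SYSCTL_ADD_INT(&sc->sc_ctx, SYSCTL_STATIC_CHILDREN(_debug),
 *	    OID_AUTO, "mydriver_debug", CTLFLAG_RW, &sc->sc_debug, 0,
 *	    "mydriver debug level");
 *	...
 *	sysctl_ctx_free(&sc->sc_ctx);	// on detach: removes the oid again
 */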
/*
* Rename an existing oid.
*/
void
sysctl_rename_oid(struct sysctl_oid *oidp, const char *name)
{
ssize_t len;
char *newname;
void *oldname;
len = strlen(name);
newname = malloc(len + 1, M_SYSCTLOID, M_WAITOK);
bcopy(name, newname, len + 1);
newname[len] = '\0';
SYSCTL_XLOCK();
oldname = (void *)(uintptr_t)(const void *)oidp->oid_name;
oidp->oid_name = newname;
SYSCTL_XUNLOCK();
free(oldname, M_SYSCTLOID);
}
/*
* Reparent an existing oid.
*/
int
sysctl_move_oid(struct sysctl_oid *oid, struct sysctl_oid_list *parent)
{
struct sysctl_oid *oidp;
SYSCTL_XLOCK();
if (oid->oid_parent == parent) {
SYSCTL_XUNLOCK();
return (0);
}
oidp = sysctl_find_oidname(oid->oid_name, parent);
if (oidp != NULL) {
SYSCTL_XUNLOCK();
return (EEXIST);
}
sysctl_unregister_oid(oid);
oid->oid_parent = parent;
oid->oid_number = OID_AUTO;
sysctl_register_oid(oid);
SYSCTL_XUNLOCK();
return (0);
}
/*
* Register the kernel's oids on startup.
*/
SET_DECLARE(sysctl_set, struct sysctl_oid);
static void
sysctl_register_all(void *arg)
{
struct sysctl_oid **oidp;
sx_init(&sysctlmemlock, "sysctl mem");
SYSCTL_INIT();
SYSCTL_XLOCK();
SET_FOREACH(oidp, sysctl_set)
sysctl_register_oid(*oidp);
SYSCTL_XUNLOCK();
}
SYSINIT(sysctl, SI_SUB_KMEM, SI_ORDER_ANY, sysctl_register_all, 0);
/*
* "Staff-functions"
*
* These functions implement a presently undocumented interface
* used by the sysctl program to walk the tree, and get the type
* so it can print the value.
* This interface is under work and consideration, and should probably
* be killed with a big axe by the first person who can find the time.
* (Be aware, though, that the proper interface isn't as obvious as it
* may seem; there are various conflicting requirements.)
*
* {0,0} printf the entire MIB-tree.
* {0,1,...} return the name of the "..." OID.
* {0,2,...} return the next OID.
* {0,3} return the OID of the name in "new"
* {0,4,...} return the kind & format info for the "..." OID.
* {0,5,...} return the description of the "..." OID.
*/
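/*
 * Editor's note: an illustrative userland use of the {0,3} name2oid
 * entry point described above (error handling omitted; the libc
 * sysctlnametomib(3) wrapper does essentially this):
 *
 *	int qoid[2] = { 0, 3 };
 *	int oid[CTL_MAXNAME];
 *	size_t len = sizeof(oid);
 *
 *	sysctl(qoid, 2, oid, &len, "kern.ostype", strlen("kern.ostype"));
 *	// oid[0 .. len / sizeof(int) - 1] now holds the numeric OID
 */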
#ifdef SYSCTL_DEBUG
static void
sysctl_sysctl_debug_dump_node(struct sysctl_oid_list *l, int i)
{
int k;
struct sysctl_oid *oidp;
SYSCTL_ASSERT_XLOCKED();
SLIST_FOREACH(oidp, l, oid_link) {
for (k=0; k<i; k++)
printf(" ");
printf("%d %s ", oidp->oid_number, oidp->oid_name);
printf("%c%c",
oidp->oid_kind & CTLFLAG_RD ? 'R':' ',
oidp->oid_kind & CTLFLAG_WR ? 'W':' ');
if (oidp->oid_handler)
printf(" *Handler");
switch (oidp->oid_kind & CTLTYPE) {
case CTLTYPE_NODE:
printf(" Node\n");
if (!oidp->oid_handler) {
sysctl_sysctl_debug_dump_node(
oidp->oid_arg1, i+2);
}
break;
case CTLTYPE_INT: printf(" Int\n"); break;
case CTLTYPE_UINT: printf(" u_int\n"); break;
case CTLTYPE_LONG: printf(" Long\n"); break;
case CTLTYPE_ULONG: printf(" u_long\n"); break;
case CTLTYPE_STRING: printf(" String\n"); break;
case CTLTYPE_U64: printf(" uint64_t\n"); break;
case CTLTYPE_S64: printf(" int64_t\n"); break;
case CTLTYPE_OPAQUE: printf(" Opaque/struct\n"); break;
default: printf("\n");
}
}
}
static int
sysctl_sysctl_debug(SYSCTL_HANDLER_ARGS)
{
int error;
error = priv_check(req->td, PRIV_SYSCTL_DEBUG);
if (error)
return (error);
SYSCTL_XLOCK();
sysctl_sysctl_debug_dump_node(&sysctl__children, 0);
SYSCTL_XUNLOCK();
return (ENOENT);
}
SYSCTL_PROC(_sysctl, 0, debug, CTLTYPE_STRING|CTLFLAG_RD,
0, 0, sysctl_sysctl_debug, "-", "");
#endif
static int
sysctl_sysctl_name(SYSCTL_HANDLER_ARGS)
{
int *name = (int *) arg1;
u_int namelen = arg2;
int error = 0;
struct sysctl_oid *oid;
struct sysctl_oid_list *lsp = &sysctl__children, *lsp2;
char buf[10];
SYSCTL_XLOCK();
while (namelen) {
if (!lsp) {
snprintf(buf,sizeof(buf),"%d",*name);
if (req->oldidx)
error = SYSCTL_OUT(req, ".", 1);
if (!error)
error = SYSCTL_OUT(req, buf, strlen(buf));
if (error)
goto out;
namelen--;
name++;
continue;
}
lsp2 = 0;
SLIST_FOREACH(oid, lsp, oid_link) {
if (oid->oid_number != *name)
continue;
if (req->oldidx)
error = SYSCTL_OUT(req, ".", 1);
if (!error)
error = SYSCTL_OUT(req, oid->oid_name,
strlen(oid->oid_name));
if (error)
goto out;
namelen--;
name++;
if ((oid->oid_kind & CTLTYPE) != CTLTYPE_NODE)
break;
if (oid->oid_handler)
break;
lsp2 = SYSCTL_CHILDREN(oid);
break;
}
lsp = lsp2;
}
error = SYSCTL_OUT(req, "", 1);
out:
SYSCTL_XUNLOCK();
return (error);
}
/*
* XXXRW/JA: Shouldn't return name data for nodes that we don't permit in
* capability mode.
*/
static SYSCTL_NODE(_sysctl, 1, name, CTLFLAG_RD | CTLFLAG_CAPRD,
sysctl_sysctl_name, "");
static int
sysctl_sysctl_next_ls(struct sysctl_oid_list *lsp, int *name, u_int namelen,
int *next, int *len, int level, struct sysctl_oid **oidpp)
{
struct sysctl_oid *oidp;
SYSCTL_ASSERT_XLOCKED();
*len = level;
SLIST_FOREACH(oidp, lsp, oid_link) {
*next = oidp->oid_number;
*oidpp = oidp;
if (oidp->oid_kind & CTLFLAG_SKIP)
continue;
if (!namelen) {
if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
return (0);
if (oidp->oid_handler)
/* We really should call the handler here...*/
return (0);
lsp = SYSCTL_CHILDREN(oidp);
if (!sysctl_sysctl_next_ls(lsp, 0, 0, next+1,
len, level+1, oidpp))
return (0);
goto emptynode;
}
if (oidp->oid_number < *name)
continue;
if (oidp->oid_number > *name) {
if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
return (0);
if (oidp->oid_handler)
return (0);
lsp = SYSCTL_CHILDREN(oidp);
if (!sysctl_sysctl_next_ls(lsp, name+1, namelen-1,
next+1, len, level+1, oidpp))
return (0);
goto next;
}
if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
continue;
if (oidp->oid_handler)
continue;
lsp = SYSCTL_CHILDREN(oidp);
if (!sysctl_sysctl_next_ls(lsp, name+1, namelen-1, next+1,
len, level+1, oidpp))
return (0);
next:
namelen = 1;
emptynode:
*len = level;
}
return (1);
}
static int
sysctl_sysctl_next(SYSCTL_HANDLER_ARGS)
{
int *name = (int *) arg1;
u_int namelen = arg2;
int i, j, error;
struct sysctl_oid *oid;
struct sysctl_oid_list *lsp = &sysctl__children;
int newoid[CTL_MAXNAME];
SYSCTL_XLOCK();
i = sysctl_sysctl_next_ls(lsp, name, namelen, newoid, &j, 1, &oid);
SYSCTL_XUNLOCK();
if (i)
return (ENOENT);
error = SYSCTL_OUT(req, newoid, j * sizeof (int));
return (error);
}
/*
* XXXRW/JA: Shouldn't return next data for nodes that we don't permit in
* capability mode.
*/
static SYSCTL_NODE(_sysctl, 2, next, CTLFLAG_RD | CTLFLAG_CAPRD,
sysctl_sysctl_next, "");
static int
name2oid(char *name, int *oid, int *len, struct sysctl_oid **oidpp)
{
int i;
struct sysctl_oid *oidp;
struct sysctl_oid_list *lsp = &sysctl__children;
char *p;
SYSCTL_ASSERT_XLOCKED();
if (!*name)
return (ENOENT);
p = name + strlen(name) - 1 ;
if (*p == '.')
*p = '\0';
*len = 0;
for (p = name; *p && *p != '.'; p++)
;
i = *p;
if (i == '.')
*p = '\0';
oidp = SLIST_FIRST(lsp);
while (oidp && *len < CTL_MAXNAME) {
if (strcmp(name, oidp->oid_name)) {
oidp = SLIST_NEXT(oidp, oid_link);
continue;
}
*oid++ = oidp->oid_number;
(*len)++;
if (!i) {
if (oidpp)
*oidpp = oidp;
return (0);
}
if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
break;
if (oidp->oid_handler)
break;
lsp = SYSCTL_CHILDREN(oidp);
oidp = SLIST_FIRST(lsp);
name = p+1;
for (p = name; *p && *p != '.'; p++)
;
i = *p;
if (i == '.')
*p = '\0';
}
return (ENOENT);
}
static int
sysctl_sysctl_name2oid(SYSCTL_HANDLER_ARGS)
{
char *p;
int error, oid[CTL_MAXNAME], len = 0;
struct sysctl_oid *op = 0;
if (!req->newlen)
return (ENOENT);
if (req->newlen >= MAXPATHLEN) /* XXX arbitrary, undocumented */
return (ENAMETOOLONG);
p = malloc(req->newlen+1, M_SYSCTL, M_WAITOK);
error = SYSCTL_IN(req, p, req->newlen);
if (error) {
free(p, M_SYSCTL);
return (error);
}
p [req->newlen] = '\0';
SYSCTL_XLOCK();
error = name2oid(p, oid, &len, &op);
SYSCTL_XUNLOCK();
free(p, M_SYSCTL);
if (error)
return (error);
error = SYSCTL_OUT(req, oid, len * sizeof *oid);
return (error);
}
/*
* XXXRW/JA: Shouldn't return name2oid data for nodes that we don't permit in
* capability mode.
*/
SYSCTL_PROC(_sysctl, 3, name2oid,
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MPSAFE
| CTLFLAG_CAPRW, 0, 0, sysctl_sysctl_name2oid, "I", "");
static int
sysctl_sysctl_oidfmt(SYSCTL_HANDLER_ARGS)
{
struct sysctl_oid *oid;
int error;
SYSCTL_XLOCK();
error = sysctl_find_oid(arg1, arg2, &oid, NULL, req);
if (error)
goto out;
if (oid->oid_fmt == NULL) {
error = ENOENT;
goto out;
}
error = SYSCTL_OUT(req, &oid->oid_kind, sizeof(oid->oid_kind));
if (error)
goto out;
error = SYSCTL_OUT(req, oid->oid_fmt, strlen(oid->oid_fmt) + 1);
out:
SYSCTL_XUNLOCK();
return (error);
}
static SYSCTL_NODE(_sysctl, 4, oidfmt, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLFLAG_CAPRD,
sysctl_sysctl_oidfmt, "");
static int
sysctl_sysctl_oiddescr(SYSCTL_HANDLER_ARGS)
{
struct sysctl_oid *oid;
int error;
SYSCTL_XLOCK();
error = sysctl_find_oid(arg1, arg2, &oid, NULL, req);
if (error)
goto out;
if (oid->oid_descr == NULL) {
error = ENOENT;
goto out;
}
error = SYSCTL_OUT(req, oid->oid_descr, strlen(oid->oid_descr) + 1);
out:
SYSCTL_XUNLOCK();
return (error);
}
static SYSCTL_NODE(_sysctl, 5, oiddescr, CTLFLAG_RD|CTLFLAG_CAPRD,
sysctl_sysctl_oiddescr, "");
/*
* Default "handler" functions.
*/
/*
* Handle an int, signed or unsigned.
* Two cases:
* a variable: point arg1 at it.
* a constant: pass it in arg2.
*/
int
sysctl_handle_int(SYSCTL_HANDLER_ARGS)
{
int tmpout, error = 0;
/*
* Attempt to get a coherent snapshot by making a copy of the data.
*/
if (arg1)
tmpout = *(int *)arg1;
else
tmpout = arg2;
error = SYSCTL_OUT(req, &tmpout, sizeof(int));
if (error || !req->newptr)
return (error);
if (!arg1)
error = EPERM;
else
error = SYSCTL_IN(req, arg1, sizeof(int));
return (error);
}
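/*
 * Illustrative sketch, not part of this revision: a typical private
 * sysctl built on sysctl_handle_int() that validates the new value
 * before accepting it.  The names example_limit and
 * example_limit_sysctl are hypothetical; <sys/sysctl.h> is assumed to
 * already be included, as it is in the files above.
 */
static int example_limit = 10;

static int
example_limit_sysctl(SYSCTL_HANDLER_ARGS)
{
        int error, val;

        val = example_limit;
        error = sysctl_handle_int(oidp, &val, 0, req);
        if (error != 0 || req->newptr == NULL)
                return (error);
        if (val < 1 || val > 100)
                return (EINVAL);
        example_limit = val;
        return (0);
}
SYSCTL_PROC(_debug, OID_AUTO, example_limit,
    CTLTYPE_INT | CTLFLAG_RW, NULL, 0, example_limit_sysctl, "I",
    "Example limit (1-100)");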
/*
* Based on sysctl_handle_int(), convert milliseconds into ticks.
* Note: this is used by TCP.
*/
int
sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS)
{
int error, s, tt;
tt = *(int *)arg1;
s = (int)((int64_t)tt * 1000 / hz);
error = sysctl_handle_int(oidp, &s, 0, req);
if (error || !req->newptr)
return (error);
tt = (int)((int64_t)s * hz / 1000);
if (tt < 1)
return (EINVAL);
*(int *)arg1 = tt;
return (0);
}
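/*
 * Illustrative sketch, not part of this revision: exposing a variable
 * kept in ticks as a millisecond-valued sysctl through
 * sysctl_msec_to_ticks().  example_timeout_ticks is a hypothetical
 * name; arg1 must point at the tick-valued variable, which should be
 * initialized elsewhere (e.g. to 5 * hz), since hz is not a constant.
 */
static int example_timeout_ticks;

SYSCTL_PROC(_debug, OID_AUTO, example_timeout_msec,
    CTLTYPE_INT | CTLFLAG_RW, &example_timeout_ticks, 0,
    sysctl_msec_to_ticks, "I", "Example timeout (milliseconds)");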
/*
* Handle a long, signed or unsigned. arg1 points to it.
*/
int
sysctl_handle_long(SYSCTL_HANDLER_ARGS)
{
int error = 0;
long tmplong;
#ifdef SCTL_MASK32
int tmpint;
#endif
/*
* Attempt to get a coherent snapshot by making a copy of the data.
*/
if (!arg1)
return (EINVAL);
tmplong = *(long *)arg1;
#ifdef SCTL_MASK32
if (req->flags & SCTL_MASK32) {
tmpint = tmplong;
error = SYSCTL_OUT(req, &tmpint, sizeof(int));
} else
#endif
error = SYSCTL_OUT(req, &tmplong, sizeof(long));
if (error || !req->newptr)
return (error);
#ifdef SCTL_MASK32
if (req->flags & SCTL_MASK32) {
error = SYSCTL_IN(req, &tmpint, sizeof(int));
*(long *)arg1 = (long)tmpint;
} else
#endif
error = SYSCTL_IN(req, arg1, sizeof(long));
return (error);
}
/*
* Handle a 64 bit int, signed or unsigned. arg1 points to it.
*/
int
sysctl_handle_64(SYSCTL_HANDLER_ARGS)
{
int error = 0;
uint64_t tmpout;
/*
* Attempt to get a coherent snapshot by making a copy of the data.
*/
if (!arg1)
return (EINVAL);
tmpout = *(uint64_t *)arg1;
error = SYSCTL_OUT(req, &tmpout, sizeof(uint64_t));
if (error || !req->newptr)
return (error);
error = SYSCTL_IN(req, arg1, sizeof(uint64_t));
return (error);
}
/*
* Handle our generic '\0' terminated 'C' string.
* Two cases:
* a variable string: point arg1 at it, arg2 is max length.
* a constant string: point arg1 at it, arg2 is zero.
*/
int
sysctl_handle_string(SYSCTL_HANDLER_ARGS)
{
int error = 0;
char *tmparg;
size_t outlen;
/*
* Attempt to get a coherent snapshot by copying to a
* temporary kernel buffer.
*/
retry:
outlen = strlen((char *)arg1)+1;
tmparg = malloc(outlen, M_SYSCTLTMP, M_WAITOK);
if (strlcpy(tmparg, (char *)arg1, outlen) >= outlen) {
	/* The source string grew while we were copying it; retry. */
	free(tmparg, M_SYSCTLTMP);
	goto retry;
}
error = SYSCTL_OUT(req, tmparg, outlen);
free(tmparg, M_SYSCTLTMP);
if (error || !req->newptr)
return (error);
if ((req->newlen - req->newidx) >= arg2) {
error = EINVAL;
} else {
arg2 = (req->newlen - req->newidx);
error = SYSCTL_IN(req, arg1, arg2);
((char *)arg1)[arg2] = '\0';
}
return (error);
}
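/*
 * Illustrative sketch, not part of this revision: most string sysctls
 * do not need a custom handler; SYSCTL_STRING() wires up
 * sysctl_handle_string() with arg1/arg2 used as described above.
 * example_ident is a hypothetical name.
 */
static char example_ident[32] = "default";

SYSCTL_STRING(_debug, OID_AUTO, example_ident, CTLFLAG_RW,
    example_ident, sizeof(example_ident), "Example identifier string");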
/*
* Handle any kind of opaque data.
* arg1 points to it, arg2 is the size.
*/
int
sysctl_handle_opaque(SYSCTL_HANDLER_ARGS)
{
int error, tries;
u_int generation;
struct sysctl_req req2;
/*
* Attempt to get a coherent snapshot, by using the thread
* pre-emption counter updated from within mi_switch() to
* determine if we were pre-empted during a bcopy() or
* copyout(). Make 3 attempts at doing this before giving up.
* If we encounter an error, stop immediately.
*/
tries = 0;
req2 = *req;
retry:
generation = curthread->td_generation;
error = SYSCTL_OUT(req, arg1, arg2);
if (error)
return (error);
tries++;
if (generation != curthread->td_generation && tries < 3) {
*req = req2;
goto retry;
}
error = SYSCTL_IN(req, arg1, arg2);
return (error);
}
/*
* Transfer functions to/from kernel space.
* XXX: rather untested at this point
*/
static int
sysctl_old_kernel(struct sysctl_req *req, const void *p, size_t l)
{
size_t i = 0;
if (req->oldptr) {
i = l;
if (req->oldlen <= req->oldidx)
i = 0;
else
if (i > req->oldlen - req->oldidx)
i = req->oldlen - req->oldidx;
if (i > 0)
bcopy(p, (char *)req->oldptr + req->oldidx, i);
}
req->oldidx += l;
if (req->oldptr && i != l)
return (ENOMEM);
return (0);
}
static int
sysctl_new_kernel(struct sysctl_req *req, void *p, size_t l)
{
if (!req->newptr)
return (0);
if (req->newlen - req->newidx < l)
return (EINVAL);
bcopy((char *)req->newptr + req->newidx, p, l);
req->newidx += l;
return (0);
}
int
kernel_sysctl(struct thread *td, int *name, u_int namelen, void *old,
size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags)
{
int error = 0;
struct sysctl_req req;
bzero(&req, sizeof req);
req.td = td;
req.flags = flags;
if (oldlenp) {
req.oldlen = *oldlenp;
}
req.validlen = req.oldlen;
if (old) {
req.oldptr = old;
}
if (new != NULL) {
req.newlen = newlen;
req.newptr = new;
}
req.oldfunc = sysctl_old_kernel;
req.newfunc = sysctl_new_kernel;
req.lock = REQ_UNWIRED;
SYSCTL_XLOCK();
error = sysctl_root(0, name, namelen, &req);
SYSCTL_XUNLOCK();
if (req.lock == REQ_WIRED && req.validlen > 0)
vsunlock(req.oldptr, req.validlen);
if (error && error != ENOMEM)
return (error);
if (retval) {
if (req.oldptr && req.oldidx > req.validlen)
*retval = req.validlen;
else
*retval = req.oldidx;
}
return (error);
}
int
kernel_sysctlbyname(struct thread *td, char *name, void *old, size_t *oldlenp,
void *new, size_t newlen, size_t *retval, int flags)
{
int oid[CTL_MAXNAME];
size_t oidlen, plen;
int error;
oid[0] = 0; /* sysctl internal magic */
oid[1] = 3; /* name2oid */
oidlen = sizeof(oid);
error = kernel_sysctl(td, oid, 2, oid, &oidlen,
(void *)name, strlen(name), &plen, flags);
if (error)
return (error);
error = kernel_sysctl(td, oid, plen / sizeof(int), old, oldlenp,
new, newlen, retval, flags);
return (error);
}
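/*
 * Illustrative sketch, not part of this revision: reading an integer
 * sysctl from kernel code by name.  "hw.ncpu" is only an example of a
 * well-known read-only OID; example_read_ncpu is a hypothetical name.
 */
static int
example_read_ncpu(struct thread *td)
{
        int error, ncpu;
        size_t len;

        len = sizeof(ncpu);
        error = kernel_sysctlbyname(td, "hw.ncpu", &ncpu, &len,
            NULL, 0, NULL, 0);
        if (error == 0)
                printf("hw.ncpu = %d\n", ncpu);
        return (error);
}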
/*
* Transfer function to/from user space.
*/
static int
sysctl_old_user(struct sysctl_req *req, const void *p, size_t l)
{
int error = 0;
size_t i, len, origidx;
origidx = req->oldidx;
req->oldidx += l;
if (req->oldptr == NULL)
return (0);
/*
* If we have not wired the user supplied buffer and we are currently
* holding locks, drop a witness warning, as it's possible that
* write operations to the user page can sleep.
*/
if (req->lock != REQ_WIRED)
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
"sysctl_old_user()");
i = l;
len = req->validlen;
if (len <= origidx)
i = 0;
else {
if (i > len - origidx)
i = len - origidx;
error = copyout(p, (char *)req->oldptr + origidx, i);
}
if (error)
return (error);
if (i < l)
return (ENOMEM);
return (0);
}
static int
sysctl_new_user(struct sysctl_req *req, void *p, size_t l)
{
int error;
if (!req->newptr)
return (0);
if (req->newlen - req->newidx < l)
return (EINVAL);
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
"sysctl_new_user()");
error = copyin((char *)req->newptr + req->newidx, p, l);
req->newidx += l;
return (error);
}
/*
* Wire the user space destination buffer. If set to a value greater than
* zero, the len parameter limits the maximum amount of wired memory.
*/
int
sysctl_wire_old_buffer(struct sysctl_req *req, size_t len)
{
int ret;
size_t wiredlen;
wiredlen = (len > 0 && len < req->oldlen) ? len : req->oldlen;
ret = 0;
if (req->lock != REQ_WIRED && req->oldptr &&
req->oldfunc == sysctl_old_user) {
if (wiredlen != 0) {
ret = vslock(req->oldptr, wiredlen);
if (ret != 0) {
if (ret != ENOMEM)
return (ret);
wiredlen = 0;
}
}
req->lock = REQ_WIRED;
req->validlen = wiredlen;
}
return (0);
}
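/*
 * Illustrative sketch, not part of this revision: a handler that must
 * call SYSCTL_OUT() while holding a non-sleepable lock wires the old
 * (user) buffer first, so the copyout cannot fault and sleep.
 * example_mtx and example_stats are hypothetical; example_mtx is
 * assumed to have been set up with mtx_init() elsewhere.
 */
static struct mtx example_mtx;
static struct { int packets; int errors; } example_stats;

static int
example_stats_sysctl(SYSCTL_HANDLER_ARGS)
{
        int error;

        error = sysctl_wire_old_buffer(req, sizeof(example_stats));
        if (error != 0)
                return (error);
        mtx_lock(&example_mtx);
        error = SYSCTL_OUT(req, &example_stats, sizeof(example_stats));
        mtx_unlock(&example_mtx);
        return (error);
}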
int
sysctl_find_oid(int *name, u_int namelen, struct sysctl_oid **noid,
int *nindx, struct sysctl_req *req)
{
struct sysctl_oid_list *lsp;
struct sysctl_oid *oid;
int indx;
SYSCTL_ASSERT_XLOCKED();
lsp = &sysctl__children;
indx = 0;
while (indx < CTL_MAXNAME) {
SLIST_FOREACH(oid, lsp, oid_link) {
if (oid->oid_number == name[indx])
break;
}
if (oid == NULL)
return (ENOENT);
indx++;
if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
if (oid->oid_handler != NULL || indx == namelen) {
*noid = oid;
if (nindx != NULL)
*nindx = indx;
KASSERT((oid->oid_kind & CTLFLAG_DYING) == 0,
("%s found DYING node %p", __func__, oid));
return (0);
}
lsp = SYSCTL_CHILDREN(oid);
} else if (indx == namelen) {
*noid = oid;
if (nindx != NULL)
*nindx = indx;
KASSERT((oid->oid_kind & CTLFLAG_DYING) == 0,
("%s found DYING node %p", __func__, oid));
return (0);
} else {
return (ENOTDIR);
}
}
return (ENOENT);
}
/*
* Traverse our tree, and find the right node, execute whatever it points
* to, and return the resulting error code.
*/
static int
sysctl_root(SYSCTL_HANDLER_ARGS)
{
struct sysctl_oid *oid;
int error, indx, lvl;
SYSCTL_ASSERT_XLOCKED();
error = sysctl_find_oid(arg1, arg2, &oid, &indx, req);
if (error)
return (error);
if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
/*
* You can't call a sysctl when it's a node that has no
* handler.  Inform the user that it's a node.
* The indx may or may not be the same as namelen.
*/
if (oid->oid_handler == NULL)
return (EISDIR);
}
/* Is this sysctl writable? */
if (req->newptr && !(oid->oid_kind & CTLFLAG_WR))
return (EPERM);
KASSERT(req->td != NULL, ("sysctl_root(): req->td == NULL"));
#ifdef CAPABILITY_MODE
/*
* If the process is in capability mode, then don't permit reading or
* writing unless specifically granted for the node.
*/
if (IN_CAPABILITY_MODE(req->td)) {
if (req->oldptr && !(oid->oid_kind & CTLFLAG_CAPRD))
return (EPERM);
if (req->newptr && !(oid->oid_kind & CTLFLAG_CAPWR))
return (EPERM);
}
#endif
/* Is this sysctl sensitive to securelevels? */
if (req->newptr && (oid->oid_kind & CTLFLAG_SECURE)) {
lvl = (oid->oid_kind & CTLMASK_SECURE) >> CTLSHIFT_SECURE;
error = securelevel_gt(req->td->td_ucred, lvl);
if (error)
return (error);
}
/* Is this sysctl writable by only privileged users? */
if (req->newptr && !(oid->oid_kind & CTLFLAG_ANYBODY)) {
int priv;
if (oid->oid_kind & CTLFLAG_PRISON)
priv = PRIV_SYSCTL_WRITEJAIL;
#ifdef VIMAGE
else if ((oid->oid_kind & CTLFLAG_VNET) &&
prison_owns_vnet(req->td->td_ucred))
priv = PRIV_SYSCTL_WRITEJAIL;
#endif
else
priv = PRIV_SYSCTL_WRITE;
error = priv_check(req->td, priv);
if (error)
return (error);
}
if (!oid->oid_handler)
return (EINVAL);
if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
arg1 = (int *)arg1 + indx;
arg2 -= indx;
} else {
arg1 = oid->oid_arg1;
arg2 = oid->oid_arg2;
}
#ifdef MAC
error = mac_system_check_sysctl(req->td->td_ucred, oid, arg1, arg2,
req);
if (error != 0)
return (error);
#endif
oid->oid_running++;
SYSCTL_XUNLOCK();
if (!(oid->oid_kind & CTLFLAG_MPSAFE))
mtx_lock(&Giant);
error = oid->oid_handler(oid, arg1, arg2, req);
if (!(oid->oid_kind & CTLFLAG_MPSAFE))
mtx_unlock(&Giant);
KFAIL_POINT_ERROR(_debug_fail_point, sysctl_running, error);
SYSCTL_XLOCK();
oid->oid_running--;
if (oid->oid_running == 0 && (oid->oid_kind & CTLFLAG_DYING) != 0)
wakeup(&oid->oid_running);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct sysctl_args {
int *name;
u_int namelen;
void *old;
size_t *oldlenp;
void *new;
size_t newlen;
};
#endif
int
-__sysctl(struct thread *td, struct sysctl_args *uap)
+sys___sysctl(struct thread *td, struct sysctl_args *uap)
{
int error, i, name[CTL_MAXNAME];
size_t j;
if (uap->namelen > CTL_MAXNAME || uap->namelen < 2)
return (EINVAL);
error = copyin(uap->name, &name, uap->namelen * sizeof(int));
if (error)
return (error);
error = userland_sysctl(td, name, uap->namelen,
uap->old, uap->oldlenp, 0,
uap->new, uap->newlen, &j, 0);
if (error && error != ENOMEM)
return (error);
if (uap->oldlenp) {
i = copyout(&j, uap->oldlenp, sizeof(j));
if (i)
return (i);
}
return (error);
}
/*
* This is used from various compatibility syscalls too. That's why name
* must be in kernel space.
*/
int
userland_sysctl(struct thread *td, int *name, u_int namelen, void *old,
size_t *oldlenp, int inkernel, void *new, size_t newlen, size_t *retval,
int flags)
{
int error = 0, memlocked;
struct sysctl_req req;
bzero(&req, sizeof req);
req.td = td;
req.flags = flags;
if (oldlenp) {
if (inkernel) {
req.oldlen = *oldlenp;
} else {
error = copyin(oldlenp, &req.oldlen, sizeof(*oldlenp));
if (error)
return (error);
}
}
req.validlen = req.oldlen;
if (old) {
if (!useracc(old, req.oldlen, VM_PROT_WRITE))
return (EFAULT);
req.oldptr = old;
}
if (new != NULL) {
if (!useracc(new, newlen, VM_PROT_READ))
return (EFAULT);
req.newlen = newlen;
req.newptr = new;
}
req.oldfunc = sysctl_old_user;
req.newfunc = sysctl_new_user;
req.lock = REQ_UNWIRED;
#ifdef KTRACE
if (KTRPOINT(curthread, KTR_SYSCTL))
ktrsysctl(name, namelen);
#endif
if (req.oldlen > PAGE_SIZE) {
memlocked = 1;
sx_xlock(&sysctlmemlock);
} else
memlocked = 0;
CURVNET_SET(TD_TO_VNET(td));
for (;;) {
req.oldidx = 0;
req.newidx = 0;
SYSCTL_XLOCK();
error = sysctl_root(0, name, namelen, &req);
SYSCTL_XUNLOCK();
if (error != EAGAIN)
break;
kern_yield(PRI_USER);
}
CURVNET_RESTORE();
if (req.lock == REQ_WIRED && req.validlen > 0)
vsunlock(req.oldptr, req.validlen);
if (memlocked)
sx_xunlock(&sysctlmemlock);
if (error && error != ENOMEM)
return (error);
if (retval) {
if (req.oldptr && req.oldidx > req.validlen)
*retval = req.validlen;
else
*retval = req.oldidx;
}
return (error);
}
/*
* Drain into a sysctl struct. The user buffer should be wired if a page
* fault would cause an issue.
*/
static int
sbuf_sysctl_drain(void *arg, const char *data, int len)
{
struct sysctl_req *req = arg;
int error;
error = SYSCTL_OUT(req, data, len);
KASSERT(error >= 0, ("Got unexpected negative value %d", error));
return (error == 0 ? len : -error);
}
struct sbuf *
sbuf_new_for_sysctl(struct sbuf *s, char *buf, int length,
struct sysctl_req *req)
{
s = sbuf_new(s, buf, length, SBUF_FIXEDLEN);
sbuf_set_drain(s, sbuf_sysctl_drain, req);
return (s);
}
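/*
 * Illustrative sketch, not part of this revision: a handler that
 * streams formatted text to the request through the sbuf drain added
 * above.  Output is flushed to SYSCTL_OUT() in 128-byte chunks as the
 * fixed-length sbuf fills.  example_report_sysctl is a hypothetical
 * name.
 */
static int
example_report_sysctl(SYSCTL_HANDLER_ARGS)
{
        struct sbuf sb;
        int error, i;

        sbuf_new_for_sysctl(&sb, NULL, 128, req);
        for (i = 0; i < 4; i++)
                sbuf_printf(&sb, "item %d\n", i);
        error = sbuf_finish(&sb);
        sbuf_delete(&sb);
        return (error);
}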
Index: head/sys/kern/kern_thr.c
===================================================================
--- head/sys/kern/kern_thr.c (revision 225616)
+++ head/sys/kern/kern_thr.c (revision 225617)
@@ -1,555 +1,555 @@
/*-
* Copyright (c) 2003, Jeffrey Roberson <jeff@freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice unmodified, this list of conditions, and the following
* disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_posix.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/posix4.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/ucontext.h>
#include <sys/thr.h>
#include <sys/rtprio.h>
#include <sys/umtx.h>
#include <sys/limits.h>
#include <machine/frame.h>
#include <security/audit/audit.h>
SYSCTL_NODE(_kern, OID_AUTO, threads, CTLFLAG_RW, 0, "thread allocation");
static int max_threads_per_proc = 1500;
SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_per_proc, CTLFLAG_RW,
&max_threads_per_proc, 0, "Limit on threads per proc");
static int max_threads_hits;
SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_hits, CTLFLAG_RD,
&max_threads_hits, 0, "");
#ifdef COMPAT_FREEBSD32
static inline int
suword_lwpid(void *addr, lwpid_t lwpid)
{
int error;
if (SV_CURPROC_FLAG(SV_LP64))
error = suword(addr, lwpid);
else
error = suword32(addr, lwpid);
return (error);
}
#else
#define suword_lwpid suword
#endif
static int create_thread(struct thread *td, mcontext_t *ctx,
void (*start_func)(void *), void *arg,
char *stack_base, size_t stack_size,
char *tls_base,
long *child_tid, long *parent_tid,
int flags, struct rtprio *rtp);
/*
* System call interface.
*/
int
-thr_create(struct thread *td, struct thr_create_args *uap)
+sys_thr_create(struct thread *td, struct thr_create_args *uap)
/* ucontext_t *ctx, long *id, int flags */
{
ucontext_t ctx;
int error;
if ((error = copyin(uap->ctx, &ctx, sizeof(ctx))))
return (error);
error = create_thread(td, &ctx.uc_mcontext, NULL, NULL,
NULL, 0, NULL, uap->id, NULL, uap->flags, NULL);
return (error);
}
int
-thr_new(struct thread *td, struct thr_new_args *uap)
+sys_thr_new(struct thread *td, struct thr_new_args *uap)
/* struct thr_param * */
{
struct thr_param param;
int error;
if (uap->param_size < 0 || uap->param_size > sizeof(param))
return (EINVAL);
bzero(&param, sizeof(param));
if ((error = copyin(uap->param, &param, uap->param_size)))
return (error);
return (kern_thr_new(td, &param));
}
int
kern_thr_new(struct thread *td, struct thr_param *param)
{
struct rtprio rtp, *rtpp;
int error;
rtpp = NULL;
if (param->rtp != 0) {
error = copyin(param->rtp, &rtp, sizeof(struct rtprio));
if (error)
return (error);
rtpp = &rtp;
}
error = create_thread(td, NULL, param->start_func, param->arg,
param->stack_base, param->stack_size, param->tls_base,
param->child_tid, param->parent_tid, param->flags,
rtpp);
return (error);
}
static int
create_thread(struct thread *td, mcontext_t *ctx,
void (*start_func)(void *), void *arg,
char *stack_base, size_t stack_size,
char *tls_base,
long *child_tid, long *parent_tid,
int flags, struct rtprio *rtp)
{
stack_t stack;
struct thread *newtd;
struct proc *p;
int error;
p = td->td_proc;
/* There is a race condition here, but it is cheap. */
if (p->p_numthreads >= max_threads_per_proc) {
++max_threads_hits;
return (EPROCLIM);
}
if (rtp != NULL) {
switch (rtp->type) {
case RTP_PRIO_REALTIME:
case RTP_PRIO_FIFO:
/* Only root can set scheduler policy */
if (priv_check(td, PRIV_SCHED_SETPOLICY) != 0)
return (EPERM);
if (rtp->prio > RTP_PRIO_MAX)
return (EINVAL);
break;
case RTP_PRIO_NORMAL:
rtp->prio = 0;
break;
default:
return (EINVAL);
}
}
#ifdef RACCT
PROC_LOCK(td->td_proc);
error = racct_add(p, RACCT_NTHR, 1);
PROC_UNLOCK(td->td_proc);
if (error != 0)
return (EPROCLIM);
#endif
/* Initialize our td */
newtd = thread_alloc(0);
if (newtd == NULL) {
error = ENOMEM;
goto fail;
}
/*
* Try the copyout as soon as we allocate the td so we don't
* have to tear things down in a failure case below.
* We copy the tid out to two places, one for the child and one
* for the parent, because pthread can create a detached thread:
* if the parent wants to access the child tid safely, it must
* provide its own storage, since the child thread may exit
* quickly and have its memory freed before the parent reads it.
*/
if ((child_tid != NULL &&
suword_lwpid(child_tid, newtd->td_tid)) ||
(parent_tid != NULL &&
suword_lwpid(parent_tid, newtd->td_tid))) {
thread_free(newtd);
error = EFAULT;
goto fail;
}
bzero(&newtd->td_startzero,
__rangeof(struct thread, td_startzero, td_endzero));
bcopy(&td->td_startcopy, &newtd->td_startcopy,
__rangeof(struct thread, td_startcopy, td_endcopy));
newtd->td_proc = td->td_proc;
newtd->td_ucred = crhold(td->td_ucred);
cpu_set_upcall(newtd, td);
if (ctx != NULL) { /* old way to set user context */
error = set_mcontext(newtd, ctx);
if (error != 0) {
thread_free(newtd);
crfree(td->td_ucred);
goto fail;
}
} else {
/* Set up our machine context. */
stack.ss_sp = stack_base;
stack.ss_size = stack_size;
/* Set upcall address to user thread entry function. */
cpu_set_upcall_kse(newtd, start_func, arg, &stack);
/* Set up the user TLS address and TLS pointer register. */
error = cpu_set_user_tls(newtd, tls_base);
if (error != 0) {
thread_free(newtd);
crfree(td->td_ucred);
goto fail;
}
}
PROC_LOCK(td->td_proc);
td->td_proc->p_flag |= P_HADTHREADS;
newtd->td_sigmask = td->td_sigmask;
thread_link(newtd, p);
bcopy(p->p_comm, newtd->td_name, sizeof(newtd->td_name));
thread_lock(td);
/* let the scheduler know about these things. */
sched_fork_thread(td, newtd);
thread_unlock(td);
if (P_SHOULDSTOP(p))
newtd->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK;
PROC_UNLOCK(p);
tidhash_add(newtd);
thread_lock(newtd);
if (rtp != NULL) {
if (!(td->td_pri_class == PRI_TIMESHARE &&
rtp->type == RTP_PRIO_NORMAL)) {
rtp_to_pri(rtp, newtd);
sched_prio(newtd, newtd->td_user_pri);
} /* ignore timesharing class */
}
TD_SET_CAN_RUN(newtd);
sched_add(newtd, SRQ_BORING);
thread_unlock(newtd);
return (0);
fail:
#ifdef RACCT
PROC_LOCK(p);
racct_sub(p, RACCT_NTHR, 1);
PROC_UNLOCK(p);
#endif
return (error);
}
int
-thr_self(struct thread *td, struct thr_self_args *uap)
+sys_thr_self(struct thread *td, struct thr_self_args *uap)
/* long *id */
{
int error;
error = suword_lwpid(uap->id, (unsigned)td->td_tid);
if (error == -1)
return (EFAULT);
return (0);
}
int
-thr_exit(struct thread *td, struct thr_exit_args *uap)
+sys_thr_exit(struct thread *td, struct thr_exit_args *uap)
/* long *state */
{
struct proc *p;
p = td->td_proc;
/* Signal userland that it can free the stack. */
if ((void *)uap->state != NULL) {
suword_lwpid(uap->state, 1);
kern_umtx_wake(td, uap->state, INT_MAX, 0);
}
rw_wlock(&tidhash_lock);
PROC_LOCK(p);
racct_sub(p, RACCT_NTHR, 1);
/*
* Shutting down last thread in the proc. This will actually
* call exit() in the trampoline when it returns.
*/
if (p->p_numthreads != 1) {
LIST_REMOVE(td, td_hash);
rw_wunlock(&tidhash_lock);
tdsigcleanup(td);
PROC_SLOCK(p);
thread_stopped(p);
thread_exit();
/* NOTREACHED */
}
PROC_UNLOCK(p);
rw_wunlock(&tidhash_lock);
return (0);
}
int
-thr_kill(struct thread *td, struct thr_kill_args *uap)
+sys_thr_kill(struct thread *td, struct thr_kill_args *uap)
/* long id, int sig */
{
ksiginfo_t ksi;
struct thread *ttd;
struct proc *p;
int error;
p = td->td_proc;
ksiginfo_init(&ksi);
ksi.ksi_signo = uap->sig;
ksi.ksi_code = SI_LWP;
ksi.ksi_pid = p->p_pid;
ksi.ksi_uid = td->td_ucred->cr_ruid;
if (uap->id == -1) {
if (uap->sig != 0 && !_SIG_VALID(uap->sig)) {
error = EINVAL;
} else {
error = ESRCH;
PROC_LOCK(p);
FOREACH_THREAD_IN_PROC(p, ttd) {
if (ttd != td) {
error = 0;
if (uap->sig == 0)
break;
tdksignal(ttd, uap->sig, &ksi);
}
}
PROC_UNLOCK(p);
}
} else {
error = 0;
ttd = tdfind((lwpid_t)uap->id, p->p_pid);
if (ttd == NULL)
return (ESRCH);
if (uap->sig == 0)
;
else if (!_SIG_VALID(uap->sig))
error = EINVAL;
else
tdksignal(ttd, uap->sig, &ksi);
PROC_UNLOCK(ttd->td_proc);
}
return (error);
}
int
-thr_kill2(struct thread *td, struct thr_kill2_args *uap)
+sys_thr_kill2(struct thread *td, struct thr_kill2_args *uap)
/* pid_t pid, long id, int sig */
{
ksiginfo_t ksi;
struct thread *ttd;
struct proc *p;
int error;
AUDIT_ARG_SIGNUM(uap->sig);
ksiginfo_init(&ksi);
ksi.ksi_signo = uap->sig;
ksi.ksi_code = SI_LWP;
ksi.ksi_pid = td->td_proc->p_pid;
ksi.ksi_uid = td->td_ucred->cr_ruid;
if (uap->id == -1) {
if ((p = pfind(uap->pid)) == NULL)
return (ESRCH);
AUDIT_ARG_PROCESS(p);
error = p_cansignal(td, p, uap->sig);
if (error) {
PROC_UNLOCK(p);
return (error);
}
if (uap->sig != 0 && !_SIG_VALID(uap->sig)) {
error = EINVAL;
} else {
error = ESRCH;
FOREACH_THREAD_IN_PROC(p, ttd) {
if (ttd != td) {
error = 0;
if (uap->sig == 0)
break;
tdksignal(ttd, uap->sig, &ksi);
}
}
}
PROC_UNLOCK(p);
} else {
ttd = tdfind((lwpid_t)uap->id, uap->pid);
if (ttd == NULL)
return (ESRCH);
p = ttd->td_proc;
AUDIT_ARG_PROCESS(p);
error = p_cansignal(td, p, uap->sig);
if (uap->sig == 0)
;
else if (!_SIG_VALID(uap->sig))
error = EINVAL;
else
tdksignal(ttd, uap->sig, &ksi);
PROC_UNLOCK(p);
}
return (error);
}
int
-thr_suspend(struct thread *td, struct thr_suspend_args *uap)
+sys_thr_suspend(struct thread *td, struct thr_suspend_args *uap)
/* const struct timespec *timeout */
{
struct timespec ts, *tsp;
int error;
tsp = NULL;
if (uap->timeout != NULL) {
error = copyin((const void *)uap->timeout, (void *)&ts,
sizeof(struct timespec));
if (error != 0)
return (error);
tsp = &ts;
}
return (kern_thr_suspend(td, tsp));
}
int
kern_thr_suspend(struct thread *td, struct timespec *tsp)
{
struct proc *p = td->td_proc;
struct timeval tv;
int error = 0;
int timo = 0;
if (td->td_pflags & TDP_WAKEUP) {
td->td_pflags &= ~TDP_WAKEUP;
return (0);
}
if (tsp != NULL) {
if (tsp->tv_nsec < 0 || tsp->tv_nsec > 1000000000)
return (EINVAL);
if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
error = EWOULDBLOCK;
else {
TIMESPEC_TO_TIMEVAL(&tv, tsp);
timo = tvtohz(&tv);
}
}
PROC_LOCK(p);
if (error == 0 && (td->td_flags & TDF_THRWAKEUP) == 0)
error = msleep((void *)td, &p->p_mtx,
PCATCH, "lthr", timo);
if (td->td_flags & TDF_THRWAKEUP) {
thread_lock(td);
td->td_flags &= ~TDF_THRWAKEUP;
thread_unlock(td);
PROC_UNLOCK(p);
return (0);
}
PROC_UNLOCK(p);
if (error == EWOULDBLOCK)
error = ETIMEDOUT;
else if (error == ERESTART) {
if (timo != 0)
error = EINTR;
}
return (error);
}
int
-thr_wake(struct thread *td, struct thr_wake_args *uap)
+sys_thr_wake(struct thread *td, struct thr_wake_args *uap)
/* long id */
{
struct proc *p;
struct thread *ttd;
if (uap->id == td->td_tid) {
td->td_pflags |= TDP_WAKEUP;
return (0);
}
p = td->td_proc;
ttd = tdfind((lwpid_t)uap->id, p->p_pid);
if (ttd == NULL)
return (ESRCH);
thread_lock(ttd);
ttd->td_flags |= TDF_THRWAKEUP;
thread_unlock(ttd);
wakeup((void *)ttd);
PROC_UNLOCK(p);
return (0);
}
int
-thr_set_name(struct thread *td, struct thr_set_name_args *uap)
+sys_thr_set_name(struct thread *td, struct thr_set_name_args *uap)
{
struct proc *p;
char name[MAXCOMLEN + 1];
struct thread *ttd;
int error;
error = 0;
name[0] = '\0';
if (uap->name != NULL) {
error = copyinstr(uap->name, name, sizeof(name),
NULL);
if (error)
return (error);
}
p = td->td_proc;
ttd = tdfind((lwpid_t)uap->id, p->p_pid);
if (ttd == NULL)
return (ESRCH);
strcpy(ttd->td_name, name);
PROC_UNLOCK(p);
return (error);
}
Index: head/sys/kern/kern_time.c
===================================================================
--- head/sys/kern/kern_time.c (revision 225616)
+++ head/sys/kern/kern_time.c (revision 225617)
@@ -1,1496 +1,1496 @@
/*-
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_time.c 8.1 (Berkeley) 6/10/93
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/limits.h>
#include <sys/clock.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/eventhandler.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/kernel.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/posix4.h>
#include <sys/time.h>
#include <sys/timers.h>
#include <sys/timetc.h>
#include <sys/vnode.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#define MAX_CLOCKS (CLOCK_MONOTONIC+1)
static struct kclock posix_clocks[MAX_CLOCKS];
static uma_zone_t itimer_zone = NULL;
/*
* Time of day and interval timer support.
*
* These routines provide the kernel entry points to get and set
* the time-of-day and per-process interval timers. Subroutines
* here provide support for adding and subtracting timeval structures
* and decrementing interval timers, optionally reloading the interval
* timers when they expire.
*/
static int settime(struct thread *, struct timeval *);
static void timevalfix(struct timeval *);
static void itimer_start(void);
static int itimer_init(void *, int, int);
static void itimer_fini(void *, int);
static void itimer_enter(struct itimer *);
static void itimer_leave(struct itimer *);
static struct itimer *itimer_find(struct proc *, int);
static void itimers_alloc(struct proc *);
static void itimers_event_hook_exec(void *arg, struct proc *p, struct image_params *imgp);
static void itimers_event_hook_exit(void *arg, struct proc *p);
static int realtimer_create(struct itimer *);
static int realtimer_gettime(struct itimer *, struct itimerspec *);
static int realtimer_settime(struct itimer *, int,
struct itimerspec *, struct itimerspec *);
static int realtimer_delete(struct itimer *);
static void realtimer_clocktime(clockid_t, struct timespec *);
static void realtimer_expire(void *);
static int kern_timer_create(struct thread *, clockid_t,
struct sigevent *, int *, int);
static int kern_timer_delete(struct thread *, int);
int register_posix_clock(int, struct kclock *);
void itimer_fire(struct itimer *it);
int itimespecfix(struct timespec *ts);
#define CLOCK_CALL(clock, call, arglist) \
((*posix_clocks[clock].call) arglist)
SYSINIT(posix_timer, SI_SUB_P1003_1B, SI_ORDER_FIRST+4, itimer_start, NULL);
static int
settime(struct thread *td, struct timeval *tv)
{
struct timeval delta, tv1, tv2;
static struct timeval maxtime, laststep;
struct timespec ts;
int s;
s = splclock();
microtime(&tv1);
delta = *tv;
timevalsub(&delta, &tv1);
/*
* If the system is secure, we do not allow the time to be
* set to a value earlier than 1 second less than the highest
* time we have yet seen. The worst a miscreant can do in
* this circumstance is "freeze" time. He couldn't go
* back to the past.
*
* We similarly do not allow the clock to be stepped more
* than one second, nor more than once per second. This allows
* a miscreant to make the clock march double-time, but no worse.
*/
if (securelevel_gt(td->td_ucred, 1) != 0) {
if (delta.tv_sec < 0 || delta.tv_usec < 0) {
/*
* Update maxtime to latest time we've seen.
*/
if (tv1.tv_sec > maxtime.tv_sec)
maxtime = tv1;
tv2 = *tv;
timevalsub(&tv2, &maxtime);
if (tv2.tv_sec < -1) {
tv->tv_sec = maxtime.tv_sec - 1;
printf("Time adjustment clamped to -1 second\n");
}
} else {
if (tv1.tv_sec == laststep.tv_sec) {
splx(s);
return (EPERM);
}
if (delta.tv_sec > 1) {
tv->tv_sec = tv1.tv_sec + 1;
printf("Time adjustment clamped to +1 second\n");
}
laststep = *tv;
}
}
ts.tv_sec = tv->tv_sec;
ts.tv_nsec = tv->tv_usec * 1000;
mtx_lock(&Giant);
tc_setclock(&ts);
resettodr();
mtx_unlock(&Giant);
return (0);
}
#ifndef _SYS_SYSPROTO_H_
struct clock_gettime_args {
clockid_t clock_id;
struct timespec *tp;
};
#endif
/* ARGSUSED */
int
-clock_gettime(struct thread *td, struct clock_gettime_args *uap)
+sys_clock_gettime(struct thread *td, struct clock_gettime_args *uap)
{
struct timespec ats;
int error;
error = kern_clock_gettime(td, uap->clock_id, &ats);
if (error == 0)
error = copyout(&ats, uap->tp, sizeof(ats));
return (error);
}
int
kern_clock_gettime(struct thread *td, clockid_t clock_id, struct timespec *ats)
{
struct timeval sys, user;
struct proc *p;
uint64_t runtime, curtime, switchtime;
p = td->td_proc;
switch (clock_id) {
case CLOCK_REALTIME: /* Default to precise. */
case CLOCK_REALTIME_PRECISE:
nanotime(ats);
break;
case CLOCK_REALTIME_FAST:
getnanotime(ats);
break;
case CLOCK_VIRTUAL:
PROC_LOCK(p);
PROC_SLOCK(p);
calcru(p, &user, &sys);
PROC_SUNLOCK(p);
PROC_UNLOCK(p);
TIMEVAL_TO_TIMESPEC(&user, ats);
break;
case CLOCK_PROF:
PROC_LOCK(p);
PROC_SLOCK(p);
calcru(p, &user, &sys);
PROC_SUNLOCK(p);
PROC_UNLOCK(p);
timevaladd(&user, &sys);
TIMEVAL_TO_TIMESPEC(&user, ats);
break;
case CLOCK_MONOTONIC: /* Default to precise. */
case CLOCK_MONOTONIC_PRECISE:
case CLOCK_UPTIME:
case CLOCK_UPTIME_PRECISE:
nanouptime(ats);
break;
case CLOCK_UPTIME_FAST:
case CLOCK_MONOTONIC_FAST:
getnanouptime(ats);
break;
case CLOCK_SECOND:
ats->tv_sec = time_second;
ats->tv_nsec = 0;
break;
case CLOCK_THREAD_CPUTIME_ID:
critical_enter();
switchtime = PCPU_GET(switchtime);
curtime = cpu_ticks();
runtime = td->td_runtime;
critical_exit();
runtime = cputick2usec(runtime + curtime - switchtime);
ats->tv_sec = runtime / 1000000;
ats->tv_nsec = runtime % 1000000 * 1000;
break;
default:
return (EINVAL);
}
return (0);
}
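/*
 * Illustrative sketch, not part of this revision: the userland view of
 * the clock ids dispatched above.  CLOCK_MONOTONIC_FAST maps to
 * getnanouptime() and trades precision for a cheaper call than
 * CLOCK_MONOTONIC.  Build as an ordinary userland program.
 */
#include <stdint.h>
#include <stdio.h>
#include <time.h>

int
main(void)
{
        struct timespec precise, fast;

        if (clock_gettime(CLOCK_MONOTONIC, &precise) != 0 ||
            clock_gettime(CLOCK_MONOTONIC_FAST, &fast) != 0)
                return (1);
        printf("precise %jd.%09ld fast %jd.%09ld\n",
            (intmax_t)precise.tv_sec, precise.tv_nsec,
            (intmax_t)fast.tv_sec, fast.tv_nsec);
        return (0);
}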
#ifndef _SYS_SYSPROTO_H_
struct clock_settime_args {
clockid_t clock_id;
const struct timespec *tp;
};
#endif
/* ARGSUSED */
int
-clock_settime(struct thread *td, struct clock_settime_args *uap)
+sys_clock_settime(struct thread *td, struct clock_settime_args *uap)
{
struct timespec ats;
int error;
if ((error = copyin(uap->tp, &ats, sizeof(ats))) != 0)
return (error);
return (kern_clock_settime(td, uap->clock_id, &ats));
}
int
kern_clock_settime(struct thread *td, clockid_t clock_id, struct timespec *ats)
{
struct timeval atv;
int error;
if ((error = priv_check(td, PRIV_CLOCK_SETTIME)) != 0)
return (error);
if (clock_id != CLOCK_REALTIME)
return (EINVAL);
if (ats->tv_nsec < 0 || ats->tv_nsec >= 1000000000)
return (EINVAL);
/* XXX Don't convert nsec->usec and back */
TIMESPEC_TO_TIMEVAL(&atv, ats);
error = settime(td, &atv);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct clock_getres_args {
clockid_t clock_id;
struct timespec *tp;
};
#endif
int
-clock_getres(struct thread *td, struct clock_getres_args *uap)
+sys_clock_getres(struct thread *td, struct clock_getres_args *uap)
{
struct timespec ts;
int error;
if (uap->tp == NULL)
return (0);
error = kern_clock_getres(td, uap->clock_id, &ts);
if (error == 0)
error = copyout(&ts, uap->tp, sizeof(ts));
return (error);
}
int
kern_clock_getres(struct thread *td, clockid_t clock_id, struct timespec *ts)
{
ts->tv_sec = 0;
switch (clock_id) {
case CLOCK_REALTIME:
case CLOCK_REALTIME_FAST:
case CLOCK_REALTIME_PRECISE:
case CLOCK_MONOTONIC:
case CLOCK_MONOTONIC_FAST:
case CLOCK_MONOTONIC_PRECISE:
case CLOCK_UPTIME:
case CLOCK_UPTIME_FAST:
case CLOCK_UPTIME_PRECISE:
/*
* Round up the result of the division cheaply by adding 1.
* Rounding up is especially important if rounding down
* would give 0. Perfect rounding is unimportant.
*/
ts->tv_nsec = 1000000000 / tc_getfrequency() + 1;
break;
case CLOCK_VIRTUAL:
case CLOCK_PROF:
/* Accurately round up here because we can do so cheaply. */
ts->tv_nsec = (1000000000 + hz - 1) / hz;
break;
case CLOCK_SECOND:
ts->tv_sec = 1;
ts->tv_nsec = 0;
break;
case CLOCK_THREAD_CPUTIME_ID:
/* sync with cputick2usec */
ts->tv_nsec = 1000000 / cpu_tickrate();
if (ts->tv_nsec == 0)
ts->tv_nsec = 1000;
break;
default:
return (EINVAL);
}
return (0);
}
static int nanowait;
int
kern_nanosleep(struct thread *td, struct timespec *rqt, struct timespec *rmt)
{
struct timespec ts, ts2, ts3;
struct timeval tv;
int error;
if (rqt->tv_nsec < 0 || rqt->tv_nsec >= 1000000000)
return (EINVAL);
if (rqt->tv_sec < 0 || (rqt->tv_sec == 0 && rqt->tv_nsec == 0))
return (0);
getnanouptime(&ts);
timespecadd(&ts, rqt);
TIMESPEC_TO_TIMEVAL(&tv, rqt);
for (;;) {
error = tsleep(&nanowait, PWAIT | PCATCH, "nanslp",
tvtohz(&tv));
getnanouptime(&ts2);
if (error != EWOULDBLOCK) {
if (error == ERESTART)
error = EINTR;
if (rmt != NULL) {
timespecsub(&ts, &ts2);
if (ts.tv_sec < 0)
timespecclear(&ts);
*rmt = ts;
}
return (error);
}
if (timespeccmp(&ts2, &ts, >=))
return (0);
ts3 = ts;
timespecsub(&ts3, &ts2);
TIMESPEC_TO_TIMEVAL(&tv, &ts3);
}
}
#ifndef _SYS_SYSPROTO_H_
struct nanosleep_args {
struct timespec *rqtp;
struct timespec *rmtp;
};
#endif
/* ARGSUSED */
int
-nanosleep(struct thread *td, struct nanosleep_args *uap)
+sys_nanosleep(struct thread *td, struct nanosleep_args *uap)
{
struct timespec rmt, rqt;
int error;
error = copyin(uap->rqtp, &rqt, sizeof(rqt));
if (error)
return (error);
if (uap->rmtp &&
!useracc((caddr_t)uap->rmtp, sizeof(rmt), VM_PROT_WRITE))
return (EFAULT);
error = kern_nanosleep(td, &rqt, &rmt);
if (error && uap->rmtp) {
int error2;
error2 = copyout(&rmt, uap->rmtp, sizeof(rmt));
if (error2)
error = error2;
}
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct gettimeofday_args {
struct timeval *tp;
struct timezone *tzp;
};
#endif
/* ARGSUSED */
int
-gettimeofday(struct thread *td, struct gettimeofday_args *uap)
+sys_gettimeofday(struct thread *td, struct gettimeofday_args *uap)
{
struct timeval atv;
struct timezone rtz;
int error = 0;
if (uap->tp) {
microtime(&atv);
error = copyout(&atv, uap->tp, sizeof (atv));
}
if (error == 0 && uap->tzp != NULL) {
rtz.tz_minuteswest = tz_minuteswest;
rtz.tz_dsttime = tz_dsttime;
error = copyout(&rtz, uap->tzp, sizeof (rtz));
}
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct settimeofday_args {
struct timeval *tv;
struct timezone *tzp;
};
#endif
/* ARGSUSED */
int
-settimeofday(struct thread *td, struct settimeofday_args *uap)
+sys_settimeofday(struct thread *td, struct settimeofday_args *uap)
{
struct timeval atv, *tvp;
struct timezone atz, *tzp;
int error;
if (uap->tv) {
error = copyin(uap->tv, &atv, sizeof(atv));
if (error)
return (error);
tvp = &atv;
} else
tvp = NULL;
if (uap->tzp) {
error = copyin(uap->tzp, &atz, sizeof(atz));
if (error)
return (error);
tzp = &atz;
} else
tzp = NULL;
return (kern_settimeofday(td, tvp, tzp));
}
int
kern_settimeofday(struct thread *td, struct timeval *tv, struct timezone *tzp)
{
int error;
error = priv_check(td, PRIV_SETTIMEOFDAY);
if (error)
return (error);
/* Verify all parameters before changing time. */
if (tv) {
if (tv->tv_usec < 0 || tv->tv_usec >= 1000000)
return (EINVAL);
error = settime(td, tv);
}
if (tzp && error == 0) {
tz_minuteswest = tzp->tz_minuteswest;
tz_dsttime = tzp->tz_dsttime;
}
return (error);
}
/*
* Get value of an interval timer. The process virtual and profiling virtual
* time timers are kept in the p_stats area, since they can be swapped out.
* These are kept internally in the way they are specified externally: in
* time until they expire.
*
* The real time interval timer is kept in the process table slot for the
* process, and its value (it_value) is kept as an absolute time rather than
* as a delta, so that it is easy to keep periodic real-time signals from
* drifting.
*
* Virtual time timers are processed in the hardclock() routine of
* kern_clock.c. The real time timer is processed by a timeout routine,
* called from the softclock() routine. Since a callout may be delayed in
* real time due to interrupt processing in the system, it is possible for
* the real time timeout routine (realitexpire, given below), to be delayed
* in real time past when it is supposed to occur. It does not suffice,
* therefore, to reload the real timer .it_value from the real time timers
* .it_interval. Rather, we compute the next time in absolute time the timer
* should go off.
*/
#ifndef _SYS_SYSPROTO_H_
struct getitimer_args {
u_int which;
struct itimerval *itv;
};
#endif
int
-getitimer(struct thread *td, struct getitimer_args *uap)
+sys_getitimer(struct thread *td, struct getitimer_args *uap)
{
struct itimerval aitv;
int error;
error = kern_getitimer(td, uap->which, &aitv);
if (error != 0)
return (error);
return (copyout(&aitv, uap->itv, sizeof (struct itimerval)));
}
int
kern_getitimer(struct thread *td, u_int which, struct itimerval *aitv)
{
struct proc *p = td->td_proc;
struct timeval ctv;
if (which > ITIMER_PROF)
return (EINVAL);
if (which == ITIMER_REAL) {
/*
* Convert from absolute to relative time in .it_value
* part of real time timer. If time for real time timer
* has passed return 0, else return difference between
* current time and time for the timer to go off.
*/
PROC_LOCK(p);
*aitv = p->p_realtimer;
PROC_UNLOCK(p);
if (timevalisset(&aitv->it_value)) {
getmicrouptime(&ctv);
if (timevalcmp(&aitv->it_value, &ctv, <))
timevalclear(&aitv->it_value);
else
timevalsub(&aitv->it_value, &ctv);
}
} else {
PROC_SLOCK(p);
*aitv = p->p_stats->p_timer[which];
PROC_SUNLOCK(p);
}
return (0);
}
#ifndef _SYS_SYSPROTO_H_
struct setitimer_args {
u_int which;
struct itimerval *itv, *oitv;
};
#endif
int
-setitimer(struct thread *td, struct setitimer_args *uap)
+sys_setitimer(struct thread *td, struct setitimer_args *uap)
{
struct itimerval aitv, oitv;
int error;
if (uap->itv == NULL) {
uap->itv = uap->oitv;
- return (getitimer(td, (struct getitimer_args *)uap));
+ return (sys_getitimer(td, (struct getitimer_args *)uap));
}
if ((error = copyin(uap->itv, &aitv, sizeof(struct itimerval))))
return (error);
error = kern_setitimer(td, uap->which, &aitv, &oitv);
if (error != 0 || uap->oitv == NULL)
return (error);
return (copyout(&oitv, uap->oitv, sizeof(struct itimerval)));
}
int
kern_setitimer(struct thread *td, u_int which, struct itimerval *aitv,
struct itimerval *oitv)
{
struct proc *p = td->td_proc;
struct timeval ctv;
if (aitv == NULL)
return (kern_getitimer(td, which, oitv));
if (which > ITIMER_PROF)
return (EINVAL);
if (itimerfix(&aitv->it_value))
return (EINVAL);
if (!timevalisset(&aitv->it_value))
timevalclear(&aitv->it_interval);
else if (itimerfix(&aitv->it_interval))
return (EINVAL);
if (which == ITIMER_REAL) {
PROC_LOCK(p);
if (timevalisset(&p->p_realtimer.it_value))
callout_stop(&p->p_itcallout);
getmicrouptime(&ctv);
if (timevalisset(&aitv->it_value)) {
callout_reset(&p->p_itcallout, tvtohz(&aitv->it_value),
realitexpire, p);
timevaladd(&aitv->it_value, &ctv);
}
*oitv = p->p_realtimer;
p->p_realtimer = *aitv;
PROC_UNLOCK(p);
if (timevalisset(&oitv->it_value)) {
if (timevalcmp(&oitv->it_value, &ctv, <))
timevalclear(&oitv->it_value);
else
timevalsub(&oitv->it_value, &ctv);
}
} else {
PROC_SLOCK(p);
*oitv = p->p_stats->p_timer[which];
p->p_stats->p_timer[which] = *aitv;
PROC_SUNLOCK(p);
}
return (0);
}
/*
* Real interval timer expired:
* send process whose timer expired an alarm signal.
* If time is not set up to reload, then just return.
* Else compute next time timer should go off which is > current time.
* This is where delay in processing this timeout causes multiple
* SIGALRM calls to be compressed into one.
* tvtohz() always adds 1 to allow for the time until the next clock
* interrupt being strictly less than 1 clock tick, but we don't want
* that here since we want to appear to be in sync with the clock
* interrupt even when we're delayed.
*/
void
realitexpire(void *arg)
{
struct proc *p;
struct timeval ctv, ntv;
p = (struct proc *)arg;
PROC_LOCK(p);
- psignal(p, SIGALRM);
+ kern_psignal(p, SIGALRM);
if (!timevalisset(&p->p_realtimer.it_interval)) {
timevalclear(&p->p_realtimer.it_value);
if (p->p_flag & P_WEXIT)
wakeup(&p->p_itcallout);
PROC_UNLOCK(p);
return;
}
for (;;) {
timevaladd(&p->p_realtimer.it_value,
&p->p_realtimer.it_interval);
getmicrouptime(&ctv);
if (timevalcmp(&p->p_realtimer.it_value, &ctv, >)) {
ntv = p->p_realtimer.it_value;
timevalsub(&ntv, &ctv);
callout_reset(&p->p_itcallout, tvtohz(&ntv) - 1,
realitexpire, p);
PROC_UNLOCK(p);
return;
}
}
/*NOTREACHED*/
}
/*
* Check that a proposed value to load into the .it_value or
* .it_interval part of an interval timer is acceptable, and
* fix it to have at least minimal value (i.e. if it is less
* than the resolution of the clock, round it up.)
*/
int
itimerfix(struct timeval *tv)
{
if (tv->tv_sec < 0 || tv->tv_usec < 0 || tv->tv_usec >= 1000000)
return (EINVAL);
if (tv->tv_sec == 0 && tv->tv_usec != 0 && tv->tv_usec < tick)
tv->tv_usec = tick;
return (0);
}
/*
* Decrement an interval timer by a specified number
* of microseconds, which must be less than a second,
* i.e. < 1000000. If the timer expires, then reload
* it. In this case, carry over (usec - old value) to
* reduce the value reloaded into the timer so that
* the timer does not drift. This routine assumes
* that it is called in a context where the timers
* on which it is operating cannot change in value.
*/
int
itimerdecr(struct itimerval *itp, int usec)
{
if (itp->it_value.tv_usec < usec) {
if (itp->it_value.tv_sec == 0) {
/* expired, and already in next interval */
usec -= itp->it_value.tv_usec;
goto expire;
}
itp->it_value.tv_usec += 1000000;
itp->it_value.tv_sec--;
}
itp->it_value.tv_usec -= usec;
usec = 0;
if (timevalisset(&itp->it_value))
return (1);
/* expired, exactly at end of interval */
expire:
if (timevalisset(&itp->it_interval)) {
itp->it_value = itp->it_interval;
itp->it_value.tv_usec -= usec;
if (itp->it_value.tv_usec < 0) {
itp->it_value.tv_usec += 1000000;
itp->it_value.tv_sec--;
}
} else
itp->it_value.tv_usec = 0; /* sec is already 0 */
return (0);
}
/*
* Add and subtract routines for timevals.
* N.B.: subtract routine doesn't deal with
* results which are before the beginning;
* it just gets very confused in this case.
* Caveat emptor.
*/
void
timevaladd(struct timeval *t1, const struct timeval *t2)
{
t1->tv_sec += t2->tv_sec;
t1->tv_usec += t2->tv_usec;
timevalfix(t1);
}
void
timevalsub(struct timeval *t1, const struct timeval *t2)
{
t1->tv_sec -= t2->tv_sec;
t1->tv_usec -= t2->tv_usec;
timevalfix(t1);
}
static void
timevalfix(struct timeval *t1)
{
if (t1->tv_usec < 0) {
t1->tv_sec--;
t1->tv_usec += 1000000;
}
if (t1->tv_usec >= 1000000) {
t1->tv_sec++;
t1->tv_usec -= 1000000;
}
}
/*
* ratecheck(): simple time-based rate-limit checking.
*/
int
ratecheck(struct timeval *lasttime, const struct timeval *mininterval)
{
struct timeval tv, delta;
int rv = 0;
getmicrouptime(&tv); /* NB: 10ms precision */
delta = tv;
timevalsub(&delta, lasttime);
/*
* The check for 0,0 is so that the message will be seen at least once,
* even if interval is huge.
*/
if (timevalcmp(&delta, mininterval, >=) ||
(lasttime->tv_sec == 0 && lasttime->tv_usec == 0)) {
*lasttime = tv;
rv = 1;
}
return (rv);
}
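/*
 * Illustrative sketch, not part of this revision: logging a condition
 * at most once per interval with ratecheck().  The names and the
 * 10-second interval are arbitrary.
 */
static struct timeval example_lasttime;
static const struct timeval example_interval = { 10, 0 };

static void
example_complain(void)
{
        if (ratecheck(&example_lasttime, &example_interval))
                printf("example: condition detected (rate limited)\n");
}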
/*
* ppsratecheck(): packets (or events) per second limitation.
*
* Return 0 if the limit is to be enforced (e.g. the caller
* should drop a packet because of the rate limitation).
*
* maxpps of 0 always causes zero to be returned. maxpps of -1
* always causes 1 to be returned; this effectively defeats rate
* limiting.
*
* Note that we maintain the struct timeval for compatibility
* with other BSD systems.  We reuse the storage and just monitor
* clock ticks for minimal overhead.
*/
int
ppsratecheck(struct timeval *lasttime, int *curpps, int maxpps)
{
int now;
/*
* Reset the last time and counter if this is the first call
* or more than a second has passed since the last update of
* lasttime.
*/
now = ticks;
if (lasttime->tv_sec == 0 || (u_int)(now - lasttime->tv_sec) >= hz) {
lasttime->tv_sec = now;
*curpps = 1;
return (maxpps != 0);
} else {
(*curpps)++; /* NB: ignore potential overflow */
return (maxpps < 0 || *curpps < maxpps);
}
}
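/*
 * Illustrative sketch, not part of this revision: enforcing an
 * events-per-second budget with ppsratecheck().  The limit of 100 is
 * arbitrary; a return of 0 means the caller should drop the event.
 */
static struct timeval example_pps_last;
static int example_pps_count;

static int
example_accept_event(void)
{
        return (ppsratecheck(&example_pps_last, &example_pps_count, 100));
}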
static void
itimer_start(void)
{
struct kclock rt_clock = {
.timer_create = realtimer_create,
.timer_delete = realtimer_delete,
.timer_settime = realtimer_settime,
.timer_gettime = realtimer_gettime,
.event_hook = NULL
};
itimer_zone = uma_zcreate("itimer", sizeof(struct itimer),
NULL, NULL, itimer_init, itimer_fini, UMA_ALIGN_PTR, 0);
register_posix_clock(CLOCK_REALTIME, &rt_clock);
register_posix_clock(CLOCK_MONOTONIC, &rt_clock);
p31b_setcfg(CTL_P1003_1B_TIMERS, 200112L);
p31b_setcfg(CTL_P1003_1B_DELAYTIMER_MAX, INT_MAX);
p31b_setcfg(CTL_P1003_1B_TIMER_MAX, TIMER_MAX);
EVENTHANDLER_REGISTER(process_exit, itimers_event_hook_exit,
(void *)ITIMER_EV_EXIT, EVENTHANDLER_PRI_ANY);
EVENTHANDLER_REGISTER(process_exec, itimers_event_hook_exec,
(void *)ITIMER_EV_EXEC, EVENTHANDLER_PRI_ANY);
}
int
register_posix_clock(int clockid, struct kclock *clk)
{
if ((unsigned)clockid >= MAX_CLOCKS) {
printf("%s: invalid clockid\n", __func__);
return (0);
}
posix_clocks[clockid] = *clk;
return (1);
}
static int
itimer_init(void *mem, int size, int flags)
{
struct itimer *it;
it = (struct itimer *)mem;
mtx_init(&it->it_mtx, "itimer lock", NULL, MTX_DEF);
return (0);
}
static void
itimer_fini(void *mem, int size)
{
struct itimer *it;
it = (struct itimer *)mem;
mtx_destroy(&it->it_mtx);
}
static void
itimer_enter(struct itimer *it)
{
mtx_assert(&it->it_mtx, MA_OWNED);
it->it_usecount++;
}
static void
itimer_leave(struct itimer *it)
{
mtx_assert(&it->it_mtx, MA_OWNED);
KASSERT(it->it_usecount > 0, ("invalid it_usecount"));
if (--it->it_usecount == 0 && (it->it_flags & ITF_WANTED) != 0)
wakeup(it);
}
#ifndef _SYS_SYSPROTO_H_
struct ktimer_create_args {
clockid_t clock_id;
struct sigevent * evp;
int * timerid;
};
#endif
int
-ktimer_create(struct thread *td, struct ktimer_create_args *uap)
+sys_ktimer_create(struct thread *td, struct ktimer_create_args *uap)
{
struct sigevent *evp1, ev;
int id;
int error;
if (uap->evp != NULL) {
error = copyin(uap->evp, &ev, sizeof(ev));
if (error != 0)
return (error);
evp1 = &ev;
} else
evp1 = NULL;
error = kern_timer_create(td, uap->clock_id, evp1, &id, -1);
if (error == 0) {
error = copyout(&id, uap->timerid, sizeof(int));
if (error != 0)
kern_timer_delete(td, id);
}
return (error);
}
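/*
 * Illustrative sketch, not part of this revision: timer_create(2) is
 * the POSIX userland interface that ultimately reaches
 * sys_ktimer_create() above.  Link with -lrt.
 */
#include <signal.h>
#include <string.h>
#include <time.h>

static int
example_make_timer(timer_t *tidp)
{
        struct sigevent sev;

        memset(&sev, 0, sizeof(sev));
        sev.sigev_notify = SIGEV_SIGNAL;
        sev.sigev_signo = SIGALRM;
        return (timer_create(CLOCK_REALTIME, &sev, tidp));
}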
static int
kern_timer_create(struct thread *td, clockid_t clock_id,
struct sigevent *evp, int *timerid, int preset_id)
{
struct proc *p = td->td_proc;
struct itimer *it;
int id;
int error;
if (clock_id < 0 || clock_id >= MAX_CLOCKS)
return (EINVAL);
if (posix_clocks[clock_id].timer_create == NULL)
return (EINVAL);
if (evp != NULL) {
if (evp->sigev_notify != SIGEV_NONE &&
evp->sigev_notify != SIGEV_SIGNAL &&
evp->sigev_notify != SIGEV_THREAD_ID)
return (EINVAL);
if ((evp->sigev_notify == SIGEV_SIGNAL ||
evp->sigev_notify == SIGEV_THREAD_ID) &&
!_SIG_VALID(evp->sigev_signo))
return (EINVAL);
}
if (p->p_itimers == NULL)
itimers_alloc(p);
it = uma_zalloc(itimer_zone, M_WAITOK);
it->it_flags = 0;
it->it_usecount = 0;
it->it_active = 0;
timespecclear(&it->it_time.it_value);
timespecclear(&it->it_time.it_interval);
it->it_overrun = 0;
it->it_overrun_last = 0;
it->it_clockid = clock_id;
it->it_timerid = -1;
it->it_proc = p;
ksiginfo_init(&it->it_ksi);
it->it_ksi.ksi_flags |= KSI_INS | KSI_EXT;
error = CLOCK_CALL(clock_id, timer_create, (it));
if (error != 0)
goto out;
PROC_LOCK(p);
if (preset_id != -1) {
KASSERT(preset_id >= 0 && preset_id < 3, ("invalid preset_id"));
id = preset_id;
if (p->p_itimers->its_timers[id] != NULL) {
PROC_UNLOCK(p);
error = 0;
goto out;
}
} else {
/*
* Find a free timer slot, skipping those reserved
* for setitimer().
*/
for (id = 3; id < TIMER_MAX; id++)
if (p->p_itimers->its_timers[id] == NULL)
break;
if (id == TIMER_MAX) {
PROC_UNLOCK(p);
error = EAGAIN;
goto out;
}
}
it->it_timerid = id;
p->p_itimers->its_timers[id] = it;
if (evp != NULL)
it->it_sigev = *evp;
else {
it->it_sigev.sigev_notify = SIGEV_SIGNAL;
switch (clock_id) {
default:
case CLOCK_REALTIME:
it->it_sigev.sigev_signo = SIGALRM;
break;
case CLOCK_VIRTUAL:
it->it_sigev.sigev_signo = SIGVTALRM;
break;
case CLOCK_PROF:
it->it_sigev.sigev_signo = SIGPROF;
break;
}
it->it_sigev.sigev_value.sival_int = id;
}
if (it->it_sigev.sigev_notify == SIGEV_SIGNAL ||
it->it_sigev.sigev_notify == SIGEV_THREAD_ID) {
it->it_ksi.ksi_signo = it->it_sigev.sigev_signo;
it->it_ksi.ksi_code = SI_TIMER;
it->it_ksi.ksi_value = it->it_sigev.sigev_value;
it->it_ksi.ksi_timerid = id;
}
PROC_UNLOCK(p);
*timerid = id;
return (0);
out:
ITIMER_LOCK(it);
CLOCK_CALL(it->it_clockid, timer_delete, (it));
ITIMER_UNLOCK(it);
uma_zfree(itimer_zone, it);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct ktimer_delete_args {
int timerid;
};
#endif
int
-ktimer_delete(struct thread *td, struct ktimer_delete_args *uap)
+sys_ktimer_delete(struct thread *td, struct ktimer_delete_args *uap)
{
return (kern_timer_delete(td, uap->timerid));
}
static struct itimer *
itimer_find(struct proc *p, int timerid)
{
struct itimer *it;
PROC_LOCK_ASSERT(p, MA_OWNED);
if ((p->p_itimers == NULL) ||
(timerid < 0) || (timerid >= TIMER_MAX) ||
(it = p->p_itimers->its_timers[timerid]) == NULL) {
return (NULL);
}
ITIMER_LOCK(it);
if ((it->it_flags & ITF_DELETING) != 0) {
ITIMER_UNLOCK(it);
it = NULL;
}
return (it);
}
static int
kern_timer_delete(struct thread *td, int timerid)
{
struct proc *p = td->td_proc;
struct itimer *it;
PROC_LOCK(p);
it = itimer_find(p, timerid);
if (it == NULL) {
PROC_UNLOCK(p);
return (EINVAL);
}
PROC_UNLOCK(p);
it->it_flags |= ITF_DELETING;
while (it->it_usecount > 0) {
it->it_flags |= ITF_WANTED;
msleep(it, &it->it_mtx, PPAUSE, "itimer", 0);
}
it->it_flags &= ~ITF_WANTED;
CLOCK_CALL(it->it_clockid, timer_delete, (it));
ITIMER_UNLOCK(it);
PROC_LOCK(p);
if (KSI_ONQ(&it->it_ksi))
sigqueue_take(&it->it_ksi);
p->p_itimers->its_timers[timerid] = NULL;
PROC_UNLOCK(p);
uma_zfree(itimer_zone, it);
return (0);
}
#ifndef _SYS_SYSPROTO_H_
struct ktimer_settime_args {
int timerid;
int flags;
const struct itimerspec * value;
struct itimerspec * ovalue;
};
#endif
int
-ktimer_settime(struct thread *td, struct ktimer_settime_args *uap)
+sys_ktimer_settime(struct thread *td, struct ktimer_settime_args *uap)
{
struct proc *p = td->td_proc;
struct itimer *it;
struct itimerspec val, oval, *ovalp;
int error;
error = copyin(uap->value, &val, sizeof(val));
if (error != 0)
return (error);
if (uap->ovalue != NULL)
ovalp = &oval;
else
ovalp = NULL;
PROC_LOCK(p);
if (uap->timerid < 3 ||
(it = itimer_find(p, uap->timerid)) == NULL) {
PROC_UNLOCK(p);
error = EINVAL;
} else {
PROC_UNLOCK(p);
itimer_enter(it);
error = CLOCK_CALL(it->it_clockid, timer_settime,
(it, uap->flags, &val, ovalp));
itimer_leave(it);
ITIMER_UNLOCK(it);
}
if (error == 0 && uap->ovalue != NULL)
error = copyout(ovalp, uap->ovalue, sizeof(*ovalp));
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct ktimer_gettime_args {
int timerid;
struct itimerspec * value;
};
#endif
int
-ktimer_gettime(struct thread *td, struct ktimer_gettime_args *uap)
+sys_ktimer_gettime(struct thread *td, struct ktimer_gettime_args *uap)
{
struct proc *p = td->td_proc;
struct itimer *it;
struct itimerspec val;
int error;
PROC_LOCK(p);
if (uap->timerid < 3 ||
(it = itimer_find(p, uap->timerid)) == NULL) {
PROC_UNLOCK(p);
error = EINVAL;
} else {
PROC_UNLOCK(p);
itimer_enter(it);
error = CLOCK_CALL(it->it_clockid, timer_gettime,
(it, &val));
itimer_leave(it);
ITIMER_UNLOCK(it);
}
if (error == 0)
error = copyout(&val, uap->value, sizeof(val));
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct timer_getoverrun_args {
int timerid;
};
#endif
int
-ktimer_getoverrun(struct thread *td, struct ktimer_getoverrun_args *uap)
+sys_ktimer_getoverrun(struct thread *td, struct ktimer_getoverrun_args *uap)
{
struct proc *p = td->td_proc;
struct itimer *it;
int error;
PROC_LOCK(p);
if (uap->timerid < 3 ||
(it = itimer_find(p, uap->timerid)) == NULL) {
PROC_UNLOCK(p);
error = EINVAL;
} else {
td->td_retval[0] = it->it_overrun_last;
ITIMER_UNLOCK(it);
PROC_UNLOCK(p);
error = 0;
}
return (error);
}
static int
realtimer_create(struct itimer *it)
{
callout_init_mtx(&it->it_callout, &it->it_mtx, 0);
return (0);
}
static int
realtimer_delete(struct itimer *it)
{
mtx_assert(&it->it_mtx, MA_OWNED);
/*
* Clear the timer's value and interval to tell realtimer_expire()
* not to rearm the timer.
*/
timespecclear(&it->it_time.it_value);
timespecclear(&it->it_time.it_interval);
ITIMER_UNLOCK(it);
callout_drain(&it->it_callout);
ITIMER_LOCK(it);
return (0);
}
static int
realtimer_gettime(struct itimer *it, struct itimerspec *ovalue)
{
struct timespec cts;
mtx_assert(&it->it_mtx, MA_OWNED);
realtimer_clocktime(it->it_clockid, &cts);
*ovalue = it->it_time;
if (ovalue->it_value.tv_sec != 0 || ovalue->it_value.tv_nsec != 0) {
timespecsub(&ovalue->it_value, &cts);
if (ovalue->it_value.tv_sec < 0 ||
(ovalue->it_value.tv_sec == 0 &&
ovalue->it_value.tv_nsec == 0)) {
ovalue->it_value.tv_sec = 0;
ovalue->it_value.tv_nsec = 1;
}
}
return (0);
}
static int
realtimer_settime(struct itimer *it, int flags,
struct itimerspec *value, struct itimerspec *ovalue)
{
struct timespec cts, ts;
struct timeval tv;
struct itimerspec val;
mtx_assert(&it->it_mtx, MA_OWNED);
val = *value;
if (itimespecfix(&val.it_value))
return (EINVAL);
if (timespecisset(&val.it_value)) {
if (itimespecfix(&val.it_interval))
return (EINVAL);
} else {
timespecclear(&val.it_interval);
}
if (ovalue != NULL)
realtimer_gettime(it, ovalue);
it->it_time = val;
if (timespecisset(&val.it_value)) {
realtimer_clocktime(it->it_clockid, &cts);
ts = val.it_value;
if ((flags & TIMER_ABSTIME) == 0) {
/* Convert to absolute time. */
timespecadd(&it->it_time.it_value, &cts);
} else {
timespecsub(&ts, &cts);
/*
* We don't care if ts is negative; tvtohz will
* fix it.
*/
}
TIMESPEC_TO_TIMEVAL(&tv, &ts);
callout_reset(&it->it_callout, tvtohz(&tv),
realtimer_expire, it);
} else {
callout_stop(&it->it_callout);
}
return (0);
}
static void
realtimer_clocktime(clockid_t id, struct timespec *ts)
{
if (id == CLOCK_REALTIME)
getnanotime(ts);
else /* CLOCK_MONOTONIC */
getnanouptime(ts);
}
int
itimer_accept(struct proc *p, int timerid, ksiginfo_t *ksi)
{
struct itimer *it;
PROC_LOCK_ASSERT(p, MA_OWNED);
it = itimer_find(p, timerid);
if (it != NULL) {
ksi->ksi_overrun = it->it_overrun;
it->it_overrun_last = it->it_overrun;
it->it_overrun = 0;
ITIMER_UNLOCK(it);
return (0);
}
return (EINVAL);
}
int
itimespecfix(struct timespec *ts)
{
if (ts->tv_sec < 0 || ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)
return (EINVAL);
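/*
 * Round a nonzero time shorter than one clock tick up to a single
 * tick; the callout system cannot resolve anything finer.
 */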
if (ts->tv_sec == 0 && ts->tv_nsec != 0 && ts->tv_nsec < tick * 1000)
ts->tv_nsec = tick * 1000;
return (0);
}
/* Timeout callback for realtime timer */
static void
realtimer_expire(void *arg)
{
struct timespec cts, ts;
struct timeval tv;
struct itimer *it;
it = (struct itimer *)arg;
realtimer_clocktime(it->it_clockid, &cts);
/* Only fire if time is reached. */
if (timespeccmp(&cts, &it->it_time.it_value, >=)) {
if (timespecisset(&it->it_time.it_interval)) {
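/*
 * Advance the expiration time to the next interval boundary; any
 * additional intervals that have already elapsed are counted as
 * overruns.
 */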
timespecadd(&it->it_time.it_value,
&it->it_time.it_interval);
while (timespeccmp(&cts, &it->it_time.it_value, >=)) {
if (it->it_overrun < INT_MAX)
it->it_overrun++;
else
it->it_ksi.ksi_errno = ERANGE;
timespecadd(&it->it_time.it_value,
&it->it_time.it_interval);
}
} else {
/* single-shot timer? */
timespecclear(&it->it_time.it_value);
}
if (timespecisset(&it->it_time.it_value)) {
ts = it->it_time.it_value;
timespecsub(&ts, &cts);
TIMESPEC_TO_TIMEVAL(&tv, &ts);
callout_reset(&it->it_callout, tvtohz(&tv),
realtimer_expire, it);
}
itimer_enter(it);
ITIMER_UNLOCK(it);
itimer_fire(it);
ITIMER_LOCK(it);
itimer_leave(it);
} else if (timespecisset(&it->it_time.it_value)) {
ts = it->it_time.it_value;
timespecsub(&ts, &cts);
TIMESPEC_TO_TIMEVAL(&tv, &ts);
callout_reset(&it->it_callout, tvtohz(&tv), realtimer_expire,
it);
}
}
void
itimer_fire(struct itimer *it)
{
struct proc *p = it->it_proc;
struct thread *td;
if (it->it_sigev.sigev_notify == SIGEV_SIGNAL ||
it->it_sigev.sigev_notify == SIGEV_THREAD_ID) {
if (sigev_findtd(p, &it->it_sigev, &td) != 0) {
ITIMER_LOCK(it);
timespecclear(&it->it_time.it_value);
timespecclear(&it->it_time.it_interval);
callout_stop(&it->it_callout);
ITIMER_UNLOCK(it);
return;
}
if (!KSI_ONQ(&it->it_ksi)) {
it->it_ksi.ksi_errno = 0;
ksiginfo_set_sigev(&it->it_ksi, &it->it_sigev);
tdsendsignal(p, td, it->it_ksi.ksi_signo, &it->it_ksi);
} else {
if (it->it_overrun < INT_MAX)
it->it_overrun++;
else
it->it_ksi.ksi_errno = ERANGE;
}
PROC_UNLOCK(p);
}
}
static void
itimers_alloc(struct proc *p)
{
struct itimers *its;
int i;
its = malloc(sizeof (struct itimers), M_SUBPROC, M_WAITOK | M_ZERO);
LIST_INIT(&its->its_virtual);
LIST_INIT(&its->its_prof);
TAILQ_INIT(&its->its_worklist);
for (i = 0; i < TIMER_MAX; i++)
its->its_timers[i] = NULL;
PROC_LOCK(p);
if (p->p_itimers == NULL) {
p->p_itimers = its;
PROC_UNLOCK(p);
}
else {
PROC_UNLOCK(p);
free(its, M_SUBPROC);
}
}
static void
itimers_event_hook_exec(void *arg, struct proc *p, struct image_params *imgp __unused)
{
itimers_event_hook_exit(arg, p);
}
/* Clean up timers when some process events are being triggered. */
static void
itimers_event_hook_exit(void *arg, struct proc *p)
{
struct itimers *its;
struct itimer *it;
int event = (int)(intptr_t)arg;
int i;
if (p->p_itimers != NULL) {
its = p->p_itimers;
for (i = 0; i < MAX_CLOCKS; ++i) {
if (posix_clocks[i].event_hook != NULL)
CLOCK_CALL(i, event_hook, (p, i, event));
}
/*
* According to SUSv3, XSI interval timers should be inherited
* by the new image.
*/
if (event == ITIMER_EV_EXEC)
i = 3;
else if (event == ITIMER_EV_EXIT)
i = 0;
else
panic("unhandled event");
for (; i < TIMER_MAX; ++i) {
if ((it = its->its_timers[i]) != NULL)
kern_timer_delete(curthread, i);
}
if (its->its_timers[0] == NULL &&
its->its_timers[1] == NULL &&
its->its_timers[2] == NULL) {
free(its, M_SUBPROC);
p->p_itimers = NULL;
}
}
}
Index: head/sys/kern/kern_umtx.c
===================================================================
--- head/sys/kern/kern_umtx.c (revision 225616)
+++ head/sys/kern/kern_umtx.c (revision 225617)
@@ -1,3612 +1,3612 @@
/*-
* Copyright (c) 2004, David Xu <davidxu@freebsd.org>
* Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice unmodified, this list of conditions, and the following
* disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/syscallsubr.h>
#include <sys/eventhandler.h>
#include <sys/umtx.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <machine/cpu.h>
#ifdef COMPAT_FREEBSD32
#include <compat/freebsd32/freebsd32_proto.h>
#endif
#define _UMUTEX_TRY 1
#define _UMUTEX_WAIT 2
/* Priority inheritance mutex info. */
struct umtx_pi {
/* Owner thread */
struct thread *pi_owner;
/* Reference count */
int pi_refcount;
/* List entry to link umtx holding by thread */
TAILQ_ENTRY(umtx_pi) pi_link;
/* List entry in hash */
TAILQ_ENTRY(umtx_pi) pi_hashlink;
/* List for waiters */
TAILQ_HEAD(,umtx_q) pi_blocked;
/* Identify a userland lock object */
struct umtx_key pi_key;
};
/* A userland synchronous object user. */
struct umtx_q {
/* Linked list for the hash. */
TAILQ_ENTRY(umtx_q) uq_link;
/* Umtx key. */
struct umtx_key uq_key;
/* Umtx flags. */
int uq_flags;
#define UQF_UMTXQ 0x0001
/* The waiting thread. */
struct thread *uq_thread;
/*
* The PI mutex this thread is blocked on.  Reads may hold either
* the chain lock or umtx_lock; writes must hold both the chain
* lock and umtx_lock.
*/
struct umtx_pi *uq_pi_blocked;
/* On blocked list */
TAILQ_ENTRY(umtx_q) uq_lockq;
/* Contested PI mutexes owned by this thread */
TAILQ_HEAD(,umtx_pi) uq_pi_contested;
/* Inherited priority from PP mutex */
u_char uq_inherited_pri;
/* Spare queue ready to be reused */
struct umtxq_queue *uq_spare_queue;
/* The queue we are on */
struct umtxq_queue *uq_cur_queue;
};
TAILQ_HEAD(umtxq_head, umtx_q);
/* Per-key wait-queue */
struct umtxq_queue {
struct umtxq_head head;
struct umtx_key key;
LIST_ENTRY(umtxq_queue) link;
int length;
};
LIST_HEAD(umtxq_list, umtxq_queue);
/* Userland lock object's wait-queue chain */
struct umtxq_chain {
/* Lock for this chain. */
struct mtx uc_lock;
/* List of sleep queues. */
struct umtxq_list uc_queue[2];
#define UMTX_SHARED_QUEUE 0
#define UMTX_EXCLUSIVE_QUEUE 1
LIST_HEAD(, umtxq_queue) uc_spare_queue;
/* Busy flag */
char uc_busy;
/* Chain lock waiters */
int uc_waiters;
/* All PI in the list */
TAILQ_HEAD(,umtx_pi) uc_pi_list;
};
#define UMTXQ_LOCKED_ASSERT(uc) mtx_assert(&(uc)->uc_lock, MA_OWNED)
#define UMTXQ_BUSY_ASSERT(uc) KASSERT((uc)->uc_busy != 0, ("umtx chain is not busy"))
/*
* Don't propagate time-sharing priority; there is a security reason:
* a user can simply create a PI mutex, let thread A lock it and let
* another thread B block on it.  Because B is sleeping, its priority
* is boosted, which in turn boosts A's priority through propagation,
* and A's priority would never be lowered even if it is using 100%
* CPU, which is unfair to other processes.
*/
#define UPRI(td) (((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
(td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
PRI_MAX_TIMESHARE : (td)->td_user_pri)
#define GOLDEN_RATIO_PRIME 2654404609U
#define UMTX_CHAINS 512
#define UMTX_SHIFTS (__WORD_BIT - 9)
#define GET_SHARE(flags) \
(((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
#define BUSY_SPINS 200
static uma_zone_t umtx_pi_zone;
static struct umtxq_chain umtxq_chains[2][UMTX_CHAINS];
static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
static int umtx_pi_allocated;
SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
&umtx_pi_allocated, 0, "Allocated umtx_pi");
static void umtxq_sysinit(void *);
static void umtxq_hash(struct umtx_key *key);
static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
static void umtxq_lock(struct umtx_key *key);
static void umtxq_unlock(struct umtx_key *key);
static void umtxq_busy(struct umtx_key *key);
static void umtxq_unbusy(struct umtx_key *key);
static void umtxq_insert_queue(struct umtx_q *uq, int q);
static void umtxq_remove_queue(struct umtx_q *uq, int q);
static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo);
static int umtxq_count(struct umtx_key *key);
static struct umtx_pi *umtx_pi_alloc(int);
static void umtx_pi_free(struct umtx_pi *pi);
static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
static void umtx_thread_cleanup(struct thread *td);
static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
struct image_params *imgp __unused);
SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
#define umtxq_signal(key, nwake) umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
#define umtxq_insert(uq) umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
#define umtxq_remove(uq) umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)
static struct mtx umtx_lock;
static void
umtxq_sysinit(void *arg __unused)
{
int i, j;
umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
for (i = 0; i < 2; ++i) {
for (j = 0; j < UMTX_CHAINS; ++j) {
mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
MTX_DEF | MTX_DUPOK);
LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
umtxq_chains[i][j].uc_busy = 0;
umtxq_chains[i][j].uc_waiters = 0;
}
}
mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
EVENTHANDLER_PRI_ANY);
}
struct umtx_q *
umtxq_alloc(void)
{
struct umtx_q *uq;
uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO);
TAILQ_INIT(&uq->uq_spare_queue->head);
TAILQ_INIT(&uq->uq_pi_contested);
uq->uq_inherited_pri = PRI_MAX;
return (uq);
}
void
umtxq_free(struct umtx_q *uq)
{
MPASS(uq->uq_spare_queue != NULL);
free(uq->uq_spare_queue, M_UMTX);
free(uq, M_UMTX);
}
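/*
 * Hash a umtx key to a chain index using multiplicative (Fibonacci)
 * hashing: scale the key's address by GOLDEN_RATIO_PRIME and let the
 * high-order bits select one of the UMTX_CHAINS buckets.
 */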
static inline void
umtxq_hash(struct umtx_key *key)
{
unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
}
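/*
 * Keys of type TYPE_SEM and below are kept on a separate set of
 * chains from the remaining lock types, so the two groups never
 * contend for the same sleep-queue chain even when they hash to the
 * same bucket.
 */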
static inline struct umtxq_chain *
umtxq_getchain(struct umtx_key *key)
{
if (key->type <= TYPE_SEM)
return (&umtxq_chains[1][key->hash]);
return (&umtxq_chains[0][key->hash]);
}
/*
* Lock a chain.
*/
static inline void
umtxq_lock(struct umtx_key *key)
{
struct umtxq_chain *uc;
uc = umtxq_getchain(key);
mtx_lock(&uc->uc_lock);
}
/*
* Unlock a chain.
*/
static inline void
umtxq_unlock(struct umtx_key *key)
{
struct umtxq_chain *uc;
uc = umtxq_getchain(key);
mtx_unlock(&uc->uc_lock);
}
/*
* Set the chain to the busy state when the following operation
* may block (a kernel mutex cannot be used).
*/
static inline void
umtxq_busy(struct umtx_key *key)
{
struct umtxq_chain *uc;
uc = umtxq_getchain(key);
mtx_assert(&uc->uc_lock, MA_OWNED);
if (uc->uc_busy) {
#ifdef SMP
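/*
 * On MP systems, first spin a bounded number of times with the
 * chain lock dropped, in the hope that the current holder unbusies
 * the chain quickly, before falling back to sleeping on it.
 */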
if (smp_cpus > 1) {
int count = BUSY_SPINS;
if (count > 0) {
umtxq_unlock(key);
while (uc->uc_busy && --count > 0)
cpu_spinwait();
umtxq_lock(key);
}
}
#endif
while (uc->uc_busy) {
uc->uc_waiters++;
msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
uc->uc_waiters--;
}
}
uc->uc_busy = 1;
}
/*
* Unbusy a chain.
*/
static inline void
umtxq_unbusy(struct umtx_key *key)
{
struct umtxq_chain *uc;
uc = umtxq_getchain(key);
mtx_assert(&uc->uc_lock, MA_OWNED);
KASSERT(uc->uc_busy != 0, ("not busy"));
uc->uc_busy = 0;
if (uc->uc_waiters)
wakeup_one(uc);
}
static struct umtxq_queue *
umtxq_queue_lookup(struct umtx_key *key, int q)
{
struct umtxq_queue *uh;
struct umtxq_chain *uc;
uc = umtxq_getchain(key);
UMTXQ_LOCKED_ASSERT(uc);
LIST_FOREACH(uh, &uc->uc_queue[q], link) {
if (umtx_key_match(&uh->key, key))
return (uh);
}
return (NULL);
}
static inline void
umtxq_insert_queue(struct umtx_q *uq, int q)
{
struct umtxq_queue *uh;
struct umtxq_chain *uc;
uc = umtxq_getchain(&uq->uq_key);
UMTXQ_LOCKED_ASSERT(uc);
KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
uh = umtxq_queue_lookup(&uq->uq_key, q);
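/*
 * If a per-key queue already exists, park our spare queue on the
 * chain's spare list; otherwise our spare becomes the per-key queue.
 */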
if (uh != NULL) {
LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
} else {
uh = uq->uq_spare_queue;
uh->key = uq->uq_key;
LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
}
uq->uq_spare_queue = NULL;
TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
uh->length++;
uq->uq_flags |= UQF_UMTXQ;
uq->uq_cur_queue = uh;
return;
}
static inline void
umtxq_remove_queue(struct umtx_q *uq, int q)
{
struct umtxq_chain *uc;
struct umtxq_queue *uh;
uc = umtxq_getchain(&uq->uq_key);
UMTXQ_LOCKED_ASSERT(uc);
if (uq->uq_flags & UQF_UMTXQ) {
uh = uq->uq_cur_queue;
TAILQ_REMOVE(&uh->head, uq, uq_link);
uh->length--;
uq->uq_flags &= ~UQF_UMTXQ;
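/*
 * Recycle a queue structure: the departing thread takes back either
 * the now-empty per-key queue or one parked on the chain's spare
 * list, so every umtx_q always leaves with a spare queue.
 */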
if (TAILQ_EMPTY(&uh->head)) {
KASSERT(uh->length == 0,
("inconsistent umtxq_queue length"));
LIST_REMOVE(uh, link);
} else {
uh = LIST_FIRST(&uc->uc_spare_queue);
KASSERT(uh != NULL, ("uc_spare_queue is empty"));
LIST_REMOVE(uh, link);
}
uq->uq_spare_queue = uh;
uq->uq_cur_queue = NULL;
}
}
/*
* Count the waiters on a key's shared queue.
*/
static int
umtxq_count(struct umtx_key *key)
{
struct umtxq_chain *uc;
struct umtxq_queue *uh;
uc = umtxq_getchain(key);
UMTXQ_LOCKED_ASSERT(uc);
uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
if (uh != NULL)
return (uh->length);
return (0);
}
/*
* Count the PI waiters on a key's shared queue and return the
* first waiter through *first.
*/
static int
umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
{
struct umtxq_chain *uc;
struct umtxq_queue *uh;
*first = NULL;
uc = umtxq_getchain(key);
UMTXQ_LOCKED_ASSERT(uc);
uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
if (uh != NULL) {
*first = TAILQ_FIRST(&uh->head);
return (uh->length);
}
return (0);
}
/*
* Wake up threads waiting on a userland object.
*/
static int
umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
{
struct umtxq_chain *uc;
struct umtxq_queue *uh;
struct umtx_q *uq;
int ret;
ret = 0;
uc = umtxq_getchain(key);
UMTXQ_LOCKED_ASSERT(uc);
uh = umtxq_queue_lookup(key, q);
if (uh != NULL) {
while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
umtxq_remove_queue(uq, q);
wakeup(uq);
if (++ret >= n_wake)
return (ret);
}
}
return (ret);
}
/*
* Wake up specified thread.
*/
static inline void
umtxq_signal_thread(struct umtx_q *uq)
{
struct umtxq_chain *uc;
uc = umtxq_getchain(&uq->uq_key);
UMTXQ_LOCKED_ASSERT(uc);
umtxq_remove(uq);
wakeup(uq);
}
/*
* Put the thread into a sleep state; before sleeping, check whether
* the thread was already removed from the umtx queue.
*/
static inline int
umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo)
{
struct umtxq_chain *uc;
int error;
uc = umtxq_getchain(&uq->uq_key);
UMTXQ_LOCKED_ASSERT(uc);
if (!(uq->uq_flags & UQF_UMTXQ))
return (0);
error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
if (error == EWOULDBLOCK)
error = ETIMEDOUT;
return (error);
}
/*
* Convert userspace address into unique logical address.
*/
int
umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
{
struct thread *td = curthread;
vm_map_t map;
vm_map_entry_t entry;
vm_pindex_t pindex;
vm_prot_t prot;
boolean_t wired;
key->type = type;
if (share == THREAD_SHARE) {
key->shared = 0;
key->info.private.vs = td->td_proc->p_vmspace;
key->info.private.addr = (uintptr_t)addr;
} else {
MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
map = &td->td_proc->p_vmspace->vm_map;
if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
&entry, &key->info.shared.object, &pindex, &prot,
&wired) != KERN_SUCCESS) {
return EFAULT;
}
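/*
 * Process-shared objects, and auto-shared objects backed by a
 * mapping with shared inheritance, are keyed by VM object and
 * offset; anything else falls back to a private key of vmspace
 * and address.
 */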
if ((share == PROCESS_SHARE) ||
(share == AUTO_SHARE &&
VM_INHERIT_SHARE == entry->inheritance)) {
key->shared = 1;
key->info.shared.offset = entry->offset + entry->start -
(vm_offset_t)addr;
vm_object_reference(key->info.shared.object);
} else {
key->shared = 0;
key->info.private.vs = td->td_proc->p_vmspace;
key->info.private.addr = (uintptr_t)addr;
}
vm_map_lookup_done(map, entry);
}
umtxq_hash(key);
return (0);
}
/*
* Release key.
*/
void
umtx_key_release(struct umtx_key *key)
{
if (key->shared)
vm_object_deallocate(key->info.shared.object);
}
/*
* Lock a umtx object.
*/
static int
_do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id, int timo)
{
struct umtx_q *uq;
u_long owner;
u_long old;
int error = 0;
uq = td->td_umtxq;
/*
* Care must be exercised when dealing with the umtx structure. It
* can fault on any access.
*/
for (;;) {
/*
* Try the uncontested case. This should be done in userland.
*/
owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);
/* The acquire succeeded. */
if (owner == UMTX_UNOWNED)
return (0);
/* The address was invalid. */
if (owner == -1)
return (EFAULT);
/* If no one owns it but it is contested try to acquire it. */
if (owner == UMTX_CONTESTED) {
owner = casuword(&umtx->u_owner,
UMTX_CONTESTED, id | UMTX_CONTESTED);
if (owner == UMTX_CONTESTED)
return (0);
/* The address was invalid. */
if (owner == -1)
return (EFAULT);
/* If this failed the lock has changed, restart. */
continue;
}
/*
* If we caught a signal, we have retried and now
* exit immediately.
*/
if (error != 0)
return (error);
if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
AUTO_SHARE, &uq->uq_key)) != 0)
return (error);
umtxq_lock(&uq->uq_key);
umtxq_busy(&uq->uq_key);
umtxq_insert(uq);
umtxq_unbusy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
/*
* Set the contested bit so that a release in user space
* knows to use the system call for unlock. If this fails
* either someone else has acquired the lock or it has been
* released.
*/
old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);
/* The address was invalid. */
if (old == -1) {
umtxq_lock(&uq->uq_key);
umtxq_remove(uq);
umtxq_unlock(&uq->uq_key);
umtx_key_release(&uq->uq_key);
return (EFAULT);
}
/*
* We set the contested bit, sleep. Otherwise the lock changed
* and we need to retry or we lost a race to the thread
* unlocking the umtx.
*/
umtxq_lock(&uq->uq_key);
if (old == owner)
error = umtxq_sleep(uq, "umtx", timo);
umtxq_remove(uq);
umtxq_unlock(&uq->uq_key);
umtx_key_release(&uq->uq_key);
}
return (0);
}
/*
* Lock a umtx object.
*/
static int
do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
struct timespec *timeout)
{
struct timespec ts, ts2, ts3;
struct timeval tv;
int error;
if (timeout == NULL) {
error = _do_lock_umtx(td, umtx, id, 0);
/* Mutex locking is restarted if it is interrupted. */
if (error == EINTR)
error = ERESTART;
} else {
getnanouptime(&ts);
timespecadd(&ts, timeout);
TIMESPEC_TO_TIMEVAL(&tv, timeout);
for (;;) {
error = _do_lock_umtx(td, umtx, id, tvtohz(&tv));
if (error != ETIMEDOUT)
break;
getnanouptime(&ts2);
if (timespeccmp(&ts2, &ts, >=)) {
error = ETIMEDOUT;
break;
}
ts3 = ts;
timespecsub(&ts3, &ts2);
TIMESPEC_TO_TIMEVAL(&tv, &ts3);
}
/* Timed-locking is not restarted. */
if (error == ERESTART)
error = EINTR;
}
return (error);
}
/*
* Unlock a umtx object.
*/
static int
do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
{
struct umtx_key key;
u_long owner;
u_long old;
int error;
int count;
/*
* Make sure we own this mtx.
*/
owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
if (owner == -1)
return (EFAULT);
if ((owner & ~UMTX_CONTESTED) != id)
return (EPERM);
/* This should be done in userland */
if ((owner & UMTX_CONTESTED) == 0) {
old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
if (old == -1)
return (EFAULT);
if (old == owner)
return (0);
owner = old;
}
/* We should only ever be in here for contested locks */
if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
&key)) != 0)
return (error);
umtxq_lock(&key);
umtxq_busy(&key);
count = umtxq_count(&key);
umtxq_unlock(&key);
/*
* When unlocking the umtx, it must be marked as unowned if
* there is no more than one thread waiting for it.
* Otherwise, it must be marked as contested.
*/
old = casuword(&umtx->u_owner, owner,
count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
umtxq_lock(&key);
umtxq_signal(&key,1);
umtxq_unbusy(&key);
umtxq_unlock(&key);
umtx_key_release(&key);
if (old == -1)
return (EFAULT);
if (old != owner)
return (EINVAL);
return (0);
}
#ifdef COMPAT_FREEBSD32
/*
* Lock a umtx object.
*/
static int
_do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id, int timo)
{
struct umtx_q *uq;
uint32_t owner;
uint32_t old;
int error = 0;
uq = td->td_umtxq;
/*
* Care must be exercised when dealing with the umtx structure. It
* can fault on any access.
*/
for (;;) {
/*
* Try the uncontested case. This should be done in userland.
*/
owner = casuword32(m, UMUTEX_UNOWNED, id);
/* The acquire succeeded. */
if (owner == UMUTEX_UNOWNED)
return (0);
/* The address was invalid. */
if (owner == -1)
return (EFAULT);
/* If no one owns it but it is contested try to acquire it. */
if (owner == UMUTEX_CONTESTED) {
owner = casuword32(m,
UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
if (owner == UMUTEX_CONTESTED)
return (0);
/* The address was invalid. */
if (owner == -1)
return (EFAULT);
/* If this failed the lock has changed, restart. */
continue;
}
/*
* If we caught a signal, we have retried and now
* exit immediately.
*/
if (error != 0)
return (error);
if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
AUTO_SHARE, &uq->uq_key)) != 0)
return (error);
umtxq_lock(&uq->uq_key);
umtxq_busy(&uq->uq_key);
umtxq_insert(uq);
umtxq_unbusy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
/*
* Set the contested bit so that a release in user space
* knows to use the system call for unlock. If this fails
* either someone else has acquired the lock or it has been
* released.
*/
old = casuword32(m, owner, owner | UMUTEX_CONTESTED);
/* The address was invalid. */
if (old == -1) {
umtxq_lock(&uq->uq_key);
umtxq_remove(uq);
umtxq_unlock(&uq->uq_key);
umtx_key_release(&uq->uq_key);
return (EFAULT);
}
/*
* We set the contested bit, sleep. Otherwise the lock changed
* and we need to retry or we lost a race to the thread
* unlocking the umtx.
*/
umtxq_lock(&uq->uq_key);
if (old == owner)
error = umtxq_sleep(uq, "umtx", timo);
umtxq_remove(uq);
umtxq_unlock(&uq->uq_key);
umtx_key_release(&uq->uq_key);
}
return (0);
}
/*
* Lock a umtx object.
*/
static int
do_lock_umtx32(struct thread *td, void *m, uint32_t id,
struct timespec *timeout)
{
struct timespec ts, ts2, ts3;
struct timeval tv;
int error;
if (timeout == NULL) {
error = _do_lock_umtx32(td, m, id, 0);
/* Mutex locking is restarted if it is interrupted. */
if (error == EINTR)
error = ERESTART;
} else {
getnanouptime(&ts);
timespecadd(&ts, timeout);
TIMESPEC_TO_TIMEVAL(&tv, timeout);
for (;;) {
error = _do_lock_umtx32(td, m, id, tvtohz(&tv));
if (error != ETIMEDOUT)
break;
getnanouptime(&ts2);
if (timespeccmp(&ts2, &ts, >=)) {
error = ETIMEDOUT;
break;
}
ts3 = ts;
timespecsub(&ts3, &ts2);
TIMESPEC_TO_TIMEVAL(&tv, &ts3);
}
/* Timed-locking is not restarted. */
if (error == ERESTART)
error = EINTR;
}
return (error);
}
/*
* Unlock a umtx object.
*/
static int
do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
{
struct umtx_key key;
uint32_t owner;
uint32_t old;
int error;
int count;
/*
* Make sure we own this mtx.
*/
owner = fuword32(m);
if (owner == -1)
return (EFAULT);
if ((owner & ~UMUTEX_CONTESTED) != id)
return (EPERM);
/* This should be done in userland */
if ((owner & UMUTEX_CONTESTED) == 0) {
old = casuword32(m, owner, UMUTEX_UNOWNED);
if (old == -1)
return (EFAULT);
if (old == owner)
return (0);
owner = old;
}
/* We should only ever be in here for contested locks */
if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
&key)) != 0)
return (error);
umtxq_lock(&key);
umtxq_busy(&key);
count = umtxq_count(&key);
umtxq_unlock(&key);
/*
* When unlocking the umtx, it must be marked as unowned if
* there is no more than one thread waiting for it.
* Otherwise, it must be marked as contested.
*/
old = casuword32(m, owner,
count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
umtxq_lock(&key);
umtxq_signal(&key,1);
umtxq_unbusy(&key);
umtxq_unlock(&key);
umtx_key_release(&key);
if (old == -1)
return (EFAULT);
if (old != owner)
return (EINVAL);
return (0);
}
#endif
/*
* Fetch and compare a value; sleep on the address if the value has not changed.
*/
static int
do_wait(struct thread *td, void *addr, u_long id,
struct timespec *timeout, int compat32, int is_private)
{
struct umtx_q *uq;
struct timespec ts, ts2, ts3;
struct timeval tv;
u_long tmp;
int error = 0;
uq = td->td_umtxq;
if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
return (error);
umtxq_lock(&uq->uq_key);
umtxq_insert(uq);
umtxq_unlock(&uq->uq_key);
if (compat32 == 0)
tmp = fuword(addr);
else
tmp = (unsigned int)fuword32(addr);
if (tmp != id) {
umtxq_lock(&uq->uq_key);
umtxq_remove(uq);
umtxq_unlock(&uq->uq_key);
} else if (timeout == NULL) {
umtxq_lock(&uq->uq_key);
error = umtxq_sleep(uq, "uwait", 0);
umtxq_remove(uq);
umtxq_unlock(&uq->uq_key);
} else {
getnanouptime(&ts);
timespecadd(&ts, timeout);
TIMESPEC_TO_TIMEVAL(&tv, timeout);
umtxq_lock(&uq->uq_key);
for (;;) {
error = umtxq_sleep(uq, "uwait", tvtohz(&tv));
if (!(uq->uq_flags & UQF_UMTXQ)) {
error = 0;
break;
}
if (error != ETIMEDOUT)
break;
umtxq_unlock(&uq->uq_key);
getnanouptime(&ts2);
if (timespeccmp(&ts2, &ts, >=)) {
error = ETIMEDOUT;
umtxq_lock(&uq->uq_key);
break;
}
ts3 = ts;
timespecsub(&ts3, &ts2);
TIMESPEC_TO_TIMEVAL(&tv, &ts3);
umtxq_lock(&uq->uq_key);
}
umtxq_remove(uq);
umtxq_unlock(&uq->uq_key);
}
umtx_key_release(&uq->uq_key);
if (error == ERESTART)
error = EINTR;
return (error);
}
/*
* Wake up threads sleeping on the specified address.
*/
int
kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
{
struct umtx_key key;
int ret;
if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
return (ret);
umtxq_lock(&key);
ret = umtxq_signal(&key, n_wake);
umtxq_unlock(&key);
umtx_key_release(&key);
return (0);
}
/*
* Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
*/
static int
_do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo,
int mode)
{
struct umtx_q *uq;
uint32_t owner, old, id;
int error = 0;
id = td->td_tid;
uq = td->td_umtxq;
/*
* Care must be exercised when dealing with the umtx structure. It
* can fault on any access.
*/
for (;;) {
owner = fuword32(__DEVOLATILE(void *, &m->m_owner));
if (mode == _UMUTEX_WAIT) {
if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED)
return (0);
} else {
/*
* Try the uncontested case. This should be done in userland.
*/
owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
/* The acquire succeeded. */
if (owner == UMUTEX_UNOWNED)
return (0);
/* The address was invalid. */
if (owner == -1)
return (EFAULT);
/* If no one owns it but it is contested try to acquire it. */
if (owner == UMUTEX_CONTESTED) {
owner = casuword32(&m->m_owner,
UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
if (owner == UMUTEX_CONTESTED)
return (0);
/* The address was invalid. */
if (owner == -1)
return (EFAULT);
/* If this failed the lock has changed, restart. */
continue;
}
}
if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
(owner & ~UMUTEX_CONTESTED) == id)
return (EDEADLK);
if (mode == _UMUTEX_TRY)
return (EBUSY);
/*
* If we caught a signal, we have retried and now
* exit immediately.
*/
if (error != 0)
return (error);
if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
GET_SHARE(flags), &uq->uq_key)) != 0)
return (error);
umtxq_lock(&uq->uq_key);
umtxq_busy(&uq->uq_key);
umtxq_insert(uq);
umtxq_unlock(&uq->uq_key);
/*
* Set the contested bit so that a release in user space
* knows to use the system call for unlock. If this fails
* either someone else has acquired the lock or it has been
* released.
*/
old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
/* The address was invalid. */
if (old == -1) {
umtxq_lock(&uq->uq_key);
umtxq_remove(uq);
umtxq_unbusy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
umtx_key_release(&uq->uq_key);
return (EFAULT);
}
/*
* We set the contested bit, sleep. Otherwise the lock changed
* and we need to retry or we lost a race to the thread
* unlocking the umtx.
*/
umtxq_lock(&uq->uq_key);
umtxq_unbusy(&uq->uq_key);
if (old == owner)
error = umtxq_sleep(uq, "umtxn", timo);
umtxq_remove(uq);
umtxq_unlock(&uq->uq_key);
umtx_key_release(&uq->uq_key);
}
return (0);
}
/*
* Unlock PTHREAD_PRIO_NONE protocol POSIX mutex.
*/
static int
do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
{
struct umtx_key key;
uint32_t owner, old, id;
int error;
int count;
id = td->td_tid;
/*
* Make sure we own this mtx.
*/
owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
if (owner == -1)
return (EFAULT);
if ((owner & ~UMUTEX_CONTESTED) != id)
return (EPERM);
if ((owner & UMUTEX_CONTESTED) == 0) {
old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
if (old == -1)
return (EFAULT);
if (old == owner)
return (0);
owner = old;
}
/* We should only ever be in here for contested locks */
if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
&key)) != 0)
return (error);
umtxq_lock(&key);
umtxq_busy(&key);
count = umtxq_count(&key);
umtxq_unlock(&key);
/*
* When unlocking the umtx, it must be marked as unowned if
* there is no more than one thread waiting for it.
* Otherwise, it must be marked as contested.
*/
old = casuword32(&m->m_owner, owner,
count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
umtxq_lock(&key);
umtxq_signal(&key,1);
umtxq_unbusy(&key);
umtxq_unlock(&key);
umtx_key_release(&key);
if (old == -1)
return (EFAULT);
if (old != owner)
return (EINVAL);
return (0);
}
/*
* Check whether the mutex is available and wake up a waiter;
* this applies only to simple (non-PI, non-PP) mutexes.
*/
static int
do_wake_umutex(struct thread *td, struct umutex *m)
{
struct umtx_key key;
uint32_t owner;
uint32_t flags;
int error;
int count;
owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
if (owner == -1)
return (EFAULT);
if ((owner & ~UMUTEX_CONTESTED) != 0)
return (0);
flags = fuword32(&m->m_flags);
/* We should only ever be in here for contested locks */
if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
&key)) != 0)
return (error);
umtxq_lock(&key);
umtxq_busy(&key);
count = umtxq_count(&key);
umtxq_unlock(&key);
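/*
 * If at most one waiter remains, try to reset the lock word from
 * UMUTEX_CONTESTED back to UMUTEX_UNOWNED; then wake a single
 * waiter when any exist and the word still shows no owner.
 */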
if (count <= 1)
owner = casuword32(&m->m_owner, UMUTEX_CONTESTED, UMUTEX_UNOWNED);
umtxq_lock(&key);
if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
umtxq_signal(&key, 1);
umtxq_unbusy(&key);
umtxq_unlock(&key);
umtx_key_release(&key);
return (0);
}
static inline struct umtx_pi *
umtx_pi_alloc(int flags)
{
struct umtx_pi *pi;
pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
TAILQ_INIT(&pi->pi_blocked);
atomic_add_int(&umtx_pi_allocated, 1);
return (pi);
}
static inline void
umtx_pi_free(struct umtx_pi *pi)
{
uma_zfree(umtx_pi_zone, pi);
atomic_add_int(&umtx_pi_allocated, -1);
}
/*
* Adjust the thread's position on the PI mutex's blocked list after
* its priority has been changed.
*/
static int
umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
{
struct umtx_q *uq, *uq1, *uq2;
struct thread *td1;
mtx_assert(&umtx_lock, MA_OWNED);
if (pi == NULL)
return (0);
uq = td->td_umtxq;
/*
* Check if the thread needs to be moved on the blocked chain.
* It needs to be moved if either its priority is lower than
* the previous thread or higher than the next thread.
*/
uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
uq2 = TAILQ_NEXT(uq, uq_lockq);
if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
(uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
/*
* Remove thread from blocked chain and determine where
* it should be moved to.
*/
TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
td1 = uq1->uq_thread;
MPASS(td1->td_proc->p_magic == P_MAGIC);
if (UPRI(td1) > UPRI(td))
break;
}
if (uq1 == NULL)
TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
else
TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
}
return (1);
}
/*
* Propagate priority when a thread is blocked on a POSIX
* PI mutex.
*/
static void
umtx_propagate_priority(struct thread *td)
{
struct umtx_q *uq;
struct umtx_pi *pi;
int pri;
mtx_assert(&umtx_lock, MA_OWNED);
pri = UPRI(td);
uq = td->td_umtxq;
pi = uq->uq_pi_blocked;
if (pi == NULL)
return;
for (;;) {
td = pi->pi_owner;
if (td == NULL || td == curthread)
return;
MPASS(td->td_proc != NULL);
MPASS(td->td_proc->p_magic == P_MAGIC);
thread_lock(td);
if (td->td_lend_user_pri > pri)
sched_lend_user_prio(td, pri);
else {
thread_unlock(td);
break;
}
thread_unlock(td);
/*
* Pick up the lock that td is blocked on.
*/
uq = td->td_umtxq;
pi = uq->uq_pi_blocked;
if (pi == NULL)
break;
/* Resort td on the list if needed. */
umtx_pi_adjust_thread(pi, td);
}
}
/*
* Unpropagate priority for a PI mutex when a thread blocked on
* it is interrupted by a signal or resumed by another thread.
*/
static void
umtx_repropagate_priority(struct umtx_pi *pi)
{
struct umtx_q *uq, *uq_owner;
struct umtx_pi *pi2;
int pri;
mtx_assert(&umtx_lock, MA_OWNED);
while (pi != NULL && pi->pi_owner != NULL) {
pri = PRI_MAX;
uq_owner = pi->pi_owner->td_umtxq;
TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
uq = TAILQ_FIRST(&pi2->pi_blocked);
if (uq != NULL) {
if (pri > UPRI(uq->uq_thread))
pri = UPRI(uq->uq_thread);
}
}
if (pri > uq_owner->uq_inherited_pri)
pri = uq_owner->uq_inherited_pri;
thread_lock(pi->pi_owner);
sched_lend_user_prio(pi->pi_owner, pri);
thread_unlock(pi->pi_owner);
if ((pi = uq_owner->uq_pi_blocked) != NULL)
umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
}
}
/*
* Insert a PI mutex into owned list.
*/
static void
umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
{
struct umtx_q *uq_owner;
uq_owner = owner->td_umtxq;
mtx_assert(&umtx_lock, MA_OWNED);
if (pi->pi_owner != NULL)
panic("pi_ower != NULL");
pi->pi_owner = owner;
TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
}
/*
* Claim ownership of a PI mutex.
*/
static int
umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
{
struct umtx_q *uq, *uq_owner;
uq_owner = owner->td_umtxq;
mtx_lock_spin(&umtx_lock);
if (pi->pi_owner == owner) {
mtx_unlock_spin(&umtx_lock);
return (0);
}
if (pi->pi_owner != NULL) {
/*
* userland may have already messed up the mutex, sigh.
*/
mtx_unlock_spin(&umtx_lock);
return (EPERM);
}
umtx_pi_setowner(pi, owner);
uq = TAILQ_FIRST(&pi->pi_blocked);
if (uq != NULL) {
int pri;
pri = UPRI(uq->uq_thread);
thread_lock(owner);
if (pri < UPRI(owner))
sched_lend_user_prio(owner, pri);
thread_unlock(owner);
}
mtx_unlock_spin(&umtx_lock);
return (0);
}
/*
* Adjust a thread's position on the PI mutex it is blocked on;
* this may trigger a new round of priority propagation.
*/
void
umtx_pi_adjust(struct thread *td, u_char oldpri)
{
struct umtx_q *uq;
struct umtx_pi *pi;
uq = td->td_umtxq;
mtx_lock_spin(&umtx_lock);
/*
* Pick up the lock that td is blocked on.
*/
pi = uq->uq_pi_blocked;
if (pi != NULL) {
umtx_pi_adjust_thread(pi, td);
umtx_repropagate_priority(pi);
}
mtx_unlock_spin(&umtx_lock);
}
/*
* Sleep on a PI mutex.
*/
static int
umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
uint32_t owner, const char *wmesg, int timo)
{
struct umtxq_chain *uc;
struct thread *td, *td1;
struct umtx_q *uq1;
int pri;
int error = 0;
td = uq->uq_thread;
KASSERT(td == curthread, ("inconsistent uq_thread"));
uc = umtxq_getchain(&uq->uq_key);
UMTXQ_LOCKED_ASSERT(uc);
UMTXQ_BUSY_ASSERT(uc);
umtxq_insert(uq);
mtx_lock_spin(&umtx_lock);
if (pi->pi_owner == NULL) {
mtx_unlock_spin(&umtx_lock);
/* XXX Only look up thread in current process. */
td1 = tdfind(owner, curproc->p_pid);
mtx_lock_spin(&umtx_lock);
if (td1 != NULL) {
if (pi->pi_owner == NULL)
umtx_pi_setowner(pi, td1);
PROC_UNLOCK(td1->td_proc);
}
}
TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
pri = UPRI(uq1->uq_thread);
if (pri > UPRI(td))
break;
}
if (uq1 != NULL)
TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
else
TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
uq->uq_pi_blocked = pi;
thread_lock(td);
td->td_flags |= TDF_UPIBLOCKED;
thread_unlock(td);
umtx_propagate_priority(td);
mtx_unlock_spin(&umtx_lock);
umtxq_unbusy(&uq->uq_key);
if (uq->uq_flags & UQF_UMTXQ) {
error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
if (error == EWOULDBLOCK)
error = ETIMEDOUT;
if (uq->uq_flags & UQF_UMTXQ) {
umtxq_remove(uq);
}
}
mtx_lock_spin(&umtx_lock);
uq->uq_pi_blocked = NULL;
thread_lock(td);
td->td_flags &= ~TDF_UPIBLOCKED;
thread_unlock(td);
TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
umtx_repropagate_priority(pi);
mtx_unlock_spin(&umtx_lock);
umtxq_unlock(&uq->uq_key);
return (error);
}
/*
* Add reference count for a PI mutex.
*/
static void
umtx_pi_ref(struct umtx_pi *pi)
{
struct umtxq_chain *uc;
uc = umtxq_getchain(&pi->pi_key);
UMTXQ_LOCKED_ASSERT(uc);
pi->pi_refcount++;
}
/*
* Decrease the reference count of a PI mutex; when the count
* drops to zero, its memory is freed.
*/
static void
umtx_pi_unref(struct umtx_pi *pi)
{
struct umtxq_chain *uc;
uc = umtxq_getchain(&pi->pi_key);
UMTXQ_LOCKED_ASSERT(uc);
KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
if (--pi->pi_refcount == 0) {
mtx_lock_spin(&umtx_lock);
if (pi->pi_owner != NULL) {
TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
pi, pi_link);
pi->pi_owner = NULL;
}
KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
("blocked queue not empty"));
mtx_unlock_spin(&umtx_lock);
TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
umtx_pi_free(pi);
}
}
/*
* Find a PI mutex in hash table.
*/
static struct umtx_pi *
umtx_pi_lookup(struct umtx_key *key)
{
struct umtxq_chain *uc;
struct umtx_pi *pi;
uc = umtxq_getchain(key);
UMTXQ_LOCKED_ASSERT(uc);
TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
if (umtx_key_match(&pi->pi_key, key)) {
return (pi);
}
}
return (NULL);
}
/*
* Insert a PI mutex into hash table.
*/
static inline void
umtx_pi_insert(struct umtx_pi *pi)
{
struct umtxq_chain *uc;
uc = umtxq_getchain(&pi->pi_key);
UMTXQ_LOCKED_ASSERT(uc);
TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
}
/*
* Lock a PI mutex.
*/
static int
_do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, int timo,
int try)
{
struct umtx_q *uq;
struct umtx_pi *pi, *new_pi;
uint32_t id, owner, old;
int error;
id = td->td_tid;
uq = td->td_umtxq;
if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
&uq->uq_key)) != 0)
return (error);
umtxq_lock(&uq->uq_key);
pi = umtx_pi_lookup(&uq->uq_key);
if (pi == NULL) {
new_pi = umtx_pi_alloc(M_NOWAIT);
if (new_pi == NULL) {
umtxq_unlock(&uq->uq_key);
new_pi = umtx_pi_alloc(M_WAITOK);
umtxq_lock(&uq->uq_key);
pi = umtx_pi_lookup(&uq->uq_key);
if (pi != NULL) {
umtx_pi_free(new_pi);
new_pi = NULL;
}
}
if (new_pi != NULL) {
new_pi->pi_key = uq->uq_key;
umtx_pi_insert(new_pi);
pi = new_pi;
}
}
umtx_pi_ref(pi);
umtxq_unlock(&uq->uq_key);
/*
* Care must be exercised when dealing with the umtx structure. It
* can fault on any access.
*/
for (;;) {
/*
* Try the uncontested case. This should be done in userland.
*/
owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
/* The acquire succeeded. */
if (owner == UMUTEX_UNOWNED) {
error = 0;
break;
}
/* The address was invalid. */
if (owner == -1) {
error = EFAULT;
break;
}
/* If no one owns it but it is contested try to acquire it. */
if (owner == UMUTEX_CONTESTED) {
owner = casuword32(&m->m_owner,
UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
if (owner == UMUTEX_CONTESTED) {
umtxq_lock(&uq->uq_key);
umtxq_busy(&uq->uq_key);
error = umtx_pi_claim(pi, td);
umtxq_unbusy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
break;
}
/* The address was invalid. */
if (owner == -1) {
error = EFAULT;
break;
}
/* If this failed the lock has changed, restart. */
continue;
}
if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
(owner & ~UMUTEX_CONTESTED) == id) {
error = EDEADLK;
break;
}
if (try != 0) {
error = EBUSY;
break;
}
/*
* If we caught a signal, we have retried and now
* exit immediately.
*/
if (error != 0)
break;
umtxq_lock(&uq->uq_key);
umtxq_busy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
/*
* Set the contested bit so that a release in user space
* knows to use the system call for unlock. If this fails
* either someone else has acquired the lock or it has been
* released.
*/
old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
/* The address was invalid. */
if (old == -1) {
umtxq_lock(&uq->uq_key);
umtxq_unbusy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
error = EFAULT;
break;
}
umtxq_lock(&uq->uq_key);
/*
* We set the contested bit, sleep. Otherwise the lock changed
* and we need to retry or we lost a race to the thread
* unlocking the umtx.
*/
if (old == owner)
error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
"umtxpi", timo);
else {
umtxq_unbusy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
}
}
umtxq_lock(&uq->uq_key);
umtx_pi_unref(pi);
umtxq_unlock(&uq->uq_key);
umtx_key_release(&uq->uq_key);
return (error);
}
/*
* Unlock a PI mutex.
*/
static int
do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
{
struct umtx_key key;
struct umtx_q *uq_first, *uq_first2, *uq_me;
struct umtx_pi *pi, *pi2;
uint32_t owner, old, id;
int error;
int count;
int pri;
id = td->td_tid;
/*
* Make sure we own this mtx.
*/
owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
if (owner == -1)
return (EFAULT);
if ((owner & ~UMUTEX_CONTESTED) != id)
return (EPERM);
/* This should be done in userland */
if ((owner & UMUTEX_CONTESTED) == 0) {
old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
if (old == -1)
return (EFAULT);
if (old == owner)
return (0);
owner = old;
}
/* We should only ever be in here for contested locks */
if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
&key)) != 0)
return (error);
umtxq_lock(&key);
umtxq_busy(&key);
count = umtxq_count_pi(&key, &uq_first);
if (uq_first != NULL) {
mtx_lock_spin(&umtx_lock);
pi = uq_first->uq_pi_blocked;
KASSERT(pi != NULL, ("pi == NULL?"));
if (pi->pi_owner != curthread) {
mtx_unlock_spin(&umtx_lock);
umtxq_unbusy(&key);
umtxq_unlock(&key);
umtx_key_release(&key);
/* userland messed up the mutex */
return (EPERM);
}
uq_me = curthread->td_umtxq;
pi->pi_owner = NULL;
TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
/* get highest priority thread which is still sleeping. */
uq_first = TAILQ_FIRST(&pi->pi_blocked);
while (uq_first != NULL &&
(uq_first->uq_flags & UQF_UMTXQ) == 0) {
uq_first = TAILQ_NEXT(uq_first, uq_lockq);
}
pri = PRI_MAX;
TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
if (uq_first2 != NULL) {
if (pri > UPRI(uq_first2->uq_thread))
pri = UPRI(uq_first2->uq_thread);
}
}
thread_lock(curthread);
sched_lend_user_prio(curthread, pri);
thread_unlock(curthread);
mtx_unlock_spin(&umtx_lock);
if (uq_first)
umtxq_signal_thread(uq_first);
}
umtxq_unlock(&key);
/*
* When unlocking the umtx, it must be marked as unowned if
* there is no more than one thread waiting for it.
* Otherwise, it must be marked as contested.
*/
old = casuword32(&m->m_owner, owner,
count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
umtxq_lock(&key);
umtxq_unbusy(&key);
umtxq_unlock(&key);
umtx_key_release(&key);
if (old == -1)
return (EFAULT);
if (old != owner)
return (EINVAL);
return (0);
}
/*
* Lock a PP mutex.
*/
static int
_do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, int timo,
int try)
{
struct umtx_q *uq, *uq2;
struct umtx_pi *pi;
uint32_t ceiling;
uint32_t owner, id;
int error, pri, old_inherited_pri, su;
id = td->td_tid;
uq = td->td_umtxq;
if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
&uq->uq_key)) != 0)
return (error);
su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
for (;;) {
old_inherited_pri = uq->uq_inherited_pri;
umtxq_lock(&uq->uq_key);
umtxq_busy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
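/*
 * Translate the userland priority ceiling into the kernel's
 * realtime priority range and validate it before comparing it
 * with the thread's priority.
 */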
ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
if (ceiling > RTP_PRIO_MAX) {
error = EINVAL;
goto out;
}
mtx_lock_spin(&umtx_lock);
if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
mtx_unlock_spin(&umtx_lock);
error = EINVAL;
goto out;
}
if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
thread_lock(td);
if (uq->uq_inherited_pri < UPRI(td))
sched_lend_user_prio(td, uq->uq_inherited_pri);
thread_unlock(td);
}
mtx_unlock_spin(&umtx_lock);
owner = casuword32(&m->m_owner,
UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
if (owner == UMUTEX_CONTESTED) {
error = 0;
break;
}
/* The address was invalid. */
if (owner == -1) {
error = EFAULT;
break;
}
if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
(owner & ~UMUTEX_CONTESTED) == id) {
error = EDEADLK;
break;
}
if (try != 0) {
error = EBUSY;
break;
}
/*
* If we caught a signal, we have retried and now
* exit immediately.
*/
if (error != 0)
break;
umtxq_lock(&uq->uq_key);
umtxq_insert(uq);
umtxq_unbusy(&uq->uq_key);
error = umtxq_sleep(uq, "umtxpp", timo);
umtxq_remove(uq);
umtxq_unlock(&uq->uq_key);
mtx_lock_spin(&umtx_lock);
uq->uq_inherited_pri = old_inherited_pri;
pri = PRI_MAX;
TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
uq2 = TAILQ_FIRST(&pi->pi_blocked);
if (uq2 != NULL) {
if (pri > UPRI(uq2->uq_thread))
pri = UPRI(uq2->uq_thread);
}
}
if (pri > uq->uq_inherited_pri)
pri = uq->uq_inherited_pri;
thread_lock(td);
sched_lend_user_prio(td, pri);
thread_unlock(td);
mtx_unlock_spin(&umtx_lock);
}
if (error != 0) {
mtx_lock_spin(&umtx_lock);
uq->uq_inherited_pri = old_inherited_pri;
pri = PRI_MAX;
TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
uq2 = TAILQ_FIRST(&pi->pi_blocked);
if (uq2 != NULL) {
if (pri > UPRI(uq2->uq_thread))
pri = UPRI(uq2->uq_thread);
}
}
if (pri > uq->uq_inherited_pri)
pri = uq->uq_inherited_pri;
thread_lock(td);
sched_lend_user_prio(td, pri);
thread_unlock(td);
mtx_unlock_spin(&umtx_lock);
}
out:
umtxq_lock(&uq->uq_key);
umtxq_unbusy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
umtx_key_release(&uq->uq_key);
return (error);
}
/*
* Unlock a PP mutex.
*/
static int
do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
{
struct umtx_key key;
struct umtx_q *uq, *uq2;
struct umtx_pi *pi;
uint32_t owner, id;
uint32_t rceiling;
int error, pri, new_inherited_pri, su;
id = td->td_tid;
uq = td->td_umtxq;
su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
/*
* Make sure we own this mtx.
*/
owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
if (owner == -1)
return (EFAULT);
if ((owner & ~UMUTEX_CONTESTED) != id)
return (EPERM);
error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
if (error != 0)
return (error);
if (rceiling == -1)
new_inherited_pri = PRI_MAX;
else {
rceiling = RTP_PRIO_MAX - rceiling;
if (rceiling > RTP_PRIO_MAX)
return (EINVAL);
new_inherited_pri = PRI_MIN_REALTIME + rceiling;
}
if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
&key)) != 0)
return (error);
umtxq_lock(&key);
umtxq_busy(&key);
umtxq_unlock(&key);
/*
* For a priority-protected mutex, always set the unlocked state
* to UMUTEX_CONTESTED so that userland always enters the kernel
* to lock the mutex.  This is necessary because the thread
* priority has to be adjusted for such mutexes.
*/
error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
UMUTEX_CONTESTED);
umtxq_lock(&key);
if (error == 0)
umtxq_signal(&key, 1);
umtxq_unbusy(&key);
umtxq_unlock(&key);
if (error == -1)
error = EFAULT;
else {
mtx_lock_spin(&umtx_lock);
if (su != 0)
uq->uq_inherited_pri = new_inherited_pri;
pri = PRI_MAX;
TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
uq2 = TAILQ_FIRST(&pi->pi_blocked);
if (uq2 != NULL) {
if (pri > UPRI(uq2->uq_thread))
pri = UPRI(uq2->uq_thread);
}
}
if (pri > uq->uq_inherited_pri)
pri = uq->uq_inherited_pri;
thread_lock(td);
sched_lend_user_prio(td, pri);
thread_unlock(td);
mtx_unlock_spin(&umtx_lock);
}
umtx_key_release(&key);
return (error);
}
static int
do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
uint32_t *old_ceiling)
{
struct umtx_q *uq;
uint32_t save_ceiling;
uint32_t owner, id;
uint32_t flags;
int error;
flags = fuword32(&m->m_flags);
if ((flags & UMUTEX_PRIO_PROTECT) == 0)
return (EINVAL);
if (ceiling > RTP_PRIO_MAX)
return (EINVAL);
id = td->td_tid;
uq = td->td_umtxq;
if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
&uq->uq_key)) != 0)
return (error);
for (;;) {
umtxq_lock(&uq->uq_key);
umtxq_busy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
save_ceiling = fuword32(&m->m_ceilings[0]);
owner = casuword32(&m->m_owner,
UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
if (owner == UMUTEX_CONTESTED) {
suword32(&m->m_ceilings[0], ceiling);
suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
UMUTEX_CONTESTED);
error = 0;
break;
}
/* The address was invalid. */
if (owner == -1) {
error = EFAULT;
break;
}
if ((owner & ~UMUTEX_CONTESTED) == id) {
suword32(&m->m_ceilings[0], ceiling);
error = 0;
break;
}
/*
* If we caught a signal, we have retried and now
* exit immediately.
*/
if (error != 0)
break;
/*
* We set the contested bit, sleep. Otherwise the lock changed
* and we need to retry or we lost a race to the thread
* unlocking the umtx.
*/
umtxq_lock(&uq->uq_key);
umtxq_insert(uq);
umtxq_unbusy(&uq->uq_key);
error = umtxq_sleep(uq, "umtxpp", 0);
umtxq_remove(uq);
umtxq_unlock(&uq->uq_key);
}
umtxq_lock(&uq->uq_key);
if (error == 0)
umtxq_signal(&uq->uq_key, INT_MAX);
umtxq_unbusy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
umtx_key_release(&uq->uq_key);
if (error == 0 && old_ceiling != NULL)
suword32(old_ceiling, save_ceiling);
return (error);
}
static int
_do_lock_umutex(struct thread *td, struct umutex *m, int flags, int timo,
int mode)
{
switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
case 0:
return (_do_lock_normal(td, m, flags, timo, mode));
case UMUTEX_PRIO_INHERIT:
return (_do_lock_pi(td, m, flags, timo, mode));
case UMUTEX_PRIO_PROTECT:
return (_do_lock_pp(td, m, flags, timo, mode));
}
return (EINVAL);
}
/*
* Lock a userland POSIX mutex.
*/
static int
do_lock_umutex(struct thread *td, struct umutex *m,
struct timespec *timeout, int mode)
{
struct timespec ts, ts2, ts3;
struct timeval tv;
uint32_t flags;
int error;
flags = fuword32(&m->m_flags);
if (flags == -1)
return (EFAULT);
if (timeout == NULL) {
error = _do_lock_umutex(td, m, flags, 0, mode);
/* Mutex locking is restarted if it is interrupted. */
if (error == EINTR && mode != _UMUTEX_WAIT)
error = ERESTART;
} else {
getnanouptime(&ts);
timespecadd(&ts, timeout);
TIMESPEC_TO_TIMEVAL(&tv, timeout);
for (;;) {
error = _do_lock_umutex(td, m, flags, tvtohz(&tv), mode);
if (error != ETIMEDOUT)
break;
getnanouptime(&ts2);
if (timespeccmp(&ts2, &ts, >=)) {
error = ETIMEDOUT;
break;
}
ts3 = ts;
timespecsub(&ts3, &ts2);
TIMESPEC_TO_TIMEVAL(&tv, &ts3);
}
/* Timed-locking is not restarted. */
if (error == ERESTART)
error = EINTR;
}
return (error);
}
/*
* Unlock a userland POSIX mutex.
*/
static int
do_unlock_umutex(struct thread *td, struct umutex *m)
{
uint32_t flags;
flags = fuword32(&m->m_flags);
if (flags == -1)
return (EFAULT);
switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
case 0:
return (do_unlock_normal(td, m, flags));
case UMUTEX_PRIO_INHERIT:
return (do_unlock_pi(td, m, flags));
case UMUTEX_PRIO_PROTECT:
return (do_unlock_pp(td, m, flags));
}
return (EINVAL);
}
static int
do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
struct timespec *timeout, u_long wflags)
{
struct umtx_q *uq;
struct timeval tv;
struct timespec cts, ets, tts;
uint32_t flags;
uint32_t clockid;
int error;
uq = td->td_umtxq;
flags = fuword32(&cv->c_flags);
error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
if (error != 0)
return (error);
if ((wflags & CVWAIT_CLOCKID) != 0) {
clockid = fuword32(&cv->c_clockid);
if (clockid < CLOCK_REALTIME ||
clockid >= CLOCK_THREAD_CPUTIME_ID) {
/* hmm, only HW clock id will work. */
return (EINVAL);
}
} else {
clockid = CLOCK_REALTIME;
}
umtxq_lock(&uq->uq_key);
umtxq_busy(&uq->uq_key);
umtxq_insert(uq);
umtxq_unlock(&uq->uq_key);
/*
* Set c_has_waiters to 1 before releasing the user mutex, but
* avoid dirtying the cache line when it is already set.
*/
if (fuword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters)) == 0)
suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);
umtxq_lock(&uq->uq_key);
umtxq_unbusy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
error = do_unlock_umutex(td, m);
umtxq_lock(&uq->uq_key);
if (error == 0) {
if (timeout == NULL) {
error = umtxq_sleep(uq, "ucond", 0);
} else {
if ((wflags & CVWAIT_ABSTIME) == 0) {
kern_clock_gettime(td, clockid, &ets);
timespecadd(&ets, timeout);
tts = *timeout;
} else { /* absolute time */
ets = *timeout;
tts = *timeout;
kern_clock_gettime(td, clockid, &cts);
timespecsub(&tts, &cts);
}
TIMESPEC_TO_TIMEVAL(&tv, &tts);
for (;;) {
error = umtxq_sleep(uq, "ucond", tvtohz(&tv));
if (error != ETIMEDOUT)
break;
kern_clock_gettime(td, clockid, &cts);
if (timespeccmp(&cts, &ets, >=)) {
error = ETIMEDOUT;
break;
}
tts = ets;
timespecsub(&tts, &cts);
TIMESPEC_TO_TIMEVAL(&tv, &tts);
}
}
}
if ((uq->uq_flags & UQF_UMTXQ) == 0)
error = 0;
else {
/*
* We got here because of a timeout, an interrupting signal, or
* a spurious wakeup; clear the c_has_waiters flag when
* necessary.
*/
umtxq_busy(&uq->uq_key);
if ((uq->uq_flags & UQF_UMTXQ) != 0) {
int oldlen = uq->uq_cur_queue->length;
umtxq_remove(uq);
if (oldlen == 1) {
umtxq_unlock(&uq->uq_key);
suword32(
__DEVOLATILE(uint32_t *,
&cv->c_has_waiters), 0);
umtxq_lock(&uq->uq_key);
}
}
umtxq_unbusy(&uq->uq_key);
if (error == ERESTART)
error = EINTR;
}
umtxq_unlock(&uq->uq_key);
umtx_key_release(&uq->uq_key);
return (error);
}
/*
* Signal a userland condition variable.
*/
static int
do_cv_signal(struct thread *td, struct ucond *cv)
{
struct umtx_key key;
int error, cnt, nwake;
uint32_t flags;
flags = fuword32(&cv->c_flags);
if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
return (error);
umtxq_lock(&key);
umtxq_busy(&key);
cnt = umtxq_count(&key);
nwake = umtxq_signal(&key, 1);
if (cnt <= nwake) {
umtxq_unlock(&key);
error = suword32(
__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
umtxq_lock(&key);
}
umtxq_unbusy(&key);
umtxq_unlock(&key);
umtx_key_release(&key);
return (error);
}
static int
do_cv_broadcast(struct thread *td, struct ucond *cv)
{
struct umtx_key key;
int error;
uint32_t flags;
flags = fuword32(&cv->c_flags);
if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
return (error);
umtxq_lock(&key);
umtxq_busy(&key);
umtxq_signal(&key, INT_MAX);
umtxq_unlock(&key);
error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
umtxq_lock(&key);
umtxq_unbusy(&key);
umtxq_unlock(&key);
umtx_key_release(&key);
return (error);
}
static int
do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, int timo)
{
struct umtx_q *uq;
uint32_t flags, wrflags;
int32_t state, oldstate;
int32_t blocked_readers;
int error;
uq = td->td_umtxq;
flags = fuword32(&rwlock->rw_flags);
error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
if (error != 0)
return (error);
wrflags = URWLOCK_WRITE_OWNER;
if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
wrflags |= URWLOCK_WRITE_WAITERS;
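/*
 * Unless reader preference is requested by the caller or set in the
 * lock's flags, the URWLOCK_WRITE_WAITERS bit also blocks new
 * readers, so a steady stream of readers cannot starve a pending
 * writer.
 */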
for (;;) {
state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
/* try to lock it */
while (!(state & wrflags)) {
if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
umtx_key_release(&uq->uq_key);
return (EAGAIN);
}
oldstate = casuword32(&rwlock->rw_state, state, state + 1);
if (oldstate == state) {
umtx_key_release(&uq->uq_key);
return (0);
}
state = oldstate;
}
if (error)
break;
/* grab monitor lock */
umtxq_lock(&uq->uq_key);
umtxq_busy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
/*
* re-read the state, in case it changed between the try-lock above
* and the check below
*/
state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
/* set read contention bit */
while ((state & wrflags) && !(state & URWLOCK_READ_WAITERS)) {
oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_READ_WAITERS);
if (oldstate == state)
goto sleep;
state = oldstate;
}
/* the state changed while we were setting the flag; restart */
if (!(state & wrflags)) {
umtxq_lock(&uq->uq_key);
umtxq_unbusy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
continue;
}
sleep:
/* the contention bit is set; increase the read waiter count before sleeping */
blocked_readers = fuword32(&rwlock->rw_blocked_readers);
suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
while (state & wrflags) {
umtxq_lock(&uq->uq_key);
umtxq_insert(uq);
umtxq_unbusy(&uq->uq_key);
error = umtxq_sleep(uq, "urdlck", timo);
umtxq_busy(&uq->uq_key);
umtxq_remove(uq);
umtxq_unlock(&uq->uq_key);
if (error)
break;
state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
}
/* decrease the read waiter count; the last waiter also clears the read contention bit */
blocked_readers = fuword32(&rwlock->rw_blocked_readers);
suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
if (blocked_readers == 1) {
state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
for (;;) {
oldstate = casuword32(&rwlock->rw_state, state,
state & ~URWLOCK_READ_WAITERS);
if (oldstate == state)
break;
state = oldstate;
}
}
umtxq_lock(&uq->uq_key);
umtxq_unbusy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
}
umtx_key_release(&uq->uq_key);
return (error);
}
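/*
* Timed variant of do_rw_rdlock(): converts the relative timeout into
* an absolute uptime deadline and retries until the lock is acquired,
* the deadline passes, or the sleep is interrupted.
*/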
static int
do_rw_rdlock2(struct thread *td, void *obj, long val, struct timespec *timeout)
{
struct timespec ts, ts2, ts3;
struct timeval tv;
int error;
getnanouptime(&ts);
timespecadd(&ts, timeout);
TIMESPEC_TO_TIMEVAL(&tv, timeout);
for (;;) {
error = do_rw_rdlock(td, obj, val, tvtohz(&tv));
if (error != ETIMEDOUT)
break;
getnanouptime(&ts2);
if (timespeccmp(&ts2, &ts, >=)) {
error = ETIMEDOUT;
break;
}
ts3 = ts;
timespecsub(&ts3, &ts2);
TIMESPEC_TO_TIMEVAL(&tv, &ts3);
}
if (error == ERESTART)
error = EINTR;
return (error);
}
static int
do_rw_wrlock(struct thread *td, struct urwlock *rwlock, int timo)
{
struct umtx_q *uq;
uint32_t flags;
int32_t state, oldstate;
int32_t blocked_writers;
int32_t blocked_readers;
int error;
uq = td->td_umtxq;
flags = fuword32(&rwlock->rw_flags);
error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
if (error != 0)
return (error);
blocked_readers = 0;
for (;;) {
state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_OWNER);
if (oldstate == state) {
umtx_key_release(&uq->uq_key);
return (0);
}
state = oldstate;
}
if (error) {
if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
blocked_readers != 0) {
umtxq_lock(&uq->uq_key);
umtxq_busy(&uq->uq_key);
umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
umtxq_unbusy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
}
break;
}
/* grab monitor lock */
umtxq_lock(&uq->uq_key);
umtxq_busy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
/*
* re-read the state, in case it changed between the try-lock above
* and the check below
*/
state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
while (((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) &&
(state & URWLOCK_WRITE_WAITERS) == 0) {
oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_WAITERS);
if (oldstate == state)
goto sleep;
state = oldstate;
}
if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
umtxq_lock(&uq->uq_key);
umtxq_unbusy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
continue;
}
sleep:
blocked_writers = fuword32(&rwlock->rw_blocked_writers);
suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
umtxq_lock(&uq->uq_key);
umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
umtxq_unbusy(&uq->uq_key);
error = umtxq_sleep(uq, "uwrlck", timo);
umtxq_busy(&uq->uq_key);
umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
umtxq_unlock(&uq->uq_key);
if (error)
break;
state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
}
blocked_writers = fuword32(&rwlock->rw_blocked_writers);
suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
if (blocked_writers == 1) {
state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
for (;;) {
oldstate = casuword32(&rwlock->rw_state, state,
state & ~URWLOCK_WRITE_WAITERS);
if (oldstate == state)
break;
state = oldstate;
}
blocked_readers = fuword32(&rwlock->rw_blocked_readers);
} else
blocked_readers = 0;
umtxq_lock(&uq->uq_key);
umtxq_unbusy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
}
umtx_key_release(&uq->uq_key);
return (error);
}
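/*
* Timed variant of do_rw_wrlock(), using the same deadline/retry
* scheme as do_rw_rdlock2().
*/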
static int
do_rw_wrlock2(struct thread *td, void *obj, struct timespec *timeout)
{
struct timespec ts, ts2, ts3;
struct timeval tv;
int error;
getnanouptime(&ts);
timespecadd(&ts, timeout);
TIMESPEC_TO_TIMEVAL(&tv, timeout);
for (;;) {
error = do_rw_wrlock(td, obj, tvtohz(&tv));
if (error != ETIMEDOUT)
break;
getnanouptime(&ts2);
if (timespeccmp(&ts2, &ts, >=)) {
error = ETIMEDOUT;
break;
}
ts3 = ts;
timespecsub(&ts3, &ts2);
TIMESPEC_TO_TIMEVAL(&tv, &ts3);
}
if (error == ERESTART)
error = EINTR;
return (error);
}
static int
do_rw_unlock(struct thread *td, struct urwlock *rwlock)
{
struct umtx_q *uq;
uint32_t flags;
int32_t state, oldstate;
int error, q, count;
uq = td->td_umtxq;
flags = fuword32(&rwlock->rw_flags);
error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
if (error != 0)
return (error);
state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
if (state & URWLOCK_WRITE_OWNER) {
for (;;) {
oldstate = casuword32(&rwlock->rw_state, state,
state & ~URWLOCK_WRITE_OWNER);
if (oldstate != state) {
state = oldstate;
if (!(oldstate & URWLOCK_WRITE_OWNER)) {
error = EPERM;
goto out;
}
} else
break;
}
} else if (URWLOCK_READER_COUNT(state) != 0) {
for (;;) {
oldstate = casuword32(&rwlock->rw_state, state,
state - 1);
if (oldstate != state) {
state = oldstate;
if (URWLOCK_READER_COUNT(oldstate) == 0) {
error = EPERM;
goto out;
}
}
else
break;
}
} else {
error = EPERM;
goto out;
}
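/*
* Pick which waiters to wake: unless URWLOCK_PREFER_READER is set,
* a single blocked writer is preferred over waking all readers.
*/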
count = 0;
if (!(flags & URWLOCK_PREFER_READER)) {
if (state & URWLOCK_WRITE_WAITERS) {
count = 1;
q = UMTX_EXCLUSIVE_QUEUE;
} else if (state & URWLOCK_READ_WAITERS) {
count = INT_MAX;
q = UMTX_SHARED_QUEUE;
}
} else {
if (state & URWLOCK_READ_WAITERS) {
count = INT_MAX;
q = UMTX_SHARED_QUEUE;
} else if (state & URWLOCK_WRITE_WAITERS) {
count = 1;
q = UMTX_EXCLUSIVE_QUEUE;
}
}
if (count) {
umtxq_lock(&uq->uq_key);
umtxq_busy(&uq->uq_key);
umtxq_signal_queue(&uq->uq_key, count, q);
umtxq_unbusy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
}
out:
umtx_key_release(&uq->uq_key);
return (error);
}
static int
do_sem_wait(struct thread *td, struct _usem *sem, struct timespec *timeout)
{
struct umtx_q *uq;
struct timeval tv;
struct timespec cts, ets, tts;
uint32_t flags, count;
int error;
uq = td->td_umtxq;
flags = fuword32(&sem->_flags);
error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
if (error != 0)
return (error);
umtxq_lock(&uq->uq_key);
umtxq_busy(&uq->uq_key);
umtxq_insert(uq);
umtxq_unlock(&uq->uq_key);
if (fuword32(__DEVOLATILE(uint32_t *, &sem->_has_waiters)) == 0)
casuword32(__DEVOLATILE(uint32_t *, &sem->_has_waiters), 0, 1);
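/*
* Re-read the semaphore count after setting _has_waiters; if it is
* already non-zero we do not need to sleep.
*/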
count = fuword32(__DEVOLATILE(uint32_t *, &sem->_count));
if (count != 0) {
umtxq_lock(&uq->uq_key);
umtxq_unbusy(&uq->uq_key);
umtxq_remove(uq);
umtxq_unlock(&uq->uq_key);
umtx_key_release(&uq->uq_key);
return (0);
}
umtxq_lock(&uq->uq_key);
umtxq_unbusy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
umtxq_lock(&uq->uq_key);
if (timeout == NULL) {
error = umtxq_sleep(uq, "usem", 0);
} else {
getnanouptime(&ets);
timespecadd(&ets, timeout);
TIMESPEC_TO_TIMEVAL(&tv, timeout);
for (;;) {
error = umtxq_sleep(uq, "usem", tvtohz(&tv));
if (error != ETIMEDOUT)
break;
getnanouptime(&cts);
if (timespeccmp(&cts, &ets, >=)) {
error = ETIMEDOUT;
break;
}
tts = ets;
timespecsub(&tts, &cts);
TIMESPEC_TO_TIMEVAL(&tv, &tts);
}
}
if ((uq->uq_flags & UQF_UMTXQ) == 0)
error = 0;
else {
umtxq_remove(uq);
if (error == ERESTART)
error = EINTR;
}
umtxq_unlock(&uq->uq_key);
umtx_key_release(&uq->uq_key);
return (error);
}
/*
* Wake up a waiter on a userland semaphore.
*/
static int
do_sem_wake(struct thread *td, struct _usem *sem)
{
struct umtx_key key;
int error, cnt, nwake;
uint32_t flags;
flags = fuword32(&sem->_flags);
if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
return (error);
umtxq_lock(&key);
umtxq_busy(&key);
cnt = umtxq_count(&key);
nwake = umtxq_signal(&key, 1);
if (cnt <= nwake) {
umtxq_unlock(&key);
error = suword32(
__DEVOLATILE(uint32_t *, &sem->_has_waiters), 0);
umtxq_lock(&key);
}
umtxq_unbusy(&key);
umtxq_unlock(&key);
umtx_key_release(&key);
return (error);
}
int
-_umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
+sys__umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
/* struct umtx *umtx */
{
return _do_lock_umtx(td, uap->umtx, td->td_tid, 0);
}
int
-_umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
+sys__umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
/* struct umtx *umtx */
{
return do_unlock_umtx(td, uap->umtx, td->td_tid);
}
static int
__umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
{
struct timespec *ts, timeout;
int error;
/* Allow a null timespec (wait forever). */
if (uap->uaddr2 == NULL)
ts = NULL;
else {
error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
if (error != 0)
return (error);
if (timeout.tv_nsec >= 1000000000 ||
timeout.tv_nsec < 0) {
return (EINVAL);
}
ts = &timeout;
}
return (do_lock_umtx(td, uap->obj, uap->val, ts));
}
static int
__umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
{
return (do_unlock_umtx(td, uap->obj, uap->val));
}
static int
__umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
{
struct timespec *ts, timeout;
int error;
if (uap->uaddr2 == NULL)
ts = NULL;
else {
error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
if (error != 0)
return (error);
if (timeout.tv_nsec >= 1000000000 ||
timeout.tv_nsec < 0)
return (EINVAL);
ts = &timeout;
}
return do_wait(td, uap->obj, uap->val, ts, 0, 0);
}
static int
__umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
{
struct timespec *ts, timeout;
int error;
if (uap->uaddr2 == NULL)
ts = NULL;
else {
error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
if (error != 0)
return (error);
if (timeout.tv_nsec >= 1000000000 ||
timeout.tv_nsec < 0)
return (EINVAL);
ts = &timeout;
}
return do_wait(td, uap->obj, uap->val, ts, 1, 0);
}
static int
__umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
{
struct timespec *ts, timeout;
int error;
if (uap->uaddr2 == NULL)
ts = NULL;
else {
error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
if (error != 0)
return (error);
if (timeout.tv_nsec >= 1000000000 ||
timeout.tv_nsec < 0)
return (EINVAL);
ts = &timeout;
}
return do_wait(td, uap->obj, uap->val, ts, 1, 1);
}
static int
__umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
{
return (kern_umtx_wake(td, uap->obj, uap->val, 0));
}
#define BATCH_SIZE 128
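/*
* Wake all waiters on each of the uap->val private words, copying the
* userland pointer array in BATCH_SIZE chunks.
*/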
static int
__umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap)
{
int count = uap->val;
void *uaddrs[BATCH_SIZE];
char **upp = (char **)uap->obj;
int tocopy;
int error = 0;
int i, pos = 0;
while (count > 0) {
tocopy = count;
if (tocopy > BATCH_SIZE)
tocopy = BATCH_SIZE;
error = copyin(upp+pos, uaddrs, tocopy * sizeof(char *));
if (error != 0)
break;
for (i = 0; i < tocopy; ++i)
kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
count -= tocopy;
pos += tocopy;
}
return (error);
}
static int
__umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
{
return (kern_umtx_wake(td, uap->obj, uap->val, 1));
}
static int
__umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
{
struct timespec *ts, timeout;
int error;
/* Allow a null timespec (wait forever). */
if (uap->uaddr2 == NULL)
ts = NULL;
else {
error = copyin(uap->uaddr2, &timeout,
sizeof(timeout));
if (error != 0)
return (error);
if (timeout.tv_nsec >= 1000000000 ||
timeout.tv_nsec < 0) {
return (EINVAL);
}
ts = &timeout;
}
return do_lock_umutex(td, uap->obj, ts, 0);
}
static int
__umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
{
return do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY);
}
static int
__umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
{
struct timespec *ts, timeout;
int error;
/* Allow a null timespec (wait forever). */
if (uap->uaddr2 == NULL)
ts = NULL;
else {
error = copyin(uap->uaddr2, &timeout,
sizeof(timeout));
if (error != 0)
return (error);
if (timeout.tv_nsec >= 1000000000 ||
timeout.tv_nsec < 0) {
return (EINVAL);
}
ts = &timeout;
}
return do_lock_umutex(td, uap->obj, ts, _UMUTEX_WAIT);
}
static int
__umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
{
return do_wake_umutex(td, uap->obj);
}
static int
__umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
{
return do_unlock_umutex(td, uap->obj);
}
static int
__umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
{
return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
}
static int
__umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
{
struct timespec *ts, timeout;
int error;
/* Allow a null timespec (wait forever). */
if (uap->uaddr2 == NULL)
ts = NULL;
else {
error = copyin(uap->uaddr2, &timeout,
sizeof(timeout));
if (error != 0)
return (error);
if (timeout.tv_nsec >= 1000000000 ||
timeout.tv_nsec < 0) {
return (EINVAL);
}
ts = &timeout;
}
return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
}
static int
__umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
{
return do_cv_signal(td, uap->obj);
}
static int
__umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
{
return do_cv_broadcast(td, uap->obj);
}
static int
__umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
{
struct timespec timeout;
int error;
/* Allow a null timespec (wait forever). */
if (uap->uaddr2 == NULL) {
error = do_rw_rdlock(td, uap->obj, uap->val, 0);
} else {
error = copyin(uap->uaddr2, &timeout,
sizeof(timeout));
if (error != 0)
return (error);
if (timeout.tv_nsec >= 1000000000 ||
timeout.tv_nsec < 0) {
return (EINVAL);
}
error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
}
return (error);
}
static int
__umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
{
struct timespec timeout;
int error;
/* Allow a null timespec (wait forever). */
if (uap->uaddr2 == NULL) {
error = do_rw_wrlock(td, uap->obj, 0);
} else {
error = copyin(uap->uaddr2, &timeout,
sizeof(timeout));
if (error != 0)
return (error);
if (timeout.tv_nsec >= 1000000000 ||
timeout.tv_nsec < 0) {
return (EINVAL);
}
error = do_rw_wrlock2(td, uap->obj, &timeout);
}
return (error);
}
static int
__umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
{
return do_rw_unlock(td, uap->obj);
}
static int
__umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
{
struct timespec *ts, timeout;
int error;
/* Allow a null timespec (wait forever). */
if (uap->uaddr2 == NULL)
ts = NULL;
else {
error = copyin(uap->uaddr2, &timeout,
sizeof(timeout));
if (error != 0)
return (error);
if (timeout.tv_nsec >= 1000000000 ||
timeout.tv_nsec < 0) {
return (EINVAL);
}
ts = &timeout;
}
return (do_sem_wait(td, uap->obj, ts));
}
static int
__umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
{
return do_sem_wake(td, uap->obj);
}
typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
static _umtx_op_func op_table[] = {
__umtx_op_lock_umtx, /* UMTX_OP_LOCK */
__umtx_op_unlock_umtx, /* UMTX_OP_UNLOCK */
__umtx_op_wait, /* UMTX_OP_WAIT */
__umtx_op_wake, /* UMTX_OP_WAKE */
__umtx_op_trylock_umutex, /* UMTX_OP_MUTEX_TRYLOCK */
__umtx_op_lock_umutex, /* UMTX_OP_MUTEX_LOCK */
__umtx_op_unlock_umutex, /* UMTX_OP_MUTEX_UNLOCK */
__umtx_op_set_ceiling, /* UMTX_OP_SET_CEILING */
__umtx_op_cv_wait, /* UMTX_OP_CV_WAIT*/
__umtx_op_cv_signal, /* UMTX_OP_CV_SIGNAL */
__umtx_op_cv_broadcast, /* UMTX_OP_CV_BROADCAST */
__umtx_op_wait_uint, /* UMTX_OP_WAIT_UINT */
__umtx_op_rw_rdlock, /* UMTX_OP_RW_RDLOCK */
__umtx_op_rw_wrlock, /* UMTX_OP_RW_WRLOCK */
__umtx_op_rw_unlock, /* UMTX_OP_RW_UNLOCK */
__umtx_op_wait_uint_private, /* UMTX_OP_WAIT_UINT_PRIVATE */
__umtx_op_wake_private, /* UMTX_OP_WAKE_PRIVATE */
__umtx_op_wait_umutex, /* UMTX_OP_UMUTEX_WAIT */
__umtx_op_wake_umutex, /* UMTX_OP_UMUTEX_WAKE */
__umtx_op_sem_wait, /* UMTX_OP_SEM_WAIT */
__umtx_op_sem_wake, /* UMTX_OP_SEM_WAKE */
__umtx_op_nwake_private /* UMTX_OP_NWAKE_PRIVATE */
};
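/* Top-level umtx system call: dispatch uap->op through op_table. */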
int
-_umtx_op(struct thread *td, struct _umtx_op_args *uap)
+sys__umtx_op(struct thread *td, struct _umtx_op_args *uap)
{
if ((unsigned)uap->op < UMTX_OP_MAX)
return (*op_table[uap->op])(td, uap);
return (EINVAL);
}
#ifdef COMPAT_FREEBSD32
int
freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
/* struct umtx *umtx */
{
return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
}
int
freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
/* struct umtx *umtx */
{
return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
}
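/* 32-bit layout of struct timespec used by the compat32 entry points. */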
struct timespec32 {
uint32_t tv_sec;
uint32_t tv_nsec;
};
static inline int
copyin_timeout32(void *addr, struct timespec *tsp)
{
struct timespec32 ts32;
int error;
error = copyin(addr, &ts32, sizeof(struct timespec32));
if (error == 0) {
tsp->tv_sec = ts32.tv_sec;
tsp->tv_nsec = ts32.tv_nsec;
}
return (error);
}
static int
__umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
{
struct timespec *ts, timeout;
int error;
/* Allow a null timespec (wait forever). */
if (uap->uaddr2 == NULL)
ts = NULL;
else {
error = copyin_timeout32(uap->uaddr2, &timeout);
if (error != 0)
return (error);
if (timeout.tv_nsec >= 1000000000 ||
timeout.tv_nsec < 0) {
return (EINVAL);
}
ts = &timeout;
}
return (do_lock_umtx32(td, uap->obj, uap->val, ts));
}
static int
__umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
{
return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
}
static int
__umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
{
struct timespec *ts, timeout;
int error;
if (uap->uaddr2 == NULL)
ts = NULL;
else {
error = copyin_timeout32(uap->uaddr2, &timeout);
if (error != 0)
return (error);
if (timeout.tv_nsec >= 1000000000 ||
timeout.tv_nsec < 0)
return (EINVAL);
ts = &timeout;
}
return do_wait(td, uap->obj, uap->val, ts, 1, 0);
}
static int
__umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
{
struct timespec *ts, timeout;
int error;
/* Allow a null timespec (wait forever). */
if (uap->uaddr2 == NULL)
ts = NULL;
else {
error = copyin_timeout32(uap->uaddr2, &timeout);
if (error != 0)
return (error);
if (timeout.tv_nsec >= 1000000000 ||
timeout.tv_nsec < 0)
return (EINVAL);
ts = &timeout;
}
return do_lock_umutex(td, uap->obj, ts, 0);
}
static int
__umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
{
struct timespec *ts, timeout;
int error;
/* Allow a null timespec (wait forever). */
if (uap->uaddr2 == NULL)
ts = NULL;
else {
error = copyin_timeout32(uap->uaddr2, &timeout);
if (error != 0)
return (error);
if (timeout.tv_nsec >= 1000000000 ||
timeout.tv_nsec < 0)
return (EINVAL);
ts = &timeout;
}
return do_lock_umutex(td, uap->obj, ts, _UMUTEX_WAIT);
}
static int
__umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
{
struct timespec *ts, timeout;
int error;
/* Allow a null timespec (wait forever). */
if (uap->uaddr2 == NULL)
ts = NULL;
else {
error = copyin_timeout32(uap->uaddr2, &timeout);
if (error != 0)
return (error);
if (timeout.tv_nsec >= 1000000000 ||
timeout.tv_nsec < 0)
return (EINVAL);
ts = &timeout;
}
return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
}
static int
__umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
{
struct timespec timeout;
int error;
/* Allow a null timespec (wait forever). */
if (uap->uaddr2 == NULL) {
error = do_rw_rdlock(td, uap->obj, uap->val, 0);
} else {
error = copyin_timeout32(uap->uaddr2, &timeout);
if (error != 0)
return (error);
if (timeout.tv_nsec >= 1000000000 ||
timeout.tv_nsec < 0) {
return (EINVAL);
}
error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
}
return (error);
}
static int
__umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
{
struct timespec timeout;
int error;
/* Allow a null timespec (wait forever). */
if (uap->uaddr2 == NULL) {
error = do_rw_wrlock(td, uap->obj, 0);
} else {
error = copyin_timeout32(uap->uaddr2, &timeout);
if (error != 0)
return (error);
if (timeout.tv_nsec >= 1000000000 ||
timeout.tv_nsec < 0) {
return (EINVAL);
}
error = do_rw_wrlock2(td, uap->obj, &timeout);
}
return (error);
}
static int
__umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
{
struct timespec *ts, timeout;
int error;
if (uap->uaddr2 == NULL)
ts = NULL;
else {
error = copyin_timeout32(uap->uaddr2, &timeout);
if (error != 0)
return (error);
if (timeout.tv_nsec >= 1000000000 ||
timeout.tv_nsec < 0)
return (EINVAL);
ts = &timeout;
}
return do_wait(td, uap->obj, uap->val, ts, 1, 1);
}
static int
__umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
{
struct timespec *ts, timeout;
int error;
/* Allow a null timespec (wait forever). */
if (uap->uaddr2 == NULL)
ts = NULL;
else {
error = copyin_timeout32(uap->uaddr2, &timeout);
if (error != 0)
return (error);
if (timeout.tv_nsec >= 1000000000 ||
timeout.tv_nsec < 0)
return (EINVAL);
ts = &timeout;
}
return (do_sem_wait(td, uap->obj, ts));
}
static int
__umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
{
int count = uap->val;
uint32_t uaddrs[BATCH_SIZE];
uint32_t **upp = (uint32_t **)uap->obj;
int tocopy;
int error = 0;
int i, pos = 0;
while (count > 0) {
tocopy = count;
if (tocopy > BATCH_SIZE)
tocopy = BATCH_SIZE;
error = copyin(upp+pos, uaddrs, tocopy * sizeof(uint32_t));
if (error != 0)
break;
for (i = 0; i < tocopy; ++i)
kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
INT_MAX, 1);
count -= tocopy;
pos += tocopy;
}
return (error);
}
static _umtx_op_func op_table_compat32[] = {
__umtx_op_lock_umtx_compat32, /* UMTX_OP_LOCK */
__umtx_op_unlock_umtx_compat32, /* UMTX_OP_UNLOCK */
__umtx_op_wait_compat32, /* UMTX_OP_WAIT */
__umtx_op_wake, /* UMTX_OP_WAKE */
__umtx_op_trylock_umutex, /* UMTX_OP_MUTEX_TRYLOCK */
__umtx_op_lock_umutex_compat32, /* UMTX_OP_MUTEX_LOCK */
__umtx_op_unlock_umutex, /* UMTX_OP_MUTEX_UNLOCK */
__umtx_op_set_ceiling, /* UMTX_OP_SET_CEILING */
__umtx_op_cv_wait_compat32, /* UMTX_OP_CV_WAIT*/
__umtx_op_cv_signal, /* UMTX_OP_CV_SIGNAL */
__umtx_op_cv_broadcast, /* UMTX_OP_CV_BROADCAST */
__umtx_op_wait_compat32, /* UMTX_OP_WAIT_UINT */
__umtx_op_rw_rdlock_compat32, /* UMTX_OP_RW_RDLOCK */
__umtx_op_rw_wrlock_compat32, /* UMTX_OP_RW_WRLOCK */
__umtx_op_rw_unlock, /* UMTX_OP_RW_UNLOCK */
__umtx_op_wait_uint_private_compat32, /* UMTX_OP_WAIT_UINT_PRIVATE */
__umtx_op_wake_private, /* UMTX_OP_WAKE_PRIVATE */
__umtx_op_wait_umutex_compat32, /* UMTX_OP_UMUTEX_WAIT */
__umtx_op_wake_umutex, /* UMTX_OP_UMUTEX_WAKE */
__umtx_op_sem_wait_compat32, /* UMTX_OP_SEM_WAIT */
__umtx_op_sem_wake, /* UMTX_OP_SEM_WAKE */
__umtx_op_nwake_private32 /* UMTX_OP_NWAKE_PRIVATE */
};
int
freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
{
if ((unsigned)uap->op < UMTX_OP_MAX)
return (*op_table_compat32[uap->op])(td,
(struct _umtx_op_args *)uap);
return (EINVAL);
}
#endif
void
umtx_thread_init(struct thread *td)
{
td->td_umtxq = umtxq_alloc();
td->td_umtxq->uq_thread = td;
}
void
umtx_thread_fini(struct thread *td)
{
umtxq_free(td->td_umtxq);
}
/*
* Called when a new thread is created, e.g. by fork().
*/
void
umtx_thread_alloc(struct thread *td)
{
struct umtx_q *uq;
uq = td->td_umtxq;
uq->uq_inherited_pri = PRI_MAX;
KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
KASSERT(uq->uq_thread == td, ("uq_thread != td"));
KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
}
/*
* exec() hook.
*/
static void
umtx_exec_hook(void *arg __unused, struct proc *p __unused,
struct image_params *imgp __unused)
{
umtx_thread_cleanup(curthread);
}
/*
* thread_exit() hook.
*/
void
umtx_thread_exit(struct thread *td)
{
umtx_thread_cleanup(td);
}
/*
* clean up umtx data.
*/
static void
umtx_thread_cleanup(struct thread *td)
{
struct umtx_q *uq;
struct umtx_pi *pi;
if ((uq = td->td_umtxq) == NULL)
return;
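/*
* Disown any priority-inheritance mutexes still contested by this
* thread and drop any inherited priority.
*/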
mtx_lock_spin(&umtx_lock);
uq->uq_inherited_pri = PRI_MAX;
while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
pi->pi_owner = NULL;
TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
}
mtx_unlock_spin(&umtx_lock);
thread_lock(td);
sched_lend_user_prio(td, PRI_MAX);
thread_unlock(td);
}
Index: head/sys/kern/kern_uuid.c
===================================================================
--- head/sys/kern/kern_uuid.c (revision 225616)
+++ head/sys/kern/kern_uuid.c (revision 225617)
@@ -1,369 +1,369 @@
/*-
* Copyright (c) 2002 Marcel Moolenaar
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sbuf.h>
#include <sys/socket.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/jail.h>
#include <sys/uuid.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <net/if_types.h>
#include <net/vnet.h>
/*
* See also:
* http://www.opengroup.org/dce/info/draft-leach-uuids-guids-01.txt
* http://www.opengroup.org/onlinepubs/009629399/apdxa.htm
*
* Note that the generator state is itself a UUID, but the time and clock
* sequence fields are written in the native byte order.
*/
CTASSERT(sizeof(struct uuid) == 16);
/* We use an alternative, more convenient representation in the generator. */
struct uuid_private {
union {
uint64_t ll; /* internal. */
struct {
uint32_t low;
uint16_t mid;
uint16_t hi;
} x;
} time;
uint16_t seq; /* Big-endian. */
uint16_t node[UUID_NODE_LEN>>1];
};
CTASSERT(sizeof(struct uuid_private) == 16);
static struct uuid_private uuid_last;
static struct mtx uuid_mutex;
MTX_SYSINIT(uuid_lock, &uuid_mutex, "UUID generator mutex lock", MTX_DEF);
/*
* Return the first MAC address we encounter or, if none was found,
* construct a sufficiently random multicast address. We don't try
* to return the same MAC address as previously returned. We always
* generate a new multicast address if no MAC address exists in the
* system.
* It would be nice to know if 'ifnet' or any of its sub-structures
* has been changed in any way. If not, we could simply skip the
* scan and safely return the MAC address we returned before.
*/
static void
uuid_node(uint16_t *node)
{
struct ifnet *ifp;
struct ifaddr *ifa;
struct sockaddr_dl *sdl;
int i;
CURVNET_SET(TD_TO_VNET(curthread));
IFNET_RLOCK_NOSLEEP();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
/* Walk the address list */
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
sdl = (struct sockaddr_dl*)ifa->ifa_addr;
if (sdl != NULL && sdl->sdl_family == AF_LINK &&
sdl->sdl_type == IFT_ETHER) {
/* Got a MAC address. */
bcopy(LLADDR(sdl), node, UUID_NODE_LEN);
IF_ADDR_UNLOCK(ifp);
IFNET_RUNLOCK_NOSLEEP();
CURVNET_RESTORE();
return;
}
}
IF_ADDR_UNLOCK(ifp);
}
IFNET_RUNLOCK_NOSLEEP();
for (i = 0; i < (UUID_NODE_LEN>>1); i++)
node[i] = (uint16_t)arc4random();
*((uint8_t*)node) |= 0x01;
CURVNET_RESTORE();
}
/*
* Get the current time as a 60 bit count of 100-nanosecond intervals
* since 00:00:00.00, October 15, 1582. We apply a magic offset to convert
* the Unix time since 00:00:00.00, January 1, 1970 to the date of the
* Gregorian reform to the Christian calendar.
*/
static uint64_t
uuid_time(void)
{
struct bintime bt;
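/* 0x01B21DD213814000 = 100ns intervals from 1582-10-15 to 1970-01-01. */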
uint64_t time = 0x01B21DD213814000LL;
bintime(&bt);
time += (uint64_t)bt.sec * 10000000LL;
time += (10000000LL * (uint32_t)(bt.frac >> 32)) >> 32;
return (time & ((1LL << 60) - 1LL));
}
struct uuid *
kern_uuidgen(struct uuid *store, size_t count)
{
struct uuid_private uuid;
uint64_t time;
size_t n;
mtx_lock(&uuid_mutex);
uuid_node(uuid.node);
time = uuid_time();
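/*
* Pick a fresh random clock sequence if this is the first UUID or the
* node changed; bump the sequence if time did not move forward.
*/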
if (uuid_last.time.ll == 0LL || uuid_last.node[0] != uuid.node[0] ||
uuid_last.node[1] != uuid.node[1] ||
uuid_last.node[2] != uuid.node[2])
uuid.seq = (uint16_t)arc4random() & 0x3fff;
else if (uuid_last.time.ll >= time)
uuid.seq = (uuid_last.seq + 1) & 0x3fff;
else
uuid.seq = uuid_last.seq;
uuid_last = uuid;
uuid_last.time.ll = (time + count - 1) & ((1LL << 60) - 1LL);
mtx_unlock(&uuid_mutex);
/* Set sequence and variant and deal with byte order. */
uuid.seq = htobe16(uuid.seq | 0x8000);
for (n = 0; n < count; n++) {
/* Set time and version (=1). */
uuid.time.x.low = (uint32_t)time;
uuid.time.x.mid = (uint16_t)(time >> 32);
uuid.time.x.hi = ((uint16_t)(time >> 48) & 0xfff) | (1 << 12);
store[n] = *(struct uuid *)&uuid;
time++;
}
return (store);
}
#ifndef _SYS_SYSPROTO_H_
struct uuidgen_args {
struct uuid *store;
int count;
};
#endif
int
-uuidgen(struct thread *td, struct uuidgen_args *uap)
+sys_uuidgen(struct thread *td, struct uuidgen_args *uap)
{
struct uuid *store;
size_t count;
int error;
/*
* Limit the number of UUIDs that can be created at the same time
* to some arbitrary number. This isn't really necessary, but I
* like to have some sort of upper-bound that's less than 2G :-)
* XXX probably needs to be tunable.
*/
if (uap->count < 1 || uap->count > 2048)
return (EINVAL);
count = uap->count;
store = malloc(count * sizeof(struct uuid), M_TEMP, M_WAITOK);
kern_uuidgen(store, count);
error = copyout(store, uap->store, count * sizeof(struct uuid));
free(store, M_TEMP);
return (error);
}
int
snprintf_uuid(char *buf, size_t sz, struct uuid *uuid)
{
struct uuid_private *id;
int cnt;
id = (struct uuid_private *)uuid;
cnt = snprintf(buf, sz, "%08x-%04x-%04x-%04x-%04x%04x%04x",
id->time.x.low, id->time.x.mid, id->time.x.hi, be16toh(id->seq),
be16toh(id->node[0]), be16toh(id->node[1]), be16toh(id->node[2]));
return (cnt);
}
int
printf_uuid(struct uuid *uuid)
{
char buf[38];
snprintf_uuid(buf, sizeof(buf), uuid);
return (printf("%s", buf));
}
int
sbuf_printf_uuid(struct sbuf *sb, struct uuid *uuid)
{
char buf[38];
snprintf_uuid(buf, sizeof(buf), uuid);
return (sbuf_printf(sb, "%s", buf));
}
/*
* Encode/Decode UUID into byte-stream.
* http://www.opengroup.org/dce/info/draft-leach-uuids-guids-01.txt
*
* 0 1 2 3
* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* | time_low |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* | time_mid | time_hi_and_version |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* |clk_seq_hi_res | clk_seq_low | node (0-1) |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* | node (2-5) |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*/
void
le_uuid_enc(void *buf, struct uuid const *uuid)
{
u_char *p;
int i;
p = buf;
le32enc(p, uuid->time_low);
le16enc(p + 4, uuid->time_mid);
le16enc(p + 6, uuid->time_hi_and_version);
p[8] = uuid->clock_seq_hi_and_reserved;
p[9] = uuid->clock_seq_low;
for (i = 0; i < _UUID_NODE_LEN; i++)
p[10 + i] = uuid->node[i];
}
void
le_uuid_dec(void const *buf, struct uuid *uuid)
{
u_char const *p;
int i;
p = buf;
uuid->time_low = le32dec(p);
uuid->time_mid = le16dec(p + 4);
uuid->time_hi_and_version = le16dec(p + 6);
uuid->clock_seq_hi_and_reserved = p[8];
uuid->clock_seq_low = p[9];
for (i = 0; i < _UUID_NODE_LEN; i++)
uuid->node[i] = p[10 + i];
}
void
be_uuid_enc(void *buf, struct uuid const *uuid)
{
u_char *p;
int i;
p = buf;
be32enc(p, uuid->time_low);
be16enc(p + 4, uuid->time_mid);
be16enc(p + 6, uuid->time_hi_and_version);
p[8] = uuid->clock_seq_hi_and_reserved;
p[9] = uuid->clock_seq_low;
for (i = 0; i < _UUID_NODE_LEN; i++)
p[10 + i] = uuid->node[i];
}
void
be_uuid_dec(void const *buf, struct uuid *uuid)
{
u_char const *p;
int i;
p = buf;
uuid->time_low = be32dec(p);
uuid->time_mid = be16dec(p + 4);
uuid->time_hi_and_version = be16dec(p + 6);
uuid->clock_seq_hi_and_reserved = p[8];
uuid->clock_seq_low = p[9];
for (i = 0; i < _UUID_NODE_LEN; i++)
uuid->node[i] = p[10 + i];
}
int
parse_uuid(const char *str, struct uuid *uuid)
{
u_int c[11];
int n;
/* An empty string represents a nil UUID. */
if (*str == '\0') {
bzero(uuid, sizeof(*uuid));
return (0);
}
/* The UUID string representation has a fixed length. */
if (strlen(str) != 36)
return (EINVAL);
/*
* We only work with "new" UUIDs. New UUIDs have the form:
* 01234567-89ab-cdef-0123-456789abcdef
* The so called "old" UUIDs, which we don't support, have the form:
* 0123456789ab.cd.ef.01.23.45.67.89.ab
*/
if (str[8] != '-')
return (EINVAL);
n = sscanf(str, "%8x-%4x-%4x-%2x%2x-%2x%2x%2x%2x%2x%2x", c + 0, c + 1,
c + 2, c + 3, c + 4, c + 5, c + 6, c + 7, c + 8, c + 9, c + 10);
/* Make sure we have all conversions. */
if (n != 11)
return (EINVAL);
/* Successful scan. Build the UUID. */
uuid->time_low = c[0];
uuid->time_mid = c[1];
uuid->time_hi_and_version = c[2];
uuid->clock_seq_hi_and_reserved = c[3];
uuid->clock_seq_low = c[4];
for (n = 0; n < 6; n++)
uuid->node[n] = c[n + 5];
/* Check semantics... */
return (((c[3] & 0x80) != 0x00 && /* variant 0? */
(c[3] & 0xc0) != 0x80 && /* variant 1? */
(c[3] & 0xe0) != 0xc0) ? EINVAL : 0); /* variant 2? */
}
Index: head/sys/kern/makesyscalls.sh
===================================================================
--- head/sys/kern/makesyscalls.sh (revision 225616)
+++ head/sys/kern/makesyscalls.sh (revision 225617)
@@ -1,620 +1,636 @@
#! /bin/sh -
# @(#)makesyscalls.sh 8.1 (Berkeley) 6/10/93
# $FreeBSD$
set -e
# name of compat options:
compat=COMPAT_43
compat4=COMPAT_FREEBSD4
compat6=COMPAT_FREEBSD6
compat7=COMPAT_FREEBSD7
# output files:
sysnames="syscalls.c"
sysproto="../sys/sysproto.h"
sysproto_h=_SYS_SYSPROTO_H_
syshdr="../sys/syscall.h"
sysmk="../sys/syscall.mk"
syssw="init_sysent.c"
syscallprefix="SYS_"
switchname="sysent"
namesname="syscallnames"
systrace="systrace_args.c"
# tmp files:
sysaue="sysent.aue.$$"
sysdcl="sysent.dcl.$$"
syscompat="sysent.compat.$$"
syscompatdcl="sysent.compatdcl.$$"
syscompat4="sysent.compat4.$$"
syscompat4dcl="sysent.compat4dcl.$$"
syscompat6="sysent.compat6.$$"
syscompat6dcl="sysent.compat6dcl.$$"
syscompat7="sysent.compat7.$$"
syscompat7dcl="sysent.compat7dcl.$$"
sysent="sysent.switch.$$"
sysinc="sysinc.switch.$$"
sysarg="sysarg.switch.$$"
sysprotoend="sysprotoend.$$"
systracetmp="systrace.$$"
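# Syscalls named in capabilities.conf get the SYF_CAPENABLED flag in the
# generated sysent table.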
if [ -r capabilities.conf ]; then
capenabled=`cat capabilities.conf | grep -v "^#" | grep -v "^$"`
capenabled=`echo $capenabled | sed 's/ /,/g'`
else
capenabled=""
fi
trap "rm $sysaue $sysdcl $syscompat $syscompatdcl $syscompat4 $syscompat4dcl $syscompat6 $syscompat6dcl $syscompat7 $syscompat7dcl $sysent $sysinc $sysarg $sysprotoend $systracetmp" 0
touch $sysaue $sysdcl $syscompat $syscompatdcl $syscompat4 $syscompat4dcl $syscompat6 $syscompat6dcl $syscompat7 $syscompat7dcl $sysent $sysinc $sysarg $sysprotoend $systracetmp
case $# in
0) echo "usage: $0 input-file <config-file>" 1>&2
exit 1
;;
esac
if [ -n "$2" -a -f "$2" ]; then
. $2
fi
sed -e '
s/\$//g
:join
/\\$/{a\
N
s/\\\n//
b join
}
2,${
/^#/!s/\([{}()*,]\)/ \1 /g
}
' < $1 | awk "
BEGIN {
sysaue = \"$sysaue\"
sysdcl = \"$sysdcl\"
sysproto = \"$sysproto\"
sysprotoend = \"$sysprotoend\"
sysproto_h = \"$sysproto_h\"
syscompat = \"$syscompat\"
syscompatdcl = \"$syscompatdcl\"
syscompat4 = \"$syscompat4\"
syscompat4dcl = \"$syscompat4dcl\"
syscompat6 = \"$syscompat6\"
syscompat6dcl = \"$syscompat6dcl\"
syscompat7 = \"$syscompat7\"
syscompat7dcl = \"$syscompat7dcl\"
sysent = \"$sysent\"
syssw = \"$syssw\"
sysinc = \"$sysinc\"
sysarg = \"$sysarg\"
sysnames = \"$sysnames\"
syshdr = \"$syshdr\"
sysmk = \"$sysmk\"
systrace = \"$systrace\"
systracetmp = \"$systracetmp\"
compat = \"$compat\"
compat4 = \"$compat4\"
compat6 = \"$compat6\"
compat7 = \"$compat7\"
syscallprefix = \"$syscallprefix\"
switchname = \"$switchname\"
namesname = \"$namesname\"
infile = \"$1\"
capenabled_string = \"$capenabled\"
"'
split(capenabled_string, capenabled, ",");
printf "/*\n * System call switch table.\n *\n" > syssw
printf " * DO NOT EDIT-- this file is automatically generated.\n" > syssw
printf " * $%s$\n", "FreeBSD" > syssw
printf "/*\n * System call prototypes.\n *\n" > sysarg
printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysarg
printf " * $%s$\n", "FreeBSD" > sysarg
printf "\n#ifdef %s\n\n", compat > syscompat
printf "\n#ifdef %s\n\n", compat4 > syscompat4
printf "\n#ifdef %s\n\n", compat6 > syscompat6
printf "\n#ifdef %s\n\n", compat7 > syscompat7
printf "/*\n * System call names.\n *\n" > sysnames
printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysnames
printf " * $%s$\n", "FreeBSD" > sysnames
printf "/*\n * System call numbers.\n *\n" > syshdr
printf " * DO NOT EDIT-- this file is automatically generated.\n" > syshdr
printf " * $%s$\n", "FreeBSD" > syshdr
printf "# FreeBSD system call names.\n" > sysmk
printf "# DO NOT EDIT-- this file is automatically generated.\n" > sysmk
printf "# $%s$\n", "FreeBSD" > sysmk
printf "/*\n * System call argument to DTrace register array conversion.\n *\n" > systrace
printf " * DO NOT EDIT-- this file is automatically generated.\n" > systrace
printf " * $%s$\n", "FreeBSD" > systrace
}
NR == 1 {
gsub("[$]FreeBSD: ", "", $0)
gsub(" [$]", "", $0)
printf " * created from%s\n */\n\n", $0 > syssw
printf "\n/* The casts are bogus but will do for now. */\n" > sysent
printf "struct sysent %s[] = {\n",switchname > sysent
printf " * created from%s\n */\n\n", $0 > sysarg
printf "#ifndef %s\n", sysproto_h > sysarg
printf "#define\t%s\n\n", sysproto_h > sysarg
printf "#include <sys/signal.h>\n" > sysarg
printf "#include <sys/acl.h>\n" > sysarg
printf "#include <sys/cpuset.h>\n" > sysarg
printf "#include <sys/_semaphore.h>\n" > sysarg
printf "#include <sys/ucontext.h>\n\n" > sysarg
printf "#include <bsm/audit_kevents.h>\n\n" > sysarg
printf "struct proc;\n\n" > sysarg
printf "struct thread;\n\n" > sysarg
printf "#define\tPAD_(t)\t(sizeof(register_t) <= sizeof(t) ? \\\n" > sysarg
printf "\t\t0 : sizeof(register_t) - sizeof(t))\n\n" > sysarg
printf "#if BYTE_ORDER == LITTLE_ENDIAN\n"> sysarg
printf "#define\tPADL_(t)\t0\n" > sysarg
printf "#define\tPADR_(t)\tPAD_(t)\n" > sysarg
printf "#else\n" > sysarg
printf "#define\tPADL_(t)\tPAD_(t)\n" > sysarg
printf "#define\tPADR_(t)\t0\n" > sysarg
printf "#endif\n\n" > sysarg
printf " * created from%s\n */\n\n", $0 > sysnames
printf "const char *%s[] = {\n", namesname > sysnames
printf " * created from%s\n */\n\n", $0 > syshdr
printf "# created from%s\nMIASM = ", $0 > sysmk
printf " * This file is part of the DTrace syscall provider.\n */\n\n" > systrace
printf "static void\nsystrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args)\n{\n" > systrace
printf "\tint64_t *iarg = (int64_t *) uarg;\n" > systrace
printf "\tswitch (sysnum) {\n" > systrace
printf "static void\nsystrace_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)\n{\n\tconst char *p = NULL;\n" > systracetmp
printf "\tswitch (sysnum) {\n" > systracetmp
next
}
NF == 0 || $1 ~ /^;/ {
next
}
$1 ~ /^#[ ]*include/ {
print > sysinc
next
}
$1 ~ /^#[ ]*if/ {
print > sysent
print > sysdcl
print > sysarg
print > syscompat
print > syscompat4
print > syscompat6
print > syscompat7
print > sysnames
print > systrace
print > systracetmp
savesyscall = syscall
next
}
$1 ~ /^#[ ]*else/ {
print > sysent
print > sysdcl
print > sysarg
print > syscompat
print > syscompat4
print > syscompat6
print > syscompat7
print > sysnames
print > systrace
print > systracetmp
syscall = savesyscall
next
}
$1 ~ /^#/ {
print > sysent
print > sysdcl
print > sysarg
print > syscompat
print > syscompat4
print > syscompat6
print > syscompat7
print > sysnames
print > systrace
print > systracetmp
next
}
syscall != $1 {
printf "%s: line %d: syscall number out of sync at %d\n",
infile, NR, syscall
printf "line is:\n"
print
exit 1
}
# Returns true if the type "name" is the first flag in the type field
function type(name, flags, n) {
n = split($3, flags, /\|/)
return (n > 0 && flags[1] == name)
}
# Returns true if the flag "name" is set in the type field
function flag(name, flags, i, n) {
n = split($3, flags, /\|/)
for (i = 1; i <= n; i++)
if (flags[i] == name)
return 1
return 0
}
function align_sysent_comment(column) {
printf("\t") > sysent
column = column + 8 - column % 8
while (column < 56) {
printf("\t") > sysent
column = column + 8
}
}
function parserr(was, wanted) {
printf "%s: line %d: unexpected %s (expected %s)\n",
infile, NR, was, wanted
exit 1
}
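# Parse one syscall definition line, filling in funcname, the argument
# type/name arrays and the argument-structure alias used for sysproto.h.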
function parseline() {
f=4 # toss number, type, audit event
argc= 0;
argssize = "0"
thr_flag = "SY_THR_STATIC"
if (flag("NOTSTATIC")) {
thr_flag = "SY_THR_ABSENT"
}
if ($NF != "}") {
funcalias=$(NF-2)
argalias=$(NF-1)
rettype=$NF
end=NF-3
} else {
funcalias=""
argalias=""
rettype="int"
end=NF
}
if (flag("NODEF")) {
auditev="AUE_NULL"
funcname=$4
argssize = "AS(" $6 ")"
return
}
if ($f != "{")
parserr($f, "{")
f++
if ($end != "}")
parserr($end, "}")
end--
if ($end != ";")
parserr($end, ";")
end--
if ($end != ")")
parserr($end, ")")
end--
f++ #function return type
funcname=$f
#
# We now know the func name, so define a flags field for it.
# Do this before any other processing as we may return early
# from it.
#
for (cap in capenabled) {
if (funcname == capenabled[cap]) {
flags = "SYF_CAPENABLED";
}
}
if (funcalias == "")
funcalias = funcname
if (argalias == "") {
argalias = funcname "_args"
if (flag("COMPAT"))
argalias = "o" argalias
if (flag("COMPAT4"))
argalias = "freebsd4_" argalias
if (flag("COMPAT6"))
argalias = "freebsd6_" argalias
if (flag("COMPAT7"))
argalias = "freebsd7_" argalias
}
f++
if ($f != "(")
parserr($f, "(")
f++
if (f == end) {
if ($f != "void")
parserr($f, "argument definition")
return
}
while (f <= end) {
argc++
argtype[argc]=""
oldf=""
while (f < end && $(f+1) != ",") {
if (argtype[argc] != "" && oldf != "*")
argtype[argc] = argtype[argc]" ";
argtype[argc] = argtype[argc]$f;
oldf = $f;
f++
}
if (argtype[argc] == "")
parserr($f, "argument definition")
argname[argc]=$f;
f += 2; # skip name, and any comma
}
if (argc != 0)
argssize = "AS(" argalias ")"
}
{ comment = $4
if (NF < 7)
for (i = 5; i <= NF; i++)
comment = comment " " $i
}
#
# The AUE_ audit event identifier.
#
{
auditev = $2;
}
#
# The flags, if any.
#
{
flags = "0";
}
type("STD") || type("NODEF") || type("NOARGS") || type("NOPROTO") \
|| type("NOSTD") {
parseline()
printf("\t/* %s */\n\tcase %d: {\n", funcname, syscall) > systrace
printf("\t/* %s */\n\tcase %d:\n", funcname, syscall) > systracetmp
if (argc > 0) {
printf("\t\tswitch(ndx) {\n") > systracetmp
printf("\t\tstruct %s *p = params;\n", argalias) > systrace
for (i = 1; i <= argc; i++) {
printf("\t\tcase %d:\n\t\t\tp = \"%s\";\n\t\t\tbreak;\n", i - 1, argtype[i]) > systracetmp
if (index(argtype[i], "*") > 0 || argtype[i] == "caddr_t")
printf("\t\tuarg[%d] = (intptr_t) p->%s; /* %s */\n", \
i - 1, \
argname[i], argtype[i]) > systrace
else if (substr(argtype[i], 1, 1) == "u" || argtype[i] == "size_t")
printf("\t\tuarg[%d] = p->%s; /* %s */\n", \
i - 1, \
argname[i], argtype[i]) > systrace
else
printf("\t\tiarg[%d] = p->%s; /* %s */\n", \
i - 1, \
argname[i], argtype[i]) > systrace
}
printf("\t\tdefault:\n\t\t\tbreak;\n\t\t};\n") > systracetmp
}
printf("\t\t*n_args = %d;\n\t\tbreak;\n\t}\n", argc) > systrace
printf("\t\tbreak;\n") > systracetmp
if (argc != 0 && !flag("NOARGS") && !flag("NOPROTO") && \
!flag("NODEF")) {
printf("struct %s {\n", argalias) > sysarg
for (i = 1; i <= argc; i++)
printf("\tchar %s_l_[PADL_(%s)]; " \
"%s %s; char %s_r_[PADR_(%s)];\n",
argname[i], argtype[i],
argtype[i], argname[i],
argname[i], argtype[i]) > sysarg
printf("};\n") > sysarg
}
else if (!flag("NOARGS") && !flag("NOPROTO") && !flag("NODEF"))
printf("struct %s {\n\tregister_t dummy;\n};\n",
argalias) > sysarg
if (!flag("NOPROTO") && !flag("NODEF")) {
- printf("%s\t%s(struct thread *, struct %s *)",
- rettype, funcname, argalias) > sysdcl
+ if (funcname == "nosys" || funcname == "lkmnosys" ||
+ funcname == "sysarch" || funcname ~ /^freebsd/ ||
+ funcname ~ /^linux/ || funcname ~ /^svr4/ ||
+ funcname ~ /^ibcs2/ || funcname ~ /^xenix/) {
+ printf("%s\t%s(struct thread *, struct %s *)",
+ rettype, funcname, argalias) > sysdcl
+ } else {
+ printf("%s\tsys_%s(struct thread *, struct %s *)",
+ rettype, funcname, argalias) > sysdcl
+ }
printf(";\n") > sysdcl
printf("#define\t%sAUE_%s\t%s\n", syscallprefix,
funcalias, auditev) > sysaue
}
printf("\t{ %s, (sy_call_t *)", argssize) > sysent
column = 8 + 2 + length(argssize) + 15
if (flag("NOSTD")) {
printf("%s },", "lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT") > sysent
column = column + length("lkmressys") + length("AUE_NULL") + 3
} else {
- printf("%s, %s, NULL, 0, 0, %s, %s },", funcname, auditev, flags, thr_flag) > sysent
- column = column + length(funcname) + length(auditev) + length(flags) + 3
+ if (funcname == "nosys" || funcname == "sysarch" ||
+ funcname == "lkmnosys" || funcname ~ /^freebsd/ ||
+ funcname ~ /^linux/ || funcname ~ /^svr4/ ||
+ funcname ~ /^ibcs2/ || funcname ~ /^xenix/) {
+ printf("%s, %s, NULL, 0, 0, %s, %s },", funcname, auditev, flags, thr_flag) > sysent
+ column = column + length(funcname) + length(auditev) + length(flags) + 3
+ } else {
+ printf("sys_%s, %s, NULL, 0, 0, %s, %s },", funcname, auditev, flags, thr_flag) > sysent
+ column = column + length(funcname) + length(auditev) + length(flags) + 3 + 4
+ }
}
align_sysent_comment(column)
printf("/* %d = %s */\n", syscall, funcalias) > sysent
printf("\t\"%s\",\t\t\t/* %d = %s */\n",
funcalias, syscall, funcalias) > sysnames
if (!flag("NODEF")) {
printf("#define\t%s%s\t%d\n", syscallprefix,
funcalias, syscall) > syshdr
printf(" \\\n\t%s.o", funcalias) > sysmk
}
syscall++
next
}
type("COMPAT") || type("COMPAT4") || type("COMPAT6") || \
type("COMPAT7") {
if (flag("COMPAT")) {
ncompat++
out = syscompat
outdcl = syscompatdcl
wrap = "compat"
prefix = "o"
descr = "old"
} else if (flag("COMPAT4")) {
ncompat4++
out = syscompat4
outdcl = syscompat4dcl
wrap = "compat4"
prefix = "freebsd4_"
descr = "freebsd4"
} else if (flag("COMPAT6")) {
ncompat6++
out = syscompat6
outdcl = syscompat6dcl
wrap = "compat6"
prefix = "freebsd6_"
descr = "freebsd6"
} else if (flag("COMPAT7")) {
ncompat7++
out = syscompat7
outdcl = syscompat7dcl
wrap = "compat7"
prefix = "freebsd7_"
descr = "freebsd7"
}
parseline()
if (argc != 0 && !flag("NOARGS") && !flag("NOPROTO") && \
!flag("NODEF")) {
printf("struct %s {\n", argalias) > out
for (i = 1; i <= argc; i++)
printf("\tchar %s_l_[PADL_(%s)]; %s %s; " \
"char %s_r_[PADR_(%s)];\n",
argname[i], argtype[i],
argtype[i], argname[i],
argname[i], argtype[i]) > out
printf("};\n") > out
}
else if (!flag("NOARGS") && !flag("NOPROTO") && !flag("NODEF"))
printf("struct %s {\n\tregister_t dummy;\n};\n",
argalias) > sysarg
if (!flag("NOPROTO") && !flag("NODEF")) {
printf("%s\t%s%s(struct thread *, struct %s *);\n",
rettype, prefix, funcname, argalias) > outdcl
printf("#define\t%sAUE_%s%s\t%s\n", syscallprefix,
prefix, funcname, auditev) > sysaue
}
if (flag("NOSTD")) {
printf("\t{ %s, (sy_call_t *)%s, %s, NULL, 0, 0, 0, SY_THR_ABSENT },",
"0", "lkmressys", "AUE_NULL") > sysent
align_sysent_comment(8 + 2 + length("0") + 15 + \
length("lkmressys") + length("AUE_NULL") + 3)
} else {
printf("\t{ %s(%s,%s), %s, NULL, 0, 0, %s, %s },",
wrap, argssize, funcname, auditev, flags, thr_flag) > sysent
align_sysent_comment(8 + 9 + length(argssize) + 1 + \
length(funcname) + length(auditev) + \
length(flags) + 4)
}
printf("/* %d = %s %s */\n", syscall, descr, funcalias) > sysent
printf("\t\"%s.%s\",\t\t/* %d = %s %s */\n",
wrap, funcalias, syscall, descr, funcalias) > sysnames
if (flag("COMPAT")) {
printf("\t\t\t\t/* %d is old %s */\n",
syscall, funcalias) > syshdr
} else if (!flag("NODEF")) {
printf("#define\t%s%s%s\t%d\n", syscallprefix,
prefix, funcalias, syscall) > syshdr
printf(" \\\n\t%s%s.o", prefix, funcalias) > sysmk
}
syscall++
next
}
type("OBSOL") {
printf("\t{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT },") > sysent
align_sysent_comment(34)
printf("/* %d = obsolete %s */\n", syscall, comment) > sysent
printf("\t\"obs_%s\",\t\t\t/* %d = obsolete %s */\n",
$4, syscall, comment) > sysnames
printf("\t\t\t\t/* %d is obsolete %s */\n",
syscall, comment) > syshdr
syscall++
next
}
type("UNIMPL") {
printf("\t{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT },\t\t\t/* %d = %s */\n",
syscall, comment) > sysent
printf("\t\"#%d\",\t\t\t/* %d = %s */\n",
syscall, syscall, comment) > sysnames
syscall++
next
}
{
printf "%s: line %d: unrecognized keyword %s\n", infile, NR, $3
exit 1
}
END {
printf "\n#define AS(name) (sizeof(struct name) / sizeof(register_t))\n" > sysinc
if (ncompat != 0 || ncompat4 != 0 || ncompat6 != 0 || ncompat7 != 0)
printf "#include \"opt_compat.h\"\n\n" > syssw
if (ncompat != 0) {
printf "\n#ifdef %s\n", compat > sysinc
printf "#define compat(n, name) n, (sy_call_t *)__CONCAT(o,name)\n" > sysinc
printf "#else\n" > sysinc
printf "#define compat(n, name) 0, (sy_call_t *)nosys\n" > sysinc
printf "#endif\n" > sysinc
}
if (ncompat4 != 0) {
printf "\n#ifdef %s\n", compat4 > sysinc
printf "#define compat4(n, name) n, (sy_call_t *)__CONCAT(freebsd4_,name)\n" > sysinc
printf "#else\n" > sysinc
printf "#define compat4(n, name) 0, (sy_call_t *)nosys\n" > sysinc
printf "#endif\n" > sysinc
}
if (ncompat6 != 0) {
printf "\n#ifdef %s\n", compat6 > sysinc
printf "#define compat6(n, name) n, (sy_call_t *)__CONCAT(freebsd6_,name)\n" > sysinc
printf "#else\n" > sysinc
printf "#define compat6(n, name) 0, (sy_call_t *)nosys\n" > sysinc
printf "#endif\n" > sysinc
}
if (ncompat7 != 0) {
printf "\n#ifdef %s\n", compat7 > sysinc
printf "#define compat7(n, name) n, (sy_call_t *)__CONCAT(freebsd7_,name)\n" > sysinc
printf "#else\n" > sysinc
printf "#define compat7(n, name) 0, (sy_call_t *)nosys\n" > sysinc
printf "#endif\n" > sysinc
}
printf("\n#endif /* %s */\n\n", compat) > syscompatdcl
printf("\n#endif /* %s */\n\n", compat4) > syscompat4dcl
printf("\n#endif /* %s */\n\n", compat6) > syscompat6dcl
printf("\n#endif /* %s */\n\n", compat7) > syscompat7dcl
printf("\n#undef PAD_\n") > sysprotoend
printf("#undef PADL_\n") > sysprotoend
printf("#undef PADR_\n") > sysprotoend
printf("\n#endif /* !%s */\n", sysproto_h) > sysprotoend
printf("\n") > sysmk
printf("};\n") > sysent
printf("};\n") > sysnames
printf("#define\t%sMAXSYSCALL\t%d\n", syscallprefix, syscall) \
> syshdr
printf "\tdefault:\n\t\t*n_args = 0;\n\t\tbreak;\n\t};\n}\n" > systrace
printf "\tdefault:\n\t\tbreak;\n\t};\n\tif (p != NULL)\n\t\tstrlcpy(desc, p, descsz);\n}\n" > systracetmp
} '
cat $sysinc $sysent >> $syssw
cat $sysarg $sysdcl \
$syscompat $syscompatdcl \
$syscompat4 $syscompat4dcl \
$syscompat6 $syscompat6dcl \
$syscompat7 $syscompat7dcl \
$sysaue $sysprotoend > $sysproto
cat $systracetmp >> $systrace
Index: head/sys/kern/p1003_1b.c
===================================================================
--- head/sys/kern/p1003_1b.c (revision 225616)
+++ head/sys/kern/p1003_1b.c (revision 225617)
@@ -1,315 +1,315 @@
/*-
* Copyright (c) 1996, 1997, 1998
* HD Associates, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by HD Associates, Inc
* 4. Neither the name of the author nor the names of any co-contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/* p1003_1b: Real Time common code.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_posix.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/posix4.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/syslog.h>
#include <sys/sysproto.h>
MALLOC_DEFINE(M_P31B, "p1003.1b", "Posix 1003.1B");
/* The system calls return ENOSYS if an entry is called that is not supported
* at run time. I am also logging, since some programs start to use these
* entries when they shouldn't. That will be removed if it becomes annoying.
*/
int
syscall_not_present(struct thread *td, const char *s, struct nosys_args *uap)
{
log(LOG_ERR, "cmd %s pid %d tried to use non-present %s\n",
td->td_name, td->td_proc->p_pid, s);
/* a " return nosys(p, uap); " here causes a core dump.
*/
return ENOSYS;
}
#if !defined(_KPOSIX_PRIORITY_SCHEDULING)
/* Not configured but loadable via a module:
*/
static int
sched_attach(void)
{
return 0;
}
SYSCALL_NOT_PRESENT_GEN(sched_setparam)
SYSCALL_NOT_PRESENT_GEN(sched_getparam)
SYSCALL_NOT_PRESENT_GEN(sched_setscheduler)
SYSCALL_NOT_PRESENT_GEN(sched_getscheduler)
SYSCALL_NOT_PRESENT_GEN(sched_yield)
SYSCALL_NOT_PRESENT_GEN(sched_get_priority_max)
SYSCALL_NOT_PRESENT_GEN(sched_get_priority_min)
SYSCALL_NOT_PRESENT_GEN(sched_rr_get_interval)
#else
/* Configured in kernel version:
*/
static struct ksched *ksched;
static int
sched_attach(void)
{
int ret = ksched_attach(&ksched);
if (ret == 0)
p31b_setcfg(CTL_P1003_1B_PRIORITY_SCHEDULING, 200112L);
return ret;
}
int
-sched_setparam(struct thread *td, struct sched_setparam_args *uap)
+sys_sched_setparam(struct thread *td, struct sched_setparam_args *uap)
{
struct thread *targettd;
struct proc *targetp;
int e;
struct sched_param sched_param;
e = copyin(uap->param, &sched_param, sizeof(sched_param));
if (e)
return (e);
if (uap->pid == 0) {
targetp = td->td_proc;
targettd = td;
PROC_LOCK(targetp);
} else {
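/* pfind() returns the target process with its lock held. */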
targetp = pfind(uap->pid);
if (targetp == NULL)
return (ESRCH);
targettd = FIRST_THREAD_IN_PROC(targetp);
}
e = p_cansched(td, targetp);
if (e == 0) {
e = ksched_setparam(ksched, targettd,
(const struct sched_param *)&sched_param);
}
PROC_UNLOCK(targetp);
return (e);
}
int
-sched_getparam(struct thread *td, struct sched_getparam_args *uap)
+sys_sched_getparam(struct thread *td, struct sched_getparam_args *uap)
{
int e;
struct sched_param sched_param;
struct thread *targettd;
struct proc *targetp;
if (uap->pid == 0) {
targetp = td->td_proc;
targettd = td;
PROC_LOCK(targetp);
} else {
targetp = pfind(uap->pid);
if (targetp == NULL) {
return (ESRCH);
}
targettd = FIRST_THREAD_IN_PROC(targetp);
}
e = p_cansee(td, targetp);
if (e == 0) {
e = ksched_getparam(ksched, targettd, &sched_param);
}
PROC_UNLOCK(targetp);
if (e == 0)
e = copyout(&sched_param, uap->param, sizeof(sched_param));
return (e);
}
int
-sched_setscheduler(struct thread *td, struct sched_setscheduler_args *uap)
+sys_sched_setscheduler(struct thread *td, struct sched_setscheduler_args *uap)
{
int e;
struct sched_param sched_param;
struct thread *targettd;
struct proc *targetp;
/* Don't allow non root user to set a scheduler policy. */
e = priv_check(td, PRIV_SCHED_SET);
if (e)
return (e);
e = copyin(uap->param, &sched_param, sizeof(sched_param));
if (e)
return (e);
if (uap->pid == 0) {
targetp = td->td_proc;
targettd = td;
PROC_LOCK(targetp);
} else {
targetp = pfind(uap->pid);
if (targetp == NULL)
return (ESRCH);
targettd = FIRST_THREAD_IN_PROC(targetp);
}
e = p_cansched(td, targetp);
if (e == 0) {
e = ksched_setscheduler(ksched, targettd,
uap->policy, (const struct sched_param *)&sched_param);
}
PROC_UNLOCK(targetp);
return (e);
}
int
-sched_getscheduler(struct thread *td, struct sched_getscheduler_args *uap)
+sys_sched_getscheduler(struct thread *td, struct sched_getscheduler_args *uap)
{
int e, policy;
struct thread *targettd;
struct proc *targetp;
if (uap->pid == 0) {
targetp = td->td_proc;
targettd = td;
PROC_LOCK(targetp);
} else {
targetp = pfind(uap->pid);
if (targetp == NULL)
return (ESRCH);
targettd = FIRST_THREAD_IN_PROC(targetp);
}
e = p_cansee(td, targetp);
if (e == 0) {
e = ksched_getscheduler(ksched, targettd, &policy);
td->td_retval[0] = policy;
}
PROC_UNLOCK(targetp);
return (e);
}
int
-sched_yield(struct thread *td, struct sched_yield_args *uap)
+sys_sched_yield(struct thread *td, struct sched_yield_args *uap)
{
sched_relinquish(curthread);
return 0;
}
int
-sched_get_priority_max(struct thread *td,
+sys_sched_get_priority_max(struct thread *td,
struct sched_get_priority_max_args *uap)
{
int error, prio;
error = ksched_get_priority_max(ksched, uap->policy, &prio);
td->td_retval[0] = prio;
return (error);
}
int
-sched_get_priority_min(struct thread *td,
+sys_sched_get_priority_min(struct thread *td,
struct sched_get_priority_min_args *uap)
{
int error, prio;
error = ksched_get_priority_min(ksched, uap->policy, &prio);
td->td_retval[0] = prio;
return (error);
}
int
-sched_rr_get_interval(struct thread *td,
+sys_sched_rr_get_interval(struct thread *td,
struct sched_rr_get_interval_args *uap)
{
struct timespec timespec;
int error;
error = kern_sched_rr_get_interval(td, uap->pid, &timespec);
if (error == 0)
error = copyout(&timespec, uap->interval, sizeof(timespec));
return (error);
}
int
kern_sched_rr_get_interval(struct thread *td, pid_t pid,
struct timespec *ts)
{
int e;
struct thread *targettd;
struct proc *targetp;
if (pid == 0) {
targettd = td;
targetp = td->td_proc;
PROC_LOCK(targetp);
} else {
targetp = pfind(pid);
if (targetp == NULL)
return (ESRCH);
targettd = FIRST_THREAD_IN_PROC(targetp);
}
e = p_cansee(td, targetp);
if (e == 0)
e = ksched_rr_get_interval(ksched, targettd, ts);
PROC_UNLOCK(targetp);
return (e);
}
#endif
static void
p31binit(void *notused)
{
(void) sched_attach();
p31b_setcfg(CTL_P1003_1B_PAGESIZE, PAGE_SIZE);
}
SYSINIT(p31b, SI_SUB_P1003_1B, SI_ORDER_FIRST, p31binit, NULL);
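/*
 * Illustrative sketch (not part of this file): how the sys_sched_* entry
 * points above are reached from userland through the standard POSIX
 * interfaces.  Assumes a kernel built with _KPOSIX_PRIORITY_SCHEDULING;
 * otherwise the calls fail with ENOSYS via syscall_not_present() above.
 */
#if 0
#include <sched.h>
#include <stdio.h>

int
main(void)
{
	struct sched_param sp;

	sp.sched_priority = sched_get_priority_min(SCHED_FIFO);
	if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1)
		perror("sched_setscheduler");	/* EPERM without privilege */
	return (0);
}
#endif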
Index: head/sys/kern/subr_bus.c
===================================================================
--- head/sys/kern/subr_bus.c (revision 225616)
+++ head/sys/kern/subr_bus.c (revision 225617)
@@ -1,4742 +1,4742 @@
/*-
* Copyright (c) 1997,1998,2003 Doug Rabson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_bus.h"
#include <sys/param.h>
#include <sys/conf.h>
#include <sys/filio.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/condvar.h>
#include <sys/queue.h>
#include <machine/bus.h>
#include <sys/rman.h>
#include <sys/selinfo.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/uio.h>
#include <sys/bus.h>
#include <sys/interrupt.h>
#include <machine/stdarg.h>
#include <vm/uma.h>
SYSCTL_NODE(_hw, OID_AUTO, bus, CTLFLAG_RW, NULL, NULL);
SYSCTL_NODE(, OID_AUTO, dev, CTLFLAG_RW, NULL, NULL);
/*
* Used to attach drivers to devclasses.
*/
typedef struct driverlink *driverlink_t;
struct driverlink {
kobj_class_t driver;
TAILQ_ENTRY(driverlink) link; /* list of drivers in devclass */
int pass;
TAILQ_ENTRY(driverlink) passlink;
};
/*
* Forward declarations
*/
typedef TAILQ_HEAD(devclass_list, devclass) devclass_list_t;
typedef TAILQ_HEAD(driver_list, driverlink) driver_list_t;
typedef TAILQ_HEAD(device_list, device) device_list_t;
struct devclass {
TAILQ_ENTRY(devclass) link;
devclass_t parent; /* parent in devclass hierarchy */
driver_list_t drivers; /* bus devclasses store drivers for bus */
char *name;
device_t *devices; /* array of devices indexed by unit */
int maxunit; /* size of devices array */
int flags;
#define DC_HAS_CHILDREN 1
struct sysctl_ctx_list sysctl_ctx;
struct sysctl_oid *sysctl_tree;
};
/**
* @brief Implementation of device.
*/
struct device {
/*
* A device is a kernel object. The first field must be the
* current ops table for the object.
*/
KOBJ_FIELDS;
/*
* Device hierarchy.
*/
TAILQ_ENTRY(device) link; /**< list of devices in parent */
TAILQ_ENTRY(device) devlink; /**< global device list membership */
device_t parent; /**< parent of this device */
device_list_t children; /**< list of child devices */
/*
* Details of this device.
*/
driver_t *driver; /**< current driver */
devclass_t devclass; /**< current device class */
int unit; /**< current unit number */
char* nameunit; /**< name+unit e.g. foodev0 */
char* desc; /**< driver specific description */
int busy; /**< count of calls to device_busy() */
device_state_t state; /**< current device state */
uint32_t devflags; /**< api level flags for device_get_flags() */
u_int flags; /**< internal device flags */
#define DF_ENABLED 0x01 /* device should be probed/attached */
#define DF_FIXEDCLASS 0x02 /* devclass specified at create time */
#define DF_WILDCARD 0x04 /* unit was originally wildcard */
#define DF_DESCMALLOCED 0x08 /* description was malloced */
#define DF_QUIET 0x10 /* don't print verbose attach message */
#define DF_DONENOMATCH 0x20 /* don't execute DEVICE_NOMATCH again */
#define DF_EXTERNALSOFTC 0x40 /* softc not allocated by us */
#define DF_REBID 0x80 /* Can rebid after attach */
u_int order; /**< order from device_add_child_ordered() */
void *ivars; /**< instance variables */
void *softc; /**< current driver's variables */
struct sysctl_ctx_list sysctl_ctx; /**< state for sysctl variables */
struct sysctl_oid *sysctl_tree; /**< state for sysctl variables */
};
static MALLOC_DEFINE(M_BUS, "bus", "Bus data structures");
static MALLOC_DEFINE(M_BUS_SC, "bus-sc", "Bus data structures, softc");
#ifdef BUS_DEBUG
static int bus_debug = 1;
TUNABLE_INT("bus.debug", &bus_debug);
SYSCTL_INT(_debug, OID_AUTO, bus_debug, CTLFLAG_RW, &bus_debug, 0,
"Debug bus code");
#define PDEBUG(a) if (bus_debug) {printf("%s:%d: ", __func__, __LINE__), printf a; printf("\n");}
#define DEVICENAME(d) ((d)? device_get_name(d): "no device")
#define DRIVERNAME(d) ((d)? d->name : "no driver")
#define DEVCLANAME(d) ((d)? d->name : "no devclass")
/**
* Produce the indenting, indent*2 spaces plus a '.' ahead of that to
* prevent syslog from deleting initial spaces
*/
#define indentprintf(p) do { int iJ; printf("."); for (iJ=0; iJ<indent; iJ++) printf(" "); printf p ; } while (0)
static void print_device_short(device_t dev, int indent);
static void print_device(device_t dev, int indent);
void print_device_tree_short(device_t dev, int indent);
void print_device_tree(device_t dev, int indent);
static void print_driver_short(driver_t *driver, int indent);
static void print_driver(driver_t *driver, int indent);
static void print_driver_list(driver_list_t drivers, int indent);
static void print_devclass_short(devclass_t dc, int indent);
static void print_devclass(devclass_t dc, int indent);
void print_devclass_list_short(void);
void print_devclass_list(void);
#else
/* Make the compiler ignore the function calls */
#define PDEBUG(a) /* nop */
#define DEVICENAME(d) /* nop */
#define DRIVERNAME(d) /* nop */
#define DEVCLANAME(d) /* nop */
#define print_device_short(d,i) /* nop */
#define print_device(d,i) /* nop */
#define print_device_tree_short(d,i) /* nop */
#define print_device_tree(d,i) /* nop */
#define print_driver_short(d,i) /* nop */
#define print_driver(d,i) /* nop */
#define print_driver_list(d,i) /* nop */
#define print_devclass_short(d,i) /* nop */
#define print_devclass(d,i) /* nop */
#define print_devclass_list_short() /* nop */
#define print_devclass_list() /* nop */
#endif
/*
* dev sysctl tree
*/
enum {
DEVCLASS_SYSCTL_PARENT,
};
static int
devclass_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
devclass_t dc = (devclass_t)arg1;
const char *value;
switch (arg2) {
case DEVCLASS_SYSCTL_PARENT:
value = dc->parent ? dc->parent->name : "";
break;
default:
return (EINVAL);
}
return (SYSCTL_OUT(req, value, strlen(value)));
}
static void
devclass_sysctl_init(devclass_t dc)
{
if (dc->sysctl_tree != NULL)
return;
sysctl_ctx_init(&dc->sysctl_ctx);
dc->sysctl_tree = SYSCTL_ADD_NODE(&dc->sysctl_ctx,
SYSCTL_STATIC_CHILDREN(_dev), OID_AUTO, dc->name,
CTLFLAG_RD, NULL, "");
SYSCTL_ADD_PROC(&dc->sysctl_ctx, SYSCTL_CHILDREN(dc->sysctl_tree),
OID_AUTO, "%parent", CTLTYPE_STRING | CTLFLAG_RD,
dc, DEVCLASS_SYSCTL_PARENT, devclass_sysctl_handler, "A",
"parent class");
}
enum {
DEVICE_SYSCTL_DESC,
DEVICE_SYSCTL_DRIVER,
DEVICE_SYSCTL_LOCATION,
DEVICE_SYSCTL_PNPINFO,
DEVICE_SYSCTL_PARENT,
};
static int
device_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
device_t dev = (device_t)arg1;
const char *value;
char *buf;
int error;
buf = NULL;
switch (arg2) {
case DEVICE_SYSCTL_DESC:
value = dev->desc ? dev->desc : "";
break;
case DEVICE_SYSCTL_DRIVER:
value = dev->driver ? dev->driver->name : "";
break;
case DEVICE_SYSCTL_LOCATION:
value = buf = malloc(1024, M_BUS, M_WAITOK | M_ZERO);
bus_child_location_str(dev, buf, 1024);
break;
case DEVICE_SYSCTL_PNPINFO:
value = buf = malloc(1024, M_BUS, M_WAITOK | M_ZERO);
bus_child_pnpinfo_str(dev, buf, 1024);
break;
case DEVICE_SYSCTL_PARENT:
value = dev->parent ? dev->parent->nameunit : "";
break;
default:
return (EINVAL);
}
error = SYSCTL_OUT(req, value, strlen(value));
if (buf != NULL)
free(buf, M_BUS);
return (error);
}
static void
device_sysctl_init(device_t dev)
{
devclass_t dc = dev->devclass;
if (dev->sysctl_tree != NULL)
return;
devclass_sysctl_init(dc);
sysctl_ctx_init(&dev->sysctl_ctx);
dev->sysctl_tree = SYSCTL_ADD_NODE(&dev->sysctl_ctx,
SYSCTL_CHILDREN(dc->sysctl_tree), OID_AUTO,
dev->nameunit + strlen(dc->name),
CTLFLAG_RD, NULL, "");
SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree),
OID_AUTO, "%desc", CTLTYPE_STRING | CTLFLAG_RD,
dev, DEVICE_SYSCTL_DESC, device_sysctl_handler, "A",
"device description");
SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree),
OID_AUTO, "%driver", CTLTYPE_STRING | CTLFLAG_RD,
dev, DEVICE_SYSCTL_DRIVER, device_sysctl_handler, "A",
"device driver name");
SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree),
OID_AUTO, "%location", CTLTYPE_STRING | CTLFLAG_RD,
dev, DEVICE_SYSCTL_LOCATION, device_sysctl_handler, "A",
"device location relative to parent");
SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree),
OID_AUTO, "%pnpinfo", CTLTYPE_STRING | CTLFLAG_RD,
dev, DEVICE_SYSCTL_PNPINFO, device_sysctl_handler, "A",
"device identification");
SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree),
OID_AUTO, "%parent", CTLTYPE_STRING | CTLFLAG_RD,
dev, DEVICE_SYSCTL_PARENT, device_sysctl_handler, "A",
"parent device");
}
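/*
 * Illustrative sketch (not part of this file): the nodes registered above
 * appear under the "dev" sysctl tree as dev.<class>.<unit>.%desc, %driver,
 * %location, %pnpinfo and %parent.  "em" and unit 0 below are hypothetical;
 * substitute any attached device.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	char desc[128];
	size_t len = sizeof(desc);

	if (sysctlbyname("dev.em.0.%desc", desc, &len, NULL, 0) == 0)
		printf("%s\n", desc);
	return (0);
}
#endif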
static void
device_sysctl_update(device_t dev)
{
devclass_t dc = dev->devclass;
if (dev->sysctl_tree == NULL)
return;
sysctl_rename_oid(dev->sysctl_tree, dev->nameunit + strlen(dc->name));
}
static void
device_sysctl_fini(device_t dev)
{
if (dev->sysctl_tree == NULL)
return;
sysctl_ctx_free(&dev->sysctl_ctx);
dev->sysctl_tree = NULL;
}
/*
* /dev/devctl implementation
*/
/*
* This design allows only one reader for /dev/devctl. This is not desirable
* in the long run, but will get a lot of hair out of this implementation.
* Maybe we should make this device a clonable device.
*
* Also note: we specifically do not attach a device to the device_t tree
* to avoid potential chicken and egg problems. One could argue that all
* of this belongs to the root node. One could also further argue that the
* sysctl interface that we have now might more properly be an ioctl
* interface, but at this stage of the game, I'm not inclined to rock that
* boat.
*
* I'm also not sure that the SIGIO support is done correctly or not, as
* I copied it from a driver that had SIGIO support that likely hasn't been
* tested since 3.4 or 2.2.8!
*/
/* Deprecated way to adjust queue length */
static int sysctl_devctl_disable(SYSCTL_HANDLER_ARGS);
/* XXX Need to support old-style tunable "hw.bus.devctl_disable" */
SYSCTL_PROC(_hw_bus, OID_AUTO, devctl_disable, CTLTYPE_INT | CTLFLAG_RW, NULL,
0, sysctl_devctl_disable, "I", "devctl disable -- deprecated");
#define DEVCTL_DEFAULT_QUEUE_LEN 1000
static int sysctl_devctl_queue(SYSCTL_HANDLER_ARGS);
static int devctl_queue_length = DEVCTL_DEFAULT_QUEUE_LEN;
TUNABLE_INT("hw.bus.devctl_queue", &devctl_queue_length);
SYSCTL_PROC(_hw_bus, OID_AUTO, devctl_queue, CTLTYPE_INT | CTLFLAG_RW, NULL,
0, sysctl_devctl_queue, "I", "devctl queue length");
static d_open_t devopen;
static d_close_t devclose;
static d_read_t devread;
static d_ioctl_t devioctl;
static d_poll_t devpoll;
static struct cdevsw dev_cdevsw = {
.d_version = D_VERSION,
.d_flags = D_NEEDGIANT,
.d_open = devopen,
.d_close = devclose,
.d_read = devread,
.d_ioctl = devioctl,
.d_poll = devpoll,
.d_name = "devctl",
};
struct dev_event_info
{
char *dei_data;
TAILQ_ENTRY(dev_event_info) dei_link;
};
TAILQ_HEAD(devq, dev_event_info);
static struct dev_softc
{
int inuse;
int nonblock;
int queued;
struct mtx mtx;
struct cv cv;
struct selinfo sel;
struct devq devq;
struct proc *async_proc;
} devsoftc;
static struct cdev *devctl_dev;
static void
devinit(void)
{
devctl_dev = make_dev_credf(MAKEDEV_ETERNAL, &dev_cdevsw, 0, NULL,
UID_ROOT, GID_WHEEL, 0600, "devctl");
mtx_init(&devsoftc.mtx, "dev mtx", "devd", MTX_DEF);
cv_init(&devsoftc.cv, "dev cv");
TAILQ_INIT(&devsoftc.devq);
}
static int
devopen(struct cdev *dev, int oflags, int devtype, struct thread *td)
{
if (devsoftc.inuse)
return (EBUSY);
/* move to init */
devsoftc.inuse = 1;
devsoftc.nonblock = 0;
devsoftc.async_proc = NULL;
return (0);
}
static int
devclose(struct cdev *dev, int fflag, int devtype, struct thread *td)
{
devsoftc.inuse = 0;
mtx_lock(&devsoftc.mtx);
cv_broadcast(&devsoftc.cv);
mtx_unlock(&devsoftc.mtx);
devsoftc.async_proc = NULL;
return (0);
}
/*
* The read channel for this device is used to report changes to
* userland in realtime. We are required to free the data as well as
* the n1 object because we allocate them separately. Also note that
* we return one record at a time. If you try to read this device a
* character at a time, you will lose the rest of the data. Listening
* programs are expected to cope.
*/
static int
devread(struct cdev *dev, struct uio *uio, int ioflag)
{
struct dev_event_info *n1;
int rv;
mtx_lock(&devsoftc.mtx);
while (TAILQ_EMPTY(&devsoftc.devq)) {
if (devsoftc.nonblock) {
mtx_unlock(&devsoftc.mtx);
return (EAGAIN);
}
rv = cv_wait_sig(&devsoftc.cv, &devsoftc.mtx);
if (rv) {
/*
* Need to translate ERESTART to EINTR here? -- jake
*/
mtx_unlock(&devsoftc.mtx);
return (rv);
}
}
n1 = TAILQ_FIRST(&devsoftc.devq);
TAILQ_REMOVE(&devsoftc.devq, n1, dei_link);
devsoftc.queued--;
mtx_unlock(&devsoftc.mtx);
rv = uiomove(n1->dei_data, strlen(n1->dei_data), uio);
free(n1->dei_data, M_BUS);
free(n1, M_BUS);
return (rv);
}
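/*
 * Illustrative sketch (not part of this file): a minimal userland consumer of
 * /dev/devctl in the spirit of devd(8).  Each read() returns exactly one
 * event record, as noted above; a partial read loses the rest of the record.
 */
#if 0
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char buf[1025];
	ssize_t n;
	int fd;

	fd = open("/dev/devctl", O_RDONLY);	/* only one reader is allowed */
	if (fd == -1)
		return (1);
	while ((n = read(fd, buf, sizeof(buf) - 1)) > 0) {
		buf[n] = '\0';
		fputs(buf, stdout);		/* one event per read() */
	}
	close(fd);
	return (0);
}
#endif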
static int
devioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td)
{
switch (cmd) {
case FIONBIO:
if (*(int*)data)
devsoftc.nonblock = 1;
else
devsoftc.nonblock = 0;
return (0);
case FIOASYNC:
if (*(int*)data)
devsoftc.async_proc = td->td_proc;
else
devsoftc.async_proc = NULL;
return (0);
/* (un)Support for other fcntl() calls. */
case FIOCLEX:
case FIONCLEX:
case FIONREAD:
case FIOSETOWN:
case FIOGETOWN:
default:
break;
}
return (ENOTTY);
}
static int
devpoll(struct cdev *dev, int events, struct thread *td)
{
int revents = 0;
mtx_lock(&devsoftc.mtx);
if (events & (POLLIN | POLLRDNORM)) {
if (!TAILQ_EMPTY(&devsoftc.devq))
revents = events & (POLLIN | POLLRDNORM);
else
selrecord(td, &devsoftc.sel);
}
mtx_unlock(&devsoftc.mtx);
return (revents);
}
/**
* @brief Return whether the userland process is running
*/
boolean_t
devctl_process_running(void)
{
return (devsoftc.inuse == 1);
}
/**
* @brief Queue data to be read from the devctl device
*
* Generic interface to queue data to the devctl device. It is
* assumed that @p data is properly formatted. It is further assumed
* that @p data is allocated using the M_BUS malloc type.
*/
void
devctl_queue_data_f(char *data, int flags)
{
struct dev_event_info *n1 = NULL, *n2 = NULL;
struct proc *p;
if (strlen(data) == 0)
goto out;
if (devctl_queue_length == 0)
goto out;
n1 = malloc(sizeof(*n1), M_BUS, flags);
if (n1 == NULL)
goto out;
n1->dei_data = data;
mtx_lock(&devsoftc.mtx);
if (devctl_queue_length == 0) {
mtx_unlock(&devsoftc.mtx);
free(n1->dei_data, M_BUS);
free(n1, M_BUS);
return;
}
/* Leave at least one spot in the queue... */
while (devsoftc.queued > devctl_queue_length - 1) {
n2 = TAILQ_FIRST(&devsoftc.devq);
TAILQ_REMOVE(&devsoftc.devq, n2, dei_link);
free(n2->dei_data, M_BUS);
free(n2, M_BUS);
devsoftc.queued--;
}
TAILQ_INSERT_TAIL(&devsoftc.devq, n1, dei_link);
devsoftc.queued++;
cv_broadcast(&devsoftc.cv);
mtx_unlock(&devsoftc.mtx);
selwakeup(&devsoftc.sel);
p = devsoftc.async_proc;
if (p != NULL) {
PROC_LOCK(p);
- psignal(p, SIGIO);
+ kern_psignal(p, SIGIO);
PROC_UNLOCK(p);
}
return;
out:
/*
* We have to free data on all error paths since the caller
* assumes it will be free'd when this item is dequeued.
*/
free(data, M_BUS);
return;
}
void
devctl_queue_data(char *data)
{
devctl_queue_data_f(data, M_NOWAIT);
}
/**
* @brief Send a 'notification' to userland, using standard ways
*/
void
devctl_notify_f(const char *system, const char *subsystem, const char *type,
const char *data, int flags)
{
int len = 0;
char *msg;
if (system == NULL)
return; /* BOGUS! Must specify system. */
if (subsystem == NULL)
return; /* BOGUS! Must specify subsystem. */
if (type == NULL)
return; /* BOGUS! Must specify type. */
len += strlen(" system=") + strlen(system);
len += strlen(" subsystem=") + strlen(subsystem);
len += strlen(" type=") + strlen(type);
/* add in the data message plus newline. */
if (data != NULL)
len += strlen(data);
len += 3; /* '!', '\n', and NUL */
msg = malloc(len, M_BUS, flags);
if (msg == NULL)
return; /* Drop it on the floor */
if (data != NULL)
snprintf(msg, len, "!system=%s subsystem=%s type=%s %s\n",
system, subsystem, type, data);
else
snprintf(msg, len, "!system=%s subsystem=%s type=%s\n",
system, subsystem, type);
devctl_queue_data_f(msg, flags);
}
void
devctl_notify(const char *system, const char *subsystem, const char *type,
const char *data)
{
devctl_notify_f(system, subsystem, type, data, M_NOWAIT);
}
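/*
 * Illustrative only (not part of this file): a driver announcing an event.
 * The strings are made-up examples; given the format above, the line queued
 * to /dev/devctl would read
 * "!system=EXAMPLE subsystem=port type=overrun unit=3".
 */
#if 0
	devctl_notify("EXAMPLE", "port", "overrun", "unit=3");
#endif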
/*
* Common routine that tries to make sending messages as easy as possible.
* We allocate memory for the data, copy strings into that, but do not
* free it unless there's an error. The dequeue part of the driver should
* free the data. We don't send data when the device is disabled. We do
* send data, even when we have no listeners, because we wish to avoid
* races relating to startup and restart of listening applications.
*
* devaddq is designed to string together the type of event, with the
* object of that event, plus the plug and play info and location info
* for that event. This is likely most useful for devices, but less
* useful for other consumers of this interface. Those should use
* the devctl_queue_data() interface instead.
*/
static void
devaddq(const char *type, const char *what, device_t dev)
{
char *data = NULL;
char *loc = NULL;
char *pnp = NULL;
const char *parstr;
if (!devctl_queue_length)/* Rare race, but lost races safely discard */
return;
data = malloc(1024, M_BUS, M_NOWAIT);
if (data == NULL)
goto bad;
/* get the bus specific location of this device */
loc = malloc(1024, M_BUS, M_NOWAIT);
if (loc == NULL)
goto bad;
*loc = '\0';
bus_child_location_str(dev, loc, 1024);
/* Get the bus specific pnp info of this device */
pnp = malloc(1024, M_BUS, M_NOWAIT);
if (pnp == NULL)
goto bad;
*pnp = '\0';
bus_child_pnpinfo_str(dev, pnp, 1024);
/* Get the parent of this device, or / if high enough in the tree. */
if (device_get_parent(dev) == NULL)
parstr = "."; /* Or '/' ? */
else
parstr = device_get_nameunit(device_get_parent(dev));
/* String it all together. */
snprintf(data, 1024, "%s%s at %s %s on %s\n", type, what, loc, pnp,
parstr);
free(loc, M_BUS);
free(pnp, M_BUS);
devctl_queue_data(data);
return;
bad:
free(pnp, M_BUS);
free(loc, M_BUS);
free(data, M_BUS);
return;
}
/*
* A device was added to the tree. We are called just after it successfully
* attaches (that is, probe and attach success for this device). No call
* is made if a device is merely parented into the tree. See devnomatch
* if probe fails. If attach fails, no notification is sent (but maybe
* we should have a different message for this).
*/
static void
devadded(device_t dev)
{
devaddq("+", device_get_nameunit(dev), dev);
}
/*
* A device was removed from the tree. We are called just before this
* happens.
*/
static void
devremoved(device_t dev)
{
devaddq("-", device_get_nameunit(dev), dev);
}
/*
* Called when there's no match for this device. This is only called
* the first time that no match happens, so we don't keep getting this
* message. Should that prove to be undesirable, we can change it.
* This is called when all drivers that can attach to a given bus
* decline to accept this device. Other errors may not be detected.
*/
static void
devnomatch(device_t dev)
{
devaddq("?", "", dev);
}
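/*
 * For reference, the three helpers above queue strings in the devaddq()
 * format "<type><what> at <location> <pnpinfo> on <parent>\n", e.g. (with
 * hypothetical names):
 *	+foo0 at <location> <pnpinfo> on bar0	(attach)
 *	-foo0 at <location> <pnpinfo> on bar0	(detach)
 *	? at <location> <pnpinfo> on bar0	(no driver matched)
 */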
static int
sysctl_devctl_disable(SYSCTL_HANDLER_ARGS)
{
struct dev_event_info *n1;
int dis, error;
dis = devctl_queue_length == 0;
error = sysctl_handle_int(oidp, &dis, 0, req);
if (error || !req->newptr)
return (error);
mtx_lock(&devsoftc.mtx);
if (dis) {
while (!TAILQ_EMPTY(&devsoftc.devq)) {
n1 = TAILQ_FIRST(&devsoftc.devq);
TAILQ_REMOVE(&devsoftc.devq, n1, dei_link);
free(n1->dei_data, M_BUS);
free(n1, M_BUS);
}
devsoftc.queued = 0;
devctl_queue_length = 0;
} else {
devctl_queue_length = DEVCTL_DEFAULT_QUEUE_LEN;
}
mtx_unlock(&devsoftc.mtx);
return (0);
}
static int
sysctl_devctl_queue(SYSCTL_HANDLER_ARGS)
{
struct dev_event_info *n1;
int q, error;
q = devctl_queue_length;
error = sysctl_handle_int(oidp, &q, 0, req);
if (error || !req->newptr)
return (error);
if (q < 0)
return (EINVAL);
mtx_lock(&devsoftc.mtx);
devctl_queue_length = q;
while (devsoftc.queued > devctl_queue_length) {
n1 = TAILQ_FIRST(&devsoftc.devq);
TAILQ_REMOVE(&devsoftc.devq, n1, dei_link);
free(n1->dei_data, M_BUS);
free(n1, M_BUS);
devsoftc.queued--;
}
mtx_unlock(&devsoftc.mtx);
return (0);
}
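/*
 * Illustrative usage from userland (nodes as defined above):
 *	sysctl hw.bus.devctl_queue=0	# drain the queue and disable queueing
 *	sysctl hw.bus.devctl_queue=1000	# restore the default length
 * The deprecated hw.bus.devctl_disable handler maps 1/0 onto the same two
 * states.
 */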
/* End of /dev/devctl code */
static TAILQ_HEAD(,device) bus_data_devices;
static int bus_data_generation = 1;
static kobj_method_t null_methods[] = {
KOBJMETHOD_END
};
DEFINE_CLASS(null, null_methods, 0);
/*
* Bus pass implementation
*/
static driver_list_t passes = TAILQ_HEAD_INITIALIZER(passes);
int bus_current_pass = BUS_PASS_ROOT;
/**
* @internal
* @brief Register the pass level of a new driver attachment
*
* Register a new driver attachment's pass level. If no driver
* attachment with the same pass level has been added, then @p new
* will be added to the global passes list.
*
* @param new the new driver attachment
*/
static void
driver_register_pass(struct driverlink *new)
{
struct driverlink *dl;
/* We only consider pass numbers during boot. */
if (bus_current_pass == BUS_PASS_DEFAULT)
return;
/*
* Walk the passes list. If we already know about this pass
* then there is nothing to do. If we don't, then insert this
* driver link into the list.
*/
TAILQ_FOREACH(dl, &passes, passlink) {
if (dl->pass < new->pass)
continue;
if (dl->pass == new->pass)
return;
TAILQ_INSERT_BEFORE(dl, new, passlink);
return;
}
TAILQ_INSERT_TAIL(&passes, new, passlink);
}
/**
* @brief Raise the current bus pass
*
* Raise the current bus pass level to @p pass. Call the BUS_NEW_PASS()
* method on the root bus to kick off a new device tree scan for each
* new pass level that has at least one driver.
*/
void
bus_set_pass(int pass)
{
struct driverlink *dl;
if (bus_current_pass > pass)
panic("Attempt to lower bus pass level");
TAILQ_FOREACH(dl, &passes, passlink) {
/* Skip pass values below the current pass level. */
if (dl->pass <= bus_current_pass)
continue;
/*
* Bail once we hit a driver with a pass level that is
* too high.
*/
if (dl->pass > pass)
break;
/*
* Raise the pass level to the next level and rescan
* the tree.
*/
bus_current_pass = dl->pass;
BUS_NEW_PASS(root_bus);
}
/*
* If there isn't a driver registered for the requested pass,
* then bus_current_pass might still be less than 'pass'. Set
* it to 'pass' in that case.
*/
if (bus_current_pass < pass)
bus_current_pass = pass;
KASSERT(bus_current_pass == pass, ("Failed to update bus pass level"));
}
/*
* Devclass implementation
*/
static devclass_list_t devclasses = TAILQ_HEAD_INITIALIZER(devclasses);
/**
* @internal
* @brief Find or create a device class
*
* If a device class with the name @p classname exists, return it,
* otherwise if @p create is non-zero create and return a new device
* class.
*
* If @p parentname is non-NULL, the parent of the devclass is set to
* the devclass of that name.
*
* @param classname the devclass name to find or create
* @param parentname the parent devclass name or @c NULL
* @param create non-zero to create a devclass
*/
static devclass_t
devclass_find_internal(const char *classname, const char *parentname,
int create)
{
devclass_t dc;
PDEBUG(("looking for %s", classname));
if (!classname)
return (NULL);
TAILQ_FOREACH(dc, &devclasses, link) {
if (!strcmp(dc->name, classname))
break;
}
if (create && !dc) {
PDEBUG(("creating %s", classname));
dc = malloc(sizeof(struct devclass) + strlen(classname) + 1,
M_BUS, M_NOWAIT | M_ZERO);
if (!dc)
return (NULL);
dc->parent = NULL;
dc->name = (char*) (dc + 1);
strcpy(dc->name, classname);
TAILQ_INIT(&dc->drivers);
TAILQ_INSERT_TAIL(&devclasses, dc, link);
bus_data_generation_update();
}
/*
* If a parent class is specified, then set that as our parent so
* that this devclass will support drivers for the parent class as
* well. If the parent class has the same name don't do this though
* as it creates a cycle that can trigger an infinite loop in
* device_probe_child() if a device exists for which there is no
* suitable driver.
*/
if (parentname && dc && !dc->parent &&
strcmp(classname, parentname) != 0) {
dc->parent = devclass_find_internal(parentname, NULL, TRUE);
dc->parent->flags |= DC_HAS_CHILDREN;
}
return (dc);
}
/**
* @brief Create a device class
*
* If a device class with the name @p classname exists, return it,
* otherwise create and return a new device class.
*
* @param classname the devclass name to find or create
*/
devclass_t
devclass_create(const char *classname)
{
return (devclass_find_internal(classname, NULL, TRUE));
}
/**
* @brief Find a device class
*
* If a device class with the name @p classname exists, return it,
* otherwise return @c NULL.
*
* @param classname the devclass name to find
*/
devclass_t
devclass_find(const char *classname)
{
return (devclass_find_internal(classname, NULL, FALSE));
}
/**
* @brief Register that a device driver has been added to a devclass
*
* Register that a device driver has been added to a devclass. This
* is called by devclass_add_driver to accomplish the recursive
* notification of all the children classes of dc, as well as dc.
* Each layer will have BUS_DRIVER_ADDED() called for all instances of
* the devclass.
*
* We do a full search here of the devclass list at each iteration
* level to save storing children-lists in the devclass structure. If
* we ever move beyond a few dozen devices doing this, we may need to
* reevaluate...
*
* @param dc the devclass to edit
* @param driver the driver that was just added
*/
static void
devclass_driver_added(devclass_t dc, driver_t *driver)
{
devclass_t parent;
int i;
/*
* Call BUS_DRIVER_ADDED for any existing busses in this class.
*/
for (i = 0; i < dc->maxunit; i++)
if (dc->devices[i] && device_is_attached(dc->devices[i]))
BUS_DRIVER_ADDED(dc->devices[i], driver);
/*
* Walk through the children classes. Since we only keep a
* single parent pointer around, we walk the entire list of
* devclasses looking for children. We set the
* DC_HAS_CHILDREN flag when a child devclass is created on
* the parent, so we only walk the list for those devclasses
* that have children.
*/
if (!(dc->flags & DC_HAS_CHILDREN))
return;
parent = dc;
TAILQ_FOREACH(dc, &devclasses, link) {
if (dc->parent == parent)
devclass_driver_added(dc, driver);
}
}
/**
* @brief Add a device driver to a device class
*
* Add a device driver to a devclass. This is normally called
* automatically by DRIVER_MODULE(). The BUS_DRIVER_ADDED() method of
* all devices in the devclass will be called to allow them to attempt
* to re-probe any unmatched children.
*
* @param dc the devclass to edit
* @param driver the driver to register
*/
int
devclass_add_driver(devclass_t dc, driver_t *driver, int pass, devclass_t *dcp)
{
driverlink_t dl;
const char *parentname;
PDEBUG(("%s", DRIVERNAME(driver)));
/* Don't allow invalid pass values. */
if (pass <= BUS_PASS_ROOT)
return (EINVAL);
dl = malloc(sizeof *dl, M_BUS, M_NOWAIT|M_ZERO);
if (!dl)
return (ENOMEM);
/*
* Compile the driver's methods. Also increase the reference count
* so that the class doesn't get freed when the last instance
* goes. This means we can safely use static methods and avoids a
* double-free in devclass_delete_driver.
*/
kobj_class_compile((kobj_class_t) driver);
/*
* If the driver has any base classes, make the
* devclass inherit from the devclass of the driver's
* first base class. This will allow the system to
* search for drivers in both devclasses for children
* of a device using this driver.
*/
if (driver->baseclasses)
parentname = driver->baseclasses[0]->name;
else
parentname = NULL;
*dcp = devclass_find_internal(driver->name, parentname, TRUE);
dl->driver = driver;
TAILQ_INSERT_TAIL(&dc->drivers, dl, link);
driver->refs++; /* XXX: kobj_mtx */
dl->pass = pass;
driver_register_pass(dl);
devclass_driver_added(dc, driver);
bus_data_generation_update();
return (0);
}
/**
* @brief Register that a device driver has been deleted from a devclass
*
* Register that a device driver has been removed from a devclass.
* This is called by devclass_delete_driver to accomplish the
* recursive notification of all the children classes of busclass, as
* well as busclass. Each layer will attempt to detach the driver
* from any devices that are children of the bus's devclass. The function
* will return an error if a device fails to detach.
*
* We do a full search here of the devclass list at each iteration
* level to save storing children-lists in the devclass structure. If
* we ever move beyond a few dozen devices doing this, we may need to
* reevaluate...
*
* @param busclass the devclass of the parent bus
* @param dc the devclass of the driver being deleted
* @param driver the driver being deleted
*/
static int
devclass_driver_deleted(devclass_t busclass, devclass_t dc, driver_t *driver)
{
devclass_t parent;
device_t dev;
int error, i;
/*
* Disassociate from any devices. We iterate through all the
* devices in the devclass of the driver and detach any which are
* using the driver and which have a parent in the devclass which
* we are deleting from.
*
* Note that since a driver can be in multiple devclasses, we
* should not detach devices which are not children of devices in
* the affected devclass.
*/
for (i = 0; i < dc->maxunit; i++) {
if (dc->devices[i]) {
dev = dc->devices[i];
if (dev->driver == driver && dev->parent &&
dev->parent->devclass == busclass) {
if ((error = device_detach(dev)) != 0)
return (error);
device_set_driver(dev, NULL);
BUS_PROBE_NOMATCH(dev->parent, dev);
devnomatch(dev);
dev->flags |= DF_DONENOMATCH;
}
}
}
/*
* Walk through the children classes. Since we only keep a
* single parent pointer around, we walk the entire list of
* devclasses looking for children. We set the
* DC_HAS_CHILDREN flag when a child devclass is created on
* the parent, so we only walk the list for those devclasses
* that have children.
*/
if (!(busclass->flags & DC_HAS_CHILDREN))
return (0);
parent = busclass;
TAILQ_FOREACH(busclass, &devclasses, link) {
if (busclass->parent == parent) {
error = devclass_driver_deleted(busclass, dc, driver);
if (error)
return (error);
}
}
return (0);
}
/**
* @brief Delete a device driver from a device class
*
* Delete a device driver from a devclass. This is normally called
* automatically by DRIVER_MODULE().
*
* If the driver is currently attached to any devices,
* devclass_delete_driver() will first attempt to detach from each
* device. If one of the detach calls fails, the driver will not be
* deleted.
*
* @param dc the devclass to edit
* @param driver the driver to unregister
*/
int
devclass_delete_driver(devclass_t busclass, driver_t *driver)
{
devclass_t dc = devclass_find(driver->name);
driverlink_t dl;
int error;
PDEBUG(("%s from devclass %s", driver->name, DEVCLANAME(busclass)));
if (!dc)
return (0);
/*
* Find the link structure in the bus' list of drivers.
*/
TAILQ_FOREACH(dl, &busclass->drivers, link) {
if (dl->driver == driver)
break;
}
if (!dl) {
PDEBUG(("%s not found in %s list", driver->name,
busclass->name));
return (ENOENT);
}
error = devclass_driver_deleted(busclass, dc, driver);
if (error != 0)
return (error);
TAILQ_REMOVE(&busclass->drivers, dl, link);
free(dl, M_BUS);
/* XXX: kobj_mtx */
driver->refs--;
if (driver->refs == 0)
kobj_class_free((kobj_class_t) driver);
bus_data_generation_update();
return (0);
}
/**
* @brief Quiesces a set of device drivers from a device class
*
* Quiesce a device driver from a devclass. This is normally called
* automatically by DRIVER_MODULE().
*
* If the driver is currently attached to any devices,
* devclass_quiesce_driver() will first attempt to quiesce each
* device.
*
* @param dc the devclass to edit
* @param driver the driver to unregister
*/
static int
devclass_quiesce_driver(devclass_t busclass, driver_t *driver)
{
devclass_t dc = devclass_find(driver->name);
driverlink_t dl;
device_t dev;
int i;
int error;
PDEBUG(("%s from devclass %s", driver->name, DEVCLANAME(busclass)));
if (!dc)
return (0);
/*
* Find the link structure in the bus' list of drivers.
*/
TAILQ_FOREACH(dl, &busclass->drivers, link) {
if (dl->driver == driver)
break;
}
if (!dl) {
PDEBUG(("%s not found in %s list", driver->name,
busclass->name));
return (ENOENT);
}
/*
* Quiesce all devices. We iterate through all the devices in
* the devclass of the driver and quiesce any which are using
* the driver and which have a parent in the devclass which we
* are quiescing.
*
* Note that since a driver can be in multiple devclasses, we
* should not quiesce devices which are not children of
* devices in the affected devclass.
*/
for (i = 0; i < dc->maxunit; i++) {
if (dc->devices[i]) {
dev = dc->devices[i];
if (dev->driver == driver && dev->parent &&
dev->parent->devclass == busclass) {
if ((error = device_quiesce(dev)) != 0)
return (error);
}
}
}
return (0);
}
/**
* @internal
*/
static driverlink_t
devclass_find_driver_internal(devclass_t dc, const char *classname)
{
driverlink_t dl;
PDEBUG(("%s in devclass %s", classname, DEVCLANAME(dc)));
TAILQ_FOREACH(dl, &dc->drivers, link) {
if (!strcmp(dl->driver->name, classname))
return (dl);
}
PDEBUG(("not found"));
return (NULL);
}
/**
* @brief Return the name of the devclass
*/
const char *
devclass_get_name(devclass_t dc)
{
return (dc->name);
}
/**
* @brief Find a device given a unit number
*
* @param dc the devclass to search
* @param unit the unit number to search for
*
* @returns the device with the given unit number or @c
* NULL if there is no such device
*/
device_t
devclass_get_device(devclass_t dc, int unit)
{
if (dc == NULL || unit < 0 || unit >= dc->maxunit)
return (NULL);
return (dc->devices[unit]);
}
/**
* @brief Find the softc field of a device given a unit number
*
* @param dc the devclass to search
* @param unit the unit number to search for
*
* @returns the softc field of the device with the given
* unit number or @c NULL if there is no such
* device
*/
void *
devclass_get_softc(devclass_t dc, int unit)
{
device_t dev;
dev = devclass_get_device(dc, unit);
if (!dev)
return (NULL);
return (device_get_softc(dev));
}
/**
* @brief Get a list of devices in the devclass
*
* An array containing a list of all the devices in the given devclass
* is allocated and returned in @p *devlistp. The number of devices
* in the array is returned in @p *devcountp. The caller should free
* the array using @c free(p, M_TEMP), even if @p *devcountp is 0.
*
* @param dc the devclass to examine
* @param devlistp points at location for array pointer return
* value
* @param devcountp points at location for array size return value
*
* @retval 0 success
* @retval ENOMEM the array allocation failed
*/
int
devclass_get_devices(devclass_t dc, device_t **devlistp, int *devcountp)
{
int count, i;
device_t *list;
count = devclass_get_count(dc);
list = malloc(count * sizeof(device_t), M_TEMP, M_NOWAIT|M_ZERO);
if (!list)
return (ENOMEM);
count = 0;
for (i = 0; i < dc->maxunit; i++) {
if (dc->devices[i]) {
list[count] = dc->devices[i];
count++;
}
}
*devlistp = list;
*devcountp = count;
return (0);
}
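/*
 * Illustrative sketch (not part of this file): walking every device in a
 * devclass.  "foo" is a hypothetical devclass name; note the M_TEMP free,
 * as documented above.
 */
#if 0
static void
example_walk_devclass(void)
{
	devclass_t dc;
	device_t *devs;
	int devcount, i;

	dc = devclass_find("foo");
	if (dc == NULL || devclass_get_devices(dc, &devs, &devcount) != 0)
		return;
	for (i = 0; i < devcount; i++)
		device_printf(devs[i], "unit %d\n", device_get_unit(devs[i]));
	free(devs, M_TEMP);
}
#endif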
/**
* @brief Get a list of drivers in the devclass
*
* An array containing a list of pointers to all the drivers in the
* given devclass is allocated and returned in @p *listp. The number
* of drivers in the array is returned in @p *countp. The caller should
* free the array using @c free(p, M_TEMP).
*
* @param dc the devclass to examine
* @param listp gives location for array pointer return value
* @param countp gives location for number of array elements
* return value
*
* @retval 0 success
* @retval ENOMEM the array allocation failed
*/
int
devclass_get_drivers(devclass_t dc, driver_t ***listp, int *countp)
{
driverlink_t dl;
driver_t **list;
int count;
count = 0;
TAILQ_FOREACH(dl, &dc->drivers, link)
count++;
list = malloc(count * sizeof(driver_t *), M_TEMP, M_NOWAIT);
if (list == NULL)
return (ENOMEM);
count = 0;
TAILQ_FOREACH(dl, &dc->drivers, link) {
list[count] = dl->driver;
count++;
}
*listp = list;
*countp = count;
return (0);
}
/**
* @brief Get the number of devices in a devclass
*
* @param dc the devclass to examine
*/
int
devclass_get_count(devclass_t dc)
{
int count, i;
count = 0;
for (i = 0; i < dc->maxunit; i++)
if (dc->devices[i])
count++;
return (count);
}
/**
* @brief Get the maximum unit number used in a devclass
*
* Note that this is one greater than the highest currently-allocated
* unit. If a null devclass_t is passed in, -1 is returned to indicate
* that not even the devclass has been allocated yet.
*
* @param dc the devclass to examine
*/
int
devclass_get_maxunit(devclass_t dc)
{
if (dc == NULL)
return (-1);
return (dc->maxunit);
}
/**
* @brief Find a free unit number in a devclass
*
* This function searches for the first unused unit number greater
* than or equal to @p unit.
*
* @param dc the devclass to examine
* @param unit the first unit number to check
*/
int
devclass_find_free_unit(devclass_t dc, int unit)
{
if (dc == NULL)
return (unit);
while (unit < dc->maxunit && dc->devices[unit] != NULL)
unit++;
return (unit);
}
/**
* @brief Set the parent of a devclass
*
* The parent class is normally initialised automatically by
* DRIVER_MODULE().
*
* @param dc the devclass to edit
* @param pdc the new parent devclass
*/
void
devclass_set_parent(devclass_t dc, devclass_t pdc)
{
dc->parent = pdc;
}
/**
* @brief Get the parent of a devclass
*
* @param dc the devclass to examine
*/
devclass_t
devclass_get_parent(devclass_t dc)
{
return (dc->parent);
}
struct sysctl_ctx_list *
devclass_get_sysctl_ctx(devclass_t dc)
{
return (&dc->sysctl_ctx);
}
struct sysctl_oid *
devclass_get_sysctl_tree(devclass_t dc)
{
return (dc->sysctl_tree);
}
/**
* @internal
* @brief Allocate a unit number
*
* On entry, @p *unitp is the desired unit number (or @c -1 if any
* will do). The allocated unit number is returned in @p *unitp.
* @param dc the devclass to allocate from
* @param unitp points at the location for the allocated unit
* number
*
* @retval 0 success
* @retval EEXIST the requested unit number is already allocated
* @retval ENOMEM memory allocation failure
*/
static int
devclass_alloc_unit(devclass_t dc, device_t dev, int *unitp)
{
const char *s;
int unit = *unitp;
PDEBUG(("unit %d in devclass %s", unit, DEVCLANAME(dc)));
/* Ask the parent bus if it wants to wire this device. */
if (unit == -1)
BUS_HINT_DEVICE_UNIT(device_get_parent(dev), dev, dc->name,
&unit);
/* If we were given a wired unit number, check for existing device */
/* XXX imp XXX */
if (unit != -1) {
if (unit >= 0 && unit < dc->maxunit &&
dc->devices[unit] != NULL) {
if (bootverbose)
printf("%s: %s%d already exists; skipping it\n",
dc->name, dc->name, *unitp);
return (EEXIST);
}
} else {
/* Unwired device, find the next available slot for it */
unit = 0;
for (unit = 0;; unit++) {
/* If there is an "at" hint for a unit then skip it. */
if (resource_string_value(dc->name, unit, "at", &s) ==
0)
continue;
/* If this device slot is already in use, skip it. */
if (unit < dc->maxunit && dc->devices[unit] != NULL)
continue;
break;
}
}
/*
* We've selected a unit beyond the length of the table, so let's
* extend the table to make room for all units up to and including
* this one.
*/
if (unit >= dc->maxunit) {
device_t *newlist, *oldlist;
int newsize;
oldlist = dc->devices;
newsize = roundup((unit + 1), MINALLOCSIZE / sizeof(device_t));
newlist = malloc(sizeof(device_t) * newsize, M_BUS, M_NOWAIT);
if (!newlist)
return (ENOMEM);
if (oldlist != NULL)
bcopy(oldlist, newlist, sizeof(device_t) * dc->maxunit);
bzero(newlist + dc->maxunit,
sizeof(device_t) * (newsize - dc->maxunit));
dc->devices = newlist;
dc->maxunit = newsize;
if (oldlist != NULL)
free(oldlist, M_BUS);
}
PDEBUG(("now: unit %d in devclass %s", unit, DEVCLANAME(dc)));
*unitp = unit;
return (0);
}
/**
* @internal
* @brief Add a device to a devclass
*
* A unit number is allocated for the device (using the device's
* preferred unit number if any) and the device is registered in the
* devclass. This allows the device to be looked up by its unit
* number, e.g. by decoding a dev_t minor number.
*
* @param dc the devclass to add to
* @param dev the device to add
*
* @retval 0 success
* @retval EEXIST the requested unit number is already allocated
* @retval ENOMEM memory allocation failure
*/
static int
devclass_add_device(devclass_t dc, device_t dev)
{
int buflen, error;
PDEBUG(("%s in devclass %s", DEVICENAME(dev), DEVCLANAME(dc)));
buflen = snprintf(NULL, 0, "%s%d$", dc->name, INT_MAX);
if (buflen < 0)
return (ENOMEM);
dev->nameunit = malloc(buflen, M_BUS, M_NOWAIT|M_ZERO);
if (!dev->nameunit)
return (ENOMEM);
if ((error = devclass_alloc_unit(dc, dev, &dev->unit)) != 0) {
free(dev->nameunit, M_BUS);
dev->nameunit = NULL;
return (error);
}
dc->devices[dev->unit] = dev;
dev->devclass = dc;
snprintf(dev->nameunit, buflen, "%s%d", dc->name, dev->unit);
return (0);
}
/**
* @internal
* @brief Delete a device from a devclass
*
* The device is removed from the devclass's device list and its unit
* number is freed.
* @param dc the devclass to delete from
* @param dev the device to delete
*
* @retval 0 success
*/
static int
devclass_delete_device(devclass_t dc, device_t dev)
{
if (!dc || !dev)
return (0);
PDEBUG(("%s in devclass %s", DEVICENAME(dev), DEVCLANAME(dc)));
if (dev->devclass != dc || dc->devices[dev->unit] != dev)
panic("devclass_delete_device: inconsistent device class");
dc->devices[dev->unit] = NULL;
if (dev->flags & DF_WILDCARD)
dev->unit = -1;
dev->devclass = NULL;
free(dev->nameunit, M_BUS);
dev->nameunit = NULL;
return (0);
}
/**
* @internal
* @brief Make a new device and add it as a child of @p parent
*
* @param parent the parent of the new device
* @param name the devclass name of the new device or @c NULL
* to leave the devclass unspecified
* @param unit the unit number of the new device or @c -1 to
* leave the unit number unspecified
*
* @returns the new device
*/
static device_t
make_device(device_t parent, const char *name, int unit)
{
device_t dev;
devclass_t dc;
PDEBUG(("%s at %s as unit %d", name, DEVICENAME(parent), unit));
if (name) {
dc = devclass_find_internal(name, NULL, TRUE);
if (!dc) {
printf("make_device: can't find device class %s\n",
name);
return (NULL);
}
} else {
dc = NULL;
}
dev = malloc(sizeof(struct device), M_BUS, M_NOWAIT|M_ZERO);
if (!dev)
return (NULL);
dev->parent = parent;
TAILQ_INIT(&dev->children);
kobj_init((kobj_t) dev, &null_class);
dev->driver = NULL;
dev->devclass = NULL;
dev->unit = unit;
dev->nameunit = NULL;
dev->desc = NULL;
dev->busy = 0;
dev->devflags = 0;
dev->flags = DF_ENABLED;
dev->order = 0;
if (unit == -1)
dev->flags |= DF_WILDCARD;
if (name) {
dev->flags |= DF_FIXEDCLASS;
if (devclass_add_device(dc, dev)) {
kobj_delete((kobj_t) dev, M_BUS);
return (NULL);
}
}
dev->ivars = NULL;
dev->softc = NULL;
dev->state = DS_NOTPRESENT;
TAILQ_INSERT_TAIL(&bus_data_devices, dev, devlink);
bus_data_generation_update();
return (dev);
}
/**
* @internal
* @brief Print a description of a device.
*/
static int
device_print_child(device_t dev, device_t child)
{
int retval = 0;
if (device_is_alive(child))
retval += BUS_PRINT_CHILD(dev, child);
else
retval += device_printf(child, " not found\n");
return (retval);
}
/**
* @brief Create a new device
*
* This creates a new device and adds it as a child of an existing
* parent device. The new device will be added after the last existing
* child with order zero.
*
* @param dev the device which will be the parent of the
* new child device
* @param name devclass name for new device or @c NULL if not
* specified
* @param unit unit number for new device or @c -1 if not
* specified
*
* @returns the new device
*/
device_t
device_add_child(device_t dev, const char *name, int unit)
{
return (device_add_child_ordered(dev, 0, name, unit));
}
/**
* @brief Create a new device
*
* This creates a new device and adds it as a child of an existing
* parent device. The new device will be added after the last existing
* child with the same order.
*
* @param dev the device which will be the parent of the
* new child device
* @param order a value which is used to partially sort the
* children of @p dev - devices created using
* lower values of @p order appear first in @p
* dev's list of children
* @param name devclass name for new device or @c NULL if not
* specified
* @param unit unit number for new device or @c -1 if not
* specified
*
* @returns the new device
*/
device_t
device_add_child_ordered(device_t dev, u_int order, const char *name, int unit)
{
device_t child;
device_t place;
PDEBUG(("%s at %s with order %u as unit %d",
name, DEVICENAME(dev), order, unit));
child = make_device(dev, name, unit);
if (child == NULL)
return (child);
child->order = order;
TAILQ_FOREACH(place, &dev->children, link) {
if (place->order > order)
break;
}
if (place) {
/*
* The device 'place' is the first device whose order is
* greater than the new child.
*/
TAILQ_INSERT_BEFORE(place, child, link);
} else {
/*
* The new child's order is greater than or equal to the order of
* any existing device. Add the child to the tail of the list.
*/
TAILQ_INSERT_TAIL(&dev->children, child, link);
}
bus_data_generation_update();
return (child);
}
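/*
 * Illustrative sketch (not part of this file): a bus attach routine adding
 * ordered children.  The devclass names are hypothetical; a unit of -1
 * requests a wildcard unit, and children with lower orders appear first in
 * the parent's child list, as described above.
 */
#if 0
static int
examplebus_attach(device_t dev)
{

	device_add_child_ordered(dev, 0, "early", 0);
	device_add_child_ordered(dev, 10, "late", -1);
	return (bus_generic_attach(dev));
}
#endif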
/**
* @brief Delete a device
*
* This function deletes a device along with all of its children. If
* the device currently has a driver attached to it, the device is
* detached first using device_detach().
*
* @param dev the parent device
* @param child the device to delete
*
* @retval 0 success
* @retval non-zero a unit error code describing the error
*/
int
device_delete_child(device_t dev, device_t child)
{
int error;
device_t grandchild;
PDEBUG(("%s from %s", DEVICENAME(child), DEVICENAME(dev)));
/* remove children first */
while ( (grandchild = TAILQ_FIRST(&child->children)) ) {
error = device_delete_child(child, grandchild);
if (error)
return (error);
}
if ((error = device_detach(child)) != 0)
return (error);
if (child->devclass)
devclass_delete_device(child->devclass, child);
TAILQ_REMOVE(&dev->children, child, link);
TAILQ_REMOVE(&bus_data_devices, child, devlink);
kobj_delete((kobj_t) child, M_BUS);
bus_data_generation_update();
return (0);
}
/**
* @brief Find a device given a unit number
*
* This is similar to devclass_get_devices() but only searches for
* devices which have @p dev as a parent.
*
* @param dev the parent device to search
* @param unit the unit number to search for. If the unit is -1,
* return the first child of @p dev which has name
* @p classname (that is, the one with the lowest unit.)
*
* @returns the device with the given unit number or @c
* NULL if there is no such device
*/
device_t
device_find_child(device_t dev, const char *classname, int unit)
{
devclass_t dc;
device_t child;
dc = devclass_find(classname);
if (!dc)
return (NULL);
if (unit != -1) {
child = devclass_get_device(dc, unit);
if (child && child->parent == dev)
return (child);
} else {
for (unit = 0; unit < devclass_get_maxunit(dc); unit++) {
child = devclass_get_device(dc, unit);
if (child && child->parent == dev)
return (child);
}
}
return (NULL);
}
/**
* @internal
*/
static driverlink_t
first_matching_driver(devclass_t dc, device_t dev)
{
if (dev->devclass)
return (devclass_find_driver_internal(dc, dev->devclass->name));
return (TAILQ_FIRST(&dc->drivers));
}
/**
* @internal
*/
static driverlink_t
next_matching_driver(devclass_t dc, device_t dev, driverlink_t last)
{
if (dev->devclass) {
driverlink_t dl;
for (dl = TAILQ_NEXT(last, link); dl; dl = TAILQ_NEXT(dl, link))
if (!strcmp(dev->devclass->name, dl->driver->name))
return (dl);
return (NULL);
}
return (TAILQ_NEXT(last, link));
}
/**
* @internal
*/
int
device_probe_child(device_t dev, device_t child)
{
devclass_t dc;
driverlink_t best = NULL;
driverlink_t dl;
int result, pri = 0;
int hasclass = (child->devclass != NULL);
GIANT_REQUIRED;
dc = dev->devclass;
if (!dc)
panic("device_probe_child: parent device has no devclass");
/*
* If the state is already probed, then return. However, don't
* return if we can rebid this object.
*/
if (child->state == DS_ALIVE && (child->flags & DF_REBID) == 0)
return (0);
for (; dc; dc = dc->parent) {
for (dl = first_matching_driver(dc, child);
dl;
dl = next_matching_driver(dc, child, dl)) {
/* If this driver's pass is too high, then ignore it. */
if (dl->pass > bus_current_pass)
continue;
PDEBUG(("Trying %s", DRIVERNAME(dl->driver)));
device_set_driver(child, dl->driver);
if (!hasclass) {
if (device_set_devclass(child, dl->driver->name)) {
printf("driver bug: Unable to set devclass (devname: %s)\n",
(child ? device_get_name(child) :
"no device"));
device_set_driver(child, NULL);
continue;
}
}
/* Fetch any flags for the device before probing. */
resource_int_value(dl->driver->name, child->unit,
"flags", &child->devflags);
result = DEVICE_PROBE(child);
/* Reset flags and devclass before the next probe. */
child->devflags = 0;
if (!hasclass)
device_set_devclass(child, NULL);
/*
* If the driver returns SUCCESS, there can be
* no higher match for this device.
*/
if (result == 0) {
best = dl;
pri = 0;
break;
}
/*
* The driver returned an error so it
* certainly doesn't match.
*/
if (result > 0) {
device_set_driver(child, NULL);
continue;
}
/*
* A priority lower than SUCCESS, remember the
* best matching driver. Initialise the value
* of pri for the first match.
*/
if (best == NULL || result > pri) {
/*
* Probes that return BUS_PROBE_NOWILDCARD
* or lower only match when they are set
* in stone by the parent bus.
*/
if (result <= BUS_PROBE_NOWILDCARD &&
child->flags & DF_WILDCARD)
continue;
best = dl;
pri = result;
continue;
}
}
/*
* If we have an unambiguous match in this devclass,
* don't look in the parent.
*/
if (best && pri == 0)
break;
}
/*
* If we found a driver, change state and initialise the devclass.
*/
/* XXX What happens if we rebid and got no best? */
if (best) {
/*
* If this device was attached, and we were asked to
* rescan, and it is a different driver, then we have
* to detach the old driver and reattach this new one.
* Note, we don't have to check for DF_REBID here
* because if the state is > DS_ALIVE, we know it must
* be.
*
* This assumes that all DF_REBID drivers can have
* their probe routine called at any time and that
* they are idempotent as well as completely benign in
* normal operations.
*
* We also have to make sure that the detach
* succeeded, otherwise we fail the operation (or
* maybe it should just fail silently? I'm torn).
*/
if (child->state > DS_ALIVE && best->driver != child->driver)
if ((result = device_detach(dev)) != 0)
return (result);
/* Set the winning driver, devclass, and flags. */
if (!child->devclass) {
result = device_set_devclass(child, best->driver->name);
if (result != 0)
return (result);
}
device_set_driver(child, best->driver);
resource_int_value(best->driver->name, child->unit,
"flags", &child->devflags);
if (pri < 0) {
/*
* A bit bogus. Call the probe method again to make
* sure that we have the right description.
*/
DEVICE_PROBE(child);
#if 0
child->flags |= DF_REBID;
#endif
} else
child->flags &= ~DF_REBID;
child->state = DS_ALIVE;
bus_data_generation_update();
return (0);
}
return (ENXIO);
}
/**
* @brief Return the parent of a device
*/
device_t
device_get_parent(device_t dev)
{
return (dev->parent);
}
/**
* @brief Get a list of children of a device
*
* An array containing a list of all the children of the given device
* is allocated and returned in @p *devlistp. The number of devices
* in the array is returned in @p *devcountp. The caller should free
* the array using @c free(p, M_TEMP).
*
* @param dev the device to examine
* @param devlistp points at location for array pointer return
* value
* @param devcountp points at location for array size return value
*
* @retval 0 success
* @retval ENOMEM the array allocation failed
*/
int
device_get_children(device_t dev, device_t **devlistp, int *devcountp)
{
int count;
device_t child;
device_t *list;
count = 0;
TAILQ_FOREACH(child, &dev->children, link) {
count++;
}
list = malloc(count * sizeof(device_t), M_TEMP, M_NOWAIT|M_ZERO);
if (!list)
return (ENOMEM);
count = 0;
TAILQ_FOREACH(child, &dev->children, link) {
list[count] = child;
count++;
}
*devlistp = list;
*devcountp = count;
return (0);
}
/**
* @brief Return the current driver for the device or @c NULL if there
* is no driver currently attached
*/
driver_t *
device_get_driver(device_t dev)
{
return (dev->driver);
}
/**
* @brief Return the current devclass for the device or @c NULL if
* there is none.
*/
devclass_t
device_get_devclass(device_t dev)
{
return (dev->devclass);
}
/**
* @brief Return the name of the device's devclass or @c NULL if there
* is none.
*/
const char *
device_get_name(device_t dev)
{
if (dev != NULL && dev->devclass)
return (devclass_get_name(dev->devclass));
return (NULL);
}
/**
* @brief Return a string containing the device's devclass name
* followed by an ascii representation of the device's unit number
* (e.g. @c "foo2").
*/
const char *
device_get_nameunit(device_t dev)
{
return (dev->nameunit);
}
/**
* @brief Return the device's unit number.
*/
int
device_get_unit(device_t dev)
{
return (dev->unit);
}
/**
* @brief Return the device's description string
*/
const char *
device_get_desc(device_t dev)
{
return (dev->desc);
}
/**
* @brief Return the device's flags
*/
uint32_t
device_get_flags(device_t dev)
{
return (dev->devflags);
}
struct sysctl_ctx_list *
device_get_sysctl_ctx(device_t dev)
{
return (&dev->sysctl_ctx);
}
struct sysctl_oid *
device_get_sysctl_tree(device_t dev)
{
return (dev->sysctl_tree);
}
/**
* @brief Print the name of the device followed by a colon and a space
*
* @returns the number of characters printed
*/
int
device_print_prettyname(device_t dev)
{
const char *name = device_get_name(dev);
if (name == NULL)
return (printf("unknown: "));
return (printf("%s%d: ", name, device_get_unit(dev)));
}
/**
* @brief Print the name of the device followed by a colon, a space
* and the result of calling vprintf() with the value of @p fmt and
* the following arguments.
*
* @returns the number of characters printed
*/
int
device_printf(device_t dev, const char * fmt, ...)
{
va_list ap;
int retval;
retval = device_print_prettyname(dev);
va_start(ap, fmt);
retval += vprintf(fmt, ap);
va_end(ap);
return (retval);
}
/**
* @internal
*/
static void
device_set_desc_internal(device_t dev, const char* desc, int copy)
{
if (dev->desc && (dev->flags & DF_DESCMALLOCED)) {
free(dev->desc, M_BUS);
dev->flags &= ~DF_DESCMALLOCED;
dev->desc = NULL;
}
if (copy && desc) {
dev->desc = malloc(strlen(desc) + 1, M_BUS, M_NOWAIT);
if (dev->desc) {
strcpy(dev->desc, desc);
dev->flags |= DF_DESCMALLOCED;
}
} else {
/* Avoid a -Wcast-qual warning */
dev->desc = (char *)(uintptr_t) desc;
}
bus_data_generation_update();
}
/**
* @brief Set the device's description
*
* The value of @c desc should be a string constant that will not
* change (at least until the description is changed in a subsequent
* call to device_set_desc() or device_set_desc_copy()).
*/
void
device_set_desc(device_t dev, const char* desc)
{
device_set_desc_internal(dev, desc, FALSE);
}
/**
* @brief Set the device's description
*
* The string pointed to by @c desc is copied. Use this function if
* the device description is generated, (e.g. with sprintf()).
*/
void
device_set_desc_copy(device_t dev, const char* desc)
{
device_set_desc_internal(dev, desc, TRUE);
}
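/*
 * Hypothetical sketch: a driver that generates its description at
 * probe time should use device_set_desc_copy(), since the buffer may
 * be transient (e.g. on the stack). The value "rev" stands for some
 * number read from the hardware and is only illustrative.
 *
 * @code
 *	char desc[64];
 *
 *	snprintf(desc, sizeof(desc), "Example controller rev %u", rev);
 *	device_set_desc_copy(dev, desc);
 * @endcode
 */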
/**
* @brief Set the device's flags
*/
void
device_set_flags(device_t dev, uint32_t flags)
{
dev->devflags = flags;
}
/**
* @brief Return the device's softc field
*
* The softc is allocated and zeroed when a driver is attached, based
* on the size field of the driver.
*/
void *
device_get_softc(device_t dev)
{
return (dev->softc);
}
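/*
 * Typical (hypothetical) usage: because the softc is allocated and
 * zeroed based on driver_t.size, an attach routine can fetch it
 * directly. "struct foo_softc" is illustrative only.
 *
 * @code
 *	struct foo_softc *sc;
 *
 *	sc = device_get_softc(dev);	// already zeroed by the framework
 *	sc->foo_dev = dev;
 * @endcode
 */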
/**
* @brief Set the device's softc field
*
* Most drivers do not need to use this since the softc is allocated
* automatically when the driver is attached.
*/
void
device_set_softc(device_t dev, void *softc)
{
if (dev->softc && !(dev->flags & DF_EXTERNALSOFTC))
free(dev->softc, M_BUS_SC);
dev->softc = softc;
if (dev->softc)
dev->flags |= DF_EXTERNALSOFTC;
else
dev->flags &= ~DF_EXTERNALSOFTC;
}
/**
* @brief Get the device's ivars field
*
* The ivars field is used by the parent device to store per-device
* state (e.g. the physical location of the device or a list of
* resources).
*/
void *
device_get_ivars(device_t dev)
{
KASSERT(dev != NULL, ("device_get_ivars(NULL, ...)"));
return (dev->ivars);
}
/**
* @brief Set the device's ivars field
*/
void
device_set_ivars(device_t dev, void * ivars)
{
KASSERT(dev != NULL, ("device_set_ivars(NULL, ...)"));
dev->ivars = ivars;
}
/**
* @brief Return the device's state
*/
device_state_t
device_get_state(device_t dev)
{
return (dev->state);
}
/**
* @brief Set the DF_ENABLED flag for the device
*/
void
device_enable(device_t dev)
{
dev->flags |= DF_ENABLED;
}
/**
* @brief Clear the DF_ENABLED flag for the device
*/
void
device_disable(device_t dev)
{
dev->flags &= ~DF_ENABLED;
}
/**
* @brief Increment the busy counter for the device
*/
void
device_busy(device_t dev)
{
if (dev->state < DS_ATTACHED)
panic("device_busy: called for unattached device");
if (dev->busy == 0 && dev->parent)
device_busy(dev->parent);
dev->busy++;
dev->state = DS_BUSY;
}
/**
* @brief Decrement the busy counter for the device
*/
void
device_unbusy(device_t dev)
{
if (dev->state != DS_BUSY)
panic("device_unbusy: called for non-busy device %s",
device_get_nameunit(dev));
dev->busy--;
if (dev->busy == 0) {
if (dev->parent)
device_unbusy(dev->parent);
dev->state = DS_ATTACHED;
}
}
/**
* @brief Set the DF_QUIET flag for the device
*/
void
device_quiet(device_t dev)
{
dev->flags |= DF_QUIET;
}
/**
* @brief Clear the DF_QUIET flag for the device
*/
void
device_verbose(device_t dev)
{
dev->flags &= ~DF_QUIET;
}
/**
* @brief Return non-zero if the DF_QUIET flag is set on the device
*/
int
device_is_quiet(device_t dev)
{
return ((dev->flags & DF_QUIET) != 0);
}
/**
* @brief Return non-zero if the DF_ENABLED flag is set on the device
*/
int
device_is_enabled(device_t dev)
{
return ((dev->flags & DF_ENABLED) != 0);
}
/**
* @brief Return non-zero if the device was successfully probed
*/
int
device_is_alive(device_t dev)
{
return (dev->state >= DS_ALIVE);
}
/**
* @brief Return non-zero if the device currently has a driver
* attached to it
*/
int
device_is_attached(device_t dev)
{
return (dev->state >= DS_ATTACHED);
}
/**
* @brief Set the devclass of a device
* @see devclass_add_device().
*/
int
device_set_devclass(device_t dev, const char *classname)
{
devclass_t dc;
int error;
if (!classname) {
if (dev->devclass)
devclass_delete_device(dev->devclass, dev);
return (0);
}
if (dev->devclass) {
printf("device_set_devclass: device class already set\n");
return (EINVAL);
}
dc = devclass_find_internal(classname, NULL, TRUE);
if (!dc)
return (ENOMEM);
error = devclass_add_device(dc, dev);
bus_data_generation_update();
return (error);
}
/**
* @brief Set the driver of a device
*
* @retval 0 success
* @retval EBUSY the device already has a driver attached
* @retval ENOMEM a memory allocation failure occurred
*/
int
device_set_driver(device_t dev, driver_t *driver)
{
if (dev->state >= DS_ATTACHED)
return (EBUSY);
if (dev->driver == driver)
return (0);
if (dev->softc && !(dev->flags & DF_EXTERNALSOFTC)) {
free(dev->softc, M_BUS_SC);
dev->softc = NULL;
}
kobj_delete((kobj_t) dev, NULL);
dev->driver = driver;
if (driver) {
kobj_init((kobj_t) dev, (kobj_class_t) driver);
if (!(dev->flags & DF_EXTERNALSOFTC) && driver->size > 0) {
dev->softc = malloc(driver->size, M_BUS_SC,
M_NOWAIT | M_ZERO);
if (!dev->softc) {
kobj_delete((kobj_t) dev, NULL);
kobj_init((kobj_t) dev, &null_class);
dev->driver = NULL;
return (ENOMEM);
}
}
} else {
kobj_init((kobj_t) dev, &null_class);
}
bus_data_generation_update();
return (0);
}
/**
 * @brief Probe a device and return its status.
*
* This function is the core of the device autoconfiguration
* system. Its purpose is to select a suitable driver for a device and
* then call that driver to initialise the hardware appropriately. The
* driver is selected by calling the DEVICE_PROBE() method of a set of
* candidate drivers and then choosing the driver which returned the
* best value. This driver is then attached to the device using
* device_attach().
*
* The set of suitable drivers is taken from the list of drivers in
* the parent device's devclass. If the device was originally created
* with a specific class name (see device_add_child()), only drivers
* with that name are probed, otherwise all drivers in the devclass
* are probed. If no drivers return successful probe values in the
* parent devclass, the search continues in the parent of that
* devclass (see devclass_get_parent()) if any.
*
* @param dev the device to initialise
*
* @retval 0 success
* @retval ENXIO no driver was found
* @retval ENOMEM memory allocation failure
* @retval non-zero some other unix error code
* @retval -1 Device already attached
*/
int
device_probe(device_t dev)
{
int error;
GIANT_REQUIRED;
if (dev->state >= DS_ALIVE && (dev->flags & DF_REBID) == 0)
return (-1);
if (!(dev->flags & DF_ENABLED)) {
if (bootverbose && device_get_name(dev) != NULL) {
device_print_prettyname(dev);
printf("not probed (disabled)\n");
}
return (-1);
}
if ((error = device_probe_child(dev->parent, dev)) != 0) {
if (bus_current_pass == BUS_PASS_DEFAULT &&
!(dev->flags & DF_DONENOMATCH)) {
BUS_PROBE_NOMATCH(dev->parent, dev);
devnomatch(dev);
dev->flags |= DF_DONENOMATCH;
}
return (error);
}
return (0);
}
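/*
 * Hypothetical sketch of a DEVICE_PROBE() method following the
 * return-value convention used above: 0 is an unbeatable match,
 * negative values (e.g. BUS_PROBE_DEFAULT) are progressively weaker
 * bids, and a positive errno rejects the device. FOO_VENDOR_ID and
 * the pci attachment are purely illustrative.
 *
 * @code
 *	static int
 *	foo_probe(device_t dev)
 *	{
 *		if (pci_get_vendor(dev) != FOO_VENDOR_ID)
 *			return (ENXIO);			// not our device
 *		device_set_desc(dev, "Foo example controller");
 *		return (BUS_PROBE_DEFAULT);		// ordinary bid
 *	}
 * @endcode
 */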
/**
* @brief Probe a device and attach a driver if possible
*
 * Calls device_probe() and, if the probe succeeds, attaches the device.
*/
int
device_probe_and_attach(device_t dev)
{
int error;
GIANT_REQUIRED;
error = device_probe(dev);
if (error == -1)
return (0);
else if (error != 0)
return (error);
return (device_attach(dev));
}
/**
* @brief Attach a device driver to a device
*
* This function is a wrapper around the DEVICE_ATTACH() driver
* method. In addition to calling DEVICE_ATTACH(), it initialises the
* device's sysctl tree, optionally prints a description of the device
* and queues a notification event for user-based device management
* services.
*
* Normally this function is only called internally from
* device_probe_and_attach().
*
* @param dev the device to initialise
*
* @retval 0 success
* @retval ENXIO no driver was found
* @retval ENOMEM memory allocation failure
* @retval non-zero some other unix error code
*/
int
device_attach(device_t dev)
{
int error;
device_sysctl_init(dev);
if (!device_is_quiet(dev))
device_print_child(dev->parent, dev);
if ((error = DEVICE_ATTACH(dev)) != 0) {
printf("device_attach: %s%d attach returned %d\n",
dev->driver->name, dev->unit, error);
/* Unset the class; set in device_probe_child */
if (dev->devclass == NULL)
device_set_devclass(dev, NULL);
device_set_driver(dev, NULL);
device_sysctl_fini(dev);
dev->state = DS_NOTPRESENT;
return (error);
}
device_sysctl_update(dev);
dev->state = DS_ATTACHED;
dev->flags &= ~DF_DONENOMATCH;
devadded(dev);
return (0);
}
/**
* @brief Detach a driver from a device
*
* This function is a wrapper around the DEVICE_DETACH() driver
* method. If the call to DEVICE_DETACH() succeeds, it calls
* BUS_CHILD_DETACHED() for the parent of @p dev, queues a
* notification event for user-based device management services and
* cleans up the device's sysctl tree.
*
* @param dev the device to un-initialise
*
* @retval 0 success
* @retval ENXIO no driver was found
* @retval ENOMEM memory allocation failure
* @retval non-zero some other unix error code
*/
int
device_detach(device_t dev)
{
int error;
GIANT_REQUIRED;
PDEBUG(("%s", DEVICENAME(dev)));
if (dev->state == DS_BUSY)
return (EBUSY);
if (dev->state != DS_ATTACHED)
return (0);
if ((error = DEVICE_DETACH(dev)) != 0)
return (error);
devremoved(dev);
if (!device_is_quiet(dev))
device_printf(dev, "detached\n");
if (dev->parent)
BUS_CHILD_DETACHED(dev->parent, dev);
if (!(dev->flags & DF_FIXEDCLASS))
devclass_delete_device(dev->devclass, dev);
dev->state = DS_NOTPRESENT;
device_set_driver(dev, NULL);
device_set_desc(dev, NULL);
device_sysctl_fini(dev);
return (0);
}
/**
* @brief Tells a driver to quiesce itself.
*
* This function is a wrapper around the DEVICE_QUIESCE() driver
 * method. If the call to DEVICE_QUIESCE() succeeds, the driver has
 * indicated that the device can later be detached safely.
*
* @param dev the device to quiesce
*
* @retval 0 success
* @retval ENXIO no driver was found
* @retval ENOMEM memory allocation failure
* @retval non-zero some other unix error code
*/
int
device_quiesce(device_t dev)
{
PDEBUG(("%s", DEVICENAME(dev)));
if (dev->state == DS_BUSY)
return (EBUSY);
if (dev->state != DS_ATTACHED)
return (0);
return (DEVICE_QUIESCE(dev));
}
/**
* @brief Notify a device of system shutdown
*
* This function calls the DEVICE_SHUTDOWN() driver method if the
* device currently has an attached driver.
*
* @returns the value returned by DEVICE_SHUTDOWN()
*/
int
device_shutdown(device_t dev)
{
if (dev->state < DS_ATTACHED)
return (0);
return (DEVICE_SHUTDOWN(dev));
}
/**
* @brief Set the unit number of a device
*
* This function can be used to override the unit number used for a
* device (e.g. to wire a device to a pre-configured unit number).
*/
int
device_set_unit(device_t dev, int unit)
{
devclass_t dc;
int err;
dc = device_get_devclass(dev);
if (unit < dc->maxunit && dc->devices[unit])
return (EBUSY);
err = devclass_delete_device(dc, dev);
if (err)
return (err);
dev->unit = unit;
err = devclass_add_device(dc, dev);
if (err)
return (err);
bus_data_generation_update();
return (0);
}
/*======================================*/
/*
* Some useful method implementations to make life easier for bus drivers.
*/
/**
* @brief Initialise a resource list.
*
* @param rl the resource list to initialise
*/
void
resource_list_init(struct resource_list *rl)
{
STAILQ_INIT(rl);
}
/**
* @brief Reclaim memory used by a resource list.
*
* This function frees the memory for all resource entries on the list
* (if any).
*
* @param rl the resource list to free
*/
void
resource_list_free(struct resource_list *rl)
{
struct resource_list_entry *rle;
while ((rle = STAILQ_FIRST(rl)) != NULL) {
if (rle->res)
panic("resource_list_free: resource entry is busy");
STAILQ_REMOVE_HEAD(rl, link);
free(rle, M_BUS);
}
}
/**
* @brief Add a resource entry.
*
* This function adds a resource entry using the given @p type, @p
* start, @p end and @p count values. A rid value is chosen by
* searching sequentially for the first unused rid starting at zero.
*
* @param rl the resource list to edit
* @param type the resource entry type (e.g. SYS_RES_MEMORY)
* @param start the start address of the resource
* @param end the end address of the resource
* @param count XXX end-start+1
*/
int
resource_list_add_next(struct resource_list *rl, int type, u_long start,
u_long end, u_long count)
{
int rid;
rid = 0;
while (resource_list_find(rl, type, rid) != NULL)
rid++;
resource_list_add(rl, type, rid, start, end, count);
return (rid);
}
/**
* @brief Add or modify a resource entry.
*
 * If an entry with the same type and rid already exists, it will be
* modified using the given values of @p start, @p end and @p
* count. If no entry exists, a new one will be created using the
* given values. The resource list entry that matches is then returned.
*
* @param rl the resource list to edit
* @param type the resource entry type (e.g. SYS_RES_MEMORY)
* @param rid the resource identifier
* @param start the start address of the resource
* @param end the end address of the resource
* @param count XXX end-start+1
*/
struct resource_list_entry *
resource_list_add(struct resource_list *rl, int type, int rid,
u_long start, u_long end, u_long count)
{
struct resource_list_entry *rle;
rle = resource_list_find(rl, type, rid);
if (!rle) {
rle = malloc(sizeof(struct resource_list_entry), M_BUS,
M_NOWAIT);
if (!rle)
panic("resource_list_add: can't record entry");
STAILQ_INSERT_TAIL(rl, rle, link);
rle->type = type;
rle->rid = rid;
rle->res = NULL;
rle->flags = 0;
}
if (rle->res)
panic("resource_list_add: resource entry is busy");
rle->start = start;
rle->end = end;
rle->count = count;
return (rle);
}
/**
* @brief Determine if a resource entry is busy.
*
* Returns true if a resource entry is busy meaning that it has an
* associated resource that is not an unallocated "reserved" resource.
*
* @param rl the resource list to search
* @param type the resource entry type (e.g. SYS_RES_MEMORY)
* @param rid the resource identifier
*
* @returns Non-zero if the entry is busy, zero otherwise.
*/
int
resource_list_busy(struct resource_list *rl, int type, int rid)
{
struct resource_list_entry *rle;
rle = resource_list_find(rl, type, rid);
if (rle == NULL || rle->res == NULL)
return (0);
if ((rle->flags & (RLE_RESERVED | RLE_ALLOCATED)) == RLE_RESERVED) {
KASSERT(!(rman_get_flags(rle->res) & RF_ACTIVE),
("reserved resource is active"));
return (0);
}
return (1);
}
/**
* @brief Determine if a resource entry is reserved.
*
* Returns true if a resource entry is reserved meaning that it has an
* associated "reserved" resource. The resource can either be
* allocated or unallocated.
*
* @param rl the resource list to search
* @param type the resource entry type (e.g. SYS_RES_MEMORY)
* @param rid the resource identifier
*
* @returns Non-zero if the entry is reserved, zero otherwise.
*/
int
resource_list_reserved(struct resource_list *rl, int type, int rid)
{
struct resource_list_entry *rle;
rle = resource_list_find(rl, type, rid);
if (rle != NULL && rle->flags & RLE_RESERVED)
return (1);
return (0);
}
/**
* @brief Find a resource entry by type and rid.
*
* @param rl the resource list to search
* @param type the resource entry type (e.g. SYS_RES_MEMORY)
* @param rid the resource identifier
*
* @returns the resource entry pointer or NULL if there is no such
* entry.
*/
struct resource_list_entry *
resource_list_find(struct resource_list *rl, int type, int rid)
{
struct resource_list_entry *rle;
STAILQ_FOREACH(rle, rl, link) {
if (rle->type == type && rle->rid == rid)
return (rle);
}
return (NULL);
}
/**
* @brief Delete a resource entry.
*
* @param rl the resource list to edit
* @param type the resource entry type (e.g. SYS_RES_MEMORY)
* @param rid the resource identifier
*/
void
resource_list_delete(struct resource_list *rl, int type, int rid)
{
struct resource_list_entry *rle = resource_list_find(rl, type, rid);
if (rle) {
if (rle->res != NULL)
panic("resource_list_delete: resource has not been released");
STAILQ_REMOVE(rl, rle, resource_list_entry, link);
free(rle, M_BUS);
}
}
/**
* @brief Allocate a reserved resource
*
* This can be used by busses to force the allocation of resources
* that are always active in the system even if they are not allocated
* by a driver (e.g. PCI BARs). This function is usually called when
* adding a new child to the bus. The resource is allocated from the
* parent bus when it is reserved. The resource list entry is marked
* with RLE_RESERVED to note that it is a reserved resource.
*
* Subsequent attempts to allocate the resource with
* resource_list_alloc() will succeed the first time and will set
* RLE_ALLOCATED to note that it has been allocated. When a reserved
* resource that has been allocated is released with
* resource_list_release() the resource RLE_ALLOCATED is cleared, but
* the actual resource remains allocated. The resource can be released to
* the parent bus by calling resource_list_unreserve().
*
* @param rl the resource list to allocate from
* @param bus the parent device of @p child
* @param child the device for which the resource is being reserved
* @param type the type of resource to allocate
* @param rid a pointer to the resource identifier
* @param start hint at the start of the resource range - pass
* @c 0UL for any start address
* @param end hint at the end of the resource range - pass
* @c ~0UL for any end address
* @param count hint at the size of range required - pass @c 1
* for any size
* @param flags any extra flags to control the resource
* allocation - see @c RF_XXX flags in
* <sys/rman.h> for details
*
* @returns the resource which was allocated or @c NULL if no
* resource could be allocated
*/
struct resource *
resource_list_reserve(struct resource_list *rl, device_t bus, device_t child,
int type, int *rid, u_long start, u_long end, u_long count, u_int flags)
{
struct resource_list_entry *rle = NULL;
int passthrough = (device_get_parent(child) != bus);
struct resource *r;
if (passthrough)
panic(
"resource_list_reserve() should only be called for direct children");
if (flags & RF_ACTIVE)
panic(
"resource_list_reserve() should only reserve inactive resources");
r = resource_list_alloc(rl, bus, child, type, rid, start, end, count,
flags);
if (r != NULL) {
rle = resource_list_find(rl, type, *rid);
rle->flags |= RLE_RESERVED;
}
return (r);
}
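/*
 * Hypothetical sketch: a bus driver reserving a memory window for a
 * freshly added child so that the range stays allocated from the
 * parent even while no driver is attached (the PCI BAR pattern
 * described above). The rid, start, end and count values are
 * illustrative only.
 *
 * @code
 *	int rid = 0x10;
 *
 *	resource_list_reserve(rl, bus, child, SYS_RES_MEMORY, &rid,
 *	    start, end, count, 0);
 * @endcode
 */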
/**
* @brief Helper function for implementing BUS_ALLOC_RESOURCE()
*
* Implement BUS_ALLOC_RESOURCE() by looking up a resource from the list
* and passing the allocation up to the parent of @p bus. This assumes
* that the first entry of @c device_get_ivars(child) is a struct
* resource_list. This also handles 'passthrough' allocations where a
* child is a remote descendant of bus by passing the allocation up to
* the parent of bus.
*
* Typically, a bus driver would store a list of child resources
* somewhere in the child device's ivars (see device_get_ivars()) and
* its implementation of BUS_ALLOC_RESOURCE() would find that list and
* then call resource_list_alloc() to perform the allocation.
*
* @param rl the resource list to allocate from
* @param bus the parent device of @p child
* @param child the device which is requesting an allocation
* @param type the type of resource to allocate
* @param rid a pointer to the resource identifier
* @param start hint at the start of the resource range - pass
* @c 0UL for any start address
* @param end hint at the end of the resource range - pass
* @c ~0UL for any end address
* @param count hint at the size of range required - pass @c 1
* for any size
* @param flags any extra flags to control the resource
* allocation - see @c RF_XXX flags in
* <sys/rman.h> for details
*
* @returns the resource which was allocated or @c NULL if no
* resource could be allocated
*/
struct resource *
resource_list_alloc(struct resource_list *rl, device_t bus, device_t child,
int type, int *rid, u_long start, u_long end, u_long count, u_int flags)
{
struct resource_list_entry *rle = NULL;
int passthrough = (device_get_parent(child) != bus);
int isdefault = (start == 0UL && end == ~0UL);
if (passthrough) {
return (BUS_ALLOC_RESOURCE(device_get_parent(bus), child,
type, rid, start, end, count, flags));
}
rle = resource_list_find(rl, type, *rid);
if (!rle)
return (NULL); /* no resource of that type/rid */
if (rle->res) {
if (rle->flags & RLE_RESERVED) {
if (rle->flags & RLE_ALLOCATED)
return (NULL);
if ((flags & RF_ACTIVE) &&
bus_activate_resource(child, type, *rid,
rle->res) != 0)
return (NULL);
rle->flags |= RLE_ALLOCATED;
return (rle->res);
}
panic("resource_list_alloc: resource entry is busy");
}
if (isdefault) {
start = rle->start;
count = ulmax(count, rle->count);
end = ulmax(rle->end, start + count - 1);
}
rle->res = BUS_ALLOC_RESOURCE(device_get_parent(bus), child,
type, rid, start, end, count, flags);
/*
* Record the new range.
*/
if (rle->res) {
rle->start = rman_get_start(rle->res);
rle->end = rman_get_end(rle->res);
rle->count = count;
}
return (rle->res);
}
/**
* @brief Helper function for implementing BUS_RELEASE_RESOURCE()
*
* Implement BUS_RELEASE_RESOURCE() using a resource list. Normally
* used with resource_list_alloc().
*
* @param rl the resource list which was allocated from
* @param bus the parent device of @p child
* @param child the device which is requesting a release
* @param type the type of resource to release
* @param rid the resource identifier
* @param res the resource to release
*
* @retval 0 success
* @retval non-zero a standard unix error code indicating what
* error condition prevented the operation
*/
int
resource_list_release(struct resource_list *rl, device_t bus, device_t child,
int type, int rid, struct resource *res)
{
struct resource_list_entry *rle = NULL;
int passthrough = (device_get_parent(child) != bus);
int error;
if (passthrough) {
return (BUS_RELEASE_RESOURCE(device_get_parent(bus), child,
type, rid, res));
}
rle = resource_list_find(rl, type, rid);
if (!rle)
panic("resource_list_release: can't find resource");
if (!rle->res)
panic("resource_list_release: resource entry is not busy");
if (rle->flags & RLE_RESERVED) {
if (rle->flags & RLE_ALLOCATED) {
if (rman_get_flags(res) & RF_ACTIVE) {
error = bus_deactivate_resource(child, type,
rid, res);
if (error)
return (error);
}
rle->flags &= ~RLE_ALLOCATED;
return (0);
}
return (EINVAL);
}
error = BUS_RELEASE_RESOURCE(device_get_parent(bus), child,
type, rid, res);
if (error)
return (error);
rle->res = NULL;
return (0);
}
/**
* @brief Fully release a reserved resource
*
 * Fully releases a resource reserved via resource_list_reserve().
*
* @param rl the resource list which was allocated from
* @param bus the parent device of @p child
* @param child the device whose reserved resource is being released
* @param type the type of resource to release
* @param rid the resource identifier
* @param res the resource to release
*
* @retval 0 success
* @retval non-zero a standard unix error code indicating what
* error condition prevented the operation
*/
int
resource_list_unreserve(struct resource_list *rl, device_t bus, device_t child,
int type, int rid)
{
struct resource_list_entry *rle = NULL;
int passthrough = (device_get_parent(child) != bus);
if (passthrough)
panic(
"resource_list_unreserve() should only be called for direct children");
rle = resource_list_find(rl, type, rid);
if (!rle)
panic("resource_list_unreserve: can't find resource");
if (!(rle->flags & RLE_RESERVED))
return (EINVAL);
if (rle->flags & RLE_ALLOCATED)
return (EBUSY);
rle->flags &= ~RLE_RESERVED;
return (resource_list_release(rl, bus, child, type, rid, rle->res));
}
/**
* @brief Print a description of resources in a resource list
*
* Print all resources of a specified type, for use in BUS_PRINT_CHILD().
* The name is printed if at least one resource of the given type is available.
* The format is used to print resource start and end.
*
* @param rl the resource list to print
* @param name the name of @p type, e.g. @c "memory"
 * @param type the type of resource entry to print
* @param format printf(9) format string to print resource
* start and end values
*
* @returns the number of characters printed
*/
int
resource_list_print_type(struct resource_list *rl, const char *name, int type,
const char *format)
{
struct resource_list_entry *rle;
int printed, retval;
printed = 0;
retval = 0;
/* Yes, this is kinda cheating */
STAILQ_FOREACH(rle, rl, link) {
if (rle->type == type) {
if (printed == 0)
retval += printf(" %s ", name);
else
retval += printf(",");
printed++;
retval += printf(format, rle->start);
if (rle->count > 1) {
retval += printf("-");
retval += printf(format, rle->start +
rle->count - 1);
}
}
}
return (retval);
}
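/*
 * Hypothetical usage from a bus's BUS_PRINT_CHILD() method; the format
 * string controls how the start and end values are rendered:
 *
 * @code
 *	retval += resource_list_print_type(rl, "port", SYS_RES_IOPORT, "%#lx");
 *	retval += resource_list_print_type(rl, "mem", SYS_RES_MEMORY, "%#lx");
 *	retval += resource_list_print_type(rl, "irq", SYS_RES_IRQ, "%ld");
 * @endcode
 */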
/**
* @brief Releases all the resources in a list.
*
* @param rl The resource list to purge.
*
* @returns nothing
*/
void
resource_list_purge(struct resource_list *rl)
{
struct resource_list_entry *rle;
while ((rle = STAILQ_FIRST(rl)) != NULL) {
if (rle->res)
bus_release_resource(rman_get_device(rle->res),
rle->type, rle->rid, rle->res);
STAILQ_REMOVE_HEAD(rl, link);
free(rle, M_BUS);
}
}
device_t
bus_generic_add_child(device_t dev, u_int order, const char *name, int unit)
{
return (device_add_child_ordered(dev, order, name, unit));
}
/**
* @brief Helper function for implementing DEVICE_PROBE()
*
* This function can be used to help implement the DEVICE_PROBE() for
* a bus (i.e. a device which has other devices attached to it). It
* calls the DEVICE_IDENTIFY() method of each driver in the device's
* devclass.
*/
int
bus_generic_probe(device_t dev)
{
devclass_t dc = dev->devclass;
driverlink_t dl;
TAILQ_FOREACH(dl, &dc->drivers, link) {
/*
* If this driver's pass is too high, then ignore it.
* For most drivers in the default pass, this will
* never be true. For early-pass drivers they will
* only call the identify routines of eligible drivers
* when this routine is called. Drivers for later
* passes should have their identify routines called
* on early-pass busses during BUS_NEW_PASS().
*/
if (dl->pass > bus_current_pass)
continue;
DEVICE_IDENTIFY(dl->driver, dev);
}
return (0);
}
/**
* @brief Helper function for implementing DEVICE_ATTACH()
*
* This function can be used to help implement the DEVICE_ATTACH() for
* a bus. It calls device_probe_and_attach() for each of the device's
* children.
*/
int
bus_generic_attach(device_t dev)
{
device_t child;
TAILQ_FOREACH(child, &dev->children, link) {
device_probe_and_attach(child);
}
return (0);
}
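/*
 * Hypothetical sketch of a simple bus DEVICE_ATTACH() method combining
 * the two generic helpers above: ask the eligible drivers to identify
 * children, then probe and attach whatever was added.
 *
 * @code
 *	static int
 *	foobus_attach(device_t dev)
 *	{
 *		bus_generic_probe(dev);
 *		bus_generic_attach(dev);
 *		return (0);
 *	}
 * @endcode
 */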
/**
* @brief Helper function for implementing DEVICE_DETACH()
*
* This function can be used to help implement the DEVICE_DETACH() for
* a bus. It calls device_detach() for each of the device's
* children.
*/
int
bus_generic_detach(device_t dev)
{
device_t child;
int error;
if (dev->state != DS_ATTACHED)
return (EBUSY);
TAILQ_FOREACH(child, &dev->children, link) {
if ((error = device_detach(child)) != 0)
return (error);
}
return (0);
}
/**
* @brief Helper function for implementing DEVICE_SHUTDOWN()
*
* This function can be used to help implement the DEVICE_SHUTDOWN()
* for a bus. It calls device_shutdown() for each of the device's
* children.
*/
int
bus_generic_shutdown(device_t dev)
{
device_t child;
TAILQ_FOREACH(child, &dev->children, link) {
device_shutdown(child);
}
return (0);
}
/**
* @brief Helper function for implementing DEVICE_SUSPEND()
*
* This function can be used to help implement the DEVICE_SUSPEND()
* for a bus. It calls DEVICE_SUSPEND() for each of the device's
* children. If any call to DEVICE_SUSPEND() fails, the suspend
* operation is aborted and any devices which were suspended are
* resumed immediately by calling their DEVICE_RESUME() methods.
*/
int
bus_generic_suspend(device_t dev)
{
int error;
device_t child, child2;
TAILQ_FOREACH(child, &dev->children, link) {
error = DEVICE_SUSPEND(child);
if (error) {
for (child2 = TAILQ_FIRST(&dev->children);
child2 && child2 != child;
child2 = TAILQ_NEXT(child2, link))
DEVICE_RESUME(child2);
return (error);
}
}
return (0);
}
/**
* @brief Helper function for implementing DEVICE_RESUME()
*
* This function can be used to help implement the DEVICE_RESUME() for
* a bus. It calls DEVICE_RESUME() on each of the device's children.
*/
int
bus_generic_resume(device_t dev)
{
device_t child;
TAILQ_FOREACH(child, &dev->children, link) {
DEVICE_RESUME(child);
/* if resume fails, there's nothing we can usefully do... */
}
return (0);
}
/**
* @brief Helper function for implementing BUS_PRINT_CHILD().
*
* This function prints the first part of the ascii representation of
* @p child, including its name, unit and description (if any - see
* device_set_desc()).
*
* @returns the number of characters printed
*/
int
bus_print_child_header(device_t dev, device_t child)
{
int retval = 0;
if (device_get_desc(child)) {
retval += device_printf(child, "<%s>", device_get_desc(child));
} else {
retval += printf("%s", device_get_nameunit(child));
}
return (retval);
}
/**
* @brief Helper function for implementing BUS_PRINT_CHILD().
*
* This function prints the last part of the ascii representation of
* @p child, which consists of the string @c " on " followed by the
* name and unit of the @p dev.
*
* @returns the number of characters printed
*/
int
bus_print_child_footer(device_t dev, device_t child)
{
return (printf(" on %s\n", device_get_nameunit(dev)));
}
/**
* @brief Helper function for implementing BUS_PRINT_CHILD().
*
* This function simply calls bus_print_child_header() followed by
* bus_print_child_footer().
*
* @returns the number of characters printed
*/
int
bus_generic_print_child(device_t dev, device_t child)
{
int retval = 0;
retval += bus_print_child_header(dev, child);
retval += bus_print_child_footer(dev, child);
return (retval);
}
/**
* @brief Stub function for implementing BUS_READ_IVAR().
*
* @returns ENOENT
*/
int
bus_generic_read_ivar(device_t dev, device_t child, int index,
uintptr_t * result)
{
return (ENOENT);
}
/**
* @brief Stub function for implementing BUS_WRITE_IVAR().
*
* @returns ENOENT
*/
int
bus_generic_write_ivar(device_t dev, device_t child, int index,
uintptr_t value)
{
return (ENOENT);
}
/**
* @brief Stub function for implementing BUS_GET_RESOURCE_LIST().
*
* @returns NULL
*/
struct resource_list *
bus_generic_get_resource_list(device_t dev, device_t child)
{
return (NULL);
}
/**
* @brief Helper function for implementing BUS_DRIVER_ADDED().
*
* This implementation of BUS_DRIVER_ADDED() simply calls the driver's
* DEVICE_IDENTIFY() method to allow it to add new children to the bus
* and then calls device_probe_and_attach() for each unattached child.
*/
void
bus_generic_driver_added(device_t dev, driver_t *driver)
{
device_t child;
DEVICE_IDENTIFY(driver, dev);
TAILQ_FOREACH(child, &dev->children, link) {
if (child->state == DS_NOTPRESENT ||
(child->flags & DF_REBID))
device_probe_and_attach(child);
}
}
/**
* @brief Helper function for implementing BUS_NEW_PASS().
*
 * This implementation of BUS_NEW_PASS() first calls the identify
* routines for any drivers that probe at the current pass. Then it
* walks the list of devices for this bus. If a device is already
* attached, then it calls BUS_NEW_PASS() on that device. If the
* device is not already attached, it attempts to attach a driver to
* it.
*/
void
bus_generic_new_pass(device_t dev)
{
driverlink_t dl;
devclass_t dc;
device_t child;
dc = dev->devclass;
TAILQ_FOREACH(dl, &dc->drivers, link) {
if (dl->pass == bus_current_pass)
DEVICE_IDENTIFY(dl->driver, dev);
}
TAILQ_FOREACH(child, &dev->children, link) {
if (child->state >= DS_ATTACHED)
BUS_NEW_PASS(child);
else if (child->state == DS_NOTPRESENT)
device_probe_and_attach(child);
}
}
/**
* @brief Helper function for implementing BUS_SETUP_INTR().
*
* This simple implementation of BUS_SETUP_INTR() simply calls the
* BUS_SETUP_INTR() method of the parent of @p dev.
*/
int
bus_generic_setup_intr(device_t dev, device_t child, struct resource *irq,
int flags, driver_filter_t *filter, driver_intr_t *intr, void *arg,
void **cookiep)
{
/* Propagate up the bus hierarchy until someone handles it. */
if (dev->parent)
return (BUS_SETUP_INTR(dev->parent, child, irq, flags,
filter, intr, arg, cookiep));
return (EINVAL);
}
/**
* @brief Helper function for implementing BUS_TEARDOWN_INTR().
*
* This simple implementation of BUS_TEARDOWN_INTR() simply calls the
* BUS_TEARDOWN_INTR() method of the parent of @p dev.
*/
int
bus_generic_teardown_intr(device_t dev, device_t child, struct resource *irq,
void *cookie)
{
/* Propagate up the bus hierarchy until someone handles it. */
if (dev->parent)
return (BUS_TEARDOWN_INTR(dev->parent, child, irq, cookie));
return (EINVAL);
}
/**
* @brief Helper function for implementing BUS_ADJUST_RESOURCE().
*
* This simple implementation of BUS_ADJUST_RESOURCE() simply calls the
* BUS_ADJUST_RESOURCE() method of the parent of @p dev.
*/
int
bus_generic_adjust_resource(device_t dev, device_t child, int type,
struct resource *r, u_long start, u_long end)
{
/* Propagate up the bus hierarchy until someone handles it. */
if (dev->parent)
return (BUS_ADJUST_RESOURCE(dev->parent, child, type, r, start,
end));
return (EINVAL);
}
/**
* @brief Helper function for implementing BUS_ALLOC_RESOURCE().
*
* This simple implementation of BUS_ALLOC_RESOURCE() simply calls the
* BUS_ALLOC_RESOURCE() method of the parent of @p dev.
*/
struct resource *
bus_generic_alloc_resource(device_t dev, device_t child, int type, int *rid,
u_long start, u_long end, u_long count, u_int flags)
{
/* Propagate up the bus hierarchy until someone handles it. */
if (dev->parent)
return (BUS_ALLOC_RESOURCE(dev->parent, child, type, rid,
start, end, count, flags));
return (NULL);
}
/**
* @brief Helper function for implementing BUS_RELEASE_RESOURCE().
*
* This simple implementation of BUS_RELEASE_RESOURCE() simply calls the
* BUS_RELEASE_RESOURCE() method of the parent of @p dev.
*/
int
bus_generic_release_resource(device_t dev, device_t child, int type, int rid,
struct resource *r)
{
/* Propagate up the bus hierarchy until someone handles it. */
if (dev->parent)
return (BUS_RELEASE_RESOURCE(dev->parent, child, type, rid,
r));
return (EINVAL);
}
/**
* @brief Helper function for implementing BUS_ACTIVATE_RESOURCE().
*
* This simple implementation of BUS_ACTIVATE_RESOURCE() simply calls the
* BUS_ACTIVATE_RESOURCE() method of the parent of @p dev.
*/
int
bus_generic_activate_resource(device_t dev, device_t child, int type, int rid,
struct resource *r)
{
/* Propagate up the bus hierarchy until someone handles it. */
if (dev->parent)
return (BUS_ACTIVATE_RESOURCE(dev->parent, child, type, rid,
r));
return (EINVAL);
}
/**
* @brief Helper function for implementing BUS_DEACTIVATE_RESOURCE().
*
* This simple implementation of BUS_DEACTIVATE_RESOURCE() simply calls the
* BUS_DEACTIVATE_RESOURCE() method of the parent of @p dev.
*/
int
bus_generic_deactivate_resource(device_t dev, device_t child, int type,
int rid, struct resource *r)
{
/* Propagate up the bus hierarchy until someone handles it. */
if (dev->parent)
return (BUS_DEACTIVATE_RESOURCE(dev->parent, child, type, rid,
r));
return (EINVAL);
}
/**
* @brief Helper function for implementing BUS_BIND_INTR().
*
* This simple implementation of BUS_BIND_INTR() simply calls the
* BUS_BIND_INTR() method of the parent of @p dev.
*/
int
bus_generic_bind_intr(device_t dev, device_t child, struct resource *irq,
int cpu)
{
/* Propagate up the bus hierarchy until someone handles it. */
if (dev->parent)
return (BUS_BIND_INTR(dev->parent, child, irq, cpu));
return (EINVAL);
}
/**
* @brief Helper function for implementing BUS_CONFIG_INTR().
*
* This simple implementation of BUS_CONFIG_INTR() simply calls the
* BUS_CONFIG_INTR() method of the parent of @p dev.
*/
int
bus_generic_config_intr(device_t dev, int irq, enum intr_trigger trig,
enum intr_polarity pol)
{
/* Propagate up the bus hierarchy until someone handles it. */
if (dev->parent)
return (BUS_CONFIG_INTR(dev->parent, irq, trig, pol));
return (EINVAL);
}
/**
* @brief Helper function for implementing BUS_DESCRIBE_INTR().
*
* This simple implementation of BUS_DESCRIBE_INTR() simply calls the
* BUS_DESCRIBE_INTR() method of the parent of @p dev.
*/
int
bus_generic_describe_intr(device_t dev, device_t child, struct resource *irq,
void *cookie, const char *descr)
{
/* Propagate up the bus hierarchy until someone handles it. */
if (dev->parent)
return (BUS_DESCRIBE_INTR(dev->parent, child, irq, cookie,
descr));
return (EINVAL);
}
/**
* @brief Helper function for implementing BUS_GET_DMA_TAG().
*
* This simple implementation of BUS_GET_DMA_TAG() simply calls the
* BUS_GET_DMA_TAG() method of the parent of @p dev.
*/
bus_dma_tag_t
bus_generic_get_dma_tag(device_t dev, device_t child)
{
/* Propagate up the bus hierarchy until someone handles it. */
if (dev->parent != NULL)
return (BUS_GET_DMA_TAG(dev->parent, child));
return (NULL);
}
/**
* @brief Helper function for implementing BUS_GET_RESOURCE().
*
* This implementation of BUS_GET_RESOURCE() uses the
* resource_list_find() function to do most of the work. It calls
* BUS_GET_RESOURCE_LIST() to find a suitable resource list to
* search.
*/
int
bus_generic_rl_get_resource(device_t dev, device_t child, int type, int rid,
u_long *startp, u_long *countp)
{
struct resource_list * rl = NULL;
struct resource_list_entry * rle = NULL;
rl = BUS_GET_RESOURCE_LIST(dev, child);
if (!rl)
return (EINVAL);
rle = resource_list_find(rl, type, rid);
if (!rle)
return (ENOENT);
if (startp)
*startp = rle->start;
if (countp)
*countp = rle->count;
return (0);
}
/**
* @brief Helper function for implementing BUS_SET_RESOURCE().
*
* This implementation of BUS_SET_RESOURCE() uses the
* resource_list_add() function to do most of the work. It calls
* BUS_GET_RESOURCE_LIST() to find a suitable resource list to
* edit.
*/
int
bus_generic_rl_set_resource(device_t dev, device_t child, int type, int rid,
u_long start, u_long count)
{
struct resource_list * rl = NULL;
rl = BUS_GET_RESOURCE_LIST(dev, child);
if (!rl)
return (EINVAL);
resource_list_add(rl, type, rid, start, (start + count - 1), count);
return (0);
}
/**
* @brief Helper function for implementing BUS_DELETE_RESOURCE().
*
* This implementation of BUS_DELETE_RESOURCE() uses the
* resource_list_delete() function to do most of the work. It calls
* BUS_GET_RESOURCE_LIST() to find a suitable resource list to
* edit.
*/
void
bus_generic_rl_delete_resource(device_t dev, device_t child, int type, int rid)
{
struct resource_list * rl = NULL;
rl = BUS_GET_RESOURCE_LIST(dev, child);
if (!rl)
return;
resource_list_delete(rl, type, rid);
return;
}
/**
* @brief Helper function for implementing BUS_RELEASE_RESOURCE().
*
* This implementation of BUS_RELEASE_RESOURCE() uses the
* resource_list_release() function to do most of the work. It calls
* BUS_GET_RESOURCE_LIST() to find a suitable resource list.
*/
int
bus_generic_rl_release_resource(device_t dev, device_t child, int type,
int rid, struct resource *r)
{
struct resource_list * rl = NULL;
if (device_get_parent(child) != dev)
return (BUS_RELEASE_RESOURCE(device_get_parent(dev), child,
type, rid, r));
rl = BUS_GET_RESOURCE_LIST(dev, child);
if (!rl)
return (EINVAL);
return (resource_list_release(rl, dev, child, type, rid, r));
}
/**
* @brief Helper function for implementing BUS_ALLOC_RESOURCE().
*
* This implementation of BUS_ALLOC_RESOURCE() uses the
* resource_list_alloc() function to do most of the work. It calls
* BUS_GET_RESOURCE_LIST() to find a suitable resource list.
*/
struct resource *
bus_generic_rl_alloc_resource(device_t dev, device_t child, int type,
int *rid, u_long start, u_long end, u_long count, u_int flags)
{
struct resource_list * rl = NULL;
if (device_get_parent(child) != dev)
return (BUS_ALLOC_RESOURCE(device_get_parent(dev), child,
type, rid, start, end, count, flags));
rl = BUS_GET_RESOURCE_LIST(dev, child);
if (!rl)
return (NULL);
return (resource_list_alloc(rl, dev, child, type, rid,
start, end, count, flags));
}
/**
* @brief Helper function for implementing BUS_CHILD_PRESENT().
*
* This simple implementation of BUS_CHILD_PRESENT() simply calls the
* BUS_CHILD_PRESENT() method of the parent of @p dev.
*/
int
bus_generic_child_present(device_t dev, device_t child)
{
return (BUS_CHILD_PRESENT(device_get_parent(dev), dev));
}
/*
* Some convenience functions to make it easier for drivers to use the
* resource-management functions. All these really do is hide the
* indirection through the parent's method table, making for slightly
* less-wordy code. In the future, it might make sense for this code
* to maintain some sort of a list of resources allocated by each device.
*/
int
bus_alloc_resources(device_t dev, struct resource_spec *rs,
struct resource **res)
{
int i;
for (i = 0; rs[i].type != -1; i++)
res[i] = NULL;
for (i = 0; rs[i].type != -1; i++) {
res[i] = bus_alloc_resource_any(dev,
rs[i].type, &rs[i].rid, rs[i].flags);
if (res[i] == NULL && !(rs[i].flags & RF_OPTIONAL)) {
bus_release_resources(dev, rs, res);
return (ENXIO);
}
}
return (0);
}
void
bus_release_resources(device_t dev, const struct resource_spec *rs,
struct resource **res)
{
int i;
for (i = 0; rs[i].type != -1; i++)
if (res[i] != NULL) {
bus_release_resource(
dev, rs[i].type, rs[i].rid, res[i]);
res[i] = NULL;
}
}
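/*
 * Hypothetical sketch of the resource_spec pattern supported by the
 * two helpers above: declare the resources a driver needs once, then
 * allocate and release them as a group. The BAR and IRQ choices are
 * illustrative only.
 *
 * @code
 *	static struct resource_spec foo_res_spec[] = {
 *		{ SYS_RES_MEMORY, PCIR_BAR(0), RF_ACTIVE },
 *		{ SYS_RES_IRQ, 0, RF_ACTIVE | RF_SHAREABLE },
 *		{ -1, 0, 0 }
 *	};
 *	static struct resource *foo_res[2];
 *
 *	if (bus_alloc_resources(dev, foo_res_spec, foo_res) != 0)
 *		return (ENXIO);
 *	// ... use foo_res[0] and foo_res[1] ...
 *	bus_release_resources(dev, foo_res_spec, foo_res);
 * @endcode
 */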
/**
* @brief Wrapper function for BUS_ALLOC_RESOURCE().
*
* This function simply calls the BUS_ALLOC_RESOURCE() method of the
* parent of @p dev.
*/
struct resource *
bus_alloc_resource(device_t dev, int type, int *rid, u_long start, u_long end,
u_long count, u_int flags)
{
if (dev->parent == NULL)
return (NULL);
return (BUS_ALLOC_RESOURCE(dev->parent, dev, type, rid, start, end,
count, flags));
}
/**
* @brief Wrapper function for BUS_ADJUST_RESOURCE().
*
* This function simply calls the BUS_ADJUST_RESOURCE() method of the
* parent of @p dev.
*/
int
bus_adjust_resource(device_t dev, int type, struct resource *r, u_long start,
u_long end)
{
if (dev->parent == NULL)
return (EINVAL);
return (BUS_ADJUST_RESOURCE(dev->parent, dev, type, r, start, end));
}
/**
* @brief Wrapper function for BUS_ACTIVATE_RESOURCE().
*
* This function simply calls the BUS_ACTIVATE_RESOURCE() method of the
* parent of @p dev.
*/
int
bus_activate_resource(device_t dev, int type, int rid, struct resource *r)
{
if (dev->parent == NULL)
return (EINVAL);
return (BUS_ACTIVATE_RESOURCE(dev->parent, dev, type, rid, r));
}
/**
* @brief Wrapper function for BUS_DEACTIVATE_RESOURCE().
*
* This function simply calls the BUS_DEACTIVATE_RESOURCE() method of the
* parent of @p dev.
*/
int
bus_deactivate_resource(device_t dev, int type, int rid, struct resource *r)
{
if (dev->parent == NULL)
return (EINVAL);
return (BUS_DEACTIVATE_RESOURCE(dev->parent, dev, type, rid, r));
}
/**
* @brief Wrapper function for BUS_RELEASE_RESOURCE().
*
* This function simply calls the BUS_RELEASE_RESOURCE() method of the
* parent of @p dev.
*/
int
bus_release_resource(device_t dev, int type, int rid, struct resource *r)
{
if (dev->parent == NULL)
return (EINVAL);
return (BUS_RELEASE_RESOURCE(dev->parent, dev, type, rid, r));
}
/**
* @brief Wrapper function for BUS_SETUP_INTR().
*
* This function simply calls the BUS_SETUP_INTR() method of the
* parent of @p dev.
*/
int
bus_setup_intr(device_t dev, struct resource *r, int flags,
driver_filter_t filter, driver_intr_t handler, void *arg, void **cookiep)
{
int error;
if (dev->parent == NULL)
return (EINVAL);
error = BUS_SETUP_INTR(dev->parent, dev, r, flags, filter, handler,
arg, cookiep);
if (error != 0)
return (error);
if (handler != NULL && !(flags & INTR_MPSAFE))
device_printf(dev, "[GIANT-LOCKED]\n");
return (0);
}
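/*
 * Hypothetical sketch: a driver hooking up an MPSAFE interrupt handler
 * through the wrapper above. The irq resource would normally have been
 * obtained with bus_alloc_resource_any(); foo_intr and sc are purely
 * illustrative.
 *
 * @code
 *	error = bus_setup_intr(dev, sc->irq_res, INTR_TYPE_MISC | INTR_MPSAFE,
 *	    NULL, foo_intr, sc, &sc->intrhand);
 *	if (error != 0)
 *		device_printf(dev, "could not set up interrupt\n");
 * @endcode
 */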
/**
* @brief Wrapper function for BUS_TEARDOWN_INTR().
*
* This function simply calls the BUS_TEARDOWN_INTR() method of the
* parent of @p dev.
*/
int
bus_teardown_intr(device_t dev, struct resource *r, void *cookie)
{
if (dev->parent == NULL)
return (EINVAL);
return (BUS_TEARDOWN_INTR(dev->parent, dev, r, cookie));
}
/**
* @brief Wrapper function for BUS_BIND_INTR().
*
* This function simply calls the BUS_BIND_INTR() method of the
* parent of @p dev.
*/
int
bus_bind_intr(device_t dev, struct resource *r, int cpu)
{
if (dev->parent == NULL)
return (EINVAL);
return (BUS_BIND_INTR(dev->parent, dev, r, cpu));
}
/**
* @brief Wrapper function for BUS_DESCRIBE_INTR().
*
* This function first formats the requested description into a
* temporary buffer and then calls the BUS_DESCRIBE_INTR() method of
* the parent of @p dev.
*/
int
bus_describe_intr(device_t dev, struct resource *irq, void *cookie,
const char *fmt, ...)
{
va_list ap;
char descr[MAXCOMLEN + 1];
if (dev->parent == NULL)
return (EINVAL);
va_start(ap, fmt);
vsnprintf(descr, sizeof(descr), fmt, ap);
va_end(ap);
return (BUS_DESCRIBE_INTR(dev->parent, dev, irq, cookie, descr));
}
/**
* @brief Wrapper function for BUS_SET_RESOURCE().
*
* This function simply calls the BUS_SET_RESOURCE() method of the
* parent of @p dev.
*/
int
bus_set_resource(device_t dev, int type, int rid,
u_long start, u_long count)
{
return (BUS_SET_RESOURCE(device_get_parent(dev), dev, type, rid,
start, count));
}
/**
* @brief Wrapper function for BUS_GET_RESOURCE().
*
* This function simply calls the BUS_GET_RESOURCE() method of the
* parent of @p dev.
*/
int
bus_get_resource(device_t dev, int type, int rid,
u_long *startp, u_long *countp)
{
return (BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid,
startp, countp));
}
/**
* @brief Wrapper function for BUS_GET_RESOURCE().
*
* This function simply calls the BUS_GET_RESOURCE() method of the
* parent of @p dev and returns the start value.
*/
u_long
bus_get_resource_start(device_t dev, int type, int rid)
{
u_long start, count;
int error;
error = BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid,
&start, &count);
if (error)
return (0);
return (start);
}
/**
* @brief Wrapper function for BUS_GET_RESOURCE().
*
* This function simply calls the BUS_GET_RESOURCE() method of the
* parent of @p dev and returns the count value.
*/
u_long
bus_get_resource_count(device_t dev, int type, int rid)
{
u_long start, count;
int error;
error = BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid,
&start, &count);
if (error)
return (0);
return (count);
}
/**
* @brief Wrapper function for BUS_DELETE_RESOURCE().
*
* This function simply calls the BUS_DELETE_RESOURCE() method of the
* parent of @p dev.
*/
void
bus_delete_resource(device_t dev, int type, int rid)
{
BUS_DELETE_RESOURCE(device_get_parent(dev), dev, type, rid);
}
/**
* @brief Wrapper function for BUS_CHILD_PRESENT().
*
* This function simply calls the BUS_CHILD_PRESENT() method of the
* parent of @p dev.
*/
int
bus_child_present(device_t child)
{
return (BUS_CHILD_PRESENT(device_get_parent(child), child));
}
/**
* @brief Wrapper function for BUS_CHILD_PNPINFO_STR().
*
* This function simply calls the BUS_CHILD_PNPINFO_STR() method of the
* parent of @p dev.
*/
int
bus_child_pnpinfo_str(device_t child, char *buf, size_t buflen)
{
device_t parent;
parent = device_get_parent(child);
if (parent == NULL) {
*buf = '\0';
return (0);
}
return (BUS_CHILD_PNPINFO_STR(parent, child, buf, buflen));
}
/**
* @brief Wrapper function for BUS_CHILD_LOCATION_STR().
*
* This function simply calls the BUS_CHILD_LOCATION_STR() method of the
* parent of @p dev.
*/
int
bus_child_location_str(device_t child, char *buf, size_t buflen)
{
device_t parent;
parent = device_get_parent(child);
if (parent == NULL) {
*buf = '\0';
return (0);
}
return (BUS_CHILD_LOCATION_STR(parent, child, buf, buflen));
}
/**
* @brief Wrapper function for BUS_GET_DMA_TAG().
*
* This function simply calls the BUS_GET_DMA_TAG() method of the
* parent of @p dev.
*/
bus_dma_tag_t
bus_get_dma_tag(device_t dev)
{
device_t parent;
parent = device_get_parent(dev);
if (parent == NULL)
return (NULL);
return (BUS_GET_DMA_TAG(parent, dev));
}
/* Resume all devices and then notify userland that we're up again. */
static int
root_resume(device_t dev)
{
int error;
error = bus_generic_resume(dev);
if (error == 0)
devctl_notify("kern", "power", "resume", NULL);
return (error);
}
static int
root_print_child(device_t dev, device_t child)
{
int retval = 0;
retval += bus_print_child_header(dev, child);
retval += printf("\n");
return (retval);
}
static int
root_setup_intr(device_t dev, device_t child, struct resource *irq, int flags,
driver_filter_t *filter, driver_intr_t *intr, void *arg, void **cookiep)
{
/*
* If an interrupt mapping gets to here something bad has happened.
*/
panic("root_setup_intr");
}
/*
 * If we get here, assume that the device is permanent and really is
* present in the system. Removable bus drivers are expected to intercept
* this call long before it gets here. We return -1 so that drivers that
* really care can check vs -1 or some ERRNO returned higher in the food
* chain.
*/
static int
root_child_present(device_t dev, device_t child)
{
return (-1);
}
static kobj_method_t root_methods[] = {
/* Device interface */
KOBJMETHOD(device_shutdown, bus_generic_shutdown),
KOBJMETHOD(device_suspend, bus_generic_suspend),
KOBJMETHOD(device_resume, root_resume),
/* Bus interface */
KOBJMETHOD(bus_print_child, root_print_child),
KOBJMETHOD(bus_read_ivar, bus_generic_read_ivar),
KOBJMETHOD(bus_write_ivar, bus_generic_write_ivar),
KOBJMETHOD(bus_setup_intr, root_setup_intr),
KOBJMETHOD(bus_child_present, root_child_present),
KOBJMETHOD_END
};
static driver_t root_driver = {
"root",
root_methods,
1, /* no softc */
};
device_t root_bus;
devclass_t root_devclass;
static int
root_bus_module_handler(module_t mod, int what, void* arg)
{
switch (what) {
case MOD_LOAD:
TAILQ_INIT(&bus_data_devices);
kobj_class_compile((kobj_class_t) &root_driver);
root_bus = make_device(NULL, "root", 0);
root_bus->desc = "System root bus";
kobj_init((kobj_t) root_bus, (kobj_class_t) &root_driver);
root_bus->driver = &root_driver;
root_bus->state = DS_ATTACHED;
root_devclass = devclass_find_internal("root", NULL, FALSE);
devinit();
return (0);
case MOD_SHUTDOWN:
device_shutdown(root_bus);
return (0);
default:
return (EOPNOTSUPP);
}
return (0);
}
static moduledata_t root_bus_mod = {
"rootbus",
root_bus_module_handler,
NULL
};
DECLARE_MODULE(rootbus, root_bus_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
/**
* @brief Automatically configure devices
*
* This function begins the autoconfiguration process by calling
* device_probe_and_attach() for each child of the @c root0 device.
*/
void
root_bus_configure(void)
{
PDEBUG(("."));
/* Eventually this will be split up, but this is sufficient for now. */
bus_set_pass(BUS_PASS_DEFAULT);
}
/**
* @brief Module handler for registering device drivers
*
* This module handler is used to automatically register device
* drivers when modules are loaded. If @p what is MOD_LOAD, it calls
* devclass_add_driver() for the driver described by the
* driver_module_data structure pointed to by @p arg
*/
int
driver_module_handler(module_t mod, int what, void *arg)
{
struct driver_module_data *dmd;
devclass_t bus_devclass;
kobj_class_t driver;
int error, pass;
dmd = (struct driver_module_data *)arg;
bus_devclass = devclass_find_internal(dmd->dmd_busname, NULL, TRUE);
error = 0;
switch (what) {
case MOD_LOAD:
if (dmd->dmd_chainevh)
error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg);
pass = dmd->dmd_pass;
driver = dmd->dmd_driver;
PDEBUG(("Loading module: driver %s on bus %s (pass %d)",
DRIVERNAME(driver), dmd->dmd_busname, pass));
error = devclass_add_driver(bus_devclass, driver, pass,
dmd->dmd_devclass);
break;
case MOD_UNLOAD:
PDEBUG(("Unloading module: driver %s from bus %s",
DRIVERNAME(dmd->dmd_driver),
dmd->dmd_busname));
error = devclass_delete_driver(bus_devclass,
dmd->dmd_driver);
if (!error && dmd->dmd_chainevh)
error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg);
break;
case MOD_QUIESCE:
PDEBUG(("Quiesce module: driver %s from bus %s",
DRIVERNAME(dmd->dmd_driver),
dmd->dmd_busname));
error = devclass_quiesce_driver(bus_devclass,
dmd->dmd_driver);
if (!error && dmd->dmd_chainevh)
error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg);
break;
default:
error = EOPNOTSUPP;
break;
}
return (error);
}
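/*
 * Drivers normally reach this handler through the DRIVER_MODULE()
 * macro rather than calling it directly. A hypothetical registration
 * of a driver "foo" on the pci bus would look like:
 *
 * @code
 *	static devclass_t foo_devclass;
 *
 *	DRIVER_MODULE(foo, pci, foo_driver, foo_devclass, NULL, NULL);
 * @endcode
 */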
/**
* @brief Enumerate all hinted devices for this bus.
*
* Walks through the hints for this bus and calls the bus_hinted_child
 * routine for each one it finds. It searches first for the specific
 * bus that's being probed for hinted children (e.g. isa0), and then for
 * generic children (e.g. isa).
*
* @param dev bus device to enumerate
*/
void
bus_enumerate_hinted_children(device_t bus)
{
int i;
const char *dname, *busname;
int dunit;
/*
* enumerate all devices on the specific bus
*/
busname = device_get_nameunit(bus);
i = 0;
while (resource_find_match(&i, &dname, &dunit, "at", busname) == 0)
BUS_HINTED_CHILD(bus, dname, dunit);
/*
* and all the generic ones.
*/
busname = device_get_name(bus);
i = 0;
while (resource_find_match(&i, &dname, &dunit, "at", busname) == 0)
BUS_HINTED_CHILD(bus, dname, dunit);
}
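/*
 * The hints consumed above come from device.hints(5). A hypothetical
 * entry wiring a child "foo0" to isa0 would look like:
 *
 * @code
 *	hint.foo.0.at="isa0"
 *	hint.foo.0.port="0x300"
 * @endcode
 */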
#ifdef BUS_DEBUG
/* the _short versions avoid iteration by not calling anything that prints
* more than oneliners. I love oneliners.
*/
static void
print_device_short(device_t dev, int indent)
{
if (!dev)
return;
indentprintf(("device %d: <%s> %sparent,%schildren,%s%s%s%s%s,%sivars,%ssoftc,busy=%d\n",
dev->unit, dev->desc,
(dev->parent? "":"no "),
(TAILQ_EMPTY(&dev->children)? "no ":""),
(dev->flags&DF_ENABLED? "enabled,":"disabled,"),
(dev->flags&DF_FIXEDCLASS? "fixed,":""),
(dev->flags&DF_WILDCARD? "wildcard,":""),
(dev->flags&DF_DESCMALLOCED? "descmalloced,":""),
(dev->flags&DF_REBID? "rebiddable,":""),
(dev->ivars? "":"no "),
(dev->softc? "":"no "),
dev->busy));
}
static void
print_device(device_t dev, int indent)
{
if (!dev)
return;
print_device_short(dev, indent);
indentprintf(("Parent:\n"));
print_device_short(dev->parent, indent+1);
indentprintf(("Driver:\n"));
print_driver_short(dev->driver, indent+1);
indentprintf(("Devclass:\n"));
print_devclass_short(dev->devclass, indent+1);
}
void
print_device_tree_short(device_t dev, int indent)
/* print the device and all its children (indented) */
{
device_t child;
if (!dev)
return;
print_device_short(dev, indent);
TAILQ_FOREACH(child, &dev->children, link) {
print_device_tree_short(child, indent+1);
}
}
void
print_device_tree(device_t dev, int indent)
/* print the device and all its children (indented) */
{
device_t child;
if (!dev)
return;
print_device(dev, indent);
TAILQ_FOREACH(child, &dev->children, link) {
print_device_tree(child, indent+1);
}
}
static void
print_driver_short(driver_t *driver, int indent)
{
if (!driver)
return;
indentprintf(("driver %s: softc size = %zd\n",
driver->name, driver->size));
}
static void
print_driver(driver_t *driver, int indent)
{
if (!driver)
return;
print_driver_short(driver, indent);
}
static void
print_driver_list(driver_list_t drivers, int indent)
{
driverlink_t driver;
TAILQ_FOREACH(driver, &drivers, link) {
print_driver(driver->driver, indent);
}
}
static void
print_devclass_short(devclass_t dc, int indent)
{
if ( !dc )
return;
indentprintf(("devclass %s: max units = %d\n", dc->name, dc->maxunit));
}
static void
print_devclass(devclass_t dc, int indent)
{
int i;
if ( !dc )
return;
print_devclass_short(dc, indent);
indentprintf(("Drivers:\n"));
print_driver_list(dc->drivers, indent+1);
indentprintf(("Devices:\n"));
for (i = 0; i < dc->maxunit; i++)
if (dc->devices[i])
print_device(dc->devices[i], indent+1);
}
void
print_devclass_list_short(void)
{
devclass_t dc;
printf("Short listing of devclasses, drivers & devices:\n");
TAILQ_FOREACH(dc, &devclasses, link) {
print_devclass_short(dc, 0);
}
}
void
print_devclass_list(void)
{
devclass_t dc;
printf("Full listing of devclasses, drivers & devices:\n");
TAILQ_FOREACH(dc, &devclasses, link) {
print_devclass(dc, 0);
}
}
#endif
/*
* User-space access to the device tree.
*
* We implement a small set of nodes:
*
* hw.bus Single integer read method to obtain the
* current generation count.
* hw.bus.devices Reads the entire device tree in flat space.
* hw.bus.rman Resource manager interface
*
* We might like to add the ability to scan devclasses and/or drivers to
* determine what else is currently loaded/available.
*/
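/*
 * Illustrative sketch of how a userland consumer is expected to use these
 * nodes (devinfo(8)/libdevinfo work roughly this way): read the bus info
 * node to obtain the current generation count, then iterate hw.bus.devices
 * with {generation, index} name pairs until ENOENT is returned; if the
 * generation changes mid-scan, the snapshot is stale and the walk should
 * restart from the top.
 */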
static int
sysctl_bus(SYSCTL_HANDLER_ARGS)
{
struct u_businfo ubus;
ubus.ub_version = BUS_USER_VERSION;
ubus.ub_generation = bus_data_generation;
return (SYSCTL_OUT(req, &ubus, sizeof(ubus)));
}
SYSCTL_NODE(_hw_bus, OID_AUTO, info, CTLFLAG_RW, sysctl_bus,
"bus-related data");
static int
sysctl_devices(SYSCTL_HANDLER_ARGS)
{
int *name = (int *)arg1;
u_int namelen = arg2;
int index;
struct device *dev;
struct u_device udev; /* XXX this is a bit big */
int error;
if (namelen != 2)
return (EINVAL);
if (bus_data_generation_check(name[0]))
return (EINVAL);
index = name[1];
/*
* Scan the list of devices, looking for the requested index.
*/
TAILQ_FOREACH(dev, &bus_data_devices, devlink) {
if (index-- == 0)
break;
}
if (dev == NULL)
return (ENOENT);
/*
* Populate the return array.
*/
bzero(&udev, sizeof(udev));
udev.dv_handle = (uintptr_t)dev;
udev.dv_parent = (uintptr_t)dev->parent;
if (dev->nameunit != NULL)
strlcpy(udev.dv_name, dev->nameunit, sizeof(udev.dv_name));
if (dev->desc != NULL)
strlcpy(udev.dv_desc, dev->desc, sizeof(udev.dv_desc));
if (dev->driver != NULL && dev->driver->name != NULL)
strlcpy(udev.dv_drivername, dev->driver->name,
sizeof(udev.dv_drivername));
bus_child_pnpinfo_str(dev, udev.dv_pnpinfo, sizeof(udev.dv_pnpinfo));
bus_child_location_str(dev, udev.dv_location, sizeof(udev.dv_location));
udev.dv_devflags = dev->devflags;
udev.dv_flags = dev->flags;
udev.dv_state = dev->state;
error = SYSCTL_OUT(req, &udev, sizeof(udev));
return (error);
}
SYSCTL_NODE(_hw_bus, OID_AUTO, devices, CTLFLAG_RD, sysctl_devices,
"system device tree");
int
bus_data_generation_check(int generation)
{
if (generation != bus_data_generation)
return (1);
/* XXX generate optimised lists here? */
return (0);
}
void
bus_data_generation_update(void)
{
bus_data_generation++;
}
int
bus_free_resource(device_t dev, int type, struct resource *r)
{
if (r == NULL)
return (0);
return (bus_release_resource(dev, type, rman_get_rid(r), r));
}
Index: head/sys/kern/subr_prof.c
===================================================================
--- head/sys/kern/subr_prof.c (revision 225616)
+++ head/sys/kern/subr_prof.c (revision 225617)
@@ -1,589 +1,589 @@
/*-
* Copyright (c) 1982, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)subr_prof.c 8.3 (Berkeley) 9/23/93
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sysctl.h>
#include <machine/cpu.h>
#ifdef GPROF
#include <sys/malloc.h>
#include <sys/gmon.h>
#undef MCOUNT
static MALLOC_DEFINE(M_GPROF, "gprof", "kernel profiling buffer");
static void kmstartup(void *);
SYSINIT(kmem, SI_SUB_KPROF, SI_ORDER_FIRST, kmstartup, NULL);
struct gmonparam _gmonparam = { GMON_PROF_OFF };
#ifdef GUPROF
void
nullfunc_loop_profiled()
{
int i;
for (i = 0; i < CALIB_SCALE; i++)
nullfunc_profiled();
}
#define nullfunc_loop_profiled_end nullfunc_profiled /* XXX */
void
nullfunc_profiled()
{
}
#endif /* GUPROF */
/*
* Update the histograms to support extending the text region arbitrarily.
* This is done slightly naively (no sparse regions), so it will waste small
* amounts of memory, but overall it works well enough to allow profiling
* of KLDs.
*/
void
kmupetext(uintfptr_t nhighpc)
{
struct gmonparam np; /* slightly large */
struct gmonparam *p = &_gmonparam;
char *cp;
GIANT_REQUIRED;
bcopy(p, &np, sizeof(*p));
np.highpc = ROUNDUP(nhighpc, HISTFRACTION * sizeof(HISTCOUNTER));
if (np.highpc <= p->highpc)
return;
np.textsize = np.highpc - p->lowpc;
np.kcountsize = np.textsize / HISTFRACTION;
np.hashfraction = HASHFRACTION;
np.fromssize = np.textsize / HASHFRACTION;
np.tolimit = np.textsize * ARCDENSITY / 100;
if (np.tolimit < MINARCS)
np.tolimit = MINARCS;
else if (np.tolimit > MAXARCS)
np.tolimit = MAXARCS;
np.tossize = np.tolimit * sizeof(struct tostruct);
cp = malloc(np.kcountsize + np.fromssize + np.tossize,
M_GPROF, M_WAITOK);
/*
* Check for something else extending highpc while we slept.
*/
if (np.highpc <= p->highpc) {
free(cp, M_GPROF);
return;
}
np.tos = (struct tostruct *)cp;
cp += np.tossize;
np.kcount = (HISTCOUNTER *)cp;
cp += np.kcountsize;
np.froms = (u_short *)cp;
#ifdef GUPROF
/* Reinitialize pointers to overhead counters. */
np.cputime_count = &KCOUNT(&np, PC_TO_I(&np, cputime));
np.mcount_count = &KCOUNT(&np, PC_TO_I(&np, mcount));
np.mexitcount_count = &KCOUNT(&np, PC_TO_I(&np, mexitcount));
#endif
critical_enter();
bcopy(p->tos, np.tos, p->tossize);
bzero((char *)np.tos + p->tossize, np.tossize - p->tossize);
bcopy(p->kcount, np.kcount, p->kcountsize);
bzero((char *)np.kcount + p->kcountsize, np.kcountsize -
p->kcountsize);
bcopy(p->froms, np.froms, p->fromssize);
bzero((char *)np.froms + p->fromssize, np.fromssize - p->fromssize);
cp = (char *)p->tos;
bcopy(&np, p, sizeof(*p));
critical_exit();
free(cp, M_GPROF);
}
static void
kmstartup(dummy)
void *dummy;
{
char *cp;
struct gmonparam *p = &_gmonparam;
#ifdef GUPROF
int cputime_overhead;
int empty_loop_time;
int i;
int mcount_overhead;
int mexitcount_overhead;
int nullfunc_loop_overhead;
int nullfunc_loop_profiled_time;
uintfptr_t tmp_addr;
#endif
/*
* Round lowpc and highpc to multiples of the density we're using
* so the rest of the scaling (here and in gprof) stays in ints.
*/
p->lowpc = ROUNDDOWN((u_long)btext, HISTFRACTION * sizeof(HISTCOUNTER));
p->highpc = ROUNDUP((u_long)etext, HISTFRACTION * sizeof(HISTCOUNTER));
p->textsize = p->highpc - p->lowpc;
printf("Profiling kernel, textsize=%lu [%jx..%jx]\n",
p->textsize, (uintmax_t)p->lowpc, (uintmax_t)p->highpc);
p->kcountsize = p->textsize / HISTFRACTION;
p->hashfraction = HASHFRACTION;
p->fromssize = p->textsize / HASHFRACTION;
p->tolimit = p->textsize * ARCDENSITY / 100;
if (p->tolimit < MINARCS)
p->tolimit = MINARCS;
else if (p->tolimit > MAXARCS)
p->tolimit = MAXARCS;
p->tossize = p->tolimit * sizeof(struct tostruct);
cp = (char *)malloc(p->kcountsize + p->fromssize + p->tossize,
M_GPROF, M_WAITOK | M_ZERO);
p->tos = (struct tostruct *)cp;
cp += p->tossize;
p->kcount = (HISTCOUNTER *)cp;
cp += p->kcountsize;
p->froms = (u_short *)cp;
p->histcounter_type = FUNCTION_ALIGNMENT / HISTFRACTION * NBBY;
#ifdef GUPROF
/* Signed counters. */
p->histcounter_type = -p->histcounter_type;
/* Initialize pointers to overhead counters. */
p->cputime_count = &KCOUNT(p, PC_TO_I(p, cputime));
p->mcount_count = &KCOUNT(p, PC_TO_I(p, mcount));
p->mexitcount_count = &KCOUNT(p, PC_TO_I(p, mexitcount));
/*
* Disable interrupts to avoid interference while we calibrate
* things.
*/
critical_enter();
/*
* Determine overheads.
* XXX this needs to be repeated for each useful timer/counter.
*/
cputime_overhead = 0;
startguprof(p);
for (i = 0; i < CALIB_SCALE; i++)
cputime_overhead += cputime();
empty_loop();
startguprof(p);
empty_loop();
empty_loop_time = cputime();
nullfunc_loop_profiled();
/*
* Start profiling. There won't be any normal function calls since
* interrupts are disabled, but we will call the profiling routines
* directly to determine their overheads.
*/
p->state = GMON_PROF_HIRES;
startguprof(p);
nullfunc_loop_profiled();
startguprof(p);
for (i = 0; i < CALIB_SCALE; i++)
- MCOUNT_OVERHEAD(profil);
- mcount_overhead = KCOUNT(p, PC_TO_I(p, profil));
+ MCOUNT_OVERHEAD(sys_profil);
+ mcount_overhead = KCOUNT(p, PC_TO_I(p, sys_profil));
startguprof(p);
for (i = 0; i < CALIB_SCALE; i++)
MEXITCOUNT_OVERHEAD();
MEXITCOUNT_OVERHEAD_GETLABEL(tmp_addr);
mexitcount_overhead = KCOUNT(p, PC_TO_I(p, tmp_addr));
p->state = GMON_PROF_OFF;
stopguprof(p);
critical_exit();
nullfunc_loop_profiled_time = 0;
for (tmp_addr = (uintfptr_t)nullfunc_loop_profiled;
tmp_addr < (uintfptr_t)nullfunc_loop_profiled_end;
tmp_addr += HISTFRACTION * sizeof(HISTCOUNTER))
nullfunc_loop_profiled_time += KCOUNT(p, PC_TO_I(p, tmp_addr));
#define CALIB_DOSCALE(count) (((count) + CALIB_SCALE / 3) / CALIB_SCALE)
#define c2n(count, freq) ((int)((count) * 1000000000LL / freq))
printf("cputime %d, empty_loop %d, nullfunc_loop_profiled %d, mcount %d, mexitcount %d\n",
CALIB_DOSCALE(c2n(cputime_overhead, p->profrate)),
CALIB_DOSCALE(c2n(empty_loop_time, p->profrate)),
CALIB_DOSCALE(c2n(nullfunc_loop_profiled_time, p->profrate)),
CALIB_DOSCALE(c2n(mcount_overhead, p->profrate)),
CALIB_DOSCALE(c2n(mexitcount_overhead, p->profrate)));
cputime_overhead -= empty_loop_time;
mcount_overhead -= empty_loop_time;
mexitcount_overhead -= empty_loop_time;
/*-
* Profiling overheads are determined by the times between the
* following events:
* MC1: mcount() is called
* MC2: cputime() (called from mcount()) latches the timer
* MC3: mcount() completes
* ME1: mexitcount() is called
* ME2: cputime() (called from mexitcount()) latches the timer
* ME3: mexitcount() completes.
* The times between the events vary slightly depending on instruction
* combination and cache misses, etc. Attempt to determine the
* minimum times. These can be subtracted from the profiling times
* without much risk of reducing the profiling times below what they
* would be when profiling is not configured. Abbreviate:
* ab = minimum time between MC1 and MC3
* a = minimum time between MC1 and MC2
* b = minimum time between MC2 and MC3
* cd = minimum time between ME1 and ME3
* c = minimum time between ME1 and ME2
* d = minimum time between ME2 and ME3.
* These satisfy the relations:
* ab <= mcount_overhead (just measured)
* a + b <= ab
* cd <= mexitcount_overhead (just measured)
* c + d <= cd
* a + d <= nullfunc_loop_profiled_time (just measured)
* a >= 0, b >= 0, c >= 0, d >= 0.
* Assume that ab and cd are equal to the minimums.
*/
p->cputime_overhead = CALIB_DOSCALE(cputime_overhead);
p->mcount_overhead = CALIB_DOSCALE(mcount_overhead - cputime_overhead);
p->mexitcount_overhead = CALIB_DOSCALE(mexitcount_overhead
- cputime_overhead);
nullfunc_loop_overhead = nullfunc_loop_profiled_time - empty_loop_time;
p->mexitcount_post_overhead = CALIB_DOSCALE((mcount_overhead
- nullfunc_loop_overhead)
/ 4);
p->mexitcount_pre_overhead = p->mexitcount_overhead
+ p->cputime_overhead
- p->mexitcount_post_overhead;
p->mcount_pre_overhead = CALIB_DOSCALE(nullfunc_loop_overhead)
- p->mexitcount_post_overhead;
p->mcount_post_overhead = p->mcount_overhead
+ p->cputime_overhead
- p->mcount_pre_overhead;
printf(
"Profiling overheads: mcount: %d+%d, %d+%d; mexitcount: %d+%d, %d+%d nsec\n",
c2n(p->cputime_overhead, p->profrate),
c2n(p->mcount_overhead, p->profrate),
c2n(p->mcount_pre_overhead, p->profrate),
c2n(p->mcount_post_overhead, p->profrate),
c2n(p->cputime_overhead, p->profrate),
c2n(p->mexitcount_overhead, p->profrate),
c2n(p->mexitcount_pre_overhead, p->profrate),
c2n(p->mexitcount_post_overhead, p->profrate));
printf(
"Profiling overheads: mcount: %d+%d, %d+%d; mexitcount: %d+%d, %d+%d cycles\n",
p->cputime_overhead, p->mcount_overhead,
p->mcount_pre_overhead, p->mcount_post_overhead,
p->cputime_overhead, p->mexitcount_overhead,
p->mexitcount_pre_overhead, p->mexitcount_post_overhead);
#endif /* GUPROF */
}
/*
* Return kernel profiling information.
*/
static int
sysctl_kern_prof(SYSCTL_HANDLER_ARGS)
{
int *name = (int *) arg1;
u_int namelen = arg2;
struct gmonparam *gp = &_gmonparam;
int error;
int state;
/* all sysctl names at this level are terminal */
if (namelen != 1)
return (ENOTDIR); /* overloaded */
switch (name[0]) {
case GPROF_STATE:
state = gp->state;
error = sysctl_handle_int(oidp, &state, 0, req);
if (error)
return (error);
if (!req->newptr)
return (0);
if (state == GMON_PROF_OFF) {
gp->state = state;
PROC_LOCK(&proc0);
stopprofclock(&proc0);
PROC_UNLOCK(&proc0);
stopguprof(gp);
} else if (state == GMON_PROF_ON) {
gp->state = GMON_PROF_OFF;
stopguprof(gp);
gp->profrate = profhz;
PROC_LOCK(&proc0);
startprofclock(&proc0);
PROC_UNLOCK(&proc0);
gp->state = state;
#ifdef GUPROF
} else if (state == GMON_PROF_HIRES) {
gp->state = GMON_PROF_OFF;
PROC_LOCK(&proc0);
stopprofclock(&proc0);
PROC_UNLOCK(&proc0);
startguprof(gp);
gp->state = state;
#endif
} else if (state != gp->state)
return (EINVAL);
return (0);
case GPROF_COUNT:
return (sysctl_handle_opaque(oidp,
gp->kcount, gp->kcountsize, req));
case GPROF_FROMS:
return (sysctl_handle_opaque(oidp,
gp->froms, gp->fromssize, req));
case GPROF_TOS:
return (sysctl_handle_opaque(oidp,
gp->tos, gp->tossize, req));
case GPROF_GMONPARAM:
return (sysctl_handle_opaque(oidp, gp, sizeof *gp, req));
default:
return (EOPNOTSUPP);
}
/* NOTREACHED */
}
SYSCTL_NODE(_kern, KERN_PROF, prof, CTLFLAG_RW, sysctl_kern_prof, "");
#endif /* GPROF */
/*
* Profiling system call.
*
* The scale factor is a fixed point number with 16 bits of fraction, so that
* 1.0 is represented as 0x10000. A scale factor of 0 turns off profiling.
*/
#ifndef _SYS_SYSPROTO_H_
struct profil_args {
caddr_t samples;
size_t size;
size_t offset;
u_int scale;
};
#endif
/* ARGSUSED */
int
-profil(struct thread *td, struct profil_args *uap)
+sys_profil(struct thread *td, struct profil_args *uap)
{
struct uprof *upp;
struct proc *p;
if (uap->scale > (1 << 16))
return (EINVAL);
p = td->td_proc;
if (uap->scale == 0) {
PROC_LOCK(p);
stopprofclock(p);
PROC_UNLOCK(p);
return (0);
}
PROC_LOCK(p);
upp = &td->td_proc->p_stats->p_prof;
PROC_SLOCK(p);
upp->pr_off = uap->offset;
upp->pr_scale = uap->scale;
upp->pr_base = uap->samples;
upp->pr_size = uap->size;
PROC_SUNLOCK(p);
startprofclock(p);
PROC_UNLOCK(p);
return (0);
}
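/*
 * Minimal userland sketch of driving this system call (illustrative only;
 * see profil(2) for the authoritative interface).  With a scale of 0x10000
 * (1.0) the sample buffer shadows the text region byte for byte, so each
 * 16-bit counter accumulates ticks for two bytes of text.  TEXTSIZE,
 * text_start and run_workload() are hypothetical:
 *
 *	unsigned short samples[TEXTSIZE / 2];
 *
 *	memset(samples, 0, sizeof(samples));
 *	profil((char *)samples, sizeof(samples), text_start, 0x10000);
 *	run_workload();
 *	profil(NULL, 0, 0, 0);		(scale 0 turns profiling back off)
 */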
/*
* Scale is a fixed-point number with the binary point 16 bits
* into the value, and is <= 1.0. pc is at most 32 bits, so the
* intermediate result is at most 48 bits.
*/
#define PC_TO_INDEX(pc, prof) \
((int)(((u_quad_t)((pc) - (prof)->pr_off) * \
(u_quad_t)((prof)->pr_scale)) >> 16) & ~1)
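/*
 * Worked example of PC_TO_INDEX() (illustrative values): with pr_off =
 * 0x1000, pr_scale = 0x8000 (0.5) and pc = 0x1234, the intermediate
 * product is 0x234 * 0x8000 = 0x11a0000; shifting right by 16 gives 0x11a,
 * which is already even, so the tick is added to the 16-bit counter at
 * byte offset 0x11a from pr_base.
 */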
/*
* Collect user-level profiling statistics; called on a profiling tick,
* when a process is running in user-mode. This routine may be called
* from an interrupt context. We try to update the user profiling buffers
* cheaply with fuswintr() and suswintr(). If that fails, we revert to
* an AST that will vector us to trap() with a context in which copyin
* and copyout will work. Trap will then call addupc_task().
*
* Note that we may (rarely) not get around to the AST soon enough, and
* lose profile ticks when the next tick overwrites this one, but in this
* case the system is overloaded and the profile is probably already
* inaccurate.
*/
void
addupc_intr(struct thread *td, uintfptr_t pc, u_int ticks)
{
struct uprof *prof;
caddr_t addr;
u_int i;
int v;
if (ticks == 0)
return;
prof = &td->td_proc->p_stats->p_prof;
PROC_SLOCK(td->td_proc);
if (pc < prof->pr_off ||
(i = PC_TO_INDEX(pc, prof)) >= prof->pr_size) {
PROC_SUNLOCK(td->td_proc);
return; /* out of range; ignore */
}
addr = prof->pr_base + i;
PROC_SUNLOCK(td->td_proc);
if ((v = fuswintr(addr)) == -1 || suswintr(addr, v + ticks) == -1) {
td->td_profil_addr = pc;
td->td_profil_ticks = ticks;
td->td_pflags |= TDP_OWEUPC;
thread_lock(td);
td->td_flags |= TDF_ASTPENDING;
thread_unlock(td);
}
}
/*
* Much like before, but we can afford to take faults here. If the
* update fails, we simply turn off profiling.
*/
void
addupc_task(struct thread *td, uintfptr_t pc, u_int ticks)
{
struct proc *p = td->td_proc;
struct uprof *prof;
caddr_t addr;
u_int i;
u_short v;
int stop = 0;
if (ticks == 0)
return;
PROC_LOCK(p);
if (!(p->p_flag & P_PROFIL)) {
PROC_UNLOCK(p);
return;
}
p->p_profthreads++;
prof = &p->p_stats->p_prof;
PROC_SLOCK(p);
if (pc < prof->pr_off ||
(i = PC_TO_INDEX(pc, prof)) >= prof->pr_size) {
PROC_SUNLOCK(p);
goto out;
}
addr = prof->pr_base + i;
PROC_SUNLOCK(p);
PROC_UNLOCK(p);
if (copyin(addr, &v, sizeof(v)) == 0) {
v += ticks;
if (copyout(&v, addr, sizeof(v)) == 0) {
PROC_LOCK(p);
goto out;
}
}
stop = 1;
PROC_LOCK(p);
out:
if (--p->p_profthreads == 0) {
if (p->p_flag & P_STOPPROF) {
wakeup(&p->p_profthreads);
stop = 0;
}
}
if (stop)
stopprofclock(p);
PROC_UNLOCK(p);
}
#if (defined(__amd64__) || defined(__i386__)) && \
defined(__GNUCLIKE_CTOR_SECTION_HANDLING)
/*
* Support for "--test-coverage --profile-arcs" in GCC.
*
* We need to call all the functions in the .ctor section, in order
* to get all the counter-arrays strung into a list.
*
* XXX: the .ctors call __bb_init_func which is located over in
* XXX: i386/i386/support.s for historical reasons. There is probably
* XXX: no reason for that to be assembler anymore, but doing it right
* XXX: in MI C code requires one to reverse-engineer the type-selection
* XXX: inside GCC. Have fun.
*
* XXX: Worrisome perspective: Calling the .ctors may make C++ in the
* XXX: kernel feasible. Don't.
*/
typedef void (*ctor_t)(void);
extern ctor_t _start_ctors, _stop_ctors;
static void
tcov_init(void *foo __unused)
{
ctor_t *p, q;
for (p = &_start_ctors; p < &_stop_ctors; p++) {
q = *p;
q();
}
}
SYSINIT(tcov_init, SI_SUB_KPROF, SI_ORDER_SECOND, tcov_init, NULL);
/*
* GCC contains magic to recognize calls to, for instance, execve() and
* inserts calls to this function to preserve the profile counters.
* XXX: Put zinging punchline here.
*/
void __bb_fork_func(void);
void
__bb_fork_func(void)
{
}
#endif
Index: head/sys/kern/subr_trap.c
===================================================================
--- head/sys/kern/subr_trap.c (revision 225616)
+++ head/sys/kern/subr_trap.c (revision 225617)
@@ -1,265 +1,265 @@
/*-
* Copyright (C) 1994, David Greenman
* Copyright (c) 1990, 1993
* The Regents of the University of California. All rights reserved.
* Copyright (c) 2007 The FreeBSD Foundation
*
* This code is derived from software contributed to Berkeley by
* the University of Utah, and William Jolitz.
*
* Portions of this software were developed by A. Joseph Koshy under
* sponsorship from the FreeBSD Foundation and Google, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)trap.c 7.4 (Berkeley) 5/13/91
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_capsicum.h"
#include "opt_ktrace.h"
#include "opt_kdtrace.h"
#include "opt_sched.h"
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/capability.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/pmckern.h>
#include <sys/proc.h>
#include <sys/ktr.h>
#include <sys/pioctl.h>
#include <sys/ptrace.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/syscall.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/vmmeter.h>
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
#endif
#include <security/audit/audit.h>
#include <machine/cpu.h>
#ifdef VIMAGE
#include <net/vnet.h>
#endif
#ifdef XEN
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#endif
#include <security/mac/mac_framework.h>
/*
* Define the code needed before returning to user mode, for trap and
* syscall.
*/
void
userret(struct thread *td, struct trapframe *frame)
{
struct proc *p = td->td_proc;
CTR3(KTR_SYSC, "userret: thread %p (pid %d, %s)", td, p->p_pid,
td->td_name);
#if 0
#ifdef DIAGNOSTIC
/* Check that we called signotify() enough. */
PROC_LOCK(p);
thread_lock(td);
if (SIGPENDING(td) && ((td->td_flags & TDF_NEEDSIGCHK) == 0 ||
(td->td_flags & TDF_ASTPENDING) == 0))
printf("failed to set signal flags properly for ast()\n");
thread_unlock(td);
PROC_UNLOCK(p);
#endif
#endif
#ifdef KTRACE
KTRUSERRET(td);
#endif
/*
* If this thread tickled GEOM, we need to wait for the giggling to
* stop before we return to userland
*/
if (td->td_pflags & TDP_GEOM)
g_waitidle();
/*
* Charge system time if profiling.
*/
if (p->p_flag & P_PROFIL)
addupc_task(td, TRAPF_PC(frame), td->td_pticks * psratio);
/*
* Let the scheduler adjust our priority etc.
*/
sched_userret(td);
KASSERT(td->td_locks == 0,
("userret: Returning with %d locks held.", td->td_locks));
#ifdef VIMAGE
/* Unfortunately td_vnet_lpush needs VNET_DEBUG. */
VNET_ASSERT(curvnet == NULL,
("%s: Returning on td %p (pid %d, %s) with vnet %p set in %s",
__func__, td, p->p_pid, td->td_name, curvnet,
(td->td_vnet_lpush != NULL) ? td->td_vnet_lpush : "N/A"));
#endif
#ifdef XEN
PT_UPDATES_FLUSH();
#endif
}
/*
* Process an asynchronous software trap.
* This is relatively easy.
* This function will return with preemption disabled.
*/
void
ast(struct trapframe *framep)
{
struct thread *td;
struct proc *p;
int flags;
int sig;
td = curthread;
p = td->td_proc;
CTR3(KTR_SYSC, "ast: thread %p (pid %d, %s)", td, p->p_pid,
p->p_comm);
KASSERT(TRAPF_USERMODE(framep), ("ast in kernel mode"));
WITNESS_WARN(WARN_PANIC, NULL, "Returning to user mode");
mtx_assert(&Giant, MA_NOTOWNED);
THREAD_LOCK_ASSERT(td, MA_NOTOWNED);
td->td_frame = framep;
td->td_pticks = 0;
/*
* This updates td_flags for the checks below in one
* "atomic" operation together with turning off the astpending flag.
* If another AST is triggered while we are handling the
* AST's saved in flags, the astpending flag will be set and
* ast() will be called again.
*/
thread_lock(td);
flags = td->td_flags;
td->td_flags &= ~(TDF_ASTPENDING | TDF_NEEDSIGCHK | TDF_NEEDSUSPCHK |
TDF_NEEDRESCHED | TDF_ALRMPEND | TDF_PROFPEND | TDF_MACPEND);
thread_unlock(td);
PCPU_INC(cnt.v_trap);
if (td->td_ucred != p->p_ucred)
cred_update_thread(td);
if (td->td_pflags & TDP_OWEUPC && p->p_flag & P_PROFIL) {
addupc_task(td, td->td_profil_addr, td->td_profil_ticks);
td->td_profil_ticks = 0;
td->td_pflags &= ~TDP_OWEUPC;
}
if (flags & TDF_ALRMPEND) {
PROC_LOCK(p);
- psignal(p, SIGVTALRM);
+ kern_psignal(p, SIGVTALRM);
PROC_UNLOCK(p);
}
if (flags & TDF_PROFPEND) {
PROC_LOCK(p);
- psignal(p, SIGPROF);
+ kern_psignal(p, SIGPROF);
PROC_UNLOCK(p);
}
#ifdef MAC
if (flags & TDF_MACPEND)
mac_thread_userret(td);
#endif
if (flags & TDF_NEEDRESCHED) {
#ifdef KTRACE
if (KTRPOINT(td, KTR_CSW))
ktrcsw(1, 1);
#endif
thread_lock(td);
sched_prio(td, td->td_user_pri);
mi_switch(SW_INVOL | SWT_NEEDRESCHED, NULL);
thread_unlock(td);
#ifdef KTRACE
if (KTRPOINT(td, KTR_CSW))
ktrcsw(0, 1);
#endif
}
/*
* Check for signals. Unlocked reads of p_pendingcnt or
* p_siglist might cause a process-directed signal to be handled
* later.
*/
if (flags & TDF_NEEDSIGCHK || p->p_pendingcnt > 0 ||
!SIGISEMPTY(p->p_siglist)) {
PROC_LOCK(p);
mtx_lock(&p->p_sigacts->ps_mtx);
while ((sig = cursig(td, SIG_STOP_ALLOWED)) != 0)
postsig(sig);
mtx_unlock(&p->p_sigacts->ps_mtx);
PROC_UNLOCK(p);
}
/*
* We need to check to see if we have to exit or wait due to a
* single threading requirement or some other STOP condition.
*/
if (flags & TDF_NEEDSUSPCHK) {
PROC_LOCK(p);
thread_suspend_check(0);
PROC_UNLOCK(p);
}
if (td->td_pflags & TDP_OLDMASK) {
td->td_pflags &= ~TDP_OLDMASK;
kern_sigprocmask(td, SIG_SETMASK, &td->td_oldsigmask, NULL, 0);
}
userret(td, framep);
mtx_assert(&Giant, MA_NOTOWNED);
}
const char *
syscallname(struct proc *p, u_int code)
{
static const char unknown[] = "unknown";
struct sysentvec *sv;
sv = p->p_sysent;
if (sv->sv_syscallnames == NULL || code >= sv->sv_size)
return (unknown);
return (sv->sv_syscallnames[code]);
}
Index: head/sys/kern/sys_capability.c
===================================================================
--- head/sys/kern/sys_capability.c (revision 225616)
+++ head/sys/kern/sys_capability.c (revision 225617)
@@ -1,553 +1,553 @@
/*-
* Copyright (c) 2008-2011 Robert N. M. Watson
* Copyright (c) 2010-2011 Jonathan Anderson
* All rights reserved.
*
* This software was developed at the University of Cambridge Computer
* Laboratory with support from a grant from Google, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* FreeBSD kernel capability facility.
*
* Two kernel features are implemented here: capability mode, a sandboxed mode
* of execution for processes, and capabilities, a refinement on file
* descriptors that allows fine-grained control over operations on the file
* descriptor. Collectively, these allow processes to run in the style of a
* historic "capability system" in which they can use only resources
* explicitly delegated to them. This model is enforced by restricting access
* to global namespaces in capability mode.
*
* Capabilities wrap other file descriptor types, binding them to a constant
* rights mask set when the capability is created. New capabilities may be
* derived from existing capabilities, but only if they have the same or a
* strict subset of the rights on the original capability.
*
* System calls permitted in capability mode are defined in capabilities.conf;
* calls must be carefully audited for safety to ensure that they don't allow
* escape from a sandbox. Some calls permit only a subset of operations in
* capability mode -- for example, shm_open(2) is limited to creating
* anonymous, rather than named, POSIX shared memory objects.
*/
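/*
 * Illustrative userland sketch (not part of this change; prototypes follow
 * cap_enter(2) and cap_new(2) as they exist at this point, error handling
 * omitted, path hypothetical): delegate a read-only capability for one
 * file, then enter the sandbox, after which global-namespace lookups fail
 * with ECAPMODE.
 *
 *	int fd, rofd;
 *
 *	fd = open("/var/db/data", O_RDONLY);
 *	rofd = cap_new(fd, CAP_READ | CAP_SEEK);
 *	close(fd);
 *	cap_enter();
 *	read(rofd, buf, sizeof(buf));		(allowed: CAP_READ)
 *	open("/etc/passwd", O_RDONLY);		(fails: ECAPMODE)
 */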
#include "opt_capsicum.h"
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/capability.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sysproto.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/ucred.h>
#include <security/audit/audit.h>
#include <vm/uma.h>
#include <vm/vm.h>
#ifdef CAPABILITY_MODE
FEATURE(security_capability_mode, "Capsicum Capability Mode");
/*
* System call to enter capability mode for the process.
*/
int
-cap_enter(struct thread *td, struct cap_enter_args *uap)
+sys_cap_enter(struct thread *td, struct cap_enter_args *uap)
{
struct ucred *newcred, *oldcred;
struct proc *p;
if (IN_CAPABILITY_MODE(td))
return (0);
newcred = crget();
p = td->td_proc;
PROC_LOCK(p);
oldcred = p->p_ucred;
crcopy(newcred, oldcred);
newcred->cr_flags |= CRED_FLAG_CAPMODE;
p->p_ucred = newcred;
PROC_UNLOCK(p);
crfree(oldcred);
return (0);
}
/*
* System call to query whether the process is in capability mode.
*/
int
-cap_getmode(struct thread *td, struct cap_getmode_args *uap)
+sys_cap_getmode(struct thread *td, struct cap_getmode_args *uap)
{
u_int i;
i = (IN_CAPABILITY_MODE(td)) ? 1 : 0;
return (copyout(&i, uap->modep, sizeof(i)));
}
#else /* !CAPABILITY_MODE */
int
-cap_enter(struct thread *td, struct cap_enter_args *uap)
+sys_cap_enter(struct thread *td, struct cap_enter_args *uap)
{
return (ENOSYS);
}
int
-cap_getmode(struct thread *td, struct cap_getmode_args *uap)
+sys_cap_getmode(struct thread *td, struct cap_getmode_args *uap)
{
return (ENOSYS);
}
#endif /* CAPABILITY_MODE */
#ifdef CAPABILITIES
FEATURE(security_capabilities, "Capsicum Capabilities");
/*
* struct capability describes a capability, and is hung off of its struct
* file f_data field. cap_file and cap_rights are static once hooked up, as
* neither the object it references nor the rights it encapsulates are
* permitted to change.
*/
struct capability {
struct file *cap_object; /* Underlying object's file. */
struct file *cap_file; /* Back-pointer to cap's file. */
cap_rights_t cap_rights; /* Mask of rights on object. */
};
/*
* Capabilities have a fileops vector, but in practice none should ever be
* called except for fo_close, as the capability will normally not be
* returned during a file descriptor lookup in the system call code.
*/
static fo_rdwr_t capability_read;
static fo_rdwr_t capability_write;
static fo_truncate_t capability_truncate;
static fo_ioctl_t capability_ioctl;
static fo_poll_t capability_poll;
static fo_kqfilter_t capability_kqfilter;
static fo_stat_t capability_stat;
static fo_close_t capability_close;
static fo_chmod_t capability_chmod;
static fo_chown_t capability_chown;
static struct fileops capability_ops = {
.fo_read = capability_read,
.fo_write = capability_write,
.fo_truncate = capability_truncate,
.fo_ioctl = capability_ioctl,
.fo_poll = capability_poll,
.fo_kqfilter = capability_kqfilter,
.fo_stat = capability_stat,
.fo_close = capability_close,
.fo_chmod = capability_chmod,
.fo_chown = capability_chown,
.fo_flags = DFLAG_PASSABLE,
};
static struct fileops capability_ops_unpassable = {
.fo_read = capability_read,
.fo_write = capability_write,
.fo_truncate = capability_truncate,
.fo_ioctl = capability_ioctl,
.fo_poll = capability_poll,
.fo_kqfilter = capability_kqfilter,
.fo_stat = capability_stat,
.fo_close = capability_close,
.fo_chmod = capability_chmod,
.fo_chown = capability_chown,
.fo_flags = 0,
};
static uma_zone_t capability_zone;
static void
capability_init(void *dummy __unused)
{
capability_zone = uma_zcreate("capability", sizeof(struct capability),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
if (capability_zone == NULL)
panic("capability_init: capability_zone not initialized");
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, capability_init, NULL);
/*
* Test whether a capability grants the requested rights.
*/
static int
cap_check(struct capability *c, cap_rights_t rights)
{
if ((c->cap_rights | rights) != c->cap_rights)
return (ENOTCAPABLE);
return (0);
}
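/*
 * Example (illustrative): a capability holding CAP_READ | CAP_SEEK passes
 * cap_check() for rights = CAP_READ, since ORing the request into the held
 * mask changes nothing, but fails for rights = CAP_WRITE, because
 * (CAP_READ | CAP_SEEK | CAP_WRITE) differs from the held mask and
 * ENOTCAPABLE is returned.
 */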
/*
* Extract rights from a capability for monitoring purposes -- not for use in
* any other way, as we want to keep all capability permission evaluation in
* this one file.
*/
cap_rights_t
cap_rights(struct file *fp_cap)
{
struct capability *c;
KASSERT(fp_cap->f_type == DTYPE_CAPABILITY,
("cap_rights: !capability"));
c = fp_cap->f_data;
return (c->cap_rights);
}
/*
* System call to create a new capability reference to either an existing
* file object or an existing capability.
*/
int
-cap_new(struct thread *td, struct cap_new_args *uap)
+sys_cap_new(struct thread *td, struct cap_new_args *uap)
{
int error, capfd;
int fd = uap->fd;
struct file *fp;
cap_rights_t rights = uap->rights;
AUDIT_ARG_FD(fd);
AUDIT_ARG_RIGHTS(rights);
error = fget(td, fd, rights, &fp);
if (error)
return (error);
AUDIT_ARG_FILE(td->td_proc, fp);
error = kern_capwrap(td, fp, rights, &capfd);
if (error)
return (error);
/*
* Release our reference to the file (kern_capwrap has held a reference
* for the filedesc array).
*/
fdrop(fp, td);
td->td_retval[0] = capfd;
return (0);
}
/*
* System call to query the rights mask associated with a capability.
*/
int
-cap_getrights(struct thread *td, struct cap_getrights_args *uap)
+sys_cap_getrights(struct thread *td, struct cap_getrights_args *uap)
{
struct capability *cp;
struct file *fp;
int error;
AUDIT_ARG_FD(uap->fd);
error = fgetcap(td, uap->fd, &fp);
if (error)
return (error);
cp = fp->f_data;
error = copyout(&cp->cap_rights, uap->rightsp, sizeof(*uap->rightsp));
fdrop(fp, td);
return (error);
}
/*
* Create a capability to wrap around an existing file.
*/
int
kern_capwrap(struct thread *td, struct file *fp, cap_rights_t rights,
int *capfdp)
{
struct capability *cp, *cp_old;
struct file *fp_object, *fcapp;
int error;
if ((rights | CAP_MASK_VALID) != CAP_MASK_VALID)
return (EINVAL);
/*
* If a new capability is being derived from an existing capability,
* then the new capability rights must be a subset of the existing
* rights.
*/
if (fp->f_type == DTYPE_CAPABILITY) {
cp_old = fp->f_data;
if ((cp_old->cap_rights | rights) != cp_old->cap_rights)
return (ENOTCAPABLE);
}
/*
* Allocate a new file descriptor to hang the capability off of.
*/
error = falloc(td, &fcapp, capfdp, fp->f_flag);
if (error)
return (error);
/*
* Rather than nesting capabilities, directly reference the object an
* existing capability references. There's nothing else interesting
* to preserve for future use, as we've incorporated the previous
* rights mask into the new one. This prevents us from having to
* deal with capability chains.
*/
if (fp->f_type == DTYPE_CAPABILITY)
fp_object = ((struct capability *)fp->f_data)->cap_object;
else
fp_object = fp;
fhold(fp_object);
cp = uma_zalloc(capability_zone, M_WAITOK | M_ZERO);
cp->cap_rights = rights;
cp->cap_object = fp_object;
cp->cap_file = fcapp;
if (fp->f_flag & DFLAG_PASSABLE)
finit(fcapp, fp->f_flag, DTYPE_CAPABILITY, cp,
&capability_ops);
else
finit(fcapp, fp->f_flag, DTYPE_CAPABILITY, cp,
&capability_ops_unpassable);
/*
* Release our private reference (the proc filedesc still has one).
*/
fdrop(fcapp, td);
return (0);
}
/*
* Given a file descriptor, test it against a capability rights mask and then
* return the file descriptor on which to actually perform the requested
* operation. As long as the reference to fp_cap remains valid, the returned
* pointer in *fpp will remain valid, so no extra reference management is
* required, and the caller should fdrop() fp_cap as normal when done with
* both.
*/
int
cap_funwrap(struct file *fp_cap, cap_rights_t rights, struct file **fpp)
{
struct capability *c;
int error;
if (fp_cap->f_type != DTYPE_CAPABILITY) {
*fpp = fp_cap;
return (0);
}
c = fp_cap->f_data;
error = cap_check(c, rights);
if (error)
return (error);
*fpp = c->cap_object;
return (0);
}
/*
* Slightly different routine for memory mapping file descriptors: unwrap the
* capability and check CAP_MMAP, but also return a bitmask representing the
* maximum mapping rights the capability allows on the object.
*/
int
cap_funwrap_mmap(struct file *fp_cap, cap_rights_t rights, u_char *maxprotp,
struct file **fpp)
{
struct capability *c;
u_char maxprot;
int error;
if (fp_cap->f_type != DTYPE_CAPABILITY) {
*fpp = fp_cap;
*maxprotp = VM_PROT_ALL;
return (0);
}
c = fp_cap->f_data;
error = cap_check(c, rights | CAP_MMAP);
if (error)
return (error);
*fpp = c->cap_object;
maxprot = 0;
if (c->cap_rights & CAP_READ)
maxprot |= VM_PROT_READ;
if (c->cap_rights & CAP_WRITE)
maxprot |= VM_PROT_WRITE;
if (c->cap_rights & CAP_MAPEXEC)
maxprot |= VM_PROT_EXECUTE;
*maxprotp = maxprot;
return (0);
}
/*
* When a capability is closed, simply drop the reference on the underlying
* object and free the capability. fdrop() will handle the case where the
* underlying object also needs to close, and the caller will have already
* performed any object-specific lock or mqueue handling.
*/
static int
capability_close(struct file *fp, struct thread *td)
{
struct capability *c;
struct file *fp_object;
KASSERT(fp->f_type == DTYPE_CAPABILITY,
("capability_close: !capability"));
c = fp->f_data;
fp->f_ops = &badfileops;
fp->f_data = NULL;
fp_object = c->cap_object;
uma_zfree(capability_zone, c);
return (fdrop(fp_object, td));
}
/*
* In general, file descriptor operations should never make it to the
* capability, only the underlying file descriptor operation vector, so panic
* if we do turn up here.
*/
static int
capability_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
int flags, struct thread *td)
{
panic("capability_read");
}
static int
capability_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
int flags, struct thread *td)
{
panic("capability_write");
}
static int
capability_truncate(struct file *fp, off_t length, struct ucred *active_cred,
struct thread *td)
{
panic("capability_truncate");
}
static int
capability_ioctl(struct file *fp, u_long com, void *data,
struct ucred *active_cred, struct thread *td)
{
panic("capability_ioctl");
}
static int
capability_poll(struct file *fp, int events, struct ucred *active_cred,
struct thread *td)
{
panic("capability_poll");
}
static int
capability_kqfilter(struct file *fp, struct knote *kn)
{
panic("capability_kqfilter");
}
static int
capability_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
struct thread *td)
{
panic("capability_stat");
}
int
capability_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
struct thread *td)
{
panic("capability_chmod");
}
int
capability_chown(struct file *fp, uid_t uid, gid_t gid,
struct ucred *active_cred, struct thread *td)
{
panic("capability_chown");
}
#else /* !CAPABILITIES */
/*
* Stub capability functions for when "options CAPABILITIES" is not compiled
* into the kernel.
*/
int
-cap_new(struct thread *td, struct cap_new_args *uap)
+sys_cap_new(struct thread *td, struct cap_new_args *uap)
{
return (ENOSYS);
}
int
-cap_getrights(struct thread *td, struct cap_getrights_args *uap)
+sys_cap_getrights(struct thread *td, struct cap_getrights_args *uap)
{
return (ENOSYS);
}
int
cap_funwrap(struct file *fp_cap, cap_rights_t rights, struct file **fpp)
{
KASSERT(fp_cap->f_type != DTYPE_CAPABILITY,
("cap_funwrap: saw capability"));
*fpp = fp_cap;
return (0);
}
int
cap_funwrap_mmap(struct file *fp_cap, cap_rights_t rights, u_char *maxprotp,
struct file **fpp)
{
KASSERT(fp_cap->f_type != DTYPE_CAPABILITY,
("cap_funwrap_mmap: saw capability"));
*fpp = fp_cap;
*maxprotp = VM_PROT_ALL;
return (0);
}
#endif /* CAPABILITIES */
Index: head/sys/kern/sys_generic.c
===================================================================
--- head/sys/kern/sys_generic.c (revision 225616)
+++ head/sys/kern/sys_generic.c (revision 225617)
@@ -1,1700 +1,1700 @@
/*-
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)sys_generic.c 8.5 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_capsicum.h"
#include "opt_compat.h"
#include "opt_ktrace.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/capability.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/resourcevar.h>
#include <sys/selinfo.h>
#include <sys/sleepqueue.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/vnode.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/condvar.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <security/audit/audit.h>
static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");
static int pollout(struct thread *, struct pollfd *, struct pollfd *,
u_int);
static int pollscan(struct thread *, struct pollfd *, u_int);
static int pollrescan(struct thread *);
static int selscan(struct thread *, fd_mask **, fd_mask **, int);
static int selrescan(struct thread *, fd_mask **, fd_mask **);
static void selfdalloc(struct thread *, void *);
static void selfdfree(struct seltd *, struct selfd *);
static int dofileread(struct thread *, int, struct file *, struct uio *,
off_t, int);
static int dofilewrite(struct thread *, int, struct file *, struct uio *,
off_t, int);
static void doselwakeup(struct selinfo *, int);
static void seltdinit(struct thread *);
static int seltdwait(struct thread *, int);
static void seltdclear(struct thread *);
/*
* One seltd per-thread allocated on demand as needed.
*
* t - protected by st_mtx
* k - Only accessed by curthread or read-only
*/
struct seltd {
STAILQ_HEAD(, selfd) st_selq; /* (k) List of selfds. */
struct selfd *st_free1; /* (k) free fd for read set. */
struct selfd *st_free2; /* (k) free fd for write set. */
struct mtx st_mtx; /* Protects struct seltd */
struct cv st_wait; /* (t) Wait channel. */
int st_flags; /* (t) SELTD_ flags. */
};
#define SELTD_PENDING 0x0001 /* We have pending events. */
#define SELTD_RESCAN 0x0002 /* Doing a rescan. */
/*
* One selfd allocated per-thread per-file-descriptor.
* f - protected by sf_mtx
*/
struct selfd {
STAILQ_ENTRY(selfd) sf_link; /* (k) fds owned by this td. */
TAILQ_ENTRY(selfd) sf_threads; /* (f) fds on this selinfo. */
struct selinfo *sf_si; /* (f) selinfo when linked. */
struct mtx *sf_mtx; /* Pointer to selinfo mtx. */
struct seltd *sf_td; /* (k) owning seltd. */
void *sf_cookie; /* (k) fd or pollfd. */
};
static uma_zone_t selfd_zone;
static struct mtx_pool *mtxpool_select;
#ifndef _SYS_SYSPROTO_H_
struct read_args {
int fd;
void *buf;
size_t nbyte;
};
#endif
int
-read(td, uap)
+sys_read(td, uap)
struct thread *td;
struct read_args *uap;
{
struct uio auio;
struct iovec aiov;
int error;
if (uap->nbyte > INT_MAX)
return (EINVAL);
aiov.iov_base = uap->buf;
aiov.iov_len = uap->nbyte;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_resid = uap->nbyte;
auio.uio_segflg = UIO_USERSPACE;
error = kern_readv(td, uap->fd, &auio);
return(error);
}
/*
* Positioned read system call
*/
#ifndef _SYS_SYSPROTO_H_
struct pread_args {
int fd;
void *buf;
size_t nbyte;
int pad;
off_t offset;
};
#endif
int
-pread(td, uap)
+sys_pread(td, uap)
struct thread *td;
struct pread_args *uap;
{
struct uio auio;
struct iovec aiov;
int error;
if (uap->nbyte > INT_MAX)
return (EINVAL);
aiov.iov_base = uap->buf;
aiov.iov_len = uap->nbyte;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_resid = uap->nbyte;
auio.uio_segflg = UIO_USERSPACE;
error = kern_preadv(td, uap->fd, &auio, uap->offset);
return(error);
}
int
freebsd6_pread(td, uap)
struct thread *td;
struct freebsd6_pread_args *uap;
{
struct pread_args oargs;
oargs.fd = uap->fd;
oargs.buf = uap->buf;
oargs.nbyte = uap->nbyte;
oargs.offset = uap->offset;
- return (pread(td, &oargs));
+ return (sys_pread(td, &oargs));
}
/*
* Scatter read system call.
*/
#ifndef _SYS_SYSPROTO_H_
struct readv_args {
int fd;
struct iovec *iovp;
u_int iovcnt;
};
#endif
int
-readv(struct thread *td, struct readv_args *uap)
+sys_readv(struct thread *td, struct readv_args *uap)
{
struct uio *auio;
int error;
error = copyinuio(uap->iovp, uap->iovcnt, &auio);
if (error)
return (error);
error = kern_readv(td, uap->fd, auio);
free(auio, M_IOV);
return (error);
}
int
kern_readv(struct thread *td, int fd, struct uio *auio)
{
struct file *fp;
int error;
error = fget_read(td, fd, CAP_READ | CAP_SEEK, &fp);
if (error)
return (error);
error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
fdrop(fp, td);
return (error);
}
/*
* Scatter positioned read system call.
*/
#ifndef _SYS_SYSPROTO_H_
struct preadv_args {
int fd;
struct iovec *iovp;
u_int iovcnt;
off_t offset;
};
#endif
int
-preadv(struct thread *td, struct preadv_args *uap)
+sys_preadv(struct thread *td, struct preadv_args *uap)
{
struct uio *auio;
int error;
error = copyinuio(uap->iovp, uap->iovcnt, &auio);
if (error)
return (error);
error = kern_preadv(td, uap->fd, auio, uap->offset);
free(auio, M_IOV);
return (error);
}
int
kern_preadv(td, fd, auio, offset)
struct thread *td;
int fd;
struct uio *auio;
off_t offset;
{
struct file *fp;
int error;
error = fget_read(td, fd, CAP_READ, &fp);
if (error)
return (error);
if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
error = ESPIPE;
else if (offset < 0 && fp->f_vnode->v_type != VCHR)
error = EINVAL;
else
error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
fdrop(fp, td);
return (error);
}
/*
* Common code for readv and preadv that reads data in
* from a file using the passed in uio, offset, and flags.
*/
static int
dofileread(td, fd, fp, auio, offset, flags)
struct thread *td;
int fd;
struct file *fp;
struct uio *auio;
off_t offset;
int flags;
{
ssize_t cnt;
int error;
#ifdef KTRACE
struct uio *ktruio = NULL;
#endif
/* Finish zero length reads right here */
if (auio->uio_resid == 0) {
td->td_retval[0] = 0;
return(0);
}
auio->uio_rw = UIO_READ;
auio->uio_offset = offset;
auio->uio_td = td;
#ifdef KTRACE
if (KTRPOINT(td, KTR_GENIO))
ktruio = cloneuio(auio);
#endif
cnt = auio->uio_resid;
if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) {
if (auio->uio_resid != cnt && (error == ERESTART ||
error == EINTR || error == EWOULDBLOCK))
error = 0;
}
cnt -= auio->uio_resid;
#ifdef KTRACE
if (ktruio != NULL) {
ktruio->uio_resid = cnt;
ktrgenio(fd, UIO_READ, ktruio, error);
}
#endif
td->td_retval[0] = cnt;
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct write_args {
int fd;
const void *buf;
size_t nbyte;
};
#endif
int
-write(td, uap)
+sys_write(td, uap)
struct thread *td;
struct write_args *uap;
{
struct uio auio;
struct iovec aiov;
int error;
if (uap->nbyte > INT_MAX)
return (EINVAL);
aiov.iov_base = (void *)(uintptr_t)uap->buf;
aiov.iov_len = uap->nbyte;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_resid = uap->nbyte;
auio.uio_segflg = UIO_USERSPACE;
error = kern_writev(td, uap->fd, &auio);
return(error);
}
/*
* Positioned write system call.
*/
#ifndef _SYS_SYSPROTO_H_
struct pwrite_args {
int fd;
const void *buf;
size_t nbyte;
int pad;
off_t offset;
};
#endif
int
-pwrite(td, uap)
+sys_pwrite(td, uap)
struct thread *td;
struct pwrite_args *uap;
{
struct uio auio;
struct iovec aiov;
int error;
if (uap->nbyte > INT_MAX)
return (EINVAL);
aiov.iov_base = (void *)(uintptr_t)uap->buf;
aiov.iov_len = uap->nbyte;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_resid = uap->nbyte;
auio.uio_segflg = UIO_USERSPACE;
error = kern_pwritev(td, uap->fd, &auio, uap->offset);
return(error);
}
int
freebsd6_pwrite(td, uap)
struct thread *td;
struct freebsd6_pwrite_args *uap;
{
struct pwrite_args oargs;
oargs.fd = uap->fd;
oargs.buf = uap->buf;
oargs.nbyte = uap->nbyte;
oargs.offset = uap->offset;
- return (pwrite(td, &oargs));
+ return (sys_pwrite(td, &oargs));
}
/*
* Gather write system call.
*/
#ifndef _SYS_SYSPROTO_H_
struct writev_args {
int fd;
struct iovec *iovp;
u_int iovcnt;
};
#endif
int
-writev(struct thread *td, struct writev_args *uap)
+sys_writev(struct thread *td, struct writev_args *uap)
{
struct uio *auio;
int error;
error = copyinuio(uap->iovp, uap->iovcnt, &auio);
if (error)
return (error);
error = kern_writev(td, uap->fd, auio);
free(auio, M_IOV);
return (error);
}
int
kern_writev(struct thread *td, int fd, struct uio *auio)
{
struct file *fp;
int error;
error = fget_write(td, fd, CAP_WRITE | CAP_SEEK, &fp);
if (error)
return (error);
error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
fdrop(fp, td);
return (error);
}
/*
* Gather positioned write system call.
*/
#ifndef _SYS_SYSPROTO_H_
struct pwritev_args {
int fd;
struct iovec *iovp;
u_int iovcnt;
off_t offset;
};
#endif
int
-pwritev(struct thread *td, struct pwritev_args *uap)
+sys_pwritev(struct thread *td, struct pwritev_args *uap)
{
struct uio *auio;
int error;
error = copyinuio(uap->iovp, uap->iovcnt, &auio);
if (error)
return (error);
error = kern_pwritev(td, uap->fd, auio, uap->offset);
free(auio, M_IOV);
return (error);
}
int
kern_pwritev(td, fd, auio, offset)
struct thread *td;
struct uio *auio;
int fd;
off_t offset;
{
struct file *fp;
int error;
error = fget_write(td, fd, CAP_WRITE, &fp);
if (error)
return (error);
if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
error = ESPIPE;
else if (offset < 0 && fp->f_vnode->v_type != VCHR)
error = EINVAL;
else
error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
fdrop(fp, td);
return (error);
}
/*
* Common code for writev and pwritev that writes data to
* a file using the passed in uio, offset, and flags.
*/
static int
dofilewrite(td, fd, fp, auio, offset, flags)
struct thread *td;
int fd;
struct file *fp;
struct uio *auio;
off_t offset;
int flags;
{
ssize_t cnt;
int error;
#ifdef KTRACE
struct uio *ktruio = NULL;
#endif
auio->uio_rw = UIO_WRITE;
auio->uio_td = td;
auio->uio_offset = offset;
#ifdef KTRACE
if (KTRPOINT(td, KTR_GENIO))
ktruio = cloneuio(auio);
#endif
cnt = auio->uio_resid;
if (fp->f_type == DTYPE_VNODE)
bwillwrite();
if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) {
if (auio->uio_resid != cnt && (error == ERESTART ||
error == EINTR || error == EWOULDBLOCK))
error = 0;
/* Socket layer is responsible for issuing SIGPIPE. */
if (fp->f_type != DTYPE_SOCKET && error == EPIPE) {
PROC_LOCK(td->td_proc);
tdsignal(td, SIGPIPE);
PROC_UNLOCK(td->td_proc);
}
}
cnt -= auio->uio_resid;
#ifdef KTRACE
if (ktruio != NULL) {
ktruio->uio_resid = cnt;
ktrgenio(fd, UIO_WRITE, ktruio, error);
}
#endif
td->td_retval[0] = cnt;
return (error);
}
/*
* Truncate a file given a file descriptor.
*
* Can't use fget_write() here, since we must return EINVAL and not EBADF if the
* descriptor isn't writable.
*/
int
kern_ftruncate(td, fd, length)
struct thread *td;
int fd;
off_t length;
{
struct file *fp;
int error;
AUDIT_ARG_FD(fd);
if (length < 0)
return (EINVAL);
error = fget(td, fd, CAP_FTRUNCATE, &fp);
if (error)
return (error);
AUDIT_ARG_FILE(td->td_proc, fp);
if (!(fp->f_flag & FWRITE)) {
fdrop(fp, td);
return (EINVAL);
}
error = fo_truncate(fp, length, td->td_ucred, td);
fdrop(fp, td);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct ftruncate_args {
int fd;
int pad;
off_t length;
};
#endif
int
-ftruncate(td, uap)
+sys_ftruncate(td, uap)
struct thread *td;
struct ftruncate_args *uap;
{
return (kern_ftruncate(td, uap->fd, uap->length));
}
#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct oftruncate_args {
int fd;
long length;
};
#endif
int
oftruncate(td, uap)
struct thread *td;
struct oftruncate_args *uap;
{
return (kern_ftruncate(td, uap->fd, uap->length));
}
#endif /* COMPAT_43 */
#ifndef _SYS_SYSPROTO_H_
struct ioctl_args {
int fd;
u_long com;
caddr_t data;
};
#endif
/* ARGSUSED */
int
-ioctl(struct thread *td, struct ioctl_args *uap)
+sys_ioctl(struct thread *td, struct ioctl_args *uap)
{
u_long com;
int arg, error;
u_int size;
caddr_t data;
if (uap->com > 0xffffffff) {
printf(
"WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
td->td_proc->p_pid, td->td_name, uap->com);
uap->com &= 0xffffffff;
}
com = uap->com;
/*
* Interpret high order word to find amount of data to be
* copied to/from the user's address space.
*/
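	/*
	 * Example (illustrative): FIONREAD is defined as _IOR('f', 127, int),
	 * so IOCPARM_LEN() below yields sizeof(int), IOC_OUT is set, and an
	 * int-sized buffer is zeroed, passed down to kern_ioctl(), and copied
	 * back out to uap->data on success.
	 */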
size = IOCPARM_LEN(com);
if ((size > IOCPARM_MAX) ||
((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) ||
#if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
((com & IOC_OUT) && size == 0) ||
#else
((com & (IOC_IN | IOC_OUT)) && size == 0) ||
#endif
((com & IOC_VOID) && size > 0 && size != sizeof(int)))
return (ENOTTY);
if (size > 0) {
if (com & IOC_VOID) {
/* Integer argument. */
arg = (intptr_t)uap->data;
data = (void *)&arg;
size = 0;
} else
data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
} else
data = (void *)&uap->data;
if (com & IOC_IN) {
error = copyin(uap->data, data, (u_int)size);
if (error) {
if (size > 0)
free(data, M_IOCTLOPS);
return (error);
}
} else if (com & IOC_OUT) {
/*
* Zero the buffer so the user always
* gets back something deterministic.
*/
bzero(data, size);
}
error = kern_ioctl(td, uap->fd, com, data);
if (error == 0 && (com & IOC_OUT))
error = copyout(data, uap->data, (u_int)size);
if (size > 0)
free(data, M_IOCTLOPS);
return (error);
}
int
kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
{
struct file *fp;
struct filedesc *fdp;
int error;
int tmp;
AUDIT_ARG_FD(fd);
AUDIT_ARG_CMD(com);
if ((error = fget(td, fd, CAP_IOCTL, &fp)) != 0)
return (error);
if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
fdrop(fp, td);
return (EBADF);
}
fdp = td->td_proc->p_fd;
switch (com) {
case FIONCLEX:
FILEDESC_XLOCK(fdp);
fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE;
FILEDESC_XUNLOCK(fdp);
goto out;
case FIOCLEX:
FILEDESC_XLOCK(fdp);
fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
FILEDESC_XUNLOCK(fdp);
goto out;
case FIONBIO:
if ((tmp = *(int *)data))
atomic_set_int(&fp->f_flag, FNONBLOCK);
else
atomic_clear_int(&fp->f_flag, FNONBLOCK);
data = (void *)&tmp;
break;
case FIOASYNC:
if ((tmp = *(int *)data))
atomic_set_int(&fp->f_flag, FASYNC);
else
atomic_clear_int(&fp->f_flag, FASYNC);
data = (void *)&tmp;
break;
}
error = fo_ioctl(fp, com, data, td->td_ucred, td);
out:
fdrop(fp, td);
return (error);
}
int
poll_no_poll(int events)
{
/*
* Return true for read/write. If the user asked for something
* special, return POLLNVAL, so that clients have a way of
* determining reliably whether or not the extended
* functionality is present without hard-coding knowledge
* of specific filesystem implementations.
*/
if (events & ~POLLSTANDARD)
return (POLLNVAL);
return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}
int
-pselect(struct thread *td, struct pselect_args *uap)
+sys_pselect(struct thread *td, struct pselect_args *uap)
{
struct timespec ts;
struct timeval tv, *tvp;
sigset_t set, *uset;
int error;
if (uap->ts != NULL) {
error = copyin(uap->ts, &ts, sizeof(ts));
if (error != 0)
return (error);
TIMESPEC_TO_TIMEVAL(&tv, &ts);
tvp = &tv;
} else
tvp = NULL;
if (uap->sm != NULL) {
error = copyin(uap->sm, &set, sizeof(set));
if (error != 0)
return (error);
uset = &set;
} else
uset = NULL;
return (kern_pselect(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
uset, NFDBITS));
}
int
kern_pselect(struct thread *td, int nd, fd_set *in, fd_set *ou, fd_set *ex,
struct timeval *tvp, sigset_t *uset, int abi_nfdbits)
{
int error;
if (uset != NULL) {
error = kern_sigprocmask(td, SIG_SETMASK, uset,
&td->td_oldsigmask, 0);
if (error != 0)
return (error);
td->td_pflags |= TDP_OLDMASK;
/*
* Make sure that ast() is called on return to
* usermode and TDP_OLDMASK is cleared, restoring old
* sigmask.
*/
thread_lock(td);
td->td_flags |= TDF_ASTPENDING;
thread_unlock(td);
}
error = kern_select(td, nd, in, ou, ex, tvp, abi_nfdbits);
return (error);
}
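/*
* A minimal userland sketch of what kern_pselect() adds over plain
* select(); "fd", "rset" and "mask" are hypothetical.  The supplied mask
* is installed atomically for the duration of the wait and restored on
* return to usermode via TDP_OLDMASK, closing the race between unblocking
* a signal and going to sleep:
*
*	sigset_t mask;
*	sigemptyset(&mask);		// accept any signal while waiting
*	pselect(fd + 1, &rset, NULL, NULL, NULL, &mask);
*/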
#ifndef _SYS_SYSPROTO_H_
struct select_args {
int nd;
fd_set *in, *ou, *ex;
struct timeval *tv;
};
#endif
int
-select(struct thread *td, struct select_args *uap)
+sys_select(struct thread *td, struct select_args *uap)
{
struct timeval tv, *tvp;
int error;
if (uap->tv != NULL) {
error = copyin(uap->tv, &tv, sizeof(tv));
if (error)
return (error);
tvp = &tv;
} else
tvp = NULL;
return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
NFDBITS));
}
int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
fd_set *fd_ex, struct timeval *tvp, int abi_nfdbits)
{
struct filedesc *fdp;
/*
* The magic 2048 here is chosen to be just enough for FD_SETSIZE
* infds with the new FD_SETSIZE of 1024, and more than enough for
* FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
* of 256.
*/
fd_mask s_selbits[howmany(2048, NFDBITS)];
fd_mask *ibits[3], *obits[3], *selbits, *sbp;
struct timeval atv, rtv, ttv;
int error, timo;
u_int nbufbytes, ncpbytes, ncpubytes, nfdbits;
if (nd < 0)
return (EINVAL);
fdp = td->td_proc->p_fd;
if (nd > fdp->fd_lastfile + 1)
nd = fdp->fd_lastfile + 1;
/*
* Allocate just enough bits for the non-null fd_sets. Use the
* preallocated auto buffer if possible.
*/
nfdbits = roundup(nd, NFDBITS);
ncpbytes = nfdbits / NBBY;
ncpubytes = roundup(nd, abi_nfdbits) / NBBY;
nbufbytes = 0;
if (fd_in != NULL)
nbufbytes += 2 * ncpbytes;
if (fd_ou != NULL)
nbufbytes += 2 * ncpbytes;
if (fd_ex != NULL)
nbufbytes += 2 * ncpbytes;
if (nbufbytes <= sizeof s_selbits)
selbits = &s_selbits[0];
else
selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
/*
* Assign pointers into the bit buffers and fetch the input bits.
* Put the output buffers together so that they can be bzeroed
* together.
*/
sbp = selbits;
#define getbits(name, x) \
do { \
if (name == NULL) { \
ibits[x] = NULL; \
obits[x] = NULL; \
} else { \
ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \
obits[x] = sbp; \
sbp += ncpbytes / sizeof *sbp; \
error = copyin(name, ibits[x], ncpubytes); \
if (error != 0) \
goto done; \
bzero((char *)ibits[x] + ncpubytes, \
ncpbytes - ncpubytes); \
} \
} while (0)
getbits(fd_in, 0);
getbits(fd_ou, 1);
getbits(fd_ex, 2);
#undef getbits
#if BYTE_ORDER == BIG_ENDIAN && defined(__LP64__)
/*
* XXX: swizzle_fdset assumes that if abi_nfdbits != NFDBITS,
* we are running under 32-bit emulation. This should be more
* generic.
*/
#define swizzle_fdset(bits) \
if (abi_nfdbits != NFDBITS && bits != NULL) { \
int i; \
for (i = 0; i < ncpbytes / sizeof *sbp; i++) \
bits[i] = (bits[i] >> 32) | (bits[i] << 32); \
}
#else
#define swizzle_fdset(bits)
#endif
/* Make sure the bit order makes it through an ABI transition */
swizzle_fdset(ibits[0]);
swizzle_fdset(ibits[1]);
swizzle_fdset(ibits[2]);
if (nbufbytes != 0)
bzero(selbits, nbufbytes / 2);
if (tvp != NULL) {
atv = *tvp;
if (itimerfix(&atv)) {
error = EINVAL;
goto done;
}
getmicrouptime(&rtv);
timevaladd(&atv, &rtv);
} else {
atv.tv_sec = 0;
atv.tv_usec = 0;
}
timo = 0;
seltdinit(td);
/* Iterate until the timeout expires or descriptors become ready. */
for (;;) {
error = selscan(td, ibits, obits, nd);
if (error || td->td_retval[0] != 0)
break;
if (atv.tv_sec || atv.tv_usec) {
getmicrouptime(&rtv);
if (timevalcmp(&rtv, &atv, >=))
break;
ttv = atv;
timevalsub(&ttv, &rtv);
timo = ttv.tv_sec > 24 * 60 * 60 ?
24 * 60 * 60 * hz : tvtohz(&ttv);
}
error = seltdwait(td, timo);
if (error)
break;
error = selrescan(td, ibits, obits);
if (error || td->td_retval[0] != 0)
break;
}
seltdclear(td);
done:
/* select is not restarted after signals... */
if (error == ERESTART)
error = EINTR;
if (error == EWOULDBLOCK)
error = 0;
/* swizzle bit order back, if necessary */
swizzle_fdset(obits[0]);
swizzle_fdset(obits[1]);
swizzle_fdset(obits[2]);
#undef swizzle_fdset
#define putbits(name, x) \
if (name && (error2 = copyout(obits[x], name, ncpubytes))) \
error = error2;
if (error == 0) {
int error2;
putbits(fd_in, 0);
putbits(fd_ou, 1);
putbits(fd_ex, 2);
#undef putbits
}
if (selbits != &s_selbits[0])
free(selbits, M_SELECT);
return (error);
}
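/*
* A minimal userland sketch of the interface implemented by kern_select()
* above; "fd" is hypothetical.  Only the first roundup(nd, NFDBITS)/NBBY
* bytes of each non-NULL set are copied in, so nd must cover the highest
* descriptor of interest plus one:
*
*	fd_set rset;
*	struct timeval tv = { 2, 0 };	// two second timeout
*	FD_ZERO(&rset);
*	FD_SET(fd, &rset);
*	if (select(fd + 1, &rset, NULL, NULL, &tv) > 0 && FD_ISSET(fd, &rset))
*		;			// fd is readable
*/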
/*
* Convert a select bit set to poll flags.
*
* The backend always returns POLLHUP/POLLERR if appropriate and we
* return this as a set bit in any set.
*/
static int select_flags[3] = {
POLLRDNORM | POLLHUP | POLLERR,
POLLWRNORM | POLLHUP | POLLERR,
POLLRDBAND | POLLERR
};
/*
* Compute the fo_poll flags required for a fd given by the index and
* bit position in the fd_mask array.
*/
static __inline int
selflags(fd_mask **ibits, int idx, fd_mask bit)
{
int flags;
int msk;
flags = 0;
for (msk = 0; msk < 3; msk++) {
if (ibits[msk] == NULL)
continue;
if ((ibits[msk][idx] & bit) == 0)
continue;
flags |= select_flags[msk];
}
return (flags);
}
/*
* Set the appropriate output bits given a mask of fired events and the
* input bits originally requested.
*/
static __inline int
selsetbits(fd_mask **ibits, fd_mask **obits, int idx, fd_mask bit, int events)
{
int msk;
int n;
n = 0;
for (msk = 0; msk < 3; msk++) {
if ((events & select_flags[msk]) == 0)
continue;
if (ibits[msk] == NULL)
continue;
if ((ibits[msk][idx] & bit) == 0)
continue;
/*
* XXX Check for a duplicate set. This can occur because a
* socket calls selrecord() twice for each poll() call
* resulting in two selfds per real fd. selrescan() will
* call selsetbits twice as a result.
*/
if ((obits[msk][idx] & bit) != 0)
continue;
obits[msk][idx] |= bit;
n++;
}
return (n);
}
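/*
* Worked example of the two helpers above: a descriptor present only in
* the read set yields selflags() == POLLRDNORM | POLLHUP | POLLERR, so a
* descriptor whose peer has gone away (fo_poll() reporting POLLHUP) still
* gets its bit set in the read output set by selsetbits(), matching the
* select() notion of "readable" that includes end-of-file.
*/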
static __inline int
getselfd_cap(struct filedesc *fdp, int fd, struct file **fpp)
{
struct file *fp;
#ifdef CAPABILITIES
struct file *fp_fromcap;
int error;
#endif
if ((fp = fget_unlocked(fdp, fd)) == NULL)
return (EBADF);
#ifdef CAPABILITIES
/*
* If the file descriptor is for a capability, test rights and use
* the file descriptor references by the capability.
*/
error = cap_funwrap(fp, CAP_POLL_EVENT, &fp_fromcap);
if (error) {
fdrop(fp, curthread);
return (error);
}
if (fp != fp_fromcap) {
fhold(fp_fromcap);
fdrop(fp, curthread);
fp = fp_fromcap;
}
#endif /* CAPABILITIES */
*fpp = fp;
return (0);
}
/*
* Traverse the list of fds attached to this thread's seltd and check for
* completion.
*/
static int
selrescan(struct thread *td, fd_mask **ibits, fd_mask **obits)
{
struct filedesc *fdp;
struct selinfo *si;
struct seltd *stp;
struct selfd *sfp;
struct selfd *sfn;
struct file *fp;
fd_mask bit;
int fd, ev, n, idx;
int error;
fdp = td->td_proc->p_fd;
stp = td->td_sel;
n = 0;
STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
fd = (int)(uintptr_t)sfp->sf_cookie;
si = sfp->sf_si;
selfdfree(stp, sfp);
/* If the selinfo wasn't cleared the event didn't fire. */
if (si != NULL)
continue;
error = getselfd_cap(fdp, fd, &fp);
if (error)
return (error);
idx = fd / NFDBITS;
bit = (fd_mask)1 << (fd % NFDBITS);
ev = fo_poll(fp, selflags(ibits, idx, bit), td->td_ucred, td);
fdrop(fp, td);
if (ev != 0)
n += selsetbits(ibits, obits, idx, bit, ev);
}
stp->st_flags = 0;
td->td_retval[0] = n;
return (0);
}
/*
* Perform the initial filedescriptor scan and register ourselves with
* each selinfo.
*/
static int
selscan(td, ibits, obits, nfd)
struct thread *td;
fd_mask **ibits, **obits;
int nfd;
{
struct filedesc *fdp;
struct file *fp;
fd_mask bit;
int ev, flags, end, fd;
int n, idx;
int error;
fdp = td->td_proc->p_fd;
n = 0;
for (idx = 0, fd = 0; fd < nfd; idx++) {
end = imin(fd + NFDBITS, nfd);
for (bit = 1; fd < end; bit <<= 1, fd++) {
/* Compute the list of events we're interested in. */
flags = selflags(ibits, idx, bit);
if (flags == 0)
continue;
error = getselfd_cap(fdp, fd, &fp);
if (error)
return (error);
selfdalloc(td, (void *)(uintptr_t)fd);
ev = fo_poll(fp, flags, td->td_ucred, td);
fdrop(fp, td);
if (ev != 0)
n += selsetbits(ibits, obits, idx, bit, ev);
}
}
td->td_retval[0] = n;
return (0);
}
#ifndef _SYS_SYSPROTO_H_
struct poll_args {
struct pollfd *fds;
u_int nfds;
int timeout;
};
#endif
int
-poll(td, uap)
+sys_poll(td, uap)
struct thread *td;
struct poll_args *uap;
{
struct pollfd *bits;
struct pollfd smallbits[32];
struct timeval atv, rtv, ttv;
int error = 0, timo;
u_int nfds;
size_t ni;
nfds = uap->nfds;
if (nfds > maxfilesperproc && nfds > FD_SETSIZE)
return (EINVAL);
ni = nfds * sizeof(struct pollfd);
if (ni > sizeof(smallbits))
bits = malloc(ni, M_TEMP, M_WAITOK);
else
bits = smallbits;
error = copyin(uap->fds, bits, ni);
if (error)
goto done;
if (uap->timeout != INFTIM) {
atv.tv_sec = uap->timeout / 1000;
atv.tv_usec = (uap->timeout % 1000) * 1000;
if (itimerfix(&atv)) {
error = EINVAL;
goto done;
}
getmicrouptime(&rtv);
timevaladd(&atv, &rtv);
} else {
atv.tv_sec = 0;
atv.tv_usec = 0;
}
timo = 0;
seltdinit(td);
/* Iterate until the timeout expires or descriptors become ready. */
for (;;) {
error = pollscan(td, bits, nfds);
if (error || td->td_retval[0] != 0)
break;
if (atv.tv_sec || atv.tv_usec) {
getmicrouptime(&rtv);
if (timevalcmp(&rtv, &atv, >=))
break;
ttv = atv;
timevalsub(&ttv, &rtv);
timo = ttv.tv_sec > 24 * 60 * 60 ?
24 * 60 * 60 * hz : tvtohz(&ttv);
}
error = seltdwait(td, timo);
if (error)
break;
error = pollrescan(td);
if (error || td->td_retval[0] != 0)
break;
}
seltdclear(td);
done:
/* poll is not restarted after signals... */
if (error == ERESTART)
error = EINTR;
if (error == EWOULDBLOCK)
error = 0;
if (error == 0) {
error = pollout(td, bits, uap->fds, nfds);
if (error)
goto out;
}
out:
if (ni > sizeof(smallbits))
free(bits, M_TEMP);
return (error);
}
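/*
* A minimal userland sketch of the interface implemented by sys_poll()
* above; "fd" is hypothetical.  pollout() copies revents back for every
* entry, even ones for which no event fired:
*
*	struct pollfd pfd = { .fd = fd, .events = POLLIN };
*	int n = poll(&pfd, 1, 2000);	// 2000 ms timeout; INFTIM blocks
*	if (n > 0 && (pfd.revents & (POLLIN | POLLHUP)))
*		;			// readable, or the writer is gone
*/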
static int
pollrescan(struct thread *td)
{
struct seltd *stp;
struct selfd *sfp;
struct selfd *sfn;
struct selinfo *si;
struct filedesc *fdp;
struct file *fp;
struct pollfd *fd;
int n;
n = 0;
fdp = td->td_proc->p_fd;
stp = td->td_sel;
FILEDESC_SLOCK(fdp);
STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
fd = (struct pollfd *)sfp->sf_cookie;
si = sfp->sf_si;
selfdfree(stp, sfp);
/* If the selinfo wasn't cleared the event didn't fire. */
if (si != NULL)
continue;
fp = fdp->fd_ofiles[fd->fd];
#ifdef CAPABILITIES
if ((fp == NULL)
|| (cap_funwrap(fp, CAP_POLL_EVENT, &fp) != 0)) {
#else
if (fp == NULL) {
#endif
fd->revents = POLLNVAL;
n++;
continue;
}
/*
* Note: backend also returns POLLHUP and
* POLLERR if appropriate.
*/
fd->revents = fo_poll(fp, fd->events, td->td_ucred, td);
if (fd->revents != 0)
n++;
}
FILEDESC_SUNLOCK(fdp);
stp->st_flags = 0;
td->td_retval[0] = n;
return (0);
}
static int
pollout(td, fds, ufds, nfd)
struct thread *td;
struct pollfd *fds;
struct pollfd *ufds;
u_int nfd;
{
int error = 0;
u_int i = 0;
u_int n = 0;
for (i = 0; i < nfd; i++) {
error = copyout(&fds->revents, &ufds->revents,
sizeof(ufds->revents));
if (error)
return (error);
if (fds->revents != 0)
n++;
fds++;
ufds++;
}
td->td_retval[0] = n;
return (0);
}
static int
pollscan(td, fds, nfd)
struct thread *td;
struct pollfd *fds;
u_int nfd;
{
struct filedesc *fdp = td->td_proc->p_fd;
int i;
struct file *fp;
int n = 0;
FILEDESC_SLOCK(fdp);
for (i = 0; i < nfd; i++, fds++) {
if (fds->fd >= fdp->fd_nfiles) {
fds->revents = POLLNVAL;
n++;
} else if (fds->fd < 0) {
fds->revents = 0;
} else {
fp = fdp->fd_ofiles[fds->fd];
#ifdef CAPABILITIES
if ((fp == NULL)
|| (cap_funwrap(fp, CAP_POLL_EVENT, &fp) != 0)) {
#else
if (fp == NULL) {
#endif
fds->revents = POLLNVAL;
n++;
} else {
/*
* Note: backend also returns POLLHUP and
* POLLERR if appropriate.
*/
selfdalloc(td, fds);
fds->revents = fo_poll(fp, fds->events,
td->td_ucred, td);
/*
* POSIX requires that POLLOUT never be
* set simultaneously with POLLHUP.
*/
if ((fds->revents & POLLHUP) != 0)
fds->revents &= ~POLLOUT;
if (fds->revents != 0)
n++;
}
}
}
FILEDESC_SUNLOCK(fdp);
td->td_retval[0] = n;
return (0);
}
/*
* OpenBSD poll system call.
*
* XXX this isn't quite a true representation.. OpenBSD uses select ops.
*/
#ifndef _SYS_SYSPROTO_H_
struct openbsd_poll_args {
struct pollfd *fds;
u_int nfds;
int timeout;
};
#endif
int
-openbsd_poll(td, uap)
+sys_openbsd_poll(td, uap)
register struct thread *td;
register struct openbsd_poll_args *uap;
{
- return (poll(td, (struct poll_args *)uap));
+ return (sys_poll(td, (struct poll_args *)uap));
}
/*
* XXX This was created specifically to support netncp and netsmb. This
* allows the caller to specify a socket to wait for events on. It returns
* 0 if any events matched and an error otherwise. There is no way to
* determine which events fired.
*/
int
selsocket(struct socket *so, int events, struct timeval *tvp, struct thread *td)
{
struct timeval atv, rtv, ttv;
int error, timo;
if (tvp != NULL) {
atv = *tvp;
if (itimerfix(&atv))
return (EINVAL);
getmicrouptime(&rtv);
timevaladd(&atv, &rtv);
} else {
atv.tv_sec = 0;
atv.tv_usec = 0;
}
timo = 0;
seltdinit(td);
/*
* Iterate until the timeout expires or the socket becomes ready.
*/
for (;;) {
selfdalloc(td, NULL);
error = sopoll(so, events, NULL, td);
/* error here is actually the ready events. */
if (error)
return (0);
if (atv.tv_sec || atv.tv_usec) {
getmicrouptime(&rtv);
if (timevalcmp(&rtv, &atv, >=)) {
seltdclear(td);
return (EWOULDBLOCK);
}
ttv = atv;
timevalsub(&ttv, &rtv);
timo = ttv.tv_sec > 24 * 60 * 60 ?
24 * 60 * 60 * hz : tvtohz(&ttv);
}
error = seltdwait(td, timo);
seltdclear(td);
if (error)
break;
}
/* XXX Duplicates ncp/smb behavior. */
if (error == ERESTART)
error = 0;
return (error);
}
/*
* Preallocate two selfds associated with 'cookie'. Some fo_poll routines
* have two select sets, one for read and another for write.
*/
static void
selfdalloc(struct thread *td, void *cookie)
{
struct seltd *stp;
stp = td->td_sel;
if (stp->st_free1 == NULL)
stp->st_free1 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO);
stp->st_free1->sf_td = stp;
stp->st_free1->sf_cookie = cookie;
if (stp->st_free2 == NULL)
stp->st_free2 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO);
stp->st_free2->sf_td = stp;
stp->st_free2->sf_cookie = cookie;
}
static void
selfdfree(struct seltd *stp, struct selfd *sfp)
{
STAILQ_REMOVE(&stp->st_selq, sfp, selfd, sf_link);
mtx_lock(sfp->sf_mtx);
if (sfp->sf_si)
TAILQ_REMOVE(&sfp->sf_si->si_tdlist, sfp, sf_threads);
mtx_unlock(sfp->sf_mtx);
uma_zfree(selfd_zone, sfp);
}
/* Drain the waiters tied to all the selfd belonging the specified selinfo. */
void
seldrain(sip)
struct selinfo *sip;
{
/*
* This feature is already provided by doselwakeup(), thus it is
* enough to call it here.
* Eventually, the calling context should take care to avoid races
* between a thread calling select()/poll() and file descriptor
* detaching, but, again, the races are just the same as
* selwakeup().
*/
doselwakeup(sip, -1);
}
/*
* Record a select request.
*/
void
selrecord(selector, sip)
struct thread *selector;
struct selinfo *sip;
{
struct selfd *sfp;
struct seltd *stp;
struct mtx *mtxp;
stp = selector->td_sel;
/*
* Don't record when doing a rescan.
*/
if (stp->st_flags & SELTD_RESCAN)
return;
/*
* Grab one of the preallocated descriptors.
*/
sfp = NULL;
if ((sfp = stp->st_free1) != NULL)
stp->st_free1 = NULL;
else if ((sfp = stp->st_free2) != NULL)
stp->st_free2 = NULL;
else
panic("selrecord: No free selfd on selq");
mtxp = sip->si_mtx;
if (mtxp == NULL)
mtxp = mtx_pool_find(mtxpool_select, sip);
/*
* Initialize the sfp and queue it in the thread.
*/
sfp->sf_si = sip;
sfp->sf_mtx = mtxp;
STAILQ_INSERT_TAIL(&stp->st_selq, sfp, sf_link);
/*
* Now that we've locked the sip, check for initialization.
*/
mtx_lock(mtxp);
if (sip->si_mtx == NULL) {
sip->si_mtx = mtxp;
TAILQ_INIT(&sip->si_tdlist);
}
/*
* Add this thread to the list of selfds listening on this selinfo.
*/
TAILQ_INSERT_TAIL(&sip->si_tdlist, sfp, sf_threads);
mtx_unlock(sip->si_mtx);
}
/* Wake up a selecting thread. */
void
selwakeup(sip)
struct selinfo *sip;
{
doselwakeup(sip, -1);
}
/* Wake up a selecting thread, and set its priority. */
void
selwakeuppri(sip, pri)
struct selinfo *sip;
int pri;
{
doselwakeup(sip, pri);
}
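/*
* A condensed sketch of how a driver's poll method typically pairs with
* selrecord()/selwakeup(); the softc layout and names are hypothetical:
*
*	static int
*	foo_poll(struct cdev *dev, int events, struct thread *td)
*	{
*		struct foo_softc *sc = dev->si_drv1;
*		int revents = 0;
*
*		mtx_lock(&sc->sc_mtx);
*		if (sc->sc_ready)
*			revents = events & (POLLIN | POLLRDNORM);
*		else
*			selrecord(td, &sc->sc_rsel);	// park this thread
*		mtx_unlock(&sc->sc_mtx);
*		return (revents);
*	}
*
* When data later arrives (e.g. from the interrupt handler), the driver
* calls selwakeup(&sc->sc_rsel), which ends up in doselwakeup() below and
* marks every waiting seltd as SELTD_PENDING.
*/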
/*
* Do a wakeup when a selectable event occurs.
*/
static void
doselwakeup(sip, pri)
struct selinfo *sip;
int pri;
{
struct selfd *sfp;
struct selfd *sfn;
struct seltd *stp;
/* If it's not initialized there can't be any waiters. */
if (sip->si_mtx == NULL)
return;
/*
* Locking the selinfo locks all selfds associated with it.
*/
mtx_lock(sip->si_mtx);
TAILQ_FOREACH_SAFE(sfp, &sip->si_tdlist, sf_threads, sfn) {
/*
* Once we remove this sfp from the list and clear the
* sf_si, seltdclear will know to ignore this si.
*/
TAILQ_REMOVE(&sip->si_tdlist, sfp, sf_threads);
sfp->sf_si = NULL;
stp = sfp->sf_td;
mtx_lock(&stp->st_mtx);
stp->st_flags |= SELTD_PENDING;
cv_broadcastpri(&stp->st_wait, pri);
mtx_unlock(&stp->st_mtx);
}
mtx_unlock(sip->si_mtx);
}
static void
seltdinit(struct thread *td)
{
struct seltd *stp;
if ((stp = td->td_sel) != NULL)
goto out;
td->td_sel = stp = malloc(sizeof(*stp), M_SELECT, M_WAITOK|M_ZERO);
mtx_init(&stp->st_mtx, "sellck", NULL, MTX_DEF);
cv_init(&stp->st_wait, "select");
out:
stp->st_flags = 0;
STAILQ_INIT(&stp->st_selq);
}
static int
seltdwait(struct thread *td, int timo)
{
struct seltd *stp;
int error;
stp = td->td_sel;
/*
* An event of interest may occur while we do not hold the seltd
* lock, so check the pending flag before we sleep.
*/
mtx_lock(&stp->st_mtx);
/*
* Any further calls to selrecord will be a rescan.
*/
stp->st_flags |= SELTD_RESCAN;
if (stp->st_flags & SELTD_PENDING) {
mtx_unlock(&stp->st_mtx);
return (0);
}
if (timo > 0)
error = cv_timedwait_sig(&stp->st_wait, &stp->st_mtx, timo);
else
error = cv_wait_sig(&stp->st_wait, &stp->st_mtx);
mtx_unlock(&stp->st_mtx);
return (error);
}
void
seltdfini(struct thread *td)
{
struct seltd *stp;
stp = td->td_sel;
if (stp == NULL)
return;
if (stp->st_free1)
uma_zfree(selfd_zone, stp->st_free1);
if (stp->st_free2)
uma_zfree(selfd_zone, stp->st_free2);
td->td_sel = NULL;
free(stp, M_SELECT);
}
/*
* Remove the references to the thread from all of the objects we were
* polling.
*/
static void
seltdclear(struct thread *td)
{
struct seltd *stp;
struct selfd *sfp;
struct selfd *sfn;
stp = td->td_sel;
STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn)
selfdfree(stp, sfp);
stp->st_flags = 0;
}
static void selectinit(void *);
SYSINIT(select, SI_SUB_SYSCALLS, SI_ORDER_ANY, selectinit, NULL);
static void
selectinit(void *dummy __unused)
{
selfd_zone = uma_zcreate("selfd", sizeof(struct selfd), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, 0);
mtxpool_select = mtx_pool_create("select mtxpool", 128, MTX_DEF);
}
Index: head/sys/kern/sys_pipe.c
===================================================================
--- head/sys/kern/sys_pipe.c (revision 225616)
+++ head/sys/kern/sys_pipe.c (revision 225617)
@@ -1,1626 +1,1626 @@
/*-
* Copyright (c) 1996 John S. Dyson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice immediately at the beginning of the file, without modification,
* this list of conditions, and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Absolutely no warranty of function or purpose is made by the author
* John S. Dyson.
* 4. Modifications may be freely made to this file if the above conditions
* are met.
*/
/*
* This file contains a high-performance replacement for the socket-based
* pipes scheme originally used in FreeBSD/4.4Lite. It does not support
* all features of sockets, but does do everything that pipes normally
* do.
*/
/*
* This code has two modes of operation, a small write mode and a large
* write mode. The small write mode acts like conventional pipes with
* a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the
* "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT
* and PIPE_SIZE in size, the sending process pins the underlying pages in
* memory, and the receiving process copies directly from these pinned pages
* in the sending process.
*
* If the sending process receives a signal, it is possible that it will
* go away, and certainly its address space can change, because control
* is returned back to the user-mode side. In that case, the pipe code
* arranges to copy the buffer supplied by the user process, to a pageable
* kernel buffer, and the receiving process will grab the data from the
* pageable kernel buffer. Since signals don't happen all that often,
* the copy operation is normally eliminated.
*
* The constant PIPE_MINDIRECT is chosen to make sure that buffering will
* happen for small transfers so that the system will not spend all of
* its time context switching.
*
* In order to limit the resource use of pipes, two sysctls exist:
*
* kern.ipc.maxpipekva - This is a hard limit on the amount of pageable
* address space available to us in pipe_map. This value is normally
* autotuned, but may also be loader tuned.
*
* kern.ipc.pipekva - This read-only sysctl tracks the current amount of
* memory in use by pipes.
*
* Based on how large pipekva is relative to maxpipekva, the following
* will happen:
*
* 0% - 50%:
* New pipes are given 16K of memory backing, pipes may dynamically
* grow to as large as 64K where needed.
* 50% - 75%:
* New pipes are given 4K (or PAGE_SIZE) of memory backing,
* existing pipes may NOT grow.
* 75% - 100%:
* New pipes are given 4K (or PAGE_SIZE) of memory backing,
* existing pipes will be shrunk down to 4K whenever possible.
*
* Resizing may be disabled by setting kern.ipc.piperesizeallowed=0. If
* that is set, the only resize that will occur is the 0 -> SMALL_PIPE_SIZE
* resize which MUST occur for reverse-direction pipes when they are
* first used.
*
* Additional information about the current state of pipes may be obtained
* from kern.ipc.pipes, kern.ipc.pipefragretry, kern.ipc.pipeallocfail,
* and kern.ipc.piperesizefail.
*
* Locking rules: There are two locks present here: A mutex, used via
* PIPE_LOCK, and a flag, used via pipelock(). All locking is done via
* the flag, as mutexes can not persist over uiomove. The mutex
* exists only to guard access to the flag, and is not in itself a
* locking mechanism. Also note that there is only a single mutex for
* both directions of a pipe.
*
* As pipelock() may have to sleep before it can acquire the flag, it
* is important to reread all data after a call to pipelock(); everything
* in the structure may have changed.
*/
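/*
* A rough userland sketch of the two modes described above, assuming the
* usual PIPE_MINDIRECT value of 8192 bytes and a pipe pair in "pfd"
* (hypothetical):
*
*	char small[512], big[64 * 1024];
*	write(pfd[1], small, sizeof(small));	// copied into the pipe's
*						// kernel buffer
*	write(pfd[1], big, sizeof(big));	// writer's pages are wired
*						// and the reader copies
*						// straight from them
*
* The second write only takes the direct path when the descriptor is
* blocking, the source buffer is in user space, and the pipe's buffer is
* at least PIPE_MINDIRECT bytes; see pipe_direct_write() below.
*/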
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/ttycom.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/selinfo.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/pipe.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/event.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/uma.h>
/*
* Use this define if you want to disable *fancy* VM things. Expect an
* approx 30% decrease in transfer rate. This could be useful for
* NetBSD or OpenBSD.
*/
/* #define PIPE_NODIRECT */
/*
* interfaces to the outside world
*/
static fo_rdwr_t pipe_read;
static fo_rdwr_t pipe_write;
static fo_truncate_t pipe_truncate;
static fo_ioctl_t pipe_ioctl;
static fo_poll_t pipe_poll;
static fo_kqfilter_t pipe_kqfilter;
static fo_stat_t pipe_stat;
static fo_close_t pipe_close;
static struct fileops pipeops = {
.fo_read = pipe_read,
.fo_write = pipe_write,
.fo_truncate = pipe_truncate,
.fo_ioctl = pipe_ioctl,
.fo_poll = pipe_poll,
.fo_kqfilter = pipe_kqfilter,
.fo_stat = pipe_stat,
.fo_close = pipe_close,
.fo_chmod = invfo_chmod,
.fo_chown = invfo_chown,
.fo_flags = DFLAG_PASSABLE
};
static void filt_pipedetach(struct knote *kn);
static int filt_piperead(struct knote *kn, long hint);
static int filt_pipewrite(struct knote *kn, long hint);
static struct filterops pipe_rfiltops = {
.f_isfd = 1,
.f_detach = filt_pipedetach,
.f_event = filt_piperead
};
static struct filterops pipe_wfiltops = {
.f_isfd = 1,
.f_detach = filt_pipedetach,
.f_event = filt_pipewrite
};
/*
* Default pipe buffer size(s); this can be kind-of large now because pipe
* space is pageable. The pipe code will try to maintain locality of
* reference for performance reasons, so small amounts of outstanding I/O
* will not wipe the cache.
*/
#define MINPIPESIZE (PIPE_SIZE/3)
#define MAXPIPESIZE (2*PIPE_SIZE/3)
static long amountpipekva;
static int pipefragretry;
static int pipeallocfail;
static int piperesizefail;
static int piperesizeallowed = 1;
SYSCTL_LONG(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN,
&maxpipekva, 0, "Pipe KVA limit");
SYSCTL_LONG(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD,
&amountpipekva, 0, "Pipe KVA usage");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipefragretry, CTLFLAG_RD,
&pipefragretry, 0, "Pipe allocation retries due to fragmentation");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipeallocfail, CTLFLAG_RD,
&pipeallocfail, 0, "Pipe allocation failures");
SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizefail, CTLFLAG_RD,
&piperesizefail, 0, "Pipe resize failures");
SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizeallowed, CTLFLAG_RW,
&piperesizeallowed, 0, "Pipe resizing allowed");
static void pipeinit(void *dummy __unused);
static void pipeclose(struct pipe *cpipe);
static void pipe_free_kmem(struct pipe *cpipe);
static int pipe_create(struct pipe *pipe, int backing);
static __inline int pipelock(struct pipe *cpipe, int catch);
static __inline void pipeunlock(struct pipe *cpipe);
static __inline void pipeselwakeup(struct pipe *cpipe);
#ifndef PIPE_NODIRECT
static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio);
static void pipe_destroy_write_buffer(struct pipe *wpipe);
static int pipe_direct_write(struct pipe *wpipe, struct uio *uio);
static void pipe_clone_write_buffer(struct pipe *wpipe);
#endif
static int pipespace(struct pipe *cpipe, int size);
static int pipespace_new(struct pipe *cpipe, int size);
static int pipe_zone_ctor(void *mem, int size, void *arg, int flags);
static int pipe_zone_init(void *mem, int size, int flags);
static void pipe_zone_fini(void *mem, int size);
static uma_zone_t pipe_zone;
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);
static void
pipeinit(void *dummy __unused)
{
pipe_zone = uma_zcreate("pipe", sizeof(struct pipepair),
pipe_zone_ctor, NULL, pipe_zone_init, pipe_zone_fini,
UMA_ALIGN_PTR, 0);
KASSERT(pipe_zone != NULL, ("pipe_zone not initialized"));
}
static int
pipe_zone_ctor(void *mem, int size, void *arg, int flags)
{
struct pipepair *pp;
struct pipe *rpipe, *wpipe;
KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size"));
pp = (struct pipepair *)mem;
/*
* We zero both pipe endpoints to make sure all the kmem pointers
* are NULL, flag fields are zero'd, etc. We timestamp both
* endpoints with the same time.
*/
rpipe = &pp->pp_rpipe;
bzero(rpipe, sizeof(*rpipe));
vfs_timestamp(&rpipe->pipe_ctime);
rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime;
wpipe = &pp->pp_wpipe;
bzero(wpipe, sizeof(*wpipe));
wpipe->pipe_ctime = rpipe->pipe_ctime;
wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime;
rpipe->pipe_peer = wpipe;
rpipe->pipe_pair = pp;
wpipe->pipe_peer = rpipe;
wpipe->pipe_pair = pp;
/*
* Mark both endpoints as present; they will later get free'd
* one at a time. When both are free'd, then the whole pair
* is released.
*/
rpipe->pipe_present = PIPE_ACTIVE;
wpipe->pipe_present = PIPE_ACTIVE;
/*
* Eventually, the MAC Framework may initialize the label
* in ctor or init, but for now we do it elsewhere to avoid
* blocking in ctor or init.
*/
pp->pp_label = NULL;
return (0);
}
static int
pipe_zone_init(void *mem, int size, int flags)
{
struct pipepair *pp;
KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size"));
pp = (struct pipepair *)mem;
mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_RECURSE);
return (0);
}
static void
pipe_zone_fini(void *mem, int size)
{
struct pipepair *pp;
KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size"));
pp = (struct pipepair *)mem;
mtx_destroy(&pp->pp_mtx);
}
/*
* The pipe system call for the DTYPE_PIPE type of pipes. If we fail, let
* the zone pick up the pieces via pipeclose().
*/
int
kern_pipe(struct thread *td, int fildes[2])
{
struct filedesc *fdp = td->td_proc->p_fd;
struct file *rf, *wf;
struct pipepair *pp;
struct pipe *rpipe, *wpipe;
int fd, error;
pp = uma_zalloc(pipe_zone, M_WAITOK);
#ifdef MAC
/*
* The MAC label is shared between the connected endpoints. As a
* result mac_pipe_init() and mac_pipe_create() are called once
* for the pair, and not on the endpoints.
*/
mac_pipe_init(pp);
mac_pipe_create(td->td_ucred, pp);
#endif
rpipe = &pp->pp_rpipe;
wpipe = &pp->pp_wpipe;
knlist_init_mtx(&rpipe->pipe_sel.si_note, PIPE_MTX(rpipe));
knlist_init_mtx(&wpipe->pipe_sel.si_note, PIPE_MTX(wpipe));
/* Only the forward direction pipe is backed by default */
if ((error = pipe_create(rpipe, 1)) != 0 ||
(error = pipe_create(wpipe, 0)) != 0) {
pipeclose(rpipe);
pipeclose(wpipe);
return (error);
}
rpipe->pipe_state |= PIPE_DIRECTOK;
wpipe->pipe_state |= PIPE_DIRECTOK;
error = falloc(td, &rf, &fd, 0);
if (error) {
pipeclose(rpipe);
pipeclose(wpipe);
return (error);
}
/* An extra reference on `rf' has been held for us by falloc(). */
fildes[0] = fd;
/*
* Warning: once we've gotten past allocation of the fd for the
* read-side, we can only drop the read side via fdrop() in order
* to avoid races against processes which manage to dup() the read
* side while we are blocked trying to allocate the write side.
*/
finit(rf, FREAD | FWRITE, DTYPE_PIPE, rpipe, &pipeops);
error = falloc(td, &wf, &fd, 0);
if (error) {
fdclose(fdp, rf, fildes[0], td);
fdrop(rf, td);
/* rpipe has been closed by fdrop(). */
pipeclose(wpipe);
return (error);
}
/* An extra reference on `wf' has been held for us by falloc(). */
finit(wf, FREAD | FWRITE, DTYPE_PIPE, wpipe, &pipeops);
fdrop(wf, td);
fildes[1] = fd;
fdrop(rf, td);
return (0);
}
/* ARGSUSED */
int
-pipe(struct thread *td, struct pipe_args *uap)
+sys_pipe(struct thread *td, struct pipe_args *uap)
{
int error;
int fildes[2];
error = kern_pipe(td, fildes);
if (error)
return (error);
td->td_retval[0] = fildes[0];
td->td_retval[1] = fildes[1];
return (0);
}
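/*
* Userland view of the wrapper above (a minimal sketch): both new
* descriptors come back through td_retval, which the pipe(2) libc stub
* stores into the caller's array:
*
*	int pfd[2];
*	if (pipe(pfd) == 0)
*		;	// pfd[0] is the read end, pfd[1] the write end
*/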
/*
* Allocate kva for the pipe circular buffer; the space is pageable.
* This routine will 'realloc' the size of a pipe safely; if it
* fails, it will retain the old buffer and return ENOMEM.
*/
static int
pipespace_new(cpipe, size)
struct pipe *cpipe;
int size;
{
caddr_t buffer;
int error, cnt, firstseg;
static int curfail = 0;
static struct timeval lastfail;
KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked"));
KASSERT(!(cpipe->pipe_state & PIPE_DIRECTW),
("pipespace: resize of direct writes not allowed"));
retry:
cnt = cpipe->pipe_buffer.cnt;
if (cnt > size)
size = cnt;
size = round_page(size);
buffer = (caddr_t) vm_map_min(pipe_map);
error = vm_map_find(pipe_map, NULL, 0,
(vm_offset_t *) &buffer, size, 1,
VM_PROT_ALL, VM_PROT_ALL, 0);
if (error != KERN_SUCCESS) {
if ((cpipe->pipe_buffer.buffer == NULL) &&
(size > SMALL_PIPE_SIZE)) {
size = SMALL_PIPE_SIZE;
pipefragretry++;
goto retry;
}
if (cpipe->pipe_buffer.buffer == NULL) {
pipeallocfail++;
if (ppsratecheck(&lastfail, &curfail, 1))
printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n");
} else {
piperesizefail++;
}
return (ENOMEM);
}
/* copy data, then free old resources if we're resizing */
if (cnt > 0) {
if (cpipe->pipe_buffer.in <= cpipe->pipe_buffer.out) {
firstseg = cpipe->pipe_buffer.size - cpipe->pipe_buffer.out;
bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out],
buffer, firstseg);
if ((cnt - firstseg) > 0)
bcopy(cpipe->pipe_buffer.buffer, &buffer[firstseg],
cpipe->pipe_buffer.in);
} else {
bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out],
buffer, cnt);
}
}
pipe_free_kmem(cpipe);
cpipe->pipe_buffer.buffer = buffer;
cpipe->pipe_buffer.size = size;
cpipe->pipe_buffer.in = cnt;
cpipe->pipe_buffer.out = 0;
cpipe->pipe_buffer.cnt = cnt;
atomic_add_long(&amountpipekva, cpipe->pipe_buffer.size);
return (0);
}
/*
* Wrapper for pipespace_new() that performs locking assertions.
*/
static int
pipespace(cpipe, size)
struct pipe *cpipe;
int size;
{
KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
("Unlocked pipe passed to pipespace"));
return (pipespace_new(cpipe, size));
}
/*
* lock a pipe for I/O, blocking other access
*/
static __inline int
pipelock(cpipe, catch)
struct pipe *cpipe;
int catch;
{
int error;
PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
while (cpipe->pipe_state & PIPE_LOCKFL) {
cpipe->pipe_state |= PIPE_LWANT;
error = msleep(cpipe, PIPE_MTX(cpipe),
catch ? (PRIBIO | PCATCH) : PRIBIO,
"pipelk", 0);
if (error != 0)
return (error);
}
cpipe->pipe_state |= PIPE_LOCKFL;
return (0);
}
/*
* unlock a pipe I/O lock
*/
static __inline void
pipeunlock(cpipe)
struct pipe *cpipe;
{
PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
("Unlocked pipe passed to pipeunlock"));
cpipe->pipe_state &= ~PIPE_LOCKFL;
if (cpipe->pipe_state & PIPE_LWANT) {
cpipe->pipe_state &= ~PIPE_LWANT;
wakeup(cpipe);
}
}
static __inline void
pipeselwakeup(cpipe)
struct pipe *cpipe;
{
PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
if (cpipe->pipe_state & PIPE_SEL) {
selwakeuppri(&cpipe->pipe_sel, PSOCK);
if (!SEL_WAITING(&cpipe->pipe_sel))
cpipe->pipe_state &= ~PIPE_SEL;
}
if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
KNOTE_LOCKED(&cpipe->pipe_sel.si_note, 0);
}
/*
* Initialize and allocate VM and memory for pipe. The structure
* will start out zero'd from the ctor, so we just manage the kmem.
*/
static int
pipe_create(pipe, backing)
struct pipe *pipe;
int backing;
{
int error;
if (backing) {
if (amountpipekva > maxpipekva / 2)
error = pipespace_new(pipe, SMALL_PIPE_SIZE);
else
error = pipespace_new(pipe, PIPE_SIZE);
} else {
/* If we're not backing this pipe, no need to do anything. */
error = 0;
}
return (error);
}
/* ARGSUSED */
static int
pipe_read(fp, uio, active_cred, flags, td)
struct file *fp;
struct uio *uio;
struct ucred *active_cred;
struct thread *td;
int flags;
{
struct pipe *rpipe = fp->f_data;
int error;
int nread = 0;
u_int size;
PIPE_LOCK(rpipe);
++rpipe->pipe_busy;
error = pipelock(rpipe, 1);
if (error)
goto unlocked_error;
#ifdef MAC
error = mac_pipe_check_read(active_cred, rpipe->pipe_pair);
if (error)
goto locked_error;
#endif
if (amountpipekva > (3 * maxpipekva) / 4) {
if (!(rpipe->pipe_state & PIPE_DIRECTW) &&
(rpipe->pipe_buffer.size > SMALL_PIPE_SIZE) &&
(rpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) &&
(piperesizeallowed == 1)) {
PIPE_UNLOCK(rpipe);
pipespace(rpipe, SMALL_PIPE_SIZE);
PIPE_LOCK(rpipe);
}
}
while (uio->uio_resid) {
/*
* normal pipe buffer receive
*/
if (rpipe->pipe_buffer.cnt > 0) {
size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
if (size > rpipe->pipe_buffer.cnt)
size = rpipe->pipe_buffer.cnt;
if (size > (u_int) uio->uio_resid)
size = (u_int) uio->uio_resid;
PIPE_UNLOCK(rpipe);
error = uiomove(
&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
size, uio);
PIPE_LOCK(rpipe);
if (error)
break;
rpipe->pipe_buffer.out += size;
if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
rpipe->pipe_buffer.out = 0;
rpipe->pipe_buffer.cnt -= size;
/*
* If there is no more to read in the pipe, reset
* its pointers to the beginning. This improves
* cache hit stats.
*/
if (rpipe->pipe_buffer.cnt == 0) {
rpipe->pipe_buffer.in = 0;
rpipe->pipe_buffer.out = 0;
}
nread += size;
#ifndef PIPE_NODIRECT
/*
* Direct copy, bypassing a kernel buffer.
*/
} else if ((size = rpipe->pipe_map.cnt) &&
(rpipe->pipe_state & PIPE_DIRECTW)) {
if (size > (u_int) uio->uio_resid)
size = (u_int) uio->uio_resid;
PIPE_UNLOCK(rpipe);
error = uiomove_fromphys(rpipe->pipe_map.ms,
rpipe->pipe_map.pos, size, uio);
PIPE_LOCK(rpipe);
if (error)
break;
nread += size;
rpipe->pipe_map.pos += size;
rpipe->pipe_map.cnt -= size;
if (rpipe->pipe_map.cnt == 0) {
rpipe->pipe_state &= ~PIPE_DIRECTW;
wakeup(rpipe);
}
#endif
} else {
/*
* detect EOF condition
* read returns 0 on EOF, no need to set error
*/
if (rpipe->pipe_state & PIPE_EOF)
break;
/*
* If the "write-side" has been blocked, wake it up now.
*/
if (rpipe->pipe_state & PIPE_WANTW) {
rpipe->pipe_state &= ~PIPE_WANTW;
wakeup(rpipe);
}
/*
* Break if some data was read.
*/
if (nread > 0)
break;
/*
* Unlock the pipe buffer for our remaining processing.
* We will either break out with an error or we will
* sleep and relock to loop.
*/
pipeunlock(rpipe);
/*
* Handle non-blocking mode operation or
* wait for more data.
*/
if (fp->f_flag & FNONBLOCK) {
error = EAGAIN;
} else {
rpipe->pipe_state |= PIPE_WANTR;
if ((error = msleep(rpipe, PIPE_MTX(rpipe),
PRIBIO | PCATCH,
"piperd", 0)) == 0)
error = pipelock(rpipe, 1);
}
if (error)
goto unlocked_error;
}
}
#ifdef MAC
locked_error:
#endif
pipeunlock(rpipe);
/* XXX: should probably do this before getting any locks. */
if (error == 0)
vfs_timestamp(&rpipe->pipe_atime);
unlocked_error:
--rpipe->pipe_busy;
/*
* PIPE_WANT processing only makes sense if pipe_busy is 0.
*/
if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
wakeup(rpipe);
} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
/*
* Handle write blocking hysteresis.
*/
if (rpipe->pipe_state & PIPE_WANTW) {
rpipe->pipe_state &= ~PIPE_WANTW;
wakeup(rpipe);
}
}
if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
pipeselwakeup(rpipe);
PIPE_UNLOCK(rpipe);
return (error);
}
#ifndef PIPE_NODIRECT
/*
* Map the sending processes' buffer into kernel space and wire it.
* This is similar to a physical write operation.
*/
static int
pipe_build_write_buffer(wpipe, uio)
struct pipe *wpipe;
struct uio *uio;
{
u_int size;
int i;
PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
KASSERT(wpipe->pipe_state & PIPE_DIRECTW,
("Clone attempt on non-direct write pipe!"));
size = (u_int) uio->uio_iov->iov_len;
if (size > wpipe->pipe_buffer.size)
size = wpipe->pipe_buffer.size;
if ((i = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
(vm_offset_t)uio->uio_iov->iov_base, size, VM_PROT_READ,
wpipe->pipe_map.ms, PIPENPAGES)) < 0)
return (EFAULT);
/*
* set up the control block
*/
wpipe->pipe_map.npages = i;
wpipe->pipe_map.pos =
((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
wpipe->pipe_map.cnt = size;
/*
* and update the uio data
*/
uio->uio_iov->iov_len -= size;
uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size;
if (uio->uio_iov->iov_len == 0)
uio->uio_iov++;
uio->uio_resid -= size;
uio->uio_offset += size;
return (0);
}
/*
* unmap and unwire the process buffer
*/
static void
pipe_destroy_write_buffer(wpipe)
struct pipe *wpipe;
{
PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
vm_page_unhold_pages(wpipe->pipe_map.ms, wpipe->pipe_map.npages);
wpipe->pipe_map.npages = 0;
}
/*
* In the case of a signal, the writing process might go away. This
* code copies the data into the circular buffer so that the source
* pages can be freed without loss of data.
*/
static void
pipe_clone_write_buffer(wpipe)
struct pipe *wpipe;
{
struct uio uio;
struct iovec iov;
int size;
int pos;
PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
size = wpipe->pipe_map.cnt;
pos = wpipe->pipe_map.pos;
wpipe->pipe_buffer.in = size;
wpipe->pipe_buffer.out = 0;
wpipe->pipe_buffer.cnt = size;
wpipe->pipe_state &= ~PIPE_DIRECTW;
PIPE_UNLOCK(wpipe);
iov.iov_base = wpipe->pipe_buffer.buffer;
iov.iov_len = size;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = 0;
uio.uio_resid = size;
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_rw = UIO_READ;
uio.uio_td = curthread;
uiomove_fromphys(wpipe->pipe_map.ms, pos, size, &uio);
PIPE_LOCK(wpipe);
pipe_destroy_write_buffer(wpipe);
}
/*
* This implements the pipe buffer write mechanism. Note that only
* a direct write OR a normal pipe write can be pending at any given time.
* If there are any characters in the pipe buffer, the direct write will
* be deferred until the receiving process grabs all of the bytes from
* the pipe buffer. Then the direct mapping write is set-up.
*/
static int
pipe_direct_write(wpipe, uio)
struct pipe *wpipe;
struct uio *uio;
{
int error;
retry:
PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
error = pipelock(wpipe, 1);
if (wpipe->pipe_state & PIPE_EOF)
error = EPIPE;
if (error) {
pipeunlock(wpipe);
goto error1;
}
while (wpipe->pipe_state & PIPE_DIRECTW) {
if (wpipe->pipe_state & PIPE_WANTR) {
wpipe->pipe_state &= ~PIPE_WANTR;
wakeup(wpipe);
}
pipeselwakeup(wpipe);
wpipe->pipe_state |= PIPE_WANTW;
pipeunlock(wpipe);
error = msleep(wpipe, PIPE_MTX(wpipe),
PRIBIO | PCATCH, "pipdww", 0);
if (error)
goto error1;
else
goto retry;
}
wpipe->pipe_map.cnt = 0; /* transfer not ready yet */
if (wpipe->pipe_buffer.cnt > 0) {
if (wpipe->pipe_state & PIPE_WANTR) {
wpipe->pipe_state &= ~PIPE_WANTR;
wakeup(wpipe);
}
pipeselwakeup(wpipe);
wpipe->pipe_state |= PIPE_WANTW;
pipeunlock(wpipe);
error = msleep(wpipe, PIPE_MTX(wpipe),
PRIBIO | PCATCH, "pipdwc", 0);
if (error)
goto error1;
else
goto retry;
}
wpipe->pipe_state |= PIPE_DIRECTW;
PIPE_UNLOCK(wpipe);
error = pipe_build_write_buffer(wpipe, uio);
PIPE_LOCK(wpipe);
if (error) {
wpipe->pipe_state &= ~PIPE_DIRECTW;
pipeunlock(wpipe);
goto error1;
}
error = 0;
while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
if (wpipe->pipe_state & PIPE_EOF) {
pipe_destroy_write_buffer(wpipe);
pipeselwakeup(wpipe);
pipeunlock(wpipe);
error = EPIPE;
goto error1;
}
if (wpipe->pipe_state & PIPE_WANTR) {
wpipe->pipe_state &= ~PIPE_WANTR;
wakeup(wpipe);
}
pipeselwakeup(wpipe);
pipeunlock(wpipe);
error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH,
"pipdwt", 0);
pipelock(wpipe, 0);
}
if (wpipe->pipe_state & PIPE_EOF)
error = EPIPE;
if (wpipe->pipe_state & PIPE_DIRECTW) {
/*
* this bit of trickery substitutes a kernel buffer for
* the process that might be going away.
*/
pipe_clone_write_buffer(wpipe);
} else {
pipe_destroy_write_buffer(wpipe);
}
pipeunlock(wpipe);
return (error);
error1:
wakeup(wpipe);
return (error);
}
#endif
static int
pipe_write(fp, uio, active_cred, flags, td)
struct file *fp;
struct uio *uio;
struct ucred *active_cred;
struct thread *td;
int flags;
{
int error = 0;
int desiredsize, orig_resid;
struct pipe *wpipe, *rpipe;
rpipe = fp->f_data;
wpipe = rpipe->pipe_peer;
PIPE_LOCK(rpipe);
error = pipelock(wpipe, 1);
if (error) {
PIPE_UNLOCK(rpipe);
return (error);
}
/*
* detect loss of pipe read side, issue SIGPIPE if lost.
*/
if (wpipe->pipe_present != PIPE_ACTIVE ||
(wpipe->pipe_state & PIPE_EOF)) {
pipeunlock(wpipe);
PIPE_UNLOCK(rpipe);
return (EPIPE);
}
#ifdef MAC
error = mac_pipe_check_write(active_cred, wpipe->pipe_pair);
if (error) {
pipeunlock(wpipe);
PIPE_UNLOCK(rpipe);
return (error);
}
#endif
++wpipe->pipe_busy;
/* Choose a larger size if it's advantageous */
desiredsize = max(SMALL_PIPE_SIZE, wpipe->pipe_buffer.size);
while (desiredsize < wpipe->pipe_buffer.cnt + uio->uio_resid) {
if (piperesizeallowed != 1)
break;
if (amountpipekva > maxpipekva / 2)
break;
if (desiredsize == BIG_PIPE_SIZE)
break;
desiredsize = desiredsize * 2;
}
/* Choose a smaller size if we're in an OOM situation */
if ((amountpipekva > (3 * maxpipekva) / 4) &&
(wpipe->pipe_buffer.size > SMALL_PIPE_SIZE) &&
(wpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) &&
(piperesizeallowed == 1))
desiredsize = SMALL_PIPE_SIZE;
/* Resize if the above determined that a new size was necessary */
if ((desiredsize != wpipe->pipe_buffer.size) &&
((wpipe->pipe_state & PIPE_DIRECTW) == 0)) {
PIPE_UNLOCK(wpipe);
pipespace(wpipe, desiredsize);
PIPE_LOCK(wpipe);
}
if (wpipe->pipe_buffer.size == 0) {
/*
* This can only happen for reverse direction use of pipes
* in a complete OOM situation.
*/
error = ENOMEM;
--wpipe->pipe_busy;
pipeunlock(wpipe);
PIPE_UNLOCK(wpipe);
return (error);
}
pipeunlock(wpipe);
orig_resid = uio->uio_resid;
while (uio->uio_resid) {
int space;
pipelock(wpipe, 0);
if (wpipe->pipe_state & PIPE_EOF) {
pipeunlock(wpipe);
error = EPIPE;
break;
}
#ifndef PIPE_NODIRECT
/*
* If the transfer is large, we can gain performance if
* we do process-to-process copies directly.
* If the write is non-blocking, we don't use the
* direct write mechanism.
*
* The direct write mechanism will detect the reader going
* away on us.
*/
if (uio->uio_segflg == UIO_USERSPACE &&
uio->uio_iov->iov_len >= PIPE_MINDIRECT &&
wpipe->pipe_buffer.size >= PIPE_MINDIRECT &&
(fp->f_flag & FNONBLOCK) == 0) {
pipeunlock(wpipe);
error = pipe_direct_write(wpipe, uio);
if (error)
break;
continue;
}
#endif
/*
* Pipe buffered writes cannot be coincident with
* direct writes. We wait until the currently executing
* direct write is completed before we start filling the
* pipe buffer. We break out if a signal occurs or the
* reader goes away.
*/
if (wpipe->pipe_state & PIPE_DIRECTW) {
if (wpipe->pipe_state & PIPE_WANTR) {
wpipe->pipe_state &= ~PIPE_WANTR;
wakeup(wpipe);
}
pipeselwakeup(wpipe);
wpipe->pipe_state |= PIPE_WANTW;
pipeunlock(wpipe);
error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH,
"pipbww", 0);
if (error)
break;
else
continue;
}
space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
/* Writes of size <= PIPE_BUF must be atomic. */
if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
space = 0;
if (space > 0) {
int size; /* Transfer size */
int segsize; /* first segment to transfer */
/*
* Transfer size is minimum of uio transfer
* and free space in pipe buffer.
*/
if (space > uio->uio_resid)
size = uio->uio_resid;
else
size = space;
/*
* First segment to transfer is minimum of
* transfer size and contiguous space in
* pipe buffer. If first segment to transfer
* is less than the transfer size, we've got
* a wraparound in the buffer.
*/
segsize = wpipe->pipe_buffer.size -
wpipe->pipe_buffer.in;
if (segsize > size)
segsize = size;
/* Transfer first segment */
PIPE_UNLOCK(rpipe);
error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
segsize, uio);
PIPE_LOCK(rpipe);
if (error == 0 && segsize < size) {
KASSERT(wpipe->pipe_buffer.in + segsize ==
wpipe->pipe_buffer.size,
("Pipe buffer wraparound disappeared"));
/*
* Transfer remaining part now, to
* support atomic writes. Wraparound
* happened.
*/
PIPE_UNLOCK(rpipe);
error = uiomove(
&wpipe->pipe_buffer.buffer[0],
size - segsize, uio);
PIPE_LOCK(rpipe);
}
if (error == 0) {
wpipe->pipe_buffer.in += size;
if (wpipe->pipe_buffer.in >=
wpipe->pipe_buffer.size) {
KASSERT(wpipe->pipe_buffer.in ==
size - segsize +
wpipe->pipe_buffer.size,
("Expected wraparound bad"));
wpipe->pipe_buffer.in = size - segsize;
}
wpipe->pipe_buffer.cnt += size;
KASSERT(wpipe->pipe_buffer.cnt <=
wpipe->pipe_buffer.size,
("Pipe buffer overflow"));
}
pipeunlock(wpipe);
if (error != 0)
break;
} else {
/*
* If the "read-side" has been blocked, wake it up now.
*/
if (wpipe->pipe_state & PIPE_WANTR) {
wpipe->pipe_state &= ~PIPE_WANTR;
wakeup(wpipe);
}
/*
* don't block on non-blocking I/O
*/
if (fp->f_flag & FNONBLOCK) {
error = EAGAIN;
pipeunlock(wpipe);
break;
}
/*
* We have no more space and have something to offer,
* wake up select/poll.
*/
pipeselwakeup(wpipe);
wpipe->pipe_state |= PIPE_WANTW;
pipeunlock(wpipe);
error = msleep(wpipe, PIPE_MTX(rpipe),
PRIBIO | PCATCH, "pipewr", 0);
if (error != 0)
break;
}
}
pipelock(wpipe, 0);
--wpipe->pipe_busy;
if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
wakeup(wpipe);
} else if (wpipe->pipe_buffer.cnt > 0) {
/*
* If we have put any characters in the buffer, we wake up
* the reader.
*/
if (wpipe->pipe_state & PIPE_WANTR) {
wpipe->pipe_state &= ~PIPE_WANTR;
wakeup(wpipe);
}
}
/*
* Don't return EPIPE if I/O was successful
*/
if ((wpipe->pipe_buffer.cnt == 0) &&
(uio->uio_resid == 0) &&
(error == EPIPE)) {
error = 0;
}
if (error == 0)
vfs_timestamp(&wpipe->pipe_mtime);
/*
* We have something to offer,
* wake up select/poll.
*/
if (wpipe->pipe_buffer.cnt)
pipeselwakeup(wpipe);
pipeunlock(wpipe);
PIPE_UNLOCK(rpipe);
return (error);
}
/* ARGSUSED */
static int
pipe_truncate(fp, length, active_cred, td)
struct file *fp;
off_t length;
struct ucred *active_cred;
struct thread *td;
{
return (EINVAL);
}
/*
* we implement a very minimal set of ioctls for compatibility with sockets.
*/
static int
pipe_ioctl(fp, cmd, data, active_cred, td)
struct file *fp;
u_long cmd;
void *data;
struct ucred *active_cred;
struct thread *td;
{
struct pipe *mpipe = fp->f_data;
int error;
PIPE_LOCK(mpipe);
#ifdef MAC
error = mac_pipe_check_ioctl(active_cred, mpipe->pipe_pair, cmd, data);
if (error) {
PIPE_UNLOCK(mpipe);
return (error);
}
#endif
error = 0;
switch (cmd) {
case FIONBIO:
break;
case FIOASYNC:
if (*(int *)data) {
mpipe->pipe_state |= PIPE_ASYNC;
} else {
mpipe->pipe_state &= ~PIPE_ASYNC;
}
break;
case FIONREAD:
if (mpipe->pipe_state & PIPE_DIRECTW)
*(int *)data = mpipe->pipe_map.cnt;
else
*(int *)data = mpipe->pipe_buffer.cnt;
break;
case FIOSETOWN:
PIPE_UNLOCK(mpipe);
error = fsetown(*(int *)data, &mpipe->pipe_sigio);
goto out_unlocked;
case FIOGETOWN:
*(int *)data = fgetown(&mpipe->pipe_sigio);
break;
/* This is deprecated, FIOSETOWN should be used instead. */
case TIOCSPGRP:
PIPE_UNLOCK(mpipe);
error = fsetown(-(*(int *)data), &mpipe->pipe_sigio);
goto out_unlocked;
/* This is deprecated, FIOGETOWN should be used instead. */
case TIOCGPGRP:
*(int *)data = -fgetown(&mpipe->pipe_sigio);
break;
default:
error = ENOTTY;
break;
}
PIPE_UNLOCK(mpipe);
out_unlocked:
return (error);
}
static int
pipe_poll(fp, events, active_cred, td)
struct file *fp;
int events;
struct ucred *active_cred;
struct thread *td;
{
struct pipe *rpipe = fp->f_data;
struct pipe *wpipe;
int revents = 0;
#ifdef MAC
int error;
#endif
wpipe = rpipe->pipe_peer;
PIPE_LOCK(rpipe);
#ifdef MAC
error = mac_pipe_check_poll(active_cred, rpipe->pipe_pair);
if (error)
goto locked_error;
#endif
if (events & (POLLIN | POLLRDNORM))
if ((rpipe->pipe_state & PIPE_DIRECTW) ||
(rpipe->pipe_buffer.cnt > 0))
revents |= events & (POLLIN | POLLRDNORM);
if (events & (POLLOUT | POLLWRNORM))
if (wpipe->pipe_present != PIPE_ACTIVE ||
(wpipe->pipe_state & PIPE_EOF) ||
(((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
(wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
revents |= events & (POLLOUT | POLLWRNORM);
if ((events & POLLINIGNEOF) == 0) {
if (rpipe->pipe_state & PIPE_EOF) {
revents |= (events & (POLLIN | POLLRDNORM));
if (wpipe->pipe_present != PIPE_ACTIVE ||
(wpipe->pipe_state & PIPE_EOF))
revents |= POLLHUP;
}
}
if (revents == 0) {
if (events & (POLLIN | POLLRDNORM)) {
selrecord(td, &rpipe->pipe_sel);
if (SEL_WAITING(&rpipe->pipe_sel))
rpipe->pipe_state |= PIPE_SEL;
}
if (events & (POLLOUT | POLLWRNORM)) {
selrecord(td, &wpipe->pipe_sel);
if (SEL_WAITING(&wpipe->pipe_sel))
wpipe->pipe_state |= PIPE_SEL;
}
}
#ifdef MAC
locked_error:
#endif
PIPE_UNLOCK(rpipe);
return (revents);
}
/*
* We shouldn't need locks here as we're doing a read and this should
* be a natural race.
*/
static int
pipe_stat(fp, ub, active_cred, td)
struct file *fp;
struct stat *ub;
struct ucred *active_cred;
struct thread *td;
{
struct pipe *pipe = fp->f_data;
#ifdef MAC
int error;
PIPE_LOCK(pipe);
error = mac_pipe_check_stat(active_cred, pipe->pipe_pair);
PIPE_UNLOCK(pipe);
if (error)
return (error);
#endif
bzero(ub, sizeof(*ub));
ub->st_mode = S_IFIFO;
ub->st_blksize = PAGE_SIZE;
if (pipe->pipe_state & PIPE_DIRECTW)
ub->st_size = pipe->pipe_map.cnt;
else
ub->st_size = pipe->pipe_buffer.cnt;
ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
ub->st_atim = pipe->pipe_atime;
ub->st_mtim = pipe->pipe_mtime;
ub->st_ctim = pipe->pipe_ctime;
ub->st_uid = fp->f_cred->cr_uid;
ub->st_gid = fp->f_cred->cr_gid;
/*
* Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
* XXX (st_dev, st_ino) should be unique.
*/
return (0);
}
/* ARGSUSED */
static int
pipe_close(fp, td)
struct file *fp;
struct thread *td;
{
struct pipe *cpipe = fp->f_data;
fp->f_ops = &badfileops;
fp->f_data = NULL;
funsetown(&cpipe->pipe_sigio);
pipeclose(cpipe);
return (0);
}
static void
pipe_free_kmem(cpipe)
struct pipe *cpipe;
{
KASSERT(!mtx_owned(PIPE_MTX(cpipe)),
("pipe_free_kmem: pipe mutex locked"));
if (cpipe->pipe_buffer.buffer != NULL) {
atomic_subtract_long(&amountpipekva, cpipe->pipe_buffer.size);
vm_map_remove(pipe_map,
(vm_offset_t)cpipe->pipe_buffer.buffer,
(vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size);
cpipe->pipe_buffer.buffer = NULL;
}
#ifndef PIPE_NODIRECT
{
cpipe->pipe_map.cnt = 0;
cpipe->pipe_map.pos = 0;
cpipe->pipe_map.npages = 0;
}
#endif
}
/*
* shutdown the pipe
*/
static void
pipeclose(cpipe)
struct pipe *cpipe;
{
struct pipepair *pp;
struct pipe *ppipe;
KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL"));
PIPE_LOCK(cpipe);
pipelock(cpipe, 0);
pp = cpipe->pipe_pair;
pipeselwakeup(cpipe);
/*
* If the other side is blocked, wake it up saying that
* we want to close it down.
*/
cpipe->pipe_state |= PIPE_EOF;
while (cpipe->pipe_busy) {
wakeup(cpipe);
cpipe->pipe_state |= PIPE_WANT;
pipeunlock(cpipe);
msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
pipelock(cpipe, 0);
}
/*
* Disconnect from peer, if any.
*/
ppipe = cpipe->pipe_peer;
if (ppipe->pipe_present == PIPE_ACTIVE) {
pipeselwakeup(ppipe);
ppipe->pipe_state |= PIPE_EOF;
wakeup(ppipe);
KNOTE_LOCKED(&ppipe->pipe_sel.si_note, 0);
}
/*
* Mark this endpoint as free. Release kmem resources. We
* don't mark this endpoint as unused until we've finished
* doing that, or the pipe might disappear out from under
* us.
*/
PIPE_UNLOCK(cpipe);
pipe_free_kmem(cpipe);
PIPE_LOCK(cpipe);
cpipe->pipe_present = PIPE_CLOSING;
pipeunlock(cpipe);
/*
* knlist_clear() may sleep, dropping the PIPE_MTX. Set
* PIPE_FINALIZED, which allows the other end to free the
* pipe_pair, only after the knotes are completely dismantled.
*/
knlist_clear(&cpipe->pipe_sel.si_note, 1);
cpipe->pipe_present = PIPE_FINALIZED;
seldrain(&cpipe->pipe_sel);
knlist_destroy(&cpipe->pipe_sel.si_note);
/*
* If both endpoints are now closed, release the memory for the
* pipe pair. If not, unlock.
*/
if (ppipe->pipe_present == PIPE_FINALIZED) {
PIPE_UNLOCK(cpipe);
#ifdef MAC
mac_pipe_destroy(pp);
#endif
uma_zfree(pipe_zone, cpipe->pipe_pair);
} else
PIPE_UNLOCK(cpipe);
}
/*ARGSUSED*/
static int
pipe_kqfilter(struct file *fp, struct knote *kn)
{
struct pipe *cpipe;
cpipe = kn->kn_fp->f_data;
PIPE_LOCK(cpipe);
switch (kn->kn_filter) {
case EVFILT_READ:
kn->kn_fop = &pipe_rfiltops;
break;
case EVFILT_WRITE:
kn->kn_fop = &pipe_wfiltops;
if (cpipe->pipe_peer->pipe_present != PIPE_ACTIVE) {
/* other end of pipe has been closed */
PIPE_UNLOCK(cpipe);
return (EPIPE);
}
cpipe = cpipe->pipe_peer;
break;
default:
PIPE_UNLOCK(cpipe);
return (EINVAL);
}
knlist_add(&cpipe->pipe_sel.si_note, kn, 1);
PIPE_UNLOCK(cpipe);
return (0);
}
static void
filt_pipedetach(struct knote *kn)
{
struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;
PIPE_LOCK(cpipe);
if (kn->kn_filter == EVFILT_WRITE)
cpipe = cpipe->pipe_peer;
knlist_remove(&cpipe->pipe_sel.si_note, kn, 1);
PIPE_UNLOCK(cpipe);
}
/*ARGSUSED*/
static int
filt_piperead(struct knote *kn, long hint)
{
struct pipe *rpipe = kn->kn_fp->f_data;
struct pipe *wpipe = rpipe->pipe_peer;
int ret;
PIPE_LOCK(rpipe);
kn->kn_data = rpipe->pipe_buffer.cnt;
if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
kn->kn_data = rpipe->pipe_map.cnt;
if ((rpipe->pipe_state & PIPE_EOF) ||
wpipe->pipe_present != PIPE_ACTIVE ||
(wpipe->pipe_state & PIPE_EOF)) {
kn->kn_flags |= EV_EOF;
PIPE_UNLOCK(rpipe);
return (1);
}
ret = kn->kn_data > 0;
PIPE_UNLOCK(rpipe);
return (ret);
}
/*ARGSUSED*/
static int
filt_pipewrite(struct knote *kn, long hint)
{
struct pipe *rpipe = kn->kn_fp->f_data;
struct pipe *wpipe = rpipe->pipe_peer;
PIPE_LOCK(rpipe);
if (wpipe->pipe_present != PIPE_ACTIVE ||
(wpipe->pipe_state & PIPE_EOF)) {
kn->kn_data = 0;
kn->kn_flags |= EV_EOF;
PIPE_UNLOCK(rpipe);
return (1);
}
kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
if (wpipe->pipe_state & PIPE_DIRECTW)
kn->kn_data = 0;
PIPE_UNLOCK(rpipe);
return (kn->kn_data >= PIPE_BUF);
}
Index: head/sys/kern/sys_procdesc.c
===================================================================
--- head/sys/kern/sys_procdesc.c (revision 225616)
+++ head/sys/kern/sys_procdesc.c (revision 225617)
@@ -1,524 +1,524 @@
/*-
* Copyright (c) 2009 Robert N. M. Watson
* All rights reserved.
*
* This software was developed at the University of Cambridge Computer
* Laboratory with support from a grant from Google, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*-
* FreeBSD process descriptor facility.
*
* Some processes are represented by a file descriptor, which will be used in
* preference to signaling and pids for the purposes of process management,
* and is, in effect, a form of capability. When a process descriptor is
* used with a process, it ceases to be visible to certain traditional UNIX
* process facilities, such as waitpid(2).
*
* Some semantics:
*
* - At most one process descriptor will exist for any process, although
* references to that descriptor may be held from many processes (or even
* be in flight between processes over a local domain socket).
* - Last close on the process descriptor will terminate the process using
* SIGKILL and reparent it to init so that there's a process to reap it
* when it's done exiting.
* - If the process exits before the descriptor is closed, it will not
* generate SIGCHLD on termination, or be picked up by waitpid().
* - The pdkill(2) system call may be used to deliver a signal to the process
* using its process descriptor.
* - The pdwait4(2) system call may be used to block (or not) on a process
* descriptor to collect termination information.
*
* Open questions:
*
* - How to handle ptrace(2)?
* - Will we want to add a pidtoprocdesc(2) system call to allow process
* descriptors to be created for processes without pdfork(2)?
*/
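/*
 * Illustrative userland sketch (not part of the original source, and
 * guarded out): exercising the facility described above via pdfork(2),
 * pdgetpid(2) and pdkill(2) from <sys/procdesc.h>.  The child's work
 * (a sleep) is an arbitrary placeholder.
 */
#if 0
#include <sys/procdesc.h>
#include <err.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	pid_t child, pid;
	int fd;

	child = pdfork(&fd, 0);		/* like fork(2), but also returns a descriptor */
	if (child == -1)
		err(1, "pdfork");
	if (child == 0) {
		sleep(60);		/* child: stand-in for real work */
		_exit(0);
	}
	if (pdgetpid(fd, &pid) == 0)	/* recover the pid from the descriptor */
		printf("child pid: %d\n", (int)pid);
	pdkill(fd, SIGTERM);		/* signal the child via its descriptor */
	close(fd);			/* last close kills/reparents any survivor */
	return (0);
}
#endif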
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_procdesc.h"
#include <sys/param.h>
#include <sys/capability.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/procdesc.h>
#include <sys/resourcevar.h>
#include <sys/stat.h>
#include <sys/sysproto.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/ucred.h>
#include <security/audit/audit.h>
#include <vm/uma.h>
#ifdef PROCDESC
FEATURE(process_descriptors, "Process Descriptors");
static uma_zone_t procdesc_zone;
static fo_rdwr_t procdesc_read;
static fo_rdwr_t procdesc_write;
static fo_truncate_t procdesc_truncate;
static fo_ioctl_t procdesc_ioctl;
static fo_poll_t procdesc_poll;
static fo_kqfilter_t procdesc_kqfilter;
static fo_stat_t procdesc_stat;
static fo_close_t procdesc_close;
static fo_chmod_t procdesc_chmod;
static fo_chown_t procdesc_chown;
static struct fileops procdesc_ops = {
.fo_read = procdesc_read,
.fo_write = procdesc_write,
.fo_truncate = procdesc_truncate,
.fo_ioctl = procdesc_ioctl,
.fo_poll = procdesc_poll,
.fo_kqfilter = procdesc_kqfilter,
.fo_stat = procdesc_stat,
.fo_close = procdesc_close,
.fo_chmod = procdesc_chmod,
.fo_chown = procdesc_chown,
.fo_flags = DFLAG_PASSABLE,
};
/*
* Initialize with VFS so that process descriptors are available along with
* other file descriptor types. As long as it runs before init(8) starts,
* there shouldn't be a problem.
*/
static void
procdesc_init(void *dummy __unused)
{
procdesc_zone = uma_zcreate("procdesc", sizeof(struct procdesc),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
if (procdesc_zone == NULL)
panic("procdesc_init: procdesc_zone not initialized");
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, procdesc_init, NULL);
/*
* Return a locked process given a process descriptor, or ESRCH if it has
* died.
*/
int
procdesc_find(struct thread *td, int fd, cap_rights_t rights,
struct proc **p)
{
struct procdesc *pd;
struct file *fp;
int error;
error = fget(td, fd, rights, &fp);
if (error)
return (error);
if (fp->f_type != DTYPE_PROCDESC) {
error = EBADF;
goto out;
}
pd = fp->f_data;
sx_slock(&proctree_lock);
if (pd->pd_proc != NULL) {
*p = pd->pd_proc;
PROC_LOCK(*p);
} else
error = ESRCH;
sx_sunlock(&proctree_lock);
out:
fdrop(fp, td);
return (error);
}
/*
* Function to be used by procstat(1) sysctls when returning procdesc
* information.
*/
pid_t
procdesc_pid(struct file *fp_procdesc)
{
struct procdesc *pd;
KASSERT(fp_procdesc->f_type == DTYPE_PROCDESC,
("procdesc_pid: !procdesc"));
pd = fp_procdesc->f_data;
return (pd->pd_pid);
}
/*
* Retrieve the PID associated with a process descriptor.
*/
int
kern_pdgetpid(struct thread *td, int fd, cap_rights_t rights, pid_t *pidp)
{
struct file *fp;
int error;
error = fget(td, fd, rights, &fp);
if (error)
return (error);
if (fp->f_type != DTYPE_PROCDESC) {
error = EBADF;
goto out;
}
*pidp = procdesc_pid(fp);
out:
fdrop(fp, td);
return (error);
}
/*
* System call to return the pid of a process given its process descriptor.
*/
int
-pdgetpid(struct thread *td, struct pdgetpid_args *uap)
+sys_pdgetpid(struct thread *td, struct pdgetpid_args *uap)
{
pid_t pid;
int error;
AUDIT_ARG_FD(uap->fd);
error = kern_pdgetpid(td, uap->fd, CAP_PDGETPID, &pid);
if (error == 0)
error = copyout(&pid, uap->pidp, sizeof(pid));
return (error);
}
/*
* When a new process is forked by pdfork(), a file descriptor is allocated
* by the fork code first, then the process is forked, and then we get a
* chance to set up the process descriptor. Failure is not permitted at this
* point, so procdesc_new() must succeed.
*/
void
procdesc_new(struct proc *p, int flags)
{
struct procdesc *pd;
pd = uma_zalloc(procdesc_zone, M_WAITOK | M_ZERO);
pd->pd_proc = p;
pd->pd_pid = p->p_pid;
p->p_procdesc = pd;
pd->pd_flags = 0;
if (flags & PD_DAEMON)
pd->pd_flags |= PDF_DAEMON;
PROCDESC_LOCK_INIT(pd);
/*
* Process descriptors start out with two references: one from their
* struct file, and the other from their struct proc.
*/
refcount_init(&pd->pd_refcount, 2);
}
/*
* Initialize a file with a process descriptor.
*/
void
procdesc_finit(struct procdesc *pdp, struct file *fp)
{
finit(fp, FREAD | FWRITE, DTYPE_PROCDESC, pdp, &procdesc_ops);
}
static void
procdesc_free(struct procdesc *pd)
{
/*
* When the last reference is released, we assert that the descriptor
* has been closed, but not that the process has exited, as we will
* detach the descriptor before the process dies if the descriptor is
* closed, since we can't wait synchronously.
*/
if (refcount_release(&pd->pd_refcount)) {
KASSERT(pd->pd_proc == NULL,
("procdesc_free: pd_proc != NULL"));
KASSERT((pd->pd_flags & PDF_CLOSED),
("procdesc_free: !PDF_CLOSED"));
PROCDESC_LOCK_DESTROY(pd);
uma_zfree(procdesc_zone, pd);
}
}
/*
* procdesc_exit() - notify a process descriptor that its process is exiting.
* We use the proctree_lock to ensure that process exit either happens
* strictly before or strictly after a concurrent call to procdesc_close().
*/
int
procdesc_exit(struct proc *p)
{
struct procdesc *pd;
sx_assert(&proctree_lock, SA_XLOCKED);
PROC_LOCK_ASSERT(p, MA_OWNED);
KASSERT(p->p_procdesc != NULL, ("procdesc_exit: p_procdesc NULL"));
pd = p->p_procdesc;
PROCDESC_LOCK(pd);
KASSERT((pd->pd_flags & PDF_CLOSED) == 0 || p->p_pptr == initproc,
("procdesc_exit: closed && parent not init"));
pd->pd_flags |= PDF_EXITED;
/*
* If the process descriptor has been closed, then we have nothing
* to do; return 1 so that init will get SIGCHLD and do the reaping.
* Clean up the procdesc now rather than letting it happen during
* that reap.
*/
if (pd->pd_flags & PDF_CLOSED) {
PROCDESC_UNLOCK(pd);
pd->pd_proc = NULL;
p->p_procdesc = NULL;
procdesc_free(pd);
return (1);
}
if (pd->pd_flags & PDF_SELECTED) {
pd->pd_flags &= ~PDF_SELECTED;
selwakeup(&pd->pd_selinfo);
}
PROCDESC_UNLOCK(pd);
return (0);
}
/*
* When a process descriptor is reaped, perhaps as a result of close() or
* pdwait4(), release the process's reference on the process descriptor.
*/
void
procdesc_reap(struct proc *p)
{
struct procdesc *pd;
sx_assert(&proctree_lock, SA_XLOCKED);
KASSERT(p->p_procdesc != NULL, ("procdesc_reap: p_procdesc == NULL"));
pd = p->p_procdesc;
pd->pd_proc = NULL;
procdesc_free(pd);
}
/*
* procdesc_close() - last close on a process descriptor. If the process is
* still running, terminate with SIGKILL (unless PD_DAEMON is set) and let
* init(8) clean up the mess; if not, we have to clean up the zombie ourselves.
*/
static int
procdesc_close(struct file *fp, struct thread *td)
{
struct procdesc *pd;
struct proc *p;
KASSERT(fp->f_type == DTYPE_PROCDESC, ("procdesc_close: !procdesc"));
pd = fp->f_data;
fp->f_ops = &badfileops;
fp->f_data = NULL;
sx_xlock(&proctree_lock);
PROCDESC_LOCK(pd);
pd->pd_flags |= PDF_CLOSED;
PROCDESC_UNLOCK(pd);
p = pd->pd_proc;
PROC_LOCK(p);
if (p->p_state == PRS_ZOMBIE) {
/*
* If the process is already dead and just awaiting reaping,
* do that now. This will release the process's reference to
* the process descriptor when it calls back into
* procdesc_reap().
*/
PROC_SLOCK(p);
proc_reap(curthread, p, NULL, 0, NULL);
} else {
/*
* If the process is not yet dead, we need to kill it, but we
* can't wait around synchronously for it to go away, as that
* path leads to madness (and deadlocks). First, detach the
* process from its descriptor so that its exit status will
* be reported normally.
*/
pd->pd_proc = NULL;
p->p_procdesc = NULL;
procdesc_free(pd);
/*
* Next, reparent it to init(8) so that there's someone to
* pick up the pieces; finally, terminate with prejudice.
*/
p->p_sigparent = SIGCHLD;
proc_reparent(p, initproc);
if ((pd->pd_flags & PD_DAEMON) == 0)
- psignal(p, SIGKILL);
+ kern_psignal(p, SIGKILL);
PROC_UNLOCK(p);
sx_xunlock(&proctree_lock);
}
/*
* Release the file descriptor's reference on the process descriptor.
*/
procdesc_free(pd);
return (0);
}
static int
procdesc_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
int flags, struct thread *td)
{
return (EOPNOTSUPP);
}
static int
procdesc_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
int flags, struct thread *td)
{
return (EOPNOTSUPP);
}
static int
procdesc_truncate(struct file *fp, off_t length, struct ucred *active_cred,
struct thread *td)
{
return (EOPNOTSUPP);
}
static int
procdesc_ioctl(struct file *fp, u_long com, void *data,
struct ucred *active_cred, struct thread *td)
{
return (EOPNOTSUPP);
}
static int
procdesc_poll(struct file *fp, int events, struct ucred *active_cred,
struct thread *td)
{
struct procdesc *pd;
int revents;
revents = 0;
pd = fp->f_data;
PROCDESC_LOCK(pd);
if (pd->pd_flags & PDF_EXITED)
revents |= POLLHUP;
if (revents == 0) {
selrecord(td, &pd->pd_selinfo);
pd->pd_flags |= PDF_SELECTED;
}
PROCDESC_UNLOCK(pd);
return (revents);
}
static int
procdesc_kqfilter(struct file *fp, struct knote *kn)
{
return (EOPNOTSUPP);
}
static int
procdesc_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
struct thread *td)
{
struct procdesc *pd;
struct timeval pstart;
/*
* XXXRW: Perhaps we should cache some more information from the
* process so that we can return it reliably here even after it has
* died. For example, caching its credential data.
*/
bzero(sb, sizeof(*sb));
pd = fp->f_data;
sx_slock(&proctree_lock);
if (pd->pd_proc != NULL) {
PROC_LOCK(pd->pd_proc);
/* Set birth and [acm] times to process start time. */
pstart = pd->pd_proc->p_stats->p_start;
timevaladd(&pstart, &boottime);
TIMEVAL_TO_TIMESPEC(&pstart, &sb->st_birthtim);
sb->st_atim = sb->st_birthtim;
sb->st_ctim = sb->st_birthtim;
sb->st_mtim = sb->st_birthtim;
if (pd->pd_proc->p_state != PRS_ZOMBIE)
sb->st_mode = S_IFREG | S_IRWXU;
else
sb->st_mode = S_IFREG;
sb->st_uid = pd->pd_proc->p_ucred->cr_ruid;
sb->st_gid = pd->pd_proc->p_ucred->cr_rgid;
PROC_UNLOCK(pd->pd_proc);
} else
sb->st_mode = S_IFREG;
sx_sunlock(&proctree_lock);
return (0);
}
static int
procdesc_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
struct thread *td)
{
return (EOPNOTSUPP);
}
static int
procdesc_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
struct thread *td)
{
return (EOPNOTSUPP);
}
#else /* !PROCDESC */
int
-pdgetpid(struct thread *td, struct pdgetpid_args *uap)
+sys_pdgetpid(struct thread *td, struct pdgetpid_args *uap)
{
return (ENOSYS);
}
#endif /* PROCDESC */
Index: head/sys/kern/sys_process.c
===================================================================
--- head/sys/kern/sys_process.c (revision 225616)
+++ head/sys/kern/sys_process.c (revision 225617)
@@ -1,1242 +1,1242 @@
/*-
* Copyright (c) 1994, Sean Eric Fagan
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Sean Eric Fagan.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/ptrace.h>
#include <sys/sx.h>
#include <sys/malloc.h>
#include <sys/signalvar.h>
#include <machine/reg.h>
#include <security/audit/audit.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_param.h>
#ifdef COMPAT_FREEBSD32
#include <sys/procfs.h>
#include <compat/freebsd32/freebsd32_signal.h>
struct ptrace_io_desc32 {
int piod_op;
uint32_t piod_offs;
uint32_t piod_addr;
uint32_t piod_len;
};
struct ptrace_vm_entry32 {
int pve_entry;
int pve_timestamp;
uint32_t pve_start;
uint32_t pve_end;
uint32_t pve_offset;
u_int pve_prot;
u_int pve_pathlen;
int32_t pve_fileid;
u_int pve_fsid;
uint32_t pve_path;
};
struct ptrace_lwpinfo32 {
lwpid_t pl_lwpid; /* LWP described. */
int pl_event; /* Event that stopped the LWP. */
int pl_flags; /* LWP flags. */
sigset_t pl_sigmask; /* LWP signal mask */
sigset_t pl_siglist; /* LWP pending signal */
struct siginfo32 pl_siginfo; /* siginfo for signal */
char pl_tdname[MAXCOMLEN + 1]; /* LWP name. */
int pl_child_pid; /* New child pid */
};
#endif
/*
* Functions implemented using PROC_ACTION():
*
* proc_read_regs(proc, regs)
* Get the current user-visible register set from the process
* and copy it into the regs structure (<machine/reg.h>).
* The process is stopped at the time read_regs is called.
*
* proc_write_regs(proc, regs)
* Update the current register set from the passed in regs
* structure. Take care to avoid clobbering special CPU
* registers or privileged bits in the PSL.
* Depending on the architecture this may have fix-up work to do,
* especially if the IAR or PCW are modified.
* The process is stopped at the time write_regs is called.
*
* proc_read_fpregs, proc_write_fpregs
* deal with the floating point register set, otherwise as above.
*
* proc_read_dbregs, proc_write_dbregs
* deal with the processor debug register set, otherwise as above.
*
* proc_sstep(proc)
* Arrange for the process to trap after executing a single instruction.
*/
#define PROC_ACTION(action) do { \
int error; \
\
PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); \
if ((td->td_proc->p_flag & P_INMEM) == 0) \
error = EIO; \
else \
error = (action); \
return (error); \
} while(0)
int
proc_read_regs(struct thread *td, struct reg *regs)
{
PROC_ACTION(fill_regs(td, regs));
}
int
proc_write_regs(struct thread *td, struct reg *regs)
{
PROC_ACTION(set_regs(td, regs));
}
int
proc_read_dbregs(struct thread *td, struct dbreg *dbregs)
{
PROC_ACTION(fill_dbregs(td, dbregs));
}
int
proc_write_dbregs(struct thread *td, struct dbreg *dbregs)
{
PROC_ACTION(set_dbregs(td, dbregs));
}
/*
* Ptrace doesn't support fpregs at all, and there are no security holes
* or translations for fpregs, so we can just copy them.
*/
int
proc_read_fpregs(struct thread *td, struct fpreg *fpregs)
{
PROC_ACTION(fill_fpregs(td, fpregs));
}
int
proc_write_fpregs(struct thread *td, struct fpreg *fpregs)
{
PROC_ACTION(set_fpregs(td, fpregs));
}
#ifdef COMPAT_FREEBSD32
/* For 32 bit binaries, we need to expose the 32 bit regs layouts. */
int
proc_read_regs32(struct thread *td, struct reg32 *regs32)
{
PROC_ACTION(fill_regs32(td, regs32));
}
int
proc_write_regs32(struct thread *td, struct reg32 *regs32)
{
PROC_ACTION(set_regs32(td, regs32));
}
int
proc_read_dbregs32(struct thread *td, struct dbreg32 *dbregs32)
{
PROC_ACTION(fill_dbregs32(td, dbregs32));
}
int
proc_write_dbregs32(struct thread *td, struct dbreg32 *dbregs32)
{
PROC_ACTION(set_dbregs32(td, dbregs32));
}
int
proc_read_fpregs32(struct thread *td, struct fpreg32 *fpregs32)
{
PROC_ACTION(fill_fpregs32(td, fpregs32));
}
int
proc_write_fpregs32(struct thread *td, struct fpreg32 *fpregs32)
{
PROC_ACTION(set_fpregs32(td, fpregs32));
}
#endif
int
proc_sstep(struct thread *td)
{
PROC_ACTION(ptrace_single_step(td));
}
int
proc_rwmem(struct proc *p, struct uio *uio)
{
vm_map_t map;
vm_offset_t pageno; /* page number */
vm_prot_t reqprot;
int error, fault_flags, page_offset, writing;
/*
* Assert that someone has locked this vmspace. (Should be
* curthread but we can't assert that.) This keeps the process
* from exiting out from under us until this operation completes.
*/
KASSERT(p->p_lock >= 1, ("%s: process %p (pid %d) not held", __func__,
p, p->p_pid));
/*
* The map we want...
*/
map = &p->p_vmspace->vm_map;
/*
* If we are writing, then we request vm_fault() to create a private
* copy of each page. Since these copies will not be writeable by the
* process, we must explicitly request that they be dirtied.
*/
writing = uio->uio_rw == UIO_WRITE;
reqprot = writing ? VM_PROT_COPY | VM_PROT_READ : VM_PROT_READ;
fault_flags = writing ? VM_FAULT_DIRTY : VM_FAULT_NORMAL;
/*
* Only map in one page at a time. We don't have to, but it
* makes things easier. This way is trivial - right?
*/
do {
vm_offset_t uva;
u_int len;
vm_page_t m;
uva = (vm_offset_t)uio->uio_offset;
/*
* Get the page number of this segment.
*/
pageno = trunc_page(uva);
page_offset = uva - pageno;
/*
* How many bytes to copy
*/
len = min(PAGE_SIZE - page_offset, uio->uio_resid);
/*
* Fault and hold the page on behalf of the process.
*/
error = vm_fault_hold(map, pageno, reqprot, fault_flags, &m);
if (error != KERN_SUCCESS) {
if (error == KERN_RESOURCE_SHORTAGE)
error = ENOMEM;
else
error = EFAULT;
break;
}
/*
* Now do the i/o move.
*/
error = uiomove_fromphys(&m, page_offset, len, uio);
/* Make the I-cache coherent for breakpoints. */
if (writing && error == 0) {
vm_map_lock_read(map);
if (vm_map_check_protection(map, pageno, pageno +
PAGE_SIZE, VM_PROT_EXECUTE))
vm_sync_icache(map, uva, len);
vm_map_unlock_read(map);
}
/*
* Release the page.
*/
vm_page_lock(m);
vm_page_unhold(m);
vm_page_unlock(m);
} while (error == 0 && uio->uio_resid > 0);
return (error);
}
static int
ptrace_vm_entry(struct thread *td, struct proc *p, struct ptrace_vm_entry *pve)
{
struct vattr vattr;
vm_map_t map;
vm_map_entry_t entry;
vm_object_t obj, tobj, lobj;
struct vmspace *vm;
struct vnode *vp;
char *freepath, *fullpath;
u_int pathlen;
int error, index, vfslocked;
error = 0;
obj = NULL;
vm = vmspace_acquire_ref(p);
map = &vm->vm_map;
vm_map_lock_read(map);
do {
entry = map->header.next;
index = 0;
while (index < pve->pve_entry && entry != &map->header) {
entry = entry->next;
index++;
}
if (index != pve->pve_entry) {
error = EINVAL;
break;
}
while (entry != &map->header &&
(entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) {
entry = entry->next;
index++;
}
if (entry == &map->header) {
error = ENOENT;
break;
}
/* We got an entry. */
pve->pve_entry = index + 1;
pve->pve_timestamp = map->timestamp;
pve->pve_start = entry->start;
pve->pve_end = entry->end - 1;
pve->pve_offset = entry->offset;
pve->pve_prot = entry->protection;
/* Backing object's path needed? */
if (pve->pve_pathlen == 0)
break;
pathlen = pve->pve_pathlen;
pve->pve_pathlen = 0;
obj = entry->object.vm_object;
if (obj != NULL)
VM_OBJECT_LOCK(obj);
} while (0);
vm_map_unlock_read(map);
vmspace_free(vm);
pve->pve_fsid = VNOVAL;
pve->pve_fileid = VNOVAL;
if (error == 0 && obj != NULL) {
lobj = obj;
for (tobj = obj; tobj != NULL; tobj = tobj->backing_object) {
if (tobj != obj)
VM_OBJECT_LOCK(tobj);
if (lobj != obj)
VM_OBJECT_UNLOCK(lobj);
lobj = tobj;
pve->pve_offset += tobj->backing_object_offset;
}
vp = (lobj->type == OBJT_VNODE) ? lobj->handle : NULL;
if (vp != NULL)
vref(vp);
if (lobj != obj)
VM_OBJECT_UNLOCK(lobj);
VM_OBJECT_UNLOCK(obj);
if (vp != NULL) {
freepath = NULL;
fullpath = NULL;
vn_fullpath(td, vp, &fullpath, &freepath);
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_lock(vp, LK_SHARED | LK_RETRY);
if (VOP_GETATTR(vp, &vattr, td->td_ucred) == 0) {
pve->pve_fileid = vattr.va_fileid;
pve->pve_fsid = vattr.va_fsid;
}
vput(vp);
VFS_UNLOCK_GIANT(vfslocked);
if (fullpath != NULL) {
pve->pve_pathlen = strlen(fullpath) + 1;
if (pve->pve_pathlen <= pathlen) {
error = copyout(fullpath, pve->pve_path,
pve->pve_pathlen);
} else
error = ENAMETOOLONG;
}
if (freepath != NULL)
free(freepath, M_TEMP);
}
}
return (error);
}
#ifdef COMPAT_FREEBSD32
static int
ptrace_vm_entry32(struct thread *td, struct proc *p,
struct ptrace_vm_entry32 *pve32)
{
struct ptrace_vm_entry pve;
int error;
pve.pve_entry = pve32->pve_entry;
pve.pve_pathlen = pve32->pve_pathlen;
pve.pve_path = (void *)(uintptr_t)pve32->pve_path;
error = ptrace_vm_entry(td, p, &pve);
if (error == 0) {
pve32->pve_entry = pve.pve_entry;
pve32->pve_timestamp = pve.pve_timestamp;
pve32->pve_start = pve.pve_start;
pve32->pve_end = pve.pve_end;
pve32->pve_offset = pve.pve_offset;
pve32->pve_prot = pve.pve_prot;
pve32->pve_fileid = pve.pve_fileid;
pve32->pve_fsid = pve.pve_fsid;
}
pve32->pve_pathlen = pve.pve_pathlen;
return (error);
}
static void
ptrace_lwpinfo_to32(const struct ptrace_lwpinfo *pl,
struct ptrace_lwpinfo32 *pl32)
{
pl32->pl_lwpid = pl->pl_lwpid;
pl32->pl_event = pl->pl_event;
pl32->pl_flags = pl->pl_flags;
pl32->pl_sigmask = pl->pl_sigmask;
pl32->pl_siglist = pl->pl_siglist;
siginfo_to_siginfo32(&pl->pl_siginfo, &pl32->pl_siginfo);
strcpy(pl32->pl_tdname, pl->pl_tdname);
pl32->pl_child_pid = pl->pl_child_pid;
}
#endif /* COMPAT_FREEBSD32 */
/*
* Process debugging system call.
*/
#ifndef _SYS_SYSPROTO_H_
struct ptrace_args {
int req;
pid_t pid;
caddr_t addr;
int data;
};
#endif
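/*
 * Illustrative userland sketch (not part of the original source, and
 * guarded out): a minimal attach/continue/detach cycle against an
 * assumed target pid, using the ptrace(2) interface implemented below.
 * PT_CONTINUE with addr == (caddr_t)1 resumes at the current PC and
 * data == 0 delivers no signal; error handling is mostly elided.
 */
#if 0
#include <sys/types.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <err.h>

static void
trace_once(pid_t target)
{
	int status;

	if (ptrace(PT_ATTACH, target, NULL, 0) == -1)
		err(1, "PT_ATTACH");
	if (waitpid(target, &status, 0) == -1)	/* wait for the SIGSTOP */
		err(1, "waitpid");
	/* PT_GETREGS, PT_IO, etc. requests would go here. */
	if (ptrace(PT_CONTINUE, target, (caddr_t)1, 0) == -1)
		err(1, "PT_CONTINUE");
	waitpid(target, &status, 0);		/* next stop or exit */
	(void)ptrace(PT_DETACH, target, (caddr_t)1, 0);
}
#endif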
#ifdef COMPAT_FREEBSD32
/*
* This CPP subterfuge is to try and reduce the number of ifdefs in
* the body of the code.
* COPYIN(uap->addr, &r.reg, sizeof r.reg);
* becomes either:
* copyin(uap->addr, &r.reg, sizeof r.reg);
* or
* copyin(uap->addr, &r.reg32, sizeof r.reg32);
* .. except this is done at runtime.
*/
#define COPYIN(u, k, s) wrap32 ? \
copyin(u, k ## 32, s ## 32) : \
copyin(u, k, s)
#define COPYOUT(k, u, s) wrap32 ? \
copyout(k ## 32, u, s ## 32) : \
copyout(k, u, s)
#else
#define COPYIN(u, k, s) copyin(u, k, s)
#define COPYOUT(k, u, s) copyout(k, u, s)
#endif
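/*
 * Illustrative expansion (not part of the original source): with
 * COMPAT_FREEBSD32 defined, a call such as
 *	COPYIN(uap->addr, &r.reg, sizeof r.reg);
 * becomes
 *	wrap32 ? copyin(uap->addr, &r.reg32, sizeof r.reg32) :
 *	    copyin(uap->addr, &r.reg, sizeof r.reg);
 * since the ## paste attaches "32" to the trailing member name and to
 * the operand of sizeof.
 */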
int
-ptrace(struct thread *td, struct ptrace_args *uap)
+sys_ptrace(struct thread *td, struct ptrace_args *uap)
{
/*
* XXX this obfuscation is to reduce stack usage, but the register
* structs may be too large to put on the stack anyway.
*/
union {
struct ptrace_io_desc piod;
struct ptrace_lwpinfo pl;
struct ptrace_vm_entry pve;
struct dbreg dbreg;
struct fpreg fpreg;
struct reg reg;
#ifdef COMPAT_FREEBSD32
struct dbreg32 dbreg32;
struct fpreg32 fpreg32;
struct reg32 reg32;
struct ptrace_io_desc32 piod32;
struct ptrace_lwpinfo32 pl32;
struct ptrace_vm_entry32 pve32;
#endif
} r;
void *addr;
int error = 0;
#ifdef COMPAT_FREEBSD32
int wrap32 = 0;
if (SV_CURPROC_FLAG(SV_ILP32))
wrap32 = 1;
#endif
AUDIT_ARG_PID(uap->pid);
AUDIT_ARG_CMD(uap->req);
AUDIT_ARG_VALUE(uap->data);
addr = &r;
switch (uap->req) {
case PT_GETREGS:
case PT_GETFPREGS:
case PT_GETDBREGS:
case PT_LWPINFO:
break;
case PT_SETREGS:
error = COPYIN(uap->addr, &r.reg, sizeof r.reg);
break;
case PT_SETFPREGS:
error = COPYIN(uap->addr, &r.fpreg, sizeof r.fpreg);
break;
case PT_SETDBREGS:
error = COPYIN(uap->addr, &r.dbreg, sizeof r.dbreg);
break;
case PT_IO:
error = COPYIN(uap->addr, &r.piod, sizeof r.piod);
break;
case PT_VM_ENTRY:
error = COPYIN(uap->addr, &r.pve, sizeof r.pve);
break;
default:
addr = uap->addr;
break;
}
if (error)
return (error);
error = kern_ptrace(td, uap->req, uap->pid, addr, uap->data);
if (error)
return (error);
switch (uap->req) {
case PT_VM_ENTRY:
error = COPYOUT(&r.pve, uap->addr, sizeof r.pve);
break;
case PT_IO:
error = COPYOUT(&r.piod, uap->addr, sizeof r.piod);
break;
case PT_GETREGS:
error = COPYOUT(&r.reg, uap->addr, sizeof r.reg);
break;
case PT_GETFPREGS:
error = COPYOUT(&r.fpreg, uap->addr, sizeof r.fpreg);
break;
case PT_GETDBREGS:
error = COPYOUT(&r.dbreg, uap->addr, sizeof r.dbreg);
break;
case PT_LWPINFO:
error = copyout(&r.pl, uap->addr, uap->data);
break;
}
return (error);
}
#undef COPYIN
#undef COPYOUT
#ifdef COMPAT_FREEBSD32
/*
* PROC_READ(regs, td2, addr);
* becomes either:
* proc_read_regs(td2, addr);
* or
* proc_read_regs32(td2, addr);
* .. except this is done at runtime. There is an additional
* complication in that PROC_WRITE disallows 32 bit consumers
* from writing to 64 bit address space targets.
*/
#define PROC_READ(w, t, a) wrap32 ? \
proc_read_ ## w ## 32(t, a) : \
proc_read_ ## w (t, a)
#define PROC_WRITE(w, t, a) wrap32 ? \
(safe ? proc_write_ ## w ## 32(t, a) : EINVAL ) : \
proc_write_ ## w (t, a)
#else
#define PROC_READ(w, t, a) proc_read_ ## w (t, a)
#define PROC_WRITE(w, t, a) proc_write_ ## w (t, a)
#endif
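/*
 * Illustrative expansion (not part of the original source): with
 * COMPAT_FREEBSD32 defined,
 *	error = PROC_WRITE(regs, td2, addr);
 * becomes
 *	error = wrap32 ? (safe ? proc_write_regs32(td2, addr) : EINVAL) :
 *	    proc_write_regs(td2, addr);
 * so a 32 bit debugger attempting to write the registers of a 64 bit
 * target (safe == 0) gets EINVAL.
 */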
int
kern_ptrace(struct thread *td, int req, pid_t pid, void *addr, int data)
{
struct iovec iov;
struct uio uio;
struct proc *curp, *p, *pp;
struct thread *td2 = NULL;
struct ptrace_io_desc *piod = NULL;
struct ptrace_lwpinfo *pl;
int error, write, tmp, num;
int proctree_locked = 0;
lwpid_t tid = 0, *buf;
#ifdef COMPAT_FREEBSD32
int wrap32 = 0, safe = 0;
struct ptrace_io_desc32 *piod32 = NULL;
struct ptrace_lwpinfo32 *pl32 = NULL;
struct ptrace_lwpinfo plr;
#endif
curp = td->td_proc;
/* Lock proctree before locking the process. */
switch (req) {
case PT_TRACE_ME:
case PT_ATTACH:
case PT_STEP:
case PT_CONTINUE:
case PT_TO_SCE:
case PT_TO_SCX:
case PT_SYSCALL:
case PT_FOLLOW_FORK:
case PT_DETACH:
sx_xlock(&proctree_lock);
proctree_locked = 1;
break;
default:
break;
}
write = 0;
if (req == PT_TRACE_ME) {
p = td->td_proc;
PROC_LOCK(p);
} else {
if (pid <= PID_MAX) {
if ((p = pfind(pid)) == NULL) {
if (proctree_locked)
sx_xunlock(&proctree_lock);
return (ESRCH);
}
} else {
td2 = tdfind(pid, -1);
if (td2 == NULL) {
if (proctree_locked)
sx_xunlock(&proctree_lock);
return (ESRCH);
}
p = td2->td_proc;
tid = pid;
pid = p->p_pid;
}
}
AUDIT_ARG_PROCESS(p);
if ((p->p_flag & P_WEXIT) != 0) {
error = ESRCH;
goto fail;
}
if ((error = p_cansee(td, p)) != 0)
goto fail;
if ((error = p_candebug(td, p)) != 0)
goto fail;
/*
* System processes can't be debugged.
*/
if ((p->p_flag & P_SYSTEM) != 0) {
error = EINVAL;
goto fail;
}
if (tid == 0) {
if ((p->p_flag & P_STOPPED_TRACE) != 0) {
KASSERT(p->p_xthread != NULL, ("NULL p_xthread"));
td2 = p->p_xthread;
} else {
td2 = FIRST_THREAD_IN_PROC(p);
}
tid = td2->td_tid;
}
#ifdef COMPAT_FREEBSD32
/*
* Test if we're a 32 bit client and what the target is.
* Set the wrap controls accordingly.
*/
if (SV_CURPROC_FLAG(SV_ILP32)) {
if (SV_PROC_FLAG(td2->td_proc, SV_ILP32))
safe = 1;
wrap32 = 1;
}
#endif
/*
* Permissions check
*/
switch (req) {
case PT_TRACE_ME:
/* Always legal. */
break;
case PT_ATTACH:
/* Self */
if (p->p_pid == td->td_proc->p_pid) {
error = EINVAL;
goto fail;
}
/* Already traced */
if (p->p_flag & P_TRACED) {
error = EBUSY;
goto fail;
}
/* Can't trace an ancestor if you're being traced. */
if (curp->p_flag & P_TRACED) {
for (pp = curp->p_pptr; pp != NULL; pp = pp->p_pptr) {
if (pp == p) {
error = EINVAL;
goto fail;
}
}
}
/* OK */
break;
case PT_CLEARSTEP:
/* Allow thread to clear single step for itself */
if (td->td_tid == tid)
break;
/* FALLTHROUGH */
default:
/* not being traced... */
if ((p->p_flag & P_TRACED) == 0) {
error = EPERM;
goto fail;
}
/* not being traced by YOU */
if (p->p_pptr != td->td_proc) {
error = EBUSY;
goto fail;
}
/* not currently stopped */
if ((p->p_flag & (P_STOPPED_SIG | P_STOPPED_TRACE)) == 0 ||
p->p_suspcount != p->p_numthreads ||
(p->p_flag & P_WAITED) == 0) {
error = EBUSY;
goto fail;
}
if ((p->p_flag & P_STOPPED_TRACE) == 0) {
static int count = 0;
if (count++ == 0)
printf("P_STOPPED_TRACE not set.\n");
}
/* OK */
break;
}
/* Keep this process around until we finish this request. */
_PHOLD(p);
#ifdef FIX_SSTEP
/*
* Single step fixup ala procfs
*/
FIX_SSTEP(td2);
#endif
/*
* Actually do the requests
*/
td->td_retval[0] = 0;
switch (req) {
case PT_TRACE_ME:
/* set my trace flag and "owner" so it can read/write me */
p->p_flag |= P_TRACED;
p->p_oppid = p->p_pptr->p_pid;
break;
case PT_ATTACH:
/* security check done above */
/*
* It would be nice if the tracing relationship was separate
* from the parent relationship but that would require
* another set of links in the proc struct or for "wait"
* to scan the entire proc table. To make life easier,
* we just re-parent the process we're trying to trace.
* The old parent is remembered so we can put things back
* on a "detach".
*/
p->p_flag |= P_TRACED;
p->p_oppid = p->p_pptr->p_pid;
if (p->p_pptr != td->td_proc) {
/* Remember that a child is being debugged (traced). */
p->p_pptr->p_dbg_child++;
proc_reparent(p, td->td_proc);
}
data = SIGSTOP;
goto sendsig; /* in PT_CONTINUE below */
case PT_CLEARSTEP:
error = ptrace_clear_single_step(td2);
break;
case PT_SETSTEP:
error = ptrace_single_step(td2);
break;
case PT_SUSPEND:
td2->td_dbgflags |= TDB_SUSPEND;
thread_lock(td2);
td2->td_flags |= TDF_NEEDSUSPCHK;
thread_unlock(td2);
break;
case PT_RESUME:
td2->td_dbgflags &= ~TDB_SUSPEND;
break;
case PT_FOLLOW_FORK:
if (data)
p->p_flag |= P_FOLLOWFORK;
else
p->p_flag &= ~P_FOLLOWFORK;
break;
case PT_STEP:
case PT_CONTINUE:
case PT_TO_SCE:
case PT_TO_SCX:
case PT_SYSCALL:
case PT_DETACH:
/* Zero means do not send any signal */
if (data < 0 || data > _SIG_MAXSIG) {
error = EINVAL;
break;
}
switch (req) {
case PT_STEP:
error = ptrace_single_step(td2);
if (error)
goto out;
break;
case PT_CONTINUE:
case PT_TO_SCE:
case PT_TO_SCX:
case PT_SYSCALL:
if (addr != (void *)1) {
error = ptrace_set_pc(td2,
(u_long)(uintfptr_t)addr);
if (error)
goto out;
}
switch (req) {
case PT_TO_SCE:
p->p_stops |= S_PT_SCE;
break;
case PT_TO_SCX:
p->p_stops |= S_PT_SCX;
break;
case PT_SYSCALL:
p->p_stops |= S_PT_SCE | S_PT_SCX;
break;
}
break;
case PT_DETACH:
/* reset process parent */
if (p->p_oppid != p->p_pptr->p_pid) {
struct proc *pp;
PROC_LOCK(p->p_pptr);
sigqueue_take(p->p_ksi);
PROC_UNLOCK(p->p_pptr);
PROC_UNLOCK(p);
pp = pfind(p->p_oppid);
if (pp == NULL)
pp = initproc;
else
PROC_UNLOCK(pp);
PROC_LOCK(p);
proc_reparent(p, pp);
p->p_pptr->p_dbg_child--;
if (pp == initproc)
p->p_sigparent = SIGCHLD;
}
p->p_oppid = 0;
p->p_flag &= ~(P_TRACED | P_WAITED | P_FOLLOWFORK);
/* should we send SIGCHLD? */
/* childproc_continued(p); */
break;
}
sendsig:
if (proctree_locked) {
sx_xunlock(&proctree_lock);
proctree_locked = 0;
}
p->p_xstat = data;
p->p_xthread = NULL;
if ((p->p_flag & (P_STOPPED_SIG | P_STOPPED_TRACE)) != 0) {
/* deliver or queue signal */
td2->td_dbgflags &= ~TDB_XSIG;
td2->td_xsig = data;
if (req == PT_DETACH) {
struct thread *td3;
FOREACH_THREAD_IN_PROC(p, td3) {
td3->td_dbgflags &= ~TDB_SUSPEND;
}
}
/*
* Unsuspend all threads.  To keep a thread from running,
* use PT_SUSPEND to suspend it before continuing the
* process.
*/
PROC_SLOCK(p);
p->p_flag &= ~(P_STOPPED_TRACE|P_STOPPED_SIG|P_WAITED);
thread_unsuspend(p);
PROC_SUNLOCK(p);
} else {
if (data)
- psignal(p, data);
+ kern_psignal(p, data);
}
break;
case PT_WRITE_I:
case PT_WRITE_D:
td2->td_dbgflags |= TDB_USERWR;
write = 1;
/* FALLTHROUGH */
case PT_READ_I:
case PT_READ_D:
PROC_UNLOCK(p);
tmp = 0;
/* write = 0 set above */
iov.iov_base = write ? (caddr_t)&data : (caddr_t)&tmp;
iov.iov_len = sizeof(int);
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = (off_t)(uintptr_t)addr;
uio.uio_resid = sizeof(int);
uio.uio_segflg = UIO_SYSSPACE; /* i.e.: the uap */
uio.uio_rw = write ? UIO_WRITE : UIO_READ;
uio.uio_td = td;
error = proc_rwmem(p, &uio);
if (uio.uio_resid != 0) {
/*
* XXX proc_rwmem() doesn't currently return ENOSPC,
* so I think write() can bogusly return 0.
* XXX what happens for short writes? We don't want
* to write partial data.
* XXX proc_rwmem() returns EPERM for other invalid
* addresses. Convert this to EINVAL. Does this
* clobber returns of EPERM for other reasons?
*/
if (error == 0 || error == ENOSPC || error == EPERM)
error = EINVAL; /* EOF */
}
if (!write)
td->td_retval[0] = tmp;
PROC_LOCK(p);
break;
case PT_IO:
#ifdef COMPAT_FREEBSD32
if (wrap32) {
piod32 = addr;
iov.iov_base = (void *)(uintptr_t)piod32->piod_addr;
iov.iov_len = piod32->piod_len;
uio.uio_offset = (off_t)(uintptr_t)piod32->piod_offs;
uio.uio_resid = piod32->piod_len;
} else
#endif
{
piod = addr;
iov.iov_base = piod->piod_addr;
iov.iov_len = piod->piod_len;
uio.uio_offset = (off_t)(uintptr_t)piod->piod_offs;
uio.uio_resid = piod->piod_len;
}
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_segflg = UIO_USERSPACE;
uio.uio_td = td;
#ifdef COMPAT_FREEBSD32
tmp = wrap32 ? piod32->piod_op : piod->piod_op;
#else
tmp = piod->piod_op;
#endif
switch (tmp) {
case PIOD_READ_D:
case PIOD_READ_I:
uio.uio_rw = UIO_READ;
break;
case PIOD_WRITE_D:
case PIOD_WRITE_I:
td2->td_dbgflags |= TDB_USERWR;
uio.uio_rw = UIO_WRITE;
break;
default:
error = EINVAL;
goto out;
}
PROC_UNLOCK(p);
error = proc_rwmem(p, &uio);
#ifdef COMPAT_FREEBSD32
if (wrap32)
piod32->piod_len -= uio.uio_resid;
else
#endif
piod->piod_len -= uio.uio_resid;
PROC_LOCK(p);
break;
case PT_KILL:
data = SIGKILL;
goto sendsig; /* in PT_CONTINUE above */
case PT_SETREGS:
td2->td_dbgflags |= TDB_USERWR;
error = PROC_WRITE(regs, td2, addr);
break;
case PT_GETREGS:
error = PROC_READ(regs, td2, addr);
break;
case PT_SETFPREGS:
td2->td_dbgflags |= TDB_USERWR;
error = PROC_WRITE(fpregs, td2, addr);
break;
case PT_GETFPREGS:
error = PROC_READ(fpregs, td2, addr);
break;
case PT_SETDBREGS:
td2->td_dbgflags |= TDB_USERWR;
error = PROC_WRITE(dbregs, td2, addr);
break;
case PT_GETDBREGS:
error = PROC_READ(dbregs, td2, addr);
break;
case PT_LWPINFO:
if (data <= 0 ||
#ifdef COMPAT_FREEBSD32
(!wrap32 && data > sizeof(*pl)) ||
(wrap32 && data > sizeof(*pl32))) {
#else
data > sizeof(*pl)) {
#endif
error = EINVAL;
break;
}
#ifdef COMPAT_FREEBSD32
if (wrap32) {
pl = &plr;
pl32 = addr;
} else
#endif
pl = addr;
pl->pl_lwpid = td2->td_tid;
pl->pl_flags = 0;
if (td2->td_dbgflags & TDB_XSIG) {
pl->pl_event = PL_EVENT_SIGNAL;
if (td2->td_dbgksi.ksi_signo != 0 &&
#ifdef COMPAT_FREEBSD32
((!wrap32 && data >= offsetof(struct ptrace_lwpinfo,
pl_siginfo) + sizeof(pl->pl_siginfo)) ||
(wrap32 && data >= offsetof(struct ptrace_lwpinfo32,
pl_siginfo) + sizeof(struct siginfo32)))
#else
data >= offsetof(struct ptrace_lwpinfo, pl_siginfo)
+ sizeof(pl->pl_siginfo)
#endif
){
pl->pl_flags |= PL_FLAG_SI;
pl->pl_siginfo = td2->td_dbgksi.ksi_info;
}
}
if ((pl->pl_flags & PL_FLAG_SI) == 0)
bzero(&pl->pl_siginfo, sizeof(pl->pl_siginfo));
if (td2->td_dbgflags & TDB_SCE)
pl->pl_flags |= PL_FLAG_SCE;
else if (td2->td_dbgflags & TDB_SCX)
pl->pl_flags |= PL_FLAG_SCX;
if (td2->td_dbgflags & TDB_EXEC)
pl->pl_flags |= PL_FLAG_EXEC;
if (td2->td_dbgflags & TDB_FORK) {
pl->pl_flags |= PL_FLAG_FORKED;
pl->pl_child_pid = td2->td_dbg_forked;
}
pl->pl_sigmask = td2->td_sigmask;
pl->pl_siglist = td2->td_siglist;
strcpy(pl->pl_tdname, td2->td_name);
#ifdef COMPAT_FREEBSD32
if (wrap32)
ptrace_lwpinfo_to32(pl, pl32);
#endif
break;
case PT_GETNUMLWPS:
td->td_retval[0] = p->p_numthreads;
break;
case PT_GETLWPLIST:
if (data <= 0) {
error = EINVAL;
break;
}
num = imin(p->p_numthreads, data);
PROC_UNLOCK(p);
buf = malloc(num * sizeof(lwpid_t), M_TEMP, M_WAITOK);
tmp = 0;
PROC_LOCK(p);
FOREACH_THREAD_IN_PROC(p, td2) {
if (tmp >= num)
break;
buf[tmp++] = td2->td_tid;
}
PROC_UNLOCK(p);
error = copyout(buf, addr, tmp * sizeof(lwpid_t));
free(buf, M_TEMP);
if (!error)
td->td_retval[0] = tmp;
PROC_LOCK(p);
break;
case PT_VM_TIMESTAMP:
td->td_retval[0] = p->p_vmspace->vm_map.timestamp;
break;
case PT_VM_ENTRY:
PROC_UNLOCK(p);
#ifdef COMPAT_FREEBSD32
if (wrap32)
error = ptrace_vm_entry32(td, p, addr);
else
#endif
error = ptrace_vm_entry(td, p, addr);
PROC_LOCK(p);
break;
default:
#ifdef __HAVE_PTRACE_MACHDEP
if (req >= PT_FIRSTMACH) {
PROC_UNLOCK(p);
error = cpu_ptrace(td2, req, addr, data);
PROC_LOCK(p);
} else
#endif
/* Unknown request. */
error = EINVAL;
break;
}
out:
/* Drop our hold on this process now that the request has completed. */
_PRELE(p);
fail:
PROC_UNLOCK(p);
if (proctree_locked)
sx_xunlock(&proctree_lock);
return (error);
}
#undef PROC_READ
#undef PROC_WRITE
/*
* Stop a process because of a debugging event;
* stay stopped until p->p_step is cleared
* (cleared by PIOCCONT in procfs).
*/
void
stopevent(struct proc *p, unsigned int event, unsigned int val)
{
PROC_LOCK_ASSERT(p, MA_OWNED);
p->p_step = 1;
do {
p->p_xstat = val;
p->p_xthread = NULL;
p->p_stype = event; /* Which event caused the stop? */
wakeup(&p->p_stype); /* Wake up any PIOCWAIT'ing procs */
msleep(&p->p_step, &p->p_mtx, PWAIT, "stopevent", 0);
} while (p->p_step);
}
Index: head/sys/kern/sysv_msg.c
===================================================================
--- head/sys/kern/sysv_msg.c (revision 225616)
+++ head/sys/kern/sysv_msg.c (revision 225617)
@@ -1,1590 +1,1592 @@
/*-
* Implementation of SVID messages
*
* Author: Daniel Boulet
*
* Copyright 1993 Daniel Boulet and RTMX Inc.
*
* This system call was implemented by Daniel Boulet under contract from RTMX.
*
* Redistribution and use in source forms, with and without modification,
* are permitted provided that this entire comment appears intact.
*
* Redistribution in binary form may occur without any restrictions.
* Obviously, it would be nice if you gave credit where credit is due
* but requiring it would be too onerous.
*
* This software is provided ``AS IS'' without any warranties of any kind.
*/
/*-
* Copyright (c) 2003-2005 McAfee, Inc.
* All rights reserved.
*
* This software was developed for the FreeBSD Project in part by McAfee
* Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR
* contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research
* program.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_sysvipc.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/kernel.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/module.h>
#include <sys/msg.h>
#include <sys/racct.h>
#include <sys/syscall.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/jail.h>
#include <security/mac/mac_framework.h>
FEATURE(sysv_msg, "System V message queues support");
static MALLOC_DEFINE(M_MSG, "msg", "SVID compatible message queues");
static int msginit(void);
static int msgunload(void);
static int sysvmsg_modload(struct module *, int, void *);
+
#ifdef MSG_DEBUG
#define DPRINTF(a) printf a
#else
#define DPRINTF(a) (void)0
#endif
static void msg_freehdr(struct msg *msghdr);
#ifndef MSGSSZ
#define MSGSSZ 8 /* Each segment must be 2^N long */
#endif
#ifndef MSGSEG
#define MSGSEG 2048 /* must be less than 32767 */
#endif
#define MSGMAX (MSGSSZ*MSGSEG)
#ifndef MSGMNB
#define MSGMNB 2048 /* max # of bytes in a queue */
#endif
#ifndef MSGMNI
#define MSGMNI 40
#endif
#ifndef MSGTQL
#define MSGTQL 40
#endif
/*
* Based on the configuration parameters described in an SVR2 (yes, two)
* config(1m) man page.
*
* Each message is broken up and stored in segments that are msgssz bytes
* long. For efficiency reasons, this should be a power of two. Also,
* it doesn't make sense if it is less than 8 or greater than about 256.
* Consequently, msginit in kern/sysv_msg.c checks that msgssz is a power of
* two between 8 and 1024 inclusive (and panics if it isn't).
*/
struct msginfo msginfo = {
MSGMAX, /* max chars in a message */
MSGMNI, /* # of message queue identifiers */
MSGMNB, /* max chars in a queue */
MSGTQL, /* max messages in system */
MSGSSZ, /* size of a message segment */
/* (must be small power of 2 greater than 4) */
MSGSEG /* number of message segments */
};
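/*
 * Worked example (not part of the original source): with the default
 * MSGSSZ of 8 bytes, a 100-byte message occupies
 * (100 + 8 - 1) / 8 = 13 segments, the same segs_needed computation
 * kern_msgsnd performs below.
 */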
/*
* macros to convert between msqid_ds's and msqid's.
* (specific to this implementation)
*/
#define MSQID(ix,ds) ((ix) & 0xffff | (((ds).msg_perm.seq << 16) & 0xffff0000))
#define MSQID_IX(id) ((id) & 0xffff)
#define MSQID_SEQ(id) (((id) >> 16) & 0xffff)
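/*
 * Worked example (not part of the original source): for index 3 and
 * msg_perm.seq == 2, MSQID() yields 0x00020003; MSQID_IX() recovers 3
 * and MSQID_SEQ() recovers 2 from that identifier.
 */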
/*
* The rest of this file is specific to this particular implementation.
*/
struct msgmap {
short next; /* next segment in buffer */
/* -1 -> available */
/* 0..(MSGSEG-1) -> index of next segment */
};
#define MSG_LOCKED 01000 /* Is this msqid_ds locked? */
static int nfree_msgmaps; /* # of free map entries */
static short free_msgmaps; /* head of linked list of free map entries */
static struct msg *free_msghdrs;/* list of free msg headers */
static char *msgpool; /* MSGMAX byte long msg buffer pool */
static struct msgmap *msgmaps; /* MSGSEG msgmap structures */
static struct msg *msghdrs; /* MSGTQL msg headers */
static struct msqid_kernel *msqids; /* MSGMNI msqid_kernel struct's */
static struct mtx msq_mtx; /* global mutex for message queues. */
static struct syscall_helper_data msg_syscalls[] = {
SYSCALL_INIT_HELPER(msgctl),
SYSCALL_INIT_HELPER(msgget),
SYSCALL_INIT_HELPER(msgsnd),
SYSCALL_INIT_HELPER(msgrcv),
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
SYSCALL_INIT_HELPER(msgsys),
- SYSCALL_INIT_HELPER(freebsd7_msgctl),
+ SYSCALL_INIT_HELPER_COMPAT(freebsd7_msgctl),
#endif
SYSCALL_INIT_LAST
};
#ifdef COMPAT_FREEBSD32
#include <compat/freebsd32/freebsd32.h>
#include <compat/freebsd32/freebsd32_ipc.h>
#include <compat/freebsd32/freebsd32_proto.h>
#include <compat/freebsd32/freebsd32_signal.h>
#include <compat/freebsd32/freebsd32_syscall.h>
#include <compat/freebsd32/freebsd32_util.h>
static struct syscall_helper_data msg32_syscalls[] = {
SYSCALL32_INIT_HELPER(freebsd32_msgctl),
SYSCALL32_INIT_HELPER(freebsd32_msgsnd),
SYSCALL32_INIT_HELPER(freebsd32_msgrcv),
- SYSCALL32_INIT_HELPER(msgget),
+ SYSCALL32_INIT_HELPER_COMPAT(msgget),
SYSCALL32_INIT_HELPER(freebsd32_msgsys),
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
SYSCALL32_INIT_HELPER(freebsd7_freebsd32_msgctl),
#endif
SYSCALL_INIT_LAST
};
#endif
static int
msginit()
{
int i, error;
TUNABLE_INT_FETCH("kern.ipc.msgseg", &msginfo.msgseg);
TUNABLE_INT_FETCH("kern.ipc.msgssz", &msginfo.msgssz);
msginfo.msgmax = msginfo.msgseg * msginfo.msgssz;
TUNABLE_INT_FETCH("kern.ipc.msgmni", &msginfo.msgmni);
TUNABLE_INT_FETCH("kern.ipc.msgmnb", &msginfo.msgmnb);
TUNABLE_INT_FETCH("kern.ipc.msgtql", &msginfo.msgtql);
msgpool = malloc(msginfo.msgmax, M_MSG, M_WAITOK);
msgmaps = malloc(sizeof(struct msgmap) * msginfo.msgseg, M_MSG, M_WAITOK);
msghdrs = malloc(sizeof(struct msg) * msginfo.msgtql, M_MSG, M_WAITOK);
msqids = malloc(sizeof(struct msqid_kernel) * msginfo.msgmni, M_MSG,
M_WAITOK);
/*
* msginfo.msgssz should be a power of two for efficiency reasons.
* It is also pretty silly if msginfo.msgssz is less than 8
* or greater than about 256 so ...
*/
i = 8;
while (i < 1024 && i != msginfo.msgssz)
i <<= 1;
if (i != msginfo.msgssz) {
DPRINTF(("msginfo.msgssz=%d (0x%x)\n", msginfo.msgssz,
msginfo.msgssz));
panic("msginfo.msgssz not a small power of 2");
}
if (msginfo.msgseg > 32767) {
DPRINTF(("msginfo.msgseg=%d\n", msginfo.msgseg));
panic("msginfo.msgseg > 32767");
}
for (i = 0; i < msginfo.msgseg; i++) {
if (i > 0)
msgmaps[i-1].next = i;
msgmaps[i].next = -1; /* implies entry is available */
}
free_msgmaps = 0;
nfree_msgmaps = msginfo.msgseg;
for (i = 0; i < msginfo.msgtql; i++) {
msghdrs[i].msg_type = 0;
if (i > 0)
msghdrs[i-1].msg_next = &msghdrs[i];
msghdrs[i].msg_next = NULL;
#ifdef MAC
mac_sysvmsg_init(&msghdrs[i]);
#endif
}
free_msghdrs = &msghdrs[0];
for (i = 0; i < msginfo.msgmni; i++) {
msqids[i].u.msg_qbytes = 0; /* implies entry is available */
msqids[i].u.msg_perm.seq = 0; /* reset to a known value */
msqids[i].u.msg_perm.mode = 0;
#ifdef MAC
mac_sysvmsq_init(&msqids[i]);
#endif
}
mtx_init(&msq_mtx, "msq", NULL, MTX_DEF);
error = syscall_helper_register(msg_syscalls);
if (error != 0)
return (error);
#ifdef COMPAT_FREEBSD32
error = syscall32_helper_register(msg32_syscalls);
if (error != 0)
return (error);
#endif
return (0);
}
static int
msgunload()
{
struct msqid_kernel *msqkptr;
int msqid;
#ifdef MAC
int i;
#endif
syscall_helper_unregister(msg_syscalls);
#ifdef COMPAT_FREEBSD32
syscall32_helper_unregister(msg32_syscalls);
#endif
for (msqid = 0; msqid < msginfo.msgmni; msqid++) {
/*
* Look for an unallocated and unlocked msqid_ds.
* msqid_ds's can be locked by msgsnd or msgrcv while
* they are copying the message in/out. We can't
* re-use the entry until they release it.
*/
msqkptr = &msqids[msqid];
if (msqkptr->u.msg_qbytes != 0 ||
(msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0)
break;
}
if (msqid != msginfo.msgmni)
return (EBUSY);
#ifdef MAC
for (i = 0; i < msginfo.msgtql; i++)
mac_sysvmsg_destroy(&msghdrs[i]);
for (msqid = 0; msqid < msginfo.msgmni; msqid++)
mac_sysvmsq_destroy(&msqids[msqid]);
#endif
free(msgpool, M_MSG);
free(msgmaps, M_MSG);
free(msghdrs, M_MSG);
free(msqids, M_MSG);
mtx_destroy(&msq_mtx);
return (0);
}
static int
sysvmsg_modload(struct module *module, int cmd, void *arg)
{
int error = 0;
switch (cmd) {
case MOD_LOAD:
error = msginit();
if (error != 0)
msgunload();
break;
case MOD_UNLOAD:
error = msgunload();
break;
case MOD_SHUTDOWN:
break;
default:
error = EINVAL;
break;
}
return (error);
}
static moduledata_t sysvmsg_mod = {
"sysvmsg",
&sysvmsg_modload,
NULL
};
DECLARE_MODULE(sysvmsg, sysvmsg_mod, SI_SUB_SYSV_MSG, SI_ORDER_FIRST);
MODULE_VERSION(sysvmsg, 1);
static void
msg_freehdr(msghdr)
struct msg *msghdr;
{
while (msghdr->msg_ts > 0) {
short next;
if (msghdr->msg_spot < 0 || msghdr->msg_spot >= msginfo.msgseg)
panic("msghdr->msg_spot out of range");
next = msgmaps[msghdr->msg_spot].next;
msgmaps[msghdr->msg_spot].next = free_msgmaps;
free_msgmaps = msghdr->msg_spot;
nfree_msgmaps++;
msghdr->msg_spot = next;
if (msghdr->msg_ts >= msginfo.msgssz)
msghdr->msg_ts -= msginfo.msgssz;
else
msghdr->msg_ts = 0;
}
if (msghdr->msg_spot != -1)
panic("msghdr->msg_spot != -1");
msghdr->msg_next = free_msghdrs;
free_msghdrs = msghdr;
#ifdef MAC
mac_sysvmsg_cleanup(msghdr);
#endif
}
#ifndef _SYS_SYSPROTO_H_
struct msgctl_args {
int msqid;
int cmd;
struct msqid_ds *buf;
};
#endif
int
-msgctl(td, uap)
+sys_msgctl(td, uap)
struct thread *td;
register struct msgctl_args *uap;
{
int msqid = uap->msqid;
int cmd = uap->cmd;
struct msqid_ds msqbuf;
int error;
DPRINTF(("call to msgctl(%d, %d, %p)\n", msqid, cmd, uap->buf));
if (cmd == IPC_SET &&
(error = copyin(uap->buf, &msqbuf, sizeof(msqbuf))) != 0)
return (error);
error = kern_msgctl(td, msqid, cmd, &msqbuf);
if (cmd == IPC_STAT && error == 0)
error = copyout(&msqbuf, uap->buf, sizeof(struct msqid_ds));
return (error);
}
int
kern_msgctl(td, msqid, cmd, msqbuf)
struct thread *td;
int msqid;
int cmd;
struct msqid_ds *msqbuf;
{
int rval, error, msqix;
register struct msqid_kernel *msqkptr;
if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
return (ENOSYS);
msqix = IPCID_TO_IX(msqid);
if (msqix < 0 || msqix >= msginfo.msgmni) {
DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix,
msginfo.msgmni));
return (EINVAL);
}
msqkptr = &msqids[msqix];
mtx_lock(&msq_mtx);
if (msqkptr->u.msg_qbytes == 0) {
DPRINTF(("no such msqid\n"));
error = EINVAL;
goto done2;
}
if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) {
DPRINTF(("wrong sequence number\n"));
error = EINVAL;
goto done2;
}
#ifdef MAC
error = mac_sysvmsq_check_msqctl(td->td_ucred, msqkptr, cmd);
if (error != 0)
goto done2;
#endif
error = 0;
rval = 0;
switch (cmd) {
case IPC_RMID:
{
struct msg *msghdr;
if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_M)))
goto done2;
#ifdef MAC
/*
* Check that the thread has MAC access permissions to
* individual msghdrs. Note: We need to do this in a
* separate loop because the actual loop alters the
* msq/msghdr info as it progresses, and there is no going
* back if, halfway through, we discover that the
* thread cannot free a certain msghdr. The msq will get
* into an inconsistent state.
*/
for (msghdr = msqkptr->u.msg_first; msghdr != NULL;
msghdr = msghdr->msg_next) {
error = mac_sysvmsq_check_msgrmid(td->td_ucred, msghdr);
if (error != 0)
goto done2;
}
#endif
racct_sub_cred(msqkptr->cred, RACCT_NMSGQ, 1);
racct_sub_cred(msqkptr->cred, RACCT_MSGQQUEUED, msqkptr->u.msg_qnum);
racct_sub_cred(msqkptr->cred, RACCT_MSGQSIZE, msqkptr->u.msg_cbytes);
crfree(msqkptr->cred);
msqkptr->cred = NULL;
/* Free the message headers */
msghdr = msqkptr->u.msg_first;
while (msghdr != NULL) {
struct msg *msghdr_tmp;
/* Free the segments of each message */
msqkptr->u.msg_cbytes -= msghdr->msg_ts;
msqkptr->u.msg_qnum--;
msghdr_tmp = msghdr;
msghdr = msghdr->msg_next;
msg_freehdr(msghdr_tmp);
}
if (msqkptr->u.msg_cbytes != 0)
panic("msg_cbytes is screwed up");
if (msqkptr->u.msg_qnum != 0)
panic("msg_qnum is screwed up");
msqkptr->u.msg_qbytes = 0; /* Mark it as free */
#ifdef MAC
mac_sysvmsq_cleanup(msqkptr);
#endif
wakeup(msqkptr);
}
break;
case IPC_SET:
if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_M)))
goto done2;
if (msqbuf->msg_qbytes > msqkptr->u.msg_qbytes) {
error = priv_check(td, PRIV_IPC_MSGSIZE);
if (error)
goto done2;
}
if (msqbuf->msg_qbytes > msginfo.msgmnb) {
DPRINTF(("can't increase msg_qbytes beyond %d"
"(truncating)\n", msginfo.msgmnb));
msqbuf->msg_qbytes = msginfo.msgmnb; /* silently restrict qbytes to system limit */
}
if (msqbuf->msg_qbytes == 0) {
DPRINTF(("can't reduce msg_qbytes to 0\n"));
error = EINVAL; /* non-standard errno! */
goto done2;
}
msqkptr->u.msg_perm.uid = msqbuf->msg_perm.uid; /* change the owner */
msqkptr->u.msg_perm.gid = msqbuf->msg_perm.gid; /* change the owner */
msqkptr->u.msg_perm.mode = (msqkptr->u.msg_perm.mode & ~0777) |
(msqbuf->msg_perm.mode & 0777);
msqkptr->u.msg_qbytes = msqbuf->msg_qbytes;
msqkptr->u.msg_ctime = time_second;
break;
case IPC_STAT:
if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_R))) {
DPRINTF(("requester doesn't have read access\n"));
goto done2;
}
*msqbuf = msqkptr->u;
break;
default:
DPRINTF(("invalid command %d\n", cmd));
error = EINVAL;
goto done2;
}
if (error == 0)
td->td_retval[0] = rval;
done2:
mtx_unlock(&msq_mtx);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct msgget_args {
key_t key;
int msgflg;
};
#endif
+
int
-msgget(td, uap)
+sys_msgget(td, uap)
struct thread *td;
register struct msgget_args *uap;
{
int msqid, error = 0;
int key = uap->key;
int msgflg = uap->msgflg;
struct ucred *cred = td->td_ucred;
register struct msqid_kernel *msqkptr = NULL;
DPRINTF(("msgget(0x%x, 0%o)\n", key, msgflg));
if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
return (ENOSYS);
mtx_lock(&msq_mtx);
if (key != IPC_PRIVATE) {
for (msqid = 0; msqid < msginfo.msgmni; msqid++) {
msqkptr = &msqids[msqid];
if (msqkptr->u.msg_qbytes != 0 &&
msqkptr->u.msg_perm.key == key)
break;
}
if (msqid < msginfo.msgmni) {
DPRINTF(("found public key\n"));
if ((msgflg & IPC_CREAT) && (msgflg & IPC_EXCL)) {
DPRINTF(("not exclusive\n"));
error = EEXIST;
goto done2;
}
if ((error = ipcperm(td, &msqkptr->u.msg_perm,
msgflg & 0700))) {
DPRINTF(("requester doesn't have 0%o access\n",
msgflg & 0700));
goto done2;
}
#ifdef MAC
error = mac_sysvmsq_check_msqget(cred, msqkptr);
if (error != 0)
goto done2;
#endif
goto found;
}
}
DPRINTF(("need to allocate the msqid_ds\n"));
if (key == IPC_PRIVATE || (msgflg & IPC_CREAT)) {
for (msqid = 0; msqid < msginfo.msgmni; msqid++) {
/*
* Look for an unallocated and unlocked msqid_ds.
* msqid_ds's can be locked by msgsnd or msgrcv while
* they are copying the message in/out. We can't
* re-use the entry until they release it.
*/
msqkptr = &msqids[msqid];
if (msqkptr->u.msg_qbytes == 0 &&
(msqkptr->u.msg_perm.mode & MSG_LOCKED) == 0)
break;
}
if (msqid == msginfo.msgmni) {
DPRINTF(("no more msqid_ds's available\n"));
error = ENOSPC;
goto done2;
}
#ifdef RACCT
PROC_LOCK(td->td_proc);
error = racct_add(td->td_proc, RACCT_NMSGQ, 1);
PROC_UNLOCK(td->td_proc);
if (error != 0) {
error = ENOSPC;
goto done2;
}
#endif
DPRINTF(("msqid %d is available\n", msqid));
msqkptr->u.msg_perm.key = key;
msqkptr->u.msg_perm.cuid = cred->cr_uid;
msqkptr->u.msg_perm.uid = cred->cr_uid;
msqkptr->u.msg_perm.cgid = cred->cr_gid;
msqkptr->u.msg_perm.gid = cred->cr_gid;
msqkptr->u.msg_perm.mode = (msgflg & 0777);
msqkptr->cred = crhold(cred);
/* Make sure that the returned msqid is unique */
msqkptr->u.msg_perm.seq = (msqkptr->u.msg_perm.seq + 1) & 0x7fff;
msqkptr->u.msg_first = NULL;
msqkptr->u.msg_last = NULL;
msqkptr->u.msg_cbytes = 0;
msqkptr->u.msg_qnum = 0;
msqkptr->u.msg_qbytes = msginfo.msgmnb;
msqkptr->u.msg_lspid = 0;
msqkptr->u.msg_lrpid = 0;
msqkptr->u.msg_stime = 0;
msqkptr->u.msg_rtime = 0;
msqkptr->u.msg_ctime = time_second;
#ifdef MAC
mac_sysvmsq_create(cred, msqkptr);
#endif
} else {
DPRINTF(("didn't find it and wasn't asked to create it\n"));
error = ENOENT;
goto done2;
}
found:
/* Construct the unique msqid */
td->td_retval[0] = IXSEQ_TO_IPCID(msqid, msqkptr->u.msg_perm);
done2:
mtx_unlock(&msq_mtx);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct msgsnd_args {
int msqid;
const void *msgp;
size_t msgsz;
int msgflg;
};
#endif
int
kern_msgsnd(td, msqid, msgp, msgsz, msgflg, mtype)
struct thread *td;
int msqid;
const void *msgp; /* XXX msgp is actually mtext. */
size_t msgsz;
int msgflg;
long mtype;
{
int msqix, segs_needed, error = 0;
register struct msqid_kernel *msqkptr;
register struct msg *msghdr;
short next;
#ifdef RACCT
size_t saved_msgsz;
#endif
if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
return (ENOSYS);
mtx_lock(&msq_mtx);
msqix = IPCID_TO_IX(msqid);
if (msqix < 0 || msqix >= msginfo.msgmni) {
DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix,
msginfo.msgmni));
error = EINVAL;
goto done2;
}
msqkptr = &msqids[msqix];
if (msqkptr->u.msg_qbytes == 0) {
DPRINTF(("no such message queue id\n"));
error = EINVAL;
goto done2;
}
if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) {
DPRINTF(("wrong sequence number\n"));
error = EINVAL;
goto done2;
}
if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_W))) {
DPRINTF(("requester doesn't have write access\n"));
goto done2;
}
#ifdef MAC
error = mac_sysvmsq_check_msqsnd(td->td_ucred, msqkptr);
if (error != 0)
goto done2;
#endif
#ifdef RACCT
PROC_LOCK(td->td_proc);
if (racct_add(td->td_proc, RACCT_MSGQQUEUED, 1)) {
PROC_UNLOCK(td->td_proc);
error = EAGAIN;
goto done2;
}
saved_msgsz = msgsz;
if (racct_add(td->td_proc, RACCT_MSGQSIZE, msgsz)) {
racct_sub(td->td_proc, RACCT_MSGQQUEUED, 1);
PROC_UNLOCK(td->td_proc);
error = EAGAIN;
goto done2;
}
PROC_UNLOCK(td->td_proc);
#endif
segs_needed = (msgsz + msginfo.msgssz - 1) / msginfo.msgssz;
DPRINTF(("msgsz=%zu, msgssz=%d, segs_needed=%d\n", msgsz,
msginfo.msgssz, segs_needed));
for (;;) {
int need_more_resources = 0;
/*
* check msgsz
* (inside this loop in case msg_qbytes changes while we sleep)
*/
if (msgsz > msqkptr->u.msg_qbytes) {
DPRINTF(("msgsz > msqkptr->u.msg_qbytes\n"));
error = EINVAL;
goto done3;
}
if (msqkptr->u.msg_perm.mode & MSG_LOCKED) {
DPRINTF(("msqid is locked\n"));
need_more_resources = 1;
}
if (msgsz + msqkptr->u.msg_cbytes > msqkptr->u.msg_qbytes) {
DPRINTF(("msgsz + msg_cbytes > msg_qbytes\n"));
need_more_resources = 1;
}
if (segs_needed > nfree_msgmaps) {
DPRINTF(("segs_needed > nfree_msgmaps\n"));
need_more_resources = 1;
}
if (free_msghdrs == NULL) {
DPRINTF(("no more msghdrs\n"));
need_more_resources = 1;
}
if (need_more_resources) {
int we_own_it;
if ((msgflg & IPC_NOWAIT) != 0) {
DPRINTF(("need more resources but caller "
"doesn't want to wait\n"));
error = EAGAIN;
goto done3;
}
if ((msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0) {
DPRINTF(("we don't own the msqid_ds\n"));
we_own_it = 0;
} else {
/* Force later arrivals to wait for our
request */
DPRINTF(("we own the msqid_ds\n"));
msqkptr->u.msg_perm.mode |= MSG_LOCKED;
we_own_it = 1;
}
DPRINTF(("msgsnd: goodnight\n"));
error = msleep(msqkptr, &msq_mtx, (PZERO - 4) | PCATCH,
"msgsnd", hz);
DPRINTF(("msgsnd: good morning, error=%d\n", error));
if (we_own_it)
msqkptr->u.msg_perm.mode &= ~MSG_LOCKED;
if (error == EWOULDBLOCK) {
DPRINTF(("msgsnd: timed out\n"));
continue;
}
if (error != 0) {
DPRINTF(("msgsnd: interrupted system call\n"));
error = EINTR;
goto done3;
}
/*
* Make sure that the msq queue still exists
*/
if (msqkptr->u.msg_qbytes == 0) {
DPRINTF(("msqid deleted\n"));
error = EIDRM;
goto done3;
}
} else {
DPRINTF(("got all the resources that we need\n"));
break;
}
}
/*
* We have the resources that we need.
* Make sure!
*/
if (msqkptr->u.msg_perm.mode & MSG_LOCKED)
panic("msg_perm.mode & MSG_LOCKED");
if (segs_needed > nfree_msgmaps)
panic("segs_needed > nfree_msgmaps");
if (msgsz + msqkptr->u.msg_cbytes > msqkptr->u.msg_qbytes)
panic("msgsz + msg_cbytes > msg_qbytes");
if (free_msghdrs == NULL)
panic("no more msghdrs");
/*
* Re-lock the msqid_ds in case we page-fault when copying in the
* message
*/
if ((msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0)
panic("msqid_ds is already locked");
msqkptr->u.msg_perm.mode |= MSG_LOCKED;
/*
* Allocate a message header
*/
msghdr = free_msghdrs;
free_msghdrs = msghdr->msg_next;
msghdr->msg_spot = -1;
msghdr->msg_ts = msgsz;
msghdr->msg_type = mtype;
#ifdef MAC
/*
* XXXMAC: Should the mac_sysvmsq_check_msgmsq check follow here
* immediately? Or, should it be checked just before the msg is
* enqueued in the msgq (as it is done now)?
*/
mac_sysvmsg_create(td->td_ucred, msqkptr, msghdr);
#endif
/*
* Allocate space for the message
*/
while (segs_needed > 0) {
if (nfree_msgmaps <= 0)
panic("not enough msgmaps");
if (free_msgmaps == -1)
panic("nil free_msgmaps");
next = free_msgmaps;
if (next <= -1)
panic("next too low #1");
if (next >= msginfo.msgseg)
panic("next out of range #1");
DPRINTF(("allocating segment %d to message\n", next));
free_msgmaps = msgmaps[next].next;
nfree_msgmaps--;
msgmaps[next].next = msghdr->msg_spot;
msghdr->msg_spot = next;
segs_needed--;
}
/*
* Validate the message type
*/
if (msghdr->msg_type < 1) {
msg_freehdr(msghdr);
msqkptr->u.msg_perm.mode &= ~MSG_LOCKED;
wakeup(msqkptr);
DPRINTF(("mtype (%ld) < 1\n", msghdr->msg_type));
error = EINVAL;
goto done3;
}
/*
* Copy in the message body
*/
next = msghdr->msg_spot;
while (msgsz > 0) {
size_t tlen;
if (msgsz > msginfo.msgssz)
tlen = msginfo.msgssz;
else
tlen = msgsz;
if (next <= -1)
panic("next too low #2");
if (next >= msginfo.msgseg)
panic("next out of range #2");
mtx_unlock(&msq_mtx);
if ((error = copyin(msgp, &msgpool[next * msginfo.msgssz],
tlen)) != 0) {
mtx_lock(&msq_mtx);
DPRINTF(("error %d copying in message segment\n",
error));
msg_freehdr(msghdr);
msqkptr->u.msg_perm.mode &= ~MSG_LOCKED;
wakeup(msqkptr);
goto done3;
}
mtx_lock(&msq_mtx);
msgsz -= tlen;
msgp = (const char *)msgp + tlen;
next = msgmaps[next].next;
}
if (next != -1)
panic("didn't use all the msg segments");
/*
* We've got the message. Unlock the msqid_ds.
*/
msqkptr->u.msg_perm.mode &= ~MSG_LOCKED;
/*
* Make sure that the msqid_ds is still allocated.
*/
if (msqkptr->u.msg_qbytes == 0) {
msg_freehdr(msghdr);
wakeup(msqkptr);
error = EIDRM;
goto done3;
}
#ifdef MAC
/*
* Note: Since the task/thread allocates the msghdr and usually
* primes it with its own MAC label, for a majority of policies, it
* won't be necessary to check whether the msghdr has access
* permissions to the msgq. The mac_sysvmsq_check_msqsnd check would
* suffice in that case. However, this hook may be required where
* individual policies derive a non-identical label for the msghdr
* from the current thread label and may want to check the msghdr
* enqueue permissions, along with read/write permissions to the
* msgq.
*/
error = mac_sysvmsq_check_msgmsq(td->td_ucred, msghdr, msqkptr);
if (error != 0) {
msg_freehdr(msghdr);
wakeup(msqkptr);
goto done3;
}
#endif
/*
* Put the message into the queue
*/
if (msqkptr->u.msg_first == NULL) {
msqkptr->u.msg_first = msghdr;
msqkptr->u.msg_last = msghdr;
} else {
msqkptr->u.msg_last->msg_next = msghdr;
msqkptr->u.msg_last = msghdr;
}
msqkptr->u.msg_last->msg_next = NULL;
msqkptr->u.msg_cbytes += msghdr->msg_ts;
msqkptr->u.msg_qnum++;
msqkptr->u.msg_lspid = td->td_proc->p_pid;
msqkptr->u.msg_stime = time_second;
wakeup(msqkptr);
td->td_retval[0] = 0;
done3:
#ifdef RACCT
if (error != 0) {
PROC_LOCK(td->td_proc);
racct_sub(td->td_proc, RACCT_MSGQQUEUED, 1);
racct_sub(td->td_proc, RACCT_MSGQSIZE, saved_msgsz);
PROC_UNLOCK(td->td_proc);
}
#endif
done2:
mtx_unlock(&msq_mtx);
return (error);
}
int
-msgsnd(td, uap)
+sys_msgsnd(td, uap)
struct thread *td;
register struct msgsnd_args *uap;
{
int error;
long mtype;
DPRINTF(("call to msgsnd(%d, %p, %zu, %d)\n", uap->msqid, uap->msgp,
uap->msgsz, uap->msgflg));
if ((error = copyin(uap->msgp, &mtype, sizeof(mtype))) != 0) {
DPRINTF(("error %d copying the message type\n", error));
return (error);
}
return (kern_msgsnd(td, uap->msqid,
(const char *)uap->msgp + sizeof(mtype),
uap->msgsz, uap->msgflg, mtype));
}
#ifndef _SYS_SYSPROTO_H_
struct msgrcv_args {
int msqid;
void *msgp;
size_t msgsz;
long msgtyp;
int msgflg;
};
#endif
int
kern_msgrcv(td, msqid, msgp, msgsz, msgtyp, msgflg, mtype)
struct thread *td;
int msqid;
void *msgp; /* XXX msgp is actually mtext. */
size_t msgsz;
long msgtyp;
int msgflg;
long *mtype;
{
size_t len;
register struct msqid_kernel *msqkptr;
register struct msg *msghdr;
int msqix, error = 0;
short next;
if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
return (ENOSYS);
msqix = IPCID_TO_IX(msqid);
if (msqix < 0 || msqix >= msginfo.msgmni) {
DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix,
msginfo.msgmni));
return (EINVAL);
}
msqkptr = &msqids[msqix];
mtx_lock(&msq_mtx);
if (msqkptr->u.msg_qbytes == 0) {
DPRINTF(("no such message queue id\n"));
error = EINVAL;
goto done2;
}
if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) {
DPRINTF(("wrong sequence number\n"));
error = EINVAL;
goto done2;
}
if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_R))) {
DPRINTF(("requester doesn't have read access\n"));
goto done2;
}
#ifdef MAC
error = mac_sysvmsq_check_msqrcv(td->td_ucred, msqkptr);
if (error != 0)
goto done2;
#endif
msghdr = NULL;
while (msghdr == NULL) {
if (msgtyp == 0) {
msghdr = msqkptr->u.msg_first;
if (msghdr != NULL) {
if (msgsz < msghdr->msg_ts &&
(msgflg & MSG_NOERROR) == 0) {
DPRINTF(("first message on the queue "
"is too big (want %zu, got %d)\n",
msgsz, msghdr->msg_ts));
error = E2BIG;
goto done2;
}
#ifdef MAC
error = mac_sysvmsq_check_msgrcv(td->td_ucred,
msghdr);
if (error != 0)
goto done2;
#endif
if (msqkptr->u.msg_first == msqkptr->u.msg_last) {
msqkptr->u.msg_first = NULL;
msqkptr->u.msg_last = NULL;
} else {
msqkptr->u.msg_first = msghdr->msg_next;
if (msqkptr->u.msg_first == NULL)
panic("msg_first/last screwed up #1");
}
}
} else {
struct msg *previous;
struct msg **prev;
previous = NULL;
prev = &(msqkptr->u.msg_first);
while ((msghdr = *prev) != NULL) {
/*
* Is this message's type an exact match or is
* this message's type less than or equal to
* the absolute value of a negative msgtyp?
* Note that the second half of this test can
* NEVER be true if msgtyp is positive since
* msg_type is always positive!
*/
if (msgtyp == msghdr->msg_type ||
msghdr->msg_type <= -msgtyp) {
DPRINTF(("found message type %ld, "
"requested %ld\n",
msghdr->msg_type, msgtyp));
if (msgsz < msghdr->msg_ts &&
(msgflg & MSG_NOERROR) == 0) {
DPRINTF(("requested message "
"on the queue is too big "
"(want %zu, got %hu)\n",
msgsz, msghdr->msg_ts));
error = E2BIG;
goto done2;
}
#ifdef MAC
error = mac_sysvmsq_check_msgrcv(
td->td_ucred, msghdr);
if (error != 0)
goto done2;
#endif
*prev = msghdr->msg_next;
if (msghdr == msqkptr->u.msg_last) {
if (previous == NULL) {
if (prev !=
&msqkptr->u.msg_first)
panic("msg_first/last screwed up #2");
msqkptr->u.msg_first =
NULL;
msqkptr->u.msg_last =
NULL;
} else {
if (prev ==
&msqkptr->u.msg_first)
panic("msg_first/last screwed up #3");
msqkptr->u.msg_last =
previous;
}
}
break;
}
previous = msghdr;
prev = &(msghdr->msg_next);
}
}
/*
* We've either extracted the msghdr for the appropriate
* message or there isn't one.
* If there is one then bail out of this loop.
*/
if (msghdr != NULL)
break;
/*
* Hmph! No message found. Does the user want to wait?
*/
if ((msgflg & IPC_NOWAIT) != 0) {
DPRINTF(("no appropriate message found (msgtyp=%ld)\n",
msgtyp));
/* The SVID says to return ENOMSG. */
error = ENOMSG;
goto done2;
}
/*
* Wait for something to happen
*/
DPRINTF(("msgrcv: goodnight\n"));
error = msleep(msqkptr, &msq_mtx, (PZERO - 4) | PCATCH,
"msgrcv", 0);
DPRINTF(("msgrcv: good morning (error=%d)\n", error));
if (error != 0) {
DPRINTF(("msgrcv: interrupted system call\n"));
error = EINTR;
goto done2;
}
/*
* Make sure that the msq queue still exists
*/
if (msqkptr->u.msg_qbytes == 0 ||
msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) {
DPRINTF(("msqid deleted\n"));
error = EIDRM;
goto done2;
}
}
/*
* Return the message to the user.
*
* First, do the bookkeeping (before we risk being interrupted).
*/
msqkptr->u.msg_cbytes -= msghdr->msg_ts;
msqkptr->u.msg_qnum--;
msqkptr->u.msg_lrpid = td->td_proc->p_pid;
msqkptr->u.msg_rtime = time_second;
racct_sub_cred(msqkptr->cred, RACCT_MSGQQUEUED, 1);
racct_sub_cred(msqkptr->cred, RACCT_MSGQSIZE, msghdr->msg_ts);
/*
* Make msgsz the actual amount that we'll be returning.
* Note that this effectively truncates the message if it is too long
* (since msgsz is never increased).
*/
DPRINTF(("found a message, msgsz=%zu, msg_ts=%hu\n", msgsz,
msghdr->msg_ts));
if (msgsz > msghdr->msg_ts)
msgsz = msghdr->msg_ts;
*mtype = msghdr->msg_type;
/*
* Return the segments to the user
*/
next = msghdr->msg_spot;
for (len = 0; len < msgsz; len += msginfo.msgssz) {
size_t tlen;
if (msgsz - len > msginfo.msgssz)
tlen = msginfo.msgssz;
else
tlen = msgsz - len;
if (next <= -1)
panic("next too low #3");
if (next >= msginfo.msgseg)
panic("next out of range #3");
mtx_unlock(&msq_mtx);
error = copyout(&msgpool[next * msginfo.msgssz], msgp, tlen);
mtx_lock(&msq_mtx);
if (error != 0) {
DPRINTF(("error (%d) copying out message segment\n",
error));
msg_freehdr(msghdr);
wakeup(msqkptr);
goto done2;
}
msgp = (char *)msgp + tlen;
next = msgmaps[next].next;
}
/*
* Done, return the actual number of bytes copied out.
*/
msg_freehdr(msghdr);
wakeup(msqkptr);
td->td_retval[0] = msgsz;
done2:
mtx_unlock(&msq_mtx);
return (error);
}
int
-msgrcv(td, uap)
+sys_msgrcv(td, uap)
struct thread *td;
register struct msgrcv_args *uap;
{
int error;
long mtype;
DPRINTF(("call to msgrcv(%d, %p, %zu, %ld, %d)\n", uap->msqid,
uap->msgp, uap->msgsz, uap->msgtyp, uap->msgflg));
if ((error = kern_msgrcv(td, uap->msqid,
(char *)uap->msgp + sizeof(mtype), uap->msgsz,
uap->msgtyp, uap->msgflg, &mtype)) != 0)
return (error);
if ((error = copyout(&mtype, uap->msgp, sizeof(mtype))) != 0)
DPRINTF(("error %d copying the message type\n", error));
return (error);
}
static int
sysctl_msqids(SYSCTL_HANDLER_ARGS)
{
return (SYSCTL_OUT(req, msqids,
sizeof(struct msqid_kernel) * msginfo.msgmni));
}
SYSCTL_INT(_kern_ipc, OID_AUTO, msgmax, CTLFLAG_RD, &msginfo.msgmax, 0,
"Maximum message size");
SYSCTL_INT(_kern_ipc, OID_AUTO, msgmni, CTLFLAG_RDTUN, &msginfo.msgmni, 0,
"Number of message queue identifiers");
SYSCTL_INT(_kern_ipc, OID_AUTO, msgmnb, CTLFLAG_RDTUN, &msginfo.msgmnb, 0,
"Maximum number of bytes in a queue");
SYSCTL_INT(_kern_ipc, OID_AUTO, msgtql, CTLFLAG_RDTUN, &msginfo.msgtql, 0,
"Maximum number of messages in the system");
SYSCTL_INT(_kern_ipc, OID_AUTO, msgssz, CTLFLAG_RDTUN, &msginfo.msgssz, 0,
"Size of a message segment");
SYSCTL_INT(_kern_ipc, OID_AUTO, msgseg, CTLFLAG_RDTUN, &msginfo.msgseg, 0,
"Number of message segments");
SYSCTL_PROC(_kern_ipc, OID_AUTO, msqids, CTLTYPE_OPAQUE | CTLFLAG_RD,
NULL, 0, sysctl_msqids, "", "Message queue IDs");
#ifdef COMPAT_FREEBSD32
int
freebsd32_msgsys(struct thread *td, struct freebsd32_msgsys_args *uap)
{
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
switch (uap->which) {
case 0:
return (freebsd7_freebsd32_msgctl(td,
(struct freebsd7_freebsd32_msgctl_args *)&uap->a2));
case 2:
return (freebsd32_msgsnd(td,
(struct freebsd32_msgsnd_args *)&uap->a2));
case 3:
return (freebsd32_msgrcv(td,
(struct freebsd32_msgrcv_args *)&uap->a2));
default:
- return (msgsys(td, (struct msgsys_args *)uap));
+ return (sys_msgsys(td, (struct msgsys_args *)uap));
}
#else
return (nosys(td, NULL));
#endif
}
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
int
freebsd7_freebsd32_msgctl(struct thread *td,
struct freebsd7_freebsd32_msgctl_args *uap)
{
struct msqid_ds msqbuf;
struct msqid_ds32_old msqbuf32;
int error;
if (uap->cmd == IPC_SET) {
error = copyin(uap->buf, &msqbuf32, sizeof(msqbuf32));
if (error)
return (error);
freebsd32_ipcperm_old_in(&msqbuf32.msg_perm, &msqbuf.msg_perm);
PTRIN_CP(msqbuf32, msqbuf, msg_first);
PTRIN_CP(msqbuf32, msqbuf, msg_last);
CP(msqbuf32, msqbuf, msg_cbytes);
CP(msqbuf32, msqbuf, msg_qnum);
CP(msqbuf32, msqbuf, msg_qbytes);
CP(msqbuf32, msqbuf, msg_lspid);
CP(msqbuf32, msqbuf, msg_lrpid);
CP(msqbuf32, msqbuf, msg_stime);
CP(msqbuf32, msqbuf, msg_rtime);
CP(msqbuf32, msqbuf, msg_ctime);
}
error = kern_msgctl(td, uap->msqid, uap->cmd, &msqbuf);
if (error)
return (error);
if (uap->cmd == IPC_STAT) {
bzero(&msqbuf32, sizeof(msqbuf32));
freebsd32_ipcperm_old_out(&msqbuf.msg_perm, &msqbuf32.msg_perm);
PTROUT_CP(msqbuf, msqbuf32, msg_first);
PTROUT_CP(msqbuf, msqbuf32, msg_last);
CP(msqbuf, msqbuf32, msg_cbytes);
CP(msqbuf, msqbuf32, msg_qnum);
CP(msqbuf, msqbuf32, msg_qbytes);
CP(msqbuf, msqbuf32, msg_lspid);
CP(msqbuf, msqbuf32, msg_lrpid);
CP(msqbuf, msqbuf32, msg_stime);
CP(msqbuf, msqbuf32, msg_rtime);
CP(msqbuf, msqbuf32, msg_ctime);
error = copyout(&msqbuf32, uap->buf, sizeof(struct msqid_ds32));
}
return (error);
}
#endif
int
freebsd32_msgctl(struct thread *td, struct freebsd32_msgctl_args *uap)
{
struct msqid_ds msqbuf;
struct msqid_ds32 msqbuf32;
int error;
if (uap->cmd == IPC_SET) {
error = copyin(uap->buf, &msqbuf32, sizeof(msqbuf32));
if (error)
return (error);
freebsd32_ipcperm_in(&msqbuf32.msg_perm, &msqbuf.msg_perm);
PTRIN_CP(msqbuf32, msqbuf, msg_first);
PTRIN_CP(msqbuf32, msqbuf, msg_last);
CP(msqbuf32, msqbuf, msg_cbytes);
CP(msqbuf32, msqbuf, msg_qnum);
CP(msqbuf32, msqbuf, msg_qbytes);
CP(msqbuf32, msqbuf, msg_lspid);
CP(msqbuf32, msqbuf, msg_lrpid);
CP(msqbuf32, msqbuf, msg_stime);
CP(msqbuf32, msqbuf, msg_rtime);
CP(msqbuf32, msqbuf, msg_ctime);
}
error = kern_msgctl(td, uap->msqid, uap->cmd, &msqbuf);
if (error)
return (error);
if (uap->cmd == IPC_STAT) {
freebsd32_ipcperm_out(&msqbuf.msg_perm, &msqbuf32.msg_perm);
PTROUT_CP(msqbuf, msqbuf32, msg_first);
PTROUT_CP(msqbuf, msqbuf32, msg_last);
CP(msqbuf, msqbuf32, msg_cbytes);
CP(msqbuf, msqbuf32, msg_qnum);
CP(msqbuf, msqbuf32, msg_qbytes);
CP(msqbuf, msqbuf32, msg_lspid);
CP(msqbuf, msqbuf32, msg_lrpid);
CP(msqbuf, msqbuf32, msg_stime);
CP(msqbuf, msqbuf32, msg_rtime);
CP(msqbuf, msqbuf32, msg_ctime);
error = copyout(&msqbuf32, uap->buf, sizeof(struct msqid_ds32));
}
return (error);
}
int
freebsd32_msgsnd(struct thread *td, struct freebsd32_msgsnd_args *uap)
{
const void *msgp;
long mtype;
int32_t mtype32;
int error;
msgp = PTRIN(uap->msgp);
if ((error = copyin(msgp, &mtype32, sizeof(mtype32))) != 0)
return (error);
mtype = mtype32;
return (kern_msgsnd(td, uap->msqid,
(const char *)msgp + sizeof(mtype32),
uap->msgsz, uap->msgflg, mtype));
}
int
freebsd32_msgrcv(struct thread *td, struct freebsd32_msgrcv_args *uap)
{
void *msgp;
long mtype;
int32_t mtype32;
int error;
msgp = PTRIN(uap->msgp);
if ((error = kern_msgrcv(td, uap->msqid,
(char *)msgp + sizeof(mtype32), uap->msgsz,
uap->msgtyp, uap->msgflg, &mtype)) != 0)
return (error);
mtype32 = (int32_t)mtype;
return (copyout(&mtype32, msgp, sizeof(mtype32)));
}
#endif
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
/* XXX casting to (sy_call_t *) is bogus, as usual. */
static sy_call_t *msgcalls[] = {
- (sy_call_t *)freebsd7_msgctl, (sy_call_t *)msgget,
- (sy_call_t *)msgsnd, (sy_call_t *)msgrcv
+ (sy_call_t *)freebsd7_msgctl, (sy_call_t *)sys_msgget,
+ (sy_call_t *)sys_msgsnd, (sy_call_t *)sys_msgrcv
};
/*
* Entry point for all MSG calls.
*/
int
-msgsys(td, uap)
+sys_msgsys(td, uap)
struct thread *td;
/* XXX actually varargs. */
struct msgsys_args /* {
int which;
int a2;
int a3;
int a4;
int a5;
int a6;
} */ *uap;
{
int error;
if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
return (ENOSYS);
if (uap->which < 0 ||
uap->which >= sizeof(msgcalls)/sizeof(msgcalls[0]))
return (EINVAL);
error = (*msgcalls[uap->which])(td, &uap->a2);
return (error);
}
#ifndef CP
#define CP(src, dst, fld) do { (dst).fld = (src).fld; } while (0)
#endif
#ifndef _SYS_SYSPROTO_H_
struct freebsd7_msgctl_args {
int msqid;
int cmd;
struct msqid_ds_old *buf;
};
#endif
int
freebsd7_msgctl(td, uap)
struct thread *td;
struct freebsd7_msgctl_args *uap;
{
struct msqid_ds_old msqold;
struct msqid_ds msqbuf;
int error;
DPRINTF(("call to freebsd7_msgctl(%d, %d, %p)\n", uap->msqid, uap->cmd,
uap->buf));
if (uap->cmd == IPC_SET) {
error = copyin(uap->buf, &msqold, sizeof(msqold));
if (error)
return (error);
ipcperm_old2new(&msqold.msg_perm, &msqbuf.msg_perm);
CP(msqold, msqbuf, msg_first);
CP(msqold, msqbuf, msg_last);
CP(msqold, msqbuf, msg_cbytes);
CP(msqold, msqbuf, msg_qnum);
CP(msqold, msqbuf, msg_qbytes);
CP(msqold, msqbuf, msg_lspid);
CP(msqold, msqbuf, msg_lrpid);
CP(msqold, msqbuf, msg_stime);
CP(msqold, msqbuf, msg_rtime);
CP(msqold, msqbuf, msg_ctime);
}
error = kern_msgctl(td, uap->msqid, uap->cmd, &msqbuf);
if (error)
return (error);
if (uap->cmd == IPC_STAT) {
bzero(&msqold, sizeof(msqold));
ipcperm_new2old(&msqbuf.msg_perm, &msqold.msg_perm);
CP(msqbuf, msqold, msg_first);
CP(msqbuf, msqold, msg_last);
CP(msqbuf, msqold, msg_cbytes);
CP(msqbuf, msqold, msg_qnum);
CP(msqbuf, msqold, msg_qbytes);
CP(msqbuf, msqold, msg_lspid);
CP(msqbuf, msqold, msg_lrpid);
CP(msqbuf, msqold, msg_stime);
CP(msqbuf, msqold, msg_rtime);
CP(msqbuf, msqold, msg_ctime);
error = copyout(&msqold, uap->buf, sizeof(struct msqid_ds_old));
}
return (error);
}
#undef CP
#endif /* COMPAT_FREEBSD4 || COMPAT_FREEBSD5 || COMPAT_FREEBSD6 ||
COMPAT_FREEBSD7 */
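For orientation, a minimal userland sketch (not part of the diff) of how the msgget/msgsnd/msgrcv/msgctl syscalls implemented above are typically exercised; the function name msg_example, the key 0x1234, the permissions, and the message layout are arbitrary illustrative values. Note how the long mtype leads the user buffer, matching the way kern_msgsnd()/kern_msgrcv() split mtype from mtext.

#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <err.h>
#include <string.h>

struct mymsg {
	long mtype;		/* must be >= 1; kern_msgsnd() rejects smaller values */
	char mtext[64];		/* copied in/out segment by segment by the kernel */
};

int
msg_example(void)
{
	struct mymsg m = { .mtype = 1 };
	int id;

	strlcpy(m.mtext, "hello", sizeof(m.mtext));
	if ((id = msgget((key_t)0x1234, IPC_CREAT | 0600)) == -1)
		err(1, "msgget");
	if (msgsnd(id, &m, sizeof(m.mtext), 0) == -1)
		err(1, "msgsnd");
	if (msgrcv(id, &m, sizeof(m.mtext), 1, 0) == -1)
		err(1, "msgrcv");
	return (msgctl(id, IPC_RMID, NULL));
}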
Index: head/sys/kern/sysv_sem.c
===================================================================
--- head/sys/kern/sysv_sem.c (revision 225616)
+++ head/sys/kern/sysv_sem.c (revision 225617)
@@ -1,1666 +1,1666 @@
/*-
* Implementation of SVID semaphores
*
* Author: Daniel Boulet
*
* This software is provided ``AS IS'' without any warranties of any kind.
*/
/*-
* Copyright (c) 2003-2005 McAfee, Inc.
* All rights reserved.
*
* This software was developed for the FreeBSD Project in part by McAfee
* Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR
* contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research
* program.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_sysvipc.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/racct.h>
#include <sys/sem.h>
#include <sys/syscall.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/malloc.h>
#include <sys/jail.h>
#include <security/mac/mac_framework.h>
FEATURE(sysv_sem, "System V semaphores support");
static MALLOC_DEFINE(M_SEM, "sem", "SVID compatible semaphores");
#ifdef SEM_DEBUG
#define DPRINTF(a) printf a
#else
#define DPRINTF(a)
#endif
static int seminit(void);
static int sysvsem_modload(struct module *, int, void *);
static int semunload(void);
static void semexit_myhook(void *arg, struct proc *p);
static int sysctl_sema(SYSCTL_HANDLER_ARGS);
static int semvalid(int semid, struct semid_kernel *semakptr);
#ifndef _SYS_SYSPROTO_H_
struct __semctl_args;
int __semctl(struct thread *td, struct __semctl_args *uap);
struct semget_args;
int semget(struct thread *td, struct semget_args *uap);
struct semop_args;
int semop(struct thread *td, struct semop_args *uap);
#endif
static struct sem_undo *semu_alloc(struct thread *td);
static int semundo_adjust(struct thread *td, struct sem_undo **supptr,
int semid, int semseq, int semnum, int adjval);
static void semundo_clear(int semid, int semnum);
static struct mtx sem_mtx; /* semaphore global lock */
static struct mtx sem_undo_mtx;
static int semtot = 0;
static struct semid_kernel *sema; /* semaphore id pool */
static struct mtx *sema_mtx; /* semaphore id pool mutexes*/
static struct sem *sem; /* semaphore pool */
LIST_HEAD(, sem_undo) semu_list; /* list of active undo structures */
LIST_HEAD(, sem_undo) semu_free_list; /* list of free undo structures */
static int *semu; /* undo structure pool */
static eventhandler_tag semexit_tag;
#define SEMUNDO_MTX sem_undo_mtx
#define SEMUNDO_LOCK() mtx_lock(&SEMUNDO_MTX);
#define SEMUNDO_UNLOCK() mtx_unlock(&SEMUNDO_MTX);
#define SEMUNDO_LOCKASSERT(how) mtx_assert(&SEMUNDO_MTX, (how));
struct sem {
u_short semval; /* semaphore value */
pid_t sempid; /* pid of last operation */
u_short semncnt; /* # awaiting semval > cval */
u_short semzcnt; /* # awaiting semval = 0 */
};
/*
* Undo structure (one per process)
*/
struct sem_undo {
LIST_ENTRY(sem_undo) un_next; /* ptr to next active undo structure */
struct proc *un_proc; /* owner of this structure */
short un_cnt; /* # of active entries */
struct undo {
short un_adjval; /* adjust on exit values */
short un_num; /* semaphore # */
int un_id; /* semid */
unsigned short un_seq;
} un_ent[1]; /* undo entries */
};
/*
* Configuration parameters
*/
#ifndef SEMMNI
#define SEMMNI 50 /* # of semaphore identifiers */
#endif
#ifndef SEMMNS
#define SEMMNS 340 /* # of semaphores in system */
#endif
#ifndef SEMUME
#define SEMUME 50 /* max # of undo entries per process */
#endif
#ifndef SEMMNU
#define SEMMNU 150 /* # of undo structures in system */
#endif
/* shouldn't need tuning */
#ifndef SEMMSL
#define SEMMSL SEMMNS /* max # of semaphores per id */
#endif
#ifndef SEMOPM
#define SEMOPM 100 /* max # of operations per semop call */
#endif
#define SEMVMX 32767 /* semaphore maximum value */
#define SEMAEM 16384 /* adjust on exit max value */
/*
* Due to the way semaphore memory is allocated, we have to ensure that
* SEMUSZ is properly aligned.
*/
#define SEM_ALIGN(bytes) (((bytes) + (sizeof(long) - 1)) & ~(sizeof(long) - 1))
/* actual size of an undo structure */
#define SEMUSZ SEM_ALIGN(offsetof(struct sem_undo, un_ent[SEMUME]))
/*
* Macro to find a particular sem_undo vector
*/
#define SEMU(ix) \
((struct sem_undo *)(((intptr_t)semu)+ix * seminfo.semusz))
/*
* semaphore info struct
*/
struct seminfo seminfo = {
SEMMNI, /* # of semaphore identifiers */
SEMMNS, /* # of semaphores in system */
SEMMNU, /* # of undo structures in system */
SEMMSL, /* max # of semaphores per id */
SEMOPM, /* max # of operations per semop call */
SEMUME, /* max # of undo entries per process */
SEMUSZ, /* size in bytes of undo structure */
SEMVMX, /* semaphore maximum value */
SEMAEM /* adjust on exit max value */
};
SYSCTL_INT(_kern_ipc, OID_AUTO, semmni, CTLFLAG_RDTUN, &seminfo.semmni, 0,
"Number of semaphore identifiers");
SYSCTL_INT(_kern_ipc, OID_AUTO, semmns, CTLFLAG_RDTUN, &seminfo.semmns, 0,
"Maximum number of semaphores in the system");
SYSCTL_INT(_kern_ipc, OID_AUTO, semmnu, CTLFLAG_RDTUN, &seminfo.semmnu, 0,
"Maximum number of undo structures in the system");
SYSCTL_INT(_kern_ipc, OID_AUTO, semmsl, CTLFLAG_RW, &seminfo.semmsl, 0,
"Max semaphores per id");
SYSCTL_INT(_kern_ipc, OID_AUTO, semopm, CTLFLAG_RDTUN, &seminfo.semopm, 0,
"Max operations per semop call");
SYSCTL_INT(_kern_ipc, OID_AUTO, semume, CTLFLAG_RDTUN, &seminfo.semume, 0,
"Max undo entries per process");
SYSCTL_INT(_kern_ipc, OID_AUTO, semusz, CTLFLAG_RDTUN, &seminfo.semusz, 0,
"Size in bytes of undo structure");
SYSCTL_INT(_kern_ipc, OID_AUTO, semvmx, CTLFLAG_RW, &seminfo.semvmx, 0,
"Semaphore maximum value");
SYSCTL_INT(_kern_ipc, OID_AUTO, semaem, CTLFLAG_RW, &seminfo.semaem, 0,
"Adjust on exit max value");
SYSCTL_PROC(_kern_ipc, OID_AUTO, sema, CTLTYPE_OPAQUE | CTLFLAG_RD,
NULL, 0, sysctl_sema, "", "Semaphore id pool");
static struct syscall_helper_data sem_syscalls[] = {
SYSCALL_INIT_HELPER(__semctl),
SYSCALL_INIT_HELPER(semget),
SYSCALL_INIT_HELPER(semop),
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
SYSCALL_INIT_HELPER(semsys),
- SYSCALL_INIT_HELPER(freebsd7___semctl),
+ SYSCALL_INIT_HELPER_COMPAT(freebsd7___semctl),
#endif
SYSCALL_INIT_LAST
};
#ifdef COMPAT_FREEBSD32
#include <compat/freebsd32/freebsd32.h>
#include <compat/freebsd32/freebsd32_ipc.h>
#include <compat/freebsd32/freebsd32_proto.h>
#include <compat/freebsd32/freebsd32_signal.h>
#include <compat/freebsd32/freebsd32_syscall.h>
#include <compat/freebsd32/freebsd32_util.h>
static struct syscall_helper_data sem32_syscalls[] = {
SYSCALL32_INIT_HELPER(freebsd32_semctl),
- SYSCALL32_INIT_HELPER(semget),
- SYSCALL32_INIT_HELPER(semop),
+ SYSCALL32_INIT_HELPER_COMPAT(semget),
+ SYSCALL32_INIT_HELPER_COMPAT(semop),
SYSCALL32_INIT_HELPER(freebsd32_semsys),
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
SYSCALL32_INIT_HELPER(freebsd7_freebsd32_semctl),
#endif
SYSCALL_INIT_LAST
};
#endif
static int
seminit(void)
{
int i, error;
TUNABLE_INT_FETCH("kern.ipc.semmni", &seminfo.semmni);
TUNABLE_INT_FETCH("kern.ipc.semmns", &seminfo.semmns);
TUNABLE_INT_FETCH("kern.ipc.semmnu", &seminfo.semmnu);
TUNABLE_INT_FETCH("kern.ipc.semmsl", &seminfo.semmsl);
TUNABLE_INT_FETCH("kern.ipc.semopm", &seminfo.semopm);
TUNABLE_INT_FETCH("kern.ipc.semume", &seminfo.semume);
TUNABLE_INT_FETCH("kern.ipc.semusz", &seminfo.semusz);
TUNABLE_INT_FETCH("kern.ipc.semvmx", &seminfo.semvmx);
TUNABLE_INT_FETCH("kern.ipc.semaem", &seminfo.semaem);
sem = malloc(sizeof(struct sem) * seminfo.semmns, M_SEM, M_WAITOK);
sema = malloc(sizeof(struct semid_kernel) * seminfo.semmni, M_SEM,
M_WAITOK);
sema_mtx = malloc(sizeof(struct mtx) * seminfo.semmni, M_SEM,
M_WAITOK | M_ZERO);
semu = malloc(seminfo.semmnu * seminfo.semusz, M_SEM, M_WAITOK);
for (i = 0; i < seminfo.semmni; i++) {
sema[i].u.sem_base = 0;
sema[i].u.sem_perm.mode = 0;
sema[i].u.sem_perm.seq = 0;
#ifdef MAC
mac_sysvsem_init(&sema[i]);
#endif
}
for (i = 0; i < seminfo.semmni; i++)
mtx_init(&sema_mtx[i], "semid", NULL, MTX_DEF);
LIST_INIT(&semu_free_list);
for (i = 0; i < seminfo.semmnu; i++) {
struct sem_undo *suptr = SEMU(i);
suptr->un_proc = NULL;
LIST_INSERT_HEAD(&semu_free_list, suptr, un_next);
}
LIST_INIT(&semu_list);
mtx_init(&sem_mtx, "sem", NULL, MTX_DEF);
mtx_init(&sem_undo_mtx, "semu", NULL, MTX_DEF);
semexit_tag = EVENTHANDLER_REGISTER(process_exit, semexit_myhook, NULL,
EVENTHANDLER_PRI_ANY);
error = syscall_helper_register(sem_syscalls);
if (error != 0)
return (error);
#ifdef COMPAT_FREEBSD32
error = syscall32_helper_register(sem32_syscalls);
if (error != 0)
return (error);
#endif
return (0);
}
static int
semunload(void)
{
int i;
/* XXXKIB */
if (semtot != 0)
return (EBUSY);
#ifdef COMPAT_FREEBSD32
syscall32_helper_unregister(sem32_syscalls);
#endif
syscall_helper_unregister(sem_syscalls);
EVENTHANDLER_DEREGISTER(process_exit, semexit_tag);
#ifdef MAC
for (i = 0; i < seminfo.semmni; i++)
mac_sysvsem_destroy(&sema[i]);
#endif
free(sem, M_SEM);
free(sema, M_SEM);
free(semu, M_SEM);
for (i = 0; i < seminfo.semmni; i++)
mtx_destroy(&sema_mtx[i]);
free(sema_mtx, M_SEM);
mtx_destroy(&sem_mtx);
mtx_destroy(&sem_undo_mtx);
return (0);
}
static int
sysvsem_modload(struct module *module, int cmd, void *arg)
{
int error = 0;
switch (cmd) {
case MOD_LOAD:
error = seminit();
if (error != 0)
semunload();
break;
case MOD_UNLOAD:
error = semunload();
break;
case MOD_SHUTDOWN:
break;
default:
error = EINVAL;
break;
}
return (error);
}
static moduledata_t sysvsem_mod = {
"sysvsem",
&sysvsem_modload,
NULL
};
DECLARE_MODULE(sysvsem, sysvsem_mod, SI_SUB_SYSV_SEM, SI_ORDER_FIRST);
MODULE_VERSION(sysvsem, 1);
/*
* Allocate a new sem_undo structure for a process
* (returns ptr to structure or NULL if no more room)
*/
static struct sem_undo *
semu_alloc(struct thread *td)
{
struct sem_undo *suptr;
SEMUNDO_LOCKASSERT(MA_OWNED);
if ((suptr = LIST_FIRST(&semu_free_list)) == NULL)
return (NULL);
LIST_REMOVE(suptr, un_next);
LIST_INSERT_HEAD(&semu_list, suptr, un_next);
suptr->un_cnt = 0;
suptr->un_proc = td->td_proc;
return (suptr);
}
static int
semu_try_free(struct sem_undo *suptr)
{
SEMUNDO_LOCKASSERT(MA_OWNED);
if (suptr->un_cnt != 0)
return (0);
LIST_REMOVE(suptr, un_next);
LIST_INSERT_HEAD(&semu_free_list, suptr, un_next);
return (1);
}
/*
* Adjust a particular entry for a particular proc
*/
static int
semundo_adjust(struct thread *td, struct sem_undo **supptr, int semid,
int semseq, int semnum, int adjval)
{
struct proc *p = td->td_proc;
struct sem_undo *suptr;
struct undo *sunptr;
int i;
SEMUNDO_LOCKASSERT(MA_OWNED);
/* Look for and remember the sem_undo if the caller doesn't provide
it */
suptr = *supptr;
if (suptr == NULL) {
LIST_FOREACH(suptr, &semu_list, un_next) {
if (suptr->un_proc == p) {
*supptr = suptr;
break;
}
}
if (suptr == NULL) {
if (adjval == 0)
return(0);
suptr = semu_alloc(td);
if (suptr == NULL)
return (ENOSPC);
*supptr = suptr;
}
}
/*
* Look for the requested entry and adjust it (delete if adjval becomes
* 0).
*/
sunptr = &suptr->un_ent[0];
for (i = 0; i < suptr->un_cnt; i++, sunptr++) {
if (sunptr->un_id != semid || sunptr->un_num != semnum)
continue;
if (adjval != 0) {
adjval += sunptr->un_adjval;
if (adjval > seminfo.semaem || adjval < -seminfo.semaem)
return (ERANGE);
}
sunptr->un_adjval = adjval;
if (sunptr->un_adjval == 0) {
suptr->un_cnt--;
if (i < suptr->un_cnt)
suptr->un_ent[i] =
suptr->un_ent[suptr->un_cnt];
if (suptr->un_cnt == 0)
semu_try_free(suptr);
}
return (0);
}
/* Didn't find the right entry - create it */
if (adjval == 0)
return (0);
if (adjval > seminfo.semaem || adjval < -seminfo.semaem)
return (ERANGE);
if (suptr->un_cnt != seminfo.semume) {
sunptr = &suptr->un_ent[suptr->un_cnt];
suptr->un_cnt++;
sunptr->un_adjval = adjval;
sunptr->un_id = semid;
sunptr->un_num = semnum;
sunptr->un_seq = semseq;
} else
return (EINVAL);
return (0);
}
static void
semundo_clear(int semid, int semnum)
{
struct sem_undo *suptr, *suptr1;
struct undo *sunptr;
int i;
SEMUNDO_LOCKASSERT(MA_OWNED);
LIST_FOREACH_SAFE(suptr, &semu_list, un_next, suptr1) {
sunptr = &suptr->un_ent[0];
for (i = 0; i < suptr->un_cnt; i++, sunptr++) {
if (sunptr->un_id != semid)
continue;
if (semnum == -1 || sunptr->un_num == semnum) {
suptr->un_cnt--;
if (i < suptr->un_cnt) {
suptr->un_ent[i] =
suptr->un_ent[suptr->un_cnt];
continue;
}
semu_try_free(suptr);
}
if (semnum != -1)
break;
}
}
}
static int
semvalid(int semid, struct semid_kernel *semakptr)
{
return ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 ||
semakptr->u.sem_perm.seq != IPCID_TO_SEQ(semid) ? EINVAL : 0);
}
/*
* Note that the user-mode half of this passes a union, not a pointer.
*/
#ifndef _SYS_SYSPROTO_H_
struct __semctl_args {
int semid;
int semnum;
int cmd;
union semun *arg;
};
#endif
int
-__semctl(struct thread *td, struct __semctl_args *uap)
+sys___semctl(struct thread *td, struct __semctl_args *uap)
{
struct semid_ds dsbuf;
union semun arg, semun;
register_t rval;
int error;
switch (uap->cmd) {
case SEM_STAT:
case IPC_SET:
case IPC_STAT:
case GETALL:
case SETVAL:
case SETALL:
error = copyin(uap->arg, &arg, sizeof(arg));
if (error)
return (error);
break;
}
switch (uap->cmd) {
case SEM_STAT:
case IPC_STAT:
semun.buf = &dsbuf;
break;
case IPC_SET:
error = copyin(arg.buf, &dsbuf, sizeof(dsbuf));
if (error)
return (error);
semun.buf = &dsbuf;
break;
case GETALL:
case SETALL:
semun.array = arg.array;
break;
case SETVAL:
semun.val = arg.val;
break;
}
error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun,
&rval);
if (error)
return (error);
switch (uap->cmd) {
case SEM_STAT:
case IPC_STAT:
error = copyout(&dsbuf, arg.buf, sizeof(dsbuf));
break;
}
if (error == 0)
td->td_retval[0] = rval;
return (error);
}
int
kern_semctl(struct thread *td, int semid, int semnum, int cmd,
union semun *arg, register_t *rval)
{
u_short *array;
struct ucred *cred = td->td_ucred;
int i, error;
struct semid_ds *sbuf;
struct semid_kernel *semakptr;
struct mtx *sema_mtxp;
u_short usval, count;
int semidx;
DPRINTF(("call to semctl(%d, %d, %d, 0x%p)\n",
semid, semnum, cmd, arg));
if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
return (ENOSYS);
array = NULL;
switch(cmd) {
case SEM_STAT:
/*
* For this command we assume semid is an array index
* rather than an IPC id.
*/
if (semid < 0 || semid >= seminfo.semmni)
return (EINVAL);
semakptr = &sema[semid];
sema_mtxp = &sema_mtx[semid];
mtx_lock(sema_mtxp);
if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0) {
error = EINVAL;
goto done2;
}
if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
goto done2;
#ifdef MAC
error = mac_sysvsem_check_semctl(cred, semakptr, cmd);
if (error != 0)
goto done2;
#endif
bcopy(&semakptr->u, arg->buf, sizeof(struct semid_ds));
*rval = IXSEQ_TO_IPCID(semid, semakptr->u.sem_perm);
mtx_unlock(sema_mtxp);
return (0);
}
semidx = IPCID_TO_IX(semid);
if (semidx < 0 || semidx >= seminfo.semmni)
return (EINVAL);
semakptr = &sema[semidx];
sema_mtxp = &sema_mtx[semidx];
if (cmd == IPC_RMID)
mtx_lock(&sem_mtx);
mtx_lock(sema_mtxp);
#ifdef MAC
error = mac_sysvsem_check_semctl(cred, semakptr, cmd);
if (error != 0)
goto done2;
#endif
error = 0;
*rval = 0;
switch (cmd) {
case IPC_RMID:
if ((error = semvalid(semid, semakptr)) != 0)
goto done2;
if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_M)))
goto done2;
semakptr->u.sem_perm.cuid = cred->cr_uid;
semakptr->u.sem_perm.uid = cred->cr_uid;
semakptr->u.sem_perm.mode = 0;
racct_sub_cred(semakptr->cred, RACCT_NSEM, semakptr->u.sem_nsems);
crfree(semakptr->cred);
semakptr->cred = NULL;
SEMUNDO_LOCK();
semundo_clear(semidx, -1);
SEMUNDO_UNLOCK();
#ifdef MAC
mac_sysvsem_cleanup(semakptr);
#endif
wakeup(semakptr);
for (i = 0; i < seminfo.semmni; i++) {
if ((sema[i].u.sem_perm.mode & SEM_ALLOC) &&
sema[i].u.sem_base > semakptr->u.sem_base)
mtx_lock_flags(&sema_mtx[i], LOP_DUPOK);
}
for (i = semakptr->u.sem_base - sem; i < semtot; i++)
sem[i] = sem[i + semakptr->u.sem_nsems];
for (i = 0; i < seminfo.semmni; i++) {
if ((sema[i].u.sem_perm.mode & SEM_ALLOC) &&
sema[i].u.sem_base > semakptr->u.sem_base) {
sema[i].u.sem_base -= semakptr->u.sem_nsems;
mtx_unlock(&sema_mtx[i]);
}
}
semtot -= semakptr->u.sem_nsems;
break;
case IPC_SET:
if ((error = semvalid(semid, semakptr)) != 0)
goto done2;
if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_M)))
goto done2;
sbuf = arg->buf;
semakptr->u.sem_perm.uid = sbuf->sem_perm.uid;
semakptr->u.sem_perm.gid = sbuf->sem_perm.gid;
semakptr->u.sem_perm.mode = (semakptr->u.sem_perm.mode &
~0777) | (sbuf->sem_perm.mode & 0777);
semakptr->u.sem_ctime = time_second;
break;
case IPC_STAT:
if ((error = semvalid(semid, semakptr)) != 0)
goto done2;
if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
goto done2;
bcopy(&semakptr->u, arg->buf, sizeof(struct semid_ds));
break;
case GETNCNT:
if ((error = semvalid(semid, semakptr)) != 0)
goto done2;
if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
goto done2;
if (semnum < 0 || semnum >= semakptr->u.sem_nsems) {
error = EINVAL;
goto done2;
}
*rval = semakptr->u.sem_base[semnum].semncnt;
break;
case GETPID:
if ((error = semvalid(semid, semakptr)) != 0)
goto done2;
if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
goto done2;
if (semnum < 0 || semnum >= semakptr->u.sem_nsems) {
error = EINVAL;
goto done2;
}
*rval = semakptr->u.sem_base[semnum].sempid;
break;
case GETVAL:
if ((error = semvalid(semid, semakptr)) != 0)
goto done2;
if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
goto done2;
if (semnum < 0 || semnum >= semakptr->u.sem_nsems) {
error = EINVAL;
goto done2;
}
*rval = semakptr->u.sem_base[semnum].semval;
break;
case GETALL:
/*
* Unfortunately, callers of this function don't know
* in advance how many semaphores are in this set.
* While we could just allocate the maximum size array
* and pass the actual size back to the caller, that
* won't work for SETALL since we can't copyin() more
* data than the user specified as we may return a
* spurious EFAULT.
*
* Note that the number of semaphores in a set is
* fixed for the life of that set. The only way that
* the 'count' could change while we are blocked in
* malloc() is if this semaphore set were destroyed
* and a new one created with the same index.
* However, semvalid() will catch that due to the
* sequence number unless exactly 0x8000 (or a
* multiple thereof) semaphore sets for the same index
* are created and destroyed while we are in malloc!
*
*/
count = semakptr->u.sem_nsems;
mtx_unlock(sema_mtxp);
array = malloc(sizeof(*array) * count, M_TEMP, M_WAITOK);
mtx_lock(sema_mtxp);
if ((error = semvalid(semid, semakptr)) != 0)
goto done2;
KASSERT(count == semakptr->u.sem_nsems, ("nsems changed"));
if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
goto done2;
for (i = 0; i < semakptr->u.sem_nsems; i++)
array[i] = semakptr->u.sem_base[i].semval;
mtx_unlock(sema_mtxp);
error = copyout(array, arg->array, count * sizeof(*array));
mtx_lock(sema_mtxp);
break;
case GETZCNT:
if ((error = semvalid(semid, semakptr)) != 0)
goto done2;
if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
goto done2;
if (semnum < 0 || semnum >= semakptr->u.sem_nsems) {
error = EINVAL;
goto done2;
}
*rval = semakptr->u.sem_base[semnum].semzcnt;
break;
case SETVAL:
if ((error = semvalid(semid, semakptr)) != 0)
goto done2;
if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_W)))
goto done2;
if (semnum < 0 || semnum >= semakptr->u.sem_nsems) {
error = EINVAL;
goto done2;
}
if (arg->val < 0 || arg->val > seminfo.semvmx) {
error = ERANGE;
goto done2;
}
semakptr->u.sem_base[semnum].semval = arg->val;
SEMUNDO_LOCK();
semundo_clear(semidx, semnum);
SEMUNDO_UNLOCK();
wakeup(semakptr);
break;
case SETALL:
/*
* See comment on GETALL for why 'count' shouldn't change
* and why we require a userland buffer.
*/
count = semakptr->u.sem_nsems;
mtx_unlock(sema_mtxp);
array = malloc(sizeof(*array) * count, M_TEMP, M_WAITOK);
error = copyin(arg->array, array, count * sizeof(*array));
mtx_lock(sema_mtxp);
if (error)
break;
if ((error = semvalid(semid, semakptr)) != 0)
goto done2;
KASSERT(count == semakptr->u.sem_nsems, ("nsems changed"));
if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_W)))
goto done2;
for (i = 0; i < semakptr->u.sem_nsems; i++) {
usval = array[i];
if (usval > seminfo.semvmx) {
error = ERANGE;
break;
}
semakptr->u.sem_base[i].semval = usval;
}
SEMUNDO_LOCK();
semundo_clear(semidx, -1);
SEMUNDO_UNLOCK();
wakeup(semakptr);
break;
default:
error = EINVAL;
break;
}
done2:
mtx_unlock(sema_mtxp);
if (cmd == IPC_RMID)
mtx_unlock(&sem_mtx);
if (array != NULL)
free(array, M_TEMP);
return(error);
}
#ifndef _SYS_SYSPROTO_H_
struct semget_args {
key_t key;
int nsems;
int semflg;
};
#endif
int
-semget(struct thread *td, struct semget_args *uap)
+sys_semget(struct thread *td, struct semget_args *uap)
{
int semid, error = 0;
int key = uap->key;
int nsems = uap->nsems;
int semflg = uap->semflg;
struct ucred *cred = td->td_ucred;
DPRINTF(("semget(0x%x, %d, 0%o)\n", key, nsems, semflg));
if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
return (ENOSYS);
mtx_lock(&sem_mtx);
if (key != IPC_PRIVATE) {
for (semid = 0; semid < seminfo.semmni; semid++) {
if ((sema[semid].u.sem_perm.mode & SEM_ALLOC) &&
sema[semid].u.sem_perm.key == key)
break;
}
if (semid < seminfo.semmni) {
DPRINTF(("found public key\n"));
if ((error = ipcperm(td, &sema[semid].u.sem_perm,
semflg & 0700))) {
goto done2;
}
if (nsems > 0 && sema[semid].u.sem_nsems < nsems) {
DPRINTF(("too small\n"));
error = EINVAL;
goto done2;
}
if ((semflg & IPC_CREAT) && (semflg & IPC_EXCL)) {
DPRINTF(("not exclusive\n"));
error = EEXIST;
goto done2;
}
#ifdef MAC
error = mac_sysvsem_check_semget(cred, &sema[semid]);
if (error != 0)
goto done2;
#endif
goto found;
}
}
DPRINTF(("need to allocate the semid_kernel\n"));
if (key == IPC_PRIVATE || (semflg & IPC_CREAT)) {
if (nsems <= 0 || nsems > seminfo.semmsl) {
DPRINTF(("nsems out of range (0<%d<=%d)\n", nsems,
seminfo.semmsl));
error = EINVAL;
goto done2;
}
if (nsems > seminfo.semmns - semtot) {
DPRINTF((
"not enough semaphores left (need %d, got %d)\n",
nsems, seminfo.semmns - semtot));
error = ENOSPC;
goto done2;
}
for (semid = 0; semid < seminfo.semmni; semid++) {
if ((sema[semid].u.sem_perm.mode & SEM_ALLOC) == 0)
break;
}
if (semid == seminfo.semmni) {
DPRINTF(("no more semid_kernel's available\n"));
error = ENOSPC;
goto done2;
}
#ifdef RACCT
PROC_LOCK(td->td_proc);
error = racct_add(td->td_proc, RACCT_NSEM, nsems);
PROC_UNLOCK(td->td_proc);
if (error != 0) {
error = ENOSPC;
goto done2;
}
#endif
DPRINTF(("semid %d is available\n", semid));
mtx_lock(&sema_mtx[semid]);
KASSERT((sema[semid].u.sem_perm.mode & SEM_ALLOC) == 0,
("Lost semaphore %d", semid));
sema[semid].u.sem_perm.key = key;
sema[semid].u.sem_perm.cuid = cred->cr_uid;
sema[semid].u.sem_perm.uid = cred->cr_uid;
sema[semid].u.sem_perm.cgid = cred->cr_gid;
sema[semid].u.sem_perm.gid = cred->cr_gid;
sema[semid].u.sem_perm.mode = (semflg & 0777) | SEM_ALLOC;
sema[semid].cred = crhold(cred);
sema[semid].u.sem_perm.seq =
(sema[semid].u.sem_perm.seq + 1) & 0x7fff;
sema[semid].u.sem_nsems = nsems;
sema[semid].u.sem_otime = 0;
sema[semid].u.sem_ctime = time_second;
sema[semid].u.sem_base = &sem[semtot];
semtot += nsems;
bzero(sema[semid].u.sem_base,
sizeof(sema[semid].u.sem_base[0])*nsems);
#ifdef MAC
mac_sysvsem_create(cred, &sema[semid]);
#endif
mtx_unlock(&sema_mtx[semid]);
DPRINTF(("sembase = %p, next = %p\n",
sema[semid].u.sem_base, &sem[semtot]));
} else {
DPRINTF(("didn't find it and wasn't asked to create it\n"));
error = ENOENT;
goto done2;
}
found:
td->td_retval[0] = IXSEQ_TO_IPCID(semid, sema[semid].u.sem_perm);
done2:
mtx_unlock(&sem_mtx);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct semop_args {
int semid;
struct sembuf *sops;
size_t nsops;
};
#endif
int
-semop(struct thread *td, struct semop_args *uap)
+sys_semop(struct thread *td, struct semop_args *uap)
{
#define SMALL_SOPS 8
struct sembuf small_sops[SMALL_SOPS];
int semid = uap->semid;
size_t nsops = uap->nsops;
struct sembuf *sops;
struct semid_kernel *semakptr;
struct sembuf *sopptr = 0;
struct sem *semptr = 0;
struct sem_undo *suptr;
struct mtx *sema_mtxp;
size_t i, j, k;
int error;
int do_wakeup, do_undos;
unsigned short seq;
#ifdef SEM_DEBUG
sops = NULL;
#endif
DPRINTF(("call to semop(%d, %p, %u)\n", semid, sops, nsops));
if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
return (ENOSYS);
semid = IPCID_TO_IX(semid); /* Convert back to zero origin */
if (semid < 0 || semid >= seminfo.semmni)
return (EINVAL);
/* Allocate memory for sem_ops */
if (nsops <= SMALL_SOPS)
sops = small_sops;
else if (nsops > seminfo.semopm) {
DPRINTF(("too many sops (max=%d, nsops=%d)\n", seminfo.semopm,
nsops));
return (E2BIG);
} else {
#ifdef RACCT
PROC_LOCK(td->td_proc);
if (nsops > racct_get_available(td->td_proc, RACCT_NSEMOP)) {
PROC_UNLOCK(td->td_proc);
return (E2BIG);
}
PROC_UNLOCK(td->td_proc);
#endif
sops = malloc(nsops * sizeof(*sops), M_TEMP, M_WAITOK);
}
if ((error = copyin(uap->sops, sops, nsops * sizeof(sops[0]))) != 0) {
DPRINTF(("error = %d from copyin(%p, %p, %d)\n", error,
uap->sops, sops, nsops * sizeof(sops[0])));
if (sops != small_sops)
free(sops, M_SEM);
return (error);
}
semakptr = &sema[semid];
sema_mtxp = &sema_mtx[semid];
mtx_lock(sema_mtxp);
if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0) {
error = EINVAL;
goto done2;
}
seq = semakptr->u.sem_perm.seq;
if (seq != IPCID_TO_SEQ(uap->semid)) {
error = EINVAL;
goto done2;
}
/*
* Initial pass thru sops to see what permissions are needed.
* Also perform any checks that don't need repeating on each
* attempt to satisfy the request vector.
*/
j = 0; /* permission needed */
do_undos = 0;
for (i = 0; i < nsops; i++) {
sopptr = &sops[i];
if (sopptr->sem_num >= semakptr->u.sem_nsems) {
error = EFBIG;
goto done2;
}
if (sopptr->sem_flg & SEM_UNDO && sopptr->sem_op != 0)
do_undos = 1;
j |= (sopptr->sem_op == 0) ? SEM_R : SEM_A;
}
if ((error = ipcperm(td, &semakptr->u.sem_perm, j))) {
DPRINTF(("error = %d from ipaccess\n", error));
goto done2;
}
#ifdef MAC
error = mac_sysvsem_check_semop(td->td_ucred, semakptr, j);
if (error != 0)
goto done2;
#endif
/*
* Loop trying to satisfy the vector of requests.
* If we reach a point where we must wait, any requests already
* performed are rolled back and we go to sleep until some other
* process wakes us up. At this point, we start all over again.
*
* This ensures that from the perspective of other tasks, a set
* of requests is atomic (never partially satisfied).
*/
for (;;) {
do_wakeup = 0;
error = 0; /* error return if necessary */
for (i = 0; i < nsops; i++) {
sopptr = &sops[i];
semptr = &semakptr->u.sem_base[sopptr->sem_num];
DPRINTF((
"semop: semakptr=%p, sem_base=%p, "
"semptr=%p, sem[%d]=%d : op=%d, flag=%s\n",
semakptr, semakptr->u.sem_base, semptr,
sopptr->sem_num, semptr->semval, sopptr->sem_op,
(sopptr->sem_flg & IPC_NOWAIT) ?
"nowait" : "wait"));
if (sopptr->sem_op < 0) {
if (semptr->semval + sopptr->sem_op < 0) {
DPRINTF(("semop: can't do it now\n"));
break;
} else {
semptr->semval += sopptr->sem_op;
if (semptr->semval == 0 &&
semptr->semzcnt > 0)
do_wakeup = 1;
}
} else if (sopptr->sem_op == 0) {
if (semptr->semval != 0) {
DPRINTF(("semop: not zero now\n"));
break;
}
} else if (semptr->semval + sopptr->sem_op >
seminfo.semvmx) {
error = ERANGE;
break;
} else {
if (semptr->semncnt > 0)
do_wakeup = 1;
semptr->semval += sopptr->sem_op;
}
}
/*
* Did we get through the entire vector?
*/
if (i >= nsops)
goto done;
/*
* No ... rollback anything that we've already done
*/
DPRINTF(("semop: rollback 0 through %d\n", i-1));
for (j = 0; j < i; j++)
semakptr->u.sem_base[sops[j].sem_num].semval -=
sops[j].sem_op;
/* If we detected an error, return it */
if (error != 0)
goto done2;
/*
* If the request that we couldn't satisfy has the
* NOWAIT flag set then return with EAGAIN.
*/
if (sopptr->sem_flg & IPC_NOWAIT) {
error = EAGAIN;
goto done2;
}
if (sopptr->sem_op == 0)
semptr->semzcnt++;
else
semptr->semncnt++;
DPRINTF(("semop: good night!\n"));
error = msleep(semakptr, sema_mtxp, (PZERO - 4) | PCATCH,
"semwait", 0);
DPRINTF(("semop: good morning (error=%d)!\n", error));
/* return code is checked below, after sem[nz]cnt-- */
/*
* Make sure that the semaphore still exists
*/
seq = semakptr->u.sem_perm.seq;
if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 ||
seq != IPCID_TO_SEQ(uap->semid)) {
error = EIDRM;
goto done2;
}
/*
* Renew the semaphore's pointer after wakeup since
* during msleep sem_base may have been modified and semptr
* is not valid any more
*/
semptr = &semakptr->u.sem_base[sopptr->sem_num];
/*
* The semaphore is still alive. Readjust the count of
* waiting processes.
*/
if (sopptr->sem_op == 0)
semptr->semzcnt--;
else
semptr->semncnt--;
/*
* Is it really morning, or was our sleep interrupted?
* (Delayed check of msleep() return code because we
* need to decrement sem[nz]cnt either way.)
*/
if (error != 0) {
error = EINTR;
goto done2;
}
DPRINTF(("semop: good morning!\n"));
}
done:
/*
* Process any SEM_UNDO requests.
*/
if (do_undos) {
SEMUNDO_LOCK();
suptr = NULL;
for (i = 0; i < nsops; i++) {
/*
* We only need to deal with SEM_UNDO's for non-zero
* op's.
*/
int adjval;
if ((sops[i].sem_flg & SEM_UNDO) == 0)
continue;
adjval = sops[i].sem_op;
if (adjval == 0)
continue;
error = semundo_adjust(td, &suptr, semid, seq,
sops[i].sem_num, -adjval);
if (error == 0)
continue;
/*
* Oh-Oh! We ran out of either sem_undo's or undo's.
* Rollback the adjustments to this point and then
* rollback the semaphore ups and down so we can return
* with an error with all structures restored. We
* rollback the undo's in the exact reverse order that
* we applied them. This guarantees that we won't run
* out of space as we roll things back out.
*/
for (j = 0; j < i; j++) {
k = i - j - 1;
if ((sops[k].sem_flg & SEM_UNDO) == 0)
continue;
adjval = sops[k].sem_op;
if (adjval == 0)
continue;
if (semundo_adjust(td, &suptr, semid, seq,
sops[k].sem_num, adjval) != 0)
panic("semop - can't undo undos");
}
for (j = 0; j < nsops; j++)
semakptr->u.sem_base[sops[j].sem_num].semval -=
sops[j].sem_op;
DPRINTF(("error = %d from semundo_adjust\n", error));
SEMUNDO_UNLOCK();
goto done2;
} /* loop through the sops */
SEMUNDO_UNLOCK();
} /* if (do_undos) */
/* We're definitely done - set the sempid's and time */
for (i = 0; i < nsops; i++) {
sopptr = &sops[i];
semptr = &semakptr->u.sem_base[sopptr->sem_num];
semptr->sempid = td->td_proc->p_pid;
}
semakptr->u.sem_otime = time_second;
/*
* Do a wakeup if any semaphore was up'd whilst something was
* sleeping on it.
*/
if (do_wakeup) {
DPRINTF(("semop: doing wakeup\n"));
wakeup(semakptr);
DPRINTF(("semop: back from wakeup\n"));
}
DPRINTF(("semop: done\n"));
td->td_retval[0] = 0;
done2:
mtx_unlock(sema_mtxp);
if (sops != small_sops)
free(sops, M_SEM);
return (error);
}
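A minimal userland sketch (not part of the diff) of the semop() path above, using a SEM_UNDO'd up/down pair that exercises the undo bookkeeping handled by semundo_adjust() and semexit_myhook(); the function name sem_example, the key 0x4242, and the permissions are arbitrary illustrative values. The semaphore is created with semval 0 (sem_base is bzero'd by semget above), so the +1 operation is done first and never blocks.

#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/sem.h>
#include <err.h>

int
sem_example(void)
{
	struct sembuf up   = { .sem_num = 0, .sem_op =  1, .sem_flg = SEM_UNDO };
	struct sembuf down = { .sem_num = 0, .sem_op = -1, .sem_flg = SEM_UNDO };
	int id;

	/* One semaphore, created with semval == 0. */
	if ((id = semget((key_t)0x4242, 1, IPC_CREAT | 0600)) == -1)
		err(1, "semget");
	if (semop(id, &up, 1) == -1)	/* +1 with SEM_UNDO: kernel records a -1 undo entry */
		err(1, "semop up");
	if (semop(id, &down, 1) == -1)	/* -1: would sleep in msleep() if semval were still 0 */
		err(1, "semop down");
	return (semctl(id, 0, IPC_RMID));
}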
/*
* Go through the undo structures for this process and apply the adjustments to
* semaphores.
*/
static void
semexit_myhook(void *arg, struct proc *p)
{
struct sem_undo *suptr;
struct semid_kernel *semakptr;
struct mtx *sema_mtxp;
int semid, semnum, adjval, ix;
unsigned short seq;
/*
* Go through the chain of undo vectors looking for one
* associated with this process.
*/
SEMUNDO_LOCK();
LIST_FOREACH(suptr, &semu_list, un_next) {
if (suptr->un_proc == p)
break;
}
if (suptr == NULL) {
SEMUNDO_UNLOCK();
return;
}
LIST_REMOVE(suptr, un_next);
DPRINTF(("proc @%p has undo structure with %d entries\n", p,
suptr->un_cnt));
/*
* If there are any active undo elements then process them.
*/
if (suptr->un_cnt > 0) {
SEMUNDO_UNLOCK();
for (ix = 0; ix < suptr->un_cnt; ix++) {
semid = suptr->un_ent[ix].un_id;
semnum = suptr->un_ent[ix].un_num;
adjval = suptr->un_ent[ix].un_adjval;
seq = suptr->un_ent[ix].un_seq;
semakptr = &sema[semid];
sema_mtxp = &sema_mtx[semid];
mtx_lock(sema_mtxp);
if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 ||
(semakptr->u.sem_perm.seq != seq)) {
mtx_unlock(sema_mtxp);
continue;
}
if (semnum >= semakptr->u.sem_nsems)
panic("semexit - semnum out of range");
DPRINTF((
"semexit: %p id=%d num=%d(adj=%d) ; sem=%d\n",
suptr->un_proc, suptr->un_ent[ix].un_id,
suptr->un_ent[ix].un_num,
suptr->un_ent[ix].un_adjval,
semakptr->u.sem_base[semnum].semval));
if (adjval < 0 && semakptr->u.sem_base[semnum].semval <
-adjval)
semakptr->u.sem_base[semnum].semval = 0;
else
semakptr->u.sem_base[semnum].semval += adjval;
wakeup(semakptr);
DPRINTF(("semexit: back from wakeup\n"));
mtx_unlock(sema_mtxp);
}
SEMUNDO_LOCK();
}
/*
* Deallocate the undo vector.
*/
DPRINTF(("removing vector\n"));
suptr->un_proc = NULL;
suptr->un_cnt = 0;
LIST_INSERT_HEAD(&semu_free_list, suptr, un_next);
SEMUNDO_UNLOCK();
}
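/*
 * Illustrative sketch (assumed names, not part of this hunk): a hook like
 * semexit_myhook() is normally wired to process teardown through the
 * eventhandler machinery at module initialization time, roughly as below.
 */
#if 0
#include <sys/eventhandler.h>

static eventhandler_tag semexit_tag;	/* hypothetical tag variable */

static void
sem_register_exit_hook(void)
{
	/* Run semexit_myhook(NULL, p) whenever a process exits. */
	semexit_tag = EVENTHANDLER_REGISTER(process_exit, semexit_myhook,
	    NULL, EVENTHANDLER_PRI_ANY);
}
#endif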
static int
sysctl_sema(SYSCTL_HANDLER_ARGS)
{
return (SYSCTL_OUT(req, sema,
sizeof(struct semid_kernel) * seminfo.semmni));
}
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
/* XXX casting to (sy_call_t *) is bogus, as usual. */
static sy_call_t *semcalls[] = {
- (sy_call_t *)freebsd7___semctl, (sy_call_t *)semget,
- (sy_call_t *)semop
+ (sy_call_t *)freebsd7___semctl, (sy_call_t *)sys_semget,
+ (sy_call_t *)sys_semop
};
/*
* Entry point for all SEM calls.
*/
int
-semsys(td, uap)
+sys_semsys(td, uap)
struct thread *td;
/* XXX actually varargs. */
struct semsys_args /* {
int which;
int a2;
int a3;
int a4;
int a5;
} */ *uap;
{
int error;
if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
return (ENOSYS);
if (uap->which < 0 ||
uap->which >= sizeof(semcalls)/sizeof(semcalls[0]))
return (EINVAL);
error = (*semcalls[uap->which])(td, &uap->a2);
return (error);
}
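/*
 * Illustrative userland sketch (assumed example, not part of this file):
 * code normally reaches sys_semget()/sys_semop() through the libc
 * wrappers rather than the legacy semsys() multiplexer dispatched above.
 * The SEM_UNDO flag below is what creates the undo entries that
 * semexit_myhook() later rolls back.  Error handling is abbreviated.
 */
#if 0
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/sem.h>

static int
bump_with_undo(key_t key)
{
	struct sembuf op;
	int semid;

	semid = semget(key, 1, IPC_CREAT | 0600);
	if (semid == -1)
		return (-1);
	op.sem_num = 0;
	op.sem_op = 1;		/* "up" the semaphore by one */
	op.sem_flg = SEM_UNDO;	/* undone automatically at process exit */
	return (semop(semid, &op, 1));
}
#endif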
#ifndef CP
#define CP(src, dst, fld) do { (dst).fld = (src).fld; } while (0)
#endif
#ifndef _SYS_SYSPROTO_H_
struct freebsd7___semctl_args {
int semid;
int semnum;
int cmd;
union semun_old *arg;
};
#endif
int
freebsd7___semctl(struct thread *td, struct freebsd7___semctl_args *uap)
{
struct semid_ds_old dsold;
struct semid_ds dsbuf;
union semun_old arg;
union semun semun;
register_t rval;
int error;
switch (uap->cmd) {
case SEM_STAT:
case IPC_SET:
case IPC_STAT:
case GETALL:
case SETVAL:
case SETALL:
error = copyin(uap->arg, &arg, sizeof(arg));
if (error)
return (error);
break;
}
switch (uap->cmd) {
case SEM_STAT:
case IPC_STAT:
semun.buf = &dsbuf;
break;
case IPC_SET:
error = copyin(arg.buf, &dsold, sizeof(dsold));
if (error)
return (error);
ipcperm_old2new(&dsold.sem_perm, &dsbuf.sem_perm);
CP(dsold, dsbuf, sem_base);
CP(dsold, dsbuf, sem_nsems);
CP(dsold, dsbuf, sem_otime);
CP(dsold, dsbuf, sem_ctime);
semun.buf = &dsbuf;
break;
case GETALL:
case SETALL:
semun.array = arg.array;
break;
case SETVAL:
semun.val = arg.val;
break;
}
error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun,
&rval);
if (error)
return (error);
switch (uap->cmd) {
case SEM_STAT:
case IPC_STAT:
bzero(&dsold, sizeof(dsold));
ipcperm_new2old(&dsbuf.sem_perm, &dsold.sem_perm);
CP(dsbuf, dsold, sem_base);
CP(dsbuf, dsold, sem_nsems);
CP(dsbuf, dsold, sem_otime);
CP(dsbuf, dsold, sem_ctime);
error = copyout(&dsold, arg.buf, sizeof(dsold));
break;
}
if (error == 0)
td->td_retval[0] = rval;
return (error);
}
#endif /* COMPAT_FREEBSD{4,5,6,7} */
#ifdef COMPAT_FREEBSD32
int
freebsd32_semsys(struct thread *td, struct freebsd32_semsys_args *uap)
{
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
switch (uap->which) {
case 0:
return (freebsd7_freebsd32_semctl(td,
(struct freebsd7_freebsd32_semctl_args *)&uap->a2));
default:
- return (semsys(td, (struct semsys_args *)uap));
+ return (sys_semsys(td, (struct semsys_args *)uap));
}
#else
return (nosys(td, NULL));
#endif
}
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
int
freebsd7_freebsd32_semctl(struct thread *td,
struct freebsd7_freebsd32_semctl_args *uap)
{
struct semid_ds32_old dsbuf32;
struct semid_ds dsbuf;
union semun semun;
union semun32 arg;
register_t rval;
int error;
switch (uap->cmd) {
case SEM_STAT:
case IPC_SET:
case IPC_STAT:
case GETALL:
case SETVAL:
case SETALL:
error = copyin(uap->arg, &arg, sizeof(arg));
if (error)
return (error);
break;
}
switch (uap->cmd) {
case SEM_STAT:
case IPC_STAT:
semun.buf = &dsbuf;
break;
case IPC_SET:
error = copyin(PTRIN(arg.buf), &dsbuf32, sizeof(dsbuf32));
if (error)
return (error);
freebsd32_ipcperm_old_in(&dsbuf32.sem_perm, &dsbuf.sem_perm);
PTRIN_CP(dsbuf32, dsbuf, sem_base);
CP(dsbuf32, dsbuf, sem_nsems);
CP(dsbuf32, dsbuf, sem_otime);
CP(dsbuf32, dsbuf, sem_ctime);
semun.buf = &dsbuf;
break;
case GETALL:
case SETALL:
semun.array = PTRIN(arg.array);
break;
case SETVAL:
semun.val = arg.val;
break;
}
error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun,
&rval);
if (error)
return (error);
switch (uap->cmd) {
case SEM_STAT:
case IPC_STAT:
bzero(&dsbuf32, sizeof(dsbuf32));
freebsd32_ipcperm_old_out(&dsbuf.sem_perm, &dsbuf32.sem_perm);
PTROUT_CP(dsbuf, dsbuf32, sem_base);
CP(dsbuf, dsbuf32, sem_nsems);
CP(dsbuf, dsbuf32, sem_otime);
CP(dsbuf, dsbuf32, sem_ctime);
error = copyout(&dsbuf32, PTRIN(arg.buf), sizeof(dsbuf32));
break;
}
if (error == 0)
td->td_retval[0] = rval;
return (error);
}
#endif
int
freebsd32_semctl(struct thread *td, struct freebsd32_semctl_args *uap)
{
struct semid_ds32 dsbuf32;
struct semid_ds dsbuf;
union semun semun;
union semun32 arg;
register_t rval;
int error;
switch (uap->cmd) {
case SEM_STAT:
case IPC_SET:
case IPC_STAT:
case GETALL:
case SETVAL:
case SETALL:
error = copyin(uap->arg, &arg, sizeof(arg));
if (error)
return (error);
break;
}
switch (uap->cmd) {
case SEM_STAT:
case IPC_STAT:
semun.buf = &dsbuf;
break;
case IPC_SET:
error = copyin(PTRIN(arg.buf), &dsbuf32, sizeof(dsbuf32));
if (error)
return (error);
freebsd32_ipcperm_in(&dsbuf32.sem_perm, &dsbuf.sem_perm);
PTRIN_CP(dsbuf32, dsbuf, sem_base);
CP(dsbuf32, dsbuf, sem_nsems);
CP(dsbuf32, dsbuf, sem_otime);
CP(dsbuf32, dsbuf, sem_ctime);
semun.buf = &dsbuf;
break;
case GETALL:
case SETALL:
semun.array = PTRIN(arg.array);
break;
case SETVAL:
semun.val = arg.val;
break;
}
error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun,
&rval);
if (error)
return (error);
switch (uap->cmd) {
case SEM_STAT:
case IPC_STAT:
bzero(&dsbuf32, sizeof(dsbuf32));
freebsd32_ipcperm_out(&dsbuf.sem_perm, &dsbuf32.sem_perm);
PTROUT_CP(dsbuf, dsbuf32, sem_base);
CP(dsbuf, dsbuf32, sem_nsems);
CP(dsbuf, dsbuf32, sem_otime);
CP(dsbuf, dsbuf32, sem_ctime);
error = copyout(&dsbuf32, PTRIN(arg.buf), sizeof(dsbuf32));
break;
}
if (error == 0)
td->td_retval[0] = rval;
return (error);
}
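/*
 * Note on the 32-bit thunks above (illustrative): freebsd32_semctl() and
 * freebsd7_freebsd32_semctl() follow the usual compat pattern of copying
 * in the 32-bit layout, widening it into the native struct, calling
 * kern_semctl(), and narrowing the result again for copyout.  The
 * PTRIN_CP()/PTROUT_CP() helpers used for sem_base behave roughly like
 * "(dst).fld = PTRIN((src).fld)", i.e. a plain field copy combined with a
 * 32-bit-to-native pointer conversion.
 */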
#endif /* COMPAT_FREEBSD32 */
Index: head/sys/kern/sysv_shm.c
===================================================================
--- head/sys/kern/sysv_shm.c (revision 225616)
+++ head/sys/kern/sysv_shm.c (revision 225617)
@@ -1,1408 +1,1408 @@
/* $NetBSD: sysv_shm.c,v 1.23 1994/07/04 23:25:12 glass Exp $ */
/*-
* Copyright (c) 1994 Adam Glass and Charles Hannum. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Adam Glass and Charles
* Hannum.
* 4. The names of the authors may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 2003-2005 McAfee, Inc.
* All rights reserved.
*
* This software was developed for the FreeBSD Project in part by McAfee
* Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR
* contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research
* program.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_sysvipc.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/sysctl.h>
#include <sys/shm.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/jail.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_object.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
FEATURE(sysv_shm, "System V shared memory segments support");
static MALLOC_DEFINE(M_SHM, "shm", "SVID compatible shared memory segments");
static int shmget_allocate_segment(struct thread *td,
struct shmget_args *uap, int mode);
static int shmget_existing(struct thread *td, struct shmget_args *uap,
int mode, int segnum);
#define SHMSEG_FREE 0x0200
#define SHMSEG_REMOVED 0x0400
#define SHMSEG_ALLOCATED 0x0800
#define SHMSEG_WANTED 0x1000
static int shm_last_free, shm_nused, shmalloced;
vm_size_t shm_committed;
static struct shmid_kernel *shmsegs;
struct shmmap_state {
vm_offset_t va;
int shmid;
};
static void shm_deallocate_segment(struct shmid_kernel *);
static int shm_find_segment_by_key(key_t);
static struct shmid_kernel *shm_find_segment_by_shmid(int);
static struct shmid_kernel *shm_find_segment_by_shmidx(int);
static int shm_delete_mapping(struct vmspace *vm, struct shmmap_state *);
static void shmrealloc(void);
static int shminit(void);
static int sysvshm_modload(struct module *, int, void *);
static int shmunload(void);
static void shmexit_myhook(struct vmspace *vm);
static void shmfork_myhook(struct proc *p1, struct proc *p2);
static int sysctl_shmsegs(SYSCTL_HANDLER_ARGS);
/*
* Tuneable values.
*/
#ifndef SHMMAXPGS
#define SHMMAXPGS 131072 /* Note: sysv shared memory is swap backed. */
#endif
#ifndef SHMMAX
#define SHMMAX (SHMMAXPGS*PAGE_SIZE)
#endif
#ifndef SHMMIN
#define SHMMIN 1
#endif
#ifndef SHMMNI
#define SHMMNI 192
#endif
#ifndef SHMSEG
#define SHMSEG 128
#endif
#ifndef SHMALL
#define SHMALL (SHMMAXPGS)
#endif
struct shminfo shminfo = {
SHMMAX,
SHMMIN,
SHMMNI,
SHMSEG,
SHMALL
};
static int shm_use_phys;
static int shm_allow_removed;
SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmax, CTLFLAG_RW, &shminfo.shmmax, 0,
"Maximum shared memory segment size");
SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmin, CTLFLAG_RW, &shminfo.shmmin, 0,
"Minimum shared memory segment size");
SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmni, CTLFLAG_RDTUN, &shminfo.shmmni, 0,
"Number of shared memory identifiers");
SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmseg, CTLFLAG_RDTUN, &shminfo.shmseg, 0,
"Number of segments per process");
SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmall, CTLFLAG_RW, &shminfo.shmall, 0,
"Maximum number of pages available for shared memory");
SYSCTL_INT(_kern_ipc, OID_AUTO, shm_use_phys, CTLFLAG_RW,
&shm_use_phys, 0, "Enable/Disable locking of shared memory pages in core");
SYSCTL_INT(_kern_ipc, OID_AUTO, shm_allow_removed, CTLFLAG_RW,
&shm_allow_removed, 0,
"Enable/Disable attachment to attached segments marked for removal");
SYSCTL_PROC(_kern_ipc, OID_AUTO, shmsegs, CTLTYPE_OPAQUE | CTLFLAG_RD,
NULL, 0, sysctl_shmsegs, "",
"Current number of shared memory segments allocated");
static int
shm_find_segment_by_key(key)
key_t key;
{
int i;
for (i = 0; i < shmalloced; i++)
if ((shmsegs[i].u.shm_perm.mode & SHMSEG_ALLOCATED) &&
shmsegs[i].u.shm_perm.key == key)
return (i);
return (-1);
}
static struct shmid_kernel *
shm_find_segment_by_shmid(int shmid)
{
int segnum;
struct shmid_kernel *shmseg;
segnum = IPCID_TO_IX(shmid);
if (segnum < 0 || segnum >= shmalloced)
return (NULL);
shmseg = &shmsegs[segnum];
if ((shmseg->u.shm_perm.mode & SHMSEG_ALLOCATED) == 0 ||
(!shm_allow_removed &&
(shmseg->u.shm_perm.mode & SHMSEG_REMOVED) != 0) ||
shmseg->u.shm_perm.seq != IPCID_TO_SEQ(shmid))
return (NULL);
return (shmseg);
}
static struct shmid_kernel *
shm_find_segment_by_shmidx(int segnum)
{
struct shmid_kernel *shmseg;
if (segnum < 0 || segnum >= shmalloced)
return (NULL);
shmseg = &shmsegs[segnum];
if ((shmseg->u.shm_perm.mode & SHMSEG_ALLOCATED) == 0 ||
(!shm_allow_removed &&
(shmseg->u.shm_perm.mode & SHMSEG_REMOVED) != 0))
return (NULL);
return (shmseg);
}
static void
shm_deallocate_segment(shmseg)
struct shmid_kernel *shmseg;
{
vm_size_t size;
GIANT_REQUIRED;
vm_object_deallocate(shmseg->object);
shmseg->object = NULL;
size = round_page(shmseg->u.shm_segsz);
shm_committed -= btoc(size);
shm_nused--;
shmseg->u.shm_perm.mode = SHMSEG_FREE;
#ifdef MAC
mac_sysvshm_cleanup(shmseg);
#endif
racct_sub_cred(shmseg->cred, RACCT_NSHM, 1);
racct_sub_cred(shmseg->cred, RACCT_SHMSIZE, size);
crfree(shmseg->cred);
shmseg->cred = NULL;
}
static int
shm_delete_mapping(struct vmspace *vm, struct shmmap_state *shmmap_s)
{
struct shmid_kernel *shmseg;
int segnum, result;
vm_size_t size;
GIANT_REQUIRED;
segnum = IPCID_TO_IX(shmmap_s->shmid);
shmseg = &shmsegs[segnum];
size = round_page(shmseg->u.shm_segsz);
result = vm_map_remove(&vm->vm_map, shmmap_s->va, shmmap_s->va + size);
if (result != KERN_SUCCESS)
return (EINVAL);
shmmap_s->shmid = -1;
shmseg->u.shm_dtime = time_second;
if ((--shmseg->u.shm_nattch <= 0) &&
(shmseg->u.shm_perm.mode & SHMSEG_REMOVED)) {
shm_deallocate_segment(shmseg);
shm_last_free = segnum;
}
return (0);
}
#ifndef _SYS_SYSPROTO_H_
struct shmdt_args {
const void *shmaddr;
};
#endif
int
-shmdt(td, uap)
+sys_shmdt(td, uap)
struct thread *td;
struct shmdt_args *uap;
{
struct proc *p = td->td_proc;
struct shmmap_state *shmmap_s;
#ifdef MAC
struct shmid_kernel *shmsegptr;
#endif
int i;
int error = 0;
if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
return (ENOSYS);
mtx_lock(&Giant);
shmmap_s = p->p_vmspace->vm_shm;
if (shmmap_s == NULL) {
error = EINVAL;
goto done2;
}
for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) {
if (shmmap_s->shmid != -1 &&
shmmap_s->va == (vm_offset_t)uap->shmaddr) {
break;
}
}
if (i == shminfo.shmseg) {
error = EINVAL;
goto done2;
}
#ifdef MAC
shmsegptr = &shmsegs[IPCID_TO_IX(shmmap_s->shmid)];
error = mac_sysvshm_check_shmdt(td->td_ucred, shmsegptr);
if (error != 0)
goto done2;
#endif
error = shm_delete_mapping(p->p_vmspace, shmmap_s);
done2:
mtx_unlock(&Giant);
return (error);
}
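/*
 * Illustrative userland sketch (assumed example, not part of this file):
 * the attach/detach pair handled by sys_shmdt() above and kern_shmat()
 * below.  Error handling is abbreviated.
 */
#if 0
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>

static int
touch_segment(key_t key)
{
	void *p;
	int shmid;

	shmid = shmget(key, 4096, IPC_CREAT | 0600);
	if (shmid == -1)
		return (-1);
	p = shmat(shmid, NULL, 0);	/* let the kernel pick the address */
	if (p == (void *)-1)
		return (-1);
	((char *)p)[0] = 1;		/* visible to every other attacher */
	return (shmdt(p));
}
#endif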
#ifndef _SYS_SYSPROTO_H_
struct shmat_args {
int shmid;
const void *shmaddr;
int shmflg;
};
#endif
int
kern_shmat(td, shmid, shmaddr, shmflg)
struct thread *td;
int shmid;
const void *shmaddr;
int shmflg;
{
struct proc *p = td->td_proc;
int i, flags;
struct shmid_kernel *shmseg;
struct shmmap_state *shmmap_s = NULL;
vm_offset_t attach_va;
vm_prot_t prot;
vm_size_t size;
int rv;
int error = 0;
if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
return (ENOSYS);
mtx_lock(&Giant);
shmmap_s = p->p_vmspace->vm_shm;
if (shmmap_s == NULL) {
shmmap_s = malloc(shminfo.shmseg * sizeof(struct shmmap_state),
M_SHM, M_WAITOK);
for (i = 0; i < shminfo.shmseg; i++)
shmmap_s[i].shmid = -1;
p->p_vmspace->vm_shm = shmmap_s;
}
shmseg = shm_find_segment_by_shmid(shmid);
if (shmseg == NULL) {
error = EINVAL;
goto done2;
}
error = ipcperm(td, &shmseg->u.shm_perm,
(shmflg & SHM_RDONLY) ? IPC_R : IPC_R|IPC_W);
if (error)
goto done2;
#ifdef MAC
error = mac_sysvshm_check_shmat(td->td_ucred, shmseg, shmflg);
if (error != 0)
goto done2;
#endif
for (i = 0; i < shminfo.shmseg; i++) {
if (shmmap_s->shmid == -1)
break;
shmmap_s++;
}
if (i >= shminfo.shmseg) {
error = EMFILE;
goto done2;
}
size = round_page(shmseg->u.shm_segsz);
prot = VM_PROT_READ;
if ((shmflg & SHM_RDONLY) == 0)
prot |= VM_PROT_WRITE;
flags = MAP_ANON | MAP_SHARED;
if (shmaddr) {
flags |= MAP_FIXED;
if (shmflg & SHM_RND) {
attach_va = (vm_offset_t)shmaddr & ~(SHMLBA-1);
} else if (((vm_offset_t)shmaddr & (SHMLBA-1)) == 0) {
attach_va = (vm_offset_t)shmaddr;
} else {
error = EINVAL;
goto done2;
}
} else {
/*
* This is just a hint to vm_map_find() about where to
* put it.
*/
PROC_LOCK(p);
attach_va = round_page((vm_offset_t)p->p_vmspace->vm_daddr +
lim_max(p, RLIMIT_DATA));
PROC_UNLOCK(p);
}
vm_object_reference(shmseg->object);
rv = vm_map_find(&p->p_vmspace->vm_map, shmseg->object,
0, &attach_va, size, (flags & MAP_FIXED) ? VMFS_NO_SPACE :
VMFS_ANY_SPACE, prot, prot, 0);
if (rv != KERN_SUCCESS) {
vm_object_deallocate(shmseg->object);
error = ENOMEM;
goto done2;
}
vm_map_inherit(&p->p_vmspace->vm_map,
attach_va, attach_va + size, VM_INHERIT_SHARE);
shmmap_s->va = attach_va;
shmmap_s->shmid = shmid;
shmseg->u.shm_lpid = p->p_pid;
shmseg->u.shm_atime = time_second;
shmseg->u.shm_nattch++;
td->td_retval[0] = attach_va;
done2:
mtx_unlock(&Giant);
return (error);
}
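/*
 * Worked example of the shmaddr handling above (illustrative; SHMLBA is
 * the page size on most platforms): with SHM_RND a request for address
 * 0x123456 is rounded down via "addr & ~(SHMLBA - 1)", giving 0x123000
 * for a 4 KB SHMLBA.  Without SHM_RND the same address is rejected with
 * EINVAL because its low bits are not zero; only exactly aligned
 * addresses are mapped with MAP_FIXED.
 */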
int
-shmat(td, uap)
+sys_shmat(td, uap)
struct thread *td;
struct shmat_args *uap;
{
return kern_shmat(td, uap->shmid, uap->shmaddr, uap->shmflg);
}
int
kern_shmctl(td, shmid, cmd, buf, bufsz)
struct thread *td;
int shmid;
int cmd;
void *buf;
size_t *bufsz;
{
int error = 0;
struct shmid_kernel *shmseg;
if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
return (ENOSYS);
mtx_lock(&Giant);
switch (cmd) {
/*
* It is possible that kern_shmctl is being called from the Linux ABI
* layer, in which case, we will need to implement IPC_INFO. It should
* be noted that other shmctl calls will be funneled through here for
 * Linux binaries as well.
*
* NB: The Linux ABI layer will convert this data to structure(s) more
* consistent with the Linux ABI.
*/
case IPC_INFO:
memcpy(buf, &shminfo, sizeof(shminfo));
if (bufsz)
*bufsz = sizeof(shminfo);
td->td_retval[0] = shmalloced;
goto done2;
case SHM_INFO: {
struct shm_info shm_info;
shm_info.used_ids = shm_nused;
shm_info.shm_rss = 0; /*XXX where to get from ? */
shm_info.shm_tot = 0; /*XXX where to get from ? */
shm_info.shm_swp = 0; /*XXX where to get from ? */
shm_info.swap_attempts = 0; /*XXX where to get from ? */
shm_info.swap_successes = 0; /*XXX where to get from ? */
memcpy(buf, &shm_info, sizeof(shm_info));
if (bufsz)
*bufsz = sizeof(shm_info);
td->td_retval[0] = shmalloced;
goto done2;
}
}
if (cmd == SHM_STAT)
shmseg = shm_find_segment_by_shmidx(shmid);
else
shmseg = shm_find_segment_by_shmid(shmid);
if (shmseg == NULL) {
error = EINVAL;
goto done2;
}
#ifdef MAC
error = mac_sysvshm_check_shmctl(td->td_ucred, shmseg, cmd);
if (error != 0)
goto done2;
#endif
switch (cmd) {
case SHM_STAT:
case IPC_STAT:
error = ipcperm(td, &shmseg->u.shm_perm, IPC_R);
if (error)
goto done2;
memcpy(buf, &shmseg->u, sizeof(struct shmid_ds));
if (bufsz)
*bufsz = sizeof(struct shmid_ds);
if (cmd == SHM_STAT)
td->td_retval[0] = IXSEQ_TO_IPCID(shmid, shmseg->u.shm_perm);
break;
case IPC_SET: {
struct shmid_ds *shmid;
shmid = (struct shmid_ds *)buf;
error = ipcperm(td, &shmseg->u.shm_perm, IPC_M);
if (error)
goto done2;
shmseg->u.shm_perm.uid = shmid->shm_perm.uid;
shmseg->u.shm_perm.gid = shmid->shm_perm.gid;
shmseg->u.shm_perm.mode =
(shmseg->u.shm_perm.mode & ~ACCESSPERMS) |
(shmid->shm_perm.mode & ACCESSPERMS);
shmseg->u.shm_ctime = time_second;
break;
}
case IPC_RMID:
error = ipcperm(td, &shmseg->u.shm_perm, IPC_M);
if (error)
goto done2;
shmseg->u.shm_perm.key = IPC_PRIVATE;
shmseg->u.shm_perm.mode |= SHMSEG_REMOVED;
if (shmseg->u.shm_nattch <= 0) {
shm_deallocate_segment(shmseg);
shm_last_free = IPCID_TO_IX(shmid);
}
break;
#if 0
case SHM_LOCK:
case SHM_UNLOCK:
#endif
default:
error = EINVAL;
break;
}
done2:
mtx_unlock(&Giant);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct shmctl_args {
int shmid;
int cmd;
struct shmid_ds *buf;
};
#endif
int
-shmctl(td, uap)
+sys_shmctl(td, uap)
struct thread *td;
struct shmctl_args *uap;
{
int error = 0;
struct shmid_ds buf;
size_t bufsz;
/*
 * The only reason IPC_INFO, SHM_INFO, SHM_STAT exist is to support
 * Linux binaries.  If we see the call come through the FreeBSD ABI,
 * return an error back to the user since we do not support this.
*/
if (uap->cmd == IPC_INFO || uap->cmd == SHM_INFO ||
uap->cmd == SHM_STAT)
return (EINVAL);
/* IPC_SET needs to copyin the buffer before calling kern_shmctl */
if (uap->cmd == IPC_SET) {
if ((error = copyin(uap->buf, &buf, sizeof(struct shmid_ds))))
goto done;
}
error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&buf, &bufsz);
if (error)
goto done;
/* Cases in which we need to copyout */
switch (uap->cmd) {
case IPC_STAT:
error = copyout(&buf, uap->buf, bufsz);
break;
}
done:
if (error) {
/* Invalidate the return value */
td->td_retval[0] = -1;
}
return (error);
}
static int
shmget_existing(td, uap, mode, segnum)
struct thread *td;
struct shmget_args *uap;
int mode;
int segnum;
{
struct shmid_kernel *shmseg;
int error;
shmseg = &shmsegs[segnum];
if (shmseg->u.shm_perm.mode & SHMSEG_REMOVED) {
/*
* This segment is in the process of being allocated. Wait
* until it's done, and look the key up again (in case the
* allocation failed or it was freed).
*/
shmseg->u.shm_perm.mode |= SHMSEG_WANTED;
error = tsleep(shmseg, PLOCK | PCATCH, "shmget", 0);
if (error)
return (error);
return (EAGAIN);
}
if ((uap->shmflg & (IPC_CREAT | IPC_EXCL)) == (IPC_CREAT | IPC_EXCL))
return (EEXIST);
#ifdef MAC
error = mac_sysvshm_check_shmget(td->td_ucred, shmseg, uap->shmflg);
if (error != 0)
return (error);
#endif
if (uap->size != 0 && uap->size > shmseg->u.shm_segsz)
return (EINVAL);
td->td_retval[0] = IXSEQ_TO_IPCID(segnum, shmseg->u.shm_perm);
return (0);
}
static int
shmget_allocate_segment(td, uap, mode)
struct thread *td;
struct shmget_args *uap;
int mode;
{
int i, segnum, shmid;
size_t size;
struct ucred *cred = td->td_ucred;
struct shmid_kernel *shmseg;
vm_object_t shm_object;
GIANT_REQUIRED;
if (uap->size < shminfo.shmmin || uap->size > shminfo.shmmax)
return (EINVAL);
if (shm_nused >= shminfo.shmmni) /* Any shmids left? */
return (ENOSPC);
size = round_page(uap->size);
if (shm_committed + btoc(size) > shminfo.shmall)
return (ENOMEM);
if (shm_last_free < 0) {
shmrealloc(); /* Maybe expand the shmsegs[] array. */
for (i = 0; i < shmalloced; i++)
if (shmsegs[i].u.shm_perm.mode & SHMSEG_FREE)
break;
if (i == shmalloced)
return (ENOSPC);
segnum = i;
} else {
segnum = shm_last_free;
shm_last_free = -1;
}
shmseg = &shmsegs[segnum];
#ifdef RACCT
PROC_LOCK(td->td_proc);
if (racct_add(td->td_proc, RACCT_NSHM, 1)) {
PROC_UNLOCK(td->td_proc);
return (ENOSPC);
}
if (racct_add(td->td_proc, RACCT_SHMSIZE, size)) {
racct_sub(td->td_proc, RACCT_NSHM, 1);
PROC_UNLOCK(td->td_proc);
return (ENOMEM);
}
PROC_UNLOCK(td->td_proc);
#endif
/*
* In case we sleep in malloc(), mark the segment present but deleted
 * so that no one else tries to create the same key.
*/
shmseg->u.shm_perm.mode = SHMSEG_ALLOCATED | SHMSEG_REMOVED;
shmseg->u.shm_perm.key = uap->key;
shmseg->u.shm_perm.seq = (shmseg->u.shm_perm.seq + 1) & 0x7fff;
shmid = IXSEQ_TO_IPCID(segnum, shmseg->u.shm_perm);
/*
* We make sure that we have allocated a pager before we need
* to.
*/
shm_object = vm_pager_allocate(shm_use_phys ? OBJT_PHYS : OBJT_SWAP,
0, size, VM_PROT_DEFAULT, 0, cred);
if (shm_object == NULL) {
#ifdef RACCT
PROC_LOCK(td->td_proc);
racct_sub(td->td_proc, RACCT_NSHM, 1);
racct_sub(td->td_proc, RACCT_SHMSIZE, size);
PROC_UNLOCK(td->td_proc);
#endif
return (ENOMEM);
}
VM_OBJECT_LOCK(shm_object);
vm_object_clear_flag(shm_object, OBJ_ONEMAPPING);
vm_object_set_flag(shm_object, OBJ_NOSPLIT);
VM_OBJECT_UNLOCK(shm_object);
shmseg->object = shm_object;
shmseg->u.shm_perm.cuid = shmseg->u.shm_perm.uid = cred->cr_uid;
shmseg->u.shm_perm.cgid = shmseg->u.shm_perm.gid = cred->cr_gid;
shmseg->u.shm_perm.mode = (shmseg->u.shm_perm.mode & SHMSEG_WANTED) |
(mode & ACCESSPERMS) | SHMSEG_ALLOCATED;
shmseg->cred = crhold(cred);
shmseg->u.shm_segsz = uap->size;
shmseg->u.shm_cpid = td->td_proc->p_pid;
shmseg->u.shm_lpid = shmseg->u.shm_nattch = 0;
shmseg->u.shm_atime = shmseg->u.shm_dtime = 0;
#ifdef MAC
mac_sysvshm_create(cred, shmseg);
#endif
shmseg->u.shm_ctime = time_second;
shm_committed += btoc(size);
shm_nused++;
if (shmseg->u.shm_perm.mode & SHMSEG_WANTED) {
/*
* Somebody else wanted this key while we were asleep. Wake
* them up now.
*/
shmseg->u.shm_perm.mode &= ~SHMSEG_WANTED;
wakeup(shmseg);
}
td->td_retval[0] = shmid;
return (0);
}
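/*
 * Note on the shmid returned above (illustrative): IXSEQ_TO_IPCID() from
 * sys/ipc.h packs the slot index and the per-slot sequence number into a
 * single integer, roughly "(seq << 16) | index", and IPCID_TO_IX() /
 * IPCID_TO_SEQ() split it back apart.  Bumping the sequence with
 * "(seq + 1) & 0x7fff" every time a slot is reused means a stale id that
 * refers to an earlier segment in the same slot fails the seq comparison
 * in shm_find_segment_by_shmid() instead of silently matching.
 */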
#ifndef _SYS_SYSPROTO_H_
struct shmget_args {
key_t key;
size_t size;
int shmflg;
};
#endif
int
-shmget(td, uap)
+sys_shmget(td, uap)
struct thread *td;
struct shmget_args *uap;
{
int segnum, mode;
int error;
if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
return (ENOSYS);
mtx_lock(&Giant);
mode = uap->shmflg & ACCESSPERMS;
if (uap->key != IPC_PRIVATE) {
again:
segnum = shm_find_segment_by_key(uap->key);
if (segnum >= 0) {
error = shmget_existing(td, uap, mode, segnum);
if (error == EAGAIN)
goto again;
goto done2;
}
if ((uap->shmflg & IPC_CREAT) == 0) {
error = ENOENT;
goto done2;
}
}
error = shmget_allocate_segment(td, uap, mode);
done2:
mtx_unlock(&Giant);
return (error);
}
static void
shmfork_myhook(p1, p2)
struct proc *p1, *p2;
{
struct shmmap_state *shmmap_s;
size_t size;
int i;
mtx_lock(&Giant);
size = shminfo.shmseg * sizeof(struct shmmap_state);
shmmap_s = malloc(size, M_SHM, M_WAITOK);
bcopy(p1->p_vmspace->vm_shm, shmmap_s, size);
p2->p_vmspace->vm_shm = shmmap_s;
for (i = 0; i < shminfo.shmseg; i++, shmmap_s++)
if (shmmap_s->shmid != -1)
shmsegs[IPCID_TO_IX(shmmap_s->shmid)].u.shm_nattch++;
mtx_unlock(&Giant);
}
static void
shmexit_myhook(struct vmspace *vm)
{
struct shmmap_state *base, *shm;
int i;
if ((base = vm->vm_shm) != NULL) {
vm->vm_shm = NULL;
mtx_lock(&Giant);
for (i = 0, shm = base; i < shminfo.shmseg; i++, shm++) {
if (shm->shmid != -1)
shm_delete_mapping(vm, shm);
}
mtx_unlock(&Giant);
free(base, M_SHM);
}
}
static void
shmrealloc(void)
{
int i;
struct shmid_kernel *newsegs;
if (shmalloced >= shminfo.shmmni)
return;
newsegs = malloc(shminfo.shmmni * sizeof(*newsegs), M_SHM, M_WAITOK);
if (newsegs == NULL)
return;
for (i = 0; i < shmalloced; i++)
bcopy(&shmsegs[i], &newsegs[i], sizeof(newsegs[0]));
for (; i < shminfo.shmmni; i++) {
		newsegs[i].u.shm_perm.mode = SHMSEG_FREE;
		newsegs[i].u.shm_perm.seq = 0;
#ifdef MAC
		mac_sysvshm_init(&newsegs[i]);
#endif
}
free(shmsegs, M_SHM);
shmsegs = newsegs;
shmalloced = shminfo.shmmni;
}
static struct syscall_helper_data shm_syscalls[] = {
SYSCALL_INIT_HELPER(shmat),
SYSCALL_INIT_HELPER(shmctl),
SYSCALL_INIT_HELPER(shmdt),
SYSCALL_INIT_HELPER(shmget),
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
- SYSCALL_INIT_HELPER(freebsd7_shmctl),
+ SYSCALL_INIT_HELPER_COMPAT(freebsd7_shmctl),
#endif
#if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43))
SYSCALL_INIT_HELPER(shmsys),
#endif
SYSCALL_INIT_LAST
};
#ifdef COMPAT_FREEBSD32
#include <compat/freebsd32/freebsd32.h>
#include <compat/freebsd32/freebsd32_ipc.h>
#include <compat/freebsd32/freebsd32_proto.h>
#include <compat/freebsd32/freebsd32_signal.h>
#include <compat/freebsd32/freebsd32_syscall.h>
#include <compat/freebsd32/freebsd32_util.h>
static struct syscall_helper_data shm32_syscalls[] = {
- SYSCALL32_INIT_HELPER(shmat),
- SYSCALL32_INIT_HELPER(shmdt),
- SYSCALL32_INIT_HELPER(shmget),
+ SYSCALL32_INIT_HELPER_COMPAT(shmat),
+ SYSCALL32_INIT_HELPER_COMPAT(shmdt),
+ SYSCALL32_INIT_HELPER_COMPAT(shmget),
SYSCALL32_INIT_HELPER(freebsd32_shmsys),
SYSCALL32_INIT_HELPER(freebsd32_shmctl),
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
SYSCALL32_INIT_HELPER(freebsd7_freebsd32_shmctl),
#endif
SYSCALL_INIT_LAST
};
#endif
static int
shminit()
{
int i, error;
#ifndef BURN_BRIDGES
if (TUNABLE_ULONG_FETCH("kern.ipc.shmmaxpgs", &shminfo.shmall) != 0)
printf("kern.ipc.shmmaxpgs is now called kern.ipc.shmall!\n");
#endif
TUNABLE_ULONG_FETCH("kern.ipc.shmall", &shminfo.shmall);
/* Initialize shmmax dealing with possible overflow. */
for (i = PAGE_SIZE; i > 0; i--) {
shminfo.shmmax = shminfo.shmall * i;
if (shminfo.shmmax >= shminfo.shmall)
break;
}
TUNABLE_ULONG_FETCH("kern.ipc.shmmin", &shminfo.shmmin);
TUNABLE_ULONG_FETCH("kern.ipc.shmmni", &shminfo.shmmni);
TUNABLE_ULONG_FETCH("kern.ipc.shmseg", &shminfo.shmseg);
TUNABLE_INT_FETCH("kern.ipc.shm_use_phys", &shm_use_phys);
shmalloced = shminfo.shmmni;
shmsegs = malloc(shmalloced * sizeof(shmsegs[0]), M_SHM, M_WAITOK);
for (i = 0; i < shmalloced; i++) {
shmsegs[i].u.shm_perm.mode = SHMSEG_FREE;
shmsegs[i].u.shm_perm.seq = 0;
#ifdef MAC
mac_sysvshm_init(&shmsegs[i]);
#endif
}
shm_last_free = 0;
shm_nused = 0;
shm_committed = 0;
shmexit_hook = &shmexit_myhook;
shmfork_hook = &shmfork_myhook;
error = syscall_helper_register(shm_syscalls);
if (error != 0)
return (error);
#ifdef COMPAT_FREEBSD32
error = syscall32_helper_register(shm32_syscalls);
if (error != 0)
return (error);
#endif
return (0);
}
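/*
 * Worked example of the shmmax initialization above (illustrative): the
 * loop tries shmmax = shmall * i starting with i = PAGE_SIZE.  With the
 * default shmall of 131072 pages and 4 KB pages the first iteration gives
 * 131072 * 4096 = 536870912 (512 MB) with no overflow, so the loop stops
 * immediately; only if the product wrapped around below shmall would a
 * smaller multiplier be tried.
 */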
static int
shmunload()
{
int i;
if (shm_nused > 0)
return (EBUSY);
#ifdef COMPAT_FREEBSD32
syscall32_helper_unregister(shm32_syscalls);
#endif
syscall_helper_unregister(shm_syscalls);
for (i = 0; i < shmalloced; i++) {
#ifdef MAC
mac_sysvshm_destroy(&shmsegs[i]);
#endif
/*
 * Objects might still be mapped into the processes'
 * address spaces.  The actual free happens when the
 * last mapping is destroyed.
*/
if (shmsegs[i].u.shm_perm.mode != SHMSEG_FREE)
vm_object_deallocate(shmsegs[i].object);
}
free(shmsegs, M_SHM);
shmexit_hook = NULL;
shmfork_hook = NULL;
return (0);
}
static int
sysctl_shmsegs(SYSCTL_HANDLER_ARGS)
{
return (SYSCTL_OUT(req, shmsegs, shmalloced * sizeof(shmsegs[0])));
}
#if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43))
struct oshmid_ds {
struct ipc_perm_old shm_perm; /* operation perms */
int shm_segsz; /* size of segment (bytes) */
u_short shm_cpid; /* pid, creator */
u_short shm_lpid; /* pid, last operation */
short shm_nattch; /* no. of current attaches */
time_t shm_atime; /* last attach time */
time_t shm_dtime; /* last detach time */
time_t shm_ctime; /* last change time */
void *shm_handle; /* internal handle for shm segment */
};
struct oshmctl_args {
int shmid;
int cmd;
struct oshmid_ds *ubuf;
};
static int
oshmctl(struct thread *td, struct oshmctl_args *uap)
{
#ifdef COMPAT_43
int error = 0;
struct shmid_kernel *shmseg;
struct oshmid_ds outbuf;
if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
return (ENOSYS);
mtx_lock(&Giant);
shmseg = shm_find_segment_by_shmid(uap->shmid);
if (shmseg == NULL) {
error = EINVAL;
goto done2;
}
switch (uap->cmd) {
case IPC_STAT:
error = ipcperm(td, &shmseg->u.shm_perm, IPC_R);
if (error)
goto done2;
#ifdef MAC
error = mac_sysvshm_check_shmctl(td->td_ucred, shmseg, uap->cmd);
if (error != 0)
goto done2;
#endif
ipcperm_new2old(&shmseg->u.shm_perm, &outbuf.shm_perm);
outbuf.shm_segsz = shmseg->u.shm_segsz;
outbuf.shm_cpid = shmseg->u.shm_cpid;
outbuf.shm_lpid = shmseg->u.shm_lpid;
outbuf.shm_nattch = shmseg->u.shm_nattch;
outbuf.shm_atime = shmseg->u.shm_atime;
outbuf.shm_dtime = shmseg->u.shm_dtime;
outbuf.shm_ctime = shmseg->u.shm_ctime;
outbuf.shm_handle = shmseg->object;
error = copyout(&outbuf, uap->ubuf, sizeof(outbuf));
if (error)
goto done2;
break;
default:
error = freebsd7_shmctl(td, (struct freebsd7_shmctl_args *)uap);
break;
}
done2:
mtx_unlock(&Giant);
return (error);
#else
return (EINVAL);
#endif
}
/* XXX casting to (sy_call_t *) is bogus, as usual. */
static sy_call_t *shmcalls[] = {
- (sy_call_t *)shmat, (sy_call_t *)oshmctl,
- (sy_call_t *)shmdt, (sy_call_t *)shmget,
+ (sy_call_t *)sys_shmat, (sy_call_t *)oshmctl,
+ (sy_call_t *)sys_shmdt, (sy_call_t *)sys_shmget,
(sy_call_t *)freebsd7_shmctl
};
int
-shmsys(td, uap)
+sys_shmsys(td, uap)
struct thread *td;
/* XXX actually varargs. */
struct shmsys_args /* {
int which;
int a2;
int a3;
int a4;
} */ *uap;
{
int error;
if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
return (ENOSYS);
if (uap->which < 0 ||
uap->which >= sizeof(shmcalls)/sizeof(shmcalls[0]))
return (EINVAL);
mtx_lock(&Giant);
error = (*shmcalls[uap->which])(td, &uap->a2);
mtx_unlock(&Giant);
return (error);
}
#endif /* i386 && (COMPAT_FREEBSD4 || COMPAT_43) */
#ifdef COMPAT_FREEBSD32
int
freebsd32_shmsys(struct thread *td, struct freebsd32_shmsys_args *uap)
{
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
switch (uap->which) {
case 0: { /* shmat */
struct shmat_args ap;
ap.shmid = uap->a2;
ap.shmaddr = PTRIN(uap->a3);
ap.shmflg = uap->a4;
return (sysent[SYS_shmat].sy_call(td, &ap));
}
case 2: { /* shmdt */
struct shmdt_args ap;
ap.shmaddr = PTRIN(uap->a2);
return (sysent[SYS_shmdt].sy_call(td, &ap));
}
case 3: { /* shmget */
struct shmget_args ap;
ap.key = uap->a2;
ap.size = uap->a3;
ap.shmflg = uap->a4;
return (sysent[SYS_shmget].sy_call(td, &ap));
}
case 4: { /* shmctl */
struct freebsd7_freebsd32_shmctl_args ap;
ap.shmid = uap->a2;
ap.cmd = uap->a3;
ap.buf = PTRIN(uap->a4);
return (freebsd7_freebsd32_shmctl(td, &ap));
}
case 1: /* oshmctl */
default:
return (EINVAL);
}
#else
return (nosys(td, NULL));
#endif
}
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
int
freebsd7_freebsd32_shmctl(struct thread *td,
struct freebsd7_freebsd32_shmctl_args *uap)
{
int error = 0;
union {
struct shmid_ds shmid_ds;
struct shm_info shm_info;
struct shminfo shminfo;
} u;
union {
struct shmid_ds32_old shmid_ds32;
struct shm_info32 shm_info32;
struct shminfo32 shminfo32;
} u32;
size_t sz;
if (uap->cmd == IPC_SET) {
if ((error = copyin(uap->buf, &u32.shmid_ds32,
sizeof(u32.shmid_ds32))))
goto done;
freebsd32_ipcperm_old_in(&u32.shmid_ds32.shm_perm,
&u.shmid_ds.shm_perm);
CP(u32.shmid_ds32, u.shmid_ds, shm_segsz);
CP(u32.shmid_ds32, u.shmid_ds, shm_lpid);
CP(u32.shmid_ds32, u.shmid_ds, shm_cpid);
CP(u32.shmid_ds32, u.shmid_ds, shm_nattch);
CP(u32.shmid_ds32, u.shmid_ds, shm_atime);
CP(u32.shmid_ds32, u.shmid_ds, shm_dtime);
CP(u32.shmid_ds32, u.shmid_ds, shm_ctime);
}
error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&u, &sz);
if (error)
goto done;
/* Cases in which we need to copyout */
switch (uap->cmd) {
case IPC_INFO:
CP(u.shminfo, u32.shminfo32, shmmax);
CP(u.shminfo, u32.shminfo32, shmmin);
CP(u.shminfo, u32.shminfo32, shmmni);
CP(u.shminfo, u32.shminfo32, shmseg);
CP(u.shminfo, u32.shminfo32, shmall);
error = copyout(&u32.shminfo32, uap->buf,
sizeof(u32.shminfo32));
break;
case SHM_INFO:
CP(u.shm_info, u32.shm_info32, used_ids);
CP(u.shm_info, u32.shm_info32, shm_rss);
CP(u.shm_info, u32.shm_info32, shm_tot);
CP(u.shm_info, u32.shm_info32, shm_swp);
CP(u.shm_info, u32.shm_info32, swap_attempts);
CP(u.shm_info, u32.shm_info32, swap_successes);
error = copyout(&u32.shm_info32, uap->buf,
sizeof(u32.shm_info32));
break;
case SHM_STAT:
case IPC_STAT:
freebsd32_ipcperm_old_out(&u.shmid_ds.shm_perm,
&u32.shmid_ds32.shm_perm);
if (u.shmid_ds.shm_segsz > INT32_MAX)
u32.shmid_ds32.shm_segsz = INT32_MAX;
else
CP(u.shmid_ds, u32.shmid_ds32, shm_segsz);
CP(u.shmid_ds, u32.shmid_ds32, shm_lpid);
CP(u.shmid_ds, u32.shmid_ds32, shm_cpid);
CP(u.shmid_ds, u32.shmid_ds32, shm_nattch);
CP(u.shmid_ds, u32.shmid_ds32, shm_atime);
CP(u.shmid_ds, u32.shmid_ds32, shm_dtime);
CP(u.shmid_ds, u32.shmid_ds32, shm_ctime);
u32.shmid_ds32.shm_internal = 0;
error = copyout(&u32.shmid_ds32, uap->buf,
sizeof(u32.shmid_ds32));
break;
}
done:
if (error) {
/* Invalidate the return value */
td->td_retval[0] = -1;
}
return (error);
}
#endif
int
freebsd32_shmctl(struct thread *td, struct freebsd32_shmctl_args *uap)
{
int error = 0;
union {
struct shmid_ds shmid_ds;
struct shm_info shm_info;
struct shminfo shminfo;
} u;
union {
struct shmid_ds32 shmid_ds32;
struct shm_info32 shm_info32;
struct shminfo32 shminfo32;
} u32;
size_t sz;
if (uap->cmd == IPC_SET) {
if ((error = copyin(uap->buf, &u32.shmid_ds32,
sizeof(u32.shmid_ds32))))
goto done;
freebsd32_ipcperm_in(&u32.shmid_ds32.shm_perm,
&u.shmid_ds.shm_perm);
CP(u32.shmid_ds32, u.shmid_ds, shm_segsz);
CP(u32.shmid_ds32, u.shmid_ds, shm_lpid);
CP(u32.shmid_ds32, u.shmid_ds, shm_cpid);
CP(u32.shmid_ds32, u.shmid_ds, shm_nattch);
CP(u32.shmid_ds32, u.shmid_ds, shm_atime);
CP(u32.shmid_ds32, u.shmid_ds, shm_dtime);
CP(u32.shmid_ds32, u.shmid_ds, shm_ctime);
}
error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&u, &sz);
if (error)
goto done;
/* Cases in which we need to copyout */
switch (uap->cmd) {
case IPC_INFO:
CP(u.shminfo, u32.shminfo32, shmmax);
CP(u.shminfo, u32.shminfo32, shmmin);
CP(u.shminfo, u32.shminfo32, shmmni);
CP(u.shminfo, u32.shminfo32, shmseg);
CP(u.shminfo, u32.shminfo32, shmall);
error = copyout(&u32.shminfo32, uap->buf,
sizeof(u32.shminfo32));
break;
case SHM_INFO:
CP(u.shm_info, u32.shm_info32, used_ids);
CP(u.shm_info, u32.shm_info32, shm_rss);
CP(u.shm_info, u32.shm_info32, shm_tot);
CP(u.shm_info, u32.shm_info32, shm_swp);
CP(u.shm_info, u32.shm_info32, swap_attempts);
CP(u.shm_info, u32.shm_info32, swap_successes);
error = copyout(&u32.shm_info32, uap->buf,
sizeof(u32.shm_info32));
break;
case SHM_STAT:
case IPC_STAT:
freebsd32_ipcperm_out(&u.shmid_ds.shm_perm,
&u32.shmid_ds32.shm_perm);
if (u.shmid_ds.shm_segsz > INT32_MAX)
u32.shmid_ds32.shm_segsz = INT32_MAX;
else
CP(u.shmid_ds, u32.shmid_ds32, shm_segsz);
CP(u.shmid_ds, u32.shmid_ds32, shm_lpid);
CP(u.shmid_ds, u32.shmid_ds32, shm_cpid);
CP(u.shmid_ds, u32.shmid_ds32, shm_nattch);
CP(u.shmid_ds, u32.shmid_ds32, shm_atime);
CP(u.shmid_ds, u32.shmid_ds32, shm_dtime);
CP(u.shmid_ds, u32.shmid_ds32, shm_ctime);
error = copyout(&u32.shmid_ds32, uap->buf,
sizeof(u32.shmid_ds32));
break;
}
done:
if (error) {
/* Invalidate the return value */
td->td_retval[0] = -1;
}
return (error);
}
#endif
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
#ifndef CP
#define CP(src, dst, fld) do { (dst).fld = (src).fld; } while (0)
#endif
#ifndef _SYS_SYSPROTO_H_
struct freebsd7_shmctl_args {
int shmid;
int cmd;
struct shmid_ds_old *buf;
};
#endif
int
freebsd7_shmctl(td, uap)
struct thread *td;
struct freebsd7_shmctl_args *uap;
{
int error = 0;
struct shmid_ds_old old;
struct shmid_ds buf;
size_t bufsz;
/*
 * The only reason IPC_INFO, SHM_INFO, SHM_STAT exist is to support
 * Linux binaries.  If we see the call come through the FreeBSD ABI,
 * return an error back to the user since we do not support this.
*/
if (uap->cmd == IPC_INFO || uap->cmd == SHM_INFO ||
uap->cmd == SHM_STAT)
return (EINVAL);
/* IPC_SET needs to copyin the buffer before calling kern_shmctl */
if (uap->cmd == IPC_SET) {
if ((error = copyin(uap->buf, &old, sizeof(old))))
goto done;
ipcperm_old2new(&old.shm_perm, &buf.shm_perm);
CP(old, buf, shm_segsz);
CP(old, buf, shm_lpid);
CP(old, buf, shm_cpid);
CP(old, buf, shm_nattch);
CP(old, buf, shm_atime);
CP(old, buf, shm_dtime);
CP(old, buf, shm_ctime);
}
error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&buf, &bufsz);
if (error)
goto done;
/* Cases in which we need to copyout */
switch (uap->cmd) {
case IPC_STAT:
ipcperm_new2old(&buf.shm_perm, &old.shm_perm);
if (buf.shm_segsz > INT_MAX)
old.shm_segsz = INT_MAX;
else
CP(buf, old, shm_segsz);
CP(buf, old, shm_lpid);
CP(buf, old, shm_cpid);
if (buf.shm_nattch > SHRT_MAX)
old.shm_nattch = SHRT_MAX;
else
CP(buf, old, shm_nattch);
CP(buf, old, shm_atime);
CP(buf, old, shm_dtime);
CP(buf, old, shm_ctime);
old.shm_internal = NULL;
error = copyout(&old, uap->buf, sizeof(old));
break;
}
done:
if (error) {
/* Invalidate the return value */
td->td_retval[0] = -1;
}
return (error);
}
#endif /* COMPAT_FREEBSD4 || COMPAT_FREEBSD5 || COMPAT_FREEBSD6 ||
COMPAT_FREEBSD7 */
static int
sysvshm_modload(struct module *module, int cmd, void *arg)
{
int error = 0;
switch (cmd) {
case MOD_LOAD:
error = shminit();
if (error != 0)
shmunload();
break;
case MOD_UNLOAD:
error = shmunload();
break;
case MOD_SHUTDOWN:
break;
default:
error = EINVAL;
break;
}
return (error);
}
static moduledata_t sysvshm_mod = {
"sysvshm",
&sysvshm_modload,
NULL
};
DECLARE_MODULE(sysvshm, sysvshm_mod, SI_SUB_SYSV_SHM, SI_ORDER_FIRST);
MODULE_VERSION(sysvshm, 1);
Index: head/sys/kern/tty.c
===================================================================
--- head/sys/kern/tty.c (revision 225616)
+++ head/sys/kern/tty.c (revision 225617)
@@ -1,2200 +1,2200 @@
/*-
* Copyright (c) 2008 Ed Schouten <ed@FreeBSD.org>
* All rights reserved.
*
* Portions of this software were developed under sponsorship from Snow
* B.V., the Netherlands.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_capsicum.h"
#include "opt_compat.h"
#include <sys/param.h>
#include <sys/capability.h>
#include <sys/conf.h>
#include <sys/cons.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#ifdef COMPAT_43TTY
#include <sys/ioctl_compat.h>
#endif /* COMPAT_43TTY */
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/poll.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/serial.h>
#include <sys/signal.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/tty.h>
#include <sys/ttycom.h>
#define TTYDEFCHARS
#include <sys/ttydefaults.h>
#undef TTYDEFCHARS
#include <sys/ucred.h>
#include <sys/vnode.h>
#include <machine/stdarg.h>
static MALLOC_DEFINE(M_TTY, "tty", "tty device");
static void tty_rel_free(struct tty *tp);
static TAILQ_HEAD(, tty) tty_list = TAILQ_HEAD_INITIALIZER(tty_list);
static struct sx tty_list_sx;
SX_SYSINIT(tty_list, &tty_list_sx, "tty list");
static unsigned int tty_list_count = 0;
/* Character device of /dev/console. */
static struct cdev *dev_console;
static const char *dev_console_filename;
/*
* Flags that are supported and stored by this implementation.
*/
#define TTYSUP_IFLAG (IGNBRK|BRKINT|IGNPAR|PARMRK|INPCK|ISTRIP|\
INLCR|IGNCR|ICRNL|IXON|IXOFF|IXANY|IMAXBEL)
#define TTYSUP_OFLAG (OPOST|ONLCR|TAB3|ONOEOT|OCRNL|ONOCR|ONLRET)
#define TTYSUP_LFLAG (ECHOKE|ECHOE|ECHOK|ECHO|ECHONL|ECHOPRT|\
ECHOCTL|ISIG|ICANON|ALTWERASE|IEXTEN|TOSTOP|\
FLUSHO|NOKERNINFO|NOFLSH)
#define TTYSUP_CFLAG (CIGNORE|CSIZE|CSTOPB|CREAD|PARENB|PARODD|\
HUPCL|CLOCAL|CCTS_OFLOW|CRTS_IFLOW|CDTR_IFLOW|\
CDSR_OFLOW|CCAR_OFLOW)
#define TTY_CALLOUT(tp,d) (dev2unit(d) & TTYUNIT_CALLOUT)
/*
* Set TTY buffer sizes.
*/
#define TTYBUF_MAX 65536
static void
tty_watermarks(struct tty *tp)
{
size_t bs = 0;
/* Provide an input buffer for 0.2 seconds of data. */
if (tp->t_termios.c_cflag & CREAD)
bs = MIN(tp->t_termios.c_ispeed / 5, TTYBUF_MAX);
ttyinq_setsize(&tp->t_inq, tp, bs);
/* Set low watermark at 10% (when 90% is available). */
tp->t_inlow = (ttyinq_getallocatedsize(&tp->t_inq) * 9) / 10;
	/* Provide an output buffer for 0.2 seconds of data. */
bs = MIN(tp->t_termios.c_ospeed / 5, TTYBUF_MAX);
ttyoutq_setsize(&tp->t_outq, tp, bs);
/* Set low watermark at 10% (when 90% is available). */
tp->t_outlow = (ttyoutq_getallocatedsize(&tp->t_outq) * 9) / 10;
}
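/*
 * Worked example of the sizing above (illustrative): with c_ispeed set to
 * 115200 the input request is MIN(115200 / 5, TTYBUF_MAX) = 23040 bytes.
 * ttyinq_setsize() rounds that up to whole queue blocks, so the low
 * watermark is computed from ttyinq_getallocatedsize() rather than from
 * the requested size and ends up at 9/10 of the actual allocation,
 * matching the "low watermark at 10%" comments above.
 */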
static int
tty_drain(struct tty *tp)
{
int error;
if (ttyhook_hashook(tp, getc_inject))
/* buffer is inaccessible */
return (0);
while (ttyoutq_bytesused(&tp->t_outq) > 0) {
ttydevsw_outwakeup(tp);
/* Could be handled synchronously. */
if (ttyoutq_bytesused(&tp->t_outq) == 0)
return (0);
/* Wait for data to be drained. */
error = tty_wait(tp, &tp->t_outwait);
if (error)
return (error);
}
return (0);
}
/*
* Though ttydev_enter() and ttydev_leave() seem to be related, they
* don't have to be used together. ttydev_enter() is used by the cdev
* operations to prevent an actual operation from being processed when
* the TTY has been abandoned. ttydev_leave() is used by ttydev_open()
* and ttydev_close() to determine whether per-TTY data should be
* deallocated.
*/
static __inline int
ttydev_enter(struct tty *tp)
{
tty_lock(tp);
if (tty_gone(tp) || !tty_opened(tp)) {
/* Device is already gone. */
tty_unlock(tp);
return (ENXIO);
}
return (0);
}
static void
ttydev_leave(struct tty *tp)
{
tty_lock_assert(tp, MA_OWNED);
if (tty_opened(tp) || tp->t_flags & TF_OPENCLOSE) {
/* Device is still opened somewhere. */
tty_unlock(tp);
return;
}
tp->t_flags |= TF_OPENCLOSE;
/* Stop asynchronous I/O. */
funsetown(&tp->t_sigio);
/* Remove console TTY. */
if (constty == tp)
constty_clear();
/* Drain any output. */
MPASS((tp->t_flags & TF_STOPPED) == 0);
if (!tty_gone(tp))
tty_drain(tp);
ttydisc_close(tp);
/* Destroy associated buffers already. */
ttyinq_free(&tp->t_inq);
tp->t_inlow = 0;
ttyoutq_free(&tp->t_outq);
tp->t_outlow = 0;
knlist_clear(&tp->t_inpoll.si_note, 1);
knlist_clear(&tp->t_outpoll.si_note, 1);
if (!tty_gone(tp))
ttydevsw_close(tp);
tp->t_flags &= ~TF_OPENCLOSE;
cv_broadcast(&tp->t_dcdwait);
tty_rel_free(tp);
}
/*
* Operations that are exposed through the character device in /dev.
*/
static int
ttydev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
{
struct tty *tp = dev->si_drv1;
int error = 0;
tty_lock(tp);
if (tty_gone(tp)) {
/* Device is already gone. */
tty_unlock(tp);
return (ENXIO);
}
/*
* Block when other processes are currently opening or closing
* the TTY.
*/
while (tp->t_flags & TF_OPENCLOSE) {
error = tty_wait(tp, &tp->t_dcdwait);
if (error != 0) {
tty_unlock(tp);
return (error);
}
}
tp->t_flags |= TF_OPENCLOSE;
/*
* Make sure the "tty" and "cua" device cannot be opened at the
* same time.
*/
if (TTY_CALLOUT(tp, dev)) {
if (tp->t_flags & TF_OPENED_IN) {
error = EBUSY;
goto done;
}
} else {
if (tp->t_flags & TF_OPENED_OUT) {
error = EBUSY;
goto done;
}
}
if (tp->t_flags & TF_EXCLUDE && priv_check(td, PRIV_TTY_EXCLUSIVE)) {
error = EBUSY;
goto done;
}
if (!tty_opened(tp)) {
/* Set proper termios flags. */
if (TTY_CALLOUT(tp, dev))
tp->t_termios = tp->t_termios_init_out;
else
tp->t_termios = tp->t_termios_init_in;
ttydevsw_param(tp, &tp->t_termios);
/* Prevent modem control on callout devices and /dev/console. */
if (TTY_CALLOUT(tp, dev) || dev == dev_console)
tp->t_termios.c_cflag |= CLOCAL;
ttydevsw_modem(tp, SER_DTR|SER_RTS, 0);
error = ttydevsw_open(tp);
if (error != 0)
goto done;
ttydisc_open(tp);
tty_watermarks(tp);
}
/* Wait for Carrier Detect. */
if ((oflags & O_NONBLOCK) == 0 &&
(tp->t_termios.c_cflag & CLOCAL) == 0) {
while ((ttydevsw_modem(tp, 0, 0) & SER_DCD) == 0) {
error = tty_wait(tp, &tp->t_dcdwait);
if (error != 0)
goto done;
}
}
if (dev == dev_console)
tp->t_flags |= TF_OPENED_CONS;
else if (TTY_CALLOUT(tp, dev))
tp->t_flags |= TF_OPENED_OUT;
else
tp->t_flags |= TF_OPENED_IN;
done: tp->t_flags &= ~TF_OPENCLOSE;
cv_broadcast(&tp->t_dcdwait);
ttydev_leave(tp);
return (error);
}
static int
ttydev_close(struct cdev *dev, int fflag, int devtype, struct thread *td)
{
struct tty *tp = dev->si_drv1;
tty_lock(tp);
/*
* Don't actually close the device if it is being used as the
* console.
*/
MPASS((tp->t_flags & TF_OPENED) != TF_OPENED);
if (dev == dev_console)
tp->t_flags &= ~TF_OPENED_CONS;
else
tp->t_flags &= ~(TF_OPENED_IN|TF_OPENED_OUT);
if (tp->t_flags & TF_OPENED) {
tty_unlock(tp);
return (0);
}
/*
* This can only be called once. The callin and the callout
* devices cannot be opened at the same time.
*/
tp->t_flags &= ~(TF_EXCLUDE|TF_STOPPED);
/* Properly wake up threads that are stuck - revoke(). */
tp->t_revokecnt++;
tty_wakeup(tp, FREAD|FWRITE);
cv_broadcast(&tp->t_bgwait);
cv_broadcast(&tp->t_dcdwait);
ttydev_leave(tp);
return (0);
}
static __inline int
tty_is_ctty(struct tty *tp, struct proc *p)
{
tty_lock_assert(tp, MA_OWNED);
return (p->p_session == tp->t_session && p->p_flag & P_CONTROLT);
}
static int
tty_wait_background(struct tty *tp, struct thread *td, int sig)
{
struct proc *p = td->td_proc;
struct pgrp *pg;
ksiginfo_t ksi;
int error;
MPASS(sig == SIGTTIN || sig == SIGTTOU);
tty_lock_assert(tp, MA_OWNED);
for (;;) {
PROC_LOCK(p);
/*
 * The process should only sleep when:
 * - This terminal is the controlling terminal
 * - Its process group is not the foreground process
 *   group
 * - The parent process isn't waiting for the child to
 *   exit
 * - The signal to send to the process isn't masked
*/
if (!tty_is_ctty(tp, p) || p->p_pgrp == tp->t_pgrp) {
/* Allow the action to happen. */
PROC_UNLOCK(p);
return (0);
}
if (SIGISMEMBER(p->p_sigacts->ps_sigignore, sig) ||
SIGISMEMBER(td->td_sigmask, sig)) {
/* Only allow them in write()/ioctl(). */
PROC_UNLOCK(p);
return (sig == SIGTTOU ? 0 : EIO);
}
pg = p->p_pgrp;
if (p->p_flag & P_PPWAIT || pg->pg_jobc == 0) {
/* Don't allow the action to happen. */
PROC_UNLOCK(p);
return (EIO);
}
PROC_UNLOCK(p);
/*
* Send the signal and sleep until we're the new
* foreground process group.
*/
if (sig != 0) {
ksiginfo_init(&ksi);
ksi.ksi_code = SI_KERNEL;
ksi.ksi_signo = sig;
sig = 0;
}
PGRP_LOCK(pg);
pgsignal(pg, ksi.ksi_signo, 1, &ksi);
PGRP_UNLOCK(pg);
error = tty_wait(tp, &tp->t_bgwait);
if (error)
return (error);
}
}
static int
ttydev_read(struct cdev *dev, struct uio *uio, int ioflag)
{
struct tty *tp = dev->si_drv1;
int error;
error = ttydev_enter(tp);
if (error)
goto done;
error = tty_wait_background(tp, curthread, SIGTTIN);
if (error) {
tty_unlock(tp);
goto done;
}
error = ttydisc_read(tp, uio, ioflag);
tty_unlock(tp);
/*
* The read() call should not throw an error when the device is
* being destroyed. Silently convert it to an EOF.
*/
done: if (error == ENXIO)
error = 0;
return (error);
}
static int
ttydev_write(struct cdev *dev, struct uio *uio, int ioflag)
{
struct tty *tp = dev->si_drv1;
int error;
error = ttydev_enter(tp);
if (error)
return (error);
if (tp->t_termios.c_lflag & TOSTOP) {
error = tty_wait_background(tp, curthread, SIGTTOU);
if (error)
goto done;
}
if (ioflag & IO_NDELAY && tp->t_flags & TF_BUSY_OUT) {
/* Allow non-blocking writes to bypass serialization. */
error = ttydisc_write(tp, uio, ioflag);
} else {
/* Serialize write() calls. */
while (tp->t_flags & TF_BUSY_OUT) {
error = tty_wait(tp, &tp->t_outserwait);
if (error)
goto done;
}
tp->t_flags |= TF_BUSY_OUT;
error = ttydisc_write(tp, uio, ioflag);
tp->t_flags &= ~TF_BUSY_OUT;
cv_signal(&tp->t_outserwait);
}
done: tty_unlock(tp);
return (error);
}
static int
ttydev_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag,
struct thread *td)
{
struct tty *tp = dev->si_drv1;
int error;
error = ttydev_enter(tp);
if (error)
return (error);
switch (cmd) {
case TIOCCBRK:
case TIOCCONS:
case TIOCDRAIN:
case TIOCEXCL:
case TIOCFLUSH:
case TIOCNXCL:
case TIOCSBRK:
case TIOCSCTTY:
case TIOCSETA:
case TIOCSETAF:
case TIOCSETAW:
case TIOCSPGRP:
case TIOCSTART:
case TIOCSTAT:
case TIOCSTI:
case TIOCSTOP:
case TIOCSWINSZ:
#if 0
case TIOCSDRAINWAIT:
case TIOCSETD:
#endif
#ifdef COMPAT_43TTY
case TIOCLBIC:
case TIOCLBIS:
case TIOCLSET:
case TIOCSETC:
case OTIOCSETD:
case TIOCSETN:
case TIOCSETP:
case TIOCSLTC:
#endif /* COMPAT_43TTY */
/*
* If the ioctl() causes the TTY to be modified, let it
* wait in the background.
*/
error = tty_wait_background(tp, curthread, SIGTTOU);
if (error)
goto done;
}
if (cmd == TIOCSETA || cmd == TIOCSETAW || cmd == TIOCSETAF) {
struct termios *old = &tp->t_termios;
struct termios *new = (struct termios *)data;
struct termios *lock = TTY_CALLOUT(tp, dev) ?
&tp->t_termios_lock_out : &tp->t_termios_lock_in;
int cc;
/*
* Lock state devices. Just overwrite the values of the
* commands that are currently in use.
*/
new->c_iflag = (old->c_iflag & lock->c_iflag) |
(new->c_iflag & ~lock->c_iflag);
new->c_oflag = (old->c_oflag & lock->c_oflag) |
(new->c_oflag & ~lock->c_oflag);
new->c_cflag = (old->c_cflag & lock->c_cflag) |
(new->c_cflag & ~lock->c_cflag);
new->c_lflag = (old->c_lflag & lock->c_lflag) |
(new->c_lflag & ~lock->c_lflag);
for (cc = 0; cc < NCCS; ++cc)
if (lock->c_cc[cc])
new->c_cc[cc] = old->c_cc[cc];
if (lock->c_ispeed)
new->c_ispeed = old->c_ispeed;
if (lock->c_ospeed)
new->c_ospeed = old->c_ospeed;
}
error = tty_ioctl(tp, cmd, data, fflag, td);
done: tty_unlock(tp);
return (error);
}
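/*
 * Worked example of the lock-state merge above (illustrative): each flag
 * word becomes (old & lock) | (new & ~lock), so any bit set in the
 * lock-state device is kept from the current termios and only unlocked
 * bits are taken from the caller.  For instance, if the lock-state
 * c_cflag has CLOCAL set, a TIOCSETA that tries to clear CLOCAL still
 * leaves it set, because that bit is copied from "old" instead of "new".
 */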
static int
ttydev_poll(struct cdev *dev, int events, struct thread *td)
{
struct tty *tp = dev->si_drv1;
int error, revents = 0;
error = ttydev_enter(tp);
if (error)
return ((events & (POLLIN|POLLRDNORM)) | POLLHUP);
if (events & (POLLIN|POLLRDNORM)) {
/* See if we can read something. */
if (ttydisc_read_poll(tp) > 0)
revents |= events & (POLLIN|POLLRDNORM);
}
if (tp->t_flags & TF_ZOMBIE) {
/* Hangup flag on zombie state. */
revents |= POLLHUP;
} else if (events & (POLLOUT|POLLWRNORM)) {
/* See if we can write something. */
if (ttydisc_write_poll(tp) > 0)
revents |= events & (POLLOUT|POLLWRNORM);
}
if (revents == 0) {
if (events & (POLLIN|POLLRDNORM))
selrecord(td, &tp->t_inpoll);
if (events & (POLLOUT|POLLWRNORM))
selrecord(td, &tp->t_outpoll);
}
tty_unlock(tp);
return (revents);
}
static int
ttydev_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr,
int nprot, vm_memattr_t *memattr)
{
struct tty *tp = dev->si_drv1;
int error;
/* Handle mmap() through the driver. */
error = ttydev_enter(tp);
if (error)
return (-1);
error = ttydevsw_mmap(tp, offset, paddr, nprot, memattr);
tty_unlock(tp);
return (error);
}
/*
* kqueue support.
*/
static void
tty_kqops_read_detach(struct knote *kn)
{
struct tty *tp = kn->kn_hook;
knlist_remove(&tp->t_inpoll.si_note, kn, 0);
}
static int
tty_kqops_read_event(struct knote *kn, long hint)
{
struct tty *tp = kn->kn_hook;
tty_lock_assert(tp, MA_OWNED);
if (tty_gone(tp) || tp->t_flags & TF_ZOMBIE) {
kn->kn_flags |= EV_EOF;
return (1);
} else {
kn->kn_data = ttydisc_read_poll(tp);
return (kn->kn_data > 0);
}
}
static void
tty_kqops_write_detach(struct knote *kn)
{
struct tty *tp = kn->kn_hook;
knlist_remove(&tp->t_outpoll.si_note, kn, 0);
}
static int
tty_kqops_write_event(struct knote *kn, long hint)
{
struct tty *tp = kn->kn_hook;
tty_lock_assert(tp, MA_OWNED);
if (tty_gone(tp)) {
kn->kn_flags |= EV_EOF;
return (1);
} else {
kn->kn_data = ttydisc_write_poll(tp);
return (kn->kn_data > 0);
}
}
static struct filterops tty_kqops_read = {
.f_isfd = 1,
.f_detach = tty_kqops_read_detach,
.f_event = tty_kqops_read_event,
};
static struct filterops tty_kqops_write = {
.f_isfd = 1,
.f_detach = tty_kqops_write_detach,
.f_event = tty_kqops_write_event,
};
static int
ttydev_kqfilter(struct cdev *dev, struct knote *kn)
{
struct tty *tp = dev->si_drv1;
int error;
error = ttydev_enter(tp);
if (error)
return (error);
switch (kn->kn_filter) {
case EVFILT_READ:
kn->kn_hook = tp;
kn->kn_fop = &tty_kqops_read;
knlist_add(&tp->t_inpoll.si_note, kn, 1);
break;
case EVFILT_WRITE:
kn->kn_hook = tp;
kn->kn_fop = &tty_kqops_write;
knlist_add(&tp->t_outpoll.si_note, kn, 1);
break;
default:
error = EINVAL;
break;
}
tty_unlock(tp);
return (error);
}
static struct cdevsw ttydev_cdevsw = {
.d_version = D_VERSION,
.d_open = ttydev_open,
.d_close = ttydev_close,
.d_read = ttydev_read,
.d_write = ttydev_write,
.d_ioctl = ttydev_ioctl,
.d_kqfilter = ttydev_kqfilter,
.d_poll = ttydev_poll,
.d_mmap = ttydev_mmap,
.d_name = "ttydev",
.d_flags = D_TTY,
};
/*
* Init/lock-state devices
*/
static int
ttyil_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
{
struct tty *tp = dev->si_drv1;
int error = 0;
tty_lock(tp);
if (tty_gone(tp))
error = ENODEV;
tty_unlock(tp);
return (error);
}
static int
ttyil_close(struct cdev *dev, int flag, int mode, struct thread *td)
{
return (0);
}
static int
ttyil_rdwr(struct cdev *dev, struct uio *uio, int ioflag)
{
return (ENODEV);
}
static int
ttyil_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag,
struct thread *td)
{
struct tty *tp = dev->si_drv1;
int error;
tty_lock(tp);
if (tty_gone(tp)) {
error = ENODEV;
goto done;
}
error = ttydevsw_cioctl(tp, dev2unit(dev), cmd, data, td);
if (error != ENOIOCTL)
goto done;
error = 0;
switch (cmd) {
case TIOCGETA:
/* Obtain terminal flags through tcgetattr(). */
*(struct termios*)data = *(struct termios*)dev->si_drv2;
break;
case TIOCSETA:
/* Set terminal flags through tcsetattr(). */
error = priv_check(td, PRIV_TTY_SETA);
if (error)
break;
*(struct termios*)dev->si_drv2 = *(struct termios*)data;
break;
case TIOCGETD:
*(int *)data = TTYDISC;
break;
case TIOCGWINSZ:
bzero(data, sizeof(struct winsize));
break;
default:
error = ENOTTY;
}
done: tty_unlock(tp);
return (error);
}
static struct cdevsw ttyil_cdevsw = {
.d_version = D_VERSION,
.d_open = ttyil_open,
.d_close = ttyil_close,
.d_read = ttyil_rdwr,
.d_write = ttyil_rdwr,
.d_ioctl = ttyil_ioctl,
.d_name = "ttyil",
.d_flags = D_TTY,
};
static void
tty_init_termios(struct tty *tp)
{
struct termios *t = &tp->t_termios_init_in;
t->c_cflag = TTYDEF_CFLAG;
t->c_iflag = TTYDEF_IFLAG;
t->c_lflag = TTYDEF_LFLAG;
t->c_oflag = TTYDEF_OFLAG;
t->c_ispeed = TTYDEF_SPEED;
t->c_ospeed = TTYDEF_SPEED;
memcpy(&t->c_cc, ttydefchars, sizeof ttydefchars);
tp->t_termios_init_out = *t;
}
void
tty_init_console(struct tty *tp, speed_t s)
{
struct termios *ti = &tp->t_termios_init_in;
struct termios *to = &tp->t_termios_init_out;
if (s != 0) {
ti->c_ispeed = ti->c_ospeed = s;
to->c_ispeed = to->c_ospeed = s;
}
ti->c_cflag |= CLOCAL;
to->c_cflag |= CLOCAL;
}
/*
* Standard device routine implementations, mostly meant for
* pseudo-terminal device drivers. When a driver creates a new terminal
* device class, missing routines are patched.
*/
static int
ttydevsw_defopen(struct tty *tp)
{
return (0);
}
static void
ttydevsw_defclose(struct tty *tp)
{
}
static void
ttydevsw_defoutwakeup(struct tty *tp)
{
panic("Terminal device has output, while not implemented");
}
static void
ttydevsw_definwakeup(struct tty *tp)
{
}
static int
ttydevsw_defioctl(struct tty *tp, u_long cmd, caddr_t data, struct thread *td)
{
return (ENOIOCTL);
}
static int
ttydevsw_defcioctl(struct tty *tp, int unit, u_long cmd, caddr_t data, struct thread *td)
{
return (ENOIOCTL);
}
static int
ttydevsw_defparam(struct tty *tp, struct termios *t)
{
/*
* Allow the baud rate to be adjusted for pseudo-devices, but cap
* it at 115200 to prevent excessive buffer usage. Also disallow 0,
* to prevent foot shooting.
*/
if (t->c_ispeed < B50)
t->c_ispeed = B50;
else if (t->c_ispeed > B115200)
t->c_ispeed = B115200;
if (t->c_ospeed < B50)
t->c_ospeed = B50;
else if (t->c_ospeed > B115200)
t->c_ospeed = B115200;
t->c_cflag |= CREAD;
return (0);
}
static int
ttydevsw_defmodem(struct tty *tp, int sigon, int sigoff)
{
/* Simulate a carrier to make the TTY layer happy. */
return (SER_DCD);
}
static int
ttydevsw_defmmap(struct tty *tp, vm_ooffset_t offset, vm_paddr_t *paddr,
int nprot, vm_memattr_t *memattr)
{
return (-1);
}
static void
ttydevsw_defpktnotify(struct tty *tp, char event)
{
}
static void
ttydevsw_deffree(void *softc)
{
panic("Terminal device freed without a free-handler");
}
/*
* TTY allocation and deallocation. A TTY device can be deallocated when
* the driver no longer uses it, when the TTY isn't a session's
* controlling TTY, and when the device node isn't opened through devfs.
*/
struct tty *
tty_alloc(struct ttydevsw *tsw, void *sc)
{
return (tty_alloc_mutex(tsw, sc, NULL));
}
struct tty *
tty_alloc_mutex(struct ttydevsw *tsw, void *sc, struct mtx *mutex)
{
struct tty *tp;
/* Make sure the driver defines all routines. */
#define PATCH_FUNC(x) do { \
if (tsw->tsw_ ## x == NULL) \
tsw->tsw_ ## x = ttydevsw_def ## x; \
} while (0)
PATCH_FUNC(open);
PATCH_FUNC(close);
PATCH_FUNC(outwakeup);
PATCH_FUNC(inwakeup);
PATCH_FUNC(ioctl);
PATCH_FUNC(cioctl);
PATCH_FUNC(param);
PATCH_FUNC(modem);
PATCH_FUNC(mmap);
PATCH_FUNC(pktnotify);
PATCH_FUNC(free);
#undef PATCH_FUNC
tp = malloc(sizeof(struct tty), M_TTY, M_WAITOK|M_ZERO);
tp->t_devsw = tsw;
tp->t_devswsoftc = sc;
tp->t_flags = tsw->tsw_flags;
tty_init_termios(tp);
cv_init(&tp->t_inwait, "ttyin");
cv_init(&tp->t_outwait, "ttyout");
cv_init(&tp->t_outserwait, "ttyosr");
cv_init(&tp->t_bgwait, "ttybg");
cv_init(&tp->t_dcdwait, "ttydcd");
/* Allow drivers to use a custom mutex to lock the TTY. */
if (mutex != NULL) {
tp->t_mtx = mutex;
} else {
tp->t_mtx = &tp->t_mtxobj;
mtx_init(&tp->t_mtxobj, "ttymtx", NULL, MTX_DEF);
}
knlist_init_mtx(&tp->t_inpoll.si_note, tp->t_mtx);
knlist_init_mtx(&tp->t_outpoll.si_note, tp->t_mtx);
sx_xlock(&tty_list_sx);
TAILQ_INSERT_TAIL(&tty_list, tp, t_list);
tty_list_count++;
sx_xunlock(&tty_list_sx);
return (tp);
}
static void
tty_dealloc(void *arg)
{
struct tty *tp = arg;
sx_xlock(&tty_list_sx);
TAILQ_REMOVE(&tty_list, tp, t_list);
tty_list_count--;
sx_xunlock(&tty_list_sx);
/* Make sure we haven't leaked buffers. */
MPASS(ttyinq_getsize(&tp->t_inq) == 0);
MPASS(ttyoutq_getsize(&tp->t_outq) == 0);
seldrain(&tp->t_inpoll);
seldrain(&tp->t_outpoll);
knlist_destroy(&tp->t_inpoll.si_note);
knlist_destroy(&tp->t_outpoll.si_note);
cv_destroy(&tp->t_inwait);
cv_destroy(&tp->t_outwait);
cv_destroy(&tp->t_bgwait);
cv_destroy(&tp->t_dcdwait);
cv_destroy(&tp->t_outserwait);
if (tp->t_mtx == &tp->t_mtxobj)
mtx_destroy(&tp->t_mtxobj);
ttydevsw_free(tp);
free(tp, M_TTY);
}
static void
tty_rel_free(struct tty *tp)
{
struct cdev *dev;
tty_lock_assert(tp, MA_OWNED);
#define TF_ACTIVITY (TF_GONE|TF_OPENED|TF_HOOK|TF_OPENCLOSE)
if (tp->t_sessioncnt != 0 || (tp->t_flags & TF_ACTIVITY) != TF_GONE) {
/* TTY is still in use. */
tty_unlock(tp);
return;
}
/* TTY can be deallocated. */
dev = tp->t_dev;
tp->t_dev = NULL;
tty_unlock(tp);
if (dev != NULL)
destroy_dev_sched_cb(dev, tty_dealloc, tp);
}
void
tty_rel_pgrp(struct tty *tp, struct pgrp *pg)
{
MPASS(tp->t_sessioncnt > 0);
tty_lock_assert(tp, MA_OWNED);
if (tp->t_pgrp == pg)
tp->t_pgrp = NULL;
tty_unlock(tp);
}
void
tty_rel_sess(struct tty *tp, struct session *sess)
{
MPASS(tp->t_sessioncnt > 0);
/* Current session has left. */
if (tp->t_session == sess) {
tp->t_session = NULL;
MPASS(tp->t_pgrp == NULL);
}
tp->t_sessioncnt--;
tty_rel_free(tp);
}
void
tty_rel_gone(struct tty *tp)
{
MPASS(!tty_gone(tp));
/* Simulate carrier removal. */
ttydisc_modem(tp, 0);
/* Wake up all blocked threads. */
tty_wakeup(tp, FREAD|FWRITE);
cv_broadcast(&tp->t_bgwait);
cv_broadcast(&tp->t_dcdwait);
tp->t_flags |= TF_GONE;
tty_rel_free(tp);
}
/*
* Exposing information about current TTYs through sysctl
*/
static void
tty_to_xtty(struct tty *tp, struct xtty *xt)
{
tty_lock_assert(tp, MA_OWNED);
xt->xt_size = sizeof(struct xtty);
xt->xt_insize = ttyinq_getsize(&tp->t_inq);
xt->xt_incc = ttyinq_bytescanonicalized(&tp->t_inq);
xt->xt_inlc = ttyinq_bytesline(&tp->t_inq);
xt->xt_inlow = tp->t_inlow;
xt->xt_outsize = ttyoutq_getsize(&tp->t_outq);
xt->xt_outcc = ttyoutq_bytesused(&tp->t_outq);
xt->xt_outlow = tp->t_outlow;
xt->xt_column = tp->t_column;
xt->xt_pgid = tp->t_pgrp ? tp->t_pgrp->pg_id : 0;
xt->xt_sid = tp->t_session ? tp->t_session->s_sid : 0;
xt->xt_flags = tp->t_flags;
xt->xt_dev = tp->t_dev ? dev2udev(tp->t_dev) : NODEV;
}
static int
sysctl_kern_ttys(SYSCTL_HANDLER_ARGS)
{
unsigned long lsize;
struct xtty *xtlist, *xt;
struct tty *tp;
int error;
sx_slock(&tty_list_sx);
lsize = tty_list_count * sizeof(struct xtty);
if (lsize == 0) {
sx_sunlock(&tty_list_sx);
return (0);
}
xtlist = xt = malloc(lsize, M_TTY, M_WAITOK);
TAILQ_FOREACH(tp, &tty_list, t_list) {
tty_lock(tp);
tty_to_xtty(tp, xt);
tty_unlock(tp);
xt++;
}
sx_sunlock(&tty_list_sx);
error = SYSCTL_OUT(req, xtlist, lsize);
free(xtlist, M_TTY);
return (error);
}
SYSCTL_PROC(_kern, OID_AUTO, ttys, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE,
0, 0, sysctl_kern_ttys, "S,xtty", "List of TTYs");
/*
* Device node creation. The device has been set up; now we can expose it
* to the user.
*/
void
tty_makedev(struct tty *tp, struct ucred *cred, const char *fmt, ...)
{
va_list ap;
struct cdev *dev;
const char *prefix = "tty";
char name[SPECNAMELEN - 3]; /* for "tty" and "cua". */
uid_t uid;
gid_t gid;
mode_t mode;
/* Remove "tty" prefix from devices like PTY's. */
if (tp->t_flags & TF_NOPREFIX)
prefix = "";
va_start(ap, fmt);
vsnrprintf(name, sizeof name, 32, fmt, ap);
va_end(ap);
if (cred == NULL) {
/* System device. */
uid = UID_ROOT;
gid = GID_WHEEL;
mode = S_IRUSR|S_IWUSR;
} else {
/* User device. */
uid = cred->cr_ruid;
gid = GID_TTY;
mode = S_IRUSR|S_IWUSR|S_IWGRP;
}
/* Master call-in device. */
dev = make_dev_cred(&ttydev_cdevsw, 0, cred,
uid, gid, mode, "%s%s", prefix, name);
dev->si_drv1 = tp;
tp->t_dev = dev;
/* Slave call-in devices. */
if (tp->t_flags & TF_INITLOCK) {
dev = make_dev_cred(&ttyil_cdevsw, TTYUNIT_INIT, cred,
uid, gid, mode, "%s%s.init", prefix, name);
dev_depends(tp->t_dev, dev);
dev->si_drv1 = tp;
dev->si_drv2 = &tp->t_termios_init_in;
dev = make_dev_cred(&ttyil_cdevsw, TTYUNIT_LOCK, cred,
uid, gid, mode, "%s%s.lock", prefix, name);
dev_depends(tp->t_dev, dev);
dev->si_drv1 = tp;
dev->si_drv2 = &tp->t_termios_lock_in;
}
/* Call-out devices. */
if (tp->t_flags & TF_CALLOUT) {
dev = make_dev_cred(&ttydev_cdevsw, TTYUNIT_CALLOUT, cred,
UID_UUCP, GID_DIALER, 0660, "cua%s", name);
dev_depends(tp->t_dev, dev);
dev->si_drv1 = tp;
/* Slave call-out devices. */
if (tp->t_flags & TF_INITLOCK) {
dev = make_dev_cred(&ttyil_cdevsw,
TTYUNIT_CALLOUT | TTYUNIT_INIT, cred,
UID_UUCP, GID_DIALER, 0660, "cua%s.init", name);
dev_depends(tp->t_dev, dev);
dev->si_drv1 = tp;
dev->si_drv2 = &tp->t_termios_init_out;
dev = make_dev_cred(&ttyil_cdevsw,
TTYUNIT_CALLOUT | TTYUNIT_LOCK, cred,
UID_UUCP, GID_DIALER, 0660, "cua%s.lock", name);
dev_depends(tp->t_dev, dev);
dev->si_drv1 = tp;
dev->si_drv2 = &tp->t_termios_lock_out;
}
}
}
/*
* Signalling processes.
*/
void
tty_signal_sessleader(struct tty *tp, int sig)
{
struct proc *p;
tty_lock_assert(tp, MA_OWNED);
MPASS(sig >= 1 && sig < NSIG);
/* Make signals start output again. */
tp->t_flags &= ~TF_STOPPED;
if (tp->t_session != NULL && tp->t_session->s_leader != NULL) {
p = tp->t_session->s_leader;
PROC_LOCK(p);
- psignal(p, sig);
+ kern_psignal(p, sig);
PROC_UNLOCK(p);
}
}
void
tty_signal_pgrp(struct tty *tp, int sig)
{
ksiginfo_t ksi;
tty_lock_assert(tp, MA_OWNED);
MPASS(sig >= 1 && sig < NSIG);
/* Make signals start output again. */
tp->t_flags &= ~TF_STOPPED;
if (sig == SIGINFO && !(tp->t_termios.c_lflag & NOKERNINFO))
tty_info(tp);
if (tp->t_pgrp != NULL) {
ksiginfo_init(&ksi);
ksi.ksi_signo = sig;
ksi.ksi_code = SI_KERNEL;
PGRP_LOCK(tp->t_pgrp);
pgsignal(tp->t_pgrp, sig, 1, &ksi);
PGRP_UNLOCK(tp->t_pgrp);
}
}
void
tty_wakeup(struct tty *tp, int flags)
{
if (tp->t_flags & TF_ASYNC && tp->t_sigio != NULL)
pgsigio(&tp->t_sigio, SIGIO, (tp->t_session != NULL));
if (flags & FWRITE) {
cv_broadcast(&tp->t_outwait);
selwakeup(&tp->t_outpoll);
KNOTE_LOCKED(&tp->t_outpoll.si_note, 0);
}
if (flags & FREAD) {
cv_broadcast(&tp->t_inwait);
selwakeup(&tp->t_inpoll);
KNOTE_LOCKED(&tp->t_inpoll.si_note, 0);
}
}
int
tty_wait(struct tty *tp, struct cv *cv)
{
int error;
int revokecnt = tp->t_revokecnt;
tty_lock_assert(tp, MA_OWNED|MA_NOTRECURSED);
MPASS(!tty_gone(tp));
error = cv_wait_sig(cv, tp->t_mtx);
/* Restart the system call when we may have been revoked. */
if (tp->t_revokecnt != revokecnt)
return (ERESTART);
/* Bail out when the device slipped away. */
if (tty_gone(tp))
return (ENXIO);
return (error);
}
int
tty_timedwait(struct tty *tp, struct cv *cv, int hz)
{
int error;
int revokecnt = tp->t_revokecnt;
tty_lock_assert(tp, MA_OWNED|MA_NOTRECURSED);
MPASS(!tty_gone(tp));
error = cv_timedwait_sig(cv, tp->t_mtx, hz);
/* Restart the system call when we may have been revoked. */
if (tp->t_revokecnt != revokecnt)
return (ERESTART);
/* Bail out when the device slipped away. */
if (tty_gone(tp))
return (ENXIO);
return (error);
}
void
tty_flush(struct tty *tp, int flags)
{
if (flags & FWRITE) {
tp->t_flags &= ~TF_HIWAT_OUT;
ttyoutq_flush(&tp->t_outq);
tty_wakeup(tp, FWRITE);
ttydevsw_pktnotify(tp, TIOCPKT_FLUSHWRITE);
}
if (flags & FREAD) {
tty_hiwat_in_unblock(tp);
ttyinq_flush(&tp->t_inq);
ttydevsw_inwakeup(tp);
ttydevsw_pktnotify(tp, TIOCPKT_FLUSHREAD);
}
}
static int
tty_generic_ioctl(struct tty *tp, u_long cmd, void *data, int fflag,
struct thread *td)
{
int error;
switch (cmd) {
/*
* Modem commands.
* The SER_* and TIOCM_* flags are the same, but one bit
* shifted. I don't know why.
*/
case TIOCSDTR:
ttydevsw_modem(tp, SER_DTR, 0);
return (0);
case TIOCCDTR:
ttydevsw_modem(tp, 0, SER_DTR);
return (0);
case TIOCMSET: {
int bits = *(int *)data;
ttydevsw_modem(tp,
(bits & (TIOCM_DTR | TIOCM_RTS)) >> 1,
((~bits) & (TIOCM_DTR | TIOCM_RTS)) >> 1);
return (0);
}
case TIOCMBIS: {
int bits = *(int *)data;
ttydevsw_modem(tp, (bits & (TIOCM_DTR | TIOCM_RTS)) >> 1, 0);
return (0);
}
case TIOCMBIC: {
int bits = *(int *)data;
ttydevsw_modem(tp, 0, (bits & (TIOCM_DTR | TIOCM_RTS)) >> 1);
return (0);
}
case TIOCMGET:
*(int *)data = TIOCM_LE + (ttydevsw_modem(tp, 0, 0) << 1);
return (0);
case FIOASYNC:
if (*(int *)data)
tp->t_flags |= TF_ASYNC;
else
tp->t_flags &= ~TF_ASYNC;
return (0);
case FIONBIO:
/* This device supports non-blocking operation. */
return (0);
case FIONREAD:
*(int *)data = ttyinq_bytescanonicalized(&tp->t_inq);
return (0);
case FIONWRITE:
case TIOCOUTQ:
*(int *)data = ttyoutq_bytesused(&tp->t_outq);
return (0);
case FIOSETOWN:
if (tp->t_session != NULL && !tty_is_ctty(tp, td->td_proc))
/* Not allowed to set ownership. */
return (ENOTTY);
/* Temporarily unlock the TTY to set ownership. */
tty_unlock(tp);
error = fsetown(*(int *)data, &tp->t_sigio);
tty_lock(tp);
return (error);
case FIOGETOWN:
if (tp->t_session != NULL && !tty_is_ctty(tp, td->td_proc))
/* Not allowed to get ownership. */
return (ENOTTY);
/* Get ownership. */
*(int *)data = fgetown(&tp->t_sigio);
return (0);
case TIOCGETA:
/* Obtain terminal flags through tcgetattr(). */
*(struct termios*)data = tp->t_termios;
return (0);
case TIOCSETA:
case TIOCSETAW:
case TIOCSETAF: {
struct termios *t = data;
/*
* Who makes up these funny rules? According to POSIX,
* input baud rate is set equal to the output baud rate
* when zero.
*/
if (t->c_ispeed == 0)
t->c_ispeed = t->c_ospeed;
/* Discard any unsupported bits. */
t->c_iflag &= TTYSUP_IFLAG;
t->c_oflag &= TTYSUP_OFLAG;
t->c_lflag &= TTYSUP_LFLAG;
t->c_cflag &= TTYSUP_CFLAG;
/* Set terminal flags through tcsetattr(). */
if (cmd == TIOCSETAW || cmd == TIOCSETAF) {
error = tty_drain(tp);
if (error)
return (error);
if (cmd == TIOCSETAF)
tty_flush(tp, FREAD);
}
/*
* Only call param() when the flags really change.
*/
if ((t->c_cflag & CIGNORE) == 0 &&
(tp->t_termios.c_cflag != t->c_cflag ||
tp->t_termios.c_ispeed != t->c_ispeed ||
tp->t_termios.c_ospeed != t->c_ospeed)) {
error = ttydevsw_param(tp, t);
if (error)
return (error);
/* XXX: CLOCAL? */
tp->t_termios.c_cflag = t->c_cflag & ~CIGNORE;
tp->t_termios.c_ispeed = t->c_ispeed;
tp->t_termios.c_ospeed = t->c_ospeed;
/* Baud rate has changed - update watermarks. */
tty_watermarks(tp);
}
/* Copy new non-device driver parameters. */
tp->t_termios.c_iflag = t->c_iflag;
tp->t_termios.c_oflag = t->c_oflag;
tp->t_termios.c_lflag = t->c_lflag;
memcpy(&tp->t_termios.c_cc, t->c_cc, sizeof t->c_cc);
ttydisc_optimize(tp);
if ((t->c_lflag & ICANON) == 0) {
/*
* When in non-canonical mode, wake up all
* readers. Canonicalize any partial input. VMIN
* and VTIME could also be adjusted.
*/
ttyinq_canonicalize(&tp->t_inq);
tty_wakeup(tp, FREAD);
}
/*
* For packet mode: notify the PTY consumer that VSTOP
* and VSTART may have been changed.
*/
if (tp->t_termios.c_iflag & IXON &&
tp->t_termios.c_cc[VSTOP] == CTRL('S') &&
tp->t_termios.c_cc[VSTART] == CTRL('Q'))
ttydevsw_pktnotify(tp, TIOCPKT_DOSTOP);
else
ttydevsw_pktnotify(tp, TIOCPKT_NOSTOP);
return (0);
}
case TIOCGETD:
/* For compatibility - we only support TTYDISC. */
*(int *)data = TTYDISC;
return (0);
case TIOCGPGRP:
if (!tty_is_ctty(tp, td->td_proc))
return (ENOTTY);
if (tp->t_pgrp != NULL)
*(int *)data = tp->t_pgrp->pg_id;
else
*(int *)data = NO_PID;
return (0);
case TIOCGSID:
if (!tty_is_ctty(tp, td->td_proc))
return (ENOTTY);
MPASS(tp->t_session);
*(int *)data = tp->t_session->s_sid;
return (0);
case TIOCSCTTY: {
struct proc *p = td->td_proc;
/* XXX: This looks awful. */
tty_unlock(tp);
sx_xlock(&proctree_lock);
tty_lock(tp);
if (!SESS_LEADER(p)) {
/* Only the session leader may do this. */
sx_xunlock(&proctree_lock);
return (EPERM);
}
if (tp->t_session != NULL && tp->t_session == p->p_session) {
/* This is already our controlling TTY. */
sx_xunlock(&proctree_lock);
return (0);
}
if (p->p_session->s_ttyp != NULL ||
(tp->t_session != NULL && tp->t_session->s_ttyvp != NULL &&
tp->t_session->s_ttyvp->v_type != VBAD)) {
/*
* There is already a relation between a TTY and
* a session, or the caller is not the session
* leader.
*
* Allow the TTY to be stolen when the vnode is
* invalid, but the reference to the TTY is
* still active. This allows immediate reuse of
* TTYs of which the session leader has been
* killed or the TTY revoked.
*/
sx_xunlock(&proctree_lock);
return (EPERM);
}
/* Connect the session to the TTY. */
tp->t_session = p->p_session;
tp->t_session->s_ttyp = tp;
tp->t_sessioncnt++;
sx_xunlock(&proctree_lock);
/* Assign foreground process group. */
tp->t_pgrp = p->p_pgrp;
PROC_LOCK(p);
p->p_flag |= P_CONTROLT;
PROC_UNLOCK(p);
return (0);
}
case TIOCSPGRP: {
struct pgrp *pg;
/*
* XXX: Temporarily unlock the TTY to locate the process
* group. This code would be a lot nicer if we ever
* decomposed proctree_lock.
*/
tty_unlock(tp);
sx_slock(&proctree_lock);
pg = pgfind(*(int *)data);
if (pg != NULL)
PGRP_UNLOCK(pg);
if (pg == NULL || pg->pg_session != td->td_proc->p_session) {
sx_sunlock(&proctree_lock);
tty_lock(tp);
return (EPERM);
}
tty_lock(tp);
/*
* Determine if this TTY is the controlling TTY after
* relocking the TTY.
*/
if (!tty_is_ctty(tp, td->td_proc)) {
sx_sunlock(&proctree_lock);
return (ENOTTY);
}
tp->t_pgrp = pg;
sx_sunlock(&proctree_lock);
/* Wake up the background process groups. */
cv_broadcast(&tp->t_bgwait);
return (0);
}
case TIOCFLUSH: {
int flags = *(int *)data;
if (flags == 0)
flags = (FREAD|FWRITE);
else
flags &= (FREAD|FWRITE);
tty_flush(tp, flags);
return (0);
}
case TIOCDRAIN:
/* Drain TTY output. */
return tty_drain(tp);
case TIOCCONS:
/* Set terminal as console TTY. */
if (*(int *)data) {
error = priv_check(td, PRIV_TTY_CONSOLE);
if (error)
return (error);
/*
* XXX: constty should really need to be locked!
* XXX: allow disconnected constty's to be stolen!
*/
if (constty == tp)
return (0);
if (constty != NULL)
return (EBUSY);
tty_unlock(tp);
constty_set(tp);
tty_lock(tp);
} else if (constty == tp) {
constty_clear();
}
return (0);
case TIOCGWINSZ:
/* Obtain window size. */
*(struct winsize*)data = tp->t_winsize;
return (0);
case TIOCSWINSZ:
/* Set window size. */
if (bcmp(&tp->t_winsize, data, sizeof(struct winsize)) == 0)
return (0);
tp->t_winsize = *(struct winsize*)data;
tty_signal_pgrp(tp, SIGWINCH);
return (0);
case TIOCEXCL:
tp->t_flags |= TF_EXCLUDE;
return (0);
case TIOCNXCL:
tp->t_flags &= ~TF_EXCLUDE;
return (0);
case TIOCSTOP:
tp->t_flags |= TF_STOPPED;
ttydevsw_pktnotify(tp, TIOCPKT_STOP);
return (0);
case TIOCSTART:
tp->t_flags &= ~TF_STOPPED;
ttydevsw_outwakeup(tp);
ttydevsw_pktnotify(tp, TIOCPKT_START);
return (0);
case TIOCSTAT:
tty_info(tp);
return (0);
case TIOCSTI:
if ((fflag & FREAD) == 0 && priv_check(td, PRIV_TTY_STI))
return (EPERM);
if (!tty_is_ctty(tp, td->td_proc) &&
priv_check(td, PRIV_TTY_STI))
return (EACCES);
ttydisc_rint(tp, *(char *)data, 0);
ttydisc_rint_done(tp);
return (0);
}
#ifdef COMPAT_43TTY
return tty_ioctl_compat(tp, cmd, data, fflag, td);
#else /* !COMPAT_43TTY */
return (ENOIOCTL);
#endif /* COMPAT_43TTY */
}
int
tty_ioctl(struct tty *tp, u_long cmd, void *data, int fflag, struct thread *td)
{
int error;
tty_lock_assert(tp, MA_OWNED);
if (tty_gone(tp))
return (ENXIO);
error = ttydevsw_ioctl(tp, cmd, data, td);
if (error == ENOIOCTL)
error = tty_generic_ioctl(tp, cmd, data, fflag, td);
return (error);
}
dev_t
tty_udev(struct tty *tp)
{
if (tp->t_dev)
return dev2udev(tp->t_dev);
else
return NODEV;
}
int
tty_checkoutq(struct tty *tp)
{
/* 256 bytes should be enough to print a log message. */
return (ttyoutq_bytesleft(&tp->t_outq) >= 256);
}
void
tty_hiwat_in_block(struct tty *tp)
{
if ((tp->t_flags & TF_HIWAT_IN) == 0 &&
tp->t_termios.c_iflag & IXOFF &&
tp->t_termios.c_cc[VSTOP] != _POSIX_VDISABLE) {
/*
* Input flow control. Only enter the high watermark when we
* can successfully store the VSTOP character.
*/
if (ttyoutq_write_nofrag(&tp->t_outq,
&tp->t_termios.c_cc[VSTOP], 1) == 0)
tp->t_flags |= TF_HIWAT_IN;
} else {
/* No input flow control. */
tp->t_flags |= TF_HIWAT_IN;
}
}
void
tty_hiwat_in_unblock(struct tty *tp)
{
if (tp->t_flags & TF_HIWAT_IN &&
tp->t_termios.c_iflag & IXOFF &&
tp->t_termios.c_cc[VSTART] != _POSIX_VDISABLE) {
/*
* Input flow control. Only leave the high watermark when we
* can successfully store the VSTART character.
*/
if (ttyoutq_write_nofrag(&tp->t_outq,
&tp->t_termios.c_cc[VSTART], 1) == 0)
tp->t_flags &= ~TF_HIWAT_IN;
} else {
/* No input flow control. */
tp->t_flags &= ~TF_HIWAT_IN;
}
if (!tty_gone(tp))
ttydevsw_inwakeup(tp);
}
/*
* TTY hooks interface.
*/
static int
ttyhook_defrint(struct tty *tp, char c, int flags)
{
if (ttyhook_rint_bypass(tp, &c, 1) != 1)
return (-1);
return (0);
}
int
ttyhook_register(struct tty **rtp, struct proc *p, int fd,
struct ttyhook *th, void *softc)
{
struct tty *tp;
struct file *fp;
#ifdef CAPABILITIES
struct file *fp_cap;
#endif
struct cdev *dev;
struct cdevsw *cdp;
struct filedesc *fdp;
int error, ref;
/* Validate the file descriptor. */
if ((fdp = p->p_fd) == NULL)
return (EBADF);
fp = fget_unlocked(fdp, fd);
if (fp == NULL)
return (EBADF);
if (fp->f_ops == &badfileops) {
error = EBADF;
goto done1;
}
#ifdef CAPABILITIES
fp_cap = fp;
error = cap_funwrap(fp_cap, CAP_TTYHOOK, &fp);
if (error)
return (error);
#endif
/*
* Make sure the vnode is bound to a character device.
* An unlocked check of the vnode type is OK here, because we
* only need to prevent calling devvn_refthread() on a file that
* has never been opened over a character device.
*/
if (fp->f_type != DTYPE_VNODE || fp->f_vnode->v_type != VCHR) {
error = EINVAL;
goto done1;
}
/* Make sure it is a TTY. */
cdp = devvn_refthread(fp->f_vnode, &dev, &ref);
if (cdp == NULL) {
error = ENXIO;
goto done1;
}
if (dev != fp->f_data) {
error = ENXIO;
goto done2;
}
if (cdp != &ttydev_cdevsw) {
error = ENOTTY;
goto done2;
}
tp = dev->si_drv1;
/* Try to attach the hook to the TTY. */
error = EBUSY;
tty_lock(tp);
MPASS((tp->t_hook == NULL) == ((tp->t_flags & TF_HOOK) == 0));
if (tp->t_flags & TF_HOOK)
goto done3;
tp->t_flags |= TF_HOOK;
tp->t_hook = th;
tp->t_hooksoftc = softc;
*rtp = tp;
error = 0;
/* Maybe we can switch into bypass mode now. */
ttydisc_optimize(tp);
/* Silently convert rint() calls to rint_bypass() when possible. */
if (!ttyhook_hashook(tp, rint) && ttyhook_hashook(tp, rint_bypass))
th->th_rint = ttyhook_defrint;
done3: tty_unlock(tp);
done2: dev_relthread(dev, ref);
done1: fdrop(fp, curthread);
return (error);
}
void
ttyhook_unregister(struct tty *tp)
{
tty_lock_assert(tp, MA_OWNED);
MPASS(tp->t_flags & TF_HOOK);
/* Disconnect the hook. */
tp->t_flags &= ~TF_HOOK;
tp->t_hook = NULL;
/* Maybe we need to leave bypass mode. */
ttydisc_optimize(tp);
/* Maybe deallocate the TTY as well. */
tty_rel_free(tp);
}
/*
* /dev/console handling.
*/
static int
ttyconsdev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
{
struct tty *tp;
/* System has no console device. */
if (dev_console_filename == NULL)
return (ENXIO);
/* Look up corresponding TTY by device name. */
sx_slock(&tty_list_sx);
TAILQ_FOREACH(tp, &tty_list, t_list) {
if (strcmp(dev_console_filename, tty_devname(tp)) == 0) {
dev_console->si_drv1 = tp;
break;
}
}
sx_sunlock(&tty_list_sx);
/* System console has no TTY associated. */
if (dev_console->si_drv1 == NULL)
return (ENXIO);
return (ttydev_open(dev, oflags, devtype, td));
}
static int
ttyconsdev_write(struct cdev *dev, struct uio *uio, int ioflag)
{
log_console(uio);
return (ttydev_write(dev, uio, ioflag));
}
/*
* /dev/console is a little different from normal TTYs. When opened,
* it determines which TTY to use. When data gets written to it, it
* will be logged in the kernel message buffer.
*/
static struct cdevsw ttyconsdev_cdevsw = {
.d_version = D_VERSION,
.d_open = ttyconsdev_open,
.d_close = ttydev_close,
.d_read = ttydev_read,
.d_write = ttyconsdev_write,
.d_ioctl = ttydev_ioctl,
.d_kqfilter = ttydev_kqfilter,
.d_poll = ttydev_poll,
.d_mmap = ttydev_mmap,
.d_name = "ttyconsdev",
.d_flags = D_TTY,
};
static void
ttyconsdev_init(void *unused)
{
dev_console = make_dev_credf(MAKEDEV_ETERNAL, &ttyconsdev_cdevsw, 0,
NULL, UID_ROOT, GID_WHEEL, 0600, "console");
}
SYSINIT(tty, SI_SUB_DRIVERS, SI_ORDER_FIRST, ttyconsdev_init, NULL);
void
ttyconsdev_select(const char *name)
{
dev_console_filename = name;
}
/*
* Debugging routines.
*/
#include "opt_ddb.h"
#ifdef DDB
#include <ddb/ddb.h>
#include <ddb/db_sym.h>
static struct {
int flag;
char val;
} ttystates[] = {
#if 0
{ TF_NOPREFIX, 'N' },
#endif
{ TF_INITLOCK, 'I' },
{ TF_CALLOUT, 'C' },
/* Keep these together -> 'Oi' and 'Oo'. */
{ TF_OPENED, 'O' },
{ TF_OPENED_IN, 'i' },
{ TF_OPENED_OUT, 'o' },
{ TF_OPENED_CONS, 'c' },
{ TF_GONE, 'G' },
{ TF_OPENCLOSE, 'B' },
{ TF_ASYNC, 'Y' },
{ TF_LITERAL, 'L' },
/* Keep these together -> 'Hi' and 'Ho'. */
{ TF_HIWAT, 'H' },
{ TF_HIWAT_IN, 'i' },
{ TF_HIWAT_OUT, 'o' },
{ TF_STOPPED, 'S' },
{ TF_EXCLUDE, 'X' },
{ TF_BYPASS, 'l' },
{ TF_ZOMBIE, 'Z' },
{ TF_HOOK, 's' },
/* Keep these together -> 'bi' and 'bo'. */
{ TF_BUSY, 'b' },
{ TF_BUSY_IN, 'i' },
{ TF_BUSY_OUT, 'o' },
{ 0, '\0'},
};
#define TTY_FLAG_BITS \
"\20\1NOPREFIX\2INITLOCK\3CALLOUT\4OPENED_IN\5OPENED_OUT\6GONE" \
"\7OPENCLOSE\10ASYNC\11LITERAL\12HIWAT_IN\13HIWAT_OUT\14STOPPED" \
"\15EXCLUDE\16BYPASS\17ZOMBIE\20HOOK"
#define DB_PRINTSYM(name, addr) \
db_printf("%s " #name ": ", sep); \
db_printsym((db_addr_t) addr, DB_STGY_ANY); \
db_printf("\n");
static void
_db_show_devsw(const char *sep, const struct ttydevsw *tsw)
{
db_printf("%sdevsw: ", sep);
db_printsym((db_addr_t)tsw, DB_STGY_ANY);
db_printf(" (%p)\n", tsw);
DB_PRINTSYM(open, tsw->tsw_open);
DB_PRINTSYM(close, tsw->tsw_close);
DB_PRINTSYM(outwakeup, tsw->tsw_outwakeup);
DB_PRINTSYM(inwakeup, tsw->tsw_inwakeup);
DB_PRINTSYM(ioctl, tsw->tsw_ioctl);
DB_PRINTSYM(param, tsw->tsw_param);
DB_PRINTSYM(modem, tsw->tsw_modem);
DB_PRINTSYM(mmap, tsw->tsw_mmap);
DB_PRINTSYM(pktnotify, tsw->tsw_pktnotify);
DB_PRINTSYM(free, tsw->tsw_free);
}
static void
_db_show_hooks(const char *sep, const struct ttyhook *th)
{
db_printf("%shook: ", sep);
db_printsym((db_addr_t)th, DB_STGY_ANY);
db_printf(" (%p)\n", th);
if (th == NULL)
return;
DB_PRINTSYM(rint, th->th_rint);
DB_PRINTSYM(rint_bypass, th->th_rint_bypass);
DB_PRINTSYM(rint_done, th->th_rint_done);
DB_PRINTSYM(rint_poll, th->th_rint_poll);
DB_PRINTSYM(getc_inject, th->th_getc_inject);
DB_PRINTSYM(getc_capture, th->th_getc_capture);
DB_PRINTSYM(getc_poll, th->th_getc_poll);
DB_PRINTSYM(close, th->th_close);
}
static void
_db_show_termios(const char *name, const struct termios *t)
{
db_printf("%s: iflag 0x%x oflag 0x%x cflag 0x%x "
"lflag 0x%x ispeed %u ospeed %u\n", name,
t->c_iflag, t->c_oflag, t->c_cflag, t->c_lflag,
t->c_ispeed, t->c_ospeed);
}
/* DDB command to show TTY statistics. */
DB_SHOW_COMMAND(tty, db_show_tty)
{
struct tty *tp;
if (!have_addr) {
db_printf("usage: show tty <addr>\n");
return;
}
tp = (struct tty *)addr;
db_printf("0x%p: %s\n", tp, tty_devname(tp));
db_printf("\tmtx: %p\n", tp->t_mtx);
db_printf("\tflags: %b\n", tp->t_flags, TTY_FLAG_BITS);
db_printf("\trevokecnt: %u\n", tp->t_revokecnt);
/* Buffering mechanisms. */
db_printf("\tinq: %p begin %u linestart %u reprint %u end %u "
"nblocks %u quota %u\n", &tp->t_inq, tp->t_inq.ti_begin,
tp->t_inq.ti_linestart, tp->t_inq.ti_reprint, tp->t_inq.ti_end,
tp->t_inq.ti_nblocks, tp->t_inq.ti_quota);
db_printf("\toutq: %p begin %u end %u nblocks %u quota %u\n",
&tp->t_outq, tp->t_outq.to_begin, tp->t_outq.to_end,
tp->t_outq.to_nblocks, tp->t_outq.to_quota);
db_printf("\tinlow: %zu\n", tp->t_inlow);
db_printf("\toutlow: %zu\n", tp->t_outlow);
_db_show_termios("\ttermios", &tp->t_termios);
db_printf("\twinsize: row %u col %u xpixel %u ypixel %u\n",
tp->t_winsize.ws_row, tp->t_winsize.ws_col,
tp->t_winsize.ws_xpixel, tp->t_winsize.ws_ypixel);
db_printf("\tcolumn: %u\n", tp->t_column);
db_printf("\twritepos: %u\n", tp->t_writepos);
db_printf("\tcompatflags: 0x%x\n", tp->t_compatflags);
/* Init/lock-state devices. */
_db_show_termios("\ttermios_init_in", &tp->t_termios_init_in);
_db_show_termios("\ttermios_init_out", &tp->t_termios_init_out);
_db_show_termios("\ttermios_lock_in", &tp->t_termios_lock_in);
_db_show_termios("\ttermios_lock_out", &tp->t_termios_lock_out);
/* Hooks */
_db_show_devsw("\t", tp->t_devsw);
_db_show_hooks("\t", tp->t_hook);
/* Process info. */
db_printf("\tpgrp: %p gid %d jobc %d\n", tp->t_pgrp,
tp->t_pgrp ? tp->t_pgrp->pg_id : 0,
tp->t_pgrp ? tp->t_pgrp->pg_jobc : 0);
db_printf("\tsession: %p", tp->t_session);
if (tp->t_session != NULL)
db_printf(" count %u leader %p tty %p sid %d login %s",
tp->t_session->s_count, tp->t_session->s_leader,
tp->t_session->s_ttyp, tp->t_session->s_sid,
tp->t_session->s_login);
db_printf("\n");
db_printf("\tsessioncnt: %u\n", tp->t_sessioncnt);
db_printf("\tdevswsoftc: %p\n", tp->t_devswsoftc);
db_printf("\thooksoftc: %p\n", tp->t_hooksoftc);
db_printf("\tdev: %p\n", tp->t_dev);
}
/* DDB command to list TTYs. */
DB_SHOW_ALL_COMMAND(ttys, db_show_all_ttys)
{
struct tty *tp;
size_t isiz, osiz;
int i, j;
/* Make the output look like `pstat -t'. */
db_printf("PTR ");
#if defined(__LP64__)
db_printf(" ");
#endif
db_printf(" LINE INQ CAN LIN LOW OUTQ USE LOW "
"COL SESS PGID STATE\n");
TAILQ_FOREACH(tp, &tty_list, t_list) {
isiz = tp->t_inq.ti_nblocks * TTYINQ_DATASIZE;
osiz = tp->t_outq.to_nblocks * TTYOUTQ_DATASIZE;
db_printf("%p %10s %5zu %4u %4u %4zu %5zu %4u %4zu %5u %5d %5d ",
tp,
tty_devname(tp),
isiz,
tp->t_inq.ti_linestart - tp->t_inq.ti_begin,
tp->t_inq.ti_end - tp->t_inq.ti_linestart,
isiz - tp->t_inlow,
osiz,
tp->t_outq.to_end - tp->t_outq.to_begin,
osiz - tp->t_outlow,
MIN(tp->t_column, 99999),
tp->t_session ? tp->t_session->s_sid : 0,
tp->t_pgrp ? tp->t_pgrp->pg_id : 0);
/* Flag bits. */
for (i = j = 0; ttystates[i].flag; i++)
if (tp->t_flags & ttystates[i].flag) {
db_printf("%c", ttystates[i].val);
j++;
}
if (j == 0)
db_printf("-");
db_printf("\n");
}
}
#endif /* DDB */
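For illustration only (this sketch is not part of the commit): the ttydevsw/tty_alloc()/tty_makedev() interface shown above is what terminal drivers program against, with any callbacks they leave unset patched to the ttydevsw_def*() stubs. A minimal pseudo-driver that exposes a device and discards all output could look roughly like the following; the "nulltty" name, softc layout, and malloc type are invented for the example, and ttydisc_getc() is assumed to be the usual line-discipline helper from <sys/ttydisc.h> for draining the output queue.

/*
 * Hypothetical minimal driver sketch built on the API above; it is not
 * part of this file. It allocates a TTY, exposes /dev/ttyvnull0, and
 * throws away everything written to it.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/tty.h>
#include <sys/ttydisc.h>
static MALLOC_DEFINE(M_NULLTTY, "nulltty", "null tty example");
struct nulltty_softc {
	struct tty *nts_tty;
};
static void
nulltty_outwakeup(struct tty *tp)
{
	char buf[256];
	/* Drain and discard whatever the line discipline hands us. */
	while (ttydisc_getc(tp, buf, sizeof buf) > 0)
		continue;
}
static void
nulltty_free(void *softc)
{
	/* Called once the TTY is fully deallocated. */
	free(softc, M_NULLTTY);
}
static struct ttydevsw nulltty_class = {
	.tsw_outwakeup	= nulltty_outwakeup,
	.tsw_free	= nulltty_free,
	/* All other callbacks fall back to the ttydevsw_def*() stubs. */
};
static struct nulltty_softc *
nulltty_attach(void)
{
	struct nulltty_softc *sc;
	sc = malloc(sizeof(*sc), M_NULLTTY, M_WAITOK|M_ZERO);
	sc->nts_tty = tty_alloc(&nulltty_class, sc);
	/* The "tty" prefix is added automatically: /dev/ttyvnull0. */
	tty_makedev(sc->nts_tty, NULL, "vnull%d", 0);
	return (sc);
}
static void
nulltty_detach(struct nulltty_softc *sc)
{
	/* tty_rel_gone() wakes up sleepers and schedules deallocation. */
	tty_lock(sc->nts_tty);
	tty_rel_gone(sc->nts_tty);
}

Detach mirrors ptsdev_close() above: the TTY is locked, marked gone, and the actual teardown (including the tsw_free callback) happens once all references have drained.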
Index: head/sys/kern/tty_pts.c
===================================================================
--- head/sys/kern/tty_pts.c (revision 225616)
+++ head/sys/kern/tty_pts.c (revision 225617)
@@ -1,856 +1,856 @@
/*-
* Copyright (c) 2008 Ed Schouten <ed@FreeBSD.org>
* All rights reserved.
*
* Portions of this software were developed under sponsorship from Snow
* B.V., the Netherlands.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/* Add compatibility bits for FreeBSD. */
#define PTS_COMPAT
/* Add pty(4) compat bits. */
#define PTS_EXTERNAL
/* Add bits to make Linux binaries work. */
#define PTS_LINUX
#include <sys/param.h>
#include <sys/lock.h>
#include <sys/condvar.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/serial.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/tty.h>
#include <sys/ttycom.h>
#include <machine/stdarg.h>
/*
* Our utmp(5) format is limited to 8-byte TTY line names. This means
* we can at most allocate 1000 pseudo-terminals ("pts/999"). Allow
* users to increase this number, assuming they have manually increased
* UT_LINESIZE.
*/
static struct unrhdr *pts_pool;
static MALLOC_DEFINE(M_PTS, "pts", "pseudo tty device");
/*
* Per-PTS structure.
*
* List of locks
* (t) locked by tty_lock()
* (c) const until freeing
*/
struct pts_softc {
int pts_unit; /* (c) Device unit number. */
unsigned int pts_flags; /* (t) Device flags. */
#define PTS_PKT 0x1 /* Packet mode. */
#define PTS_FINISHED 0x2 /* Return errors on read()/write(). */
char pts_pkt; /* (t) Unread packet mode data. */
struct cv pts_inwait; /* (t) Blocking write() on master. */
struct selinfo pts_inpoll; /* (t) Select queue for write(). */
struct cv pts_outwait; /* (t) Blocking read() on master. */
struct selinfo pts_outpoll; /* (t) Select queue for read(). */
#ifdef PTS_EXTERNAL
struct cdev *pts_cdev; /* (c) Master device node. */
#endif /* PTS_EXTERNAL */
struct ucred *pts_cred; /* (c) Resource limit. */
};
/*
* Controller-side file operations.
*/
static int
ptsdev_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
int flags, struct thread *td)
{
struct tty *tp = fp->f_data;
struct pts_softc *psc = tty_softc(tp);
int error = 0;
char pkt;
if (uio->uio_resid == 0)
return (0);
tty_lock(tp);
for (;;) {
/*
* Implement packet mode. When packet mode is turned on,
* the first byte contains a bitmask of events that
* occurred (start, stop, flush, window size, etc.).
*/
if (psc->pts_flags & PTS_PKT && psc->pts_pkt) {
pkt = psc->pts_pkt;
psc->pts_pkt = 0;
tty_unlock(tp);
error = ureadc(pkt, uio);
return (error);
}
/*
* Transmit regular data.
*
* XXX: We shouldn't use ttydisc_getc_poll()! Even
* though in this implementation, there is likely going
* to be data, we should just call ttydisc_getc_uio()
* and use its return value to sleep.
*/
if (ttydisc_getc_poll(tp)) {
if (psc->pts_flags & PTS_PKT) {
/*
* XXX: Small race. Fortunately PTY
* consumers aren't multithreaded.
*/
tty_unlock(tp);
error = ureadc(TIOCPKT_DATA, uio);
if (error)
return (error);
tty_lock(tp);
}
error = ttydisc_getc_uio(tp, uio);
break;
}
/* Maybe the device isn't used anyway. */
if (psc->pts_flags & PTS_FINISHED)
break;
/* Wait for more data. */
if (fp->f_flag & O_NONBLOCK) {
error = EWOULDBLOCK;
break;
}
error = cv_wait_sig(&psc->pts_outwait, tp->t_mtx);
if (error != 0)
break;
}
tty_unlock(tp);
return (error);
}
static int
ptsdev_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
int flags, struct thread *td)
{
struct tty *tp = fp->f_data;
struct pts_softc *psc = tty_softc(tp);
char ib[256], *ibstart;
size_t iblen, rintlen;
int error = 0;
if (uio->uio_resid == 0)
return (0);
for (;;) {
ibstart = ib;
iblen = MIN(uio->uio_resid, sizeof ib);
error = uiomove(ib, iblen, uio);
tty_lock(tp);
if (error != 0) {
iblen = 0;
goto done;
}
/*
* When possible, avoid the slow path. rint_bypass()
* copies all input to the input queue at once.
*/
MPASS(iblen > 0);
do {
rintlen = ttydisc_rint_simple(tp, ibstart, iblen);
ibstart += rintlen;
iblen -= rintlen;
if (iblen == 0) {
/* All data written. */
break;
}
/* Maybe the device isn't used anyway. */
if (psc->pts_flags & PTS_FINISHED) {
error = EIO;
goto done;
}
/* Wait for more data. */
if (fp->f_flag & O_NONBLOCK) {
error = EWOULDBLOCK;
goto done;
}
/* Wake up users on the slave side. */
ttydisc_rint_done(tp);
error = cv_wait_sig(&psc->pts_inwait, tp->t_mtx);
if (error != 0)
goto done;
} while (iblen > 0);
if (uio->uio_resid == 0)
break;
tty_unlock(tp);
}
done: ttydisc_rint_done(tp);
tty_unlock(tp);
/*
* Don't account for the part of the buffer that we couldn't
* pass to the TTY.
*/
uio->uio_resid += iblen;
return (error);
}
static int
ptsdev_truncate(struct file *fp, off_t length, struct ucred *active_cred,
struct thread *td)
{
return (EINVAL);
}
static int
ptsdev_ioctl(struct file *fp, u_long cmd, void *data,
struct ucred *active_cred, struct thread *td)
{
struct tty *tp = fp->f_data;
struct pts_softc *psc = tty_softc(tp);
int error = 0, sig;
switch (cmd) {
case FIONBIO:
/* This device supports non-blocking operation. */
return (0);
case FIONREAD:
tty_lock(tp);
if (psc->pts_flags & PTS_FINISHED) {
/* Force read() to be called. */
*(int *)data = 1;
} else {
*(int *)data = ttydisc_getc_poll(tp);
}
tty_unlock(tp);
return (0);
case FIODGNAME: {
struct fiodgname_arg *fgn;
const char *p;
int i;
/* Reverse device name lookups, for ptsname() and ttyname(). */
fgn = data;
p = tty_devname(tp);
i = strlen(p) + 1;
if (i > fgn->len)
return (EINVAL);
return copyout(p, fgn->buf, i);
}
/*
* We need to implement TIOCGPGRP and TIOCGSID here again. When
* called on the pseudo-terminal master, it should not check if
* the terminal is the foreground terminal of the calling
* process.
*
* TIOCGETA is also implemented here. Various Linux PTY routines
* often call isatty(), which is implemented by tcgetattr().
*/
#ifdef PTS_LINUX
case TIOCGETA:
/* Obtain terminal flags through tcgetattr(). */
tty_lock(tp);
*(struct termios*)data = tp->t_termios;
tty_unlock(tp);
return (0);
#endif /* PTS_LINUX */
case TIOCSETAF:
case TIOCSETAW:
/*
* We must make sure we turn tcsetattr() calls of TCSAFLUSH and
* TCSADRAIN into something different. If an application would
* call TCSAFLUSH or TCSADRAIN on the master descriptor, it may
* deadlock waiting for all data to be read.
*/
cmd = TIOCSETA;
break;
#if defined(PTS_COMPAT) || defined(PTS_LINUX)
case TIOCGPTN:
/*
* Get the device unit number.
*/
if (psc->pts_unit < 0)
return (ENOTTY);
*(unsigned int *)data = psc->pts_unit;
return (0);
#endif /* PTS_COMPAT || PTS_LINUX */
case TIOCGPGRP:
/* Get the foreground process group ID. */
tty_lock(tp);
if (tp->t_pgrp != NULL)
*(int *)data = tp->t_pgrp->pg_id;
else
*(int *)data = NO_PID;
tty_unlock(tp);
return (0);
case TIOCGSID:
/* Get the session leader process ID. */
tty_lock(tp);
if (tp->t_session == NULL)
error = ENOTTY;
else
*(int *)data = tp->t_session->s_sid;
tty_unlock(tp);
return (error);
case TIOCPTMASTER:
/* Yes, we are a pseudo-terminal master. */
return (0);
case TIOCSIG:
/* Signal the foreground process group. */
sig = *(int *)data;
if (sig < 1 || sig >= NSIG)
return (EINVAL);
tty_lock(tp);
tty_signal_pgrp(tp, sig);
tty_unlock(tp);
return (0);
case TIOCPKT:
/* Enable/disable packet mode. */
tty_lock(tp);
if (*(int *)data)
psc->pts_flags |= PTS_PKT;
else
psc->pts_flags &= ~PTS_PKT;
tty_unlock(tp);
return (0);
}
/* Just redirect this ioctl to the slave device. */
tty_lock(tp);
error = tty_ioctl(tp, cmd, data, fp->f_flag, td);
tty_unlock(tp);
if (error == ENOIOCTL)
error = ENOTTY;
return (error);
}
static int
ptsdev_poll(struct file *fp, int events, struct ucred *active_cred,
struct thread *td)
{
struct tty *tp = fp->f_data;
struct pts_softc *psc = tty_softc(tp);
int revents = 0;
tty_lock(tp);
if (psc->pts_flags & PTS_FINISHED) {
/* Slave device is not opened. */
tty_unlock(tp);
return ((events & (POLLIN|POLLRDNORM)) | POLLHUP);
}
if (events & (POLLIN|POLLRDNORM)) {
/* See if we can getc something. */
if (ttydisc_getc_poll(tp) ||
(psc->pts_flags & PTS_PKT && psc->pts_pkt))
revents |= events & (POLLIN|POLLRDNORM);
}
if (events & (POLLOUT|POLLWRNORM)) {
/* See if we can rint something. */
if (ttydisc_rint_poll(tp))
revents |= events & (POLLOUT|POLLWRNORM);
}
/*
* No need to check for POLLHUP here. This device cannot be used
* as a callout device, which means we always have a carrier,
* because the master is.
*/
if (revents == 0) {
/*
* This code might look misleading, but the naming of
* poll events on this side is the opposite of the slave
* device.
*/
if (events & (POLLIN|POLLRDNORM))
selrecord(td, &psc->pts_outpoll);
if (events & (POLLOUT|POLLWRNORM))
selrecord(td, &psc->pts_inpoll);
}
tty_unlock(tp);
return (revents);
}
/*
* kqueue support.
*/
static void
pts_kqops_read_detach(struct knote *kn)
{
struct file *fp = kn->kn_fp;
struct tty *tp = fp->f_data;
struct pts_softc *psc = tty_softc(tp);
knlist_remove(&psc->pts_outpoll.si_note, kn, 0);
}
static int
pts_kqops_read_event(struct knote *kn, long hint)
{
struct file *fp = kn->kn_fp;
struct tty *tp = fp->f_data;
struct pts_softc *psc = tty_softc(tp);
if (psc->pts_flags & PTS_FINISHED) {
kn->kn_flags |= EV_EOF;
return (1);
} else {
kn->kn_data = ttydisc_getc_poll(tp);
return (kn->kn_data > 0);
}
}
static void
pts_kqops_write_detach(struct knote *kn)
{
struct file *fp = kn->kn_fp;
struct tty *tp = fp->f_data;
struct pts_softc *psc = tty_softc(tp);
knlist_remove(&psc->pts_inpoll.si_note, kn, 0);
}
static int
pts_kqops_write_event(struct knote *kn, long hint)
{
struct file *fp = kn->kn_fp;
struct tty *tp = fp->f_data;
struct pts_softc *psc = tty_softc(tp);
if (psc->pts_flags & PTS_FINISHED) {
kn->kn_flags |= EV_EOF;
return (1);
} else {
kn->kn_data = ttydisc_rint_poll(tp);
return (kn->kn_data > 0);
}
}
static struct filterops pts_kqops_read = {
.f_isfd = 1,
.f_detach = pts_kqops_read_detach,
.f_event = pts_kqops_read_event,
};
static struct filterops pts_kqops_write = {
.f_isfd = 1,
.f_detach = pts_kqops_write_detach,
.f_event = pts_kqops_write_event,
};
static int
ptsdev_kqfilter(struct file *fp, struct knote *kn)
{
struct tty *tp = fp->f_data;
struct pts_softc *psc = tty_softc(tp);
int error = 0;
tty_lock(tp);
switch (kn->kn_filter) {
case EVFILT_READ:
kn->kn_fop = &pts_kqops_read;
knlist_add(&psc->pts_outpoll.si_note, kn, 1);
break;
case EVFILT_WRITE:
kn->kn_fop = &pts_kqops_write;
knlist_add(&psc->pts_inpoll.si_note, kn, 1);
break;
default:
error = EINVAL;
break;
}
tty_unlock(tp);
return (error);
}
static int
ptsdev_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
struct thread *td)
{
struct tty *tp = fp->f_data;
#ifdef PTS_EXTERNAL
struct pts_softc *psc = tty_softc(tp);
#endif /* PTS_EXTERNAL */
struct cdev *dev = tp->t_dev;
/*
* According to POSIX, we must implement an fstat(). This also
* makes this implementation compatible with Linux binaries,
* because Linux calls fstat() on the pseudo-terminal master to
* obtain st_rdev.
*
* XXX: POSIX also mentions we must fill in st_dev, but how?
*/
bzero(sb, sizeof *sb);
#ifdef PTS_EXTERNAL
if (psc->pts_cdev != NULL)
sb->st_ino = sb->st_rdev = dev2udev(psc->pts_cdev);
else
#endif /* PTS_EXTERNAL */
sb->st_ino = sb->st_rdev = tty_udev(tp);
sb->st_atim = dev->si_atime;
sb->st_ctim = dev->si_ctime;
sb->st_mtim = dev->si_mtime;
sb->st_uid = dev->si_uid;
sb->st_gid = dev->si_gid;
sb->st_mode = dev->si_mode | S_IFCHR;
return (0);
}
static int
ptsdev_close(struct file *fp, struct thread *td)
{
struct tty *tp = fp->f_data;
/* Deallocate TTY device. */
tty_lock(tp);
tty_rel_gone(tp);
/*
* Open of /dev/ptmx or /dev/ptyXX changes the type of file
* from DTYPE_VNODE to DTYPE_PTS. vn_open() increases the vnode
* use count; we need to decrement it and possibly do other
* required cleanup.
*/
if (fp->f_vnode != NULL)
return (vnops.fo_close(fp, td));
return (0);
}
static struct fileops ptsdev_ops = {
.fo_read = ptsdev_read,
.fo_write = ptsdev_write,
.fo_truncate = ptsdev_truncate,
.fo_ioctl = ptsdev_ioctl,
.fo_poll = ptsdev_poll,
.fo_kqfilter = ptsdev_kqfilter,
.fo_stat = ptsdev_stat,
.fo_close = ptsdev_close,
.fo_chmod = invfo_chmod,
.fo_chown = invfo_chown,
.fo_flags = DFLAG_PASSABLE,
};
/*
* Driver-side hooks.
*/
static void
ptsdrv_outwakeup(struct tty *tp)
{
struct pts_softc *psc = tty_softc(tp);
cv_broadcast(&psc->pts_outwait);
selwakeup(&psc->pts_outpoll);
KNOTE_LOCKED(&psc->pts_outpoll.si_note, 0);
}
static void
ptsdrv_inwakeup(struct tty *tp)
{
struct pts_softc *psc = tty_softc(tp);
cv_broadcast(&psc->pts_inwait);
selwakeup(&psc->pts_inpoll);
KNOTE_LOCKED(&psc->pts_inpoll.si_note, 0);
}
static int
ptsdrv_open(struct tty *tp)
{
struct pts_softc *psc = tty_softc(tp);
psc->pts_flags &= ~PTS_FINISHED;
return (0);
}
static void
ptsdrv_close(struct tty *tp)
{
struct pts_softc *psc = tty_softc(tp);
/* Wake up any blocked readers/writers. */
psc->pts_flags |= PTS_FINISHED;
ptsdrv_outwakeup(tp);
ptsdrv_inwakeup(tp);
}
static void
ptsdrv_pktnotify(struct tty *tp, char event)
{
struct pts_softc *psc = tty_softc(tp);
/*
* Clear conflicting flags.
*/
switch (event) {
case TIOCPKT_STOP:
psc->pts_pkt &= ~TIOCPKT_START;
break;
case TIOCPKT_START:
psc->pts_pkt &= ~TIOCPKT_STOP;
break;
case TIOCPKT_NOSTOP:
psc->pts_pkt &= ~TIOCPKT_DOSTOP;
break;
case TIOCPKT_DOSTOP:
psc->pts_pkt &= ~TIOCPKT_NOSTOP;
break;
}
psc->pts_pkt |= event;
ptsdrv_outwakeup(tp);
}
static void
ptsdrv_free(void *softc)
{
struct pts_softc *psc = softc;
/* Make device number available again. */
if (psc->pts_unit >= 0)
free_unr(pts_pool, psc->pts_unit);
chgptscnt(psc->pts_cred->cr_ruidinfo, -1, 0);
racct_sub_cred(psc->pts_cred, RACCT_NPTS, 1);
crfree(psc->pts_cred);
seldrain(&psc->pts_inpoll);
seldrain(&psc->pts_outpoll);
knlist_destroy(&psc->pts_inpoll.si_note);
knlist_destroy(&psc->pts_outpoll.si_note);
#ifdef PTS_EXTERNAL
/* Destroy master device as well. */
if (psc->pts_cdev != NULL)
destroy_dev_sched(psc->pts_cdev);
#endif /* PTS_EXTERNAL */
free(psc, M_PTS);
}
static struct ttydevsw pts_class = {
.tsw_flags = TF_NOPREFIX,
.tsw_outwakeup = ptsdrv_outwakeup,
.tsw_inwakeup = ptsdrv_inwakeup,
.tsw_open = ptsdrv_open,
.tsw_close = ptsdrv_close,
.tsw_pktnotify = ptsdrv_pktnotify,
.tsw_free = ptsdrv_free,
};
#ifndef PTS_EXTERNAL
static
#endif /* !PTS_EXTERNAL */
int
pts_alloc(int fflags, struct thread *td, struct file *fp)
{
int unit, ok, error;
struct tty *tp;
struct pts_softc *psc;
struct proc *p = td->td_proc;
struct ucred *cred = td->td_ucred;
/* Resource limiting. */
PROC_LOCK(p);
error = racct_add(p, RACCT_NPTS, 1);
if (error != 0) {
PROC_UNLOCK(p);
return (EAGAIN);
}
ok = chgptscnt(cred->cr_ruidinfo, 1, lim_cur(p, RLIMIT_NPTS));
if (!ok) {
racct_sub(p, RACCT_NPTS, 1);
PROC_UNLOCK(p);
return (EAGAIN);
}
PROC_UNLOCK(p);
/* Try to allocate a new pts unit number. */
unit = alloc_unr(pts_pool);
if (unit < 0) {
racct_sub(p, RACCT_NPTS, 1);
chgptscnt(cred->cr_ruidinfo, -1, 0);
return (EAGAIN);
}
/* Allocate TTY and softc. */
psc = malloc(sizeof(struct pts_softc), M_PTS, M_WAITOK|M_ZERO);
cv_init(&psc->pts_inwait, "ptsin");
cv_init(&psc->pts_outwait, "ptsout");
psc->pts_unit = unit;
psc->pts_cred = crhold(cred);
tp = tty_alloc(&pts_class, psc);
knlist_init_mtx(&psc->pts_inpoll.si_note, tp->t_mtx);
knlist_init_mtx(&psc->pts_outpoll.si_note, tp->t_mtx);
/* Expose the slave device as well. */
tty_makedev(tp, td->td_ucred, "pts/%u", psc->pts_unit);
finit(fp, fflags, DTYPE_PTS, tp, &ptsdev_ops);
return (0);
}
#ifdef PTS_EXTERNAL
int
pts_alloc_external(int fflags, struct thread *td, struct file *fp,
struct cdev *dev, const char *name)
{
int ok, error;
struct tty *tp;
struct pts_softc *psc;
struct proc *p = td->td_proc;
struct ucred *cred = td->td_ucred;
/* Resource limiting. */
PROC_LOCK(p);
error = racct_add(p, RACCT_NPTS, 1);
if (error != 0) {
PROC_UNLOCK(p);
return (EAGAIN);
}
ok = chgptscnt(cred->cr_ruidinfo, 1, lim_cur(p, RLIMIT_NPTS));
if (!ok) {
racct_sub(p, RACCT_NPTS, 1);
PROC_UNLOCK(p);
return (EAGAIN);
}
PROC_UNLOCK(p);
/* Allocate TTY and softc. */
psc = malloc(sizeof(struct pts_softc), M_PTS, M_WAITOK|M_ZERO);
cv_init(&psc->pts_inwait, "ptsin");
cv_init(&psc->pts_outwait, "ptsout");
psc->pts_unit = -1;
psc->pts_cdev = dev;
psc->pts_cred = crhold(cred);
tp = tty_alloc(&pts_class, psc);
knlist_init_mtx(&psc->pts_inpoll.si_note, tp->t_mtx);
knlist_init_mtx(&psc->pts_outpoll.si_note, tp->t_mtx);
/* Expose the slave device as well. */
tty_makedev(tp, td->td_ucred, "%s", name);
finit(fp, fflags, DTYPE_PTS, tp, &ptsdev_ops);
return (0);
}
#endif /* PTS_EXTERNAL */
int
-posix_openpt(struct thread *td, struct posix_openpt_args *uap)
+sys_posix_openpt(struct thread *td, struct posix_openpt_args *uap)
{
int error, fd;
struct file *fp;
/*
* POSIX states it's unspecified when other flags are passed. We
* don't allow this.
*/
if (uap->flags & ~(O_RDWR|O_NOCTTY))
return (EINVAL);
error = falloc(td, &fp, &fd, 0);
if (error)
return (error);
/* Allocate the actual pseudo-TTY. */
error = pts_alloc(FFLAGS(uap->flags & O_ACCMODE), td, fp);
if (error != 0) {
fdclose(td->td_proc->p_fd, fp, fd, td);
return (error);
}
/* Pass it back to userspace. */
td->td_retval[0] = fd;
fdrop(fp, td);
return (0);
}
static void
pts_init(void *unused)
{
pts_pool = new_unrhdr(0, INT_MAX, NULL);
}
SYSINIT(pts, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, pts_init, NULL);
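For reference only (not part of this commit): the pts code above backs the posix_openpt(2) entry point being renamed here, and the standard userspace consumer of it follows the usual POSIX sequence of posix_openpt(), grantpt(), unlockpt(), and ptsname(). A minimal sketch:

/*
 * Hypothetical userspace example, not part of this file: allocate a
 * pseudo-terminal master via posix_openpt(2) and open its slave side.
 */
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
int
main(void)
{
	int mfd, sfd;
	const char *sname;
	mfd = posix_openpt(O_RDWR | O_NOCTTY);	/* ends up in pts_alloc() */
	if (mfd == -1)
		err(1, "posix_openpt");
	if (grantpt(mfd) == -1 || unlockpt(mfd) == -1)
		err(1, "grantpt/unlockpt");
	sname = ptsname(mfd);			/* e.g. /dev/pts/0 */
	printf("slave: %s\n", sname);
	sfd = open(sname, O_RDWR);		/* opens the slave device */
	if (sfd == -1)
		err(1, "open");
	/* mfd and sfd are now connected through the TTY layer. */
	write(sfd, "hello\n", 6);
	close(sfd);
	close(mfd);
	return (0);
}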
Index: head/sys/kern/uipc_mqueue.c
===================================================================
--- head/sys/kern/uipc_mqueue.c (revision 225616)
+++ head/sys/kern/uipc_mqueue.c (revision 225617)
@@ -1,2836 +1,2836 @@
/*-
* Copyright (c) 2005 David Xu <davidxu@freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
/*
* POSIX message queue implementation.
*
* 1) An mqueue filesystem can be mounted. Each message queue appears
* in the mounted directory, and the user can change a queue's
* permissions and ownership, or remove a queue. Manually creating a
* file in the directory causes a message queue to be created in the
* kernel with default message queue attributes and the same name;
* this method is not recommended, since the mq_open syscall allows
* the user to specify different attributes. The file system can also
* be mounted multiple times at different mount points, but all
* mounts show the same contents.
*
* 2) Standard POSIX message queue API. The syscalls do not use the VFS
* layer but operate directly on internal data structures; this allows
* the user to use the IPC facility without having to mount the mqueue
* file system.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/limits.h>
#include <sys/buf.h>
#include <sys/capability.h>
#include <sys/dirent.h>
#include <sys/event.h>
#include <sys/eventhandler.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/mqueue.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/posix4.h>
#include <sys/poll.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sysproto.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <sys/unistd.h>
#include <sys/vnode.h>
#include <machine/atomic.h>
FEATURE(p1003_1b_mqueue, "POSIX P1003.1B message queues support");
/*
* Limits and constants
*/
#define MQFS_NAMELEN NAME_MAX
#define MQFS_DELEN (8 + MQFS_NAMELEN)
/* node types */
typedef enum {
mqfstype_none = 0,
mqfstype_root,
mqfstype_dir,
mqfstype_this,
mqfstype_parent,
mqfstype_file,
mqfstype_symlink,
} mqfs_type_t;
struct mqfs_node;
/*
* mqfs_info: describes a mqfs instance
*/
struct mqfs_info {
struct sx mi_lock;
struct mqfs_node *mi_root;
struct unrhdr *mi_unrhdr;
};
struct mqfs_vdata {
LIST_ENTRY(mqfs_vdata) mv_link;
struct mqfs_node *mv_node;
struct vnode *mv_vnode;
struct task mv_task;
};
/*
* mqfs_node: describes a node (file or directory) within a mqfs
*/
struct mqfs_node {
char mn_name[MQFS_NAMELEN+1];
struct mqfs_info *mn_info;
struct mqfs_node *mn_parent;
LIST_HEAD(,mqfs_node) mn_children;
LIST_ENTRY(mqfs_node) mn_sibling;
LIST_HEAD(,mqfs_vdata) mn_vnodes;
int mn_refcount;
mqfs_type_t mn_type;
int mn_deleted;
uint32_t mn_fileno;
void *mn_data;
struct timespec mn_birth;
struct timespec mn_ctime;
struct timespec mn_atime;
struct timespec mn_mtime;
uid_t mn_uid;
gid_t mn_gid;
int mn_mode;
};
#define VTON(vp) (((struct mqfs_vdata *)((vp)->v_data))->mv_node)
#define VTOMQ(vp) ((struct mqueue *)(VTON(vp)->mn_data))
#define VFSTOMQFS(m) ((struct mqfs_info *)((m)->mnt_data))
#define FPTOMQ(fp) ((struct mqueue *)(((struct mqfs_node *) \
(fp)->f_data)->mn_data))
TAILQ_HEAD(msgq, mqueue_msg);
struct mqueue;
struct mqueue_notifier {
LIST_ENTRY(mqueue_notifier) nt_link;
struct sigevent nt_sigev;
ksiginfo_t nt_ksi;
struct proc *nt_proc;
};
struct mqueue {
struct mtx mq_mutex;
int mq_flags;
long mq_maxmsg;
long mq_msgsize;
long mq_curmsgs;
long mq_totalbytes;
struct msgq mq_msgq;
int mq_receivers;
int mq_senders;
struct selinfo mq_rsel;
struct selinfo mq_wsel;
struct mqueue_notifier *mq_notifier;
};
#define MQ_RSEL 0x01
#define MQ_WSEL 0x02
struct mqueue_msg {
TAILQ_ENTRY(mqueue_msg) msg_link;
unsigned int msg_prio;
unsigned int msg_size;
/* following real data... */
};
SYSCTL_NODE(_kern, OID_AUTO, mqueue, CTLFLAG_RW, 0,
"POSIX real time message queue");
static int default_maxmsg = 10;
static int default_msgsize = 1024;
static int maxmsg = 100;
SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsg, CTLFLAG_RW,
&maxmsg, 0, "Default maximum messages in queue");
static int maxmsgsize = 16384;
SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsgsize, CTLFLAG_RW,
&maxmsgsize, 0, "Default maximum message size");
static int maxmq = 100;
SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmq, CTLFLAG_RW,
&maxmq, 0, "maximum message queues");
static int curmq = 0;
SYSCTL_INT(_kern_mqueue, OID_AUTO, curmq, CTLFLAG_RW,
&curmq, 0, "current message queue number");
static int unloadable = 0;
static MALLOC_DEFINE(M_MQUEUEDATA, "mqdata", "mqueue data");
static eventhandler_tag exit_tag;
/* Only one instance per-system */
static struct mqfs_info mqfs_data;
static uma_zone_t mqnode_zone;
static uma_zone_t mqueue_zone;
static uma_zone_t mvdata_zone;
static uma_zone_t mqnoti_zone;
static struct vop_vector mqfs_vnodeops;
static struct fileops mqueueops;
/*
* Directory structure construction and manipulation
*/
#ifdef notyet
static struct mqfs_node *mqfs_create_dir(struct mqfs_node *parent,
const char *name, int namelen, struct ucred *cred, int mode);
static struct mqfs_node *mqfs_create_link(struct mqfs_node *parent,
const char *name, int namelen, struct ucred *cred, int mode);
#endif
static struct mqfs_node *mqfs_create_file(struct mqfs_node *parent,
const char *name, int namelen, struct ucred *cred, int mode);
static int mqfs_destroy(struct mqfs_node *mn);
static void mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn);
static void mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn);
static int mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn);
/*
* Message queue construction and manipulation
*/
static struct mqueue *mqueue_alloc(const struct mq_attr *attr);
static void mqueue_free(struct mqueue *mq);
static int mqueue_send(struct mqueue *mq, const char *msg_ptr,
size_t msg_len, unsigned msg_prio, int waitok,
const struct timespec *abs_timeout);
static int mqueue_receive(struct mqueue *mq, char *msg_ptr,
size_t msg_len, unsigned *msg_prio, int waitok,
const struct timespec *abs_timeout);
static int _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg,
int timo);
static int _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg,
int timo);
static void mqueue_send_notification(struct mqueue *mq);
static void mqueue_fdclose(struct thread *td, int fd, struct file *fp);
static void mq_proc_exit(void *arg, struct proc *p);
/*
* kqueue filters
*/
static void filt_mqdetach(struct knote *kn);
static int filt_mqread(struct knote *kn, long hint);
static int filt_mqwrite(struct knote *kn, long hint);
struct filterops mq_rfiltops = {
.f_isfd = 1,
.f_detach = filt_mqdetach,
.f_event = filt_mqread,
};
struct filterops mq_wfiltops = {
.f_isfd = 1,
.f_detach = filt_mqdetach,
.f_event = filt_mqwrite,
};
/*
* Initialize fileno bitmap
*/
static void
mqfs_fileno_init(struct mqfs_info *mi)
{
struct unrhdr *up;
up = new_unrhdr(1, INT_MAX, NULL);
mi->mi_unrhdr = up;
}
/*
* Tear down fileno bitmap
*/
static void
mqfs_fileno_uninit(struct mqfs_info *mi)
{
struct unrhdr *up;
up = mi->mi_unrhdr;
mi->mi_unrhdr = NULL;
delete_unrhdr(up);
}
/*
* Allocate a file number
*/
static void
mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn)
{
/* make sure our parent has a file number */
if (mn->mn_parent && !mn->mn_parent->mn_fileno)
mqfs_fileno_alloc(mi, mn->mn_parent);
switch (mn->mn_type) {
case mqfstype_root:
case mqfstype_dir:
case mqfstype_file:
case mqfstype_symlink:
mn->mn_fileno = alloc_unr(mi->mi_unrhdr);
break;
case mqfstype_this:
KASSERT(mn->mn_parent != NULL,
("mqfstype_this node has no parent"));
mn->mn_fileno = mn->mn_parent->mn_fileno;
break;
case mqfstype_parent:
KASSERT(mn->mn_parent != NULL,
("mqfstype_parent node has no parent"));
if (mn->mn_parent == mi->mi_root) {
mn->mn_fileno = mn->mn_parent->mn_fileno;
break;
}
KASSERT(mn->mn_parent->mn_parent != NULL,
("mqfstype_parent node has no grandparent"));
mn->mn_fileno = mn->mn_parent->mn_parent->mn_fileno;
break;
default:
KASSERT(0,
("mqfs_fileno_alloc() called for unknown type node: %d",
mn->mn_type));
break;
}
}
/*
* Release a file number
*/
static void
mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn)
{
switch (mn->mn_type) {
case mqfstype_root:
case mqfstype_dir:
case mqfstype_file:
case mqfstype_symlink:
free_unr(mi->mi_unrhdr, mn->mn_fileno);
break;
case mqfstype_this:
case mqfstype_parent:
/* ignore these, as they don't "own" their file number */
break;
default:
KASSERT(0,
("mqfs_fileno_free() called for unknown type node: %d",
mn->mn_type));
break;
}
}
static __inline struct mqfs_node *
mqnode_alloc(void)
{
return uma_zalloc(mqnode_zone, M_WAITOK | M_ZERO);
}
static __inline void
mqnode_free(struct mqfs_node *node)
{
uma_zfree(mqnode_zone, node);
}
static __inline void
mqnode_addref(struct mqfs_node *node)
{
atomic_fetchadd_int(&node->mn_refcount, 1);
}
static __inline void
mqnode_release(struct mqfs_node *node)
{
struct mqfs_info *mqfs;
int old, exp;
mqfs = node->mn_info;
old = atomic_fetchadd_int(&node->mn_refcount, -1);
if (node->mn_type == mqfstype_dir ||
node->mn_type == mqfstype_root)
exp = 3; /* include . and .. */
else
exp = 1;
if (old == exp) {
int locked = sx_xlocked(&mqfs->mi_lock);
if (!locked)
sx_xlock(&mqfs->mi_lock);
mqfs_destroy(node);
if (!locked)
sx_xunlock(&mqfs->mi_lock);
}
}
/*
* Add a node to a directory
*/
static int
mqfs_add_node(struct mqfs_node *parent, struct mqfs_node *node)
{
KASSERT(parent != NULL, ("%s(): parent is NULL", __func__));
KASSERT(parent->mn_info != NULL,
("%s(): parent has no mn_info", __func__));
KASSERT(parent->mn_type == mqfstype_dir ||
parent->mn_type == mqfstype_root,
("%s(): parent is not a directory", __func__));
node->mn_info = parent->mn_info;
node->mn_parent = parent;
LIST_INIT(&node->mn_children);
LIST_INIT(&node->mn_vnodes);
LIST_INSERT_HEAD(&parent->mn_children, node, mn_sibling);
mqnode_addref(parent);
return (0);
}
static struct mqfs_node *
mqfs_create_node(const char *name, int namelen, struct ucred *cred, int mode,
int nodetype)
{
struct mqfs_node *node;
node = mqnode_alloc();
strncpy(node->mn_name, name, namelen);
node->mn_type = nodetype;
node->mn_refcount = 1;
vfs_timestamp(&node->mn_birth);
node->mn_ctime = node->mn_atime = node->mn_mtime
= node->mn_birth;
node->mn_uid = cred->cr_uid;
node->mn_gid = cred->cr_gid;
node->mn_mode = mode;
return (node);
}
/*
* Create a file
*/
static struct mqfs_node *
mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen,
struct ucred *cred, int mode)
{
struct mqfs_node *node;
node = mqfs_create_node(name, namelen, cred, mode, mqfstype_file);
if (mqfs_add_node(parent, node) != 0) {
mqnode_free(node);
return (NULL);
}
return (node);
}
/*
* Add . and .. to a directory
*/
static int
mqfs_fixup_dir(struct mqfs_node *parent)
{
struct mqfs_node *dir;
dir = mqnode_alloc();
dir->mn_name[0] = '.';
dir->mn_type = mqfstype_this;
dir->mn_refcount = 1;
if (mqfs_add_node(parent, dir) != 0) {
mqnode_free(dir);
return (-1);
}
dir = mqnode_alloc();
dir->mn_name[0] = dir->mn_name[1] = '.';
dir->mn_type = mqfstype_parent;
dir->mn_refcount = 1;
if (mqfs_add_node(parent, dir) != 0) {
mqnode_free(dir);
return (-1);
}
return (0);
}
#ifdef notyet
/*
* Create a directory
*/
static struct mqfs_node *
mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen,
struct ucred *cred, int mode)
{
struct mqfs_node *node;
node = mqfs_create_node(name, namelen, cred, mode, mqfstype_dir);
if (mqfs_add_node(parent, node) != 0) {
mqnode_free(node);
return (NULL);
}
if (mqfs_fixup_dir(node) != 0) {
mqfs_destroy(node);
return (NULL);
}
return (node);
}
/*
* Create a symlink
*/
static struct mqfs_node *
mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen,
struct ucred *cred, int mode)
{
struct mqfs_node *node;
node = mqfs_create_node(name, namelen, cred, mode, mqfstype_symlink);
if (mqfs_add_node(parent, node) != 0) {
mqnode_free(node);
return (NULL);
}
return (node);
}
#endif
/*
* Destroy a node or a tree of nodes
*/
static int
mqfs_destroy(struct mqfs_node *node)
{
struct mqfs_node *parent;
KASSERT(node != NULL,
("%s(): node is NULL", __func__));
KASSERT(node->mn_info != NULL,
("%s(): node has no mn_info", __func__));
/* destroy children */
if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root)
while (! LIST_EMPTY(&node->mn_children))
mqfs_destroy(LIST_FIRST(&node->mn_children));
/* unlink from parent */
if ((parent = node->mn_parent) != NULL) {
KASSERT(parent->mn_info == node->mn_info,
("%s(): parent has different mn_info", __func__));
LIST_REMOVE(node, mn_sibling);
}
if (node->mn_fileno != 0)
mqfs_fileno_free(node->mn_info, node);
if (node->mn_data != NULL)
mqueue_free(node->mn_data);
mqnode_free(node);
return (0);
}
/*
* Mount a mqfs instance
*/
static int
mqfs_mount(struct mount *mp)
{
struct statfs *sbp;
if (mp->mnt_flag & MNT_UPDATE)
return (EOPNOTSUPP);
mp->mnt_data = &mqfs_data;
MNT_ILOCK(mp);
mp->mnt_flag |= MNT_LOCAL;
mp->mnt_kern_flag |= MNTK_MPSAFE;
MNT_IUNLOCK(mp);
vfs_getnewfsid(mp);
sbp = &mp->mnt_stat;
vfs_mountedfrom(mp, "mqueue");
sbp->f_bsize = PAGE_SIZE;
sbp->f_iosize = PAGE_SIZE;
sbp->f_blocks = 1;
sbp->f_bfree = 0;
sbp->f_bavail = 0;
sbp->f_files = 1;
sbp->f_ffree = 0;
return (0);
}
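/*
 * mqueuefs is normally mounted with "mount -t mqueuefs null /mnt/mqueue".
 * A minimal userland sketch using nmount(2); the mount point and helper
 * name are illustrative:
 */
#if 0
#include <sys/param.h>
#include <sys/mount.h>
#include <sys/uio.h>
#include <string.h>

static int
mount_mqueuefs(const char *where)
{
	struct iovec iov[6];

	iov[0].iov_base = __DECONST(char *, "fstype");
	iov[0].iov_len = sizeof("fstype");
	iov[1].iov_base = __DECONST(char *, "mqueuefs");
	iov[1].iov_len = sizeof("mqueuefs");
	iov[2].iov_base = __DECONST(char *, "fspath");
	iov[2].iov_len = sizeof("fspath");
	iov[3].iov_base = __DECONST(char *, where);
	iov[3].iov_len = strlen(where) + 1;
	iov[4].iov_base = __DECONST(char *, "from");
	iov[4].iov_len = sizeof("from");
	iov[5].iov_base = __DECONST(char *, "mqueue");
	iov[5].iov_len = sizeof("mqueue");
	return (nmount(iov, 6, 0));
}
#endif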
/*
* Unmount a mqfs instance
*/
static int
mqfs_unmount(struct mount *mp, int mntflags)
{
int error;
error = vflush(mp, 0, (mntflags & MNT_FORCE) ? FORCECLOSE : 0,
curthread);
return (error);
}
/*
* Return a root vnode
*/
static int
mqfs_root(struct mount *mp, int flags, struct vnode **vpp)
{
struct mqfs_info *mqfs;
int ret;
mqfs = VFSTOMQFS(mp);
ret = mqfs_allocv(mp, vpp, mqfs->mi_root);
return (ret);
}
/*
* Return filesystem stats
*/
static int
mqfs_statfs(struct mount *mp, struct statfs *sbp)
{
/* XXX update statistics */
return (0);
}
/*
* Initialize a mqfs instance
*/
static int
mqfs_init(struct vfsconf *vfc)
{
struct mqfs_node *root;
struct mqfs_info *mi;
mqnode_zone = uma_zcreate("mqnode", sizeof(struct mqfs_node),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
mqueue_zone = uma_zcreate("mqueue", sizeof(struct mqueue),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
mvdata_zone = uma_zcreate("mvdata",
sizeof(struct mqfs_vdata), NULL, NULL, NULL,
NULL, UMA_ALIGN_PTR, 0);
mqnoti_zone = uma_zcreate("mqnotifier", sizeof(struct mqueue_notifier),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
mi = &mqfs_data;
sx_init(&mi->mi_lock, "mqfs lock");
/* set up the root directory */
root = mqfs_create_node("/", 1, curthread->td_ucred, 01777,
mqfstype_root);
root->mn_info = mi;
LIST_INIT(&root->mn_children);
LIST_INIT(&root->mn_vnodes);
mi->mi_root = root;
mqfs_fileno_init(mi);
mqfs_fileno_alloc(mi, root);
mqfs_fixup_dir(root);
exit_tag = EVENTHANDLER_REGISTER(process_exit, mq_proc_exit, NULL,
EVENTHANDLER_PRI_ANY);
mq_fdclose = mqueue_fdclose;
p31b_setcfg(CTL_P1003_1B_MESSAGE_PASSING, _POSIX_MESSAGE_PASSING);
return (0);
}
/*
* Destroy a mqfs instance
*/
static int
mqfs_uninit(struct vfsconf *vfc)
{
struct mqfs_info *mi;
if (!unloadable)
return (EOPNOTSUPP);
EVENTHANDLER_DEREGISTER(process_exit, exit_tag);
mi = &mqfs_data;
mqfs_destroy(mi->mi_root);
mi->mi_root = NULL;
mqfs_fileno_uninit(mi);
sx_destroy(&mi->mi_lock);
uma_zdestroy(mqnode_zone);
uma_zdestroy(mqueue_zone);
uma_zdestroy(mvdata_zone);
uma_zdestroy(mqnoti_zone);
return (0);
}
/*
* task routine
*/
static void
do_recycle(void *context, int pending __unused)
{
struct vnode *vp = (struct vnode *)context;
vrecycle(vp, curthread);
vdrop(vp);
}
/*
* Allocate a vnode
*/
static int
mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn)
{
struct mqfs_vdata *vd;
struct mqfs_info *mqfs;
struct vnode *newvpp;
int error;
mqfs = pn->mn_info;
*vpp = NULL;
sx_xlock(&mqfs->mi_lock);
LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) {
if (vd->mv_vnode->v_mount == mp) {
vhold(vd->mv_vnode);
break;
}
}
if (vd != NULL) {
found:
*vpp = vd->mv_vnode;
sx_xunlock(&mqfs->mi_lock);
error = vget(*vpp, LK_RETRY | LK_EXCLUSIVE, curthread);
vdrop(*vpp);
return (error);
}
sx_xunlock(&mqfs->mi_lock);
error = getnewvnode("mqueue", mp, &mqfs_vnodeops, &newvpp);
if (error)
return (error);
vn_lock(newvpp, LK_EXCLUSIVE | LK_RETRY);
error = insmntque(newvpp, mp);
if (error != 0)
return (error);
sx_xlock(&mqfs->mi_lock);
/*
* Check if it has already been allocated
* while we were blocked.
*/
LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) {
if (vd->mv_vnode->v_mount == mp) {
vhold(vd->mv_vnode);
sx_xunlock(&mqfs->mi_lock);
vgone(newvpp);
vput(newvpp);
goto found;
}
}
*vpp = newvpp;
vd = uma_zalloc(mvdata_zone, M_WAITOK);
(*vpp)->v_data = vd;
vd->mv_vnode = *vpp;
vd->mv_node = pn;
TASK_INIT(&vd->mv_task, 0, do_recycle, *vpp);
LIST_INSERT_HEAD(&pn->mn_vnodes, vd, mv_link);
mqnode_addref(pn);
switch (pn->mn_type) {
case mqfstype_root:
(*vpp)->v_vflag = VV_ROOT;
/* fall through */
case mqfstype_dir:
case mqfstype_this:
case mqfstype_parent:
(*vpp)->v_type = VDIR;
break;
case mqfstype_file:
(*vpp)->v_type = VREG;
break;
case mqfstype_symlink:
(*vpp)->v_type = VLNK;
break;
case mqfstype_none:
KASSERT(0, ("mqfs_allocf called for null node\n"));
default:
panic("%s has unexpected type: %d", pn->mn_name, pn->mn_type);
}
sx_xunlock(&mqfs->mi_lock);
return (0);
}
/*
* Search a directory entry
*/
static struct mqfs_node *
mqfs_search(struct mqfs_node *pd, const char *name, int len)
{
struct mqfs_node *pn;
sx_assert(&pd->mn_info->mi_lock, SX_LOCKED);
LIST_FOREACH(pn, &pd->mn_children, mn_sibling) {
if (strncmp(pn->mn_name, name, len) == 0 &&
pn->mn_name[len] == '\0')
return (pn);
}
return (NULL);
}
/*
* Look up a file or directory.
*/
static int
mqfs_lookupx(struct vop_cachedlookup_args *ap)
{
struct componentname *cnp;
struct vnode *dvp, **vpp;
struct mqfs_node *pd;
struct mqfs_node *pn;
struct mqfs_info *mqfs;
int nameiop, flags, error, namelen;
char *pname;
struct thread *td;
cnp = ap->a_cnp;
vpp = ap->a_vpp;
dvp = ap->a_dvp;
pname = cnp->cn_nameptr;
namelen = cnp->cn_namelen;
td = cnp->cn_thread;
flags = cnp->cn_flags;
nameiop = cnp->cn_nameiop;
pd = VTON(dvp);
pn = NULL;
mqfs = pd->mn_info;
*vpp = NULLVP;
if (dvp->v_type != VDIR)
return (ENOTDIR);
error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, cnp->cn_thread);
if (error)
return (error);
/* shortcut: check if the name is too long */
if (cnp->cn_namelen >= MQFS_NAMELEN)
return (ENOENT);
/* self */
if (namelen == 1 && pname[0] == '.') {
if ((flags & ISLASTCN) && nameiop != LOOKUP)
return (EINVAL);
pn = pd;
*vpp = dvp;
VREF(dvp);
return (0);
}
/* parent */
if (cnp->cn_flags & ISDOTDOT) {
if (dvp->v_vflag & VV_ROOT)
return (EIO);
if ((flags & ISLASTCN) && nameiop != LOOKUP)
return (EINVAL);
VOP_UNLOCK(dvp, 0);
KASSERT(pd->mn_parent, ("non-root directory has no parent"));
pn = pd->mn_parent;
error = mqfs_allocv(dvp->v_mount, vpp, pn);
vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
return (error);
}
/* named node */
sx_xlock(&mqfs->mi_lock);
pn = mqfs_search(pd, pname, namelen);
if (pn != NULL)
mqnode_addref(pn);
sx_xunlock(&mqfs->mi_lock);
/* found */
if (pn != NULL) {
/* DELETE */
if (nameiop == DELETE && (flags & ISLASTCN)) {
error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td);
if (error) {
mqnode_release(pn);
return (error);
}
if (*vpp == dvp) {
VREF(dvp);
*vpp = dvp;
mqnode_release(pn);
return (0);
}
}
/* allocate vnode */
error = mqfs_allocv(dvp->v_mount, vpp, pn);
mqnode_release(pn);
if (error == 0 && cnp->cn_flags & MAKEENTRY)
cache_enter(dvp, *vpp, cnp);
return (error);
}
/* not found */
/* will we create a new entry in the directory? */
if ((nameiop == CREATE || nameiop == RENAME) && (flags & LOCKPARENT)
&& (flags & ISLASTCN)) {
error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td);
if (error)
return (error);
cnp->cn_flags |= SAVENAME;
return (EJUSTRETURN);
}
return (ENOENT);
}
#if 0
struct vop_lookup_args {
struct vop_generic_args a_gen;
struct vnode *a_dvp;
struct vnode **a_vpp;
struct componentname *a_cnp;
};
#endif
/*
* vnode lookup operation
*/
static int
mqfs_lookup(struct vop_cachedlookup_args *ap)
{
int rc;
rc = mqfs_lookupx(ap);
return (rc);
}
#if 0
struct vop_create_args {
struct vnode *a_dvp;
struct vnode **a_vpp;
struct componentname *a_cnp;
struct vattr *a_vap;
};
#endif
/*
* vnode creation operation
*/
static int
mqfs_create(struct vop_create_args *ap)
{
struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
struct componentname *cnp = ap->a_cnp;
struct mqfs_node *pd;
struct mqfs_node *pn;
struct mqueue *mq;
int error;
pd = VTON(ap->a_dvp);
if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir)
return (ENOTDIR);
mq = mqueue_alloc(NULL);
if (mq == NULL)
return (EAGAIN);
sx_xlock(&mqfs->mi_lock);
if ((cnp->cn_flags & HASBUF) == 0)
panic("%s: no name", __func__);
pn = mqfs_create_file(pd, cnp->cn_nameptr, cnp->cn_namelen,
cnp->cn_cred, ap->a_vap->va_mode);
if (pn == NULL) {
sx_xunlock(&mqfs->mi_lock);
error = ENOSPC;
} else {
mqnode_addref(pn);
sx_xunlock(&mqfs->mi_lock);
error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn);
mqnode_release(pn);
if (error)
mqfs_destroy(pn);
else
pn->mn_data = mq;
}
if (error)
mqueue_free(mq);
return (error);
}
/*
* Remove an entry
*/
static int
do_unlink(struct mqfs_node *pn, struct ucred *ucred)
{
struct mqfs_node *parent;
struct mqfs_vdata *vd;
int error = 0;
sx_assert(&pn->mn_info->mi_lock, SX_LOCKED);
if (ucred->cr_uid != pn->mn_uid &&
(error = priv_check_cred(ucred, PRIV_MQ_ADMIN, 0)) != 0)
error = EACCES;
else if (!pn->mn_deleted) {
parent = pn->mn_parent;
pn->mn_parent = NULL;
pn->mn_deleted = 1;
LIST_REMOVE(pn, mn_sibling);
LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) {
cache_purge(vd->mv_vnode);
vhold(vd->mv_vnode);
taskqueue_enqueue(taskqueue_thread, &vd->mv_task);
}
mqnode_release(pn);
mqnode_release(parent);
} else
error = ENOENT;
return (error);
}
#if 0
struct vop_remove_args {
struct vnode *a_dvp;
struct vnode *a_vp;
struct componentname *a_cnp;
};
#endif
/*
* vnode removal operation
*/
static int
mqfs_remove(struct vop_remove_args *ap)
{
struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
struct mqfs_node *pn;
int error;
if (ap->a_vp->v_type == VDIR)
return (EPERM);
pn = VTON(ap->a_vp);
sx_xlock(&mqfs->mi_lock);
error = do_unlink(pn, ap->a_cnp->cn_cred);
sx_xunlock(&mqfs->mi_lock);
return (error);
}
#if 0
struct vop_inactive_args {
struct vnode *a_vp;
struct thread *a_td;
};
#endif
static int
mqfs_inactive(struct vop_inactive_args *ap)
{
struct mqfs_node *pn = VTON(ap->a_vp);
if (pn->mn_deleted)
vrecycle(ap->a_vp, ap->a_td);
return (0);
}
#if 0
struct vop_reclaim_args {
struct vop_generic_args a_gen;
struct vnode *a_vp;
struct thread *a_td;
};
#endif
static int
mqfs_reclaim(struct vop_reclaim_args *ap)
{
struct mqfs_info *mqfs = VFSTOMQFS(ap->a_vp->v_mount);
struct vnode *vp = ap->a_vp;
struct mqfs_node *pn;
struct mqfs_vdata *vd;
vd = vp->v_data;
pn = vd->mv_node;
sx_xlock(&mqfs->mi_lock);
vp->v_data = NULL;
LIST_REMOVE(vd, mv_link);
uma_zfree(mvdata_zone, vd);
mqnode_release(pn);
sx_xunlock(&mqfs->mi_lock);
return (0);
}
#if 0
struct vop_open_args {
struct vop_generic_args a_gen;
struct vnode *a_vp;
int a_mode;
struct ucred *a_cred;
struct thread *a_td;
struct file *a_fp;
};
#endif
static int
mqfs_open(struct vop_open_args *ap)
{
return (0);
}
#if 0
struct vop_close_args {
struct vop_generic_args a_gen;
struct vnode *a_vp;
int a_fflag;
struct ucred *a_cred;
struct thread *a_td;
};
#endif
static int
mqfs_close(struct vop_close_args *ap)
{
return (0);
}
#if 0
struct vop_access_args {
struct vop_generic_args a_gen;
struct vnode *a_vp;
accmode_t a_accmode;
struct ucred *a_cred;
struct thread *a_td;
};
#endif
/*
* Verify permissions
*/
static int
mqfs_access(struct vop_access_args *ap)
{
struct vnode *vp = ap->a_vp;
struct vattr vattr;
int error;
error = VOP_GETATTR(vp, &vattr, ap->a_cred);
if (error)
return (error);
error = vaccess(vp->v_type, vattr.va_mode, vattr.va_uid,
vattr.va_gid, ap->a_accmode, ap->a_cred, NULL);
return (error);
}
#if 0
struct vop_getattr_args {
struct vop_generic_args a_gen;
struct vnode *a_vp;
struct vattr *a_vap;
struct ucred *a_cred;
};
#endif
/*
* Get file attributes
*/
static int
mqfs_getattr(struct vop_getattr_args *ap)
{
struct vnode *vp = ap->a_vp;
struct mqfs_node *pn = VTON(vp);
struct vattr *vap = ap->a_vap;
int error = 0;
vap->va_type = vp->v_type;
vap->va_mode = pn->mn_mode;
vap->va_nlink = 1;
vap->va_uid = pn->mn_uid;
vap->va_gid = pn->mn_gid;
vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
vap->va_fileid = pn->mn_fileno;
vap->va_size = 0;
vap->va_blocksize = PAGE_SIZE;
vap->va_bytes = vap->va_size = 0;
vap->va_atime = pn->mn_atime;
vap->va_mtime = pn->mn_mtime;
vap->va_ctime = pn->mn_ctime;
vap->va_birthtime = pn->mn_birth;
vap->va_gen = 0;
vap->va_flags = 0;
vap->va_rdev = NODEV;
vap->va_bytes = 0;
vap->va_filerev = 0;
return (error);
}
#if 0
struct vop_setattr_args {
struct vop_generic_args a_gen;
struct vnode *a_vp;
struct vattr *a_vap;
struct ucred *a_cred;
};
#endif
/*
* Set attributes
*/
static int
mqfs_setattr(struct vop_setattr_args *ap)
{
struct mqfs_node *pn;
struct vattr *vap;
struct vnode *vp;
struct thread *td;
int c, error;
uid_t uid;
gid_t gid;
td = curthread;
vap = ap->a_vap;
vp = ap->a_vp;
if ((vap->va_type != VNON) ||
(vap->va_nlink != VNOVAL) ||
(vap->va_fsid != VNOVAL) ||
(vap->va_fileid != VNOVAL) ||
(vap->va_blocksize != VNOVAL) ||
(vap->va_flags != VNOVAL && vap->va_flags != 0) ||
(vap->va_rdev != VNOVAL) ||
((int)vap->va_bytes != VNOVAL) ||
(vap->va_gen != VNOVAL)) {
return (EINVAL);
}
pn = VTON(vp);
error = c = 0;
if (vap->va_uid == (uid_t)VNOVAL)
uid = pn->mn_uid;
else
uid = vap->va_uid;
if (vap->va_gid == (gid_t)VNOVAL)
gid = pn->mn_gid;
else
gid = vap->va_gid;
if (uid != pn->mn_uid || gid != pn->mn_gid) {
/*
* To modify the ownership of a file, the caller must possess
* VADMIN for that file.
*/
if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td)))
return (error);
/*
* XXXRW: Why is there a privilege check here: shouldn't the
* check in VOP_ACCESS() be enough? Also, are the group bits
* below definitely right?
*/
if (((ap->a_cred->cr_uid != pn->mn_uid) || uid != pn->mn_uid ||
(gid != pn->mn_gid && !groupmember(gid, ap->a_cred))) &&
(error = priv_check(td, PRIV_MQ_ADMIN)) != 0)
return (error);
pn->mn_uid = uid;
pn->mn_gid = gid;
c = 1;
}
if (vap->va_mode != (mode_t)VNOVAL) {
if ((ap->a_cred->cr_uid != pn->mn_uid) &&
(error = priv_check(td, PRIV_MQ_ADMIN)))
return (error);
pn->mn_mode = vap->va_mode;
c = 1;
}
if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) {
/* See the comment in ufs_vnops::ufs_setattr(). */
if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td)) &&
((vap->va_vaflags & VA_UTIMES_NULL) == 0 ||
(error = VOP_ACCESS(vp, VWRITE, ap->a_cred, td))))
return (error);
if (vap->va_atime.tv_sec != VNOVAL) {
pn->mn_atime = vap->va_atime;
}
if (vap->va_mtime.tv_sec != VNOVAL) {
pn->mn_mtime = vap->va_mtime;
}
c = 1;
}
if (c) {
vfs_timestamp(&pn->mn_ctime);
}
return (0);
}
#if 0
struct vop_read_args {
struct vop_generic_args a_gen;
struct vnode *a_vp;
struct uio *a_uio;
int a_ioflag;
struct ucred *a_cred;
};
#endif
/*
* Read from a file
*/
static int
mqfs_read(struct vop_read_args *ap)
{
char buf[80];
struct vnode *vp = ap->a_vp;
struct uio *uio = ap->a_uio;
struct mqfs_node *pn;
struct mqueue *mq;
int len, error;
if (vp->v_type != VREG)
return (EINVAL);
pn = VTON(vp);
mq = VTOMQ(vp);
snprintf(buf, sizeof(buf),
"QSIZE:%-10ld MAXMSG:%-10ld CURMSG:%-10ld MSGSIZE:%-10ld\n",
mq->mq_totalbytes,
mq->mq_maxmsg,
mq->mq_curmsgs,
mq->mq_msgsize);
buf[sizeof(buf)-1] = '\0';
len = strlen(buf);
error = uiomove_frombuf(buf, len, uio);
return (error);
}
#if 0
struct vop_readdir_args {
struct vop_generic_args a_gen;
struct vnode *a_vp;
struct uio *a_uio;
struct ucred *a_cred;
int *a_eofflag;
int *a_ncookies;
u_long **a_cookies;
};
#endif
/*
* Return directory entries.
*/
static int
mqfs_readdir(struct vop_readdir_args *ap)
{
struct vnode *vp;
struct mqfs_info *mi;
struct mqfs_node *pd;
struct mqfs_node *pn;
struct dirent entry;
struct uio *uio;
int *tmp_ncookies = NULL;
off_t offset;
int error, i;
vp = ap->a_vp;
mi = VFSTOMQFS(vp->v_mount);
pd = VTON(vp);
uio = ap->a_uio;
if (vp->v_type != VDIR)
return (ENOTDIR);
if (uio->uio_offset < 0)
return (EINVAL);
if (ap->a_ncookies != NULL) {
tmp_ncookies = ap->a_ncookies;
*ap->a_ncookies = 0;
ap->a_ncookies = NULL;
}
error = 0;
offset = 0;
sx_xlock(&mi->mi_lock);
LIST_FOREACH(pn, &pd->mn_children, mn_sibling) {
entry.d_reclen = sizeof(entry);
if (!pn->mn_fileno)
mqfs_fileno_alloc(mi, pn);
entry.d_fileno = pn->mn_fileno;
for (i = 0; i < MQFS_NAMELEN - 1 && pn->mn_name[i] != '\0'; ++i)
entry.d_name[i] = pn->mn_name[i];
entry.d_name[i] = 0;
entry.d_namlen = i;
switch (pn->mn_type) {
case mqfstype_root:
case mqfstype_dir:
case mqfstype_this:
case mqfstype_parent:
entry.d_type = DT_DIR;
break;
case mqfstype_file:
entry.d_type = DT_REG;
break;
case mqfstype_symlink:
entry.d_type = DT_LNK;
break;
default:
panic("%s has unexpected node type: %d", pn->mn_name,
pn->mn_type);
}
if (entry.d_reclen > uio->uio_resid)
break;
if (offset >= uio->uio_offset) {
error = vfs_read_dirent(ap, &entry, offset);
if (error)
break;
}
offset += entry.d_reclen;
}
sx_xunlock(&mi->mi_lock);
uio->uio_offset = offset;
if (tmp_ncookies != NULL)
ap->a_ncookies = tmp_ncookies;
return (error);
}
#ifdef notyet
#if 0
struct vop_mkdir_args {
struct vnode *a_dvp;
struct vnode **a_vpp;
struct componentname *a_cnp;
struct vattr *a_vap;
};
#endif
/*
* Create a directory.
*/
static int
mqfs_mkdir(struct vop_mkdir_args *ap)
{
struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
struct componentname *cnp = ap->a_cnp;
struct mqfs_node *pd = VTON(ap->a_dvp);
struct mqfs_node *pn;
int error;
if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir)
return (ENOTDIR);
sx_xlock(&mqfs->mi_lock);
if ((cnp->cn_flags & HASBUF) == 0)
panic("%s: no name", __func__);
pn = mqfs_create_dir(pd, cnp->cn_nameptr, cnp->cn_namelen,
cnp->cn_cred, ap->a_vap->va_mode);
if (pn != NULL)
mqnode_addref(pn);
sx_xunlock(&mqfs->mi_lock);
if (pn == NULL) {
error = ENOSPC;
} else {
error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn);
mqnode_release(pn);
}
return (error);
}
#if 0
struct vop_rmdir_args {
struct vnode *a_dvp;
struct vnode *a_vp;
struct componentname *a_cnp;
};
#endif
/*
* Remove a directory.
*/
static int
mqfs_rmdir(struct vop_rmdir_args *ap)
{
struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
struct mqfs_node *pn = VTON(ap->a_vp);
struct mqfs_node *pt;
if (pn->mn_type != mqfstype_dir)
return (ENOTDIR);
sx_xlock(&mqfs->mi_lock);
if (pn->mn_deleted) {
sx_xunlock(&mqfs->mi_lock);
return (ENOENT);
}
pt = LIST_FIRST(&pn->mn_children);
pt = LIST_NEXT(pt, mn_sibling);
pt = LIST_NEXT(pt, mn_sibling);
if (pt != NULL) {
sx_xunlock(&mqfs->mi_lock);
return (ENOTEMPTY);
}
pt = pn->mn_parent;
pn->mn_parent = NULL;
pn->mn_deleted = 1;
LIST_REMOVE(pn, mn_sibling);
mqnode_release(pn);
mqnode_release(pt);
sx_xunlock(&mqfs->mi_lock);
cache_purge(ap->a_vp);
return (0);
}
#endif /* notyet */
/*
* Allocate a message queue
*/
static struct mqueue *
mqueue_alloc(const struct mq_attr *attr)
{
struct mqueue *mq;
if (curmq >= maxmq)
return (NULL);
mq = uma_zalloc(mqueue_zone, M_WAITOK | M_ZERO);
TAILQ_INIT(&mq->mq_msgq);
if (attr != NULL) {
mq->mq_maxmsg = attr->mq_maxmsg;
mq->mq_msgsize = attr->mq_msgsize;
} else {
mq->mq_maxmsg = default_maxmsg;
mq->mq_msgsize = default_msgsize;
}
mtx_init(&mq->mq_mutex, "mqueue lock", NULL, MTX_DEF);
knlist_init_mtx(&mq->mq_rsel.si_note, &mq->mq_mutex);
knlist_init_mtx(&mq->mq_wsel.si_note, &mq->mq_mutex);
atomic_add_int(&curmq, 1);
return (mq);
}
/*
* Destroy a message queue
*/
static void
mqueue_free(struct mqueue *mq)
{
struct mqueue_msg *msg;
while ((msg = TAILQ_FIRST(&mq->mq_msgq)) != NULL) {
TAILQ_REMOVE(&mq->mq_msgq, msg, msg_link);
free(msg, M_MQUEUEDATA);
}
mtx_destroy(&mq->mq_mutex);
seldrain(&mq->mq_rsel);
seldrain(&mq->mq_wsel);
knlist_destroy(&mq->mq_rsel.si_note);
knlist_destroy(&mq->mq_wsel.si_note);
uma_zfree(mqueue_zone, mq);
atomic_add_int(&curmq, -1);
}
/*
* Load a message from user space
*/
static struct mqueue_msg *
mqueue_loadmsg(const char *msg_ptr, size_t msg_size, int msg_prio)
{
struct mqueue_msg *msg;
size_t len;
int error;
len = sizeof(struct mqueue_msg) + msg_size;
msg = malloc(len, M_MQUEUEDATA, M_WAITOK);
error = copyin(msg_ptr, ((char *)msg) + sizeof(struct mqueue_msg),
msg_size);
if (error) {
free(msg, M_MQUEUEDATA);
msg = NULL;
} else {
msg->msg_size = msg_size;
msg->msg_prio = msg_prio;
}
return (msg);
}
/*
* Save a message to user space
*/
static int
mqueue_savemsg(struct mqueue_msg *msg, char *msg_ptr, int *msg_prio)
{
int error;
error = copyout(((char *)msg) + sizeof(*msg), msg_ptr,
msg->msg_size);
if (error == 0 && msg_prio != NULL)
error = copyout(&msg->msg_prio, msg_prio, sizeof(int));
return (error);
}
/*
* Free a message's memory
*/
static __inline void
mqueue_freemsg(struct mqueue_msg *msg)
{
free(msg, M_MQUEUEDATA);
}
/*
* Send a message.  If waitok is false, the thread will not block
* when the queue is full; otherwise the absolute timeout, if any,
* is checked.
*/
int
mqueue_send(struct mqueue *mq, const char *msg_ptr,
size_t msg_len, unsigned msg_prio, int waitok,
const struct timespec *abs_timeout)
{
struct mqueue_msg *msg;
struct timespec ts, ts2;
struct timeval tv;
int error;
if (msg_prio >= MQ_PRIO_MAX)
return (EINVAL);
if (msg_len > mq->mq_msgsize)
return (EMSGSIZE);
msg = mqueue_loadmsg(msg_ptr, msg_len, msg_prio);
if (msg == NULL)
return (EFAULT);
/* O_NONBLOCK case */
if (!waitok) {
error = _mqueue_send(mq, msg, -1);
if (error)
goto bad;
return (0);
}
/* we allow a null timeout (wait forever) */
if (abs_timeout == NULL) {
error = _mqueue_send(mq, msg, 0);
if (error)
goto bad;
return (0);
}
/* send it before checking time */
error = _mqueue_send(mq, msg, -1);
if (error == 0)
return (0);
if (error != EAGAIN)
goto bad;
if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) {
error = EINVAL;
goto bad;
}
for (;;) {
ts2 = *abs_timeout;
getnanotime(&ts);
timespecsub(&ts2, &ts);
if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) {
error = ETIMEDOUT;
break;
}
TIMESPEC_TO_TIMEVAL(&tv, &ts2);
error = _mqueue_send(mq, msg, tvtohz(&tv));
if (error != ETIMEDOUT)
break;
}
if (error == 0)
return (0);
bad:
mqueue_freemsg(msg);
return (error);
}
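/*
 * From userland this path is reached through mq_send(3)/mq_timedsend(3).
 * A minimal sketch of a bounded-wait send; note the timeout is an
 * absolute wall-clock time (compare getnanotime() above).  The helper
 * name and five-second deadline are illustrative:
 */
#if 0
#include <mqueue.h>
#include <time.h>

static int
send_with_deadline(mqd_t mqd, const char *buf, size_t len, unsigned prio)
{
	struct timespec abstime;

	clock_gettime(CLOCK_REALTIME, &abstime);
	abstime.tv_sec += 5;		/* give up after about five seconds */
	return (mq_timedsend(mqd, buf, len, prio, &abstime));
}
#endif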
/*
* Common routine to send a message
*/
static int
_mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo)
{
struct mqueue_msg *msg2;
int error = 0;
mtx_lock(&mq->mq_mutex);
while (mq->mq_curmsgs >= mq->mq_maxmsg && error == 0) {
if (timo < 0) {
mtx_unlock(&mq->mq_mutex);
return (EAGAIN);
}
mq->mq_senders++;
error = msleep(&mq->mq_senders, &mq->mq_mutex,
PCATCH, "mqsend", timo);
mq->mq_senders--;
if (error == EAGAIN)
error = ETIMEDOUT;
}
if (mq->mq_curmsgs >= mq->mq_maxmsg) {
mtx_unlock(&mq->mq_mutex);
return (error);
}
error = 0;
if (TAILQ_EMPTY(&mq->mq_msgq)) {
TAILQ_INSERT_HEAD(&mq->mq_msgq, msg, msg_link);
} else {
if (msg->msg_prio <= TAILQ_LAST(&mq->mq_msgq, msgq)->msg_prio) {
TAILQ_INSERT_TAIL(&mq->mq_msgq, msg, msg_link);
} else {
TAILQ_FOREACH(msg2, &mq->mq_msgq, msg_link) {
if (msg2->msg_prio < msg->msg_prio)
break;
}
TAILQ_INSERT_BEFORE(msg2, msg, msg_link);
}
}
mq->mq_curmsgs++;
mq->mq_totalbytes += msg->msg_size;
if (mq->mq_receivers)
wakeup_one(&mq->mq_receivers);
else if (mq->mq_notifier != NULL)
mqueue_send_notification(mq);
if (mq->mq_flags & MQ_RSEL) {
mq->mq_flags &= ~MQ_RSEL;
selwakeup(&mq->mq_rsel);
}
KNOTE_LOCKED(&mq->mq_rsel.si_note, 0);
mtx_unlock(&mq->mq_mutex);
return (0);
}
/*
* Send a realtime signal to the process which successfully
* registered itself via mq_notify().
*/
static void
mqueue_send_notification(struct mqueue *mq)
{
struct mqueue_notifier *nt;
struct thread *td;
struct proc *p;
int error;
mtx_assert(&mq->mq_mutex, MA_OWNED);
nt = mq->mq_notifier;
if (nt->nt_sigev.sigev_notify != SIGEV_NONE) {
p = nt->nt_proc;
error = sigev_findtd(p, &nt->nt_sigev, &td);
if (error) {
mq->mq_notifier = NULL;
return;
}
if (!KSI_ONQ(&nt->nt_ksi)) {
ksiginfo_set_sigev(&nt->nt_ksi, &nt->nt_sigev);
tdsendsignal(p, td, nt->nt_ksi.ksi_signo, &nt->nt_ksi);
}
PROC_UNLOCK(p);
}
mq->mq_notifier = NULL;
}
/*
* Receive a message.  If waitok is false, the thread will not block
* when the queue is empty; otherwise the absolute timeout, if any,
* is checked.
*/
int
mqueue_receive(struct mqueue *mq, char *msg_ptr,
size_t msg_len, unsigned *msg_prio, int waitok,
const struct timespec *abs_timeout)
{
struct mqueue_msg *msg;
struct timespec ts, ts2;
struct timeval tv;
int error;
if (msg_len < mq->mq_msgsize)
return (EMSGSIZE);
/* O_NONBLOCK case */
if (!waitok) {
error = _mqueue_recv(mq, &msg, -1);
if (error)
return (error);
goto received;
}
/* we allow a null timeout (wait forever). */
if (abs_timeout == NULL) {
error = _mqueue_recv(mq, &msg, 0);
if (error)
return (error);
goto received;
}
/* try to get a message before checking time */
error = _mqueue_recv(mq, &msg, -1);
if (error == 0)
goto received;
if (error != EAGAIN)
return (error);
if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) {
error = EINVAL;
return (error);
}
for (;;) {
ts2 = *abs_timeout;
getnanotime(&ts);
timespecsub(&ts2, &ts);
if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) {
error = ETIMEDOUT;
return (error);
}
TIMESPEC_TO_TIMEVAL(&tv, &ts2);
error = _mqueue_recv(mq, &msg, tvtohz(&tv));
if (error == 0)
break;
if (error != ETIMEDOUT)
return (error);
}
received:
error = mqueue_savemsg(msg, msg_ptr, msg_prio);
if (error == 0) {
curthread->td_retval[0] = msg->msg_size;
curthread->td_retval[1] = 0;
}
mqueue_freemsg(msg);
return (error);
}
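/*
 * With O_NONBLOCK set an empty queue fails with EAGAIN, and the buffer
 * must be at least mq_msgsize bytes or EMSGSIZE is returned.  A minimal
 * userland sketch (helper name is illustrative):
 */
#if 0
#include <mqueue.h>
#include <errno.h>

static ssize_t
try_receive(mqd_t mqd, char *buf, size_t buflen, unsigned *prio)
{
	ssize_t n;

	n = mq_receive(mqd, buf, buflen, prio);
	if (n == -1 && errno == EAGAIN)
		return (0);		/* nothing queued right now */
	return (n);
}
#endif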
/*
* Common routine to receive a message
*/
static int
_mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo)
{
int error = 0;
mtx_lock(&mq->mq_mutex);
while ((*msg = TAILQ_FIRST(&mq->mq_msgq)) == NULL && error == 0) {
if (timo < 0) {
mtx_unlock(&mq->mq_mutex);
return (EAGAIN);
}
mq->mq_receivers++;
error = msleep(&mq->mq_receivers, &mq->mq_mutex,
PCATCH, "mqrecv", timo);
mq->mq_receivers--;
if (error == EAGAIN)
error = ETIMEDOUT;
}
if (*msg != NULL) {
error = 0;
TAILQ_REMOVE(&mq->mq_msgq, *msg, msg_link);
mq->mq_curmsgs--;
mq->mq_totalbytes -= (*msg)->msg_size;
if (mq->mq_senders)
wakeup_one(&mq->mq_senders);
if (mq->mq_flags & MQ_WSEL) {
mq->mq_flags &= ~MQ_WSEL;
selwakeup(&mq->mq_wsel);
}
KNOTE_LOCKED(&mq->mq_wsel.si_note, 0);
}
if (mq->mq_notifier != NULL && mq->mq_receivers == 0 &&
!TAILQ_EMPTY(&mq->mq_msgq)) {
mqueue_send_notification(mq);
}
mtx_unlock(&mq->mq_mutex);
return (error);
}
static __inline struct mqueue_notifier *
notifier_alloc(void)
{
return (uma_zalloc(mqnoti_zone, M_WAITOK | M_ZERO));
}
static __inline void
notifier_free(struct mqueue_notifier *p)
{
uma_zfree(mqnoti_zone, p);
}
static struct mqueue_notifier *
notifier_search(struct proc *p, int fd)
{
struct mqueue_notifier *nt;
LIST_FOREACH(nt, &p->p_mqnotifier, nt_link) {
if (nt->nt_ksi.ksi_mqd == fd)
break;
}
return (nt);
}
static __inline void
notifier_insert(struct proc *p, struct mqueue_notifier *nt)
{
LIST_INSERT_HEAD(&p->p_mqnotifier, nt, nt_link);
}
static __inline void
notifier_delete(struct proc *p, struct mqueue_notifier *nt)
{
LIST_REMOVE(nt, nt_link);
notifier_free(nt);
}
static void
notifier_remove(struct proc *p, struct mqueue *mq, int fd)
{
struct mqueue_notifier *nt;
mtx_assert(&mq->mq_mutex, MA_OWNED);
PROC_LOCK(p);
nt = notifier_search(p, fd);
if (nt != NULL) {
if (mq->mq_notifier == nt)
mq->mq_notifier = NULL;
sigqueue_take(&nt->nt_ksi);
notifier_delete(p, nt);
}
PROC_UNLOCK(p);
}
static int
kern_kmq_open(struct thread *td, const char *upath, int flags, mode_t mode,
const struct mq_attr *attr)
{
char path[MQFS_NAMELEN + 1];
struct mqfs_node *pn;
struct filedesc *fdp;
struct file *fp;
struct mqueue *mq;
int fd, error, len, cmode;
fdp = td->td_proc->p_fd;
cmode = (((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT);
mq = NULL;
if ((flags & O_CREAT) != 0 && attr != NULL) {
if (attr->mq_maxmsg <= 0 || attr->mq_maxmsg > maxmsg)
return (EINVAL);
if (attr->mq_msgsize <= 0 || attr->mq_msgsize > maxmsgsize)
return (EINVAL);
}
error = copyinstr(upath, path, MQFS_NAMELEN + 1, NULL);
if (error)
return (error);
/*
* The first character of name must be a slash (/) character
* and the remaining characters of name cannot include any slash
* characters.
*/
len = strlen(path);
if (len < 2 || path[0] != '/' || index(path + 1, '/') != NULL)
return (EINVAL);
error = falloc(td, &fp, &fd, 0);
if (error)
return (error);
sx_xlock(&mqfs_data.mi_lock);
pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1);
if (pn == NULL) {
if (!(flags & O_CREAT)) {
error = ENOENT;
} else {
mq = mqueue_alloc(attr);
if (mq == NULL) {
error = ENFILE;
} else {
pn = mqfs_create_file(mqfs_data.mi_root,
path + 1, len - 1, td->td_ucred,
cmode);
if (pn == NULL) {
error = ENOSPC;
mqueue_free(mq);
}
}
}
if (error == 0) {
pn->mn_data = mq;
}
} else {
if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) {
error = EEXIST;
} else {
accmode_t accmode = 0;
if (flags & FREAD)
accmode |= VREAD;
if (flags & FWRITE)
accmode |= VWRITE;
error = vaccess(VREG, pn->mn_mode, pn->mn_uid,
pn->mn_gid, accmode, td->td_ucred, NULL);
}
}
if (error) {
sx_xunlock(&mqfs_data.mi_lock);
fdclose(fdp, fp, fd, td);
fdrop(fp, td);
return (error);
}
mqnode_addref(pn);
sx_xunlock(&mqfs_data.mi_lock);
finit(fp, flags & (FREAD | FWRITE | O_NONBLOCK), DTYPE_MQUEUE, pn,
&mqueueops);
FILEDESC_XLOCK(fdp);
if (fdp->fd_ofiles[fd] == fp)
fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
FILEDESC_XUNLOCK(fdp);
td->td_retval[0] = fd;
fdrop(fp, td);
return (0);
}
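/*
 * The userland entry point is mq_open(3).  A minimal sketch of creating
 * a queue within the limits enforced above; the name must begin with a
 * slash and contain no other slashes.  The queue name and sizes below
 * are illustrative:
 */
#if 0
#include <mqueue.h>
#include <fcntl.h>
#include <string.h>

static mqd_t
open_queue(void)
{
	struct mq_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.mq_maxmsg = 32;		/* <= kern.mqueue.maxmsg */
	attr.mq_msgsize = 1024;		/* <= kern.mqueue.maxmsgsize */
	return (mq_open("/myqueue", O_CREAT | O_RDWR, 0644, &attr));
}
#endif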
/*
* Syscall to open a message queue.
*/
int
-kmq_open(struct thread *td, struct kmq_open_args *uap)
+sys_kmq_open(struct thread *td, struct kmq_open_args *uap)
{
struct mq_attr attr;
int flags, error;
if ((uap->flags & O_ACCMODE) == O_ACCMODE)
return (EINVAL);
flags = FFLAGS(uap->flags);
if ((flags & O_CREAT) != 0 && uap->attr != NULL) {
error = copyin(uap->attr, &attr, sizeof(attr));
if (error)
return (error);
}
return (kern_kmq_open(td, uap->path, flags, uap->mode,
uap->attr != NULL ? &attr : NULL));
}
/*
* Syscall to unlink a message queue.
*/
int
-kmq_unlink(struct thread *td, struct kmq_unlink_args *uap)
+sys_kmq_unlink(struct thread *td, struct kmq_unlink_args *uap)
{
char path[MQFS_NAMELEN+1];
struct mqfs_node *pn;
int error, len;
error = copyinstr(uap->path, path, MQFS_NAMELEN + 1, NULL);
if (error)
return (error);
len = strlen(path);
if (len < 2 || path[0] != '/' || index(path + 1, '/') != NULL)
return (EINVAL);
sx_xlock(&mqfs_data.mi_lock);
pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1);
if (pn != NULL)
error = do_unlink(pn, td->td_ucred);
else
error = ENOENT;
sx_xunlock(&mqfs_data.mi_lock);
return (error);
}
typedef int (*_fgetf)(struct thread *, int, cap_rights_t, struct file **);
/*
* Get the message queue referenced by a file descriptor
*/
static int
_getmq(struct thread *td, int fd, cap_rights_t rights, _fgetf func,
struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq)
{
struct mqfs_node *pn;
int error;
error = func(td, fd, rights, fpp);
if (error)
return (error);
if (&mqueueops != (*fpp)->f_ops) {
fdrop(*fpp, td);
return (EBADF);
}
pn = (*fpp)->f_data;
if (ppn)
*ppn = pn;
if (pmq)
*pmq = pn->mn_data;
return (0);
}
static __inline int
getmq(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn,
struct mqueue **pmq)
{
return _getmq(td, fd, CAP_POLL_EVENT, fget, fpp, ppn, pmq);
}
static __inline int
getmq_read(struct thread *td, int fd, struct file **fpp,
struct mqfs_node **ppn, struct mqueue **pmq)
{
return _getmq(td, fd, CAP_READ, fget_read, fpp, ppn, pmq);
}
static __inline int
getmq_write(struct thread *td, int fd, struct file **fpp,
struct mqfs_node **ppn, struct mqueue **pmq)
{
return _getmq(td, fd, CAP_WRITE, fget_write, fpp, ppn, pmq);
}
static int
kern_kmq_setattr(struct thread *td, int mqd, const struct mq_attr *attr,
struct mq_attr *oattr)
{
struct mqueue *mq;
struct file *fp;
u_int oflag, flag;
int error;
if (attr != NULL && (attr->mq_flags & ~O_NONBLOCK) != 0)
return (EINVAL);
error = getmq(td, mqd, &fp, NULL, &mq);
if (error)
return (error);
oattr->mq_maxmsg = mq->mq_maxmsg;
oattr->mq_msgsize = mq->mq_msgsize;
oattr->mq_curmsgs = mq->mq_curmsgs;
if (attr != NULL) {
do {
oflag = flag = fp->f_flag;
flag &= ~O_NONBLOCK;
flag |= (attr->mq_flags & O_NONBLOCK);
} while (atomic_cmpset_int(&fp->f_flag, oflag, flag) == 0);
} else
oflag = fp->f_flag;
oattr->mq_flags = (O_NONBLOCK & oflag);
fdrop(fp, td);
return (error);
}
int
-kmq_setattr(struct thread *td, struct kmq_setattr_args *uap)
+sys_kmq_setattr(struct thread *td, struct kmq_setattr_args *uap)
{
struct mq_attr attr, oattr;
int error;
if (uap->attr != NULL) {
error = copyin(uap->attr, &attr, sizeof(attr));
if (error != 0)
return (error);
}
error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? &attr : NULL,
&oattr);
if (error != 0)
return (error);
if (uap->oattr != NULL)
error = copyout(&oattr, uap->oattr, sizeof(oattr));
return (error);
}
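/*
 * Only O_NONBLOCK may be changed through this interface; the other
 * mq_attr fields are reported back but ignored on set.  A minimal
 * userland sketch (helper name is illustrative):
 */
#if 0
#include <mqueue.h>
#include <fcntl.h>
#include <string.h>

static int
set_nonblocking(mqd_t mqd, int on)
{
	struct mq_attr attr, oattr;

	memset(&attr, 0, sizeof(attr));
	attr.mq_flags = on ? O_NONBLOCK : 0;
	return (mq_setattr(mqd, &attr, &oattr));
}
#endif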
int
-kmq_timedreceive(struct thread *td, struct kmq_timedreceive_args *uap)
+sys_kmq_timedreceive(struct thread *td, struct kmq_timedreceive_args *uap)
{
struct mqueue *mq;
struct file *fp;
struct timespec *abs_timeout, ets;
int error;
int waitok;
error = getmq_read(td, uap->mqd, &fp, NULL, &mq);
if (error)
return (error);
if (uap->abs_timeout != NULL) {
error = copyin(uap->abs_timeout, &ets, sizeof(ets));
if (error != 0)
return (error);
abs_timeout = &ets;
} else
abs_timeout = NULL;
waitok = !(fp->f_flag & O_NONBLOCK);
error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len,
uap->msg_prio, waitok, abs_timeout);
fdrop(fp, td);
return (error);
}
int
-kmq_timedsend(struct thread *td, struct kmq_timedsend_args *uap)
+sys_kmq_timedsend(struct thread *td, struct kmq_timedsend_args *uap)
{
struct mqueue *mq;
struct file *fp;
struct timespec *abs_timeout, ets;
int error, waitok;
error = getmq_write(td, uap->mqd, &fp, NULL, &mq);
if (error)
return (error);
if (uap->abs_timeout != NULL) {
error = copyin(uap->abs_timeout, &ets, sizeof(ets));
if (error != 0)
return (error);
abs_timeout = &ets;
} else
abs_timeout = NULL;
waitok = !(fp->f_flag & O_NONBLOCK);
error = mqueue_send(mq, uap->msg_ptr, uap->msg_len,
uap->msg_prio, waitok, abs_timeout);
fdrop(fp, td);
return (error);
}
int
-kmq_notify(struct thread *td, struct kmq_notify_args *uap)
+sys_kmq_notify(struct thread *td, struct kmq_notify_args *uap)
{
struct sigevent ev;
struct filedesc *fdp;
struct proc *p;
struct mqueue *mq;
struct file *fp, *fp2;
struct mqueue_notifier *nt, *newnt = NULL;
int error;
p = td->td_proc;
fdp = td->td_proc->p_fd;
if (uap->sigev) {
error = copyin(uap->sigev, &ev, sizeof(ev));
if (error)
return (error);
if (ev.sigev_notify != SIGEV_SIGNAL &&
ev.sigev_notify != SIGEV_THREAD_ID &&
ev.sigev_notify != SIGEV_NONE)
return (EINVAL);
if ((ev.sigev_notify == SIGEV_SIGNAL ||
ev.sigev_notify == SIGEV_THREAD_ID) &&
!_SIG_VALID(ev.sigev_signo))
return (EINVAL);
}
error = getmq(td, uap->mqd, &fp, NULL, &mq);
if (error)
return (error);
again:
FILEDESC_SLOCK(fdp);
fp2 = fget_locked(fdp, uap->mqd);
if (fp2 == NULL) {
FILEDESC_SUNLOCK(fdp);
error = EBADF;
goto out;
}
error = cap_funwrap(fp2, CAP_POLL_EVENT, &fp2);
if (error) {
FILEDESC_SUNLOCK(fdp);
goto out;
}
if (fp2 != fp) {
FILEDESC_SUNLOCK(fdp);
error = EBADF;
goto out;
}
mtx_lock(&mq->mq_mutex);
FILEDESC_SUNLOCK(fdp);
if (uap->sigev != NULL) {
if (mq->mq_notifier != NULL) {
error = EBUSY;
} else {
PROC_LOCK(p);
nt = notifier_search(p, uap->mqd);
if (nt == NULL) {
if (newnt == NULL) {
PROC_UNLOCK(p);
mtx_unlock(&mq->mq_mutex);
newnt = notifier_alloc();
goto again;
}
}
if (nt != NULL) {
sigqueue_take(&nt->nt_ksi);
if (newnt != NULL) {
notifier_free(newnt);
newnt = NULL;
}
} else {
nt = newnt;
newnt = NULL;
ksiginfo_init(&nt->nt_ksi);
nt->nt_ksi.ksi_flags |= KSI_INS | KSI_EXT;
nt->nt_ksi.ksi_code = SI_MESGQ;
nt->nt_proc = p;
nt->nt_ksi.ksi_mqd = uap->mqd;
notifier_insert(p, nt);
}
nt->nt_sigev = ev;
mq->mq_notifier = nt;
PROC_UNLOCK(p);
/*
* If there are no receivers and the message queue is
* not empty, we should send the notification as soon
* as possible.
*/
if (mq->mq_receivers == 0 &&
!TAILQ_EMPTY(&mq->mq_msgq))
mqueue_send_notification(mq);
}
} else {
notifier_remove(p, mq, uap->mqd);
}
mtx_unlock(&mq->mq_mutex);
out:
fdrop(fp, td);
if (newnt != NULL)
notifier_free(newnt);
return (error);
}
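/*
 * A minimal userland sketch of registering for notification via
 * mq_notify(3); SIGUSR1 and the helper name are illustrative.  The
 * registration is one-shot and is consumed when the notification fires.
 */
#if 0
#include <mqueue.h>
#include <signal.h>
#include <string.h>

static int
arm_notification(mqd_t mqd)
{
	struct sigevent sev;

	memset(&sev, 0, sizeof(sev));
	sev.sigev_notify = SIGEV_SIGNAL;
	sev.sigev_signo = SIGUSR1;
	return (mq_notify(mqd, &sev));
}
#endif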
static void
mqueue_fdclose(struct thread *td, int fd, struct file *fp)
{
struct filedesc *fdp;
struct mqueue *mq;
fdp = td->td_proc->p_fd;
FILEDESC_LOCK_ASSERT(fdp);
if (fp->f_ops == &mqueueops) {
mq = FPTOMQ(fp);
mtx_lock(&mq->mq_mutex);
notifier_remove(td->td_proc, mq, fd);
/* have to wake up threads in the same process */
if (mq->mq_flags & MQ_RSEL) {
mq->mq_flags &= ~MQ_RSEL;
selwakeup(&mq->mq_rsel);
}
if (mq->mq_flags & MQ_WSEL) {
mq->mq_flags &= ~MQ_WSEL;
selwakeup(&mq->mq_wsel);
}
mtx_unlock(&mq->mq_mutex);
}
}
static void
mq_proc_exit(void *arg __unused, struct proc *p)
{
struct filedesc *fdp;
struct file *fp;
struct mqueue *mq;
int i;
fdp = p->p_fd;
FILEDESC_SLOCK(fdp);
for (i = 0; i < fdp->fd_nfiles; ++i) {
fp = fget_locked(fdp, i);
if (fp != NULL && fp->f_ops == &mqueueops) {
mq = FPTOMQ(fp);
mtx_lock(&mq->mq_mutex);
notifier_remove(p, FPTOMQ(fp), i);
mtx_unlock(&mq->mq_mutex);
}
}
FILEDESC_SUNLOCK(fdp);
KASSERT(LIST_EMPTY(&p->p_mqnotifier), ("mq notifiers left"));
}
static int
mqf_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
int flags, struct thread *td)
{
return (EOPNOTSUPP);
}
static int
mqf_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
int flags, struct thread *td)
{
return (EOPNOTSUPP);
}
static int
mqf_truncate(struct file *fp, off_t length, struct ucred *active_cred,
struct thread *td)
{
return (EINVAL);
}
static int
mqf_ioctl(struct file *fp, u_long cmd, void *data,
struct ucred *active_cred, struct thread *td)
{
return (ENOTTY);
}
static int
mqf_poll(struct file *fp, int events, struct ucred *active_cred,
struct thread *td)
{
struct mqueue *mq = FPTOMQ(fp);
int revents = 0;
mtx_lock(&mq->mq_mutex);
if (events & (POLLIN | POLLRDNORM)) {
if (mq->mq_curmsgs) {
revents |= events & (POLLIN | POLLRDNORM);
} else {
mq->mq_flags |= MQ_RSEL;
selrecord(td, &mq->mq_rsel);
}
}
if (events & POLLOUT) {
if (mq->mq_curmsgs < mq->mq_maxmsg)
revents |= POLLOUT;
else {
mq->mq_flags |= MQ_WSEL;
selrecord(td, &mq->mq_wsel);
}
}
mtx_unlock(&mq->mq_mutex);
return (revents);
}
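/*
 * In this implementation the message queue descriptor is an ordinary
 * file descriptor, so poll(2) can wait on it directly.  A minimal
 * sketch, assuming mqfd is the descriptor returned by kmq_open(2);
 * the helper name is illustrative and portable code should not rely
 * on this behaviour:
 */
#if 0
#include <poll.h>

static int
wait_readable(int mqfd, int timeout_ms)
{
	struct pollfd pfd;

	pfd.fd = mqfd;
	pfd.events = POLLIN | POLLRDNORM;
	pfd.revents = 0;
	return (poll(&pfd, 1, timeout_ms));
}
#endif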
static int
mqf_close(struct file *fp, struct thread *td)
{
struct mqfs_node *pn;
fp->f_ops = &badfileops;
pn = fp->f_data;
fp->f_data = NULL;
sx_xlock(&mqfs_data.mi_lock);
mqnode_release(pn);
sx_xunlock(&mqfs_data.mi_lock);
return (0);
}
static int
mqf_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
struct thread *td)
{
struct mqfs_node *pn = fp->f_data;
bzero(st, sizeof *st);
sx_xlock(&mqfs_data.mi_lock);
st->st_atim = pn->mn_atime;
st->st_mtim = pn->mn_mtime;
st->st_ctim = pn->mn_ctime;
st->st_birthtim = pn->mn_birth;
st->st_uid = pn->mn_uid;
st->st_gid = pn->mn_gid;
st->st_mode = S_IFIFO | pn->mn_mode;
sx_xunlock(&mqfs_data.mi_lock);
return (0);
}
static int
mqf_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
struct thread *td)
{
struct mqfs_node *pn;
int error;
error = 0;
pn = fp->f_data;
sx_xlock(&mqfs_data.mi_lock);
error = vaccess(VREG, pn->mn_mode, pn->mn_uid, pn->mn_gid, VADMIN,
active_cred, NULL);
if (error != 0)
goto out;
pn->mn_mode = mode & ACCESSPERMS;
out:
sx_xunlock(&mqfs_data.mi_lock);
return (error);
}
static int
mqf_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
struct thread *td)
{
struct mqfs_node *pn;
int error;
error = 0;
pn = fp->f_data;
sx_xlock(&mqfs_data.mi_lock);
if (uid == (uid_t)-1)
uid = pn->mn_uid;
if (gid == (gid_t)-1)
gid = pn->mn_gid;
if (((uid != pn->mn_uid && uid != active_cred->cr_uid) ||
(gid != pn->mn_gid && !groupmember(gid, active_cred))) &&
(error = priv_check_cred(active_cred, PRIV_VFS_CHOWN, 0)))
goto out;
pn->mn_uid = uid;
pn->mn_gid = gid;
out:
sx_xunlock(&mqfs_data.mi_lock);
return (error);
}
static int
mqf_kqfilter(struct file *fp, struct knote *kn)
{
struct mqueue *mq = FPTOMQ(fp);
int error = 0;
if (kn->kn_filter == EVFILT_READ) {
kn->kn_fop = &mq_rfiltops;
knlist_add(&mq->mq_rsel.si_note, kn, 0);
} else if (kn->kn_filter == EVFILT_WRITE) {
kn->kn_fop = &mq_wfiltops;
knlist_add(&mq->mq_wsel.si_note, kn, 0);
} else
error = EINVAL;
return (error);
}
static void
filt_mqdetach(struct knote *kn)
{
struct mqueue *mq = FPTOMQ(kn->kn_fp);
if (kn->kn_filter == EVFILT_READ)
knlist_remove(&mq->mq_rsel.si_note, kn, 0);
else if (kn->kn_filter == EVFILT_WRITE)
knlist_remove(&mq->mq_wsel.si_note, kn, 0);
else
panic("filt_mqdetach");
}
static int
filt_mqread(struct knote *kn, long hint)
{
struct mqueue *mq = FPTOMQ(kn->kn_fp);
mtx_assert(&mq->mq_mutex, MA_OWNED);
return (mq->mq_curmsgs != 0);
}
static int
filt_mqwrite(struct knote *kn, long hint)
{
struct mqueue *mq = FPTOMQ(kn->kn_fp);
mtx_assert(&mq->mq_mutex, MA_OWNED);
return (mq->mq_curmsgs < mq->mq_maxmsg);
}
static struct fileops mqueueops = {
.fo_read = mqf_read,
.fo_write = mqf_write,
.fo_truncate = mqf_truncate,
.fo_ioctl = mqf_ioctl,
.fo_poll = mqf_poll,
.fo_kqfilter = mqf_kqfilter,
.fo_stat = mqf_stat,
.fo_chmod = mqf_chmod,
.fo_chown = mqf_chown,
.fo_close = mqf_close
};
static struct vop_vector mqfs_vnodeops = {
.vop_default = &default_vnodeops,
.vop_access = mqfs_access,
.vop_cachedlookup = mqfs_lookup,
.vop_lookup = vfs_cache_lookup,
.vop_reclaim = mqfs_reclaim,
.vop_create = mqfs_create,
.vop_remove = mqfs_remove,
.vop_inactive = mqfs_inactive,
.vop_open = mqfs_open,
.vop_close = mqfs_close,
.vop_getattr = mqfs_getattr,
.vop_setattr = mqfs_setattr,
.vop_read = mqfs_read,
.vop_write = VOP_EOPNOTSUPP,
.vop_readdir = mqfs_readdir,
.vop_mkdir = VOP_EOPNOTSUPP,
.vop_rmdir = VOP_EOPNOTSUPP
};
static struct vfsops mqfs_vfsops = {
.vfs_init = mqfs_init,
.vfs_uninit = mqfs_uninit,
.vfs_mount = mqfs_mount,
.vfs_unmount = mqfs_unmount,
.vfs_root = mqfs_root,
.vfs_statfs = mqfs_statfs,
};
static struct vfsconf mqueuefs_vfsconf = {
.vfc_version = VFS_VERSION,
.vfc_name = "mqueuefs",
.vfc_vfsops = &mqfs_vfsops,
.vfc_typenum = -1,
.vfc_flags = VFCF_SYNTHETIC
};
static struct syscall_helper_data mq_syscalls[] = {
SYSCALL_INIT_HELPER(kmq_open),
SYSCALL_INIT_HELPER(kmq_setattr),
SYSCALL_INIT_HELPER(kmq_timedsend),
SYSCALL_INIT_HELPER(kmq_timedreceive),
SYSCALL_INIT_HELPER(kmq_notify),
SYSCALL_INIT_HELPER(kmq_unlink),
SYSCALL_INIT_LAST
};
#ifdef COMPAT_FREEBSD32
#include <compat/freebsd32/freebsd32.h>
#include <compat/freebsd32/freebsd32_proto.h>
#include <compat/freebsd32/freebsd32_syscall.h>
#include <compat/freebsd32/freebsd32_util.h>
static void
mq_attr_from32(const struct mq_attr32 *from, struct mq_attr *to)
{
to->mq_flags = from->mq_flags;
to->mq_maxmsg = from->mq_maxmsg;
to->mq_msgsize = from->mq_msgsize;
to->mq_curmsgs = from->mq_curmsgs;
}
static void
mq_attr_to32(const struct mq_attr *from, struct mq_attr32 *to)
{
to->mq_flags = from->mq_flags;
to->mq_maxmsg = from->mq_maxmsg;
to->mq_msgsize = from->mq_msgsize;
to->mq_curmsgs = from->mq_curmsgs;
}
int
freebsd32_kmq_open(struct thread *td, struct freebsd32_kmq_open_args *uap)
{
struct mq_attr attr;
struct mq_attr32 attr32;
int flags, error;
if ((uap->flags & O_ACCMODE) == O_ACCMODE)
return (EINVAL);
flags = FFLAGS(uap->flags);
if ((flags & O_CREAT) != 0 && uap->attr != NULL) {
error = copyin(uap->attr, &attr32, sizeof(attr32));
if (error)
return (error);
mq_attr_from32(&attr32, &attr);
}
return (kern_kmq_open(td, uap->path, flags, uap->mode,
uap->attr != NULL ? &attr : NULL));
}
int
freebsd32_kmq_setattr(struct thread *td, struct freebsd32_kmq_setattr_args *uap)
{
struct mq_attr attr, oattr;
struct mq_attr32 attr32, oattr32;
int error;
if (uap->attr != NULL) {
error = copyin(uap->attr, &attr32, sizeof(attr32));
if (error != 0)
return (error);
mq_attr_from32(&attr32, &attr);
}
error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? &attr : NULL,
&oattr);
if (error != 0)
return (error);
if (uap->oattr != NULL) {
mq_attr_to32(&oattr, &oattr32);
error = copyout(&oattr32, uap->oattr, sizeof(oattr32));
}
return (error);
}
int
freebsd32_kmq_timedsend(struct thread *td,
struct freebsd32_kmq_timedsend_args *uap)
{
struct mqueue *mq;
struct file *fp;
struct timespec32 ets32;
struct timespec *abs_timeout, ets;
int error;
int waitok;
error = getmq_write(td, uap->mqd, &fp, NULL, &mq);
if (error)
return (error);
if (uap->abs_timeout != NULL) {
error = copyin(uap->abs_timeout, &ets32, sizeof(ets32));
if (error != 0)
return (error);
CP(ets32, ets, tv_sec);
CP(ets32, ets, tv_nsec);
abs_timeout = &ets;
} else
abs_timeout = NULL;
waitok = !(fp->f_flag & O_NONBLOCK);
error = mqueue_send(mq, uap->msg_ptr, uap->msg_len,
uap->msg_prio, waitok, abs_timeout);
fdrop(fp, td);
return (error);
}
int
freebsd32_kmq_timedreceive(struct thread *td,
struct freebsd32_kmq_timedreceive_args *uap)
{
struct mqueue *mq;
struct file *fp;
struct timespec32 ets32;
struct timespec *abs_timeout, ets;
int error, waitok;
error = getmq_read(td, uap->mqd, &fp, NULL, &mq);
if (error)
return (error);
if (uap->abs_timeout != NULL) {
error = copyin(uap->abs_timeout, &ets32, sizeof(ets32));
if (error != 0)
return (error);
CP(ets32, ets, tv_sec);
CP(ets32, ets, tv_nsec);
abs_timeout = &ets;
} else
abs_timeout = NULL;
waitok = !(fp->f_flag & O_NONBLOCK);
error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len,
uap->msg_prio, waitok, abs_timeout);
fdrop(fp, td);
return (error);
}
static struct syscall_helper_data mq32_syscalls[] = {
SYSCALL32_INIT_HELPER(freebsd32_kmq_open),
SYSCALL32_INIT_HELPER(freebsd32_kmq_setattr),
SYSCALL32_INIT_HELPER(freebsd32_kmq_timedsend),
SYSCALL32_INIT_HELPER(freebsd32_kmq_timedreceive),
- SYSCALL32_INIT_HELPER(kmq_notify),
- SYSCALL32_INIT_HELPER(kmq_unlink),
+ SYSCALL32_INIT_HELPER_COMPAT(kmq_notify),
+ SYSCALL32_INIT_HELPER_COMPAT(kmq_unlink),
SYSCALL_INIT_LAST
};
#endif
static int
mqinit(void)
{
int error;
error = syscall_helper_register(mq_syscalls);
if (error != 0)
return (error);
#ifdef COMPAT_FREEBSD32
error = syscall32_helper_register(mq32_syscalls);
if (error != 0)
return (error);
#endif
return (0);
}
static int
mqunload(void)
{
#ifdef COMPAT_FREEBSD32
syscall32_helper_unregister(mq32_syscalls);
#endif
syscall_helper_unregister(mq_syscalls);
return (0);
}
static int
mq_modload(struct module *module, int cmd, void *arg)
{
int error = 0;
error = vfs_modevent(module, cmd, arg);
if (error != 0)
return (error);
switch (cmd) {
case MOD_LOAD:
error = mqinit();
if (error != 0)
mqunload();
break;
case MOD_UNLOAD:
error = mqunload();
break;
default:
break;
}
return (error);
}
static moduledata_t mqueuefs_mod = {
"mqueuefs",
mq_modload,
&mqueuefs_vfsconf
};
DECLARE_MODULE(mqueuefs, mqueuefs_mod, SI_SUB_VFS, SI_ORDER_MIDDLE);
MODULE_VERSION(mqueuefs, 1);
Index: head/sys/kern/uipc_sem.c
===================================================================
--- head/sys/kern/uipc_sem.c (revision 225616)
+++ head/sys/kern/uipc_sem.c (revision 225617)
@@ -1,1091 +1,1091 @@
/*-
* Copyright (c) 2002 Alfred Perlstein <alfred@FreeBSD.org>
* Copyright (c) 2003-2005 SPARTA, Inc.
* Copyright (c) 2005 Robert N. M. Watson
* All rights reserved.
*
* This software was developed for the FreeBSD Project in part by Network
* Associates Laboratories, the Security Research Division of Network
* Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
* as part of the DARPA CHATS research program.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_posix.h"
#include <sys/param.h>
#include <sys/capability.h>
#include <sys/condvar.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/ksem.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/posix4.h>
#include <sys/_semaphore.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/sx.h>
#include <sys/vnode.h>
#include <security/mac/mac_framework.h>
FEATURE(p1003_1b_semaphores, "POSIX P1003.1B semaphores support");
/*
* TODO
*
* - Resource limits?
* - Update fstat(1)
* - Replace global sem_lock with mtx_pool locks?
* - Add a MAC check_create() hook for creating new named semaphores.
*/
#ifndef SEM_MAX
#define SEM_MAX 30
#endif
#ifdef SEM_DEBUG
#define DP(x) printf x
#else
#define DP(x)
#endif
struct ksem_mapping {
char *km_path;
Fnv32_t km_fnv;
struct ksem *km_ksem;
LIST_ENTRY(ksem_mapping) km_link;
};
static MALLOC_DEFINE(M_KSEM, "ksem", "semaphore file descriptor");
static LIST_HEAD(, ksem_mapping) *ksem_dictionary;
static struct sx ksem_dict_lock;
static struct mtx ksem_count_lock;
static struct mtx sem_lock;
static u_long ksem_hash;
static int ksem_dead;
#define KSEM_HASH(fnv) (&ksem_dictionary[(fnv) & ksem_hash])
static int nsems = 0;
SYSCTL_DECL(_p1003_1b);
SYSCTL_INT(_p1003_1b, OID_AUTO, nsems, CTLFLAG_RD, &nsems, 0,
"Number of active kernel POSIX semaphores");
static int kern_sem_wait(struct thread *td, semid_t id, int tryflag,
struct timespec *abstime);
static int ksem_access(struct ksem *ks, struct ucred *ucred);
static struct ksem *ksem_alloc(struct ucred *ucred, mode_t mode,
unsigned int value);
static int ksem_create(struct thread *td, const char *path,
semid_t *semidp, mode_t mode, unsigned int value,
int flags, int compat32);
static void ksem_drop(struct ksem *ks);
static int ksem_get(struct thread *td, semid_t id, cap_rights_t rights,
struct file **fpp);
static struct ksem *ksem_hold(struct ksem *ks);
static void ksem_insert(char *path, Fnv32_t fnv, struct ksem *ks);
static struct ksem *ksem_lookup(char *path, Fnv32_t fnv);
static void ksem_module_destroy(void);
static int ksem_module_init(void);
static int ksem_remove(char *path, Fnv32_t fnv, struct ucred *ucred);
static int sem_modload(struct module *module, int cmd, void *arg);
static fo_rdwr_t ksem_read;
static fo_rdwr_t ksem_write;
static fo_truncate_t ksem_truncate;
static fo_ioctl_t ksem_ioctl;
static fo_poll_t ksem_poll;
static fo_kqfilter_t ksem_kqfilter;
static fo_stat_t ksem_stat;
static fo_close_t ksem_closef;
static fo_chmod_t ksem_chmod;
static fo_chown_t ksem_chown;
/* File descriptor operations. */
static struct fileops ksem_ops = {
.fo_read = ksem_read,
.fo_write = ksem_write,
.fo_truncate = ksem_truncate,
.fo_ioctl = ksem_ioctl,
.fo_poll = ksem_poll,
.fo_kqfilter = ksem_kqfilter,
.fo_stat = ksem_stat,
.fo_close = ksem_closef,
.fo_chmod = ksem_chmod,
.fo_chown = ksem_chown,
.fo_flags = DFLAG_PASSABLE
};
FEATURE(posix_sem, "POSIX semaphores");
static int
ksem_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
int flags, struct thread *td)
{
return (EOPNOTSUPP);
}
static int
ksem_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
int flags, struct thread *td)
{
return (EOPNOTSUPP);
}
static int
ksem_truncate(struct file *fp, off_t length, struct ucred *active_cred,
struct thread *td)
{
return (EINVAL);
}
static int
ksem_ioctl(struct file *fp, u_long com, void *data,
struct ucred *active_cred, struct thread *td)
{
return (EOPNOTSUPP);
}
static int
ksem_poll(struct file *fp, int events, struct ucred *active_cred,
struct thread *td)
{
return (EOPNOTSUPP);
}
static int
ksem_kqfilter(struct file *fp, struct knote *kn)
{
return (EOPNOTSUPP);
}
static int
ksem_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
struct thread *td)
{
struct ksem *ks;
#ifdef MAC
int error;
#endif
ks = fp->f_data;
#ifdef MAC
error = mac_posixsem_check_stat(active_cred, fp->f_cred, ks);
if (error)
return (error);
#endif
/*
* Attempt to return sane values for fstat() on a semaphore
* file descriptor.
*/
bzero(sb, sizeof(*sb));
mtx_lock(&sem_lock);
sb->st_atim = ks->ks_atime;
sb->st_ctim = ks->ks_ctime;
sb->st_mtim = ks->ks_mtime;
sb->st_birthtim = ks->ks_birthtime;
sb->st_uid = ks->ks_uid;
sb->st_gid = ks->ks_gid;
sb->st_mode = S_IFREG | ks->ks_mode; /* XXX */
mtx_unlock(&sem_lock);
return (0);
}
static int
ksem_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
struct thread *td)
{
struct ksem *ks;
int error;
error = 0;
ks = fp->f_data;
mtx_lock(&sem_lock);
#ifdef MAC
error = mac_posixsem_check_setmode(active_cred, ks, mode);
if (error != 0)
goto out;
#endif
error = vaccess(VREG, ks->ks_mode, ks->ks_uid, ks->ks_gid, VADMIN,
active_cred, NULL);
if (error != 0)
goto out;
ks->ks_mode = mode & ACCESSPERMS;
out:
mtx_unlock(&sem_lock);
return (error);
}
static int
ksem_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
struct thread *td)
{
struct ksem *ks;
int error;
error = 0;
ks = fp->f_data;
mtx_lock(&sem_lock);
#ifdef MAC
error = mac_posixsem_check_setowner(active_cred, ks, uid, gid);
if (error != 0)
goto out;
#endif
if (uid == (uid_t)-1)
uid = ks->ks_uid;
if (gid == (gid_t)-1)
gid = ks->ks_gid;
if (((uid != ks->ks_uid && uid != active_cred->cr_uid) ||
(gid != ks->ks_gid && !groupmember(gid, active_cred))) &&
(error = priv_check_cred(active_cred, PRIV_VFS_CHOWN, 0)))
goto out;
ks->ks_uid = uid;
ks->ks_gid = gid;
out:
mtx_unlock(&sem_lock);
return (error);
}
static int
ksem_closef(struct file *fp, struct thread *td)
{
struct ksem *ks;
ks = fp->f_data;
fp->f_data = NULL;
ksem_drop(ks);
return (0);
}
/*
* ksem object management including creation and reference counting
* routines.
*/
static struct ksem *
ksem_alloc(struct ucred *ucred, mode_t mode, unsigned int value)
{
struct ksem *ks;
mtx_lock(&ksem_count_lock);
if (nsems == p31b_getcfg(CTL_P1003_1B_SEM_NSEMS_MAX) || ksem_dead) {
mtx_unlock(&ksem_count_lock);
return (NULL);
}
nsems++;
mtx_unlock(&ksem_count_lock);
ks = malloc(sizeof(*ks), M_KSEM, M_WAITOK | M_ZERO);
ks->ks_uid = ucred->cr_uid;
ks->ks_gid = ucred->cr_gid;
ks->ks_mode = mode;
ks->ks_value = value;
cv_init(&ks->ks_cv, "ksem");
vfs_timestamp(&ks->ks_birthtime);
ks->ks_atime = ks->ks_mtime = ks->ks_ctime = ks->ks_birthtime;
refcount_init(&ks->ks_ref, 1);
#ifdef MAC
mac_posixsem_init(ks);
mac_posixsem_create(ucred, ks);
#endif
return (ks);
}
static struct ksem *
ksem_hold(struct ksem *ks)
{
refcount_acquire(&ks->ks_ref);
return (ks);
}
static void
ksem_drop(struct ksem *ks)
{
if (refcount_release(&ks->ks_ref)) {
#ifdef MAC
mac_posixsem_destroy(ks);
#endif
cv_destroy(&ks->ks_cv);
free(ks, M_KSEM);
mtx_lock(&ksem_count_lock);
nsems--;
mtx_unlock(&ksem_count_lock);
}
}
/*
* Determine if the credentials have sufficient permissions for read
* and write access.
*/
static int
ksem_access(struct ksem *ks, struct ucred *ucred)
{
int error;
error = vaccess(VREG, ks->ks_mode, ks->ks_uid, ks->ks_gid,
VREAD | VWRITE, ucred, NULL);
if (error)
error = priv_check_cred(ucred, PRIV_SEM_WRITE, 0);
return (error);
}
/*
* Dictionary management. We maintain an in-kernel dictionary to map
* paths to semaphore objects. We use the FNV hash on the path to
* store the mappings in a hash table.
*/
static struct ksem *
ksem_lookup(char *path, Fnv32_t fnv)
{
struct ksem_mapping *map;
LIST_FOREACH(map, KSEM_HASH(fnv), km_link) {
if (map->km_fnv != fnv)
continue;
if (strcmp(map->km_path, path) == 0)
return (map->km_ksem);
}
return (NULL);
}
static void
ksem_insert(char *path, Fnv32_t fnv, struct ksem *ks)
{
struct ksem_mapping *map;
map = malloc(sizeof(struct ksem_mapping), M_KSEM, M_WAITOK);
map->km_path = path;
map->km_fnv = fnv;
map->km_ksem = ksem_hold(ks);
LIST_INSERT_HEAD(KSEM_HASH(fnv), map, km_link);
}
static int
ksem_remove(char *path, Fnv32_t fnv, struct ucred *ucred)
{
struct ksem_mapping *map;
int error;
LIST_FOREACH(map, KSEM_HASH(fnv), km_link) {
if (map->km_fnv != fnv)
continue;
if (strcmp(map->km_path, path) == 0) {
#ifdef MAC
error = mac_posixsem_check_unlink(ucred, map->km_ksem);
if (error)
return (error);
#endif
error = ksem_access(map->km_ksem, ucred);
if (error)
return (error);
LIST_REMOVE(map, km_link);
ksem_drop(map->km_ksem);
free(map->km_path, M_KSEM);
free(map, M_KSEM);
return (0);
}
}
return (ENOENT);
}
static int
ksem_create_copyout_semid(struct thread *td, semid_t *semidp, int fd,
int compat32)
{
semid_t semid;
#ifdef COMPAT_FREEBSD32
int32_t semid32;
#endif
void *ptr;
size_t ptrs;
#ifdef COMPAT_FREEBSD32
if (compat32) {
semid32 = fd;
ptr = &semid32;
ptrs = sizeof(semid32);
} else {
#endif
semid = fd;
ptr = &semid;
ptrs = sizeof(semid);
compat32 = 0; /* silence gcc */
#ifdef COMPAT_FREEBSD32
}
#endif
return (copyout(ptr, semidp, ptrs));
}
/* Other helper routines. */
static int
ksem_create(struct thread *td, const char *name, semid_t *semidp, mode_t mode,
unsigned int value, int flags, int compat32)
{
struct filedesc *fdp;
struct ksem *ks;
struct file *fp;
char *path;
Fnv32_t fnv;
int error, fd;
if (value > SEM_VALUE_MAX)
return (EINVAL);
fdp = td->td_proc->p_fd;
mode = (mode & ~fdp->fd_cmask) & ACCESSPERMS;
error = falloc(td, &fp, &fd, 0);
if (error) {
if (name == NULL)
error = ENOSPC;
return (error);
}
/*
* Go ahead and copy out the file descriptor now. This is a bit
* premature, but it is a lot easier to handle errors here than
* later, when we may already have created a new semaphore, etc.
*/
error = ksem_create_copyout_semid(td, semidp, fd, compat32);
if (error) {
fdclose(fdp, fp, fd, td);
fdrop(fp, td);
return (error);
}
if (name == NULL) {
/* Create an anonymous semaphore. */
ks = ksem_alloc(td->td_ucred, mode, value);
if (ks == NULL)
error = ENOSPC;
else
ks->ks_flags |= KS_ANONYMOUS;
} else {
path = malloc(MAXPATHLEN, M_KSEM, M_WAITOK);
error = copyinstr(name, path, MAXPATHLEN, NULL);
/* Require paths to start with a '/' character. */
if (error == 0 && path[0] != '/')
error = EINVAL;
if (error) {
fdclose(fdp, fp, fd, td);
fdrop(fp, td);
free(path, M_KSEM);
return (error);
}
fnv = fnv_32_str(path, FNV1_32_INIT);
sx_xlock(&ksem_dict_lock);
ks = ksem_lookup(path, fnv);
if (ks == NULL) {
/* Object does not exist, create it if requested. */
if (flags & O_CREAT) {
ks = ksem_alloc(td->td_ucred, mode, value);
if (ks == NULL)
error = ENFILE;
else {
ksem_insert(path, fnv, ks);
path = NULL;
}
} else
error = ENOENT;
} else {
/*
* Object already exists, obtain a new
* reference if requested and permitted.
*/
if ((flags & (O_CREAT | O_EXCL)) ==
(O_CREAT | O_EXCL))
error = EEXIST;
else {
#ifdef MAC
error = mac_posixsem_check_open(td->td_ucred,
ks);
if (error == 0)
#endif
error = ksem_access(ks, td->td_ucred);
}
if (error == 0)
ksem_hold(ks);
#ifdef INVARIANTS
else
ks = NULL;
#endif
}
sx_xunlock(&ksem_dict_lock);
if (path)
free(path, M_KSEM);
}
if (error) {
KASSERT(ks == NULL, ("ksem_create error with a ksem"));
fdclose(fdp, fp, fd, td);
fdrop(fp, td);
return (error);
}
KASSERT(ks != NULL, ("ksem_create w/o a ksem"));
finit(fp, FREAD | FWRITE, DTYPE_SEM, ks, &ksem_ops);
FILEDESC_XLOCK(fdp);
if (fdp->fd_ofiles[fd] == fp)
fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
FILEDESC_XUNLOCK(fdp);
fdrop(fp, td);
return (0);
}
static int
ksem_get(struct thread *td, semid_t id, cap_rights_t rights, struct file **fpp)
{
struct ksem *ks;
struct file *fp;
int error;
error = fget(td, id, rights, &fp);
if (error)
return (EINVAL);
if (fp->f_type != DTYPE_SEM) {
fdrop(fp, td);
return (EINVAL);
}
ks = fp->f_data;
if (ks->ks_flags & KS_DEAD) {
fdrop(fp, td);
return (EINVAL);
}
*fpp = fp;
return (0);
}
/* System calls. */
#ifndef _SYS_SYSPROTO_H_
struct ksem_init_args {
unsigned int value;
semid_t *idp;
};
#endif
int
-ksem_init(struct thread *td, struct ksem_init_args *uap)
+sys_ksem_init(struct thread *td, struct ksem_init_args *uap)
{
return (ksem_create(td, NULL, uap->idp, S_IRWXU | S_IRWXG, uap->value,
0, 0));
}
#ifndef _SYS_SYSPROTO_H_
struct ksem_open_args {
char *name;
int oflag;
mode_t mode;
unsigned int value;
semid_t *idp;
};
#endif
int
-ksem_open(struct thread *td, struct ksem_open_args *uap)
+sys_ksem_open(struct thread *td, struct ksem_open_args *uap)
{
DP((">>> ksem_open start, pid=%d\n", (int)td->td_proc->p_pid));
if ((uap->oflag & ~(O_CREAT | O_EXCL)) != 0)
return (EINVAL);
return (ksem_create(td, uap->name, uap->idp, uap->mode, uap->value,
uap->oflag, 0));
}
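/*
 * A minimal userland sketch (assumed, not part of this diff) of the
 * POSIX interface that typically ends up in ksem_create() and
 * sys_ksem_open() above; the exact libc wrapper mapping is an
 * assumption here, but the path rules and flag checks mirror the
 * kernel code.
 */
#include <fcntl.h>
#include <semaphore.h>

int
example_named_sem(void)
{
	sem_t *s;

	/* O_CREAT with an initial value of 1; the name must begin with '/'. */
	s = sem_open("/example_sem", O_CREAT, 0600, 1U);
	if (s == SEM_FAILED)
		return (-1);
	sem_wait(s);			/* decrement, blocking while the value is 0 */
	sem_post(s);			/* increment, waking a waiter if one exists */
	sem_close(s);			/* release this process's reference */
	sem_unlink("/example_sem");	/* remove the name from the dictionary */
	return (0);
}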
#ifndef _SYS_SYSPROTO_H_
struct ksem_unlink_args {
char *name;
};
#endif
int
-ksem_unlink(struct thread *td, struct ksem_unlink_args *uap)
+sys_ksem_unlink(struct thread *td, struct ksem_unlink_args *uap)
{
char *path;
Fnv32_t fnv;
int error;
path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
error = copyinstr(uap->name, path, MAXPATHLEN, NULL);
if (error) {
free(path, M_TEMP);
return (error);
}
fnv = fnv_32_str(path, FNV1_32_INIT);
sx_xlock(&ksem_dict_lock);
error = ksem_remove(path, fnv, td->td_ucred);
sx_xunlock(&ksem_dict_lock);
free(path, M_TEMP);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct ksem_close_args {
semid_t id;
};
#endif
int
-ksem_close(struct thread *td, struct ksem_close_args *uap)
+sys_ksem_close(struct thread *td, struct ksem_close_args *uap)
{
struct ksem *ks;
struct file *fp;
int error;
/* No capability rights required to close a semaphore. */
error = ksem_get(td, uap->id, 0, &fp);
if (error)
return (error);
ks = fp->f_data;
if (ks->ks_flags & KS_ANONYMOUS) {
fdrop(fp, td);
return (EINVAL);
}
error = kern_close(td, uap->id);
fdrop(fp, td);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct ksem_post_args {
semid_t id;
};
#endif
int
-ksem_post(struct thread *td, struct ksem_post_args *uap)
+sys_ksem_post(struct thread *td, struct ksem_post_args *uap)
{
struct file *fp;
struct ksem *ks;
int error;
error = ksem_get(td, uap->id, CAP_SEM_POST, &fp);
if (error)
return (error);
ks = fp->f_data;
mtx_lock(&sem_lock);
#ifdef MAC
error = mac_posixsem_check_post(td->td_ucred, fp->f_cred, ks);
if (error)
goto err;
#endif
if (ks->ks_value == SEM_VALUE_MAX) {
error = EOVERFLOW;
goto err;
}
++ks->ks_value;
if (ks->ks_waiters > 0)
cv_signal(&ks->ks_cv);
error = 0;
vfs_timestamp(&ks->ks_ctime);
err:
mtx_unlock(&sem_lock);
fdrop(fp, td);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct ksem_wait_args {
semid_t id;
};
#endif
int
-ksem_wait(struct thread *td, struct ksem_wait_args *uap)
+sys_ksem_wait(struct thread *td, struct ksem_wait_args *uap)
{
return (kern_sem_wait(td, uap->id, 0, NULL));
}
#ifndef _SYS_SYSPROTO_H_
struct ksem_timedwait_args {
semid_t id;
const struct timespec *abstime;
};
#endif
int
-ksem_timedwait(struct thread *td, struct ksem_timedwait_args *uap)
+sys_ksem_timedwait(struct thread *td, struct ksem_timedwait_args *uap)
{
struct timespec abstime;
struct timespec *ts;
int error;
/*
* We allow a null timespec (wait forever).
*/
if (uap->abstime == NULL)
ts = NULL;
else {
error = copyin(uap->abstime, &abstime, sizeof(abstime));
if (error != 0)
return (error);
if (abstime.tv_nsec >= 1000000000 || abstime.tv_nsec < 0)
return (EINVAL);
ts = &abstime;
}
return (kern_sem_wait(td, uap->id, 0, ts));
}
#ifndef _SYS_SYSPROTO_H_
struct ksem_trywait_args {
semid_t id;
};
#endif
int
-ksem_trywait(struct thread *td, struct ksem_trywait_args *uap)
+sys_ksem_trywait(struct thread *td, struct ksem_trywait_args *uap)
{
return (kern_sem_wait(td, uap->id, 1, NULL));
}
static int
kern_sem_wait(struct thread *td, semid_t id, int tryflag,
struct timespec *abstime)
{
struct timespec ts1, ts2;
struct timeval tv;
struct file *fp;
struct ksem *ks;
int error;
DP((">>> kern_sem_wait entered! pid=%d\n", (int)td->td_proc->p_pid));
error = ksem_get(td, id, CAP_SEM_WAIT, &fp);
if (error)
return (error);
ks = fp->f_data;
mtx_lock(&sem_lock);
DP((">>> kern_sem_wait critical section entered! pid=%d\n",
(int)td->td_proc->p_pid));
#ifdef MAC
error = mac_posixsem_check_wait(td->td_ucred, fp->f_cred, ks);
if (error) {
DP(("kern_sem_wait mac failed\n"));
goto err;
}
#endif
DP(("kern_sem_wait value = %d, tryflag %d\n", ks->ks_value, tryflag));
vfs_timestamp(&ks->ks_atime);
while (ks->ks_value == 0) {
ks->ks_waiters++;
if (tryflag != 0)
error = EAGAIN;
else if (abstime == NULL)
error = cv_wait_sig(&ks->ks_cv, &sem_lock);
else {
for (;;) {
ts1 = *abstime;
getnanotime(&ts2);
timespecsub(&ts1, &ts2);
TIMESPEC_TO_TIMEVAL(&tv, &ts1);
if (tv.tv_sec < 0) {
error = ETIMEDOUT;
break;
}
error = cv_timedwait_sig(&ks->ks_cv,
&sem_lock, tvtohz(&tv));
if (error != EWOULDBLOCK)
break;
}
}
ks->ks_waiters--;
if (error)
goto err;
}
ks->ks_value--;
DP(("kern_sem_wait value post-decrement = %d\n", ks->ks_value));
error = 0;
err:
mtx_unlock(&sem_lock);
fdrop(fp, td);
DP(("<<< kern_sem_wait leaving, pid=%d, error = %d\n",
(int)td->td_proc->p_pid, error));
return (error);
}
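/*
 * Userland sketch (assumed, not from this file): sem_timedwait(3) takes
 * an absolute CLOCK_REALTIME deadline, which matches the abstime
 * handling in kern_sem_wait() above, where the remaining time is
 * recomputed from the deadline on every pass through the wait loop.
 */
#include <errno.h>
#include <semaphore.h>
#include <time.h>

int
example_timedwait(sem_t *s)
{
	struct timespec deadline;

	clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_sec += 5;			/* wait at most five seconds */
	if (sem_timedwait(s, &deadline) == -1)
		return (errno == ETIMEDOUT ? 1 : -1);
	return (0);
}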
#ifndef _SYS_SYSPROTO_H_
struct ksem_getvalue_args {
semid_t id;
int *val;
};
#endif
int
-ksem_getvalue(struct thread *td, struct ksem_getvalue_args *uap)
+sys_ksem_getvalue(struct thread *td, struct ksem_getvalue_args *uap)
{
struct file *fp;
struct ksem *ks;
int error, val;
error = ksem_get(td, uap->id, CAP_SEM_GETVALUE, &fp);
if (error)
return (error);
ks = fp->f_data;
mtx_lock(&sem_lock);
#ifdef MAC
error = mac_posixsem_check_getvalue(td->td_ucred, fp->f_cred, ks);
if (error) {
mtx_unlock(&sem_lock);
fdrop(fp, td);
return (error);
}
#endif
val = ks->ks_value;
vfs_timestamp(&ks->ks_atime);
mtx_unlock(&sem_lock);
fdrop(fp, td);
error = copyout(&val, uap->val, sizeof(val));
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct ksem_destroy_args {
semid_t id;
};
#endif
int
-ksem_destroy(struct thread *td, struct ksem_destroy_args *uap)
+sys_ksem_destroy(struct thread *td, struct ksem_destroy_args *uap)
{
struct file *fp;
struct ksem *ks;
int error;
/* No capability rights required to close a semaphore. */
error = ksem_get(td, uap->id, 0, &fp);
if (error)
return (error);
ks = fp->f_data;
if (!(ks->ks_flags & KS_ANONYMOUS)) {
fdrop(fp, td);
return (EINVAL);
}
mtx_lock(&sem_lock);
if (ks->ks_waiters != 0) {
mtx_unlock(&sem_lock);
error = EBUSY;
goto err;
}
ks->ks_flags |= KS_DEAD;
mtx_unlock(&sem_lock);
error = kern_close(td, uap->id);
err:
fdrop(fp, td);
return (error);
}
static struct syscall_helper_data ksem_syscalls[] = {
SYSCALL_INIT_HELPER(ksem_init),
SYSCALL_INIT_HELPER(ksem_open),
SYSCALL_INIT_HELPER(ksem_unlink),
SYSCALL_INIT_HELPER(ksem_close),
SYSCALL_INIT_HELPER(ksem_post),
SYSCALL_INIT_HELPER(ksem_wait),
SYSCALL_INIT_HELPER(ksem_timedwait),
SYSCALL_INIT_HELPER(ksem_trywait),
SYSCALL_INIT_HELPER(ksem_getvalue),
SYSCALL_INIT_HELPER(ksem_destroy),
SYSCALL_INIT_LAST
};
#ifdef COMPAT_FREEBSD32
#include <compat/freebsd32/freebsd32.h>
#include <compat/freebsd32/freebsd32_proto.h>
#include <compat/freebsd32/freebsd32_signal.h>
#include <compat/freebsd32/freebsd32_syscall.h>
#include <compat/freebsd32/freebsd32_util.h>
int
freebsd32_ksem_init(struct thread *td, struct freebsd32_ksem_init_args *uap)
{
return (ksem_create(td, NULL, uap->idp, S_IRWXU | S_IRWXG, uap->value,
0, 1));
}
int
freebsd32_ksem_open(struct thread *td, struct freebsd32_ksem_open_args *uap)
{
if ((uap->oflag & ~(O_CREAT | O_EXCL)) != 0)
return (EINVAL);
return (ksem_create(td, uap->name, uap->idp, uap->mode, uap->value,
uap->oflag, 1));
}
int
freebsd32_ksem_timedwait(struct thread *td,
struct freebsd32_ksem_timedwait_args *uap)
{
struct timespec32 abstime32;
struct timespec *ts, abstime;
int error;
/*
* We allow a null timespec (wait forever).
*/
if (uap->abstime == NULL)
ts = NULL;
else {
error = copyin(uap->abstime, &abstime32, sizeof(abstime32));
if (error != 0)
return (error);
CP(abstime32, abstime, tv_sec);
CP(abstime32, abstime, tv_nsec);
if (abstime.tv_nsec >= 1000000000 || abstime.tv_nsec < 0)
return (EINVAL);
ts = &abstime;
}
return (kern_sem_wait(td, uap->id, 0, ts));
}
static struct syscall_helper_data ksem32_syscalls[] = {
SYSCALL32_INIT_HELPER(freebsd32_ksem_init),
SYSCALL32_INIT_HELPER(freebsd32_ksem_open),
- SYSCALL32_INIT_HELPER(ksem_unlink),
- SYSCALL32_INIT_HELPER(ksem_close),
- SYSCALL32_INIT_HELPER(ksem_post),
- SYSCALL32_INIT_HELPER(ksem_wait),
+ SYSCALL32_INIT_HELPER_COMPAT(ksem_unlink),
+ SYSCALL32_INIT_HELPER_COMPAT(ksem_close),
+ SYSCALL32_INIT_HELPER_COMPAT(ksem_post),
+ SYSCALL32_INIT_HELPER_COMPAT(ksem_wait),
SYSCALL32_INIT_HELPER(freebsd32_ksem_timedwait),
- SYSCALL32_INIT_HELPER(ksem_trywait),
- SYSCALL32_INIT_HELPER(ksem_getvalue),
- SYSCALL32_INIT_HELPER(ksem_destroy),
+ SYSCALL32_INIT_HELPER_COMPAT(ksem_trywait),
+ SYSCALL32_INIT_HELPER_COMPAT(ksem_getvalue),
+ SYSCALL32_INIT_HELPER_COMPAT(ksem_destroy),
SYSCALL_INIT_LAST
};
#endif
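/*
 * Hedged reading of the helper macros used above: the
 * SYSCALL32_INIT_HELPER_COMPAT() entries appear to register the native
 * sys_ksem_* handlers directly in the 32-bit syscall table, which works
 * for the syscalls whose argument layout is identical in 32-bit mode;
 * dedicated freebsd32_* wrappers remain only where translation is
 * needed (the semid_t output pointer in ksem_init/ksem_open and the
 * 32-bit timespec in ksem_timedwait).
 */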
static int
ksem_module_init(void)
{
int error;
mtx_init(&sem_lock, "sem", NULL, MTX_DEF);
mtx_init(&ksem_count_lock, "ksem count", NULL, MTX_DEF);
sx_init(&ksem_dict_lock, "ksem dictionary");
ksem_dictionary = hashinit(1024, M_KSEM, &ksem_hash);
p31b_setcfg(CTL_P1003_1B_SEMAPHORES, 200112L);
p31b_setcfg(CTL_P1003_1B_SEM_NSEMS_MAX, SEM_MAX);
p31b_setcfg(CTL_P1003_1B_SEM_VALUE_MAX, SEM_VALUE_MAX);
error = syscall_helper_register(ksem_syscalls);
if (error)
return (error);
#ifdef COMPAT_FREEBSD32
error = syscall32_helper_register(ksem32_syscalls);
if (error)
return (error);
#endif
return (0);
}
static void
ksem_module_destroy(void)
{
#ifdef COMPAT_FREEBSD32
syscall32_helper_unregister(ksem32_syscalls);
#endif
syscall_helper_unregister(ksem_syscalls);
p31b_setcfg(CTL_P1003_1B_SEMAPHORES, 0);
hashdestroy(ksem_dictionary, M_KSEM, ksem_hash);
sx_destroy(&ksem_dict_lock);
mtx_destroy(&ksem_count_lock);
mtx_destroy(&sem_lock);
p31b_unsetcfg(CTL_P1003_1B_SEM_VALUE_MAX);
p31b_unsetcfg(CTL_P1003_1B_SEM_NSEMS_MAX);
}
static int
sem_modload(struct module *module, int cmd, void *arg)
{
int error = 0;
switch (cmd) {
case MOD_LOAD:
error = ksem_module_init();
if (error)
ksem_module_destroy();
break;
case MOD_UNLOAD:
mtx_lock(&ksem_count_lock);
if (nsems != 0) {
error = EOPNOTSUPP;
mtx_unlock(&ksem_count_lock);
break;
}
ksem_dead = 1;
mtx_unlock(&ksem_count_lock);
ksem_module_destroy();
break;
case MOD_SHUTDOWN:
break;
default:
error = EINVAL;
break;
}
return (error);
}
static moduledata_t sem_mod = {
"sem",
&sem_modload,
NULL
};
DECLARE_MODULE(sem, sem_mod, SI_SUB_SYSV_SEM, SI_ORDER_FIRST);
MODULE_VERSION(sem, 1);
Index: head/sys/kern/uipc_shm.c
===================================================================
--- head/sys/kern/uipc_shm.c (revision 225616)
+++ head/sys/kern/uipc_shm.c (revision 225617)
@@ -1,727 +1,727 @@
/*-
* Copyright (c) 2006, 2011 Robert N. M. Watson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Support for shared swap-backed anonymous memory objects via
* shm_open(2) and shm_unlink(2). While most of the implementation is
* here, vm_mmap.c contains mapping logic changes.
*
* TODO:
*
* (1) Need to export data to a userland tool via a sysctl. Should ipcs(1)
* and ipcrm(1) be expanded or should new tools to manage both POSIX
* kernel semaphores and POSIX shared memory be written?
*
* (2) Add support for this file type to fstat(1).
*
* (3) Resource limits? Does this need its own resource limits or are the
* existing limits in mmap(2) sufficient?
*
* (4) Partial page truncation. vnode_pager_setsize() will zero any parts
* of a partially mapped page as a result of ftruncate(2)/truncate(2).
* We can do the same (with the same pmap evil), but do we need to
* worry about the bits on disk if the page is swapped out or will the
* swapper zero the parts of a page that are invalid if the page is
* swapped back in for us?
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_capsicum.h"
#include <sys/param.h>
#include <sys/capability.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/resourcevar.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/sx.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
struct shm_mapping {
char *sm_path;
Fnv32_t sm_fnv;
struct shmfd *sm_shmfd;
LIST_ENTRY(shm_mapping) sm_link;
};
static MALLOC_DEFINE(M_SHMFD, "shmfd", "shared memory file descriptor");
static LIST_HEAD(, shm_mapping) *shm_dictionary;
static struct sx shm_dict_lock;
static struct mtx shm_timestamp_lock;
static u_long shm_hash;
#define SHM_HASH(fnv) (&shm_dictionary[(fnv) & shm_hash])
static int shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags);
static struct shmfd *shm_alloc(struct ucred *ucred, mode_t mode);
static void shm_dict_init(void *arg);
static void shm_drop(struct shmfd *shmfd);
static struct shmfd *shm_hold(struct shmfd *shmfd);
static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd);
static struct shmfd *shm_lookup(char *path, Fnv32_t fnv);
static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred);
static int shm_dotruncate(struct shmfd *shmfd, off_t length);
static fo_rdwr_t shm_read;
static fo_rdwr_t shm_write;
static fo_truncate_t shm_truncate;
static fo_ioctl_t shm_ioctl;
static fo_poll_t shm_poll;
static fo_kqfilter_t shm_kqfilter;
static fo_stat_t shm_stat;
static fo_close_t shm_close;
static fo_chmod_t shm_chmod;
static fo_chown_t shm_chown;
/* File descriptor operations. */
static struct fileops shm_ops = {
.fo_read = shm_read,
.fo_write = shm_write,
.fo_truncate = shm_truncate,
.fo_ioctl = shm_ioctl,
.fo_poll = shm_poll,
.fo_kqfilter = shm_kqfilter,
.fo_stat = shm_stat,
.fo_close = shm_close,
.fo_chmod = shm_chmod,
.fo_chown = shm_chown,
.fo_flags = DFLAG_PASSABLE
};
FEATURE(posix_shm, "POSIX shared memory");
static int
shm_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
int flags, struct thread *td)
{
return (EOPNOTSUPP);
}
static int
shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
int flags, struct thread *td)
{
return (EOPNOTSUPP);
}
static int
shm_truncate(struct file *fp, off_t length, struct ucred *active_cred,
struct thread *td)
{
struct shmfd *shmfd;
#ifdef MAC
int error;
#endif
shmfd = fp->f_data;
#ifdef MAC
error = mac_posixshm_check_truncate(active_cred, fp->f_cred, shmfd);
if (error)
return (error);
#endif
return (shm_dotruncate(shmfd, length));
}
static int
shm_ioctl(struct file *fp, u_long com, void *data,
struct ucred *active_cred, struct thread *td)
{
return (EOPNOTSUPP);
}
static int
shm_poll(struct file *fp, int events, struct ucred *active_cred,
struct thread *td)
{
return (EOPNOTSUPP);
}
static int
shm_kqfilter(struct file *fp, struct knote *kn)
{
return (EOPNOTSUPP);
}
static int
shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
struct thread *td)
{
struct shmfd *shmfd;
#ifdef MAC
int error;
#endif
shmfd = fp->f_data;
#ifdef MAC
error = mac_posixshm_check_stat(active_cred, fp->f_cred, shmfd);
if (error)
return (error);
#endif
/*
* Attempt to return sane values for fstat() on a memory file
* descriptor.
*/
bzero(sb, sizeof(*sb));
sb->st_blksize = PAGE_SIZE;
sb->st_size = shmfd->shm_size;
sb->st_blocks = (sb->st_size + sb->st_blksize - 1) / sb->st_blksize;
mtx_lock(&shm_timestamp_lock);
sb->st_atim = shmfd->shm_atime;
sb->st_ctim = shmfd->shm_ctime;
sb->st_mtim = shmfd->shm_mtime;
sb->st_birthtim = shmfd->shm_birthtime;
sb->st_mode = S_IFREG | shmfd->shm_mode; /* XXX */
sb->st_uid = shmfd->shm_uid;
sb->st_gid = shmfd->shm_gid;
mtx_unlock(&shm_timestamp_lock);
return (0);
}
static int
shm_close(struct file *fp, struct thread *td)
{
struct shmfd *shmfd;
shmfd = fp->f_data;
fp->f_data = NULL;
shm_drop(shmfd);
return (0);
}
static int
shm_dotruncate(struct shmfd *shmfd, off_t length)
{
vm_object_t object;
vm_page_t m;
vm_pindex_t nobjsize;
vm_ooffset_t delta;
object = shmfd->shm_object;
VM_OBJECT_LOCK(object);
if (length == shmfd->shm_size) {
VM_OBJECT_UNLOCK(object);
return (0);
}
nobjsize = OFF_TO_IDX(length + PAGE_MASK);
/* Are we shrinking? If so, trim the end. */
if (length < shmfd->shm_size) {
delta = ptoa(object->size - nobjsize);
/* Toss in memory pages. */
if (nobjsize < object->size)
vm_object_page_remove(object, nobjsize, object->size,
0);
/* Toss pages from swap. */
if (object->type == OBJT_SWAP)
swap_pager_freespace(object, nobjsize, delta);
/* Free the swap accounted for shm */
swap_release_by_cred(delta, object->cred);
object->charge -= delta;
/*
* If the last page is partially mapped, then zero out
* the garbage at the end of the page. See comments
* in vnode_pager_setsize() for more details.
*
* XXXJHB: This handles in-memory pages, but what about
* a page swapped out to disk?
*/
if ((length & PAGE_MASK) &&
(m = vm_page_lookup(object, OFF_TO_IDX(length))) != NULL &&
m->valid != 0) {
int base = (int)length & PAGE_MASK;
int size = PAGE_SIZE - base;
pmap_zero_page_area(m, base, size);
/*
* Update the valid bits to reflect the blocks that
* have been zeroed. Some of these valid bits may
* have already been set.
*/
vm_page_set_valid(m, base, size);
/*
* Round "base" to the next block boundary so that the
* dirty bit for a partially zeroed block is not
* cleared.
*/
base = roundup2(base, DEV_BSIZE);
vm_page_clear_dirty(m, base, PAGE_SIZE - base);
} else if ((length & PAGE_MASK) &&
__predict_false(object->cache != NULL)) {
vm_page_cache_free(object, OFF_TO_IDX(length),
nobjsize);
}
} else {
/* Attempt to reserve the swap */
delta = ptoa(nobjsize - object->size);
if (!swap_reserve_by_cred(delta, object->cred)) {
VM_OBJECT_UNLOCK(object);
return (ENOMEM);
}
object->charge += delta;
}
shmfd->shm_size = length;
mtx_lock(&shm_timestamp_lock);
vfs_timestamp(&shmfd->shm_ctime);
shmfd->shm_mtime = shmfd->shm_ctime;
mtx_unlock(&shm_timestamp_lock);
object->size = nobjsize;
VM_OBJECT_UNLOCK(object);
return (0);
}
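/*
 * Worked example of the partial-page zeroing above, assuming 4 KB pages:
 * truncating to length 0x1a2f gives base = 0x1a2f & PAGE_MASK = 0xa2f,
 * so size = PAGE_SIZE - base = 0x5d1 bytes at the end of the last
 * resident page are zeroed and marked valid, while dirty bits are
 * cleared only from the next DEV_BSIZE boundary (0xc00) to the end of
 * the page, so the partially zeroed block keeps its dirty bit.
 */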
/*
* shmfd object management including creation and reference counting
* routines.
*/
static struct shmfd *
shm_alloc(struct ucred *ucred, mode_t mode)
{
struct shmfd *shmfd;
shmfd = malloc(sizeof(*shmfd), M_SHMFD, M_WAITOK | M_ZERO);
shmfd->shm_size = 0;
shmfd->shm_uid = ucred->cr_uid;
shmfd->shm_gid = ucred->cr_gid;
shmfd->shm_mode = mode;
shmfd->shm_object = vm_pager_allocate(OBJT_DEFAULT, NULL,
shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred);
KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate"));
VM_OBJECT_LOCK(shmfd->shm_object);
vm_object_clear_flag(shmfd->shm_object, OBJ_ONEMAPPING);
vm_object_set_flag(shmfd->shm_object, OBJ_NOSPLIT);
VM_OBJECT_UNLOCK(shmfd->shm_object);
vfs_timestamp(&shmfd->shm_birthtime);
shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime =
shmfd->shm_birthtime;
refcount_init(&shmfd->shm_refs, 1);
#ifdef MAC
mac_posixshm_init(shmfd);
mac_posixshm_create(ucred, shmfd);
#endif
return (shmfd);
}
static struct shmfd *
shm_hold(struct shmfd *shmfd)
{
refcount_acquire(&shmfd->shm_refs);
return (shmfd);
}
static void
shm_drop(struct shmfd *shmfd)
{
if (refcount_release(&shmfd->shm_refs)) {
#ifdef MAC
mac_posixshm_destroy(shmfd);
#endif
vm_object_deallocate(shmfd->shm_object);
free(shmfd, M_SHMFD);
}
}
/*
* Determine if the credentials have sufficient permissions for a
* specified combination of FREAD and FWRITE.
*/
static int
shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags)
{
accmode_t accmode;
int error;
accmode = 0;
if (flags & FREAD)
accmode |= VREAD;
if (flags & FWRITE)
accmode |= VWRITE;
mtx_lock(&shm_timestamp_lock);
error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid,
accmode, ucred, NULL);
mtx_unlock(&shm_timestamp_lock);
return (error);
}
/*
* Dictionary management. We maintain an in-kernel dictionary to map
* paths to shmfd objects. We use the FNV hash on the path to store
* the mappings in a hash table.
*/
static void
shm_dict_init(void *arg)
{
mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF);
sx_init(&shm_dict_lock, "shm dictionary");
shm_dictionary = hashinit(1024, M_SHMFD, &shm_hash);
}
SYSINIT(shm_dict_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_dict_init, NULL);
static struct shmfd *
shm_lookup(char *path, Fnv32_t fnv)
{
struct shm_mapping *map;
LIST_FOREACH(map, SHM_HASH(fnv), sm_link) {
if (map->sm_fnv != fnv)
continue;
if (strcmp(map->sm_path, path) == 0)
return (map->sm_shmfd);
}
return (NULL);
}
static void
shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd)
{
struct shm_mapping *map;
map = malloc(sizeof(struct shm_mapping), M_SHMFD, M_WAITOK);
map->sm_path = path;
map->sm_fnv = fnv;
map->sm_shmfd = shm_hold(shmfd);
LIST_INSERT_HEAD(SHM_HASH(fnv), map, sm_link);
}
static int
shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred)
{
struct shm_mapping *map;
int error;
LIST_FOREACH(map, SHM_HASH(fnv), sm_link) {
if (map->sm_fnv != fnv)
continue;
if (strcmp(map->sm_path, path) == 0) {
#ifdef MAC
error = mac_posixshm_check_unlink(ucred, map->sm_shmfd);
if (error)
return (error);
#endif
error = shm_access(map->sm_shmfd, ucred,
FREAD | FWRITE);
if (error)
return (error);
LIST_REMOVE(map, sm_link);
shm_drop(map->sm_shmfd);
free(map->sm_path, M_SHMFD);
free(map, M_SHMFD);
return (0);
}
}
return (ENOENT);
}
/* System calls. */
int
-shm_open(struct thread *td, struct shm_open_args *uap)
+sys_shm_open(struct thread *td, struct shm_open_args *uap)
{
struct filedesc *fdp;
struct shmfd *shmfd;
struct file *fp;
char *path;
Fnv32_t fnv;
mode_t cmode;
int fd, error;
#ifdef CAPABILITY_MODE
/*
* shm_open(2) is only allowed for anonymous objects.
*/
if (IN_CAPABILITY_MODE(td) && (uap->path != SHM_ANON))
return (ECAPMODE);
#endif
if ((uap->flags & O_ACCMODE) != O_RDONLY &&
(uap->flags & O_ACCMODE) != O_RDWR)
return (EINVAL);
if ((uap->flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC)) != 0)
return (EINVAL);
fdp = td->td_proc->p_fd;
cmode = (uap->mode & ~fdp->fd_cmask) & ACCESSPERMS;
error = falloc(td, &fp, &fd, 0);
if (error)
return (error);
/* A SHM_ANON path pointer creates an anonymous object. */
if (uap->path == SHM_ANON) {
/* A read-only anonymous object is pointless. */
if ((uap->flags & O_ACCMODE) == O_RDONLY) {
fdclose(fdp, fp, fd, td);
fdrop(fp, td);
return (EINVAL);
}
shmfd = shm_alloc(td->td_ucred, cmode);
} else {
path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK);
error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
/* Require paths to start with a '/' character. */
if (error == 0 && path[0] != '/')
error = EINVAL;
if (error) {
fdclose(fdp, fp, fd, td);
fdrop(fp, td);
free(path, M_SHMFD);
return (error);
}
fnv = fnv_32_str(path, FNV1_32_INIT);
sx_xlock(&shm_dict_lock);
shmfd = shm_lookup(path, fnv);
if (shmfd == NULL) {
/* Object does not yet exist, create it if requested. */
if (uap->flags & O_CREAT) {
#ifdef MAC
error = mac_posixshm_check_create(td->td_ucred,
path);
if (error == 0) {
#endif
shmfd = shm_alloc(td->td_ucred, cmode);
shm_insert(path, fnv, shmfd);
#ifdef MAC
}
#endif
} else {
free(path, M_SHMFD);
error = ENOENT;
}
} else {
/*
* Object already exists, obtain a new
* reference if requested and permitted.
*/
free(path, M_SHMFD);
if ((uap->flags & (O_CREAT | O_EXCL)) ==
(O_CREAT | O_EXCL))
error = EEXIST;
else {
#ifdef MAC
error = mac_posixshm_check_open(td->td_ucred,
shmfd, FFLAGS(uap->flags & O_ACCMODE));
if (error == 0)
#endif
error = shm_access(shmfd, td->td_ucred,
FFLAGS(uap->flags & O_ACCMODE));
}
/*
* Truncate the file back to zero length if
* O_TRUNC was specified and the object was
* opened with read/write.
*/
if (error == 0 &&
(uap->flags & (O_ACCMODE | O_TRUNC)) ==
(O_RDWR | O_TRUNC)) {
#ifdef MAC
error = mac_posixshm_check_truncate(
td->td_ucred, fp->f_cred, shmfd);
if (error == 0)
#endif
shm_dotruncate(shmfd, 0);
}
if (error == 0)
shm_hold(shmfd);
}
sx_xunlock(&shm_dict_lock);
if (error) {
fdclose(fdp, fp, fd, td);
fdrop(fp, td);
return (error);
}
}
finit(fp, FFLAGS(uap->flags & O_ACCMODE), DTYPE_SHM, shmfd, &shm_ops);
FILEDESC_XLOCK(fdp);
if (fdp->fd_ofiles[fd] == fp)
fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
FILEDESC_XUNLOCK(fdp);
td->td_retval[0] = fd;
fdrop(fp, td);
return (0);
}
int
-shm_unlink(struct thread *td, struct shm_unlink_args *uap)
+sys_shm_unlink(struct thread *td, struct shm_unlink_args *uap)
{
char *path;
Fnv32_t fnv;
int error;
path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
if (error) {
free(path, M_TEMP);
return (error);
}
fnv = fnv_32_str(path, FNV1_32_INIT);
sx_xlock(&shm_dict_lock);
error = shm_remove(path, fnv, td->td_ucred);
sx_xunlock(&shm_dict_lock);
free(path, M_TEMP);
return (error);
}
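/*
 * Userland sketch (assumed, not part of this diff) of the typical
 * shm_open(2)/ftruncate(2)/mmap(2) sequence these syscalls serve; the
 * ftruncate() ends up in shm_dotruncate() above and the mapping request
 * in shm_mmap() below.
 */
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

void *
example_shm_map(size_t len)
{
	void *p;
	int fd;

	fd = shm_open("/example_shm", O_RDWR | O_CREAT, 0600);
	if (fd == -1)
		return (NULL);
	if (ftruncate(fd, len) == -1) {		/* size the object */
		close(fd);
		return (NULL);
	}
	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	close(fd);				/* the mapping holds its own reference */
	return (p == MAP_FAILED ? NULL : p);
}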
/*
* mmap() helper to validate mmap() requests against shm object state
* and give mmap() the vm_object to use for the mapping.
*/
int
shm_mmap(struct shmfd *shmfd, vm_size_t objsize, vm_ooffset_t foff,
vm_object_t *obj)
{
/*
* XXXRW: This validation is probably insufficient, and subject to
* sign errors. It should be fixed.
*/
if (foff >= shmfd->shm_size ||
foff + objsize > round_page(shmfd->shm_size))
return (EINVAL);
mtx_lock(&shm_timestamp_lock);
vfs_timestamp(&shmfd->shm_atime);
mtx_unlock(&shm_timestamp_lock);
vm_object_reference(shmfd->shm_object);
*obj = shmfd->shm_object;
return (0);
}
static int
shm_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
struct thread *td)
{
struct shmfd *shmfd;
int error;
error = 0;
shmfd = fp->f_data;
mtx_lock(&shm_timestamp_lock);
/*
* SUSv4 says that x bits of permission need not be affected.
* Be consistent with our shm_open there.
*/
#ifdef MAC
error = mac_posixshm_check_setmode(active_cred, shmfd, mode);
if (error != 0)
goto out;
#endif
error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid,
shmfd->shm_gid, VADMIN, active_cred, NULL);
if (error != 0)
goto out;
shmfd->shm_mode = mode & ACCESSPERMS;
out:
mtx_unlock(&shm_timestamp_lock);
return (error);
}
static int
shm_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
struct thread *td)
{
struct shmfd *shmfd;
int error;
error = 0;
shmfd = fp->f_data;
mtx_lock(&shm_timestamp_lock);
#ifdef MAC
error = mac_posixshm_check_setowner(active_cred, shmfd, uid, gid);
if (error != 0)
goto out;
#endif
if (uid == (uid_t)-1)
uid = shmfd->shm_uid;
if (gid == (gid_t)-1)
gid = shmfd->shm_gid;
if (((uid != shmfd->shm_uid && uid != active_cred->cr_uid) ||
(gid != shmfd->shm_gid && !groupmember(gid, active_cred))) &&
(error = priv_check_cred(active_cred, PRIV_VFS_CHOWN, 0)))
goto out;
shmfd->shm_uid = uid;
shmfd->shm_gid = gid;
out:
mtx_unlock(&shm_timestamp_lock);
return (error);
}
Index: head/sys/kern/uipc_syscalls.c
===================================================================
--- head/sys/kern/uipc_syscalls.c (revision 225616)
+++ head/sys/kern/uipc_syscalls.c (revision 225617)
@@ -1,2766 +1,2766 @@
/*-
* Copyright (c) 1982, 1986, 1989, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* sendfile(2) and related extensions:
* Copyright (c) 1998, David Greenman. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_capsicum.h"
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_sctp.h"
#include "opt_compat.h"
#include "opt_ktrace.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capability.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/malloc.h>
#include <sys/filedesc.h>
#include <sys/event.h>
#include <sys/proc.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filio.h>
#include <sys/jail.h>
#include <sys/mount.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/sf_buf.h>
#include <sys/sysent.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/vnode.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#ifdef COMPAT_FREEBSD32
#include <compat/freebsd32/freebsd32_util.h>
#endif
#include <net/vnet.h>
#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#if defined(INET) || defined(INET6)
#ifdef SCTP
#include <netinet/sctp.h>
#include <netinet/sctp_peeloff.h>
#endif /* SCTP */
#endif /* INET || INET6 */
static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);
static int accept1(struct thread *td, struct accept_args *uap, int compat);
static int do_sendfile(struct thread *td, struct sendfile_args *uap, int compat);
static int getsockname1(struct thread *td, struct getsockname_args *uap,
int compat);
static int getpeername1(struct thread *td, struct getpeername_args *uap,
int compat);
/*
* NSFBUFS-related variables and associated sysctls
*/
int nsfbufs;
int nsfbufspeak;
int nsfbufsused;
SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0,
"Maximum number of sendfile(2) sf_bufs available");
SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0,
"Number of sendfile(2) sf_bufs at peak usage");
SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0,
"Number of sendfile(2) sf_bufs in use");
/*
* Convert a user file descriptor to a kernel file entry and check that, if
* it is a capability, the right rights are present. A reference on the file
* entry is held upon returning.
*/
static int
getsock_cap(struct filedesc *fdp, int fd, cap_rights_t rights,
struct file **fpp, u_int *fflagp)
{
struct file *fp;
#ifdef CAPABILITIES
struct file *fp_fromcap;
int error;
#endif
fp = NULL;
if ((fdp == NULL) || ((fp = fget_unlocked(fdp, fd)) == NULL))
return (EBADF);
#ifdef CAPABILITIES
/*
* If the file descriptor is for a capability, test rights and use
* the file descriptor referenced by the capability.
*/
error = cap_funwrap(fp, rights, &fp_fromcap);
if (error) {
fdrop(fp, curthread);
return (error);
}
if (fp != fp_fromcap) {
fhold(fp_fromcap);
fdrop(fp, curthread);
fp = fp_fromcap;
}
#endif /* CAPABILITIES */
if (fp->f_type != DTYPE_SOCKET) {
fdrop(fp, curthread);
return (ENOTSOCK);
}
if (fflagp != NULL)
*fflagp = fp->f_flag;
*fpp = fp;
return (0);
}
/*
* System call interface to the socket abstraction.
*/
#if defined(COMPAT_43)
#define COMPAT_OLDSOCK
#endif
int
-socket(td, uap)
+sys_socket(td, uap)
struct thread *td;
struct socket_args /* {
int domain;
int type;
int protocol;
} */ *uap;
{
struct filedesc *fdp;
struct socket *so;
struct file *fp;
int fd, error;
AUDIT_ARG_SOCKET(uap->domain, uap->type, uap->protocol);
#ifdef MAC
error = mac_socket_check_create(td->td_ucred, uap->domain, uap->type,
uap->protocol);
if (error)
return (error);
#endif
fdp = td->td_proc->p_fd;
error = falloc(td, &fp, &fd, 0);
if (error)
return (error);
/* An extra reference on `fp' has been held for us by falloc(). */
error = socreate(uap->domain, &so, uap->type, uap->protocol,
td->td_ucred, td);
if (error) {
fdclose(fdp, fp, fd, td);
} else {
finit(fp, FREAD | FWRITE, DTYPE_SOCKET, so, &socketops);
td->td_retval[0] = fd;
}
fdrop(fp, td);
return (error);
}
/* ARGSUSED */
int
-bind(td, uap)
+sys_bind(td, uap)
struct thread *td;
struct bind_args /* {
int s;
caddr_t name;
int namelen;
} */ *uap;
{
struct sockaddr *sa;
int error;
if ((error = getsockaddr(&sa, uap->name, uap->namelen)) != 0)
return (error);
error = kern_bind(td, uap->s, sa);
free(sa, M_SONAME);
return (error);
}
int
kern_bind(td, fd, sa)
struct thread *td;
int fd;
struct sockaddr *sa;
{
struct socket *so;
struct file *fp;
int error;
AUDIT_ARG_FD(fd);
error = getsock_cap(td->td_proc->p_fd, fd, CAP_BIND, &fp, NULL);
if (error)
return (error);
so = fp->f_data;
#ifdef KTRACE
if (KTRPOINT(td, KTR_STRUCT))
ktrsockaddr(sa);
#endif
#ifdef MAC
error = mac_socket_check_bind(td->td_ucred, so, sa);
if (error == 0)
#endif
error = sobind(so, sa, td);
fdrop(fp, td);
return (error);
}
/* ARGSUSED */
int
-listen(td, uap)
+sys_listen(td, uap)
struct thread *td;
struct listen_args /* {
int s;
int backlog;
} */ *uap;
{
struct socket *so;
struct file *fp;
int error;
AUDIT_ARG_FD(uap->s);
error = getsock_cap(td->td_proc->p_fd, uap->s, CAP_LISTEN, &fp, NULL);
if (error == 0) {
so = fp->f_data;
#ifdef MAC
error = mac_socket_check_listen(td->td_ucred, so);
if (error == 0)
#endif
error = solisten(so, uap->backlog, td);
fdrop(fp, td);
}
return (error);
}
/*
* accept1()
*/
static int
accept1(td, uap, compat)
struct thread *td;
struct accept_args /* {
int s;
struct sockaddr * __restrict name;
socklen_t * __restrict anamelen;
} */ *uap;
int compat;
{
struct sockaddr *name;
socklen_t namelen;
struct file *fp;
int error;
if (uap->name == NULL)
return (kern_accept(td, uap->s, NULL, NULL, NULL));
error = copyin(uap->anamelen, &namelen, sizeof (namelen));
if (error)
return (error);
error = kern_accept(td, uap->s, &name, &namelen, &fp);
/*
* return a namelen of zero for older code which might
* ignore the return value from accept.
*/
if (error) {
(void) copyout(&namelen,
uap->anamelen, sizeof(*uap->anamelen));
return (error);
}
if (error == 0 && name != NULL) {
#ifdef COMPAT_OLDSOCK
if (compat)
((struct osockaddr *)name)->sa_family =
name->sa_family;
#endif
error = copyout(name, uap->name, namelen);
}
if (error == 0)
error = copyout(&namelen, uap->anamelen,
sizeof(namelen));
if (error)
fdclose(td->td_proc->p_fd, fp, td->td_retval[0], td);
fdrop(fp, td);
free(name, M_SONAME);
return (error);
}
int
kern_accept(struct thread *td, int s, struct sockaddr **name,
socklen_t *namelen, struct file **fp)
{
struct filedesc *fdp;
struct file *headfp, *nfp = NULL;
struct sockaddr *sa = NULL;
int error;
struct socket *head, *so;
int fd;
u_int fflag;
pid_t pgid;
int tmp;
if (name) {
*name = NULL;
if (*namelen < 0)
return (EINVAL);
}
AUDIT_ARG_FD(s);
fdp = td->td_proc->p_fd;
error = getsock_cap(fdp, s, CAP_ACCEPT, &headfp, &fflag);
if (error)
return (error);
head = headfp->f_data;
if ((head->so_options & SO_ACCEPTCONN) == 0) {
error = EINVAL;
goto done;
}
#ifdef MAC
error = mac_socket_check_accept(td->td_ucred, head);
if (error != 0)
goto done;
#endif
error = falloc(td, &nfp, &fd, 0);
if (error)
goto done;
ACCEPT_LOCK();
if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
ACCEPT_UNLOCK();
error = EWOULDBLOCK;
goto noconnection;
}
while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
if (head->so_rcv.sb_state & SBS_CANTRCVMORE) {
head->so_error = ECONNABORTED;
break;
}
error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH,
"accept", 0);
if (error) {
ACCEPT_UNLOCK();
goto noconnection;
}
}
if (head->so_error) {
error = head->so_error;
head->so_error = 0;
ACCEPT_UNLOCK();
goto noconnection;
}
so = TAILQ_FIRST(&head->so_comp);
KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP"));
KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP"));
/*
* Before changing the flags on the socket, we have to bump the
* reference count. Otherwise, if the protocol calls sofree(),
* the socket will be released due to a zero refcount.
*/
SOCK_LOCK(so); /* soref() and so_state update */
soref(so); /* file descriptor reference */
TAILQ_REMOVE(&head->so_comp, so, so_list);
head->so_qlen--;
so->so_state |= (head->so_state & SS_NBIO);
so->so_qstate &= ~SQ_COMP;
so->so_head = NULL;
SOCK_UNLOCK(so);
ACCEPT_UNLOCK();
/* An extra reference on `nfp' has been held for us by falloc(). */
td->td_retval[0] = fd;
/* connection has been removed from the listen queue */
KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0);
pgid = fgetown(&head->so_sigio);
if (pgid != 0)
fsetown(pgid, &so->so_sigio);
finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
/* Sync socket nonblocking/async state with file flags */
tmp = fflag & FNONBLOCK;
(void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);
tmp = fflag & FASYNC;
(void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td);
sa = 0;
error = soaccept(so, &sa);
if (error) {
/*
* return a namelen of zero for older code which might
* ignore the return value from accept.
*/
if (name)
*namelen = 0;
goto noconnection;
}
if (sa == NULL) {
if (name)
*namelen = 0;
goto done;
}
if (name) {
/* check sa_len before it is destroyed */
if (*namelen > sa->sa_len)
*namelen = sa->sa_len;
#ifdef KTRACE
if (KTRPOINT(td, KTR_STRUCT))
ktrsockaddr(sa);
#endif
*name = sa;
sa = NULL;
}
noconnection:
if (sa)
free(sa, M_SONAME);
/*
* close the new descriptor, assuming someone hasn't ripped it
* out from under us.
*/
if (error)
fdclose(fdp, nfp, fd, td);
/*
* Release explicitly held references before returning. We return
* a reference on nfp to the caller on success if they request it.
*/
done:
if (fp != NULL) {
if (error == 0) {
*fp = nfp;
nfp = NULL;
} else
*fp = NULL;
}
if (nfp != NULL)
fdrop(nfp, td);
fdrop(headfp, td);
return (error);
}
int
-accept(td, uap)
+sys_accept(td, uap)
struct thread *td;
struct accept_args *uap;
{
return (accept1(td, uap, 0));
}
#ifdef COMPAT_OLDSOCK
int
oaccept(td, uap)
struct thread *td;
struct accept_args *uap;
{
return (accept1(td, uap, 1));
}
#endif /* COMPAT_OLDSOCK */
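/*
 * Userland sketch (assumed, not from this file) of the server-side
 * sequence handled by sys_socket(), sys_bind(), sys_listen() and
 * sys_accept()/kern_accept() above.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <string.h>
#include <unistd.h>

int
example_listener(unsigned short port)
{
	struct sockaddr_in sin;
	int s, c;

	s = socket(AF_INET, SOCK_STREAM, 0);
	if (s == -1)
		return (-1);
	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_len = sizeof(sin);		/* BSD sockaddr length field */
	sin.sin_port = htons(port);		/* INADDR_ANY is already zero */
	if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) == -1 ||
	    listen(s, 5) == -1) {
		close(s);
		return (-1);
	}
	c = accept(s, NULL, NULL);		/* blocks until a connection arrives */
	close(s);
	return (c);
}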
/* ARGSUSED */
int
-connect(td, uap)
+sys_connect(td, uap)
struct thread *td;
struct connect_args /* {
int s;
caddr_t name;
int namelen;
} */ *uap;
{
struct sockaddr *sa;
int error;
error = getsockaddr(&sa, uap->name, uap->namelen);
if (error)
return (error);
error = kern_connect(td, uap->s, sa);
free(sa, M_SONAME);
return (error);
}
int
kern_connect(td, fd, sa)
struct thread *td;
int fd;
struct sockaddr *sa;
{
struct socket *so;
struct file *fp;
int error;
int interrupted = 0;
AUDIT_ARG_FD(fd);
error = getsock_cap(td->td_proc->p_fd, fd, CAP_CONNECT, &fp, NULL);
if (error)
return (error);
so = fp->f_data;
if (so->so_state & SS_ISCONNECTING) {
error = EALREADY;
goto done1;
}
#ifdef KTRACE
if (KTRPOINT(td, KTR_STRUCT))
ktrsockaddr(sa);
#endif
#ifdef MAC
error = mac_socket_check_connect(td->td_ucred, so, sa);
if (error)
goto bad;
#endif
error = soconnect(so, sa, td);
if (error)
goto bad;
if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
error = EINPROGRESS;
goto done1;
}
SOCK_LOCK(so);
while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH,
"connec", 0);
if (error) {
if (error == EINTR || error == ERESTART)
interrupted = 1;
break;
}
}
if (error == 0) {
error = so->so_error;
so->so_error = 0;
}
SOCK_UNLOCK(so);
bad:
if (!interrupted)
so->so_state &= ~SS_ISCONNECTING;
if (error == ERESTART)
error = EINTR;
done1:
fdrop(fp, td);
return (error);
}
int
kern_socketpair(struct thread *td, int domain, int type, int protocol,
int *rsv)
{
struct filedesc *fdp = td->td_proc->p_fd;
struct file *fp1, *fp2;
struct socket *so1, *so2;
int fd, error;
AUDIT_ARG_SOCKET(domain, type, protocol);
#ifdef MAC
/* We might want to have a separate check for socket pairs. */
error = mac_socket_check_create(td->td_ucred, domain, type,
protocol);
if (error)
return (error);
#endif
error = socreate(domain, &so1, type, protocol, td->td_ucred, td);
if (error)
return (error);
error = socreate(domain, &so2, type, protocol, td->td_ucred, td);
if (error)
goto free1;
/* On success, extra references to 'fp1' and 'fp2' are held by falloc(). */
error = falloc(td, &fp1, &fd, 0);
if (error)
goto free2;
rsv[0] = fd;
fp1->f_data = so1; /* so1 already has ref count */
error = falloc(td, &fp2, &fd, 0);
if (error)
goto free3;
fp2->f_data = so2; /* so2 already has ref count */
rsv[1] = fd;
error = soconnect2(so1, so2);
if (error)
goto free4;
if (type == SOCK_DGRAM) {
/*
* Datagram socket connection is asymmetric.
*/
error = soconnect2(so2, so1);
if (error)
goto free4;
}
finit(fp1, FREAD | FWRITE, DTYPE_SOCKET, fp1->f_data, &socketops);
finit(fp2, FREAD | FWRITE, DTYPE_SOCKET, fp2->f_data, &socketops);
fdrop(fp1, td);
fdrop(fp2, td);
return (0);
free4:
fdclose(fdp, fp2, rsv[1], td);
fdrop(fp2, td);
free3:
fdclose(fdp, fp1, rsv[0], td);
fdrop(fp1, td);
free2:
if (so2 != NULL)
(void)soclose(so2);
free1:
if (so1 != NULL)
(void)soclose(so1);
return (error);
}
int
-socketpair(struct thread *td, struct socketpair_args *uap)
+sys_socketpair(struct thread *td, struct socketpair_args *uap)
{
int error, sv[2];
error = kern_socketpair(td, uap->domain, uap->type,
uap->protocol, sv);
if (error)
return (error);
error = copyout(sv, uap->rsv, 2 * sizeof(int));
if (error) {
(void)kern_close(td, sv[0]);
(void)kern_close(td, sv[1]);
}
return (error);
}
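/*
 * Userland sketch (assumed): socketpair(2) as served by
 * kern_socketpair() above; the two descriptors are connected to each
 * other, so a byte written on one end is read back from the other.
 */
#include <sys/socket.h>
#include <unistd.h>

int
example_pair(void)
{
	int sv[2];
	char b;

	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1)
		return (-1);
	(void)write(sv[0], "x", 1);
	(void)read(sv[1], &b, 1);		/* receives the byte written above */
	close(sv[0]);
	close(sv[1]);
	return (0);
}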
static int
sendit(td, s, mp, flags)
struct thread *td;
int s;
struct msghdr *mp;
int flags;
{
struct mbuf *control;
struct sockaddr *to;
int error;
#ifdef CAPABILITY_MODE
if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL))
return (ECAPMODE);
#endif
if (mp->msg_name != NULL) {
error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
if (error) {
to = NULL;
goto bad;
}
mp->msg_name = to;
} else {
to = NULL;
}
if (mp->msg_control) {
if (mp->msg_controllen < sizeof(struct cmsghdr)
#ifdef COMPAT_OLDSOCK
&& mp->msg_flags != MSG_COMPAT
#endif
) {
error = EINVAL;
goto bad;
}
error = sockargs(&control, mp->msg_control,
mp->msg_controllen, MT_CONTROL);
if (error)
goto bad;
#ifdef COMPAT_OLDSOCK
if (mp->msg_flags == MSG_COMPAT) {
struct cmsghdr *cm;
M_PREPEND(control, sizeof(*cm), M_WAIT);
cm = mtod(control, struct cmsghdr *);
cm->cmsg_len = control->m_len;
cm->cmsg_level = SOL_SOCKET;
cm->cmsg_type = SCM_RIGHTS;
}
#endif
} else {
control = NULL;
}
error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE);
bad:
if (to)
free(to, M_SONAME);
return (error);
}
int
kern_sendit(td, s, mp, flags, control, segflg)
struct thread *td;
int s;
struct msghdr *mp;
int flags;
struct mbuf *control;
enum uio_seg segflg;
{
struct file *fp;
struct uio auio;
struct iovec *iov;
struct socket *so;
int i;
int len, error;
cap_rights_t rights;
#ifdef KTRACE
struct uio *ktruio = NULL;
#endif
AUDIT_ARG_FD(s);
rights = CAP_WRITE;
if (mp->msg_name != NULL)
rights |= CAP_CONNECT;
error = getsock_cap(td->td_proc->p_fd, s, rights, &fp, NULL);
if (error)
return (error);
so = (struct socket *)fp->f_data;
#ifdef KTRACE
if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT))
ktrsockaddr(mp->msg_name);
#endif
#ifdef MAC
if (mp->msg_name != NULL) {
error = mac_socket_check_connect(td->td_ucred, so,
mp->msg_name);
if (error)
goto bad;
}
error = mac_socket_check_send(td->td_ucred, so);
if (error)
goto bad;
#endif
auio.uio_iov = mp->msg_iov;
auio.uio_iovcnt = mp->msg_iovlen;
auio.uio_segflg = segflg;
auio.uio_rw = UIO_WRITE;
auio.uio_td = td;
auio.uio_offset = 0; /* XXX */
auio.uio_resid = 0;
iov = mp->msg_iov;
for (i = 0; i < mp->msg_iovlen; i++, iov++) {
if ((auio.uio_resid += iov->iov_len) < 0) {
error = EINVAL;
goto bad;
}
}
#ifdef KTRACE
if (KTRPOINT(td, KTR_GENIO))
ktruio = cloneuio(&auio);
#endif
len = auio.uio_resid;
error = sosend(so, mp->msg_name, &auio, 0, control, flags, td);
if (error) {
if (auio.uio_resid != len && (error == ERESTART ||
error == EINTR || error == EWOULDBLOCK))
error = 0;
/* Generation of SIGPIPE can be controlled per socket */
if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
!(flags & MSG_NOSIGNAL)) {
PROC_LOCK(td->td_proc);
tdsignal(td, SIGPIPE);
PROC_UNLOCK(td->td_proc);
}
}
if (error == 0)
td->td_retval[0] = len - auio.uio_resid;
#ifdef KTRACE
if (ktruio != NULL) {
ktruio->uio_resid = td->td_retval[0];
ktrgenio(s, UIO_WRITE, ktruio, error);
}
#endif
bad:
fdrop(fp, td);
return (error);
}
int
-sendto(td, uap)
+sys_sendto(td, uap)
struct thread *td;
struct sendto_args /* {
int s;
caddr_t buf;
size_t len;
int flags;
caddr_t to;
int tolen;
} */ *uap;
{
struct msghdr msg;
struct iovec aiov;
int error;
msg.msg_name = uap->to;
msg.msg_namelen = uap->tolen;
msg.msg_iov = &aiov;
msg.msg_iovlen = 1;
msg.msg_control = 0;
#ifdef COMPAT_OLDSOCK
msg.msg_flags = 0;
#endif
aiov.iov_base = uap->buf;
aiov.iov_len = uap->len;
error = sendit(td, uap->s, &msg, uap->flags);
return (error);
}
#ifdef COMPAT_OLDSOCK
int
osend(td, uap)
struct thread *td;
struct osend_args /* {
int s;
caddr_t buf;
int len;
int flags;
} */ *uap;
{
struct msghdr msg;
struct iovec aiov;
int error;
msg.msg_name = 0;
msg.msg_namelen = 0;
msg.msg_iov = &aiov;
msg.msg_iovlen = 1;
aiov.iov_base = uap->buf;
aiov.iov_len = uap->len;
msg.msg_control = 0;
msg.msg_flags = 0;
error = sendit(td, uap->s, &msg, uap->flags);
return (error);
}
int
osendmsg(td, uap)
struct thread *td;
struct osendmsg_args /* {
int s;
caddr_t msg;
int flags;
} */ *uap;
{
struct msghdr msg;
struct iovec *iov;
int error;
error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
if (error)
return (error);
error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
if (error)
return (error);
msg.msg_iov = iov;
msg.msg_flags = MSG_COMPAT;
error = sendit(td, uap->s, &msg, uap->flags);
free(iov, M_IOV);
return (error);
}
#endif
int
-sendmsg(td, uap)
+sys_sendmsg(td, uap)
struct thread *td;
struct sendmsg_args /* {
int s;
caddr_t msg;
int flags;
} */ *uap;
{
struct msghdr msg;
struct iovec *iov;
int error;
error = copyin(uap->msg, &msg, sizeof (msg));
if (error)
return (error);
error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
if (error)
return (error);
msg.msg_iov = iov;
#ifdef COMPAT_OLDSOCK
msg.msg_flags = 0;
#endif
error = sendit(td, uap->s, &msg, uap->flags);
free(iov, M_IOV);
return (error);
}
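/*
 * Common receive path: fill in the caller-supplied msghdr from the socket.
 * The source address is returned via mp->msg_name ('fromseg' selects a
 * userland or kernel copy); received control data is copied out to
 * mp->msg_control, or handed back as an mbuf chain when 'controlp' is
 * non-NULL.
 */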
int
kern_recvit(td, s, mp, fromseg, controlp)
struct thread *td;
int s;
struct msghdr *mp;
enum uio_seg fromseg;
struct mbuf **controlp;
{
struct uio auio;
struct iovec *iov;
int i;
socklen_t len;
int error;
struct mbuf *m, *control = 0;
caddr_t ctlbuf;
struct file *fp;
struct socket *so;
struct sockaddr *fromsa = 0;
#ifdef KTRACE
struct uio *ktruio = NULL;
#endif
if (controlp != NULL)
*controlp = NULL;
AUDIT_ARG_FD(s);
error = getsock_cap(td->td_proc->p_fd, s, CAP_READ, &fp, NULL);
if (error)
return (error);
so = fp->f_data;
#ifdef MAC
error = mac_socket_check_receive(td->td_ucred, so);
if (error) {
fdrop(fp, td);
return (error);
}
#endif
auio.uio_iov = mp->msg_iov;
auio.uio_iovcnt = mp->msg_iovlen;
auio.uio_segflg = UIO_USERSPACE;
auio.uio_rw = UIO_READ;
auio.uio_td = td;
auio.uio_offset = 0; /* XXX */
auio.uio_resid = 0;
iov = mp->msg_iov;
for (i = 0; i < mp->msg_iovlen; i++, iov++) {
if ((auio.uio_resid += iov->iov_len) < 0) {
fdrop(fp, td);
return (EINVAL);
}
}
#ifdef KTRACE
if (KTRPOINT(td, KTR_GENIO))
ktruio = cloneuio(&auio);
#endif
len = auio.uio_resid;
error = soreceive(so, &fromsa, &auio, (struct mbuf **)0,
(mp->msg_control || controlp) ? &control : (struct mbuf **)0,
&mp->msg_flags);
if (error) {
if (auio.uio_resid != (int)len && (error == ERESTART ||
error == EINTR || error == EWOULDBLOCK))
error = 0;
}
#ifdef KTRACE
if (ktruio != NULL) {
ktruio->uio_resid = (int)len - auio.uio_resid;
ktrgenio(s, UIO_READ, ktruio, error);
}
#endif
if (error)
goto out;
td->td_retval[0] = (int)len - auio.uio_resid;
if (mp->msg_name) {
len = mp->msg_namelen;
if (len <= 0 || fromsa == 0)
len = 0;
else {
/* save sa_len before it is destroyed by MSG_COMPAT */
len = MIN(len, fromsa->sa_len);
#ifdef COMPAT_OLDSOCK
if (mp->msg_flags & MSG_COMPAT)
((struct osockaddr *)fromsa)->sa_family =
fromsa->sa_family;
#endif
if (fromseg == UIO_USERSPACE) {
error = copyout(fromsa, mp->msg_name,
(unsigned)len);
if (error)
goto out;
} else
bcopy(fromsa, mp->msg_name, len);
}
mp->msg_namelen = len;
}
if (mp->msg_control && controlp == NULL) {
#ifdef COMPAT_OLDSOCK
/*
* We assume that old recvmsg calls won't receive access
* rights and other control info, esp. as control info
* is always optional and those options didn't exist in 4.3.
* If we receive rights, trim the cmsghdr; anything else
* is tossed.
*/
if (control && mp->msg_flags & MSG_COMPAT) {
if (mtod(control, struct cmsghdr *)->cmsg_level !=
SOL_SOCKET ||
mtod(control, struct cmsghdr *)->cmsg_type !=
SCM_RIGHTS) {
mp->msg_controllen = 0;
goto out;
}
control->m_len -= sizeof (struct cmsghdr);
control->m_data += sizeof (struct cmsghdr);
}
#endif
len = mp->msg_controllen;
m = control;
mp->msg_controllen = 0;
ctlbuf = mp->msg_control;
while (m && len > 0) {
unsigned int tocopy;
if (len >= m->m_len)
tocopy = m->m_len;
else {
mp->msg_flags |= MSG_CTRUNC;
tocopy = len;
}
if ((error = copyout(mtod(m, caddr_t),
ctlbuf, tocopy)) != 0)
goto out;
ctlbuf += tocopy;
len -= tocopy;
m = m->m_next;
}
mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
}
out:
fdrop(fp, td);
#ifdef KTRACE
if (fromsa && KTRPOINT(td, KTR_STRUCT))
ktrsockaddr(fromsa);
#endif
if (fromsa)
free(fromsa, M_SONAME);
if (error == 0 && controlp != NULL)
*controlp = control;
else if (control)
m_freem(control);
return (error);
}
static int
recvit(td, s, mp, namelenp)
struct thread *td;
int s;
struct msghdr *mp;
void *namelenp;
{
int error;
error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL);
if (error)
return (error);
if (namelenp) {
error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t));
#ifdef COMPAT_OLDSOCK
if (mp->msg_flags & MSG_COMPAT)
error = 0; /* old recvfrom didn't check */
#endif
}
return (error);
}
int
-recvfrom(td, uap)
+sys_recvfrom(td, uap)
struct thread *td;
struct recvfrom_args /* {
int s;
caddr_t buf;
size_t len;
int flags;
struct sockaddr * __restrict from;
socklen_t * __restrict fromlenaddr;
} */ *uap;
{
struct msghdr msg;
struct iovec aiov;
int error;
if (uap->fromlenaddr) {
error = copyin(uap->fromlenaddr,
&msg.msg_namelen, sizeof (msg.msg_namelen));
if (error)
goto done2;
} else {
msg.msg_namelen = 0;
}
msg.msg_name = uap->from;
msg.msg_iov = &aiov;
msg.msg_iovlen = 1;
aiov.iov_base = uap->buf;
aiov.iov_len = uap->len;
msg.msg_control = 0;
msg.msg_flags = uap->flags;
error = recvit(td, uap->s, &msg, uap->fromlenaddr);
done2:
return (error);
}
#ifdef COMPAT_OLDSOCK
int
orecvfrom(td, uap)
struct thread *td;
struct recvfrom_args *uap;
{
uap->flags |= MSG_COMPAT;
- return (recvfrom(td, uap));
+ return (sys_recvfrom(td, uap));
}
#endif
#ifdef COMPAT_OLDSOCK
int
orecv(td, uap)
struct thread *td;
struct orecv_args /* {
int s;
caddr_t buf;
int len;
int flags;
} */ *uap;
{
struct msghdr msg;
struct iovec aiov;
int error;
msg.msg_name = 0;
msg.msg_namelen = 0;
msg.msg_iov = &aiov;
msg.msg_iovlen = 1;
aiov.iov_base = uap->buf;
aiov.iov_len = uap->len;
msg.msg_control = 0;
msg.msg_flags = uap->flags;
error = recvit(td, uap->s, &msg, NULL);
return (error);
}
/*
* Old recvmsg. This code takes advantage of the fact that the old msghdr
* overlays the new one, missing only the flags, and with the (old) access
* rights where the control fields are now.
*/
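/*
 * In other words (sketch): struct omsghdr has msg_accrights and
 * msg_accrightslen where struct msghdr now has msg_control and
 * msg_controllen, and it lacks msg_flags, so the copyin() of
 * sizeof(struct omsghdr) below yields a usable prefix of the new layout.
 */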
int
orecvmsg(td, uap)
struct thread *td;
struct orecvmsg_args /* {
int s;
struct omsghdr *msg;
int flags;
} */ *uap;
{
struct msghdr msg;
struct iovec *iov;
int error;
error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
if (error)
return (error);
error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
if (error)
return (error);
msg.msg_flags = uap->flags | MSG_COMPAT;
msg.msg_iov = iov;
error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen);
if (msg.msg_controllen && error == 0)
error = copyout(&msg.msg_controllen,
&uap->msg->msg_accrightslen, sizeof (int));
free(iov, M_IOV);
return (error);
}
#endif
int
-recvmsg(td, uap)
+sys_recvmsg(td, uap)
struct thread *td;
struct recvmsg_args /* {
int s;
struct msghdr *msg;
int flags;
} */ *uap;
{
struct msghdr msg;
struct iovec *uiov, *iov;
int error;
error = copyin(uap->msg, &msg, sizeof (msg));
if (error)
return (error);
error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
if (error)
return (error);
msg.msg_flags = uap->flags;
#ifdef COMPAT_OLDSOCK
msg.msg_flags &= ~MSG_COMPAT;
#endif
uiov = msg.msg_iov;
msg.msg_iov = iov;
error = recvit(td, uap->s, &msg, NULL);
if (error == 0) {
msg.msg_iov = uiov;
error = copyout(&msg, uap->msg, sizeof(msg));
}
free(iov, M_IOV);
return (error);
}
/* ARGSUSED */
int
-shutdown(td, uap)
+sys_shutdown(td, uap)
struct thread *td;
struct shutdown_args /* {
int s;
int how;
} */ *uap;
{
struct socket *so;
struct file *fp;
int error;
AUDIT_ARG_FD(uap->s);
error = getsock_cap(td->td_proc->p_fd, uap->s, CAP_SHUTDOWN, &fp,
NULL);
if (error == 0) {
so = fp->f_data;
error = soshutdown(so, uap->how);
fdrop(fp, td);
}
return (error);
}
/* ARGSUSED */
int
-setsockopt(td, uap)
+sys_setsockopt(td, uap)
struct thread *td;
struct setsockopt_args /* {
int s;
int level;
int name;
caddr_t val;
int valsize;
} */ *uap;
{
return (kern_setsockopt(td, uap->s, uap->level, uap->name,
uap->val, UIO_USERSPACE, uap->valsize));
}
int
kern_setsockopt(td, s, level, name, val, valseg, valsize)
struct thread *td;
int s;
int level;
int name;
void *val;
enum uio_seg valseg;
socklen_t valsize;
{
int error;
struct socket *so;
struct file *fp;
struct sockopt sopt;
if (val == NULL && valsize != 0)
return (EFAULT);
if ((int)valsize < 0)
return (EINVAL);
sopt.sopt_dir = SOPT_SET;
sopt.sopt_level = level;
sopt.sopt_name = name;
sopt.sopt_val = val;
sopt.sopt_valsize = valsize;
switch (valseg) {
case UIO_USERSPACE:
sopt.sopt_td = td;
break;
case UIO_SYSSPACE:
sopt.sopt_td = NULL;
break;
default:
panic("kern_setsockopt called with bad valseg");
}
AUDIT_ARG_FD(s);
error = getsock_cap(td->td_proc->p_fd, s, CAP_SETSOCKOPT, &fp, NULL);
if (error == 0) {
so = fp->f_data;
error = sosetopt(so, &sopt);
fdrop(fp, td);
}
return (error);
}
/* ARGSUSED */
int
-getsockopt(td, uap)
+sys_getsockopt(td, uap)
struct thread *td;
struct getsockopt_args /* {
int s;
int level;
int name;
void * __restrict val;
socklen_t * __restrict avalsize;
} */ *uap;
{
socklen_t valsize;
int error;
if (uap->val) {
error = copyin(uap->avalsize, &valsize, sizeof (valsize));
if (error)
return (error);
}
error = kern_getsockopt(td, uap->s, uap->level, uap->name,
uap->val, UIO_USERSPACE, &valsize);
if (error == 0)
error = copyout(&valsize, uap->avalsize, sizeof (valsize));
return (error);
}
/*
* Kernel version of getsockopt.
* optval can be a userland or kernel pointer. optlen is always a kernel pointer.
*/
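/*
 * Usage sketch: an in-kernel caller reading an option into a kernel
 * buffer passes UIO_SYSSPACE, e.g.:
 *
 *	int val;
 *	socklen_t len = sizeof(val);
 *	error = kern_getsockopt(td, s, SOL_SOCKET, SO_SNDBUF,
 *	    &val, UIO_SYSSPACE, &len);
 */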
int
kern_getsockopt(td, s, level, name, val, valseg, valsize)
struct thread *td;
int s;
int level;
int name;
void *val;
enum uio_seg valseg;
socklen_t *valsize;
{
int error;
struct socket *so;
struct file *fp;
struct sockopt sopt;
if (val == NULL)
*valsize = 0;
if ((int)*valsize < 0)
return (EINVAL);
sopt.sopt_dir = SOPT_GET;
sopt.sopt_level = level;
sopt.sopt_name = name;
sopt.sopt_val = val;
sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */
switch (valseg) {
case UIO_USERSPACE:
sopt.sopt_td = td;
break;
case UIO_SYSSPACE:
sopt.sopt_td = NULL;
break;
default:
panic("kern_getsockopt called with bad valseg");
}
AUDIT_ARG_FD(s);
error = getsock_cap(td->td_proc->p_fd, s, CAP_GETSOCKOPT, &fp, NULL);
if (error == 0) {
so = fp->f_data;
error = sogetopt(so, &sopt);
*valsize = sopt.sopt_valsize;
fdrop(fp, td);
}
return (error);
}
/*
* getsockname1() - Get socket name.
*/
/* ARGSUSED */
static int
getsockname1(td, uap, compat)
struct thread *td;
struct getsockname_args /* {
int fdes;
struct sockaddr * __restrict asa;
socklen_t * __restrict alen;
} */ *uap;
int compat;
{
struct sockaddr *sa;
socklen_t len;
int error;
error = copyin(uap->alen, &len, sizeof(len));
if (error)
return (error);
error = kern_getsockname(td, uap->fdes, &sa, &len);
if (error)
return (error);
if (len != 0) {
#ifdef COMPAT_OLDSOCK
if (compat)
((struct osockaddr *)sa)->sa_family = sa->sa_family;
#endif
error = copyout(sa, uap->asa, (u_int)len);
}
free(sa, M_SONAME);
if (error == 0)
error = copyout(&len, uap->alen, sizeof(len));
return (error);
}
int
kern_getsockname(struct thread *td, int fd, struct sockaddr **sa,
socklen_t *alen)
{
struct socket *so;
struct file *fp;
socklen_t len;
int error;
if (*alen < 0)
return (EINVAL);
AUDIT_ARG_FD(fd);
error = getsock_cap(td->td_proc->p_fd, fd, CAP_GETSOCKNAME, &fp, NULL);
if (error)
return (error);
so = fp->f_data;
*sa = NULL;
CURVNET_SET(so->so_vnet);
error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa);
CURVNET_RESTORE();
if (error)
goto bad;
if (*sa == NULL)
len = 0;
else
len = MIN(*alen, (*sa)->sa_len);
*alen = len;
#ifdef KTRACE
if (KTRPOINT(td, KTR_STRUCT))
ktrsockaddr(*sa);
#endif
bad:
fdrop(fp, td);
if (error && *sa) {
free(*sa, M_SONAME);
*sa = NULL;
}
return (error);
}
int
-getsockname(td, uap)
+sys_getsockname(td, uap)
struct thread *td;
struct getsockname_args *uap;
{
return (getsockname1(td, uap, 0));
}
#ifdef COMPAT_OLDSOCK
int
ogetsockname(td, uap)
struct thread *td;
struct getsockname_args *uap;
{
return (getsockname1(td, uap, 1));
}
#endif /* COMPAT_OLDSOCK */
/*
* getpeername1() - Get name of peer for connected socket.
*/
/* ARGSUSED */
static int
getpeername1(td, uap, compat)
struct thread *td;
struct getpeername_args /* {
int fdes;
struct sockaddr * __restrict asa;
socklen_t * __restrict alen;
} */ *uap;
int compat;
{
struct sockaddr *sa;
socklen_t len;
int error;
error = copyin(uap->alen, &len, sizeof (len));
if (error)
return (error);
error = kern_getpeername(td, uap->fdes, &sa, &len);
if (error)
return (error);
if (len != 0) {
#ifdef COMPAT_OLDSOCK
if (compat)
((struct osockaddr *)sa)->sa_family = sa->sa_family;
#endif
error = copyout(sa, uap->asa, (u_int)len);
}
free(sa, M_SONAME);
if (error == 0)
error = copyout(&len, uap->alen, sizeof(len));
return (error);
}
int
kern_getpeername(struct thread *td, int fd, struct sockaddr **sa,
socklen_t *alen)
{
struct socket *so;
struct file *fp;
socklen_t len;
int error;
if (*alen < 0)
return (EINVAL);
AUDIT_ARG_FD(fd);
error = getsock_cap(td->td_proc->p_fd, fd, CAP_GETPEERNAME, &fp, NULL);
if (error)
return (error);
so = fp->f_data;
if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
error = ENOTCONN;
goto done;
}
*sa = NULL;
CURVNET_SET(so->so_vnet);
error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa);
CURVNET_RESTORE();
if (error)
goto bad;
if (*sa == NULL)
len = 0;
else
len = MIN(*alen, (*sa)->sa_len);
*alen = len;
#ifdef KTRACE
if (KTRPOINT(td, KTR_STRUCT))
ktrsockaddr(*sa);
#endif
bad:
if (error && *sa) {
free(*sa, M_SONAME);
*sa = NULL;
}
done:
fdrop(fp, td);
return (error);
}
int
-getpeername(td, uap)
+sys_getpeername(td, uap)
struct thread *td;
struct getpeername_args *uap;
{
return (getpeername1(td, uap, 0));
}
#ifdef COMPAT_OLDSOCK
int
ogetpeername(td, uap)
struct thread *td;
struct ogetpeername_args *uap;
{
/* XXX uap should have type `getpeername_args *' to begin with. */
return (getpeername1(td, (struct getpeername_args *)uap, 1));
}
#endif /* COMPAT_OLDSOCK */
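/*
 * Copy 'buflen' bytes from the user buffer into a newly allocated mbuf of
 * the given type (e.g. MT_SONAME or MT_CONTROL); for MT_SONAME the sa_len
 * field of the copied sockaddr is forced to the buffer length.
 */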
int
sockargs(mp, buf, buflen, type)
struct mbuf **mp;
caddr_t buf;
int buflen, type;
{
struct sockaddr *sa;
struct mbuf *m;
int error;
if ((u_int)buflen > MLEN) {
#ifdef COMPAT_OLDSOCK
if (type == MT_SONAME && (u_int)buflen <= 112)
buflen = MLEN; /* unix domain compat. hack */
else
#endif
if ((u_int)buflen > MCLBYTES)
return (EINVAL);
}
m = m_get(M_WAIT, type);
if ((u_int)buflen > MLEN)
MCLGET(m, M_WAIT);
m->m_len = buflen;
error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
if (error)
(void) m_free(m);
else {
*mp = m;
if (type == MT_SONAME) {
sa = mtod(m, struct sockaddr *);
#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
sa->sa_family = sa->sa_len;
#endif
sa->sa_len = buflen;
}
}
return (error);
}
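/*
 * Copy a sockaddr of 'len' bytes in from userland into a freshly
 * malloc'ed M_SONAME buffer, rejecting lengths too small to hold the
 * header or larger than SOCK_MAXADDRLEN.
 */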
int
getsockaddr(namp, uaddr, len)
struct sockaddr **namp;
caddr_t uaddr;
size_t len;
{
struct sockaddr *sa;
int error;
if (len > SOCK_MAXADDRLEN)
return (ENAMETOOLONG);
if (len < offsetof(struct sockaddr, sa_data[0]))
return (EINVAL);
sa = malloc(len, M_SONAME, M_WAITOK);
error = copyin(uaddr, sa, len);
if (error) {
free(sa, M_SONAME);
} else {
#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
sa->sa_family = sa->sa_len;
#endif
sa->sa_len = len;
*namp = sa;
}
return (error);
}
#include <sys/condvar.h>
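/*
 * State for SF_SYNC sendfile(2) requests: 'count' tracks sf_bufs still
 * referenced by the socket buffer; sf_buf_mext() signals 'cv' when the
 * last one is released, letting kern_sendfile() wait for completion.
 */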
struct sendfile_sync {
struct mtx mtx;
struct cv cv;
unsigned count;
};
/*
* Detach mapped page and release resources back to the system.
*/
void
sf_buf_mext(void *addr, void *args)
{
vm_page_t m;
struct sendfile_sync *sfs;
m = sf_buf_page(args);
sf_buf_free(args);
vm_page_lock(m);
vm_page_unwire(m, 0);
/*
* Check for the object going away on us. This can
* happen since we don't hold a reference to it.
* If so, we're responsible for freeing the page.
*/
if (m->wire_count == 0 && m->object == NULL)
vm_page_free(m);
vm_page_unlock(m);
if (addr == NULL)
return;
sfs = addr;
mtx_lock(&sfs->mtx);
KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0"));
if (--sfs->count == 0)
cv_signal(&sfs->cv);
mtx_unlock(&sfs->mtx);
}
/*
* sendfile(2)
*
* int sendfile(int fd, int s, off_t offset, size_t nbytes,
* struct sf_hdtr *hdtr, off_t *sbytes, int flags)
*
* Send a file specified by 'fd' and starting at 'offset' to a socket
* specified by 's'. Send only 'nbytes' of the file or until EOF if nbytes ==
* 0. Optionally add a header and/or trailer to the socket output. If
* specified, write the total number of bytes sent into *sbytes.
*/
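/*
 * Userland usage sketch (hypothetical descriptors):
 *
 *	off_t sbytes;
 *	if (sendfile(filefd, sockfd, 0, 0, NULL, &sbytes, 0) == -1)
 *		err(1, "sendfile");
 */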
int
-sendfile(struct thread *td, struct sendfile_args *uap)
+sys_sendfile(struct thread *td, struct sendfile_args *uap)
{
return (do_sendfile(td, uap, 0));
}
static int
do_sendfile(struct thread *td, struct sendfile_args *uap, int compat)
{
struct sf_hdtr hdtr;
struct uio *hdr_uio, *trl_uio;
int error;
hdr_uio = trl_uio = NULL;
if (uap->hdtr != NULL) {
error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
if (error)
goto out;
if (hdtr.headers != NULL) {
error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio);
if (error)
goto out;
}
if (hdtr.trailers != NULL) {
error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio);
if (error)
goto out;
}
}
error = kern_sendfile(td, uap, hdr_uio, trl_uio, compat);
out:
if (hdr_uio)
free(hdr_uio, M_IOV);
if (trl_uio)
free(trl_uio, M_IOV);
return (error);
}
#ifdef COMPAT_FREEBSD4
int
freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
{
struct sendfile_args args;
args.fd = uap->fd;
args.s = uap->s;
args.offset = uap->offset;
args.nbytes = uap->nbytes;
args.hdtr = uap->hdtr;
args.sbytes = uap->sbytes;
args.flags = uap->flags;
return (do_sendfile(td, &args, 1));
}
#endif /* COMPAT_FREEBSD4 */
int
kern_sendfile(struct thread *td, struct sendfile_args *uap,
struct uio *hdr_uio, struct uio *trl_uio, int compat)
{
struct file *sock_fp;
struct vnode *vp;
struct vm_object *obj = NULL;
struct socket *so = NULL;
struct mbuf *m = NULL;
struct sf_buf *sf;
struct vm_page *pg;
off_t off, xfsize, fsbytes = 0, sbytes = 0, rem = 0;
int error, hdrlen = 0, mnw = 0;
int vfslocked;
struct sendfile_sync *sfs = NULL;
/*
* The file descriptor must be a regular file and have a
* backing VM object.
* File offset must be positive. If it goes beyond EOF
* we send only the header/trailer and no payload data.
*/
AUDIT_ARG_FD(uap->fd);
if ((error = fgetvp_read(td, uap->fd, CAP_READ, &vp)) != 0)
goto out;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_lock(vp, LK_SHARED | LK_RETRY);
if (vp->v_type == VREG) {
obj = vp->v_object;
if (obj != NULL) {
/*
* Temporarily increase the backing VM
* object's reference count so that a forced
* reclamation of its vnode does not
* immediately destroy it.
*/
VM_OBJECT_LOCK(obj);
if ((obj->flags & OBJ_DEAD) == 0) {
vm_object_reference_locked(obj);
VM_OBJECT_UNLOCK(obj);
} else {
VM_OBJECT_UNLOCK(obj);
obj = NULL;
}
}
}
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
if (obj == NULL) {
error = EINVAL;
goto out;
}
if (uap->offset < 0) {
error = EINVAL;
goto out;
}
/*
* The socket must be a stream socket and connected.
* Remember whether it is a blocking or non-blocking socket.
*/
if ((error = getsock_cap(td->td_proc->p_fd, uap->s, CAP_WRITE,
&sock_fp, NULL)) != 0)
goto out;
so = sock_fp->f_data;
if (so->so_type != SOCK_STREAM) {
error = EINVAL;
goto out;
}
if ((so->so_state & SS_ISCONNECTED) == 0) {
error = ENOTCONN;
goto out;
}
/*
* Do not wait on memory allocations but return ENOMEM for
* caller to retry later.
* XXX: Experimental.
*/
if (uap->flags & SF_MNOWAIT)
mnw = 1;
if (uap->flags & SF_SYNC) {
sfs = malloc(sizeof *sfs, M_TEMP, M_WAITOK | M_ZERO);
mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF);
cv_init(&sfs->cv, "sendfile");
}
#ifdef MAC
error = mac_socket_check_send(td->td_ucred, so);
if (error)
goto out;
#endif
/* If headers are specified copy them into mbufs. */
if (hdr_uio != NULL) {
hdr_uio->uio_td = td;
hdr_uio->uio_rw = UIO_WRITE;
if (hdr_uio->uio_resid > 0) {
/*
* In FBSD < 5.0 the nbytes to send also included
* the header. If compat is specified subtract the
* header size from nbytes.
*/
if (compat) {
if (uap->nbytes > hdr_uio->uio_resid)
uap->nbytes -= hdr_uio->uio_resid;
else
uap->nbytes = 0;
}
m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK),
0, 0, 0);
if (m == NULL) {
error = mnw ? EAGAIN : ENOBUFS;
goto out;
}
hdrlen = m_length(m, NULL);
}
}
/*
* Protect against multiple writers to the socket.
*
* XXXRW: Historically this has assumed non-interruptibility, so now
* we implement that, but possibly shouldn't.
*/
(void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR);
/*
* Loop through the pages of the file, starting with the requested
* offset. Get a file page (do I/O if necessary), map the file page
* into an sf_buf, attach an mbuf header to the sf_buf, and queue
* it on the socket.
* This is done in two loops. The inner loop turns as many pages
* as it can, up to available socket buffer space, without blocking
* into mbufs to have it bulk delivered into the socket send buffer.
* The outer loop checks the state and available space of the socket
* and takes care of the overall progress.
*/
for (off = uap->offset, rem = uap->nbytes; ; ) {
int loopbytes = 0;
int space = 0;
int done = 0;
/*
* Check the socket state for ongoing connection,
* no errors and space in socket buffer.
* If space is low allow for the remainder of the
* file to be processed if it fits the socket buffer.
* Otherwise block in waiting for sufficient space
* to proceed, or if the socket is nonblocking, return
* to userland with EAGAIN while reporting how far
* we've come.
* We wait until the socket buffer has significant free
* space to do bulk sends. This makes good use of file
* system read ahead and allows packet segmentation
* offloading hardware to take over lots of work. If
* we were not careful here we would send off only one
* sfbuf at a time.
*/
SOCKBUF_LOCK(&so->so_snd);
if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
retry_space:
if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
error = EPIPE;
SOCKBUF_UNLOCK(&so->so_snd);
goto done;
} else if (so->so_error) {
error = so->so_error;
so->so_error = 0;
SOCKBUF_UNLOCK(&so->so_snd);
goto done;
}
space = sbspace(&so->so_snd);
if (space < rem &&
(space <= 0 ||
space < so->so_snd.sb_lowat)) {
if (so->so_state & SS_NBIO) {
SOCKBUF_UNLOCK(&so->so_snd);
error = EAGAIN;
goto done;
}
/*
* sbwait drops the lock while sleeping.
* When we loop back to retry_space the
* state may have changed and we retest
* for it.
*/
error = sbwait(&so->so_snd);
/*
* An error from sbwait usually indicates that we've
* been interrupted by a signal. If we've sent anything
* then return bytes sent, otherwise return the error.
*/
if (error) {
SOCKBUF_UNLOCK(&so->so_snd);
goto done;
}
goto retry_space;
}
SOCKBUF_UNLOCK(&so->so_snd);
/*
* Reduce space in the socket buffer by the size of
* the header mbuf chain.
* hdrlen is set to 0 after the first loop.
*/
space -= hdrlen;
/*
* Loop and construct maximum sized mbuf chain to be bulk
* dumped into socket buffer.
*/
while (space > loopbytes) {
vm_pindex_t pindex;
vm_offset_t pgoff;
struct mbuf *m0;
VM_OBJECT_LOCK(obj);
/*
* Calculate the amount to transfer.
* Not to exceed a page, the EOF,
* or the passed in nbytes.
*/
pgoff = (vm_offset_t)(off & PAGE_MASK);
xfsize = omin(PAGE_SIZE - pgoff,
obj->un_pager.vnp.vnp_size - uap->offset -
fsbytes - loopbytes);
if (uap->nbytes)
rem = (uap->nbytes - fsbytes - loopbytes);
else
rem = obj->un_pager.vnp.vnp_size -
uap->offset - fsbytes - loopbytes;
xfsize = omin(rem, xfsize);
xfsize = omin(space - loopbytes, xfsize);
if (xfsize <= 0) {
VM_OBJECT_UNLOCK(obj);
done = 1; /* all data sent */
break;
}
/*
* Attempt to look up the page. Allocate
* if not found or wait and loop if busy.
*/
pindex = OFF_TO_IDX(off);
pg = vm_page_grab(obj, pindex, VM_ALLOC_NOBUSY |
VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_RETRY);
/*
* Check if page is valid for what we need,
* otherwise initiate I/O.
* If we already turned some pages into mbufs,
* send them off before we come here again and
* block.
*/
if (pg->valid && vm_page_is_valid(pg, pgoff, xfsize))
VM_OBJECT_UNLOCK(obj);
else if (m != NULL)
error = EAGAIN; /* send what we already got */
else if (uap->flags & SF_NODISKIO)
error = EBUSY;
else {
int bsize, resid;
/*
* Ensure that our page is still around
* when the I/O completes.
*/
vm_page_io_start(pg);
VM_OBJECT_UNLOCK(obj);
/*
* Get the page from backing store.
*/
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
error = vn_lock(vp, LK_SHARED);
if (error != 0)
goto after_read;
bsize = vp->v_mount->mnt_stat.f_iosize;
/*
* XXXMAC: Because we don't have fp->f_cred
* here, we pass in NOCRED. This is probably
* wrong, but is consistent with our original
* implementation.
*/
error = vn_rdwr(UIO_READ, vp, NULL, MAXBSIZE,
trunc_page(off), UIO_NOCOPY, IO_NODELOCKED |
IO_VMIO | ((MAXBSIZE / bsize) << IO_SEQSHIFT),
td->td_ucred, NOCRED, &resid, td);
VOP_UNLOCK(vp, 0);
after_read:
VFS_UNLOCK_GIANT(vfslocked);
VM_OBJECT_LOCK(obj);
vm_page_io_finish(pg);
if (!error)
VM_OBJECT_UNLOCK(obj);
mbstat.sf_iocnt++;
}
if (error) {
vm_page_lock(pg);
vm_page_unwire(pg, 0);
/*
* See if anyone else might know about
* this page. If not and it is not valid,
* then free it.
*/
if (pg->wire_count == 0 && pg->valid == 0 &&
pg->busy == 0 && !(pg->oflags & VPO_BUSY))
vm_page_free(pg);
vm_page_unlock(pg);
VM_OBJECT_UNLOCK(obj);
if (error == EAGAIN)
error = 0; /* not a real error */
break;
}
/*
* Get a sendfile buf. When allocating the
* first buffer for mbuf chain, we usually
* wait as long as necessary, but this wait
* can be interrupted. For subsequent
* buffers, do not sleep, since several
* threads might exhaust the buffers and then
* deadlock.
*/
sf = sf_buf_alloc(pg, (mnw || m != NULL) ? SFB_NOWAIT :
SFB_CATCH);
if (sf == NULL) {
mbstat.sf_allocfail++;
vm_page_lock(pg);
vm_page_unwire(pg, 0);
KASSERT(pg->object != NULL,
("kern_sendfile: object disappeared"));
vm_page_unlock(pg);
if (m == NULL)
error = (mnw ? EAGAIN : EINTR);
break;
}
/*
* Get an mbuf and set it up as having
* external storage.
*/
m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA);
if (m0 == NULL) {
error = (mnw ? EAGAIN : ENOBUFS);
sf_buf_mext((void *)sf_buf_kva(sf), sf);
break;
}
MEXTADD(m0, sf_buf_kva(sf), PAGE_SIZE, sf_buf_mext,
sfs, sf, M_RDONLY, EXT_SFBUF);
m0->m_data = (char *)sf_buf_kva(sf) + pgoff;
m0->m_len = xfsize;
/* Append to mbuf chain. */
if (m != NULL)
m_cat(m, m0);
else
m = m0;
/* Keep track of bits processed. */
loopbytes += xfsize;
off += xfsize;
if (sfs != NULL) {
mtx_lock(&sfs->mtx);
sfs->count++;
mtx_unlock(&sfs->mtx);
}
}
/* Add the buffer chain to the socket buffer. */
if (m != NULL) {
int mlen, err;
mlen = m_length(m, NULL);
SOCKBUF_LOCK(&so->so_snd);
if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
error = EPIPE;
SOCKBUF_UNLOCK(&so->so_snd);
goto done;
}
SOCKBUF_UNLOCK(&so->so_snd);
CURVNET_SET(so->so_vnet);
/* Avoid error aliasing. */
err = (*so->so_proto->pr_usrreqs->pru_send)
(so, 0, m, NULL, NULL, td);
CURVNET_RESTORE();
if (err == 0) {
/*
* We need two counters to get the
* file offset and nbytes to send
* right:
* - sbytes contains the total amount
* of bytes sent, including headers.
* - fsbytes contains the total amount
* of bytes sent from the file.
*/
sbytes += mlen;
fsbytes += mlen;
if (hdrlen) {
fsbytes -= hdrlen;
hdrlen = 0;
}
} else if (error == 0)
error = err;
m = NULL; /* pru_send always consumes */
}
/* Quit outer loop on error or when we're done. */
if (done)
break;
if (error)
goto done;
}
/*
* Send trailers. Wimp out and use writev(2).
*/
if (trl_uio != NULL) {
sbunlock(&so->so_snd);
error = kern_writev(td, uap->s, trl_uio);
if (error == 0)
sbytes += td->td_retval[0];
goto out;
}
done:
sbunlock(&so->so_snd);
out:
/*
* If there was no error we have to clear td->td_retval[0]
* because it may have been set by writev.
*/
if (error == 0) {
td->td_retval[0] = 0;
}
if (uap->sbytes != NULL) {
copyout(&sbytes, uap->sbytes, sizeof(off_t));
}
if (obj != NULL)
vm_object_deallocate(obj);
if (vp != NULL) {
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
}
if (so)
fdrop(sock_fp, td);
if (m)
m_freem(m);
if (sfs != NULL) {
mtx_lock(&sfs->mtx);
if (sfs->count != 0)
cv_wait(&sfs->cv, &sfs->mtx);
KASSERT(sfs->count == 0, ("sendfile sync still busy"));
cv_destroy(&sfs->cv);
mtx_destroy(&sfs->mtx);
free(sfs, M_TEMP);
}
if (error == ERESTART)
error = EINTR;
return (error);
}
/*
* SCTP syscalls.
* Functionality only compiled in if SCTP is defined in the kernel Makefile,
* otherwise all return EOPNOTSUPP.
* XXX: We should make this loadable one day.
*/
int
-sctp_peeloff(td, uap)
+sys_sctp_peeloff(td, uap)
struct thread *td;
struct sctp_peeloff_args /* {
int sd;
caddr_t name;
} */ *uap;
{
#if (defined(INET) || defined(INET6)) && defined(SCTP)
struct filedesc *fdp;
struct file *nfp = NULL;
int error;
struct socket *head, *so;
int fd;
u_int fflag;
fdp = td->td_proc->p_fd;
AUDIT_ARG_FD(uap->sd);
error = fgetsock(td, uap->sd, CAP_PEELOFF, &head, &fflag);
if (error)
goto done2;
error = sctp_can_peel_off(head, (sctp_assoc_t)uap->name);
if (error)
goto done2;
/*
* At this point we know we have an assoc to pull; we
* proceed to get the fd set up. This may block,
* but that is OK.
*/
error = falloc(td, &nfp, &fd, 0);
if (error)
goto done;
td->td_retval[0] = fd;
CURVNET_SET(head->so_vnet);
so = sonewconn(head, SS_ISCONNECTED);
if (so == NULL)
goto noconnection;
/*
* Before changing the flags on the socket, we have to bump the
* reference count. Otherwise, if the protocol calls sofree(),
* the socket will be released due to a zero refcount.
*/
SOCK_LOCK(so);
soref(so); /* file descriptor reference */
SOCK_UNLOCK(so);
ACCEPT_LOCK();
TAILQ_REMOVE(&head->so_comp, so, so_list);
head->so_qlen--;
so->so_state |= (head->so_state & SS_NBIO);
so->so_state &= ~SS_NOFDREF;
so->so_qstate &= ~SQ_COMP;
so->so_head = NULL;
ACCEPT_UNLOCK();
finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name);
if (error)
goto noconnection;
if (head->so_sigio != NULL)
fsetown(fgetown(&head->so_sigio), &so->so_sigio);
noconnection:
/*
* close the new descriptor, assuming someone hasn't ripped it
* out from under us.
*/
if (error)
fdclose(fdp, nfp, fd, td);
/*
* Release explicitly held references before returning.
*/
CURVNET_RESTORE();
done:
if (nfp != NULL)
fdrop(nfp, td);
fputsock(head);
done2:
return (error);
#else /* SCTP */
return (EOPNOTSUPP);
#endif /* SCTP */
}
int
-sctp_generic_sendmsg (td, uap)
+sys_sctp_generic_sendmsg (td, uap)
struct thread *td;
struct sctp_generic_sendmsg_args /* {
int sd,
caddr_t msg,
int mlen,
caddr_t to,
__socklen_t tolen,
struct sctp_sndrcvinfo *sinfo,
int flags
} */ *uap;
{
#if (defined(INET) || defined(INET6)) && defined(SCTP)
struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
struct socket *so;
struct file *fp = NULL;
int error = 0, len;
struct sockaddr *to = NULL;
#ifdef KTRACE
struct uio *ktruio = NULL;
#endif
struct uio auio;
struct iovec iov[1];
cap_rights_t rights;
if (uap->sinfo) {
error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
if (error)
return (error);
u_sinfo = &sinfo;
}
rights = CAP_WRITE;
if (uap->tolen) {
error = getsockaddr(&to, uap->to, uap->tolen);
if (error) {
to = NULL;
goto sctp_bad2;
}
rights |= CAP_CONNECT;
}
AUDIT_ARG_FD(uap->sd);
error = getsock_cap(td->td_proc->p_fd, uap->sd, rights, &fp, NULL);
if (error)
goto sctp_bad;
#ifdef KTRACE
if (to && (KTRPOINT(td, KTR_STRUCT)))
ktrsockaddr(to);
#endif
iov[0].iov_base = uap->msg;
iov[0].iov_len = uap->mlen;
so = (struct socket *)fp->f_data;
#ifdef MAC
error = mac_socket_check_send(td->td_ucred, so);
if (error)
goto sctp_bad;
#endif /* MAC */
auio.uio_iov = iov;
auio.uio_iovcnt = 1;
auio.uio_segflg = UIO_USERSPACE;
auio.uio_rw = UIO_WRITE;
auio.uio_td = td;
auio.uio_offset = 0; /* XXX */
auio.uio_resid = 0;
len = auio.uio_resid = uap->mlen;
CURVNET_SET(so->so_vnet);
error = sctp_lower_sosend(so, to, &auio,
(struct mbuf *)NULL, (struct mbuf *)NULL,
uap->flags, u_sinfo, td);
CURVNET_RESTORE();
if (error) {
if (auio.uio_resid != len && (error == ERESTART ||
error == EINTR || error == EWOULDBLOCK))
error = 0;
/* Generation of SIGPIPE can be controlled per socket. */
if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
!(uap->flags & MSG_NOSIGNAL)) {
PROC_LOCK(td->td_proc);
tdsignal(td, SIGPIPE);
PROC_UNLOCK(td->td_proc);
}
}
if (error == 0)
td->td_retval[0] = len - auio.uio_resid;
#ifdef KTRACE
if (ktruio != NULL) {
ktruio->uio_resid = td->td_retval[0];
ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
}
#endif /* KTRACE */
sctp_bad:
if (fp)
fdrop(fp, td);
sctp_bad2:
if (to)
free(to, M_SONAME);
return (error);
#else /* SCTP */
return (EOPNOTSUPP);
#endif /* SCTP */
}
int
-sctp_generic_sendmsg_iov(td, uap)
+sys_sctp_generic_sendmsg_iov(td, uap)
struct thread *td;
struct sctp_generic_sendmsg_iov_args /* {
int sd,
struct iovec *iov,
int iovlen,
caddr_t to,
__socklen_t tolen,
struct sctp_sndrcvinfo *sinfo,
int flags
} */ *uap;
{
#if (defined(INET) || defined(INET6)) && defined(SCTP)
struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
struct socket *so;
struct file *fp = NULL;
int error = 0, len, i;
struct sockaddr *to = NULL;
#ifdef KTRACE
struct uio *ktruio = NULL;
#endif
struct uio auio;
struct iovec *iov, *tiov;
cap_rights_t rights;
if (uap->sinfo) {
error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
if (error)
return (error);
u_sinfo = &sinfo;
}
rights = CAP_WRITE;
if (uap->tolen) {
error = getsockaddr(&to, uap->to, uap->tolen);
if (error) {
to = NULL;
goto sctp_bad2;
}
rights |= CAP_CONNECT;
}
AUDIT_ARG_FD(uap->sd);
error = getsock_cap(td->td_proc->p_fd, uap->sd, rights, &fp, NULL);
if (error)
goto sctp_bad1;
#ifdef COMPAT_FREEBSD32
if (SV_CURPROC_FLAG(SV_ILP32))
error = freebsd32_copyiniov((struct iovec32 *)uap->iov,
uap->iovlen, &iov, EMSGSIZE);
else
#endif
error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
if (error)
goto sctp_bad1;
#ifdef KTRACE
if (to && (KTRPOINT(td, KTR_STRUCT)))
ktrsockaddr(to);
#endif
so = (struct socket *)fp->f_data;
#ifdef MAC
error = mac_socket_check_send(td->td_ucred, so);
if (error)
goto sctp_bad;
#endif /* MAC */
auio.uio_iov = iov;
auio.uio_iovcnt = uap->iovlen;
auio.uio_segflg = UIO_USERSPACE;
auio.uio_rw = UIO_WRITE;
auio.uio_td = td;
auio.uio_offset = 0; /* XXX */
auio.uio_resid = 0;
tiov = iov;
for (i = 0; i < uap->iovlen; i++, tiov++) {
if ((auio.uio_resid += tiov->iov_len) < 0) {
error = EINVAL;
goto sctp_bad;
}
}
len = auio.uio_resid;
CURVNET_SET(so->so_vnet);
error = sctp_lower_sosend(so, to, &auio,
(struct mbuf *)NULL, (struct mbuf *)NULL,
uap->flags, u_sinfo, td);
CURVNET_RESTORE();
if (error) {
if (auio.uio_resid != len && (error == ERESTART ||
error == EINTR || error == EWOULDBLOCK))
error = 0;
/* Generation of SIGPIPE can be controlled per socket */
if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
!(uap->flags & MSG_NOSIGNAL)) {
PROC_LOCK(td->td_proc);
tdsignal(td, SIGPIPE);
PROC_UNLOCK(td->td_proc);
}
}
if (error == 0)
td->td_retval[0] = len - auio.uio_resid;
#ifdef KTRACE
if (ktruio != NULL) {
ktruio->uio_resid = td->td_retval[0];
ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
}
#endif /* KTRACE */
sctp_bad:
free(iov, M_IOV);
sctp_bad1:
if (fp)
fdrop(fp, td);
sctp_bad2:
if (to)
free(to, M_SONAME);
return (error);
#else /* SCTP */
return (EOPNOTSUPP);
#endif /* SCTP */
}
int
-sctp_generic_recvmsg(td, uap)
+sys_sctp_generic_recvmsg(td, uap)
struct thread *td;
struct sctp_generic_recvmsg_args /* {
int sd,
struct iovec *iov,
int iovlen,
struct sockaddr *from,
__socklen_t *fromlenaddr,
struct sctp_sndrcvinfo *sinfo,
int *msg_flags
} */ *uap;
{
#if (defined(INET) || defined(INET6)) && defined(SCTP)
uint8_t sockbufstore[256];
struct uio auio;
struct iovec *iov, *tiov;
struct sctp_sndrcvinfo sinfo;
struct socket *so;
struct file *fp = NULL;
struct sockaddr *fromsa;
int fromlen;
int len, i, msg_flags;
int error = 0;
#ifdef KTRACE
struct uio *ktruio = NULL;
#endif
AUDIT_ARG_FD(uap->sd);
error = getsock_cap(td->td_proc->p_fd, uap->sd, CAP_READ, &fp, NULL);
if (error) {
return (error);
}
#ifdef COMPAT_FREEBSD32
if (SV_CURPROC_FLAG(SV_ILP32))
error = freebsd32_copyiniov((struct iovec32 *)uap->iov,
uap->iovlen, &iov, EMSGSIZE);
else
#endif
error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
if (error)
goto out1;
so = fp->f_data;
#ifdef MAC
error = mac_socket_check_receive(td->td_ucred, so);
if (error) {
goto out;
}
#endif /* MAC */
if (uap->fromlenaddr) {
error = copyin(uap->fromlenaddr,
&fromlen, sizeof (fromlen));
if (error) {
goto out;
}
} else {
fromlen = 0;
}
if (uap->msg_flags) {
error = copyin(uap->msg_flags, &msg_flags, sizeof (int));
if (error) {
goto out;
}
} else {
msg_flags = 0;
}
auio.uio_iov = iov;
auio.uio_iovcnt = uap->iovlen;
auio.uio_segflg = UIO_USERSPACE;
auio.uio_rw = UIO_READ;
auio.uio_td = td;
auio.uio_offset = 0; /* XXX */
auio.uio_resid = 0;
tiov = iov;
for (i = 0; i < uap->iovlen; i++, tiov++) {
if ((auio.uio_resid += tiov->iov_len) < 0) {
error = EINVAL;
goto out;
}
}
len = auio.uio_resid;
fromsa = (struct sockaddr *)sockbufstore;
#ifdef KTRACE
if (KTRPOINT(td, KTR_GENIO))
ktruio = cloneuio(&auio);
#endif /* KTRACE */
memset(&sinfo, 0, sizeof(struct sctp_sndrcvinfo));
CURVNET_SET(so->so_vnet);
error = sctp_sorecvmsg(so, &auio, (struct mbuf **)NULL,
fromsa, fromlen, &msg_flags,
(struct sctp_sndrcvinfo *)&sinfo, 1);
CURVNET_RESTORE();
if (error) {
if (auio.uio_resid != (int)len && (error == ERESTART ||
error == EINTR || error == EWOULDBLOCK))
error = 0;
} else {
if (uap->sinfo)
error = copyout(&sinfo, uap->sinfo, sizeof (sinfo));
}
#ifdef KTRACE
if (ktruio != NULL) {
ktruio->uio_resid = (int)len - auio.uio_resid;
ktrgenio(uap->sd, UIO_READ, ktruio, error);
}
#endif /* KTRACE */
if (error)
goto out;
td->td_retval[0] = (int)len - auio.uio_resid;
if (fromlen && uap->from) {
len = fromlen;
if (len <= 0 || fromsa == 0)
len = 0;
else {
len = MIN(len, fromsa->sa_len);
error = copyout(fromsa, uap->from, (unsigned)len);
if (error)
goto out;
}
error = copyout(&len, uap->fromlenaddr, sizeof (socklen_t));
if (error) {
goto out;
}
}
#ifdef KTRACE
if (KTRPOINT(td, KTR_STRUCT))
ktrsockaddr(fromsa);
#endif
if (uap->msg_flags) {
error = copyout(&msg_flags, uap->msg_flags, sizeof (int));
if (error) {
goto out;
}
}
out:
free(iov, M_IOV);
out1:
if (fp)
fdrop(fp, td);
return (error);
#else /* SCTP */
return (EOPNOTSUPP);
#endif /* SCTP */
}
Index: head/sys/kern/vfs_acl.c
===================================================================
--- head/sys/kern/vfs_acl.c (revision 225616)
+++ head/sys/kern/vfs_acl.c (revision 225617)
@@ -1,577 +1,577 @@
/*-
* Copyright (c) 1999-2006 Robert N. M. Watson
* All rights reserved.
*
* This software was developed by Robert Watson for the TrustedBSD Project.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Developed by the TrustedBSD Project.
*
* ACL system calls and other functions common across different ACL types.
* Type-specific routines go into subr_acl_<type>.c.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/capability.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/sysent.h>
#include <sys/acl.h>
#include <security/mac/mac_framework.h>
CTASSERT(ACL_MAX_ENTRIES >= OLDACL_MAX_ENTRIES);
MALLOC_DEFINE(M_ACL, "acl", "Access Control Lists");
static int vacl_set_acl(struct thread *td, struct vnode *vp,
acl_type_t type, struct acl *aclp);
static int vacl_get_acl(struct thread *td, struct vnode *vp,
acl_type_t type, struct acl *aclp);
static int vacl_aclcheck(struct thread *td, struct vnode *vp,
acl_type_t type, struct acl *aclp);
int
acl_copy_oldacl_into_acl(const struct oldacl *source, struct acl *dest)
{
int i;
if (source->acl_cnt < 0 || source->acl_cnt > OLDACL_MAX_ENTRIES)
return (EINVAL);
bzero(dest, sizeof(*dest));
dest->acl_cnt = source->acl_cnt;
dest->acl_maxcnt = ACL_MAX_ENTRIES;
for (i = 0; i < dest->acl_cnt; i++) {
dest->acl_entry[i].ae_tag = source->acl_entry[i].ae_tag;
dest->acl_entry[i].ae_id = source->acl_entry[i].ae_id;
dest->acl_entry[i].ae_perm = source->acl_entry[i].ae_perm;
}
return (0);
}
int
acl_copy_acl_into_oldacl(const struct acl *source, struct oldacl *dest)
{
int i;
if (source->acl_cnt > OLDACL_MAX_ENTRIES)
return (EINVAL);
bzero(dest, sizeof(*dest));
dest->acl_cnt = source->acl_cnt;
for (i = 0; i < dest->acl_cnt; i++) {
dest->acl_entry[i].ae_tag = source->acl_entry[i].ae_tag;
dest->acl_entry[i].ae_id = source->acl_entry[i].ae_id;
dest->acl_entry[i].ae_perm = source->acl_entry[i].ae_perm;
}
return (0);
}
/*
* At one time, "struct acl" was extended in order to add support for NFSv4
* ACLs. Instead of creating compatibility versions of all the ACL-related
* syscalls, they were left intact. It's possible to find out what the code
* calling these syscalls (libc) expects based on the "type" argument - if it's
* either ACL_TYPE_ACCESS_OLD or ACL_TYPE_DEFAULT_OLD (which previously were
* known as ACL_TYPE_ACCESS and ACL_TYPE_DEFAULT), then it's the "struct
* oldacl". If it's something else, then it's the new "struct acl". In the
* latter case, the routines below just copyin/copyout the contents. In the
* former case, they copyin the "struct oldacl" and convert it to the new
* format.
*/
static int
acl_copyin(void *user_acl, struct acl *kernel_acl, acl_type_t type)
{
int error;
struct oldacl old;
switch (type) {
case ACL_TYPE_ACCESS_OLD:
case ACL_TYPE_DEFAULT_OLD:
error = copyin(user_acl, &old, sizeof(old));
if (error != 0)
break;
acl_copy_oldacl_into_acl(&old, kernel_acl);
break;
default:
error = copyin(user_acl, kernel_acl, sizeof(*kernel_acl));
if (kernel_acl->acl_maxcnt != ACL_MAX_ENTRIES)
return (EINVAL);
}
return (error);
}
static int
acl_copyout(struct acl *kernel_acl, void *user_acl, acl_type_t type)
{
int error;
struct oldacl old;
switch (type) {
case ACL_TYPE_ACCESS_OLD:
case ACL_TYPE_DEFAULT_OLD:
error = acl_copy_acl_into_oldacl(kernel_acl, &old);
if (error != 0)
break;
error = copyout(&old, user_acl, sizeof(old));
break;
default:
if (fuword32((char *)user_acl +
offsetof(struct acl, acl_maxcnt)) != ACL_MAX_ENTRIES)
return (EINVAL);
error = copyout(kernel_acl, user_acl, sizeof(*kernel_acl));
}
return (error);
}
/*
* Convert "old" type - ACL_TYPE_{ACCESS,DEFAULT}_OLD - into its "new"
* counterpart. It's required for an old (pre-NFSv4 ACLs) libc to work
* with a new kernel. Fixing 'type' for old binaries with a new libc
* is being done in lib/libc/posix1e/acl_support.c:_acl_type_unold().
*/
static int
acl_type_unold(int type)
{
switch (type) {
case ACL_TYPE_ACCESS_OLD:
return (ACL_TYPE_ACCESS);
case ACL_TYPE_DEFAULT_OLD:
return (ACL_TYPE_DEFAULT);
default:
return (type);
}
}
/*
* These calls wrap the real vnode operations, and are called by the syscall
* code once the syscall has converted the path or file descriptor to a vnode
* (unlocked). The aclp pointer is assumed still to point to userland, so
* this should not be consumed within the kernel except by syscall code.
* Other code should directly invoke VOP_{SET,GET}ACL.
*/
/*
* Given a vnode, set its ACL.
*/
static int
vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type,
struct acl *aclp)
{
struct acl *inkernelacl;
struct mount *mp;
int error;
inkernelacl = acl_alloc(M_WAITOK);
error = acl_copyin(aclp, inkernelacl, type);
if (error != 0)
goto out;
error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
if (error != 0)
goto out;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
#ifdef MAC
error = mac_vnode_check_setacl(td->td_ucred, vp, type, inkernelacl);
if (error != 0)
goto out_unlock;
#endif
error = VOP_SETACL(vp, acl_type_unold(type), inkernelacl,
td->td_ucred, td);
#ifdef MAC
out_unlock:
#endif
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
out:
acl_free(inkernelacl);
return (error);
}
/*
* Given a vnode, get its ACL.
*/
static int
vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type,
struct acl *aclp)
{
struct acl *inkernelacl;
int error;
inkernelacl = acl_alloc(M_WAITOK);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
#ifdef MAC
error = mac_vnode_check_getacl(td->td_ucred, vp, type);
if (error != 0)
goto out;
#endif
error = VOP_GETACL(vp, acl_type_unold(type), inkernelacl,
td->td_ucred, td);
#ifdef MAC
out:
#endif
VOP_UNLOCK(vp, 0);
if (error == 0)
error = acl_copyout(inkernelacl, aclp, type);
acl_free(inkernelacl);
return (error);
}
/*
* Given a vnode, delete its ACL.
*/
static int
vacl_delete(struct thread *td, struct vnode *vp, acl_type_t type)
{
struct mount *mp;
int error;
error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
if (error != 0)
return (error);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
#ifdef MAC
error = mac_vnode_check_deleteacl(td->td_ucred, vp, type);
if (error != 0)
goto out;
#endif
error = VOP_SETACL(vp, acl_type_unold(type), 0, td->td_ucred, td);
#ifdef MAC
out:
#endif
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
return (error);
}
/*
* Given a vnode, check whether an ACL is appropriate for it.
*/
static int
vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type,
struct acl *aclp)
{
struct acl *inkernelacl;
int error;
inkernelacl = acl_alloc(M_WAITOK);
error = acl_copyin(aclp, inkernelacl, type);
if (error != 0)
goto out;
error = VOP_ACLCHECK(vp, acl_type_unold(type), inkernelacl,
td->td_ucred, td);
out:
acl_free(inkernelacl);
return (error);
}
/*
* syscalls -- convert the path/fd to a vnode, and call vacl_whatever. Don't
* need to lock, as the vacl_ code will get/release any locks required.
*/
/*
* Given a file path, get an ACL for it
*/
int
-__acl_get_file(struct thread *td, struct __acl_get_file_args *uap)
+sys___acl_get_file(struct thread *td, struct __acl_get_file_args *uap)
{
struct nameidata nd;
int vfslocked, error;
NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td);
error = namei(&nd);
vfslocked = NDHASGIANT(&nd);
if (error == 0) {
error = vacl_get_acl(td, nd.ni_vp, uap->type, uap->aclp);
NDFREE(&nd, 0);
}
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Given a file path, get an ACL for it; don't follow links.
*/
int
-__acl_get_link(struct thread *td, struct __acl_get_link_args *uap)
+sys___acl_get_link(struct thread *td, struct __acl_get_link_args *uap)
{
struct nameidata nd;
int vfslocked, error;
NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td);
error = namei(&nd);
vfslocked = NDHASGIANT(&nd);
if (error == 0) {
error = vacl_get_acl(td, nd.ni_vp, uap->type, uap->aclp);
NDFREE(&nd, 0);
}
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Given a file path, set an ACL for it.
*/
int
-__acl_set_file(struct thread *td, struct __acl_set_file_args *uap)
+sys___acl_set_file(struct thread *td, struct __acl_set_file_args *uap)
{
struct nameidata nd;
int vfslocked, error;
NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td);
error = namei(&nd);
vfslocked = NDHASGIANT(&nd);
if (error == 0) {
error = vacl_set_acl(td, nd.ni_vp, uap->type, uap->aclp);
NDFREE(&nd, 0);
}
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Given a file path, set an ACL for it; don't follow links.
*/
int
-__acl_set_link(struct thread *td, struct __acl_set_link_args *uap)
+sys___acl_set_link(struct thread *td, struct __acl_set_link_args *uap)
{
struct nameidata nd;
int vfslocked, error;
NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td);
error = namei(&nd);
vfslocked = NDHASGIANT(&nd);
if (error == 0) {
error = vacl_set_acl(td, nd.ni_vp, uap->type, uap->aclp);
NDFREE(&nd, 0);
}
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Given a file descriptor, get an ACL for it.
*/
int
-__acl_get_fd(struct thread *td, struct __acl_get_fd_args *uap)
+sys___acl_get_fd(struct thread *td, struct __acl_get_fd_args *uap)
{
struct file *fp;
int vfslocked, error;
error = getvnode(td->td_proc->p_fd, uap->filedes, CAP_ACL_GET, &fp);
if (error == 0) {
vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
error = vacl_get_acl(td, fp->f_vnode, uap->type, uap->aclp);
fdrop(fp, td);
VFS_UNLOCK_GIANT(vfslocked);
}
return (error);
}
/*
* Given a file descriptor, set an ACL for it.
*/
int
-__acl_set_fd(struct thread *td, struct __acl_set_fd_args *uap)
+sys___acl_set_fd(struct thread *td, struct __acl_set_fd_args *uap)
{
struct file *fp;
int vfslocked, error;
error = getvnode(td->td_proc->p_fd, uap->filedes, CAP_ACL_SET, &fp);
if (error == 0) {
vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
error = vacl_set_acl(td, fp->f_vnode, uap->type, uap->aclp);
fdrop(fp, td);
VFS_UNLOCK_GIANT(vfslocked);
}
return (error);
}
/*
* Given a file path, delete an ACL from it.
*/
int
-__acl_delete_file(struct thread *td, struct __acl_delete_file_args *uap)
+sys___acl_delete_file(struct thread *td, struct __acl_delete_file_args *uap)
{
struct nameidata nd;
int vfslocked, error;
NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td);
error = namei(&nd);
vfslocked = NDHASGIANT(&nd);
if (error == 0) {
error = vacl_delete(td, nd.ni_vp, uap->type);
NDFREE(&nd, 0);
}
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Given a file path, delete an ACL from it; don't follow links.
*/
int
-__acl_delete_link(struct thread *td, struct __acl_delete_link_args *uap)
+sys___acl_delete_link(struct thread *td, struct __acl_delete_link_args *uap)
{
struct nameidata nd;
int vfslocked, error;
NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td);
error = namei(&nd);
vfslocked = NDHASGIANT(&nd);
if (error == 0) {
error = vacl_delete(td, nd.ni_vp, uap->type);
NDFREE(&nd, 0);
}
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Given a file descriptor, delete an ACL from it.
*/
int
-__acl_delete_fd(struct thread *td, struct __acl_delete_fd_args *uap)
+sys___acl_delete_fd(struct thread *td, struct __acl_delete_fd_args *uap)
{
struct file *fp;
int vfslocked, error;
error = getvnode(td->td_proc->p_fd, uap->filedes, CAP_ACL_DELETE,
&fp);
if (error == 0) {
vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
error = vacl_delete(td, fp->f_vnode, uap->type);
fdrop(fp, td);
VFS_UNLOCK_GIANT(vfslocked);
}
return (error);
}
/*
* Given a file path, check an ACL for it.
*/
int
-__acl_aclcheck_file(struct thread *td, struct __acl_aclcheck_file_args *uap)
+sys___acl_aclcheck_file(struct thread *td, struct __acl_aclcheck_file_args *uap)
{
struct nameidata nd;
int vfslocked, error;
NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td);
error = namei(&nd);
vfslocked = NDHASGIANT(&nd);
if (error == 0) {
error = vacl_aclcheck(td, nd.ni_vp, uap->type, uap->aclp);
NDFREE(&nd, 0);
}
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Given a file path, check an ACL for it; don't follow links.
*/
int
-__acl_aclcheck_link(struct thread *td, struct __acl_aclcheck_link_args *uap)
+sys___acl_aclcheck_link(struct thread *td, struct __acl_aclcheck_link_args *uap)
{
struct nameidata nd;
int vfslocked, error;
NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td);
error = namei(&nd);
vfslocked = NDHASGIANT(&nd);
if (error == 0) {
error = vacl_aclcheck(td, nd.ni_vp, uap->type, uap->aclp);
NDFREE(&nd, 0);
}
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Given a file descriptor, check an ACL for it.
*/
int
-__acl_aclcheck_fd(struct thread *td, struct __acl_aclcheck_fd_args *uap)
+sys___acl_aclcheck_fd(struct thread *td, struct __acl_aclcheck_fd_args *uap)
{
struct file *fp;
int vfslocked, error;
error = getvnode(td->td_proc->p_fd, uap->filedes, CAP_ACL_CHECK,
&fp);
if (error == 0) {
vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
error = vacl_aclcheck(td, fp->f_vnode, uap->type, uap->aclp);
fdrop(fp, td);
VFS_UNLOCK_GIANT(vfslocked);
}
return (error);
}
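/*
 * Allocate and free the in-kernel 'struct acl'.  'flags' is passed
 * straight to malloc(9), so callers choose M_WAITOK or M_NOWAIT.
 */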
struct acl *
acl_alloc(int flags)
{
struct acl *aclp;
aclp = malloc(sizeof(*aclp), M_ACL, flags);
aclp->acl_maxcnt = ACL_MAX_ENTRIES;
return (aclp);
}
void
acl_free(struct acl *aclp)
{
free(aclp, M_ACL);
}
Index: head/sys/kern/vfs_aio.c
===================================================================
--- head/sys/kern/vfs_aio.c (revision 225616)
+++ head/sys/kern/vfs_aio.c (revision 225617)
@@ -1,3002 +1,3002 @@
/*-
* Copyright (c) 1997 John S. Dyson. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. John S. Dyson's name may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* DISCLAIMER: This code isn't warranted to do anything useful. Anything
* bad that happens because of using this software isn't the responsibility
* of the author. This software is distributed AS-IS.
*/
/*
* This file contains support for the POSIX 1003.1B AIO/LIO facility.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/capability.h>
#include <sys/eventhandler.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/kthread.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/unistd.h>
#include <sys/posix4.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/protosw.h>
#include <sys/sema.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syscall.h>
#include <sys/sysent.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <sys/taskqueue.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <sys/event.h>
#include <sys/mount.h>
#include <machine/atomic.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/uma.h>
#include <sys/aio.h>
#include "opt_vfs_aio.h"
/*
* Counter for allocating reference ids to new jobs. Wrapped to 1 on
* overflow. (XXX will be removed soon.)
*/
static u_long jobrefid;
/*
* Counter for aio_fsync.
*/
static uint64_t jobseqno;
#define JOBST_NULL 0
#define JOBST_JOBQSOCK 1
#define JOBST_JOBQGLOBAL 2
#define JOBST_JOBRUNNING 3
#define JOBST_JOBFINISHED 4
#define JOBST_JOBQBUF 5
#define JOBST_JOBQSYNC 6
#ifndef MAX_AIO_PER_PROC
#define MAX_AIO_PER_PROC 32
#endif
#ifndef MAX_AIO_QUEUE_PER_PROC
#define MAX_AIO_QUEUE_PER_PROC 256 /* Bigger than AIO_LISTIO_MAX */
#endif
#ifndef MAX_AIO_PROCS
#define MAX_AIO_PROCS 32
#endif
#ifndef MAX_AIO_QUEUE
#define MAX_AIO_QUEUE 1024 /* Bigger than AIO_LISTIO_MAX */
#endif
#ifndef TARGET_AIO_PROCS
#define TARGET_AIO_PROCS 4
#endif
#ifndef MAX_BUF_AIO
#define MAX_BUF_AIO 16
#endif
#ifndef AIOD_TIMEOUT_DEFAULT
#define AIOD_TIMEOUT_DEFAULT (10 * hz)
#endif
#ifndef AIOD_LIFETIME_DEFAULT
#define AIOD_LIFETIME_DEFAULT (30 * hz)
#endif
FEATURE(aio, "Asynchronous I/O");
static MALLOC_DEFINE(M_LIO, "lio", "listio aio control block list");
static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "Async IO management");
static int max_aio_procs = MAX_AIO_PROCS;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
CTLFLAG_RW, &max_aio_procs, 0,
"Maximum number of kernel threads to use for handling async IO ");
static int num_aio_procs = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
CTLFLAG_RD, &num_aio_procs, 0,
"Number of presently active kernel threads for async IO");
/*
* The code will adjust the actual number of AIO processes towards this
* number when it gets a chance.
*/
static int target_aio_procs = TARGET_AIO_PROCS;
SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs,
0, "Preferred number of ready kernel threads for async IO");
static int max_queue_count = MAX_AIO_QUEUE;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0,
"Maximum number of aio requests to queue, globally");
static int num_queue_count = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0,
"Number of queued aio requests");
static int num_buf_aio = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0,
"Number of aio requests presently handled by the buf subsystem");
/* Number of async I/O threads in the process of being started */
/* XXX This should be local to aio_aqueue() */
static int num_aio_resv_start = 0;
static int aiod_timeout;
SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout, CTLFLAG_RW, &aiod_timeout, 0,
"Timeout value for synchronous aio operations");
static int aiod_lifetime;
SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0,
"Maximum lifetime for idle aiod");
static int unloadable = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0,
"Allow unload of aio (not recommended)");
static int max_aio_per_proc = MAX_AIO_PER_PROC;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc,
0, "Maximum active aio requests per process (stored in the process)");
static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW,
&max_aio_queue_per_proc, 0,
"Maximum queued aio requests per process (stored in the process)");
static int max_buf_aio = MAX_BUF_AIO;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
"Maximum buf aio requests per process (stored in the process)");
typedef struct oaiocb {
int aio_fildes; /* File descriptor */
off_t aio_offset; /* File offset for I/O */
volatile void *aio_buf; /* I/O buffer in process space */
size_t aio_nbytes; /* Number of bytes for I/O */
struct osigevent aio_sigevent; /* Signal to deliver */
int aio_lio_opcode; /* LIO opcode */
int aio_reqprio; /* Request priority -- ignored */
struct __aiocb_private _aiocb_private;
} oaiocb_t;
/*
* Below is a key to the lock annotations used for each member of struct
* aiocblist, struct aioliojob, struct kaioinfo, and the backends.
*
* * - need not be protected
* a - locked by the kaioinfo lock
* b - locked by the backend lock; the backend lock may be NULL in some
*     cases (for example BIO), in which case the proc lock is
*     reused.
* c - locked by aio_job_mtx, the lock for the generic file I/O backend.
*/
/*
* Currently there are only two backends: BIO and generic file I/O.
* Socket I/O is served by the generic file I/O backend.  This is not a good
* idea: disk file I/O and any other type opened without O_NONBLOCK can block
* the daemon threads, and if no thread is left to serve socket I/O, the
* socket I/O will be delayed too long or starved.  We should create threads
* dedicated to sockets that do non-blocking I/O, and likewise for pipes and
* fifos; for these we really need a non-blocking interface, since fiddling
* with O_NONBLOCK in the file structure is not safe because of the race
* between userland and the aio daemons.
*/
struct aiocblist {
TAILQ_ENTRY(aiocblist) list; /* (b) internal list for the backend */
TAILQ_ENTRY(aiocblist) plist; /* (a) list of jobs for each backend */
TAILQ_ENTRY(aiocblist) allist; /* (a) list of all jobs in proc */
int jobflags; /* (a) job flags */
int jobstate; /* (b) job state */
int inputcharge; /* (*) input blocks */
int outputcharge; /* (*) output blocks */
struct buf *bp; /* (*) private to BIO backend,
* buffer pointer
*/
struct proc *userproc; /* (*) user process */
struct ucred *cred; /* (*) active credential when created */
struct file *fd_file; /* (*) pointer to file structure */
struct aioliojob *lio; /* (*) optional lio job */
struct aiocb *uuaiocb; /* (*) pointer in userspace of aiocb */
struct knlist klist; /* (a) list of knotes */
struct aiocb uaiocb; /* (*) kernel I/O control block */
ksiginfo_t ksi; /* (a) realtime signal info */
struct task biotask; /* (*) private to BIO backend */
uint64_t seqno; /* (*) job number */
int pending; /* (a) number of pending I/O, aio_fsync only */
};
/* jobflags */
#define AIOCBLIST_DONE 0x01
#define AIOCBLIST_BUFDONE 0x02
#define AIOCBLIST_RUNDOWN 0x04
#define AIOCBLIST_CHECKSYNC 0x08
/*
* AIO process info
*/
#define AIOP_FREE 0x1 /* proc on free queue */
struct aiothreadlist {
int aiothreadflags; /* (c) AIO proc flags */
TAILQ_ENTRY(aiothreadlist) list; /* (c) list of processes */
struct thread *aiothread; /* (*) the AIO thread */
};
/*
* data-structure for lio signal management
*/
struct aioliojob {
int lioj_flags; /* (a) listio flags */
int lioj_count; /* (a) count of jobs in this listio */
int lioj_finished_count; /* (a) count of finished jobs in this listio */
struct sigevent lioj_signal; /* (a) signal on all I/O done */
TAILQ_ENTRY(aioliojob) lioj_list; /* (a) lio list */
struct knlist klist; /* (a) list of knotes */
ksiginfo_t lioj_ksi; /* (a) Realtime signal info */
};
#define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */
#define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */
#define LIOJ_KEVENT_POSTED 0x4 /* kevent triggered */
/*
* per process aio data structure
*/
struct kaioinfo {
struct mtx kaio_mtx; /* the lock to protect this struct */
int kaio_flags; /* (a) per process kaio flags */
int kaio_maxactive_count; /* (*) maximum number of AIOs */
int kaio_active_count; /* (c) number of currently used AIOs */
int kaio_qallowed_count; /* (*) maximum size of AIO queue */
int kaio_count; /* (a) size of AIO queue */
int kaio_ballowed_count; /* (*) maximum number of buffers */
int kaio_buffer_count; /* (a) number of physio buffers */
TAILQ_HEAD(,aiocblist) kaio_all; /* (a) all AIOs in the process */
TAILQ_HEAD(,aiocblist) kaio_done; /* (a) done queue for process */
TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */
TAILQ_HEAD(,aiocblist) kaio_jobqueue; /* (a) job queue for process */
TAILQ_HEAD(,aiocblist) kaio_bufqueue; /* (a) buffer job queue for process */
TAILQ_HEAD(,aiocblist) kaio_sockqueue; /* (a) queue for aios waiting on sockets,
* NOT USED YET.
*/
TAILQ_HEAD(,aiocblist) kaio_syncqueue; /* (a) queue for aio_fsync */
struct task kaio_task; /* (*) task to kick aio threads */
};
#define AIO_LOCK(ki) mtx_lock(&(ki)->kaio_mtx)
#define AIO_UNLOCK(ki) mtx_unlock(&(ki)->kaio_mtx)
#define AIO_LOCK_ASSERT(ki, f) mtx_assert(&(ki)->kaio_mtx, (f))
#define AIO_MTX(ki) (&(ki)->kaio_mtx)
#define KAIO_RUNDOWN 0x1 /* process is being run down */
#define KAIO_WAKEUP 0x2 /* wakeup process when there is a significant event */
/*
* Operations used to interact with userland aio control blocks.
* Different ABIs provide their own operations.
*/
struct aiocb_ops {
int (*copyin)(struct aiocb *ujob, struct aiocb *kjob);
long (*fetch_status)(struct aiocb *ujob);
long (*fetch_error)(struct aiocb *ujob);
int (*store_status)(struct aiocb *ujob, long status);
int (*store_error)(struct aiocb *ujob, long error);
int (*store_kernelinfo)(struct aiocb *ujob, long jobref);
int (*store_aiocb)(struct aiocb **ujobp, struct aiocb *ujob);
};
static TAILQ_HEAD(,aiothreadlist) aio_freeproc; /* (c) Idle daemons */
static struct sema aio_newproc_sem;
static struct mtx aio_job_mtx;
static struct mtx aio_sock_mtx;
static TAILQ_HEAD(,aiocblist) aio_jobs; /* (c) Async job list */
static struct unrhdr *aiod_unr;
void aio_init_aioinfo(struct proc *p);
static int aio_onceonly(void);
static int aio_free_entry(struct aiocblist *aiocbe);
static void aio_process(struct aiocblist *aiocbe);
static int aio_newproc(int *);
int aio_aqueue(struct thread *td, struct aiocb *job,
struct aioliojob *lio, int type, struct aiocb_ops *ops);
static void aio_physwakeup(struct buf *bp);
static void aio_proc_rundown(void *arg, struct proc *p);
static void aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp);
static int aio_qphysio(struct proc *p, struct aiocblist *iocb);
static void biohelper(void *, int);
static void aio_daemon(void *param);
static void aio_swake_cb(struct socket *, struct sockbuf *);
static int aio_unload(void);
static void aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type);
#define DONE_BUF 1
#define DONE_QUEUE 2
static int aio_kick(struct proc *userp);
static void aio_kick_nowait(struct proc *userp);
static void aio_kick_helper(void *context, int pending);
static int filt_aioattach(struct knote *kn);
static void filt_aiodetach(struct knote *kn);
static int filt_aio(struct knote *kn, long hint);
static int filt_lioattach(struct knote *kn);
static void filt_liodetach(struct knote *kn);
static int filt_lio(struct knote *kn, long hint);
/*
* Zones for:
* kaio Per process async io info
* aiop async io thread data
* aiocb async io jobs
* aiol list io job pointer - internal to aio_suspend XXX
* aiolio list io jobs
*/
static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone, aiolio_zone;
/* kqueue filters for aio */
static struct filterops aio_filtops = {
.f_isfd = 0,
.f_attach = filt_aioattach,
.f_detach = filt_aiodetach,
.f_event = filt_aio,
};
static struct filterops lio_filtops = {
.f_isfd = 0,
.f_attach = filt_lioattach,
.f_detach = filt_liodetach,
.f_event = filt_lio
};
static eventhandler_tag exit_tag, exec_tag;
TASKQUEUE_DEFINE_THREAD(aiod_bio);
/*
* Main operations function for use as a kernel module.
*/
static int
aio_modload(struct module *module, int cmd, void *arg)
{
int error = 0;
switch (cmd) {
case MOD_LOAD:
aio_onceonly();
break;
case MOD_UNLOAD:
error = aio_unload();
break;
case MOD_SHUTDOWN:
break;
default:
error = EINVAL;
break;
}
return (error);
}
static moduledata_t aio_mod = {
"aio",
&aio_modload,
NULL
};
static struct syscall_helper_data aio_syscalls[] = {
SYSCALL_INIT_HELPER(aio_cancel),
SYSCALL_INIT_HELPER(aio_error),
SYSCALL_INIT_HELPER(aio_fsync),
SYSCALL_INIT_HELPER(aio_read),
SYSCALL_INIT_HELPER(aio_return),
SYSCALL_INIT_HELPER(aio_suspend),
SYSCALL_INIT_HELPER(aio_waitcomplete),
SYSCALL_INIT_HELPER(aio_write),
SYSCALL_INIT_HELPER(lio_listio),
SYSCALL_INIT_HELPER(oaio_read),
SYSCALL_INIT_HELPER(oaio_write),
SYSCALL_INIT_HELPER(olio_listio),
SYSCALL_INIT_LAST
};
#ifdef COMPAT_FREEBSD32
#include <sys/mount.h>
#include <sys/socket.h>
#include <compat/freebsd32/freebsd32.h>
#include <compat/freebsd32/freebsd32_proto.h>
#include <compat/freebsd32/freebsd32_signal.h>
#include <compat/freebsd32/freebsd32_syscall.h>
#include <compat/freebsd32/freebsd32_util.h>
static struct syscall_helper_data aio32_syscalls[] = {
SYSCALL32_INIT_HELPER(freebsd32_aio_return),
SYSCALL32_INIT_HELPER(freebsd32_aio_suspend),
SYSCALL32_INIT_HELPER(freebsd32_aio_cancel),
SYSCALL32_INIT_HELPER(freebsd32_aio_error),
SYSCALL32_INIT_HELPER(freebsd32_aio_fsync),
SYSCALL32_INIT_HELPER(freebsd32_aio_read),
SYSCALL32_INIT_HELPER(freebsd32_aio_write),
SYSCALL32_INIT_HELPER(freebsd32_aio_waitcomplete),
SYSCALL32_INIT_HELPER(freebsd32_lio_listio),
SYSCALL32_INIT_HELPER(freebsd32_oaio_read),
SYSCALL32_INIT_HELPER(freebsd32_oaio_write),
SYSCALL32_INIT_HELPER(freebsd32_olio_listio),
SYSCALL_INIT_LAST
};
#endif
DECLARE_MODULE(aio, aio_mod,
SI_SUB_VFS, SI_ORDER_ANY);
MODULE_VERSION(aio, 1);
/*
* Startup initialization
*/
static int
aio_onceonly(void)
{
int error;
/* XXX: should probably just use so->callback */
aio_swake = &aio_swake_cb;
exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL,
EVENTHANDLER_PRI_ANY);
exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec, NULL,
EVENTHANDLER_PRI_ANY);
kqueue_add_filteropts(EVFILT_AIO, &aio_filtops);
kqueue_add_filteropts(EVFILT_LIO, &lio_filtops);
TAILQ_INIT(&aio_freeproc);
sema_init(&aio_newproc_sem, 0, "aio_new_proc");
mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF);
mtx_init(&aio_sock_mtx, "aio_sock", NULL, MTX_DEF);
TAILQ_INIT(&aio_jobs);
aiod_unr = new_unrhdr(1, INT_MAX, NULL);
kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
aiop_zone = uma_zcreate("AIOP", sizeof(struct aiothreadlist), NULL,
NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
aiocb_zone = uma_zcreate("AIOCB", sizeof(struct aiocblist), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
aiol_zone = uma_zcreate("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t) , NULL,
NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL,
NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
aiod_timeout = AIOD_TIMEOUT_DEFAULT;
aiod_lifetime = AIOD_LIFETIME_DEFAULT;
jobrefid = 1;
async_io_version = _POSIX_VERSION;
p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, AIO_LISTIO_MAX);
p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE);
p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0);
error = syscall_helper_register(aio_syscalls);
if (error)
return (error);
#ifdef COMPAT_FREEBSD32
error = syscall32_helper_register(aio32_syscalls);
if (error)
return (error);
#endif
return (0);
}
/*
* Callback for unload of AIO when used as a module.
*/
static int
aio_unload(void)
{
int error;
/*
* XXX: no unloads by default, it's too dangerous.
* perhaps we could do it if we locked out callers and then
* did an aio_proc_rundown() on each process.
*
* jhb: aio_proc_rundown() needs to run on curproc though,
* so I don't think that would fly.
*/
if (!unloadable)
return (EOPNOTSUPP);
#ifdef COMPAT_FREEBSD32
syscall32_helper_unregister(aio32_syscalls);
#endif
syscall_helper_unregister(aio_syscalls);
error = kqueue_del_filteropts(EVFILT_AIO);
if (error)
return error;
error = kqueue_del_filteropts(EVFILT_LIO);
if (error)
return error;
async_io_version = 0;
aio_swake = NULL;
taskqueue_free(taskqueue_aiod_bio);
delete_unrhdr(aiod_unr);
uma_zdestroy(kaio_zone);
uma_zdestroy(aiop_zone);
uma_zdestroy(aiocb_zone);
uma_zdestroy(aiol_zone);
uma_zdestroy(aiolio_zone);
EVENTHANDLER_DEREGISTER(process_exit, exit_tag);
EVENTHANDLER_DEREGISTER(process_exec, exec_tag);
mtx_destroy(&aio_job_mtx);
mtx_destroy(&aio_sock_mtx);
sema_destroy(&aio_newproc_sem);
p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, -1);
p31b_setcfg(CTL_P1003_1B_AIO_MAX, -1);
p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, -1);
return (0);
}
/*
* Init the per-process aioinfo structure. The aioinfo limits are set
* per-process for user limit (resource) management.
*/
void
aio_init_aioinfo(struct proc *p)
{
struct kaioinfo *ki;
ki = uma_zalloc(kaio_zone, M_WAITOK);
mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF);
ki->kaio_flags = 0;
ki->kaio_maxactive_count = max_aio_per_proc;
ki->kaio_active_count = 0;
ki->kaio_qallowed_count = max_aio_queue_per_proc;
ki->kaio_count = 0;
ki->kaio_ballowed_count = max_buf_aio;
ki->kaio_buffer_count = 0;
TAILQ_INIT(&ki->kaio_all);
TAILQ_INIT(&ki->kaio_done);
TAILQ_INIT(&ki->kaio_jobqueue);
TAILQ_INIT(&ki->kaio_bufqueue);
TAILQ_INIT(&ki->kaio_liojoblist);
TAILQ_INIT(&ki->kaio_sockqueue);
TAILQ_INIT(&ki->kaio_syncqueue);
TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p);
PROC_LOCK(p);
if (p->p_aioinfo == NULL) {
p->p_aioinfo = ki;
PROC_UNLOCK(p);
} else {
PROC_UNLOCK(p);
mtx_destroy(&ki->kaio_mtx);
uma_zfree(kaio_zone, ki);
}
while (num_aio_procs < MIN(target_aio_procs, max_aio_procs))
aio_newproc(NULL);
}
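/*
* Deliver the completion signal described by sigev to the process; used
* for SIGEV_SIGNAL and SIGEV_THREAD_ID notification.
*/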
static int
aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi)
{
struct thread *td;
int error;
error = sigev_findtd(p, sigev, &td);
if (error)
return (error);
if (!KSI_ONQ(ksi)) {
ksiginfo_set_sigev(ksi, sigev);
ksi->ksi_code = SI_ASYNCIO;
ksi->ksi_flags |= KSI_EXT | KSI_INS;
tdsendsignal(p, td, ksi->ksi_signo, ksi);
}
PROC_UNLOCK(p);
return (error);
}
/*
* Free a job entry. Wait for completion if it is currently active, but don't
* delay forever. If we delay, we return a flag that says that we have to
* restart the queue scan.
*/
static int
aio_free_entry(struct aiocblist *aiocbe)
{
struct kaioinfo *ki;
struct aioliojob *lj;
struct proc *p;
p = aiocbe->userproc;
MPASS(curproc == p);
ki = p->p_aioinfo;
MPASS(ki != NULL);
AIO_LOCK_ASSERT(ki, MA_OWNED);
MPASS(aiocbe->jobstate == JOBST_JOBFINISHED);
atomic_subtract_int(&num_queue_count, 1);
ki->kaio_count--;
MPASS(ki->kaio_count >= 0);
TAILQ_REMOVE(&ki->kaio_done, aiocbe, plist);
TAILQ_REMOVE(&ki->kaio_all, aiocbe, allist);
lj = aiocbe->lio;
if (lj) {
lj->lioj_count--;
lj->lioj_finished_count--;
if (lj->lioj_count == 0) {
TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
/* lio is going away, we need to destroy any knotes */
knlist_delete(&lj->klist, curthread, 1);
PROC_LOCK(p);
sigqueue_take(&lj->lioj_ksi);
PROC_UNLOCK(p);
uma_zfree(aiolio_zone, lj);
}
}
/* aiocbe is going away, we need to destroy any knotes */
knlist_delete(&aiocbe->klist, curthread, 1);
PROC_LOCK(p);
sigqueue_take(&aiocbe->ksi);
PROC_UNLOCK(p);
MPASS(aiocbe->bp == NULL);
aiocbe->jobstate = JOBST_NULL;
AIO_UNLOCK(ki);
/*
* The thread argument here is used to find the owning process
* and is also passed to fo_close() which may pass it to various
* places such as devsw close() routines. Because of that, we
* need a thread pointer from the process owning the job that is
* persistent and won't disappear out from under us or move to
* another process.
*
* Currently, all the callers of this function call it to remove
* an aiocblist from the current process' job list either via a
* syscall or due to the current process calling exit() or
* execve(). Thus, we know that p == curproc. We also know that
* curthread can't exit since we are curthread.
*
* Therefore, we use curthread as the thread to pass to
* knlist_delete(). This does mean that it is possible for the
* thread pointer at close time to differ from the thread pointer
* at open time, but this is already true of file descriptors in
* a multithreaded process.
*/
fdrop(aiocbe->fd_file, curthread);
crfree(aiocbe->cred);
uma_zfree(aiocb_zone, aiocbe);
AIO_LOCK(ki);
return (0);
}
static void
aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp __unused)
{
aio_proc_rundown(arg, p);
}
/*
* Rundown the jobs for a given process.
*/
static void
aio_proc_rundown(void *arg, struct proc *p)
{
struct kaioinfo *ki;
struct aioliojob *lj;
struct aiocblist *cbe, *cbn;
struct file *fp;
struct socket *so;
int remove;
KASSERT(curthread->td_proc == p,
("%s: called on non-curproc", __func__));
ki = p->p_aioinfo;
if (ki == NULL)
return;
AIO_LOCK(ki);
ki->kaio_flags |= KAIO_RUNDOWN;
restart:
/*
* Try to cancel all pending requests. This code simulates
* aio_cancel on all pending I/O requests.
*/
TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) {
remove = 0;
mtx_lock(&aio_job_mtx);
if (cbe->jobstate == JOBST_JOBQGLOBAL) {
TAILQ_REMOVE(&aio_jobs, cbe, list);
remove = 1;
} else if (cbe->jobstate == JOBST_JOBQSOCK) {
fp = cbe->fd_file;
MPASS(fp->f_type == DTYPE_SOCKET);
so = fp->f_data;
TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
remove = 1;
} else if (cbe->jobstate == JOBST_JOBQSYNC) {
TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list);
remove = 1;
}
mtx_unlock(&aio_job_mtx);
if (remove) {
cbe->jobstate = JOBST_JOBFINISHED;
cbe->uaiocb._aiocb_private.status = -1;
cbe->uaiocb._aiocb_private.error = ECANCELED;
TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
aio_bio_done_notify(p, cbe, DONE_QUEUE);
}
}
/* Wait for all running I/O to be finished */
if (TAILQ_FIRST(&ki->kaio_bufqueue) ||
TAILQ_FIRST(&ki->kaio_jobqueue)) {
ki->kaio_flags |= KAIO_WAKEUP;
msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz);
goto restart;
}
/* Free all completed I/O requests. */
while ((cbe = TAILQ_FIRST(&ki->kaio_done)) != NULL)
aio_free_entry(cbe);
while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) {
if (lj->lioj_count == 0) {
TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
knlist_delete(&lj->klist, curthread, 1);
PROC_LOCK(p);
sigqueue_take(&lj->lioj_ksi);
PROC_UNLOCK(p);
uma_zfree(aiolio_zone, lj);
} else {
panic("LIO job not cleaned up: C:%d, FC:%d\n",
lj->lioj_count, lj->lioj_finished_count);
}
}
AIO_UNLOCK(ki);
taskqueue_drain(taskqueue_aiod_bio, &ki->kaio_task);
mtx_destroy(&ki->kaio_mtx);
uma_zfree(kaio_zone, ki);
p->p_aioinfo = NULL;
}
/*
* Select a job to run (called by an AIO daemon).
*/
static struct aiocblist *
aio_selectjob(struct aiothreadlist *aiop)
{
struct aiocblist *aiocbe;
struct kaioinfo *ki;
struct proc *userp;
mtx_assert(&aio_job_mtx, MA_OWNED);
TAILQ_FOREACH(aiocbe, &aio_jobs, list) {
userp = aiocbe->userproc;
ki = userp->p_aioinfo;
if (ki->kaio_active_count < ki->kaio_maxactive_count) {
TAILQ_REMOVE(&aio_jobs, aiocbe, list);
/* Account for currently active jobs. */
ki->kaio_active_count++;
aiocbe->jobstate = JOBST_JOBRUNNING;
break;
}
}
return (aiocbe);
}
/*
* Move all data to a permanent storage device; this code
* simulates the fsync syscall.
*/
static int
aio_fsync_vnode(struct thread *td, struct vnode *vp)
{
struct mount *mp;
int vfslocked;
int error;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
goto drop;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
if (vp->v_object != NULL) {
VM_OBJECT_LOCK(vp->v_object);
vm_object_page_clean(vp->v_object, 0, 0, 0);
VM_OBJECT_UNLOCK(vp->v_object);
}
error = VOP_FSYNC(vp, MNT_WAIT, td);
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
drop:
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* The AIO processing activity.  This is the code that handles the I/O
* request for the non-physio version of the operations.  The normal file
* operations are used, so this code should work for every type of file,
* including pipes, sockets, fifos, and regular files.
*
* XXX I don't think it works well for sockets, pipes, and fifos.
*/
static void
aio_process(struct aiocblist *aiocbe)
{
struct ucred *td_savedcred;
struct thread *td;
struct aiocb *cb;
struct file *fp;
struct socket *so;
struct uio auio;
struct iovec aiov;
int cnt;
int error;
int oublock_st, oublock_end;
int inblock_st, inblock_end;
td = curthread;
td_savedcred = td->td_ucred;
td->td_ucred = aiocbe->cred;
cb = &aiocbe->uaiocb;
fp = aiocbe->fd_file;
if (cb->aio_lio_opcode == LIO_SYNC) {
error = 0;
cnt = 0;
if (fp->f_vnode != NULL)
error = aio_fsync_vnode(td, fp->f_vnode);
cb->_aiocb_private.error = error;
cb->_aiocb_private.status = 0;
td->td_ucred = td_savedcred;
return;
}
aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
aiov.iov_len = cb->aio_nbytes;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_offset = cb->aio_offset;
auio.uio_resid = cb->aio_nbytes;
cnt = cb->aio_nbytes;
auio.uio_segflg = UIO_USERSPACE;
auio.uio_td = td;
inblock_st = td->td_ru.ru_inblock;
oublock_st = td->td_ru.ru_oublock;
/*
* aio_aqueue() acquires a reference to the file that is
* released in aio_free_entry().
*/
if (cb->aio_lio_opcode == LIO_READ) {
auio.uio_rw = UIO_READ;
if (auio.uio_resid == 0)
error = 0;
else
error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td);
} else {
if (fp->f_type == DTYPE_VNODE)
bwillwrite();
auio.uio_rw = UIO_WRITE;
error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td);
}
inblock_end = td->td_ru.ru_inblock;
oublock_end = td->td_ru.ru_oublock;
aiocbe->inputcharge = inblock_end - inblock_st;
aiocbe->outputcharge = oublock_end - oublock_st;
if ((error) && (auio.uio_resid != cnt)) {
if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
error = 0;
if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) {
int sigpipe = 1;
if (fp->f_type == DTYPE_SOCKET) {
so = fp->f_data;
if (so->so_options & SO_NOSIGPIPE)
sigpipe = 0;
}
if (sigpipe) {
PROC_LOCK(aiocbe->userproc);
- psignal(aiocbe->userproc, SIGPIPE);
+ kern_psignal(aiocbe->userproc, SIGPIPE);
PROC_UNLOCK(aiocbe->userproc);
}
}
}
cnt -= auio.uio_resid;
cb->_aiocb_private.error = error;
cb->_aiocb_private.status = cnt;
td->td_ucred = td_savedcred;
}
static void
aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type)
{
struct aioliojob *lj;
struct kaioinfo *ki;
struct aiocblist *scb, *scbn;
int lj_done;
ki = userp->p_aioinfo;
AIO_LOCK_ASSERT(ki, MA_OWNED);
lj = aiocbe->lio;
lj_done = 0;
if (lj) {
lj->lioj_finished_count++;
if (lj->lioj_count == lj->lioj_finished_count)
lj_done = 1;
}
if (type == DONE_QUEUE) {
aiocbe->jobflags |= AIOCBLIST_DONE;
} else {
aiocbe->jobflags |= AIOCBLIST_BUFDONE;
}
TAILQ_INSERT_TAIL(&ki->kaio_done, aiocbe, plist);
aiocbe->jobstate = JOBST_JOBFINISHED;
if (ki->kaio_flags & KAIO_RUNDOWN)
goto notification_done;
if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID)
aio_sendsig(userp, &aiocbe->uaiocb.aio_sigevent, &aiocbe->ksi);
KNOTE_LOCKED(&aiocbe->klist, 1);
if (lj_done) {
if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
lj->lioj_flags |= LIOJ_KEVENT_POSTED;
KNOTE_LOCKED(&lj->klist, 1);
}
if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
== LIOJ_SIGNAL
&& (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi);
lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
}
}
notification_done:
if (aiocbe->jobflags & AIOCBLIST_CHECKSYNC) {
TAILQ_FOREACH_SAFE(scb, &ki->kaio_syncqueue, list, scbn) {
if (aiocbe->fd_file == scb->fd_file &&
aiocbe->seqno < scb->seqno) {
if (--scb->pending == 0) {
mtx_lock(&aio_job_mtx);
scb->jobstate = JOBST_JOBQGLOBAL;
TAILQ_REMOVE(&ki->kaio_syncqueue, scb, list);
TAILQ_INSERT_TAIL(&aio_jobs, scb, list);
aio_kick_nowait(userp);
mtx_unlock(&aio_job_mtx);
}
}
}
}
if (ki->kaio_flags & KAIO_WAKEUP) {
ki->kaio_flags &= ~KAIO_WAKEUP;
wakeup(&userp->p_aioinfo);
}
}
/*
* The AIO daemon.  Most of the actual work is done in aio_process(),
* but the setup (and address space management) is done in this routine.
*/
static void
aio_daemon(void *_id)
{
struct aiocblist *aiocbe;
struct aiothreadlist *aiop;
struct kaioinfo *ki;
struct proc *curcp, *mycp, *userp;
struct vmspace *myvm, *tmpvm;
struct thread *td = curthread;
int id = (intptr_t)_id;
/*
* Local copies of curproc (mycp) and vmspace (myvm).
*/
mycp = td->td_proc;
myvm = mycp->p_vmspace;
KASSERT(mycp->p_textvp == NULL, ("kthread has a textvp"));
/*
* Allocate and ready the aio control info. There is one aiop structure
* per daemon.
*/
aiop = uma_zalloc(aiop_zone, M_WAITOK);
aiop->aiothread = td;
aiop->aiothreadflags = 0;
/* The daemon resides in its own pgrp. */
- setsid(td, NULL);
+ sys_setsid(td, NULL);
/*
* Wake up the parent process.  (The parent sleeps to keep from blasting away
* and creating too many daemons.)
*/
sema_post(&aio_newproc_sem);
mtx_lock(&aio_job_mtx);
for (;;) {
/*
* curcp is the current daemon process context.
* userp is the current user process context.
*/
curcp = mycp;
/*
* Take daemon off of free queue
*/
if (aiop->aiothreadflags & AIOP_FREE) {
TAILQ_REMOVE(&aio_freeproc, aiop, list);
aiop->aiothreadflags &= ~AIOP_FREE;
}
/*
* Check for jobs.
*/
while ((aiocbe = aio_selectjob(aiop)) != NULL) {
mtx_unlock(&aio_job_mtx);
userp = aiocbe->userproc;
/*
* Connect to process address space for user program.
*/
if (userp != curcp) {
/*
* Save the current address space that we are
* connected to.
*/
tmpvm = mycp->p_vmspace;
/*
* Point to the new user address space and take a
* reference on it.
*/
mycp->p_vmspace = userp->p_vmspace;
atomic_add_int(&mycp->p_vmspace->vm_refcnt, 1);
/* Activate the new mapping. */
pmap_activate(FIRST_THREAD_IN_PROC(mycp));
/*
* If the old address space wasn't the daemon's
* own address space, then we need to remove the
* daemon's reference from the other process
* that it was acting on behalf of.
*/
if (tmpvm != myvm) {
vmspace_free(tmpvm);
}
curcp = userp;
}
ki = userp->p_aioinfo;
/* Do the I/O function. */
aio_process(aiocbe);
mtx_lock(&aio_job_mtx);
/* Decrement the active job count. */
ki->kaio_active_count--;
mtx_unlock(&aio_job_mtx);
AIO_LOCK(ki);
TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
aio_bio_done_notify(userp, aiocbe, DONE_QUEUE);
AIO_UNLOCK(ki);
mtx_lock(&aio_job_mtx);
}
/*
* Disconnect from user address space.
*/
if (curcp != mycp) {
mtx_unlock(&aio_job_mtx);
/* Get the user address space to disconnect from. */
tmpvm = mycp->p_vmspace;
/* Get original address space for daemon. */
mycp->p_vmspace = myvm;
/* Activate the daemon's address space. */
pmap_activate(FIRST_THREAD_IN_PROC(mycp));
#ifdef DIAGNOSTIC
if (tmpvm == myvm) {
printf("AIOD: vmspace problem -- %d\n",
mycp->p_pid);
}
#endif
/* Remove our vmspace reference. */
vmspace_free(tmpvm);
curcp = mycp;
mtx_lock(&aio_job_mtx);
/*
* We have to restart to avoid a race; we only sleep if
* no job can be selected, which should only happen when
* curcp == mycp.
*/
continue;
}
mtx_assert(&aio_job_mtx, MA_OWNED);
TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
aiop->aiothreadflags |= AIOP_FREE;
/*
* If daemon is inactive for a long time, allow it to exit,
* thereby freeing resources.
*/
if (msleep(aiop->aiothread, &aio_job_mtx, PRIBIO, "aiordy",
aiod_lifetime)) {
if (TAILQ_EMPTY(&aio_jobs)) {
if ((aiop->aiothreadflags & AIOP_FREE) &&
(num_aio_procs > target_aio_procs)) {
TAILQ_REMOVE(&aio_freeproc, aiop, list);
num_aio_procs--;
mtx_unlock(&aio_job_mtx);
uma_zfree(aiop_zone, aiop);
free_unr(aiod_unr, id);
#ifdef DIAGNOSTIC
if (mycp->p_vmspace->vm_refcnt <= 1) {
printf("AIOD: bad vm refcnt for"
" exiting daemon: %d\n",
mycp->p_vmspace->vm_refcnt);
}
#endif
kproc_exit(0);
}
}
}
}
mtx_unlock(&aio_job_mtx);
panic("shouldn't be here\n");
}
/*
* Create a new AIO daemon. This is mostly a kernel-thread fork routine. The
* AIO daemon modifies its environment itself.
*/
static int
aio_newproc(int *start)
{
int error;
struct proc *p;
int id;
id = alloc_unr(aiod_unr);
error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p,
RFNOWAIT, 0, "aiod%d", id);
if (error == 0) {
/*
* Wait until daemon is started.
*/
sema_wait(&aio_newproc_sem);
mtx_lock(&aio_job_mtx);
num_aio_procs++;
if (start != NULL)
(*start)--;
mtx_unlock(&aio_job_mtx);
} else {
free_unr(aiod_unr, id);
}
return (error);
}
/*
* Try the high-performance, low-overhead physio method for eligible
* VCHR devices. This method doesn't use an aio helper thread, and
* thus has very low overhead.
*
* Assumes that the caller, aio_aqueue(), has incremented the file
* structure's reference count, preventing its deallocation for the
* duration of this call.
*/
static int
aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
{
struct aiocb *cb;
struct file *fp;
struct buf *bp;
struct vnode *vp;
struct kaioinfo *ki;
struct aioliojob *lj;
int error;
cb = &aiocbe->uaiocb;
fp = aiocbe->fd_file;
if (fp->f_type != DTYPE_VNODE)
return (-1);
vp = fp->f_vnode;
/*
* If it's not a disk, we don't want to return a positive error.
* Doing so would keep the aio code from falling through to try
* the threaded path when the target is a regular file.
*/
if (!vn_isdisk(vp, &error)) {
if (error == ENOTBLK)
return (-1);
else
return (error);
}
if (vp->v_bufobj.bo_bsize == 0)
return (-1);
if (cb->aio_nbytes % vp->v_bufobj.bo_bsize)
return (-1);
if (cb->aio_nbytes > vp->v_rdev->si_iosize_max)
return (-1);
if (cb->aio_nbytes >
MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK))
return (-1);
ki = p->p_aioinfo;
if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
return (-1);
/* Create and build a buffer header for a transfer. */
bp = (struct buf *)getpbuf(NULL);
BUF_KERNPROC(bp);
AIO_LOCK(ki);
ki->kaio_count++;
ki->kaio_buffer_count++;
lj = aiocbe->lio;
if (lj)
lj->lioj_count++;
AIO_UNLOCK(ki);
/*
* Get a copy of the kva from the physical buffer.
*/
error = 0;
bp->b_bcount = cb->aio_nbytes;
bp->b_bufsize = cb->aio_nbytes;
bp->b_iodone = aio_physwakeup;
bp->b_saveaddr = bp->b_data;
bp->b_data = (void *)(uintptr_t)cb->aio_buf;
bp->b_offset = cb->aio_offset;
bp->b_iooffset = cb->aio_offset;
bp->b_blkno = btodb(cb->aio_offset);
bp->b_iocmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ;
/*
* Bring buffer into kernel space.
*/
if (vmapbuf(bp) < 0) {
error = EFAULT;
goto doerror;
}
AIO_LOCK(ki);
aiocbe->bp = bp;
bp->b_caller1 = (void *)aiocbe;
TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
aiocbe->jobstate = JOBST_JOBQBUF;
cb->_aiocb_private.status = cb->aio_nbytes;
AIO_UNLOCK(ki);
atomic_add_int(&num_queue_count, 1);
atomic_add_int(&num_buf_aio, 1);
bp->b_error = 0;
TASK_INIT(&aiocbe->biotask, 0, biohelper, aiocbe);
/* Perform transfer. */
dev_strategy(vp->v_rdev, bp);
return (0);
doerror:
AIO_LOCK(ki);
ki->kaio_count--;
ki->kaio_buffer_count--;
if (lj)
lj->lioj_count--;
aiocbe->bp = NULL;
AIO_UNLOCK(ki);
relpbuf(bp, NULL);
return (error);
}
/*
* Wake up aio requests that may be serviceable now.
*/
static void
aio_swake_cb(struct socket *so, struct sockbuf *sb)
{
struct aiocblist *cb, *cbn;
int opcode;
SOCKBUF_LOCK_ASSERT(sb);
if (sb == &so->so_snd)
opcode = LIO_WRITE;
else
opcode = LIO_READ;
sb->sb_flags &= ~SB_AIO;
mtx_lock(&aio_job_mtx);
TAILQ_FOREACH_SAFE(cb, &so->so_aiojobq, list, cbn) {
if (opcode == cb->uaiocb.aio_lio_opcode) {
if (cb->jobstate != JOBST_JOBQSOCK)
panic("invalid queue value");
/* XXX
* We don't have an actual socket backend yet,
* so we simply move the requests to the generic
* file I/O backend.
*/
TAILQ_REMOVE(&so->so_aiojobq, cb, list);
TAILQ_INSERT_TAIL(&aio_jobs, cb, list);
aio_kick_nowait(cb->userproc);
}
}
mtx_unlock(&aio_job_mtx);
}
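/*
* Convert an old-style osigevent into the current struct sigevent layout.
*/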
static int
convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig)
{
/*
* Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
* supported by AIO with the old sigevent structure.
*/
nsig->sigev_notify = osig->sigev_notify;
switch (nsig->sigev_notify) {
case SIGEV_NONE:
break;
case SIGEV_SIGNAL:
nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
break;
case SIGEV_KEVENT:
nsig->sigev_notify_kqueue =
osig->__sigev_u.__sigev_notify_kqueue;
nsig->sigev_value.sival_ptr = osig->sigev_value.sival_ptr;
break;
default:
return (EINVAL);
}
return (0);
}
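/*
* Copy in an aiocb that uses the old osigevent layout and convert its
* sigevent to the current format.
*/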
static int
aiocb_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
{
struct oaiocb *ojob;
int error;
bzero(kjob, sizeof(struct aiocb));
error = copyin(ujob, kjob, sizeof(struct oaiocb));
if (error)
return (error);
ojob = (struct oaiocb *)kjob;
return (convert_old_sigevent(&ojob->aio_sigevent, &kjob->aio_sigevent));
}
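/*
* Native-ABI aiocb_ops methods: the user control block is accessed
* directly with copyin(), fuword(), and suword().
*/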
static int
aiocb_copyin(struct aiocb *ujob, struct aiocb *kjob)
{
return (copyin(ujob, kjob, sizeof(struct aiocb)));
}
static long
aiocb_fetch_status(struct aiocb *ujob)
{
return (fuword(&ujob->_aiocb_private.status));
}
static long
aiocb_fetch_error(struct aiocb *ujob)
{
return (fuword(&ujob->_aiocb_private.error));
}
static int
aiocb_store_status(struct aiocb *ujob, long status)
{
return (suword(&ujob->_aiocb_private.status, status));
}
static int
aiocb_store_error(struct aiocb *ujob, long error)
{
return (suword(&ujob->_aiocb_private.error, error));
}
static int
aiocb_store_kernelinfo(struct aiocb *ujob, long jobref)
{
return (suword(&ujob->_aiocb_private.kernelinfo, jobref));
}
static int
aiocb_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
{
return (suword(ujobp, (long)ujob));
}
static struct aiocb_ops aiocb_ops = {
.copyin = aiocb_copyin,
.fetch_status = aiocb_fetch_status,
.fetch_error = aiocb_fetch_error,
.store_status = aiocb_store_status,
.store_error = aiocb_store_error,
.store_kernelinfo = aiocb_store_kernelinfo,
.store_aiocb = aiocb_store_aiocb,
};
static struct aiocb_ops aiocb_ops_osigevent = {
.copyin = aiocb_copyin_old_sigevent,
.fetch_status = aiocb_fetch_status,
.fetch_error = aiocb_fetch_error,
.store_status = aiocb_store_status,
.store_error = aiocb_store_error,
.store_kernelinfo = aiocb_store_kernelinfo,
.store_aiocb = aiocb_store_aiocb,
};
/*
* Queue a new AIO request.  The choice between the threaded and the direct
* physio (VCHR) technique is made in this code.
*/
int
aio_aqueue(struct thread *td, struct aiocb *job, struct aioliojob *lj,
int type, struct aiocb_ops *ops)
{
struct proc *p = td->td_proc;
struct file *fp;
struct socket *so;
struct aiocblist *aiocbe, *cb;
struct kaioinfo *ki;
struct kevent kev;
struct sockbuf *sb;
int opcode;
int error;
int fd, kqfd;
int jid;
if (p->p_aioinfo == NULL)
aio_init_aioinfo(p);
ki = p->p_aioinfo;
ops->store_status(job, -1);
ops->store_error(job, 0);
ops->store_kernelinfo(job, -1);
if (num_queue_count >= max_queue_count ||
ki->kaio_count >= ki->kaio_qallowed_count) {
ops->store_error(job, EAGAIN);
return (EAGAIN);
}
aiocbe = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO);
aiocbe->inputcharge = 0;
aiocbe->outputcharge = 0;
knlist_init_mtx(&aiocbe->klist, AIO_MTX(ki));
error = ops->copyin(job, &aiocbe->uaiocb);
if (error) {
ops->store_error(job, error);
uma_zfree(aiocb_zone, aiocbe);
return (error);
}
if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT &&
aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL &&
aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID &&
aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) {
ops->store_error(job, EINVAL);
uma_zfree(aiocb_zone, aiocbe);
return (EINVAL);
}
if ((aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) &&
!_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) {
uma_zfree(aiocb_zone, aiocbe);
return (EINVAL);
}
ksiginfo_init(&aiocbe->ksi);
/* Save userspace address of the job info. */
aiocbe->uuaiocb = job;
/* Get the opcode. */
if (type != LIO_NOP)
aiocbe->uaiocb.aio_lio_opcode = type;
opcode = aiocbe->uaiocb.aio_lio_opcode;
/*
* Validate the opcode and fetch the file object for the specified
* file descriptor.
*
* XXXRW: Moved the opcode validation up here so that we don't
* retrieve a file descriptor without knowing what the capability
* should be.
*/
fd = aiocbe->uaiocb.aio_fildes;
switch (opcode) {
case LIO_WRITE:
error = fget_write(td, fd, CAP_WRITE | CAP_SEEK, &fp);
break;
case LIO_READ:
error = fget_read(td, fd, CAP_READ | CAP_SEEK, &fp);
break;
case LIO_SYNC:
error = fget(td, fd, CAP_FSYNC, &fp);
break;
case LIO_NOP:
error = fget(td, fd, 0, &fp);
break;
default:
error = EINVAL;
}
if (error) {
uma_zfree(aiocb_zone, aiocbe);
ops->store_error(job, error);
return (error);
}
if (opcode == LIO_SYNC && fp->f_vnode == NULL) {
error = EINVAL;
goto aqueue_fail;
}
if (opcode != LIO_SYNC && aiocbe->uaiocb.aio_offset == -1LL) {
error = EINVAL;
goto aqueue_fail;
}
aiocbe->fd_file = fp;
mtx_lock(&aio_job_mtx);
jid = jobrefid++;
aiocbe->seqno = jobseqno++;
mtx_unlock(&aio_job_mtx);
error = ops->store_kernelinfo(job, jid);
if (error) {
error = EINVAL;
goto aqueue_fail;
}
aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid;
if (opcode == LIO_NOP) {
fdrop(fp, td);
uma_zfree(aiocb_zone, aiocbe);
return (0);
}
if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT)
goto no_kqueue;
kqfd = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
kev.ident = (uintptr_t)aiocbe->uuaiocb;
kev.filter = EVFILT_AIO;
kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
kev.data = (intptr_t)aiocbe;
kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sival_ptr;
error = kqfd_register(kqfd, &kev, td, 1);
aqueue_fail:
if (error) {
fdrop(fp, td);
uma_zfree(aiocb_zone, aiocbe);
ops->store_error(job, error);
goto done;
}
no_kqueue:
ops->store_error(job, EINPROGRESS);
aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
aiocbe->userproc = p;
aiocbe->cred = crhold(td->td_ucred);
aiocbe->jobflags = 0;
aiocbe->lio = lj;
if (opcode == LIO_SYNC)
goto queueit;
if (fp->f_type == DTYPE_SOCKET) {
/*
* Alternate queueing for socket ops: Reach down into the
* descriptor to get the socket data. Then check to see if the
* socket is ready to be read or written (based on the requested
* operation).
*
* If it is not ready for I/O, then queue the aiocbe on the
* socket, and set the flags so we get a call when sbnotify()
* happens.
*
* Note if opcode is neither LIO_WRITE nor LIO_READ we lock
* and unlock the snd sockbuf for no reason.
*/
so = fp->f_data;
sb = (opcode == LIO_READ) ? &so->so_rcv : &so->so_snd;
SOCKBUF_LOCK(sb);
if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode ==
LIO_WRITE) && (!sowriteable(so)))) {
sb->sb_flags |= SB_AIO;
mtx_lock(&aio_job_mtx);
TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
mtx_unlock(&aio_job_mtx);
AIO_LOCK(ki);
TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
aiocbe->jobstate = JOBST_JOBQSOCK;
ki->kaio_count++;
if (lj)
lj->lioj_count++;
AIO_UNLOCK(ki);
SOCKBUF_UNLOCK(sb);
atomic_add_int(&num_queue_count, 1);
error = 0;
goto done;
}
SOCKBUF_UNLOCK(sb);
}
if ((error = aio_qphysio(p, aiocbe)) == 0)
goto done;
#if 0
if (error > 0) {
aiocbe->uaiocb._aiocb_private.error = error;
ops->store_error(job, error);
goto done;
}
#endif
queueit:
/* No buffer for daemon I/O. */
aiocbe->bp = NULL;
atomic_add_int(&num_queue_count, 1);
AIO_LOCK(ki);
ki->kaio_count++;
if (lj)
lj->lioj_count++;
TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
if (opcode == LIO_SYNC) {
TAILQ_FOREACH(cb, &ki->kaio_jobqueue, plist) {
if (cb->fd_file == aiocbe->fd_file &&
cb->uaiocb.aio_lio_opcode != LIO_SYNC &&
cb->seqno < aiocbe->seqno) {
cb->jobflags |= AIOCBLIST_CHECKSYNC;
aiocbe->pending++;
}
}
TAILQ_FOREACH(cb, &ki->kaio_bufqueue, plist) {
if (cb->fd_file == aiocbe->fd_file &&
cb->uaiocb.aio_lio_opcode != LIO_SYNC &&
cb->seqno < aiocbe->seqno) {
cb->jobflags |= AIOCBLIST_CHECKSYNC;
aiocbe->pending++;
}
}
if (aiocbe->pending != 0) {
TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, aiocbe, list);
aiocbe->jobstate = JOBST_JOBQSYNC;
AIO_UNLOCK(ki);
goto done;
}
}
mtx_lock(&aio_job_mtx);
TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
aiocbe->jobstate = JOBST_JOBQGLOBAL;
aio_kick_nowait(p);
mtx_unlock(&aio_job_mtx);
AIO_UNLOCK(ki);
error = 0;
done:
return (error);
}
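/*
* Wake up an idle AIO daemon or, if allowed, schedule the per-process task
* to create a new one.  Called with aio_job_mtx held.
*/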
static void
aio_kick_nowait(struct proc *userp)
{
struct kaioinfo *ki = userp->p_aioinfo;
struct aiothreadlist *aiop;
mtx_assert(&aio_job_mtx, MA_OWNED);
if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
TAILQ_REMOVE(&aio_freeproc, aiop, list);
aiop->aiothreadflags &= ~AIOP_FREE;
wakeup(aiop->aiothread);
} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
((ki->kaio_active_count + num_aio_resv_start) <
ki->kaio_maxactive_count)) {
taskqueue_enqueue(taskqueue_aiod_bio, &ki->kaio_task);
}
}
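/*
* Like aio_kick_nowait(), but may drop aio_job_mtx temporarily to create a
* new daemon itself; returns non-zero if no daemon could be woken or created.
*/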
static int
aio_kick(struct proc *userp)
{
struct kaioinfo *ki = userp->p_aioinfo;
struct aiothreadlist *aiop;
int error, ret = 0;
mtx_assert(&aio_job_mtx, MA_OWNED);
retryproc:
if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
TAILQ_REMOVE(&aio_freeproc, aiop, list);
aiop->aiothreadflags &= ~AIOP_FREE;
wakeup(aiop->aiothread);
} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
((ki->kaio_active_count + num_aio_resv_start) <
ki->kaio_maxactive_count)) {
num_aio_resv_start++;
mtx_unlock(&aio_job_mtx);
error = aio_newproc(&num_aio_resv_start);
mtx_lock(&aio_job_mtx);
if (error) {
num_aio_resv_start--;
goto retryproc;
}
} else {
ret = -1;
}
return (ret);
}
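/*
* Taskqueue handler that kicks the AIO daemons on behalf of a user process.
*/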
static void
aio_kick_helper(void *context, int pending)
{
struct proc *userp = context;
mtx_lock(&aio_job_mtx);
while (--pending >= 0) {
if (aio_kick(userp))
break;
}
mtx_unlock(&aio_job_mtx);
}
/*
* Support the aio_return system call; as a side effect, kernel resources
* are released.
*/
static int
kern_aio_return(struct thread *td, struct aiocb *uaiocb, struct aiocb_ops *ops)
{
struct proc *p = td->td_proc;
struct aiocblist *cb;
struct kaioinfo *ki;
int status, error;
ki = p->p_aioinfo;
if (ki == NULL)
return (EINVAL);
AIO_LOCK(ki);
TAILQ_FOREACH(cb, &ki->kaio_done, plist) {
if (cb->uuaiocb == uaiocb)
break;
}
if (cb != NULL) {
MPASS(cb->jobstate == JOBST_JOBFINISHED);
status = cb->uaiocb._aiocb_private.status;
error = cb->uaiocb._aiocb_private.error;
td->td_retval[0] = status;
if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
td->td_ru.ru_oublock += cb->outputcharge;
cb->outputcharge = 0;
} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
td->td_ru.ru_inblock += cb->inputcharge;
cb->inputcharge = 0;
}
aio_free_entry(cb);
AIO_UNLOCK(ki);
ops->store_error(uaiocb, error);
ops->store_status(uaiocb, status);
} else {
error = EINVAL;
AIO_UNLOCK(ki);
}
return (error);
}
int
-aio_return(struct thread *td, struct aio_return_args *uap)
+sys_aio_return(struct thread *td, struct aio_return_args *uap)
{
return (kern_aio_return(td, uap->aiocbp, &aiocb_ops));
}
/*
* Allow a process to wake up when any of the I/O requests are completed.
*/
static int
kern_aio_suspend(struct thread *td, int njoblist, struct aiocb **ujoblist,
struct timespec *ts)
{
struct proc *p = td->td_proc;
struct timeval atv;
struct kaioinfo *ki;
struct aiocblist *cb, *cbfirst;
int error, i, timo;
timo = 0;
if (ts) {
if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)
return (EINVAL);
TIMESPEC_TO_TIMEVAL(&atv, ts);
if (itimerfix(&atv))
return (EINVAL);
timo = tvtohz(&atv);
}
ki = p->p_aioinfo;
if (ki == NULL)
return (EAGAIN);
if (njoblist == 0)
return (0);
AIO_LOCK(ki);
for (;;) {
cbfirst = NULL;
error = 0;
TAILQ_FOREACH(cb, &ki->kaio_all, allist) {
for (i = 0; i < njoblist; i++) {
if (cb->uuaiocb == ujoblist[i]) {
if (cbfirst == NULL)
cbfirst = cb;
if (cb->jobstate == JOBST_JOBFINISHED)
goto RETURN;
}
}
}
/* All tasks were finished. */
if (cbfirst == NULL)
break;
ki->kaio_flags |= KAIO_WAKEUP;
error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
"aiospn", timo);
if (error == ERESTART)
error = EINTR;
if (error)
break;
}
RETURN:
AIO_UNLOCK(ki);
return (error);
}
int
-aio_suspend(struct thread *td, struct aio_suspend_args *uap)
+sys_aio_suspend(struct thread *td, struct aio_suspend_args *uap)
{
struct timespec ts, *tsp;
struct aiocb **ujoblist;
int error;
if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX)
return (EINVAL);
if (uap->timeout) {
/* Get timespec struct. */
if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
return (error);
tsp = &ts;
} else
tsp = NULL;
ujoblist = uma_zalloc(aiol_zone, M_WAITOK);
error = copyin(uap->aiocbp, ujoblist, uap->nent * sizeof(ujoblist[0]));
if (error == 0)
error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
uma_zfree(aiol_zone, ujoblist);
return (error);
}
/*
* aio_cancel cancels any non-physio aio operations not currently in
* progress.
*/
int
-aio_cancel(struct thread *td, struct aio_cancel_args *uap)
+sys_aio_cancel(struct thread *td, struct aio_cancel_args *uap)
{
struct proc *p = td->td_proc;
struct kaioinfo *ki;
struct aiocblist *cbe, *cbn;
struct file *fp;
struct socket *so;
int error;
int remove;
int cancelled = 0;
int notcancelled = 0;
struct vnode *vp;
/* Lookup file object. */
error = fget(td, uap->fd, 0, &fp);
if (error)
return (error);
ki = p->p_aioinfo;
if (ki == NULL)
goto done;
if (fp->f_type == DTYPE_VNODE) {
vp = fp->f_vnode;
if (vn_isdisk(vp, &error)) {
fdrop(fp, td);
td->td_retval[0] = AIO_NOTCANCELED;
return (0);
}
}
AIO_LOCK(ki);
TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) {
if ((uap->fd == cbe->uaiocb.aio_fildes) &&
((uap->aiocbp == NULL) ||
(uap->aiocbp == cbe->uuaiocb))) {
remove = 0;
mtx_lock(&aio_job_mtx);
if (cbe->jobstate == JOBST_JOBQGLOBAL) {
TAILQ_REMOVE(&aio_jobs, cbe, list);
remove = 1;
} else if (cbe->jobstate == JOBST_JOBQSOCK) {
MPASS(fp->f_type == DTYPE_SOCKET);
so = fp->f_data;
TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
remove = 1;
} else if (cbe->jobstate == JOBST_JOBQSYNC) {
TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list);
remove = 1;
}
mtx_unlock(&aio_job_mtx);
if (remove) {
TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
cbe->uaiocb._aiocb_private.status = -1;
cbe->uaiocb._aiocb_private.error = ECANCELED;
aio_bio_done_notify(p, cbe, DONE_QUEUE);
cancelled++;
} else {
notcancelled++;
}
if (uap->aiocbp != NULL)
break;
}
}
AIO_UNLOCK(ki);
done:
fdrop(fp, td);
if (uap->aiocbp != NULL) {
if (cancelled) {
td->td_retval[0] = AIO_CANCELED;
return (0);
}
}
if (notcancelled) {
td->td_retval[0] = AIO_NOTCANCELED;
return (0);
}
if (cancelled) {
td->td_retval[0] = AIO_CANCELED;
return (0);
}
td->td_retval[0] = AIO_ALLDONE;
return (0);
}
/*
* aio_error is implemented at the kernel level for compatibility purposes
* only. For a user mode async implementation, it would be best to do it in
* a userland subroutine.
*/
static int
kern_aio_error(struct thread *td, struct aiocb *aiocbp, struct aiocb_ops *ops)
{
struct proc *p = td->td_proc;
struct aiocblist *cb;
struct kaioinfo *ki;
int status;
ki = p->p_aioinfo;
if (ki == NULL) {
td->td_retval[0] = EINVAL;
return (0);
}
AIO_LOCK(ki);
TAILQ_FOREACH(cb, &ki->kaio_all, allist) {
if (cb->uuaiocb == aiocbp) {
if (cb->jobstate == JOBST_JOBFINISHED)
td->td_retval[0] =
cb->uaiocb._aiocb_private.error;
else
td->td_retval[0] = EINPROGRESS;
AIO_UNLOCK(ki);
return (0);
}
}
AIO_UNLOCK(ki);
/*
* Hack for failure of aio_aqueue.
*/
status = ops->fetch_status(aiocbp);
if (status == -1) {
td->td_retval[0] = ops->fetch_error(aiocbp);
return (0);
}
td->td_retval[0] = EINVAL;
return (0);
}
int
-aio_error(struct thread *td, struct aio_error_args *uap)
+sys_aio_error(struct thread *td, struct aio_error_args *uap)
{
return (kern_aio_error(td, uap->aiocbp, &aiocb_ops));
}
/* syscall - asynchronous read from a file (REALTIME) */
int
-oaio_read(struct thread *td, struct oaio_read_args *uap)
+sys_oaio_read(struct thread *td, struct oaio_read_args *uap)
{
return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
&aiocb_ops_osigevent));
}
int
-aio_read(struct thread *td, struct aio_read_args *uap)
+sys_aio_read(struct thread *td, struct aio_read_args *uap)
{
return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, &aiocb_ops));
}
/* syscall - asynchronous write to a file (REALTIME) */
int
-oaio_write(struct thread *td, struct oaio_write_args *uap)
+sys_oaio_write(struct thread *td, struct oaio_write_args *uap)
{
return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
&aiocb_ops_osigevent));
}
int
-aio_write(struct thread *td, struct aio_write_args *uap)
+sys_aio_write(struct thread *td, struct aio_write_args *uap)
{
return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops));
}
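/*
* Common implementation for lio_listio() and olio_listio(): queue up to nent
* requests as one lio job and either wait for them (LIO_WAIT) or arrange
* signal/kevent notification (LIO_NOWAIT).
*/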
static int
kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list,
struct aiocb **acb_list, int nent, struct sigevent *sig,
struct aiocb_ops *ops)
{
struct proc *p = td->td_proc;
struct aiocb *iocb;
struct kaioinfo *ki;
struct aioliojob *lj;
struct kevent kev;
int error;
int nerror;
int i;
if ((mode != LIO_NOWAIT) && (mode != LIO_WAIT))
return (EINVAL);
if (nent < 0 || nent > AIO_LISTIO_MAX)
return (EINVAL);
if (p->p_aioinfo == NULL)
aio_init_aioinfo(p);
ki = p->p_aioinfo;
lj = uma_zalloc(aiolio_zone, M_WAITOK);
lj->lioj_flags = 0;
lj->lioj_count = 0;
lj->lioj_finished_count = 0;
knlist_init_mtx(&lj->klist, AIO_MTX(ki));
ksiginfo_init(&lj->lioj_ksi);
/*
* Set up the signal.
*/
if (sig && (mode == LIO_NOWAIT)) {
bcopy(sig, &lj->lioj_signal, sizeof(lj->lioj_signal));
if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
/* Assume only new style KEVENT */
kev.filter = EVFILT_LIO;
kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
kev.ident = (uintptr_t)uacb_list; /* something unique */
kev.data = (intptr_t)lj;
/* pass user defined sigval data */
kev.udata = lj->lioj_signal.sigev_value.sival_ptr;
error = kqfd_register(
lj->lioj_signal.sigev_notify_kqueue, &kev, td, 1);
if (error) {
uma_zfree(aiolio_zone, lj);
return (error);
}
} else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) {
;
} else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) {
if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
uma_zfree(aiolio_zone, lj);
return EINVAL;
}
lj->lioj_flags |= LIOJ_SIGNAL;
} else {
uma_zfree(aiolio_zone, lj);
return EINVAL;
}
}
AIO_LOCK(ki);
TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
/*
* Add an extra reference to the lio count so that the lio cannot be
* freed by other threads doing aio_waitcomplete() or aio_return(),
* and so that no event is sent until we have queued all of the
* requests.
*/
lj->lioj_count = 1;
AIO_UNLOCK(ki);
/*
* Get pointers to the list of I/O requests.
*/
nerror = 0;
for (i = 0; i < nent; i++) {
iocb = acb_list[i];
if (iocb != NULL) {
error = aio_aqueue(td, iocb, lj, LIO_NOP, ops);
if (error != 0)
nerror++;
}
}
error = 0;
AIO_LOCK(ki);
if (mode == LIO_WAIT) {
while (lj->lioj_count - 1 != lj->lioj_finished_count) {
ki->kaio_flags |= KAIO_WAKEUP;
error = msleep(&p->p_aioinfo, AIO_MTX(ki),
PRIBIO | PCATCH, "aiospn", 0);
if (error == ERESTART)
error = EINTR;
if (error)
break;
}
} else {
if (lj->lioj_count - 1 == lj->lioj_finished_count) {
if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
lj->lioj_flags |= LIOJ_KEVENT_POSTED;
KNOTE_LOCKED(&lj->klist, 1);
}
if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
== LIOJ_SIGNAL
&& (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
aio_sendsig(p, &lj->lioj_signal,
&lj->lioj_ksi);
lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
}
}
}
lj->lioj_count--;
if (lj->lioj_count == 0) {
TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
knlist_delete(&lj->klist, curthread, 1);
PROC_LOCK(p);
sigqueue_take(&lj->lioj_ksi);
PROC_UNLOCK(p);
AIO_UNLOCK(ki);
uma_zfree(aiolio_zone, lj);
} else
AIO_UNLOCK(ki);
if (nerror)
return (EIO);
return (error);
}
/* syscall - list directed I/O (REALTIME) */
int
-olio_listio(struct thread *td, struct olio_listio_args *uap)
+sys_olio_listio(struct thread *td, struct olio_listio_args *uap)
{
struct aiocb **acb_list;
struct sigevent *sigp, sig;
struct osigevent osig;
int error, nent;
if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
return (EINVAL);
nent = uap->nent;
if (nent < 0 || nent > AIO_LISTIO_MAX)
return (EINVAL);
if (uap->sig && (uap->mode == LIO_NOWAIT)) {
error = copyin(uap->sig, &osig, sizeof(osig));
if (error)
return (error);
error = convert_old_sigevent(&osig, &sig);
if (error)
return (error);
sigp = &sig;
} else
sigp = NULL;
acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
if (error == 0)
error = kern_lio_listio(td, uap->mode,
(struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
&aiocb_ops_osigevent);
free(acb_list, M_LIO);
return (error);
}
/* syscall - list directed I/O (REALTIME) */
int
-lio_listio(struct thread *td, struct lio_listio_args *uap)
+sys_lio_listio(struct thread *td, struct lio_listio_args *uap)
{
struct aiocb **acb_list;
struct sigevent *sigp, sig;
int error, nent;
if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
return (EINVAL);
nent = uap->nent;
if (nent < 0 || nent > AIO_LISTIO_MAX)
return (EINVAL);
if (uap->sig && (uap->mode == LIO_NOWAIT)) {
error = copyin(uap->sig, &sig, sizeof(sig));
if (error)
return (error);
sigp = &sig;
} else
sigp = NULL;
acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
if (error == 0)
error = kern_lio_listio(td, uap->mode, uap->acb_list, acb_list,
nent, sigp, &aiocb_ops);
free(acb_list, M_LIO);
return (error);
}
/*
* Called from the interrupt thread for physio.  We should return as fast
* as possible, so we schedule a biohelper task.
*/
static void
aio_physwakeup(struct buf *bp)
{
struct aiocblist *aiocbe;
aiocbe = (struct aiocblist *)bp->b_caller1;
taskqueue_enqueue(taskqueue_aiod_bio, &aiocbe->biotask);
}
/*
* Task routine to perform the heavy work: completion handling, process
* wakeups, and signals.
*/
static void
biohelper(void *context, int pending)
{
struct aiocblist *aiocbe = context;
struct buf *bp;
struct proc *userp;
struct kaioinfo *ki;
int nblks;
bp = aiocbe->bp;
userp = aiocbe->userproc;
ki = userp->p_aioinfo;
AIO_LOCK(ki);
aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
aiocbe->uaiocb._aiocb_private.error = 0;
if (bp->b_ioflags & BIO_ERROR)
aiocbe->uaiocb._aiocb_private.error = bp->b_error;
nblks = btodb(aiocbe->uaiocb.aio_nbytes);
if (aiocbe->uaiocb.aio_lio_opcode == LIO_WRITE)
aiocbe->outputcharge += nblks;
else
aiocbe->inputcharge += nblks;
aiocbe->bp = NULL;
TAILQ_REMOVE(&userp->p_aioinfo->kaio_bufqueue, aiocbe, plist);
ki->kaio_buffer_count--;
aio_bio_done_notify(userp, aiocbe, DONE_BUF);
AIO_UNLOCK(ki);
/* Release mapping into kernel space. */
vunmapbuf(bp);
relpbuf(bp, NULL);
atomic_subtract_int(&num_buf_aio, 1);
}
/* syscall - wait for the next completion of an aio request */
static int
kern_aio_waitcomplete(struct thread *td, struct aiocb **aiocbp,
struct timespec *ts, struct aiocb_ops *ops)
{
struct proc *p = td->td_proc;
struct timeval atv;
struct kaioinfo *ki;
struct aiocblist *cb;
struct aiocb *uuaiocb;
int error, status, timo;
ops->store_aiocb(aiocbp, NULL);
timo = 0;
if (ts) {
if ((ts->tv_nsec < 0) || (ts->tv_nsec >= 1000000000))
return (EINVAL);
TIMESPEC_TO_TIMEVAL(&atv, ts);
if (itimerfix(&atv))
return (EINVAL);
timo = tvtohz(&atv);
}
if (p->p_aioinfo == NULL)
aio_init_aioinfo(p);
ki = p->p_aioinfo;
error = 0;
cb = NULL;
AIO_LOCK(ki);
while ((cb = TAILQ_FIRST(&ki->kaio_done)) == NULL) {
ki->kaio_flags |= KAIO_WAKEUP;
error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
"aiowc", timo);
if (timo && error == ERESTART)
error = EINTR;
if (error)
break;
}
if (cb != NULL) {
MPASS(cb->jobstate == JOBST_JOBFINISHED);
uuaiocb = cb->uuaiocb;
status = cb->uaiocb._aiocb_private.status;
error = cb->uaiocb._aiocb_private.error;
td->td_retval[0] = status;
if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
td->td_ru.ru_oublock += cb->outputcharge;
cb->outputcharge = 0;
} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
td->td_ru.ru_inblock += cb->inputcharge;
cb->inputcharge = 0;
}
aio_free_entry(cb);
AIO_UNLOCK(ki);
ops->store_aiocb(aiocbp, uuaiocb);
ops->store_error(uuaiocb, error);
ops->store_status(uuaiocb, status);
} else
AIO_UNLOCK(ki);
return (error);
}
int
-aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap)
+sys_aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap)
{
struct timespec ts, *tsp;
int error;
if (uap->timeout) {
/* Get timespec struct. */
error = copyin(uap->timeout, &ts, sizeof(ts));
if (error)
return (error);
tsp = &ts;
} else
tsp = NULL;
return (kern_aio_waitcomplete(td, uap->aiocbp, tsp, &aiocb_ops));
}
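A hedged userland sketch of the FreeBSD-specific aio_waitcomplete(2) call served above; the argument layout mirrors uap->aiocbp and uap->timeout, and the timeout value is illustrative.
#include <sys/types.h>
#include <aio.h>
#include <time.h>
/*
 * Reap whichever request finishes next: the kernel stores the user's
 * aiocb pointer through *iocbp and returns that request's result.
 */
static ssize_t
reap_next(struct aiocb **iocbp)
{
	struct timespec ts;

	ts.tv_sec = 5;		/* illustrative five-second timeout */
	ts.tv_nsec = 0;
	return (aio_waitcomplete(iocbp, &ts));
}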
static int
kern_aio_fsync(struct thread *td, int op, struct aiocb *aiocbp,
struct aiocb_ops *ops)
{
struct proc *p = td->td_proc;
struct kaioinfo *ki;
if (op != O_SYNC) /* XXX lack of O_DSYNC */
return (EINVAL);
ki = p->p_aioinfo;
if (ki == NULL)
aio_init_aioinfo(p);
return (aio_aqueue(td, aiocbp, NULL, LIO_SYNC, ops));
}
int
-aio_fsync(struct thread *td, struct aio_fsync_args *uap)
+sys_aio_fsync(struct thread *td, struct aio_fsync_args *uap)
{
return (kern_aio_fsync(td, uap->op, uap->aiocbp, &aiocb_ops));
}
/* kqueue attach function */
static int
filt_aioattach(struct knote *kn)
{
struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;
/*
* The aiocbe pointer must be validated before using it, so
* registration is restricted to the kernel; the user cannot
* set EV_FLAG1.
*/
if ((kn->kn_flags & EV_FLAG1) == 0)
return (EPERM);
kn->kn_ptr.p_aio = aiocbe;
kn->kn_flags &= ~EV_FLAG1;
knlist_add(&aiocbe->klist, kn, 0);
return (0);
}
/* kqueue detach function */
static void
filt_aiodetach(struct knote *kn)
{
struct aiocblist *aiocbe = kn->kn_ptr.p_aio;
if (!knlist_empty(&aiocbe->klist))
knlist_remove(&aiocbe->klist, kn, 0);
}
/* kqueue filter function */
/*ARGSUSED*/
static int
filt_aio(struct knote *kn, long hint)
{
struct aiocblist *aiocbe = kn->kn_ptr.p_aio;
kn->kn_data = aiocbe->uaiocb._aiocb_private.error;
if (aiocbe->jobstate != JOBST_JOBFINISHED)
return (0);
kn->kn_flags |= EV_EOF;
return (1);
}
/* kqueue attach function */
static int
filt_lioattach(struct knote *kn)
{
struct aioliojob * lj = (struct aioliojob *)kn->kn_sdata;
/*
* The aioliojob pointer must be validated before using it, so
* registration is restricted to the kernel; the user cannot
* set EV_FLAG1.
*/
if ((kn->kn_flags & EV_FLAG1) == 0)
return (EPERM);
kn->kn_ptr.p_lio = lj;
kn->kn_flags &= ~EV_FLAG1;
knlist_add(&lj->klist, kn, 0);
return (0);
}
/* kqueue detach function */
static void
filt_liodetach(struct knote *kn)
{
struct aioliojob * lj = kn->kn_ptr.p_lio;
if (!knlist_empty(&lj->klist))
knlist_remove(&lj->klist, kn, 0);
}
/* kqueue filter function */
/*ARGSUSED*/
static int
filt_lio(struct knote *kn, long hint)
{
struct aioliojob * lj = kn->kn_ptr.p_lio;
return (lj->lioj_flags & LIOJ_KEVENT_POSTED);
}
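The filters above back the EVFILT_AIO knotes registered on a request's behalf when its sigevent asks for SIGEV_KEVENT (the same notify type convert_sigevent32() accepts). A hedged userland sketch, assuming the caller has only this one request outstanding on the kqueue:
#include <sys/types.h>
#include <sys/event.h>
#include <aio.h>
#include <string.h>
/* Queue one read and wait for its completion to be posted on kqueue 'kq'. */
static ssize_t
read_via_kqueue(int kq, int fd, char *buf, size_t len)
{
	struct aiocb cb;
	struct kevent ev;

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_buf = buf;
	cb.aio_nbytes = len;
	/* Deliver completion as a kevent on kq rather than as a signal. */
	cb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
	cb.aio_sigevent.sigev_notify_kqueue = kq;
	cb.aio_sigevent.sigev_value.sival_ptr = &cb;

	if (aio_read(&cb) == -1)
		return (-1);
	/* The EVFILT_AIO knote registered by the kernel fires here. */
	if (kevent(kq, NULL, 0, &ev, 1, NULL) == -1)
		return (-1);
	return (aio_return(&cb));
}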
#ifdef COMPAT_FREEBSD32
struct __aiocb_private32 {
int32_t status;
int32_t error;
uint32_t kernelinfo;
};
typedef struct oaiocb32 {
int aio_fildes; /* File descriptor */
uint64_t aio_offset __packed; /* File offset for I/O */
uint32_t aio_buf; /* I/O buffer in process space */
uint32_t aio_nbytes; /* Number of bytes for I/O */
struct osigevent32 aio_sigevent; /* Signal to deliver */
int aio_lio_opcode; /* LIO opcode */
int aio_reqprio; /* Request priority -- ignored */
struct __aiocb_private32 _aiocb_private;
} oaiocb32_t;
typedef struct aiocb32 {
int32_t aio_fildes; /* File descriptor */
uint64_t aio_offset __packed; /* File offset for I/O */
uint32_t aio_buf; /* I/O buffer in process space */
uint32_t aio_nbytes; /* Number of bytes for I/O */
int __spare__[2];
uint32_t __spare2__;
int aio_lio_opcode; /* LIO opcode */
int aio_reqprio; /* Request priority -- ignored */
struct __aiocb_private32 _aiocb_private;
struct sigevent32 aio_sigevent; /* Signal to deliver */
} aiocb32_t;
static int
convert_old_sigevent32(struct osigevent32 *osig, struct sigevent *nsig)
{
/*
* Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
* supported by AIO with the old sigevent structure.
*/
CP(*osig, *nsig, sigev_notify);
switch (nsig->sigev_notify) {
case SIGEV_NONE:
break;
case SIGEV_SIGNAL:
nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
break;
case SIGEV_KEVENT:
nsig->sigev_notify_kqueue =
osig->__sigev_u.__sigev_notify_kqueue;
PTRIN_CP(*osig, *nsig, sigev_value.sival_ptr);
break;
default:
return (EINVAL);
}
return (0);
}
static int
aiocb32_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
{
struct oaiocb32 job32;
int error;
bzero(kjob, sizeof(struct aiocb));
error = copyin(ujob, &job32, sizeof(job32));
if (error)
return (error);
CP(job32, *kjob, aio_fildes);
CP(job32, *kjob, aio_offset);
PTRIN_CP(job32, *kjob, aio_buf);
CP(job32, *kjob, aio_nbytes);
CP(job32, *kjob, aio_lio_opcode);
CP(job32, *kjob, aio_reqprio);
CP(job32, *kjob, _aiocb_private.status);
CP(job32, *kjob, _aiocb_private.error);
PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
return (convert_old_sigevent32(&job32.aio_sigevent,
&kjob->aio_sigevent));
}
static int
convert_sigevent32(struct sigevent32 *sig32, struct sigevent *sig)
{
CP(*sig32, *sig, sigev_notify);
switch (sig->sigev_notify) {
case SIGEV_NONE:
break;
case SIGEV_THREAD_ID:
CP(*sig32, *sig, sigev_notify_thread_id);
/* FALLTHROUGH */
case SIGEV_SIGNAL:
CP(*sig32, *sig, sigev_signo);
break;
case SIGEV_KEVENT:
CP(*sig32, *sig, sigev_notify_kqueue);
PTRIN_CP(*sig32, *sig, sigev_value.sival_ptr);
break;
default:
return (EINVAL);
}
return (0);
}
static int
aiocb32_copyin(struct aiocb *ujob, struct aiocb *kjob)
{
struct aiocb32 job32;
int error;
error = copyin(ujob, &job32, sizeof(job32));
if (error)
return (error);
CP(job32, *kjob, aio_fildes);
CP(job32, *kjob, aio_offset);
PTRIN_CP(job32, *kjob, aio_buf);
CP(job32, *kjob, aio_nbytes);
CP(job32, *kjob, aio_lio_opcode);
CP(job32, *kjob, aio_reqprio);
CP(job32, *kjob, _aiocb_private.status);
CP(job32, *kjob, _aiocb_private.error);
PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
return (convert_sigevent32(&job32.aio_sigevent, &kjob->aio_sigevent));
}
static long
aiocb32_fetch_status(struct aiocb *ujob)
{
struct aiocb32 *ujob32;
ujob32 = (struct aiocb32 *)ujob;
return (fuword32(&ujob32->_aiocb_private.status));
}
static long
aiocb32_fetch_error(struct aiocb *ujob)
{
struct aiocb32 *ujob32;
ujob32 = (struct aiocb32 *)ujob;
return (fuword32(&ujob32->_aiocb_private.error));
}
static int
aiocb32_store_status(struct aiocb *ujob, long status)
{
struct aiocb32 *ujob32;
ujob32 = (struct aiocb32 *)ujob;
return (suword32(&ujob32->_aiocb_private.status, status));
}
static int
aiocb32_store_error(struct aiocb *ujob, long error)
{
struct aiocb32 *ujob32;
ujob32 = (struct aiocb32 *)ujob;
return (suword32(&ujob32->_aiocb_private.error, error));
}
static int
aiocb32_store_kernelinfo(struct aiocb *ujob, long jobref)
{
struct aiocb32 *ujob32;
ujob32 = (struct aiocb32 *)ujob;
return (suword32(&ujob32->_aiocb_private.kernelinfo, jobref));
}
static int
aiocb32_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
{
return (suword32(ujobp, (long)ujob));
}
static struct aiocb_ops aiocb32_ops = {
.copyin = aiocb32_copyin,
.fetch_status = aiocb32_fetch_status,
.fetch_error = aiocb32_fetch_error,
.store_status = aiocb32_store_status,
.store_error = aiocb32_store_error,
.store_kernelinfo = aiocb32_store_kernelinfo,
.store_aiocb = aiocb32_store_aiocb,
};
static struct aiocb_ops aiocb32_ops_osigevent = {
.copyin = aiocb32_copyin_old_sigevent,
.fetch_status = aiocb32_fetch_status,
.fetch_error = aiocb32_fetch_error,
.store_status = aiocb32_store_status,
.store_error = aiocb32_store_error,
.store_kernelinfo = aiocb32_store_kernelinfo,
.store_aiocb = aiocb32_store_aiocb,
};
int
freebsd32_aio_return(struct thread *td, struct freebsd32_aio_return_args *uap)
{
return (kern_aio_return(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
}
int
freebsd32_aio_suspend(struct thread *td, struct freebsd32_aio_suspend_args *uap)
{
struct timespec32 ts32;
struct timespec ts, *tsp;
struct aiocb **ujoblist;
uint32_t *ujoblist32;
int error, i;
if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX)
return (EINVAL);
if (uap->timeout) {
/* Get timespec struct. */
if ((error = copyin(uap->timeout, &ts32, sizeof(ts32))) != 0)
return (error);
CP(ts32, ts, tv_sec);
CP(ts32, ts, tv_nsec);
tsp = &ts;
} else
tsp = NULL;
ujoblist = uma_zalloc(aiol_zone, M_WAITOK);
ujoblist32 = (uint32_t *)ujoblist;
error = copyin(uap->aiocbp, ujoblist32, uap->nent *
sizeof(ujoblist32[0]));
if (error == 0) {
for (i = uap->nent - 1; i >= 0; i--)
ujoblist[i] = PTRIN(ujoblist32[i]);

error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
}
uma_zfree(aiol_zone, ujoblist);
return (error);
}
int
freebsd32_aio_cancel(struct thread *td, struct freebsd32_aio_cancel_args *uap)
{
- return (aio_cancel(td, (struct aio_cancel_args *)uap));
+ return (sys_aio_cancel(td, (struct aio_cancel_args *)uap));
}
int
freebsd32_aio_error(struct thread *td, struct freebsd32_aio_error_args *uap)
{
return (kern_aio_error(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
}
int
freebsd32_oaio_read(struct thread *td, struct freebsd32_oaio_read_args *uap)
{
return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
&aiocb32_ops_osigevent));
}
int
freebsd32_aio_read(struct thread *td, struct freebsd32_aio_read_args *uap)
{
return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
&aiocb32_ops));
}
int
freebsd32_oaio_write(struct thread *td, struct freebsd32_oaio_write_args *uap)
{
return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
&aiocb32_ops_osigevent));
}
int
freebsd32_aio_write(struct thread *td, struct freebsd32_aio_write_args *uap)
{
return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
&aiocb32_ops));
}
int
freebsd32_aio_waitcomplete(struct thread *td,
struct freebsd32_aio_waitcomplete_args *uap)
{
struct timespec32 ts32;
struct timespec ts, *tsp;
int error;
if (uap->timeout) {
/* Get timespec struct. */
error = copyin(uap->timeout, &ts32, sizeof(ts32));
if (error)
return (error);
CP(ts32, ts, tv_sec);
CP(ts32, ts, tv_nsec);
tsp = &ts;
} else
tsp = NULL;
return (kern_aio_waitcomplete(td, (struct aiocb **)uap->aiocbp, tsp,
&aiocb32_ops));
}
int
freebsd32_aio_fsync(struct thread *td, struct freebsd32_aio_fsync_args *uap)
{
return (kern_aio_fsync(td, uap->op, (struct aiocb *)uap->aiocbp,
&aiocb32_ops));
}
int
freebsd32_olio_listio(struct thread *td, struct freebsd32_olio_listio_args *uap)
{
struct aiocb **acb_list;
struct sigevent *sigp, sig;
struct osigevent32 osig;
uint32_t *acb_list32;
int error, i, nent;
if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
return (EINVAL);
nent = uap->nent;
if (nent < 0 || nent > AIO_LISTIO_MAX)
return (EINVAL);
if (uap->sig && (uap->mode == LIO_NOWAIT)) {
error = copyin(uap->sig, &osig, sizeof(osig));
if (error)
return (error);
error = convert_old_sigevent32(&osig, &sig);
if (error)
return (error);
sigp = &sig;
} else
sigp = NULL;
acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
if (error) {
free(acb_list32, M_LIO);
return (error);
}
acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
for (i = 0; i < nent; i++)
acb_list[i] = PTRIN(acb_list32[i]);
free(acb_list32, M_LIO);
error = kern_lio_listio(td, uap->mode,
(struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
&aiocb32_ops_osigevent);
free(acb_list, M_LIO);
return (error);
}
int
freebsd32_lio_listio(struct thread *td, struct freebsd32_lio_listio_args *uap)
{
struct aiocb **acb_list;
struct sigevent *sigp, sig;
struct sigevent32 sig32;
uint32_t *acb_list32;
int error, i, nent;
if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
return (EINVAL);
nent = uap->nent;
if (nent < 0 || nent > AIO_LISTIO_MAX)
return (EINVAL);
if (uap->sig && (uap->mode == LIO_NOWAIT)) {
error = copyin(uap->sig, &sig32, sizeof(sig32));
if (error)
return (error);
error = convert_sigevent32(&sig32, &sig);
if (error)
return (error);
sigp = &sig;
} else
sigp = NULL;
acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
if (error) {
free(acb_list32, M_LIO);
return (error);
}
acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
for (i = 0; i < nent; i++)
acb_list[i] = PTRIN(acb_list32[i]);
free(acb_list32, M_LIO);
error = kern_lio_listio(td, uap->mode,
(struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
&aiocb32_ops);
free(acb_list, M_LIO);
return (error);
}
#endif
Index: head/sys/kern/vfs_cache.c
===================================================================
--- head/sys/kern/vfs_cache.c (revision 225616)
+++ head/sys/kern/vfs_cache.c (revision 225617)
@@ -1,1248 +1,1248 @@
/*-
* Copyright (c) 1989, 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Poul-Henning Kamp of the FreeBSD Project.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_kdtrace.h"
#include "opt_ktrace.h"
#include <sys/param.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sdt.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <vm/uma.h>
SDT_PROVIDER_DECLARE(vfs);
SDT_PROBE_DEFINE3(vfs, namecache, enter, done, done, "struct vnode *", "char *",
"struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, done, "struct vnode *",
"char *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, entry, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, hit, "struct vnode *",
"struct char *", "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, miss, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, return, "int",
"struct vnode *", "struct char *");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, hit, "struct vnode *", "char *",
"struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit_negative, hit-negative,
"struct vnode *", "char *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, miss, "struct vnode *",
"char *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, done, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, done, "struct mount *");
SDT_PROBE_DEFINE3(vfs, namecache, zap, done, done, "struct vnode *", "char *",
"struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, done, "struct vnode *",
"char *");
/*
* This structure describes the elements in the cache of recent
* names looked up by namei.
*/
struct namecache {
LIST_ENTRY(namecache) nc_hash; /* hash chain */
LIST_ENTRY(namecache) nc_src; /* source vnode list */
TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */
struct vnode *nc_dvp; /* vnode of parent of name */
struct vnode *nc_vp; /* vnode the name refers to */
u_char nc_flag; /* flag bits */
u_char nc_nlen; /* length of name */
char nc_name[0]; /* segment name + nul */
};
/*
* Name caching works as follows:
*
* Names found by directory scans are retained in a cache
* for future reference. It is managed LRU, so frequently
* used names will hang around. Cache is indexed by hash value
* obtained from (vp, name) where vp refers to the directory
* containing name.
*
* If it is a "negative" entry (i.e. for a name that is known NOT to
* exist), the vnode pointer will be NULL.
*
* Upon reaching the last segment of a path, if the reference
* is for DELETE, or NOCACHE is set (rewrite), and the
* name is located in the cache, it will be dropped.
*/
/*
* Structures associated with name caching.
*/
#define NCHHASH(hash) \
(&nchashtbl[(hash) & nchash])
static LIST_HEAD(nchashhead, namecache) *nchashtbl; /* Hash Table */
static TAILQ_HEAD(, namecache) ncneg; /* LRU list of negative entries */
static u_long nchash; /* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
"Size of namecache hash table");
static u_long ncnegfactor = 16; /* ratio of negative entries */
SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
"Ratio of negative namecache entries");
static u_long numneg; /* number of negative entries allocated */
SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0,
"Number of negative entries in namecache");
static u_long numcache; /* number of cache entries allocated */
SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0,
"Number of namecache entries");
static u_long numcachehv; /* number of cache entries with vnodes held */
SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0,
"Number of namecache entries with vnodes held");
static u_int ncsizefactor = 2;
SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
"Size factor for namecache");
struct nchstats nchstats; /* cache effectiveness statistics */
static struct rwlock cache_lock;
RW_SYSINIT(vfscache, &cache_lock, "Name Cache");
#define CACHE_UPGRADE_LOCK() rw_try_upgrade(&cache_lock)
#define CACHE_RLOCK() rw_rlock(&cache_lock)
#define CACHE_RUNLOCK() rw_runlock(&cache_lock)
#define CACHE_WLOCK() rw_wlock(&cache_lock)
#define CACHE_WUNLOCK() rw_wunlock(&cache_lock)
/*
* UMA zones for the VFS cache.
*
* The small cache is used for entries with short names, which are the
* most common. The large cache is used for entries which are too big to
* fit in the small cache.
*/
static uma_zone_t cache_zone_small;
static uma_zone_t cache_zone_large;
#define CACHE_PATH_CUTOFF 35
#define CACHE_ZONE_SMALL (sizeof(struct namecache) + CACHE_PATH_CUTOFF \
+ 1)
#define CACHE_ZONE_LARGE (sizeof(struct namecache) + NAME_MAX + 1)
#define cache_alloc(len) uma_zalloc(((len) <= CACHE_PATH_CUTOFF) ? \
cache_zone_small : cache_zone_large, M_WAITOK)
#define cache_free(ncp) do { \
if (ncp != NULL) \
uma_zfree(((ncp)->nc_nlen <= CACHE_PATH_CUTOFF) ? \
cache_zone_small : cache_zone_large, (ncp)); \
} while (0)
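As an illustrative restatement of the indexing described above (an assumed helper, not one this file defines): a (directory vnode, name) pair maps to its hash chain through an FNV-1 hash of the component name folded with the dvp pointer, the same computation cache_lookup() and cache_enter() perform inline below.
static __inline struct nchashhead *
nc_bucket_example(struct vnode *dvp, const char *name, int namelen)
{
	uint32_t hash;

	/* Hash the component name, then fold in the directory vnode pointer. */
	hash = fnv_32_buf(name, namelen, FNV1_32_INIT);
	hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
	return (NCHHASH(hash));
}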
static int doingcache = 1; /* 1 => enable the cache */
SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
"VFS namecache enabled");
/* Export size information to userland */
SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, 0,
sizeof(struct namecache), "sizeof(struct namecache)");
/*
* The new name cache statistics
*/
static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0,
"Name cache statistics");
#define STATNODE(mode, name, var, descr) \
SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, mode, var, 0, descr);
STATNODE(CTLFLAG_RD, numneg, &numneg, "Number of negative cache entries");
STATNODE(CTLFLAG_RD, numcache, &numcache, "Number of cache entries");
static u_long numcalls; STATNODE(CTLFLAG_RD, numcalls, &numcalls,
"Number of cache lookups");
static u_long dothits; STATNODE(CTLFLAG_RD, dothits, &dothits,
"Number of '.' hits");
static u_long dotdothits; STATNODE(CTLFLAG_RD, dotdothits, &dotdothits,
"Number of '..' hits");
static u_long numchecks; STATNODE(CTLFLAG_RD, numchecks, &numchecks,
"Number of checks in lookup");
static u_long nummiss; STATNODE(CTLFLAG_RD, nummiss, &nummiss,
"Number of cache misses");
static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap,
"Number of cache misses we do not want to cache");
static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps,
"Number of cache hits (positive) we do not want to cache");
static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits,
"Number of cache hits (positive)");
static u_long numnegzaps; STATNODE(CTLFLAG_RD, numnegzaps, &numnegzaps,
"Number of cache hits (negative) we do not want to cache");
static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits,
"Number of cache hits (negative)");
static u_long numupgrades; STATNODE(CTLFLAG_RD, numupgrades, &numupgrades,
"Number of updates of the cache after lookup (write lock + retry)");
SYSCTL_OPAQUE(_vfs_cache, OID_AUTO, nchstats, CTLFLAG_RD | CTLFLAG_MPSAFE,
&nchstats, sizeof(nchstats), "LU",
"VFS cache effectiveness statistics");
static void cache_zap(struct namecache *ncp);
static int vn_vptocnp_locked(struct vnode **vp, struct ucred *cred, char *buf,
u_int *buflen);
static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
char *buf, char **retbuf, u_int buflen);
static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
/*
* Flags in namecache.nc_flag
*/
#define NCF_WHITE 0x01
#define NCF_ISDOTDOT 0x02
#ifdef DIAGNOSTIC
/*
* Grab an atomic snapshot of the name cache hash chain lengths
*/
SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL, "hash table stats");
static int
sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
{
int error;
struct nchashhead *ncpp;
struct namecache *ncp;
int n_nchash;
int count;
n_nchash = nchash + 1; /* nchash is max index, not count */
if (!req->oldptr)
return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
/* Scan hash tables for applicable entries */
for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
CACHE_RLOCK();
count = 0;
LIST_FOREACH(ncp, ncpp, nc_hash) {
count++;
}
CACHE_RUNLOCK();
error = SYSCTL_OUT(req, &count, sizeof(count));
if (error)
return (error);
}
return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
"nchash chain lengths");
static int
sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
{
int error;
struct nchashhead *ncpp;
struct namecache *ncp;
int n_nchash;
int count, maxlength, used, pct;
if (!req->oldptr)
return SYSCTL_OUT(req, 0, 4 * sizeof(int));
n_nchash = nchash + 1; /* nchash is max index, not count */
used = 0;
maxlength = 0;
/* Scan hash tables for applicable entries */
for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
count = 0;
CACHE_RLOCK();
LIST_FOREACH(ncp, ncpp, nc_hash) {
count++;
}
CACHE_RUNLOCK();
if (count)
used++;
if (maxlength < count)
maxlength = count;
}
n_nchash = nchash + 1;
pct = (used * 100 * 100) / n_nchash;
error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
if (error)
return (error);
error = SYSCTL_OUT(req, &used, sizeof(used));
if (error)
return (error);
error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
if (error)
return (error);
error = SYSCTL_OUT(req, &pct, sizeof(pct));
if (error)
return (error);
return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
"nchash chain lengths");
#endif
/*
* cache_zap():
*
* Removes a namecache entry from cache, whether it contains an actual
* pointer to a vnode or if it is just a negative cache entry.
*/
static void
cache_zap(ncp)
struct namecache *ncp;
{
struct vnode *vp;
rw_assert(&cache_lock, RA_WLOCKED);
CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, ncp->nc_vp);
#ifdef KDTRACE_HOOKS
if (ncp->nc_vp != NULL) {
SDT_PROBE(vfs, namecache, zap, done, ncp->nc_dvp,
ncp->nc_name, ncp->nc_vp, 0, 0);
} else {
SDT_PROBE(vfs, namecache, zap_negative, done, ncp->nc_dvp,
ncp->nc_name, 0, 0, 0);
}
#endif
vp = NULL;
LIST_REMOVE(ncp, nc_hash);
if (ncp->nc_flag & NCF_ISDOTDOT) {
if (ncp == ncp->nc_dvp->v_cache_dd)
ncp->nc_dvp->v_cache_dd = NULL;
} else {
LIST_REMOVE(ncp, nc_src);
if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
vp = ncp->nc_dvp;
numcachehv--;
}
}
if (ncp->nc_vp) {
TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
if (ncp == ncp->nc_vp->v_cache_dd)
ncp->nc_vp->v_cache_dd = NULL;
} else {
TAILQ_REMOVE(&ncneg, ncp, nc_dst);
numneg--;
}
numcache--;
cache_free(ncp);
if (vp)
vdrop(vp);
}
/*
* Lookup an entry in the cache
*
* Lookup is called with dvp pointing to the directory to search,
* cnp pointing to the name of the entry being sought. If the lookup
* succeeds, the vnode is returned in *vpp, and a status of -1 is
* returned. If the lookup determines that the name does not exist
* (negative caching), a status of ENOENT is returned. If the lookup
* fails, a status of zero is returned. If the directory vnode is
* recycled out from under us due to a forced unmount, a status of
* ENOENT is returned.
*
* vpp is locked and ref'd on return. If we're looking up DOTDOT, dvp is
* unlocked. If we're looking up ".", an extra ref is taken, but the lock is
* not recursively acquired.
*/
int
cache_lookup(dvp, vpp, cnp)
struct vnode *dvp;
struct vnode **vpp;
struct componentname *cnp;
{
struct namecache *ncp;
uint32_t hash;
int error, ltype, wlocked;
if (!doingcache) {
cnp->cn_flags &= ~MAKEENTRY;
return (0);
}
retry:
CACHE_RLOCK();
wlocked = 0;
numcalls++;
error = 0;
retry_wlocked:
if (cnp->cn_nameptr[0] == '.') {
if (cnp->cn_namelen == 1) {
*vpp = dvp;
CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
dvp, cnp->cn_nameptr);
dothits++;
SDT_PROBE(vfs, namecache, lookup, hit, dvp, ".",
*vpp, 0, 0);
goto success;
}
if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
dotdothits++;
if (dvp->v_cache_dd == NULL) {
SDT_PROBE(vfs, namecache, lookup, miss, dvp,
"..", NULL, 0, 0);
goto unlock;
}
if ((cnp->cn_flags & MAKEENTRY) == 0) {
if (!wlocked && !CACHE_UPGRADE_LOCK())
goto wlock;
if (dvp->v_cache_dd->nc_flag & NCF_ISDOTDOT)
cache_zap(dvp->v_cache_dd);
dvp->v_cache_dd = NULL;
CACHE_WUNLOCK();
return (0);
}
if (dvp->v_cache_dd->nc_flag & NCF_ISDOTDOT)
*vpp = dvp->v_cache_dd->nc_vp;
else
*vpp = dvp->v_cache_dd->nc_dvp;
/* Return failure if negative entry was found. */
if (*vpp == NULL) {
ncp = dvp->v_cache_dd;
goto negative_success;
}
CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
dvp, cnp->cn_nameptr, *vpp);
SDT_PROBE(vfs, namecache, lookup, hit, dvp, "..",
*vpp, 0, 0);
goto success;
}
}
hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT);
hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
numchecks++;
if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
!bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
break;
}
/* We failed to find an entry */
if (ncp == NULL) {
SDT_PROBE(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
NULL, 0, 0);
if ((cnp->cn_flags & MAKEENTRY) == 0) {
nummisszap++;
} else {
nummiss++;
}
nchstats.ncs_miss++;
goto unlock;
}
/* We don't want to have an entry, so dump it */
if ((cnp->cn_flags & MAKEENTRY) == 0) {
numposzaps++;
nchstats.ncs_badhits++;
if (!wlocked && !CACHE_UPGRADE_LOCK())
goto wlock;
cache_zap(ncp);
CACHE_WUNLOCK();
return (0);
}
/* We found a "positive" match, return the vnode */
if (ncp->nc_vp) {
numposhits++;
nchstats.ncs_goodhits++;
*vpp = ncp->nc_vp;
CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
dvp, cnp->cn_nameptr, *vpp, ncp);
SDT_PROBE(vfs, namecache, lookup, hit, dvp, ncp->nc_name,
*vpp, 0, 0);
goto success;
}
negative_success:
/* We found a negative match, and want to create it, so purge */
if (cnp->cn_nameiop == CREATE) {
numnegzaps++;
nchstats.ncs_badhits++;
if (!wlocked && !CACHE_UPGRADE_LOCK())
goto wlock;
cache_zap(ncp);
CACHE_WUNLOCK();
return (0);
}
if (!wlocked && !CACHE_UPGRADE_LOCK())
goto wlock;
numneghits++;
/*
* We found a "negative" match, so we shift it to the end of
* the "negative" cache entries queue to satisfy LRU. Also,
* check to see if the entry is a whiteout; indicate this to
* the componentname, if so.
*/
TAILQ_REMOVE(&ncneg, ncp, nc_dst);
TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
nchstats.ncs_neghits++;
if (ncp->nc_flag & NCF_WHITE)
cnp->cn_flags |= ISWHITEOUT;
SDT_PROBE(vfs, namecache, lookup, hit_negative, dvp, ncp->nc_name,
0, 0, 0);
CACHE_WUNLOCK();
return (ENOENT);
wlock:
/*
* We need to update the cache after our lookup, so upgrade to
* a write lock and retry the operation.
*/
CACHE_RUNLOCK();
CACHE_WLOCK();
numupgrades++;
wlocked = 1;
goto retry_wlocked;
success:
/*
* On success we return a locked and ref'd vnode as per the lookup
* protocol.
*/
if (dvp == *vpp) { /* lookup on "." */
VREF(*vpp);
if (wlocked)
CACHE_WUNLOCK();
else
CACHE_RUNLOCK();
/*
* When we lookup "." we still can be asked to lock it
* differently...
*/
ltype = cnp->cn_lkflags & LK_TYPE_MASK;
if (ltype != VOP_ISLOCKED(*vpp)) {
if (ltype == LK_EXCLUSIVE) {
vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
if ((*vpp)->v_iflag & VI_DOOMED) {
/* forced unmount */
vrele(*vpp);
*vpp = NULL;
return (ENOENT);
}
} else
vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
}
return (-1);
}
ltype = 0; /* silence gcc warning */
if (cnp->cn_flags & ISDOTDOT) {
ltype = VOP_ISLOCKED(dvp);
VOP_UNLOCK(dvp, 0);
}
VI_LOCK(*vpp);
if (wlocked)
CACHE_WUNLOCK();
else
CACHE_RUNLOCK();
error = vget(*vpp, cnp->cn_lkflags | LK_INTERLOCK, cnp->cn_thread);
if (cnp->cn_flags & ISDOTDOT) {
vn_lock(dvp, ltype | LK_RETRY);
if (dvp->v_iflag & VI_DOOMED) {
if (error == 0)
vput(*vpp);
*vpp = NULL;
return (ENOENT);
}
}
if (error) {
*vpp = NULL;
goto retry;
}
if ((cnp->cn_flags & ISLASTCN) &&
(cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
}
return (-1);
unlock:
if (wlocked)
CACHE_WUNLOCK();
else
CACHE_RUNLOCK();
return (0);
}
/*
* Add an entry to the cache.
*/
void
cache_enter(dvp, vp, cnp)
struct vnode *dvp;
struct vnode *vp;
struct componentname *cnp;
{
struct namecache *ncp, *n2;
struct nchashhead *ncpp;
uint32_t hash;
int flag;
int hold;
int zap;
int len;
CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
VNASSERT(vp == NULL || (vp->v_iflag & VI_DOOMED) == 0, vp,
("cache_enter: Adding a doomed vnode"));
VNASSERT(dvp == NULL || (dvp->v_iflag & VI_DOOMED) == 0, dvp,
("cache_enter: Doomed vnode used as src"));
if (!doingcache)
return;
/*
* Avoid blowout in namecache entries.
*/
if (numcache >= desiredvnodes * ncsizefactor)
return;
flag = 0;
if (cnp->cn_nameptr[0] == '.') {
if (cnp->cn_namelen == 1)
return;
if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
CACHE_WLOCK();
/*
* If a dotdot entry already exists, just retarget it
* to the new parent vnode; otherwise continue with a new
* namecache entry allocation.
*/
if ((ncp = dvp->v_cache_dd) != NULL &&
ncp->nc_flag & NCF_ISDOTDOT) {
KASSERT(ncp->nc_dvp == dvp,
("wrong isdotdot parent"));
if (ncp->nc_vp != NULL)
TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst,
ncp, nc_dst);
else
TAILQ_REMOVE(&ncneg, ncp, nc_dst);
if (vp != NULL)
TAILQ_INSERT_HEAD(&vp->v_cache_dst,
ncp, nc_dst);
else
TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
ncp->nc_vp = vp;
CACHE_WUNLOCK();
return;
}
dvp->v_cache_dd = NULL;
SDT_PROBE(vfs, namecache, enter, done, dvp, "..", vp,
0, 0);
CACHE_WUNLOCK();
flag = NCF_ISDOTDOT;
}
}
hold = 0;
zap = 0;
/*
* Calculate the hash key and set up as much of the new
* namecache entry as possible before acquiring the lock.
*/
ncp = cache_alloc(cnp->cn_namelen);
ncp->nc_vp = vp;
ncp->nc_dvp = dvp;
ncp->nc_flag = flag;
len = ncp->nc_nlen = cnp->cn_namelen;
hash = fnv_32_buf(cnp->cn_nameptr, len, FNV1_32_INIT);
strlcpy(ncp->nc_name, cnp->cn_nameptr, len + 1);
hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
CACHE_WLOCK();
/*
* See if this vnode or negative entry is already in the cache
* with this name. This can happen with concurrent lookups of
* the same path name.
*/
ncpp = NCHHASH(hash);
LIST_FOREACH(n2, ncpp, nc_hash) {
if (n2->nc_dvp == dvp &&
n2->nc_nlen == cnp->cn_namelen &&
!bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
CACHE_WUNLOCK();
cache_free(ncp);
return;
}
}
if (flag == NCF_ISDOTDOT) {
/*
* See if we are trying to add a .. entry, but some other lookup
* has populated v_cache_dd pointer already.
*/
if (dvp->v_cache_dd != NULL) {
CACHE_WUNLOCK();
cache_free(ncp);
return;
}
KASSERT(vp == NULL || vp->v_type == VDIR,
("wrong vnode type %p", vp));
dvp->v_cache_dd = ncp;
}
numcache++;
if (!vp) {
numneg++;
if (cnp->cn_flags & ISWHITEOUT)
ncp->nc_flag |= NCF_WHITE;
} else if (vp->v_type == VDIR) {
if (flag != NCF_ISDOTDOT) {
if ((n2 = vp->v_cache_dd) != NULL &&
(n2->nc_flag & NCF_ISDOTDOT) != 0)
cache_zap(n2);
vp->v_cache_dd = ncp;
}
} else {
vp->v_cache_dd = NULL;
}
/*
* Insert the new namecache entry into the appropriate chain
* within the cache entries table.
*/
LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
if (flag != NCF_ISDOTDOT) {
if (LIST_EMPTY(&dvp->v_cache_src)) {
hold = 1;
numcachehv++;
}
LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
}
/*
* If the entry is "negative", we place it into the
* "negative" cache queue, otherwise, we place it into the
* destination vnode's cache entries queue.
*/
if (vp) {
TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
SDT_PROBE(vfs, namecache, enter, done, dvp, ncp->nc_name, vp,
0, 0);
} else {
TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
SDT_PROBE(vfs, namecache, enter_negative, done, dvp,
ncp->nc_name, 0, 0, 0);
}
if (numneg * ncnegfactor > numcache) {
ncp = TAILQ_FIRST(&ncneg);
zap = 1;
}
if (hold)
vhold(dvp);
if (zap)
cache_zap(ncp);
CACHE_WUNLOCK();
}
/*
* Name cache initialization, from vfs_init() when we are booting
*/
static void
nchinit(void *dummy __unused)
{
TAILQ_INIT(&ncneg);
cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL, NULL,
NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE, NULL,
NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
/*
* Invalidate all entries to a particular vnode.
*/
void
cache_purge(vp)
struct vnode *vp;
{
CTR1(KTR_VFS, "cache_purge(%p)", vp);
SDT_PROBE(vfs, namecache, purge, done, vp, 0, 0, 0, 0);
CACHE_WLOCK();
while (!LIST_EMPTY(&vp->v_cache_src))
cache_zap(LIST_FIRST(&vp->v_cache_src));
while (!TAILQ_EMPTY(&vp->v_cache_dst))
cache_zap(TAILQ_FIRST(&vp->v_cache_dst));
if (vp->v_cache_dd != NULL) {
KASSERT(vp->v_cache_dd->nc_flag & NCF_ISDOTDOT,
("lost dotdot link"));
cache_zap(vp->v_cache_dd);
}
KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
CACHE_WUNLOCK();
}
/*
* Invalidate all negative entries for a particular directory vnode.
*/
void
cache_purge_negative(vp)
struct vnode *vp;
{
struct namecache *cp, *ncp;
CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
SDT_PROBE(vfs, namecache, purge_negative, done, vp, 0, 0, 0, 0);
CACHE_WLOCK();
LIST_FOREACH_SAFE(cp, &vp->v_cache_src, nc_src, ncp) {
if (cp->nc_vp == NULL)
cache_zap(cp);
}
CACHE_WUNLOCK();
}
/*
* Flush all entries referencing a particular filesystem.
*/
void
cache_purgevfs(mp)
struct mount *mp;
{
struct nchashhead *ncpp;
struct namecache *ncp, *nnp;
/* Scan hash tables for applicable entries */
SDT_PROBE(vfs, namecache, purgevfs, done, mp, 0, 0, 0, 0);
CACHE_WLOCK();
for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
LIST_FOREACH_SAFE(ncp, ncpp, nc_hash, nnp) {
if (ncp->nc_dvp->v_mount == mp)
cache_zap(ncp);
}
}
CACHE_WUNLOCK();
}
/*
* Perform canonical checks and cache lookup and pass on to filesystem
* through the vop_cachedlookup only if needed.
*/
int
vfs_cache_lookup(ap)
struct vop_lookup_args /* {
struct vnode *a_dvp;
struct vnode **a_vpp;
struct componentname *a_cnp;
} */ *ap;
{
struct vnode *dvp;
int error;
struct vnode **vpp = ap->a_vpp;
struct componentname *cnp = ap->a_cnp;
struct ucred *cred = cnp->cn_cred;
int flags = cnp->cn_flags;
struct thread *td = cnp->cn_thread;
*vpp = NULL;
dvp = ap->a_dvp;
if (dvp->v_type != VDIR)
return (ENOTDIR);
if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
(cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
return (EROFS);
error = VOP_ACCESS(dvp, VEXEC, cred, td);
if (error)
return (error);
error = cache_lookup(dvp, vpp, cnp);
if (error == 0)
return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
if (error == -1)
return (0);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct __getcwd_args {
u_char *buf;
u_int buflen;
};
#endif
/*
* XXX All of these sysctls would probably be more productive dead.
*/
static int disablecwd;
SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
"Disable the getcwd syscall");
/* Implementation of the getcwd syscall. */
int
-__getcwd(td, uap)
+sys___getcwd(td, uap)
struct thread *td;
struct __getcwd_args *uap;
{
return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen));
}
int
kern___getcwd(struct thread *td, u_char *buf, enum uio_seg bufseg, u_int buflen)
{
char *bp, *tmpbuf;
struct filedesc *fdp;
struct vnode *cdir, *rdir;
int error, vfslocked;
if (disablecwd)
return (ENODEV);
if (buflen < 2)
return (EINVAL);
if (buflen > MAXPATHLEN)
buflen = MAXPATHLEN;
tmpbuf = malloc(buflen, M_TEMP, M_WAITOK);
fdp = td->td_proc->p_fd;
FILEDESC_SLOCK(fdp);
cdir = fdp->fd_cdir;
VREF(cdir);
rdir = fdp->fd_rdir;
VREF(rdir);
FILEDESC_SUNLOCK(fdp);
error = vn_fullpath1(td, cdir, rdir, tmpbuf, &bp, buflen);
vfslocked = VFS_LOCK_GIANT(rdir->v_mount);
vrele(rdir);
VFS_UNLOCK_GIANT(vfslocked);
vfslocked = VFS_LOCK_GIANT(cdir->v_mount);
vrele(cdir);
VFS_UNLOCK_GIANT(vfslocked);
if (!error) {
if (bufseg == UIO_SYSSPACE)
bcopy(bp, buf, strlen(bp) + 1);
else
error = copyout(bp, buf, strlen(bp) + 1);
#ifdef KTRACE
if (KTRPOINT(curthread, KTR_NAMEI))
ktrnamei(bp);
#endif
}
free(tmpbuf, M_TEMP);
return (error);
}
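A minimal userland sketch of the consumer side: the libc getcwd(3) wrapper typically issues this syscall first and only falls back to walking ".." entries when the name cache cannot produce the path. The buffer size is illustrative.
#include <limits.h>
#include <stdio.h>
#include <unistd.h>
int
main(void)
{
	char path[PATH_MAX];

	/* Fast path: resolved from the name cache via __getcwd(). */
	if (getcwd(path, sizeof(path)) != NULL)
		printf("cwd: %s\n", path);
	return (0);
}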
/*
* Thus begins the fullpath magic.
*/
#undef STATNODE
#define STATNODE(name, descr) \
static u_int name; \
SYSCTL_UINT(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr)
static int disablefullpath;
SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0,
"Disable the vn_fullpath function");
/* These count for kern___getcwd(), too. */
STATNODE(numfullpathcalls, "Number of fullpath search calls");
STATNODE(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
STATNODE(numfullpathfail2,
"Number of fullpath search errors (VOP_VPTOCNP failures)");
STATNODE(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
STATNODE(numfullpathfound, "Number of successful fullpath calls");
/*
* Retrieve the full filesystem path that corresponds to a vnode from the name
* cache (if available)
*/
int
vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
{
char *buf;
struct filedesc *fdp;
struct vnode *rdir;
int error, vfslocked;
if (disablefullpath)
return (ENODEV);
if (vn == NULL)
return (EINVAL);
buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
fdp = td->td_proc->p_fd;
FILEDESC_SLOCK(fdp);
rdir = fdp->fd_rdir;
VREF(rdir);
FILEDESC_SUNLOCK(fdp);
error = vn_fullpath1(td, vn, rdir, buf, retbuf, MAXPATHLEN);
vfslocked = VFS_LOCK_GIANT(rdir->v_mount);
vrele(rdir);
VFS_UNLOCK_GIANT(vfslocked);
if (!error)
*freebuf = buf;
else
free(buf, M_TEMP);
return (error);
}
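A hedged in-kernel usage sketch of the retbuf/freebuf contract: on success retbuf points into the MAXPATHLEN buffer returned through freebuf, which the caller releases with free(9) and M_TEMP. The printing is illustrative.
static void
report_vnode_path(struct thread *td, struct vnode *vp)
{
	char *fullpath, *freepath;

	freepath = NULL;
	if (vn_fullpath(td, vp, &fullpath, &freepath) == 0)
		printf("vnode %p: %s\n", vp, fullpath);
	if (freepath != NULL)
		free(freepath, M_TEMP);
}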
/*
* This function is similar to vn_fullpath, but it attempts to lookup the
* pathname relative to the global root mount point. This is required for the
* auditing sub-system, as audited pathnames must be absolute, relative to the
* global root mount point.
*/
int
vn_fullpath_global(struct thread *td, struct vnode *vn,
char **retbuf, char **freebuf)
{
char *buf;
int error;
if (disablefullpath)
return (ENODEV);
if (vn == NULL)
return (EINVAL);
buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
error = vn_fullpath1(td, vn, rootvnode, buf, retbuf, MAXPATHLEN);
if (!error)
*freebuf = buf;
else
free(buf, M_TEMP);
return (error);
}
int
vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, u_int *buflen)
{
int error;
CACHE_RLOCK();
error = vn_vptocnp_locked(vp, cred, buf, buflen);
if (error == 0) {
/*
* vn_vptocnp_locked() dropped hold acquired by
* VOP_VPTOCNP immediately after locking the
* cache. Since we are going to drop the cache rlock,
* re-hold the result.
*/
vhold(*vp);
CACHE_RUNLOCK();
}
return (error);
}
static int
vn_vptocnp_locked(struct vnode **vp, struct ucred *cred, char *buf,
u_int *buflen)
{
struct vnode *dvp;
struct namecache *ncp;
int error, vfslocked;
TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) {
if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
break;
}
if (ncp != NULL) {
if (*buflen < ncp->nc_nlen) {
CACHE_RUNLOCK();
numfullpathfail4++;
error = ENOMEM;
SDT_PROBE(vfs, namecache, fullpath, return, error,
vp, NULL, 0, 0);
return (error);
}
*buflen -= ncp->nc_nlen;
memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
SDT_PROBE(vfs, namecache, fullpath, hit, ncp->nc_dvp,
ncp->nc_name, vp, 0, 0);
*vp = ncp->nc_dvp;
return (0);
}
SDT_PROBE(vfs, namecache, fullpath, miss, vp, 0, 0, 0, 0);
vhold(*vp);
CACHE_RUNLOCK();
vfslocked = VFS_LOCK_GIANT((*vp)->v_mount);
vn_lock(*vp, LK_SHARED | LK_RETRY);
error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
VOP_UNLOCK(*vp, 0);
vdrop(*vp);
VFS_UNLOCK_GIANT(vfslocked);
if (error) {
numfullpathfail2++;
SDT_PROBE(vfs, namecache, fullpath, return, error, vp,
NULL, 0, 0);
return (error);
}
*vp = dvp;
CACHE_RLOCK();
if ((*vp)->v_iflag & VI_DOOMED) {
/* forced unmount */
CACHE_RUNLOCK();
vdrop(*vp);
error = ENOENT;
SDT_PROBE(vfs, namecache, fullpath, return, error, vp,
NULL, 0, 0);
return (error);
}
vdrop(*vp);
return (0);
}
/*
* The magic behind kern___getcwd() and vn_fullpath().
*/
static int
vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
char *buf, char **retbuf, u_int buflen)
{
int error, slash_prefixed;
#ifdef KDTRACE_HOOKS
struct vnode *startvp = vp;
#endif
buflen--;
buf[buflen] = '\0';
error = 0;
slash_prefixed = 0;
SDT_PROBE(vfs, namecache, fullpath, entry, vp, 0, 0, 0, 0);
numfullpathcalls++;
CACHE_RLOCK();
if (vp->v_type != VDIR) {
error = vn_vptocnp_locked(&vp, td->td_ucred, buf, &buflen);
if (error)
return (error);
if (buflen == 0) {
CACHE_RUNLOCK();
return (ENOMEM);
}
buf[--buflen] = '/';
slash_prefixed = 1;
}
while (vp != rdir && vp != rootvnode) {
if (vp->v_vflag & VV_ROOT) {
if (vp->v_iflag & VI_DOOMED) { /* forced unmount */
CACHE_RUNLOCK();
error = ENOENT;
SDT_PROBE(vfs, namecache, fullpath, return,
error, vp, NULL, 0, 0);
break;
}
vp = vp->v_mount->mnt_vnodecovered;
continue;
}
if (vp->v_type != VDIR) {
CACHE_RUNLOCK();
numfullpathfail1++;
error = ENOTDIR;
SDT_PROBE(vfs, namecache, fullpath, return,
error, vp, NULL, 0, 0);
break;
}
error = vn_vptocnp_locked(&vp, td->td_ucred, buf, &buflen);
if (error)
break;
if (buflen == 0) {
CACHE_RUNLOCK();
error = ENOMEM;
SDT_PROBE(vfs, namecache, fullpath, return, error,
startvp, NULL, 0, 0);
break;
}
buf[--buflen] = '/';
slash_prefixed = 1;
}
if (error)
return (error);
if (!slash_prefixed) {
if (buflen == 0) {
CACHE_RUNLOCK();
numfullpathfail4++;
SDT_PROBE(vfs, namecache, fullpath, return, ENOMEM,
startvp, NULL, 0, 0);
return (ENOMEM);
}
buf[--buflen] = '/';
}
numfullpathfound++;
CACHE_RUNLOCK();
SDT_PROBE(vfs, namecache, fullpath, return, 0, startvp, buf + buflen,
0, 0);
*retbuf = buf + buflen;
return (0);
}
int
vn_commname(struct vnode *vp, char *buf, u_int buflen)
{
struct namecache *ncp;
int l;
CACHE_RLOCK();
TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
break;
if (ncp == NULL) {
CACHE_RUNLOCK();
return (ENOENT);
}
l = min(ncp->nc_nlen, buflen - 1);
memcpy(buf, ncp->nc_name, l);
CACHE_RUNLOCK();
buf[l] = '\0';
return (0);
}
Index: head/sys/kern/vfs_extattr.c
===================================================================
--- head/sys/kern/vfs_extattr.c (revision 225616)
+++ head/sys/kern/vfs_extattr.c (revision 225617)
@@ -1,795 +1,795 @@
/*-
* Copyright (c) 1999-2001 Robert N. M. Watson
* All rights reserved.
*
* This software was developed by Robert Watson for the TrustedBSD Project.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capability.h>
#include <sys/lock.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/fcntl.h>
#include <sys/namei.h>
#include <sys/filedesc.h>
#include <sys/limits.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/extattr.h>
#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>
/*
* Syscall to push extended attribute configuration information into the VFS.
* Accepts a path, which it converts to a mountpoint, as well as a command
* (int cmd), and attribute name and misc data.
*
* Currently this is used only by UFS1 extended attributes.
*/
int
-extattrctl(td, uap)
+sys_extattrctl(td, uap)
struct thread *td;
struct extattrctl_args /* {
const char *path;
int cmd;
const char *filename;
int attrnamespace;
const char *attrname;
} */ *uap;
{
struct vnode *filename_vp;
struct nameidata nd;
struct mount *mp, *mp_writable;
char attrname[EXTATTR_MAXNAMELEN];
int vfslocked, fnvfslocked, error;
AUDIT_ARG_CMD(uap->cmd);
AUDIT_ARG_VALUE(uap->attrnamespace);
/*
* uap->attrname is not always defined. We check again later when we
* invoke the VFS call so as to pass in NULL there if needed.
*/
if (uap->attrname != NULL) {
error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN,
NULL);
if (error)
return (error);
}
AUDIT_ARG_TEXT(attrname);
vfslocked = fnvfslocked = 0;
mp = NULL;
filename_vp = NULL;
if (uap->filename != NULL) {
NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE2,
UIO_USERSPACE, uap->filename, td);
error = namei(&nd);
if (error)
return (error);
fnvfslocked = NDHASGIANT(&nd);
filename_vp = nd.ni_vp;
NDFREE(&nd, NDF_NO_VP_RELE);
}
/* uap->path is always defined. */
NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | LOCKLEAF | AUDITVNODE1,
UIO_USERSPACE, uap->path, td);
error = namei(&nd);
if (error)
goto out;
vfslocked = NDHASGIANT(&nd);
mp = nd.ni_vp->v_mount;
error = vfs_busy(mp, 0);
if (error) {
NDFREE(&nd, 0);
mp = NULL;
goto out;
}
VOP_UNLOCK(nd.ni_vp, 0);
error = vn_start_write(nd.ni_vp, &mp_writable, V_WAIT | PCATCH);
NDFREE(&nd, NDF_NO_VP_UNLOCK);
if (error)
goto out;
if (filename_vp != NULL) {
/*
* uap->filename is not always defined. If it is,
* grab a vnode lock, which VFS_EXTATTRCTL() will
* later release.
*/
error = vn_lock(filename_vp, LK_EXCLUSIVE);
if (error) {
vn_finished_write(mp_writable);
goto out;
}
}
error = VFS_EXTATTRCTL(mp, uap->cmd, filename_vp, uap->attrnamespace,
uap->attrname != NULL ? attrname : NULL);
vn_finished_write(mp_writable);
out:
if (mp != NULL)
vfs_unbusy(mp);
/*
* VFS_EXTATTRCTL will have unlocked, but not de-ref'd, filename_vp,
* so vrele it if it is defined.
*/
if (filename_vp != NULL)
vrele(filename_vp);
VFS_UNLOCK_GIANT(fnvfslocked);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*-
* Set a named extended attribute on a file or directory
*
* Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
* kernelspace string pointer "attrname", userspace buffer
* pointer "data", buffer length "nbytes", thread "td".
* Returns: 0 on success, an error number otherwise
* Locks: none
* References: vp must be a valid reference for the duration of the call
*/
static int
extattr_set_vp(struct vnode *vp, int attrnamespace, const char *attrname,
void *data, size_t nbytes, struct thread *td)
{
struct mount *mp;
struct uio auio;
struct iovec aiov;
ssize_t cnt;
int error;
VFS_ASSERT_GIANT(vp->v_mount);
error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
if (error)
return (error);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
aiov.iov_base = data;
aiov.iov_len = nbytes;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_offset = 0;
if (nbytes > INT_MAX) {
error = EINVAL;
goto done;
}
auio.uio_resid = nbytes;
auio.uio_rw = UIO_WRITE;
auio.uio_segflg = UIO_USERSPACE;
auio.uio_td = td;
cnt = nbytes;
#ifdef MAC
error = mac_vnode_check_setextattr(td->td_ucred, vp, attrnamespace,
attrname);
if (error)
goto done;
#endif
error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio,
td->td_ucred, td);
cnt -= auio.uio_resid;
td->td_retval[0] = cnt;
done:
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
return (error);
}
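For reference, a minimal userland sketch of the extattr_set_file(2) entry point that reaches this helper; the namespace, attribute name, and value are illustrative only.
#include <sys/types.h>
#include <sys/extattr.h>
#include <string.h>
/* Attach a small user-namespace attribute to a file by path. */
static ssize_t
tag_file(const char *path)
{
	const char *value = "backed-up";

	/* Returns the number of bytes written, or -1 with errno set. */
	return (extattr_set_file(path, EXTATTR_NAMESPACE_USER,
	    "backup.state", value, strlen(value)));
}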
int
-extattr_set_fd(td, uap)
+sys_extattr_set_fd(td, uap)
struct thread *td;
struct extattr_set_fd_args /* {
int fd;
int attrnamespace;
const char *attrname;
void *data;
size_t nbytes;
} */ *uap;
{
struct file *fp;
char attrname[EXTATTR_MAXNAMELEN];
int vfslocked, error;
AUDIT_ARG_FD(uap->fd);
AUDIT_ARG_VALUE(uap->attrnamespace);
error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
if (error)
return (error);
AUDIT_ARG_TEXT(attrname);
error = getvnode(td->td_proc->p_fd, uap->fd, CAP_EXTATTR_SET, &fp);
if (error)
return (error);
vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
error = extattr_set_vp(fp->f_vnode, uap->attrnamespace,
attrname, uap->data, uap->nbytes, td);
fdrop(fp, td);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
int
-extattr_set_file(td, uap)
+sys_extattr_set_file(td, uap)
struct thread *td;
struct extattr_set_file_args /* {
const char *path;
int attrnamespace;
const char *attrname;
void *data;
size_t nbytes;
} */ *uap;
{
struct nameidata nd;
char attrname[EXTATTR_MAXNAMELEN];
int vfslocked, error;
AUDIT_ARG_VALUE(uap->attrnamespace);
error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
if (error)
return (error);
AUDIT_ARG_TEXT(attrname);
NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
uap->path, td);
error = namei(&nd);
if (error)
return (error);
NDFREE(&nd, NDF_ONLY_PNBUF);
vfslocked = NDHASGIANT(&nd);
error = extattr_set_vp(nd.ni_vp, uap->attrnamespace, attrname,
uap->data, uap->nbytes, td);
vrele(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
int
-extattr_set_link(td, uap)
+sys_extattr_set_link(td, uap)
struct thread *td;
struct extattr_set_link_args /* {
const char *path;
int attrnamespace;
const char *attrname;
void *data;
size_t nbytes;
} */ *uap;
{
struct nameidata nd;
char attrname[EXTATTR_MAXNAMELEN];
int vfslocked, error;
AUDIT_ARG_VALUE(uap->attrnamespace);
error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
if (error)
return (error);
AUDIT_ARG_TEXT(attrname);
NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW | AUDITVNODE1, UIO_USERSPACE,
uap->path, td);
error = namei(&nd);
if (error)
return (error);
NDFREE(&nd, NDF_ONLY_PNBUF);
vfslocked = NDHASGIANT(&nd);
error = extattr_set_vp(nd.ni_vp, uap->attrnamespace, attrname,
uap->data, uap->nbytes, td);
vrele(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*-
* Get a named extended attribute on a file or directory
*
* Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
* kernelspace string pointer "attrname", userspace buffer
* pointer "data", buffer length "nbytes", thread "td".
* Returns: 0 on success, an error number otherwise
* Locks: none
* References: vp must be a valid reference for the duration of the call
*/
static int
extattr_get_vp(struct vnode *vp, int attrnamespace, const char *attrname,
void *data, size_t nbytes, struct thread *td)
{
struct uio auio, *auiop;
struct iovec aiov;
ssize_t cnt;
size_t size, *sizep;
int error;
VFS_ASSERT_GIANT(vp->v_mount);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
/*
* Slightly unusual semantics: if the user provides a NULL data
* pointer, they don't want to receive the data, just the maximum
* read length.
*/
auiop = NULL;
sizep = NULL;
cnt = 0;
if (data != NULL) {
aiov.iov_base = data;
aiov.iov_len = nbytes;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_offset = 0;
if (nbytes > INT_MAX) {
error = EINVAL;
goto done;
}
auio.uio_resid = nbytes;
auio.uio_rw = UIO_READ;
auio.uio_segflg = UIO_USERSPACE;
auio.uio_td = td;
auiop = &auio;
cnt = nbytes;
} else
sizep = &size;
#ifdef MAC
error = mac_vnode_check_getextattr(td->td_ucred, vp, attrnamespace,
attrname);
if (error)
goto done;
#endif
error = VOP_GETEXTATTR(vp, attrnamespace, attrname, auiop, sizep,
td->td_ucred, td);
if (auiop != NULL) {
cnt -= auio.uio_resid;
td->td_retval[0] = cnt;
} else
td->td_retval[0] = size;
done:
VOP_UNLOCK(vp, 0);
return (error);
}
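A hedged userland sketch of the size-probe semantics noted above: passing a NULL data pointer to extattr_get_file(2) yields only the attribute's length, after which the caller can allocate and fetch the value. The helper name is illustrative.
#include <sys/types.h>
#include <sys/extattr.h>
#include <stdlib.h>
/* Probe the attribute's size with a NULL buffer, then fetch it. */
static void *
read_attr(const char *path, const char *name, ssize_t *lenp)
{
	ssize_t len;
	void *buf;

	len = extattr_get_file(path, EXTATTR_NAMESPACE_USER, name, NULL, 0);
	if (len < 0)
		return (NULL);
	buf = malloc(len);
	if (buf == NULL)
		return (NULL);
	len = extattr_get_file(path, EXTATTR_NAMESPACE_USER, name, buf, len);
	if (len < 0) {
		free(buf);
		return (NULL);
	}
	*lenp = len;
	return (buf);
}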
int
-extattr_get_fd(td, uap)
+sys_extattr_get_fd(td, uap)
struct thread *td;
struct extattr_get_fd_args /* {
int fd;
int attrnamespace;
const char *attrname;
void *data;
size_t nbytes;
} */ *uap;
{
struct file *fp;
char attrname[EXTATTR_MAXNAMELEN];
int vfslocked, error;
AUDIT_ARG_FD(uap->fd);
AUDIT_ARG_VALUE(uap->attrnamespace);
error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
if (error)
return (error);
AUDIT_ARG_TEXT(attrname);
error = getvnode(td->td_proc->p_fd, uap->fd, CAP_EXTATTR_GET, &fp);
if (error)
return (error);
vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
error = extattr_get_vp(fp->f_vnode, uap->attrnamespace,
attrname, uap->data, uap->nbytes, td);
fdrop(fp, td);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
int
-extattr_get_file(td, uap)
+sys_extattr_get_file(td, uap)
struct thread *td;
struct extattr_get_file_args /* {
const char *path;
int attrnamespace;
const char *attrname;
void *data;
size_t nbytes;
} */ *uap;
{
struct nameidata nd;
char attrname[EXTATTR_MAXNAMELEN];
int vfslocked, error;
AUDIT_ARG_VALUE(uap->attrnamespace);
error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
if (error)
return (error);
AUDIT_ARG_TEXT(attrname);
NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
uap->path, td);
error = namei(&nd);
if (error)
return (error);
NDFREE(&nd, NDF_ONLY_PNBUF);
vfslocked = NDHASGIANT(&nd);
error = extattr_get_vp(nd.ni_vp, uap->attrnamespace, attrname,
uap->data, uap->nbytes, td);
vrele(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
int
-extattr_get_link(td, uap)
+sys_extattr_get_link(td, uap)
struct thread *td;
struct extattr_get_link_args /* {
const char *path;
int attrnamespace;
const char *attrname;
void *data;
size_t nbytes;
} */ *uap;
{
struct nameidata nd;
char attrname[EXTATTR_MAXNAMELEN];
int vfslocked, error;
AUDIT_ARG_VALUE(uap->attrnamespace);
error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
if (error)
return (error);
AUDIT_ARG_TEXT(attrname);
NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW | AUDITVNODE1, UIO_USERSPACE,
uap->path, td);
error = namei(&nd);
if (error)
return (error);
NDFREE(&nd, NDF_ONLY_PNBUF);
vfslocked = NDHASGIANT(&nd);
error = extattr_get_vp(nd.ni_vp, uap->attrnamespace, attrname,
uap->data, uap->nbytes, td);
vrele(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* extattr_delete_vp(): Delete a named extended attribute on a file or
* directory
*
* Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
* kernelspace string pointer "attrname", proc "p"
* Returns: 0 on success, an error number otherwise
* Locks: none
* References: vp must be a valid reference for the duration of the call
*/
static int
extattr_delete_vp(struct vnode *vp, int attrnamespace, const char *attrname,
struct thread *td)
{
struct mount *mp;
int error;
VFS_ASSERT_GIANT(vp->v_mount);
error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
if (error)
return (error);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
#ifdef MAC
error = mac_vnode_check_deleteextattr(td->td_ucred, vp, attrnamespace,
attrname);
if (error)
goto done;
#endif
error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, td->td_ucred,
td);
if (error == EOPNOTSUPP)
error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
td->td_ucred, td);
#ifdef MAC
done:
#endif
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
return (error);
}
int
-extattr_delete_fd(td, uap)
+sys_extattr_delete_fd(td, uap)
struct thread *td;
struct extattr_delete_fd_args /* {
int fd;
int attrnamespace;
const char *attrname;
} */ *uap;
{
struct file *fp;
char attrname[EXTATTR_MAXNAMELEN];
int vfslocked, error;
AUDIT_ARG_FD(uap->fd);
AUDIT_ARG_VALUE(uap->attrnamespace);
error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
if (error)
return (error);
AUDIT_ARG_TEXT(attrname);
error = getvnode(td->td_proc->p_fd, uap->fd, CAP_EXTATTR_DELETE,
&fp);
if (error)
return (error);
vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
error = extattr_delete_vp(fp->f_vnode, uap->attrnamespace,
attrname, td);
fdrop(fp, td);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
int
-extattr_delete_file(td, uap)
+sys_extattr_delete_file(td, uap)
struct thread *td;
struct extattr_delete_file_args /* {
const char *path;
int attrnamespace;
const char *attrname;
} */ *uap;
{
struct nameidata nd;
char attrname[EXTATTR_MAXNAMELEN];
int vfslocked, error;
AUDIT_ARG_VALUE(uap->attrnamespace);
error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
if (error)
return (error);
AUDIT_ARG_TEXT(attrname);
NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
uap->path, td);
error = namei(&nd);
if (error)
return (error);
NDFREE(&nd, NDF_ONLY_PNBUF);
vfslocked = NDHASGIANT(&nd);
error = extattr_delete_vp(nd.ni_vp, uap->attrnamespace, attrname, td);
vrele(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
int
-extattr_delete_link(td, uap)
+sys_extattr_delete_link(td, uap)
struct thread *td;
struct extattr_delete_link_args /* {
const char *path;
int attrnamespace;
const char *attrname;
} */ *uap;
{
struct nameidata nd;
char attrname[EXTATTR_MAXNAMELEN];
int vfslocked, error;
AUDIT_ARG_VALUE(uap->attrnamespace);
error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
if (error)
return (error);
AUDIT_ARG_TEXT(attrname);
NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW | AUDITVNODE1, UIO_USERSPACE,
uap->path, td);
error = namei(&nd);
if (error)
return (error);
NDFREE(&nd, NDF_ONLY_PNBUF);
vfslocked = NDHASGIANT(&nd);
error = extattr_delete_vp(nd.ni_vp, uap->attrnamespace, attrname, td);
vrele(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*-
* Retrieve a list of extended attributes on a file or directory.
*
* Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
* userspace buffer pointer "data", buffer length "nbytes",
* thread "td".
* Returns: 0 on success, an error number otherwise
* Locks: none
* References: vp must be a valid reference for the duration of the call
*/
static int
extattr_list_vp(struct vnode *vp, int attrnamespace, void *data,
size_t nbytes, struct thread *td)
{
struct uio auio, *auiop;
size_t size, *sizep;
struct iovec aiov;
ssize_t cnt;
int error;
VFS_ASSERT_GIANT(vp->v_mount);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
auiop = NULL;
sizep = NULL;
cnt = 0;
if (data != NULL) {
aiov.iov_base = data;
aiov.iov_len = nbytes;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_offset = 0;
if (nbytes > INT_MAX) {
error = EINVAL;
goto done;
}
auio.uio_resid = nbytes;
auio.uio_rw = UIO_READ;
auio.uio_segflg = UIO_USERSPACE;
auio.uio_td = td;
auiop = &auio;
cnt = nbytes;
} else
sizep = &size;
#ifdef MAC
error = mac_vnode_check_listextattr(td->td_ucred, vp, attrnamespace);
if (error)
goto done;
#endif
error = VOP_LISTEXTATTR(vp, attrnamespace, auiop, sizep,
td->td_ucred, td);
if (auiop != NULL) {
cnt -= auio.uio_resid;
td->td_retval[0] = cnt;
} else
td->td_retval[0] = size;
done:
VOP_UNLOCK(vp, 0);
return (error);
}
int
-extattr_list_fd(td, uap)
+sys_extattr_list_fd(td, uap)
struct thread *td;
struct extattr_list_fd_args /* {
int fd;
int attrnamespace;
void *data;
size_t nbytes;
} */ *uap;
{
struct file *fp;
int vfslocked, error;
AUDIT_ARG_FD(uap->fd);
AUDIT_ARG_VALUE(uap->attrnamespace);
error = getvnode(td->td_proc->p_fd, uap->fd, CAP_EXTATTR_LIST, &fp);
if (error)
return (error);
vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
error = extattr_list_vp(fp->f_vnode, uap->attrnamespace, uap->data,
uap->nbytes, td);
fdrop(fp, td);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
int
-extattr_list_file(td, uap)
+sys_extattr_list_file(td, uap)
struct thread *td;
struct extattr_list_file_args /* {
const char *path;
int attrnamespace;
void *data;
size_t nbytes;
} */ *uap;
{
struct nameidata nd;
int vfslocked, error;
AUDIT_ARG_VALUE(uap->attrnamespace);
NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
uap->path, td);
error = namei(&nd);
if (error)
return (error);
NDFREE(&nd, NDF_ONLY_PNBUF);
vfslocked = NDHASGIANT(&nd);
error = extattr_list_vp(nd.ni_vp, uap->attrnamespace, uap->data,
uap->nbytes, td);
vrele(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
int
-extattr_list_link(td, uap)
+sys_extattr_list_link(td, uap)
struct thread *td;
struct extattr_list_link_args /* {
const char *path;
int attrnamespace;
void *data;
size_t nbytes;
} */ *uap;
{
struct nameidata nd;
int vfslocked, error;
AUDIT_ARG_VALUE(uap->attrnamespace);
NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW | AUDITVNODE1, UIO_USERSPACE,
uap->path, td);
error = namei(&nd);
if (error)
return (error);
NDFREE(&nd, NDF_ONLY_PNBUF);
vfslocked = NDHASGIANT(&nd);
error = extattr_list_vp(nd.ni_vp, uap->attrnamespace, uap->data,
uap->nbytes, td);
vrele(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
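/*
 * Userland sketch (illustration only, not part of this change): the
 * renamed handlers above keep the existing extattr(2) interface from
 * <sys/extattr.h>.  Passing a NULL "data" pointer exercises the "sizep"
 * path in extattr_get_vp()/extattr_list_vp() and returns only the size
 * needed.  The path and attribute name below are placeholders.
 */
#include <sys/types.h>
#include <sys/extattr.h>
#include <err.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	ssize_t nbytes;
	char *buf;

	/* First call: NULL buffer asks only for the attribute's size. */
	nbytes = extattr_get_file("/tmp/example", EXTATTR_NAMESPACE_USER,
	    "comment", NULL, 0);
	if (nbytes < 0)
		err(1, "extattr_get_file(size)");
	if ((buf = malloc(nbytes)) == NULL)
		err(1, "malloc");
	/* Second call: fetch the value into the sized buffer. */
	nbytes = extattr_get_file("/tmp/example", EXTATTR_NAMESPACE_USER,
	    "comment", buf, nbytes);
	if (nbytes < 0)
		err(1, "extattr_get_file");
	printf("user.comment is %zd bytes\n", nbytes);
	free(buf);
	return (0);
}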
Index: head/sys/kern/vfs_mount.c
===================================================================
--- head/sys/kern/vfs_mount.c (revision 225616)
+++ head/sys/kern/vfs_mount.c (revision 225617)
@@ -1,1958 +1,1958 @@
/*-
* Copyright (c) 1999-2004 Poul-Henning Kamp
* Copyright (c) 1999 Michael Smith
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/libkern.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/filedesc.h>
#include <sys/reboot.h>
#include <sys/sbuf.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <vm/uma.h>
#include <geom/geom.h>
#include <machine/stdarg.h>
#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>
#define VFS_MOUNTARG_SIZE_MAX (1024 * 64)
static int vfs_domount(struct thread *td, const char *fstype,
char *fspath, int fsflags, struct vfsoptlist **optlist);
static void free_mntarg(struct mntarg *ma);
static int usermount = 0;
SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0,
"Unprivileged users may mount and unmount file systems");
MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure");
MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
static uma_zone_t mount_zone;
/* List of mounted filesystems. */
struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist);
/* For any iteration/modification of mountlist */
struct mtx mountlist_mtx;
MTX_SYSINIT(mountlist, &mountlist_mtx, "mountlist", MTX_DEF);
/*
* Global opts, taken by all filesystems
*/
static const char *global_opts[] = {
"errmsg",
"fstype",
"fspath",
"ro",
"rw",
"nosuid",
"noexec",
NULL
};
static int
mount_init(void *mem, int size, int flags)
{
struct mount *mp;
mp = (struct mount *)mem;
mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF);
lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0);
return (0);
}
static void
mount_fini(void *mem, int size)
{
struct mount *mp;
mp = (struct mount *)mem;
lockdestroy(&mp->mnt_explock);
mtx_destroy(&mp->mnt_mtx);
}
static void
vfs_mount_init(void *dummy __unused)
{
mount_zone = uma_zcreate("Mountpoints", sizeof(struct mount), NULL,
NULL, mount_init, mount_fini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
}
SYSINIT(vfs_mount, SI_SUB_VFS, SI_ORDER_ANY, vfs_mount_init, NULL);
/*
* ---------------------------------------------------------------------
* Functions for building and sanitizing the mount options
*/
/* Remove one mount option. */
static void
vfs_freeopt(struct vfsoptlist *opts, struct vfsopt *opt)
{
TAILQ_REMOVE(opts, opt, link);
free(opt->name, M_MOUNT);
if (opt->value != NULL)
free(opt->value, M_MOUNT);
free(opt, M_MOUNT);
}
/* Release all resources related to the mount options. */
void
vfs_freeopts(struct vfsoptlist *opts)
{
struct vfsopt *opt;
while (!TAILQ_EMPTY(opts)) {
opt = TAILQ_FIRST(opts);
vfs_freeopt(opts, opt);
}
free(opts, M_MOUNT);
}
void
vfs_deleteopt(struct vfsoptlist *opts, const char *name)
{
struct vfsopt *opt, *temp;
if (opts == NULL)
return;
TAILQ_FOREACH_SAFE(opt, opts, link, temp) {
if (strcmp(opt->name, name) == 0)
vfs_freeopt(opts, opt);
}
}
static int
vfs_isopt_ro(const char *opt)
{
if (strcmp(opt, "ro") == 0 || strcmp(opt, "rdonly") == 0 ||
strcmp(opt, "norw") == 0)
return (1);
return (0);
}
static int
vfs_isopt_rw(const char *opt)
{
if (strcmp(opt, "rw") == 0 || strcmp(opt, "noro") == 0)
return (1);
return (0);
}
/*
* Check if options are equal (with or without the "no" prefix).
*/
static int
vfs_equalopts(const char *opt1, const char *opt2)
{
char *p;
/* "opt" vs. "opt" or "noopt" vs. "noopt" */
if (strcmp(opt1, opt2) == 0)
return (1);
/* "noopt" vs. "opt" */
if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0)
return (1);
/* "opt" vs. "noopt" */
if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0)
return (1);
while ((p = strchr(opt1, '.')) != NULL &&
!strncmp(opt1, opt2, ++p - opt1)) {
opt2 += p - opt1;
opt1 = p;
/* "foo.noopt" vs. "foo.opt" */
if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0)
return (1);
/* "foo.opt" vs. "foo.noopt" */
if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0)
return (1);
}
/* "ro" / "rdonly" / "norw" / "rw" / "noro" */
if ((vfs_isopt_ro(opt1) || vfs_isopt_rw(opt1)) &&
(vfs_isopt_ro(opt2) || vfs_isopt_rw(opt2)))
return (1);
return (0);
}
/*
* If a mount option is specified several times
* (with or without the "no" prefix), only keep
* the last occurrence of it.
*/
static void
vfs_sanitizeopts(struct vfsoptlist *opts)
{
struct vfsopt *opt, *opt2, *tmp;
TAILQ_FOREACH_REVERSE(opt, opts, vfsoptlist, link) {
opt2 = TAILQ_PREV(opt, vfsoptlist, link);
while (opt2 != NULL) {
if (vfs_equalopts(opt->name, opt2->name)) {
tmp = TAILQ_PREV(opt2, vfsoptlist, link);
vfs_freeopt(opts, opt2);
opt2 = tmp;
} else {
opt2 = TAILQ_PREV(opt2, vfsoptlist, link);
}
}
}
}
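/*
 * Concrete pairs, for illustration of the two routines above: "noatime"
 * and "atime", "foo.noexec" and "foo.exec", and any mix of "ro"/"rdonly"/
 * "norw" with "rw"/"noro" all compare equal in vfs_equalopts(), so an
 * option list built from "ro,noatime,rw,atime" is sanitized down to the
 * last occurrence of each group, i.e. "rw,atime".
 */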
/*
* Build a linked list of mount options from a struct uio.
*/
int
vfs_buildopts(struct uio *auio, struct vfsoptlist **options)
{
struct vfsoptlist *opts;
struct vfsopt *opt;
size_t memused, namelen, optlen;
unsigned int i, iovcnt;
int error;
opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK);
TAILQ_INIT(opts);
memused = 0;
iovcnt = auio->uio_iovcnt;
for (i = 0; i < iovcnt; i += 2) {
namelen = auio->uio_iov[i].iov_len;
optlen = auio->uio_iov[i + 1].iov_len;
memused += sizeof(struct vfsopt) + optlen + namelen;
/*
* Avoid consuming too much memory, and guard against attempts
* to overflow memused.
*/
if (memused > VFS_MOUNTARG_SIZE_MAX ||
optlen > VFS_MOUNTARG_SIZE_MAX ||
namelen > VFS_MOUNTARG_SIZE_MAX) {
error = EINVAL;
goto bad;
}
opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
opt->name = malloc(namelen, M_MOUNT, M_WAITOK);
opt->value = NULL;
opt->len = 0;
opt->pos = i / 2;
opt->seen = 0;
/*
* Do this early, so jumps to "bad" will free the current
* option.
*/
TAILQ_INSERT_TAIL(opts, opt, link);
if (auio->uio_segflg == UIO_SYSSPACE) {
bcopy(auio->uio_iov[i].iov_base, opt->name, namelen);
} else {
error = copyin(auio->uio_iov[i].iov_base, opt->name,
namelen);
if (error)
goto bad;
}
/* Ensure names are null-terminated strings. */
if (namelen == 0 || opt->name[namelen - 1] != '\0') {
error = EINVAL;
goto bad;
}
if (optlen != 0) {
opt->len = optlen;
opt->value = malloc(optlen, M_MOUNT, M_WAITOK);
if (auio->uio_segflg == UIO_SYSSPACE) {
bcopy(auio->uio_iov[i + 1].iov_base, opt->value,
optlen);
} else {
error = copyin(auio->uio_iov[i + 1].iov_base,
opt->value, optlen);
if (error)
goto bad;
}
}
}
vfs_sanitizeopts(opts);
*options = opts;
return (0);
bad:
vfs_freeopts(opts);
return (error);
}
/*
* Merge the old mount options with the new ones passed
* in the MNT_UPDATE case.
*
* XXX: This function will keep a "nofoo" option in the new
* options. E.g, if the option's canonical name is "foo",
* "nofoo" ends up in the mount point's active options.
*/
static void
vfs_mergeopts(struct vfsoptlist *toopts, struct vfsoptlist *oldopts)
{
struct vfsopt *opt, *new;
TAILQ_FOREACH(opt, oldopts, link) {
new = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
new->name = strdup(opt->name, M_MOUNT);
if (opt->len != 0) {
new->value = malloc(opt->len, M_MOUNT, M_WAITOK);
bcopy(opt->value, new->value, opt->len);
} else
new->value = NULL;
new->len = opt->len;
new->seen = opt->seen;
TAILQ_INSERT_HEAD(toopts, new, link);
}
vfs_sanitizeopts(toopts);
}
/*
* Mount a filesystem.
*/
int
-nmount(td, uap)
+sys_nmount(td, uap)
struct thread *td;
struct nmount_args /* {
struct iovec *iovp;
unsigned int iovcnt;
int flags;
} */ *uap;
{
struct uio *auio;
int error;
u_int iovcnt;
AUDIT_ARG_FFLAGS(uap->flags);
CTR4(KTR_VFS, "%s: iovp %p with iovcnt %d and flags %d", __func__,
uap->iovp, uap->iovcnt, uap->flags);
/*
* Filter out MNT_ROOTFS. We do not want clients of nmount() in
* userspace to set this flag, but we must filter it out if we want
* MNT_UPDATE on the root file system to work.
* MNT_ROOTFS should only be set by the kernel when mounting its
* root file system.
*/
uap->flags &= ~MNT_ROOTFS;
iovcnt = uap->iovcnt;
/*
* Check that we have an even number of iovecs
* and that we have at least two options.
*/
if ((iovcnt & 1) || (iovcnt < 4)) {
CTR2(KTR_VFS, "%s: failed for invalid iovcnt %d", __func__,
uap->iovcnt);
return (EINVAL);
}
error = copyinuio(uap->iovp, iovcnt, &auio);
if (error) {
CTR2(KTR_VFS, "%s: failed for invalid uio op with %d errno",
__func__, error);
return (error);
}
error = vfs_donmount(td, uap->flags, auio);
free(auio, M_IOV);
return (error);
}
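/*
 * Userland sketch (illustration only, not part of this change): nmount(2)
 * consumes name/value iovec pairs, which is why the handler above insists
 * on an even iovcnt of at least four ("fstype" and "fspath" plus their
 * values).  The filesystem type, mount point and lower layer below are
 * placeholders.
 */
#include <sys/param.h>
#include <sys/uio.h>
#include <sys/mount.h>
#include <err.h>
#include <string.h>

static void
add_opt(struct iovec *iov, int *i, const char *name, const char *val)
{
	/* Names and values are NUL-terminated strings, NUL included. */
	iov[*i].iov_base = __DECONST(char *, name);
	iov[*i].iov_len = strlen(name) + 1;
	(*i)++;
	iov[*i].iov_base = __DECONST(char *, val);
	iov[*i].iov_len = strlen(val) + 1;
	(*i)++;
}

int
main(void)
{
	struct iovec iov[6];
	int i = 0;

	add_opt(iov, &i, "fstype", "nullfs");
	add_opt(iov, &i, "fspath", "/mnt");
	add_opt(iov, &i, "target", "/usr/src");
	if (nmount(iov, i, MNT_RDONLY) == -1)
		err(1, "nmount");
	return (0);
}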
/*
* ---------------------------------------------------------------------
* Various utility functions
*/
void
vfs_ref(struct mount *mp)
{
CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
MNT_ILOCK(mp);
MNT_REF(mp);
MNT_IUNLOCK(mp);
}
void
vfs_rel(struct mount *mp)
{
CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
MNT_ILOCK(mp);
MNT_REL(mp);
MNT_IUNLOCK(mp);
}
/*
* Allocate and initialize the mount point struct.
*/
struct mount *
vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath,
struct ucred *cred)
{
struct mount *mp;
mp = uma_zalloc(mount_zone, M_WAITOK);
bzero(&mp->mnt_startzero,
__rangeof(struct mount, mnt_startzero, mnt_endzero));
TAILQ_INIT(&mp->mnt_nvnodelist);
mp->mnt_nvnodelistsize = 0;
mp->mnt_ref = 0;
(void) vfs_busy(mp, MBF_NOWAIT);
mp->mnt_op = vfsp->vfc_vfsops;
mp->mnt_vfc = vfsp;
vfsp->vfc_refcount++; /* XXX Unlocked */
mp->mnt_stat.f_type = vfsp->vfc_typenum;
mp->mnt_gen++;
strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
mp->mnt_vnodecovered = vp;
mp->mnt_cred = crdup(cred);
mp->mnt_stat.f_owner = cred->cr_uid;
strlcpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN);
mp->mnt_iosize_max = DFLTPHYS;
#ifdef MAC
mac_mount_init(mp);
mac_mount_create(cred, mp);
#endif
arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0);
return (mp);
}
/*
* Destroy the mount struct previously allocated by vfs_mount_alloc().
*/
void
vfs_mount_destroy(struct mount *mp)
{
MNT_ILOCK(mp);
mp->mnt_kern_flag |= MNTK_REFEXPIRE;
if (mp->mnt_kern_flag & MNTK_MWAIT) {
mp->mnt_kern_flag &= ~MNTK_MWAIT;
wakeup(mp);
}
while (mp->mnt_ref)
msleep(mp, MNT_MTX(mp), PVFS, "mntref", 0);
KASSERT(mp->mnt_ref == 0,
("%s: invalid refcount in the drain path @ %s:%d", __func__,
__FILE__, __LINE__));
if (mp->mnt_writeopcount != 0)
panic("vfs_mount_destroy: nonzero writeopcount");
if (mp->mnt_secondary_writes != 0)
panic("vfs_mount_destroy: nonzero secondary_writes");
mp->mnt_vfc->vfc_refcount--;
if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) {
struct vnode *vp;
TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes)
vprint("", vp);
panic("unmount: dangling vnode");
}
if (mp->mnt_nvnodelistsize != 0)
panic("vfs_mount_destroy: nonzero nvnodelistsize");
if (mp->mnt_lockref != 0)
panic("vfs_mount_destroy: nonzero lock refcount");
MNT_IUNLOCK(mp);
#ifdef MAC
mac_mount_destroy(mp);
#endif
if (mp->mnt_opt != NULL)
vfs_freeopts(mp->mnt_opt);
crfree(mp->mnt_cred);
uma_zfree(mount_zone, mp);
}
int
vfs_donmount(struct thread *td, int fsflags, struct uio *fsoptions)
{
struct vfsoptlist *optlist;
struct vfsopt *opt, *tmp_opt;
char *fstype, *fspath, *errmsg;
int error, fstypelen, fspathlen, errmsg_len, errmsg_pos;
errmsg = fspath = NULL;
errmsg_len = fspathlen = 0;
errmsg_pos = -1;
error = vfs_buildopts(fsoptions, &optlist);
if (error)
return (error);
if (vfs_getopt(optlist, "errmsg", (void **)&errmsg, &errmsg_len) == 0)
errmsg_pos = vfs_getopt_pos(optlist, "errmsg");
/*
* We need these two options before the others,
* and they are mandatory for any filesystem.
* Ensure they are NUL terminated as well.
*/
fstypelen = 0;
error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen);
if (error || fstype[fstypelen - 1] != '\0') {
error = EINVAL;
if (errmsg != NULL)
strncpy(errmsg, "Invalid fstype", errmsg_len);
goto bail;
}
fspathlen = 0;
error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen);
if (error || fspath[fspathlen - 1] != '\0') {
error = EINVAL;
if (errmsg != NULL)
strncpy(errmsg, "Invalid fspath", errmsg_len);
goto bail;
}
/*
* We need to see if we have the "update" option
* before we call vfs_domount(), since vfs_domount() has special
* logic based on MNT_UPDATE. This is very important
* when we want to update the root filesystem.
*/
TAILQ_FOREACH_SAFE(opt, optlist, link, tmp_opt) {
if (strcmp(opt->name, "update") == 0) {
fsflags |= MNT_UPDATE;
vfs_freeopt(optlist, opt);
}
else if (strcmp(opt->name, "async") == 0)
fsflags |= MNT_ASYNC;
else if (strcmp(opt->name, "force") == 0) {
fsflags |= MNT_FORCE;
vfs_freeopt(optlist, opt);
}
else if (strcmp(opt->name, "reload") == 0) {
fsflags |= MNT_RELOAD;
vfs_freeopt(optlist, opt);
}
else if (strcmp(opt->name, "multilabel") == 0)
fsflags |= MNT_MULTILABEL;
else if (strcmp(opt->name, "noasync") == 0)
fsflags &= ~MNT_ASYNC;
else if (strcmp(opt->name, "noatime") == 0)
fsflags |= MNT_NOATIME;
else if (strcmp(opt->name, "atime") == 0) {
free(opt->name, M_MOUNT);
opt->name = strdup("nonoatime", M_MOUNT);
}
else if (strcmp(opt->name, "noclusterr") == 0)
fsflags |= MNT_NOCLUSTERR;
else if (strcmp(opt->name, "clusterr") == 0) {
free(opt->name, M_MOUNT);
opt->name = strdup("nonoclusterr", M_MOUNT);
}
else if (strcmp(opt->name, "noclusterw") == 0)
fsflags |= MNT_NOCLUSTERW;
else if (strcmp(opt->name, "clusterw") == 0) {
free(opt->name, M_MOUNT);
opt->name = strdup("nonoclusterw", M_MOUNT);
}
else if (strcmp(opt->name, "noexec") == 0)
fsflags |= MNT_NOEXEC;
else if (strcmp(opt->name, "exec") == 0) {
free(opt->name, M_MOUNT);
opt->name = strdup("nonoexec", M_MOUNT);
}
else if (strcmp(opt->name, "nosuid") == 0)
fsflags |= MNT_NOSUID;
else if (strcmp(opt->name, "suid") == 0) {
free(opt->name, M_MOUNT);
opt->name = strdup("nonosuid", M_MOUNT);
}
else if (strcmp(opt->name, "nosymfollow") == 0)
fsflags |= MNT_NOSYMFOLLOW;
else if (strcmp(opt->name, "symfollow") == 0) {
free(opt->name, M_MOUNT);
opt->name = strdup("nonosymfollow", M_MOUNT);
}
else if (strcmp(opt->name, "noro") == 0)
fsflags &= ~MNT_RDONLY;
else if (strcmp(opt->name, "rw") == 0)
fsflags &= ~MNT_RDONLY;
else if (strcmp(opt->name, "ro") == 0)
fsflags |= MNT_RDONLY;
else if (strcmp(opt->name, "rdonly") == 0) {
free(opt->name, M_MOUNT);
opt->name = strdup("ro", M_MOUNT);
fsflags |= MNT_RDONLY;
}
else if (strcmp(opt->name, "suiddir") == 0)
fsflags |= MNT_SUIDDIR;
else if (strcmp(opt->name, "sync") == 0)
fsflags |= MNT_SYNCHRONOUS;
else if (strcmp(opt->name, "union") == 0)
fsflags |= MNT_UNION;
}
/*
* Be ultra-paranoid about making sure the type and fspath
* variables will fit in our mp buffers, including the
* terminating NUL.
*/
if (fstypelen >= MFSNAMELEN - 1 || fspathlen >= MNAMELEN - 1) {
error = ENAMETOOLONG;
goto bail;
}
error = vfs_domount(td, fstype, fspath, fsflags, &optlist);
bail:
/* copyout the errmsg */
if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt)
&& errmsg_len > 0 && errmsg != NULL) {
if (fsoptions->uio_segflg == UIO_SYSSPACE) {
bcopy(errmsg,
fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
} else {
copyout(errmsg,
fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
}
}
if (optlist != NULL)
vfs_freeopts(optlist);
return (error);
}
/*
* Old mount API.
*/
#ifndef _SYS_SYSPROTO_H_
struct mount_args {
char *type;
char *path;
int flags;
caddr_t data;
};
#endif
/* ARGSUSED */
int
-mount(td, uap)
+sys_mount(td, uap)
struct thread *td;
struct mount_args /* {
char *type;
char *path;
int flags;
caddr_t data;
} */ *uap;
{
char *fstype;
struct vfsconf *vfsp = NULL;
struct mntarg *ma = NULL;
int error;
AUDIT_ARG_FFLAGS(uap->flags);
/*
* Filter out MNT_ROOTFS. We do not want clients of mount() in
* userspace to set this flag, but we must filter it out if we want
* MNT_UPDATE on the root file system to work.
* MNT_ROOTFS should only be set by the kernel when mounting its
* root file system.
*/
uap->flags &= ~MNT_ROOTFS;
fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK);
error = copyinstr(uap->type, fstype, MFSNAMELEN, NULL);
if (error) {
free(fstype, M_TEMP);
return (error);
}
AUDIT_ARG_TEXT(fstype);
mtx_lock(&Giant);
vfsp = vfs_byname_kld(fstype, td, &error);
free(fstype, M_TEMP);
if (vfsp == NULL) {
mtx_unlock(&Giant);
return (ENOENT);
}
if (vfsp->vfc_vfsops->vfs_cmount == NULL) {
mtx_unlock(&Giant);
return (EOPNOTSUPP);
}
ma = mount_argsu(ma, "fstype", uap->type, MNAMELEN);
ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN);
ma = mount_argb(ma, uap->flags & MNT_RDONLY, "noro");
ma = mount_argb(ma, !(uap->flags & MNT_NOSUID), "nosuid");
ma = mount_argb(ma, !(uap->flags & MNT_NOEXEC), "noexec");
error = vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, uap->flags);
mtx_unlock(&Giant);
return (error);
}
/*
* vfs_domount_first(): first file system mount (not update)
*/
static int
vfs_domount_first(
struct thread *td, /* Calling thread. */
struct vfsconf *vfsp, /* File system type. */
char *fspath, /* Mount path. */
struct vnode *vp, /* Vnode to be covered. */
int fsflags, /* Flags common to all filesystems. */
struct vfsoptlist **optlist /* Options local to the filesystem. */
)
{
struct vattr va;
struct mount *mp;
struct vnode *newdp;
int error;
mtx_assert(&Giant, MA_OWNED);
ASSERT_VOP_ELOCKED(vp, __func__);
KASSERT((fsflags & MNT_UPDATE) == 0, ("MNT_UPDATE shouldn't be here"));
/*
* If the user is not root, ensure that they own the directory
* onto which we are attempting to mount.
*/
error = VOP_GETATTR(vp, &va, td->td_ucred);
if (error == 0 && va.va_uid != td->td_ucred->cr_uid)
error = priv_check_cred(td->td_ucred, PRIV_VFS_ADMIN, 0);
if (error == 0)
error = vinvalbuf(vp, V_SAVE, 0, 0);
if (error == 0 && vp->v_type != VDIR)
error = ENOTDIR;
if (error == 0) {
VI_LOCK(vp);
if ((vp->v_iflag & VI_MOUNT) == 0 && vp->v_mountedhere == NULL)
vp->v_iflag |= VI_MOUNT;
else
error = EBUSY;
VI_UNLOCK(vp);
}
if (error != 0) {
vput(vp);
return (error);
}
VOP_UNLOCK(vp, 0);
/* Allocate and initialize the filesystem. */
mp = vfs_mount_alloc(vp, vfsp, fspath, td->td_ucred);
/* XXXMAC: pass to vfs_mount_alloc? */
mp->mnt_optnew = *optlist;
/* Set the mount level flags. */
mp->mnt_flag = (fsflags & (MNT_UPDATEMASK | MNT_ROOTFS | MNT_RDONLY));
/*
* Mount the filesystem.
* XXX The final recipients of VFS_MOUNT just overwrite the ndp they
* get. No freeing of cn_pnbuf.
*/
error = VFS_MOUNT(mp);
if (error != 0) {
vfs_unbusy(mp);
vfs_mount_destroy(mp);
VI_LOCK(vp);
vp->v_iflag &= ~VI_MOUNT;
VI_UNLOCK(vp);
vrele(vp);
return (error);
}
if (mp->mnt_opt != NULL)
vfs_freeopts(mp->mnt_opt);
mp->mnt_opt = mp->mnt_optnew;
*optlist = NULL;
(void)VFS_STATFS(mp, &mp->mnt_stat);
/*
* Prevent external consumers of mount options from reading mnt_optnew.
*/
mp->mnt_optnew = NULL;
MNT_ILOCK(mp);
if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
mp->mnt_kern_flag |= MNTK_ASYNC;
else
mp->mnt_kern_flag &= ~MNTK_ASYNC;
MNT_IUNLOCK(mp);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
cache_purge(vp);
VI_LOCK(vp);
vp->v_iflag &= ~VI_MOUNT;
VI_UNLOCK(vp);
vp->v_mountedhere = mp;
/* Place the new filesystem at the end of the mount list. */
mtx_lock(&mountlist_mtx);
TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
mtx_unlock(&mountlist_mtx);
vfs_event_signal(NULL, VQ_MOUNT, 0);
if (VFS_ROOT(mp, LK_EXCLUSIVE, &newdp))
panic("mount: lost mount");
VOP_UNLOCK(newdp, 0);
VOP_UNLOCK(vp, 0);
mountcheckdirs(vp, newdp);
vrele(newdp);
if ((mp->mnt_flag & MNT_RDONLY) == 0)
vfs_allocate_syncvnode(mp);
vfs_unbusy(mp);
return (0);
}
/*
* vfs_domount_update(): update of mounted file system
*/
static int
vfs_domount_update(
struct thread *td, /* Calling thread. */
struct vnode *vp, /* Mount point vnode. */
int fsflags, /* Flags common to all filesystems. */
struct vfsoptlist **optlist /* Options local to the filesystem. */
)
{
struct oexport_args oexport;
struct export_args export;
struct mount *mp;
int error, export_error, flag;
mtx_assert(&Giant, MA_OWNED);
ASSERT_VOP_ELOCKED(vp, __func__);
KASSERT((fsflags & MNT_UPDATE) != 0, ("MNT_UPDATE should be here"));
if ((vp->v_vflag & VV_ROOT) == 0) {
vput(vp);
return (EINVAL);
}
mp = vp->v_mount;
/*
* We only allow the filesystem to be reloaded if it
* is currently mounted read-only.
*/
flag = mp->mnt_flag;
if ((fsflags & MNT_RELOAD) != 0 && (flag & MNT_RDONLY) == 0) {
vput(vp);
return (EOPNOTSUPP); /* Needs translation */
}
/*
* Only privileged root, or (if MNT_USER is set) the user that
* did the original mount is permitted to update it.
*/
error = vfs_suser(mp, td);
if (error != 0) {
vput(vp);
return (error);
}
if (vfs_busy(mp, MBF_NOWAIT)) {
vput(vp);
return (EBUSY);
}
VI_LOCK(vp);
if ((vp->v_iflag & VI_MOUNT) != 0 || vp->v_mountedhere != NULL) {
VI_UNLOCK(vp);
vfs_unbusy(mp);
vput(vp);
return (EBUSY);
}
vp->v_iflag |= VI_MOUNT;
VI_UNLOCK(vp);
VOP_UNLOCK(vp, 0);
MNT_ILOCK(mp);
mp->mnt_flag &= ~MNT_UPDATEMASK;
mp->mnt_flag |= fsflags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE |
MNT_SNAPSHOT | MNT_ROOTFS | MNT_UPDATEMASK | MNT_RDONLY);
if ((mp->mnt_flag & MNT_ASYNC) == 0)
mp->mnt_kern_flag &= ~MNTK_ASYNC;
MNT_IUNLOCK(mp);
mp->mnt_optnew = *optlist;
vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt);
/*
* Mount the filesystem.
* XXX The final recipients of VFS_MOUNT just overwrite the ndp they
* get. No freeing of cn_pnbuf.
*/
error = VFS_MOUNT(mp);
export_error = 0;
if (error == 0) {
/* Process the export option. */
if (vfs_copyopt(mp->mnt_optnew, "export", &export,
sizeof(export)) == 0) {
export_error = vfs_export(mp, &export);
} else if (vfs_copyopt(mp->mnt_optnew, "export", &oexport,
sizeof(oexport)) == 0) {
export.ex_flags = oexport.ex_flags;
export.ex_root = oexport.ex_root;
export.ex_anon = oexport.ex_anon;
export.ex_addr = oexport.ex_addr;
export.ex_addrlen = oexport.ex_addrlen;
export.ex_mask = oexport.ex_mask;
export.ex_masklen = oexport.ex_masklen;
export.ex_indexfile = oexport.ex_indexfile;
export.ex_numsecflavors = 0;
export_error = vfs_export(mp, &export);
}
}
MNT_ILOCK(mp);
if (error == 0) {
mp->mnt_flag &= ~(MNT_UPDATE | MNT_RELOAD | MNT_FORCE |
MNT_SNAPSHOT);
} else {
/*
* If we fail, restore old mount flags. MNT_QUOTA is special,
* because it is not part of MNT_UPDATEMASK, but it could have
* changed in the meantime if quotactl(2) was called.
* All in all we want current value of MNT_QUOTA, not the old
* one.
*/
mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA);
}
if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
mp->mnt_kern_flag |= MNTK_ASYNC;
else
mp->mnt_kern_flag &= ~MNTK_ASYNC;
MNT_IUNLOCK(mp);
if (error != 0)
goto end;
if (mp->mnt_opt != NULL)
vfs_freeopts(mp->mnt_opt);
mp->mnt_opt = mp->mnt_optnew;
*optlist = NULL;
(void)VFS_STATFS(mp, &mp->mnt_stat);
/*
* Prevent external consumers of mount options from reading
* mnt_optnew.
*/
mp->mnt_optnew = NULL;
if ((mp->mnt_flag & MNT_RDONLY) == 0)
vfs_allocate_syncvnode(mp);
else
vfs_deallocate_syncvnode(mp);
end:
vfs_unbusy(mp);
VI_LOCK(vp);
vp->v_iflag &= ~VI_MOUNT;
VI_UNLOCK(vp);
vrele(vp);
return (error != 0 ? error : export_error);
}
/*
* vfs_domount(): actually attempt a filesystem mount.
*/
static int
vfs_domount(
struct thread *td, /* Calling thread. */
const char *fstype, /* Filesystem type. */
char *fspath, /* Mount path. */
int fsflags, /* Flags common to all filesystems. */
struct vfsoptlist **optlist /* Options local to the filesystem. */
)
{
struct vfsconf *vfsp;
struct nameidata nd;
struct vnode *vp;
int error;
/*
* Be ultra-paranoid about making sure the type and fspath
* variables will fit in our mp buffers, including the
* terminating NUL.
*/
if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN)
return (ENAMETOOLONG);
if (jailed(td->td_ucred) || usermount == 0) {
if ((error = priv_check(td, PRIV_VFS_MOUNT)) != 0)
return (error);
}
/*
* Do not allow NFS export or MNT_SUIDDIR by unprivileged users.
*/
if (fsflags & MNT_EXPORTED) {
error = priv_check(td, PRIV_VFS_MOUNT_EXPORTED);
if (error)
return (error);
}
if (fsflags & MNT_SUIDDIR) {
error = priv_check(td, PRIV_VFS_MOUNT_SUIDDIR);
if (error)
return (error);
}
/*
* Silently enforce MNT_NOSUID and MNT_USER for unprivileged users.
*/
if ((fsflags & (MNT_NOSUID | MNT_USER)) != (MNT_NOSUID | MNT_USER)) {
if (priv_check(td, PRIV_VFS_MOUNT_NONUSER) != 0)
fsflags |= MNT_NOSUID | MNT_USER;
}
/* Load KLDs before we lock the covered vnode to avoid reversals. */
vfsp = NULL;
if ((fsflags & MNT_UPDATE) == 0) {
/* Don't try to load KLDs if we're mounting the root. */
if (fsflags & MNT_ROOTFS)
vfsp = vfs_byname(fstype);
else
vfsp = vfs_byname_kld(fstype, td, &error);
if (vfsp == NULL)
return (ENODEV);
if (jailed(td->td_ucred) && !(vfsp->vfc_flags & VFCF_JAIL))
return (EPERM);
}
/*
* Get vnode to be covered or mount point's vnode in case of MNT_UPDATE.
*/
NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
UIO_SYSSPACE, fspath, td);
error = namei(&nd);
if (error != 0)
return (error);
if (!NDHASGIANT(&nd))
mtx_lock(&Giant);
NDFREE(&nd, NDF_ONLY_PNBUF);
vp = nd.ni_vp;
if ((fsflags & MNT_UPDATE) == 0) {
error = vfs_domount_first(td, vfsp, fspath, vp, fsflags,
optlist);
} else {
error = vfs_domount_update(td, vp, fsflags, optlist);
}
mtx_unlock(&Giant);
ASSERT_VI_UNLOCKED(vp, __func__);
ASSERT_VOP_UNLOCKED(vp, __func__);
return (error);
}
/*
* Unmount a filesystem.
*
* Note: unmount takes a path to the vnode mounted on as argument, not
* the special file (as before).
*/
#ifndef _SYS_SYSPROTO_H_
struct unmount_args {
char *path;
int flags;
};
#endif
/* ARGSUSED */
int
-unmount(td, uap)
+sys_unmount(td, uap)
struct thread *td;
register struct unmount_args /* {
char *path;
int flags;
} */ *uap;
{
struct mount *mp;
char *pathbuf;
int error, id0, id1;
AUDIT_ARG_VALUE(uap->flags);
if (jailed(td->td_ucred) || usermount == 0) {
error = priv_check(td, PRIV_VFS_UNMOUNT);
if (error)
return (error);
}
pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK);
error = copyinstr(uap->path, pathbuf, MNAMELEN, NULL);
if (error) {
free(pathbuf, M_TEMP);
return (error);
}
mtx_lock(&Giant);
if (uap->flags & MNT_BYFSID) {
AUDIT_ARG_TEXT(pathbuf);
/* Decode the filesystem ID. */
if (sscanf(pathbuf, "FSID:%d:%d", &id0, &id1) != 2) {
mtx_unlock(&Giant);
free(pathbuf, M_TEMP);
return (EINVAL);
}
mtx_lock(&mountlist_mtx);
TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
if (mp->mnt_stat.f_fsid.val[0] == id0 &&
mp->mnt_stat.f_fsid.val[1] == id1)
break;
}
mtx_unlock(&mountlist_mtx);
} else {
AUDIT_ARG_UPATH1(td, pathbuf);
mtx_lock(&mountlist_mtx);
TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0)
break;
}
mtx_unlock(&mountlist_mtx);
}
free(pathbuf, M_TEMP);
if (mp == NULL) {
/*
* Previously we returned ENOENT for a nonexistent path and
* EINVAL for a non-mountpoint. We cannot tell these apart
* now, so in the !MNT_BYFSID case return the more likely
* EINVAL for compatibility.
*/
mtx_unlock(&Giant);
return ((uap->flags & MNT_BYFSID) ? ENOENT : EINVAL);
}
/*
* Don't allow unmounting the root filesystem.
*/
if (mp->mnt_flag & MNT_ROOTFS) {
mtx_unlock(&Giant);
return (EINVAL);
}
error = dounmount(mp, uap->flags, td);
mtx_unlock(&Giant);
return (error);
}
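/*
 * Userland sketch of the MNT_BYFSID path handled above (illustration
 * only, not part of this change): the path argument is the literal
 * string "FSID:val0:val1" built from the filesystem id, e.g. as
 * returned by statfs(2).  The mount point below is a placeholder.
 */
#include <sys/param.h>
#include <sys/mount.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	struct statfs sfs;
	char fsid[32];

	if (statfs("/mnt", &sfs) == -1)
		err(1, "statfs");
	snprintf(fsid, sizeof(fsid), "FSID:%d:%d",
	    (int)sfs.f_fsid.val[0], (int)sfs.f_fsid.val[1]);
	if (unmount(fsid, MNT_BYFSID) == -1)
		err(1, "unmount");
	return (0);
}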
/*
* Do the actual filesystem unmount.
*/
int
dounmount(mp, flags, td)
struct mount *mp;
int flags;
struct thread *td;
{
struct vnode *coveredvp, *fsrootvp;
int error;
int async_flag;
int mnt_gen_r;
mtx_assert(&Giant, MA_OWNED);
if ((coveredvp = mp->mnt_vnodecovered) != NULL) {
mnt_gen_r = mp->mnt_gen;
VI_LOCK(coveredvp);
vholdl(coveredvp);
vn_lock(coveredvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY);
vdrop(coveredvp);
/*
* Check for mp being unmounted while waiting for the
* covered vnode lock.
*/
if (coveredvp->v_mountedhere != mp ||
coveredvp->v_mountedhere->mnt_gen != mnt_gen_r) {
VOP_UNLOCK(coveredvp, 0);
return (EBUSY);
}
}
/*
* Only privileged root, or (if MNT_USER is set) the user that did the
* original mount is permitted to unmount this filesystem.
*/
error = vfs_suser(mp, td);
if (error) {
if (coveredvp)
VOP_UNLOCK(coveredvp, 0);
return (error);
}
MNT_ILOCK(mp);
if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
MNT_IUNLOCK(mp);
if (coveredvp)
VOP_UNLOCK(coveredvp, 0);
return (EBUSY);
}
mp->mnt_kern_flag |= MNTK_UNMOUNT | MNTK_NOINSMNTQ;
/* Allow filesystems to detect that a forced unmount is in progress. */
if (flags & MNT_FORCE)
mp->mnt_kern_flag |= MNTK_UNMOUNTF;
error = 0;
if (mp->mnt_lockref) {
if ((flags & MNT_FORCE) == 0) {
mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_NOINSMNTQ |
MNTK_UNMOUNTF);
if (mp->mnt_kern_flag & MNTK_MWAIT) {
mp->mnt_kern_flag &= ~MNTK_MWAIT;
wakeup(mp);
}
MNT_IUNLOCK(mp);
if (coveredvp)
VOP_UNLOCK(coveredvp, 0);
return (EBUSY);
}
mp->mnt_kern_flag |= MNTK_DRAINING;
error = msleep(&mp->mnt_lockref, MNT_MTX(mp), PVFS,
"mount drain", 0);
}
MNT_IUNLOCK(mp);
KASSERT(mp->mnt_lockref == 0,
("%s: invalid lock refcount in the drain path @ %s:%d",
__func__, __FILE__, __LINE__));
KASSERT(error == 0,
("%s: invalid return value for msleep in the drain path @ %s:%d",
__func__, __FILE__, __LINE__));
vn_start_write(NULL, &mp, V_WAIT);
if (mp->mnt_flag & MNT_EXPUBLIC)
vfs_setpublicfs(NULL, NULL, NULL);
vfs_msync(mp, MNT_WAIT);
MNT_ILOCK(mp);
async_flag = mp->mnt_flag & MNT_ASYNC;
mp->mnt_flag &= ~MNT_ASYNC;
mp->mnt_kern_flag &= ~MNTK_ASYNC;
MNT_IUNLOCK(mp);
cache_purgevfs(mp); /* remove cache entries for this file sys */
vfs_deallocate_syncvnode(mp);
/*
* For forced unmounts, move process cdir/rdir refs on the fs root
* vnode to the covered vnode. For non-forced unmounts we want
* such references to cause an EBUSY error.
*/
if ((flags & MNT_FORCE) &&
VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp) == 0) {
if (mp->mnt_vnodecovered != NULL)
mountcheckdirs(fsrootvp, mp->mnt_vnodecovered);
if (fsrootvp == rootvnode) {
vrele(rootvnode);
rootvnode = NULL;
}
vput(fsrootvp);
}
if (((mp->mnt_flag & MNT_RDONLY) ||
(error = VFS_SYNC(mp, MNT_WAIT)) == 0) || (flags & MNT_FORCE) != 0)
error = VFS_UNMOUNT(mp, flags);
vn_finished_write(mp);
/*
* If we failed to flush the dirty blocks for this mount point,
* undo all the cdir/rdir and rootvnode changes we made above.
* Unless we failed to do so because the device is reporting that
* it doesn't exist anymore.
*/
if (error && error != ENXIO) {
if ((flags & MNT_FORCE) &&
VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp) == 0) {
if (mp->mnt_vnodecovered != NULL)
mountcheckdirs(mp->mnt_vnodecovered, fsrootvp);
if (rootvnode == NULL) {
rootvnode = fsrootvp;
vref(rootvnode);
}
vput(fsrootvp);
}
MNT_ILOCK(mp);
mp->mnt_kern_flag &= ~MNTK_NOINSMNTQ;
if ((mp->mnt_flag & MNT_RDONLY) == 0) {
MNT_IUNLOCK(mp);
vfs_allocate_syncvnode(mp);
MNT_ILOCK(mp);
}
mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
mp->mnt_flag |= async_flag;
if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
mp->mnt_kern_flag |= MNTK_ASYNC;
if (mp->mnt_kern_flag & MNTK_MWAIT) {
mp->mnt_kern_flag &= ~MNTK_MWAIT;
wakeup(mp);
}
MNT_IUNLOCK(mp);
if (coveredvp)
VOP_UNLOCK(coveredvp, 0);
return (error);
}
mtx_lock(&mountlist_mtx);
TAILQ_REMOVE(&mountlist, mp, mnt_list);
mtx_unlock(&mountlist_mtx);
if (coveredvp != NULL) {
coveredvp->v_mountedhere = NULL;
vput(coveredvp);
}
vfs_event_signal(NULL, VQ_UNMOUNT, 0);
vfs_mount_destroy(mp);
return (0);
}
/*
* Report errors during filesystem mounting.
*/
void
vfs_mount_error(struct mount *mp, const char *fmt, ...)
{
struct vfsoptlist *moptlist = mp->mnt_optnew;
va_list ap;
int error, len;
char *errmsg;
error = vfs_getopt(moptlist, "errmsg", (void **)&errmsg, &len);
if (error || errmsg == NULL || len <= 0)
return;
va_start(ap, fmt);
vsnprintf(errmsg, (size_t)len, fmt, ap);
va_end(ap);
}
void
vfs_opterror(struct vfsoptlist *opts, const char *fmt, ...)
{
va_list ap;
int error, len;
char *errmsg;
error = vfs_getopt(opts, "errmsg", (void **)&errmsg, &len);
if (error || errmsg == NULL || len <= 0)
return;
va_start(ap, fmt);
vsnprintf(errmsg, (size_t)len, fmt, ap);
va_end(ap);
}
/*
* ---------------------------------------------------------------------
* Functions for querying mount options/arguments from filesystems.
*/
/*
* Check that no unknown options are given
*/
int
vfs_filteropt(struct vfsoptlist *opts, const char **legal)
{
struct vfsopt *opt;
char errmsg[255];
const char **t, *p, *q;
int ret = 0;
TAILQ_FOREACH(opt, opts, link) {
p = opt->name;
q = NULL;
if (p[0] == 'n' && p[1] == 'o')
q = p + 2;
for(t = global_opts; *t != NULL; t++) {
if (strcmp(*t, p) == 0)
break;
if (q != NULL) {
if (strcmp(*t, q) == 0)
break;
}
}
if (*t != NULL)
continue;
for(t = legal; *t != NULL; t++) {
if (strcmp(*t, p) == 0)
break;
if (q != NULL) {
if (strcmp(*t, q) == 0)
break;
}
}
if (*t != NULL)
continue;
snprintf(errmsg, sizeof(errmsg),
"mount option <%s> is unknown", p);
ret = EINVAL;
}
if (ret != 0) {
TAILQ_FOREACH(opt, opts, link) {
if (strcmp(opt->name, "errmsg") == 0) {
strncpy((char *)opt->value, errmsg, opt->len);
break;
}
}
if (opt == NULL)
printf("%s\n", errmsg);
}
return (ret);
}
/*
* Get a mount option by its name.
*
* Return 0 if the option was found, ENOENT otherwise.
* If len is non-NULL it will be filled with the length
* of the option. If buf is non-NULL, it will be filled
* with the address of the option.
*/
int
vfs_getopt(opts, name, buf, len)
struct vfsoptlist *opts;
const char *name;
void **buf;
int *len;
{
struct vfsopt *opt;
KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL"));
TAILQ_FOREACH(opt, opts, link) {
if (strcmp(name, opt->name) == 0) {
opt->seen = 1;
if (len != NULL)
*len = opt->len;
if (buf != NULL)
*buf = opt->value;
return (0);
}
}
return (ENOENT);
}
int
vfs_getopt_pos(struct vfsoptlist *opts, const char *name)
{
struct vfsopt *opt;
if (opts == NULL)
return (-1);
TAILQ_FOREACH(opt, opts, link) {
if (strcmp(name, opt->name) == 0) {
opt->seen = 1;
return (opt->pos);
}
}
return (-1);
}
char *
vfs_getopts(struct vfsoptlist *opts, const char *name, int *error)
{
struct vfsopt *opt;
*error = 0;
TAILQ_FOREACH(opt, opts, link) {
if (strcmp(name, opt->name) != 0)
continue;
opt->seen = 1;
if (opt->len == 0 ||
((char *)opt->value)[opt->len - 1] != '\0') {
*error = EINVAL;
return (NULL);
}
return (opt->value);
}
*error = ENOENT;
return (NULL);
}
int
vfs_flagopt(struct vfsoptlist *opts, const char *name, uint64_t *w,
uint64_t val)
{
struct vfsopt *opt;
TAILQ_FOREACH(opt, opts, link) {
if (strcmp(name, opt->name) == 0) {
opt->seen = 1;
if (w != NULL)
*w |= val;
return (1);
}
}
if (w != NULL)
*w &= ~val;
return (0);
}
int
vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...)
{
va_list ap;
struct vfsopt *opt;
int ret;
KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL"));
TAILQ_FOREACH(opt, opts, link) {
if (strcmp(name, opt->name) != 0)
continue;
opt->seen = 1;
if (opt->len == 0 || opt->value == NULL)
return (0);
if (((char *)opt->value)[opt->len - 1] != '\0')
return (0);
va_start(ap, fmt);
ret = vsscanf(opt->value, fmt, ap);
va_end(ap);
return (ret);
}
return (0);
}
int
vfs_setopt(struct vfsoptlist *opts, const char *name, void *value, int len)
{
struct vfsopt *opt;
TAILQ_FOREACH(opt, opts, link) {
if (strcmp(name, opt->name) != 0)
continue;
opt->seen = 1;
if (opt->value == NULL)
opt->len = len;
else {
if (opt->len != len)
return (EINVAL);
bcopy(value, opt->value, len);
}
return (0);
}
return (ENOENT);
}
int
vfs_setopt_part(struct vfsoptlist *opts, const char *name, void *value, int len)
{
struct vfsopt *opt;
TAILQ_FOREACH(opt, opts, link) {
if (strcmp(name, opt->name) != 0)
continue;
opt->seen = 1;
if (opt->value == NULL)
opt->len = len;
else {
if (opt->len < len)
return (EINVAL);
opt->len = len;
bcopy(value, opt->value, len);
}
return (0);
}
return (ENOENT);
}
int
vfs_setopts(struct vfsoptlist *opts, const char *name, const char *value)
{
struct vfsopt *opt;
TAILQ_FOREACH(opt, opts, link) {
if (strcmp(name, opt->name) != 0)
continue;
opt->seen = 1;
if (opt->value == NULL)
opt->len = strlen(value) + 1;
else if (strlcpy(opt->value, value, opt->len) >= opt->len)
return (EINVAL);
return (0);
}
return (ENOENT);
}
/*
* Find and copy a mount option.
*
* The size of the buffer has to be specified
* in len; if it is not the same length as the
* mount option, EINVAL is returned.
* Returns ENOENT if the option is not found.
*/
int
vfs_copyopt(opts, name, dest, len)
struct vfsoptlist *opts;
const char *name;
void *dest;
int len;
{
struct vfsopt *opt;
KASSERT(opts != NULL, ("vfs_copyopt: caller passed 'opts' as NULL"));
TAILQ_FOREACH(opt, opts, link) {
if (strcmp(name, opt->name) == 0) {
opt->seen = 1;
if (len != opt->len)
return (EINVAL);
bcopy(opt->value, dest, opt->len);
return (0);
}
}
return (ENOENT);
}
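/*
 * Sketch of how a filesystem's mount routine might drive the query
 * helpers above (hypothetical "myfs"; the option table, option names and
 * flag are placeholders added for illustration):
 *
 *	static const char *myfs_opts[] = { "from", "export", "nodatasync",
 *	    NULL };
 *
 *	static int
 *	myfs_mount(struct mount *mp)
 *	{
 *		uint64_t flags = 0;
 *		char *from;
 *		int error;
 *
 *		if (vfs_filteropt(mp->mnt_optnew, myfs_opts))
 *			return (EINVAL);
 *		from = vfs_getopts(mp->mnt_optnew, "from", &error);
 *		if (error != 0)
 *			return (error);
 *		if (vfs_flagopt(mp->mnt_optnew, "nodatasync", &flags,
 *		    MYFS_NODATASYNC))
 *			printf("myfs: data sync disabled\n");
 *		...
 *	}
 *
 * vfs_filteropt() rejects anything outside the table plus global_opts,
 * vfs_getopts() returns a NUL-terminated value or sets the error, and
 * vfs_flagopt() both reports presence and updates the flag word.
 */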
/*
* This is a helper function for filesystems to traverse their
* vnodes. See MNT_VNODE_FOREACH() in sys/mount.h
*/
struct vnode *
__mnt_vnode_next(struct vnode **mvp, struct mount *mp)
{
struct vnode *vp;
mtx_assert(MNT_MTX(mp), MA_OWNED);
KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
if (should_yield()) {
MNT_IUNLOCK(mp);
kern_yield(PRI_UNCHANGED);
MNT_ILOCK(mp);
}
vp = TAILQ_NEXT(*mvp, v_nmntvnodes);
while (vp != NULL && vp->v_type == VMARKER)
vp = TAILQ_NEXT(vp, v_nmntvnodes);
/* Check if we are done */
if (vp == NULL) {
__mnt_vnode_markerfree(mvp, mp);
return (NULL);
}
TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
return (vp);
}
struct vnode *
__mnt_vnode_first(struct vnode **mvp, struct mount *mp)
{
struct vnode *vp;
mtx_assert(MNT_MTX(mp), MA_OWNED);
vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
while (vp != NULL && vp->v_type == VMARKER)
vp = TAILQ_NEXT(vp, v_nmntvnodes);
/* Check if we are done */
if (vp == NULL) {
*mvp = NULL;
return (NULL);
}
MNT_REF(mp);
MNT_IUNLOCK(mp);
*mvp = (struct vnode *) malloc(sizeof(struct vnode),
M_VNODE_MARKER,
M_WAITOK | M_ZERO);
MNT_ILOCK(mp);
(*mvp)->v_type = VMARKER;
vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
while (vp != NULL && vp->v_type == VMARKER)
vp = TAILQ_NEXT(vp, v_nmntvnodes);
/* Check if we are done */
if (vp == NULL) {
MNT_IUNLOCK(mp);
free(*mvp, M_VNODE_MARKER);
MNT_ILOCK(mp);
*mvp = NULL;
MNT_REL(mp);
return (NULL);
}
(*mvp)->v_mount = mp;
TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
return (vp);
}
void
__mnt_vnode_markerfree(struct vnode **mvp, struct mount *mp)
{
if (*mvp == NULL)
return;
mtx_assert(MNT_MTX(mp), MA_OWNED);
KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
MNT_IUNLOCK(mp);
free(*mvp, M_VNODE_MARKER);
MNT_ILOCK(mp);
*mvp = NULL;
MNT_REL(mp);
}
int
__vfs_statfs(struct mount *mp, struct statfs *sbp)
{
int error;
error = mp->mnt_op->vfs_statfs(mp, &mp->mnt_stat);
if (sbp != &mp->mnt_stat)
*sbp = mp->mnt_stat;
return (error);
}
void
vfs_mountedfrom(struct mount *mp, const char *from)
{
bzero(mp->mnt_stat.f_mntfromname, sizeof mp->mnt_stat.f_mntfromname);
strlcpy(mp->mnt_stat.f_mntfromname, from,
sizeof mp->mnt_stat.f_mntfromname);
}
/*
* ---------------------------------------------------------------------
* This is the api for building mount args and mounting filesystems from
* inside the kernel.
*
* The API works by accumulation of individual args. First error is
* latched.
*
* XXX: should be documented in new manpage kernel_mount(9)
*/
/* A memory allocation which must be freed when we are done */
struct mntaarg {
SLIST_ENTRY(mntaarg) next;
};
/* The header for the mount arguments */
struct mntarg {
struct iovec *v;
int len;
int error;
SLIST_HEAD(, mntaarg) list;
};
/*
* Add a boolean argument.
*
* flag is the boolean value.
* name must start with "no".
*/
struct mntarg *
mount_argb(struct mntarg *ma, int flag, const char *name)
{
KASSERT(name[0] == 'n' && name[1] == 'o',
("mount_argb(...,%s): name must start with 'no'", name));
return (mount_arg(ma, name + (flag ? 2 : 0), NULL, 0));
}
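/*
 * Example, for illustration: mount_argb(ma, fsflags & MNT_RDONLY, "noro")
 * appends the option "ro" when the flag is set and "noro" when it is
 * clear, which is how sys_mount() above translates the classic flag bits
 * into nmount-style options.
 */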
/*
* Add an argument printf style
*/
struct mntarg *
mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...)
{
va_list ap;
struct mntaarg *maa;
struct sbuf *sb;
int len;
if (ma == NULL) {
ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
SLIST_INIT(&ma->list);
}
if (ma->error)
return (ma);
ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2),
M_MOUNT, M_WAITOK);
ma->v[ma->len].iov_base = (void *)(uintptr_t)name;
ma->v[ma->len].iov_len = strlen(name) + 1;
ma->len++;
sb = sbuf_new_auto();
va_start(ap, fmt);
sbuf_vprintf(sb, fmt, ap);
va_end(ap);
sbuf_finish(sb);
len = sbuf_len(sb) + 1;
maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO);
SLIST_INSERT_HEAD(&ma->list, maa, next);
bcopy(sbuf_data(sb), maa + 1, len);
sbuf_delete(sb);
ma->v[ma->len].iov_base = maa + 1;
ma->v[ma->len].iov_len = len;
ma->len++;
return (ma);
}
/*
* Add an argument which is a userland string.
*/
struct mntarg *
mount_argsu(struct mntarg *ma, const char *name, const void *val, int len)
{
struct mntaarg *maa;
char *tbuf;
if (val == NULL)
return (ma);
if (ma == NULL) {
ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
SLIST_INIT(&ma->list);
}
if (ma->error)
return (ma);
maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO);
SLIST_INSERT_HEAD(&ma->list, maa, next);
tbuf = (void *)(maa + 1);
ma->error = copyinstr(val, tbuf, len, NULL);
return (mount_arg(ma, name, tbuf, -1));
}
/*
* Plain argument.
*
* If length is -1, treat value as a C string.
*/
struct mntarg *
mount_arg(struct mntarg *ma, const char *name, const void *val, int len)
{
if (ma == NULL) {
ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
SLIST_INIT(&ma->list);
}
if (ma->error)
return (ma);
ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2),
M_MOUNT, M_WAITOK);
ma->v[ma->len].iov_base = (void *)(uintptr_t)name;
ma->v[ma->len].iov_len = strlen(name) + 1;
ma->len++;
ma->v[ma->len].iov_base = (void *)(uintptr_t)val;
if (len < 0)
ma->v[ma->len].iov_len = strlen(val) + 1;
else
ma->v[ma->len].iov_len = len;
ma->len++;
return (ma);
}
/*
* Free a mntarg structure
*/
static void
free_mntarg(struct mntarg *ma)
{
struct mntaarg *maa;
while (!SLIST_EMPTY(&ma->list)) {
maa = SLIST_FIRST(&ma->list);
SLIST_REMOVE_HEAD(&ma->list, next);
free(maa, M_MOUNT);
}
free(ma->v, M_MOUNT);
free(ma, M_MOUNT);
}
/*
* Mount a filesystem
*/
int
kernel_mount(struct mntarg *ma, int flags)
{
struct uio auio;
int error;
KASSERT(ma != NULL, ("kernel_mount NULL ma"));
KASSERT(ma->v != NULL, ("kernel_mount NULL ma->v"));
KASSERT(!(ma->len & 1), ("kernel_mount odd ma->len (%d)", ma->len));
auio.uio_iov = ma->v;
auio.uio_iovcnt = ma->len;
auio.uio_segflg = UIO_SYSSPACE;
error = ma->error;
if (!error)
error = vfs_donmount(curthread, flags, &auio);
free_mntarg(ma);
return (error);
}
/*
* A printflike function to mount a filesystem.
*/
int
kernel_vmount(int flags, ...)
{
struct mntarg *ma = NULL;
va_list ap;
const char *cp;
const void *vp;
int error;
va_start(ap, flags);
for (;;) {
cp = va_arg(ap, const char *);
if (cp == NULL)
break;
vp = va_arg(ap, const void *);
ma = mount_arg(ma, cp, vp, (vp != NULL ? -1 : 0));
}
va_end(ap);
error = kernel_mount(ma, flags);
return (error);
}
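/*
 * In-kernel usage sketch (illustration only; the type, path and device
 * are placeholders): the accumulation API lets kernel code mount a
 * filesystem without building iovecs by hand, e.g.
 *
 *	error = kernel_vmount(MNT_RDONLY,
 *	    "fstype", "cd9660",
 *	    "fspath", "/dist",
 *	    "from", "/dev/cd0",
 *	    NULL);
 *
 * The variable arguments are name/value string pairs terminated by NULL;
 * the first error latched in the mntarg chain is what kernel_mount()
 * eventually returns.
 */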
void
vfs_oexport_conv(const struct oexport_args *oexp, struct export_args *exp)
{
bcopy(oexp, exp, sizeof(*oexp));
exp->ex_numsecflavors = 0;
}
Index: head/sys/kern/vfs_syscalls.c
===================================================================
--- head/sys/kern/vfs_syscalls.c (revision 225616)
+++ head/sys/kern/vfs_syscalls.c (revision 225617)
@@ -1,4847 +1,4847 @@
/*-
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_capsicum.h"
#include "opt_compat.h"
#include "opt_kdtrace.h"
#include "opt_ktrace.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/capability.h>
#include <sys/disk.h>
#include <sys/sysent.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/namei.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filio.h>
#include <sys/limits.h>
#include <sys/linker.h>
#include <sys/sdt.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/unistd.h>
#include <sys/vnode.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/dirent.h>
#include <sys/jail.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <machine/stdarg.h>
#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/uma.h>
SDT_PROVIDER_DEFINE(vfs);
SDT_PROBE_DEFINE(vfs, , stat, mode, mode);
SDT_PROBE_ARGTYPE(vfs, , stat, mode, 0, "char *");
SDT_PROBE_ARGTYPE(vfs, , stat, mode, 1, "int");
SDT_PROBE_DEFINE(vfs, , stat, reg, reg);
SDT_PROBE_ARGTYPE(vfs, , stat, reg, 0, "char *");
SDT_PROBE_ARGTYPE(vfs, , stat, reg, 1, "int");
static int chroot_refuse_vdir_fds(struct filedesc *fdp);
static int getutimes(const struct timeval *, enum uio_seg, struct timespec *);
static int setfflags(struct thread *td, struct vnode *, int);
static int setutimes(struct thread *td, struct vnode *,
const struct timespec *, int, int);
static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
struct thread *td);
/*
* The module initialization routine for POSIX asynchronous I/O will
* set this to the version of AIO that it implements. (Zero means
* that it is not implemented.) This value is used here by pathconf()
* and in kern_descrip.c by fpathconf().
*/
int async_io_version;
#ifdef DEBUG
static int syncprt = 0;
SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
#endif
/*
* Sync each mounted filesystem.
*/
#ifndef _SYS_SYSPROTO_H_
struct sync_args {
int dummy;
};
#endif
/* ARGSUSED */
int
-sync(td, uap)
+sys_sync(td, uap)
struct thread *td;
struct sync_args *uap;
{
struct mount *mp, *nmp;
int vfslocked;
mtx_lock(&mountlist_mtx);
for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
nmp = TAILQ_NEXT(mp, mnt_list);
continue;
}
vfslocked = VFS_LOCK_GIANT(mp);
if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
MNT_ILOCK(mp);
mp->mnt_noasync++;
mp->mnt_kern_flag &= ~MNTK_ASYNC;
MNT_IUNLOCK(mp);
vfs_msync(mp, MNT_NOWAIT);
VFS_SYNC(mp, MNT_NOWAIT);
MNT_ILOCK(mp);
mp->mnt_noasync--;
if ((mp->mnt_flag & MNT_ASYNC) != 0 &&
mp->mnt_noasync == 0)
mp->mnt_kern_flag |= MNTK_ASYNC;
MNT_IUNLOCK(mp);
vn_finished_write(mp);
}
VFS_UNLOCK_GIANT(vfslocked);
mtx_lock(&mountlist_mtx);
nmp = TAILQ_NEXT(mp, mnt_list);
vfs_unbusy(mp);
}
mtx_unlock(&mountlist_mtx);
return (0);
}
/*
* Change filesystem quotas.
*/
#ifndef _SYS_SYSPROTO_H_
struct quotactl_args {
char *path;
int cmd;
int uid;
caddr_t arg;
};
#endif
int
-quotactl(td, uap)
+sys_quotactl(td, uap)
struct thread *td;
register struct quotactl_args /* {
char *path;
int cmd;
int uid;
caddr_t arg;
} */ *uap;
{
struct mount *mp;
int vfslocked;
int error;
struct nameidata nd;
AUDIT_ARG_CMD(uap->cmd);
AUDIT_ARG_UID(uap->uid);
if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS))
return (EPERM);
NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
UIO_USERSPACE, uap->path, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
mp = nd.ni_vp->v_mount;
vfs_ref(mp);
vput(nd.ni_vp);
error = vfs_busy(mp, 0);
vfs_rel(mp);
if (error) {
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg);
vfs_unbusy(mp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Used by statfs conversion routines to scale the block size up if
* necessary so that all of the block counts are <= 'max_size'. Note
* that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero
* value of 'n'.
*/
void
statfs_scale_blocks(struct statfs *sf, long max_size)
{
uint64_t count;
int shift;
KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__));
/*
* Attempt to scale the block counts to give a more accurate
* overview to userland of the ratio of free space to used
* space. To do this, find the largest block count and compute
* a divisor that lets it fit into a signed integer <= max_size.
*/
if (sf->f_bavail < 0)
count = -sf->f_bavail;
else
count = sf->f_bavail;
count = MAX(sf->f_blocks, MAX(sf->f_bfree, count));
if (count <= max_size)
return;
count >>= flsl(max_size);
shift = 0;
while (count > 0) {
shift++;
count >>= 1;
}
sf->f_bsize <<= shift;
sf->f_blocks >>= shift;
sf->f_bfree >>= shift;
sf->f_bavail >>= shift;
}
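/*
 * Worked example of the scaling above (illustrative): with
 * max_size == 0x7fffffff and a largest block count of 2^40,
 * "count >>= flsl(max_size)" leaves 2^9, so the loop yields
 * shift == 10; f_bsize is then multiplied by 1024 and each block
 * count divided by 1024, giving a maximum count of 2^30 <= max_size.
 */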
/*
* Get filesystem statistics.
*/
#ifndef _SYS_SYSPROTO_H_
struct statfs_args {
char *path;
struct statfs *buf;
};
#endif
int
-statfs(td, uap)
+sys_statfs(td, uap)
struct thread *td;
register struct statfs_args /* {
char *path;
struct statfs *buf;
} */ *uap;
{
struct statfs sf;
int error;
error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
if (error == 0)
error = copyout(&sf, uap->buf, sizeof(sf));
return (error);
}
int
kern_statfs(struct thread *td, char *path, enum uio_seg pathseg,
struct statfs *buf)
{
struct mount *mp;
struct statfs *sp, sb;
int vfslocked;
int error;
struct nameidata nd;
NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE |
AUDITVNODE1, pathseg, path, td);
error = namei(&nd);
if (error)
return (error);
vfslocked = NDHASGIANT(&nd);
mp = nd.ni_vp->v_mount;
vfs_ref(mp);
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_vp);
error = vfs_busy(mp, 0);
vfs_rel(mp);
if (error) {
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
#ifdef MAC
error = mac_mount_check_stat(td->td_ucred, mp);
if (error)
goto out;
#endif
/*
* Set these in case the underlying filesystem fails to do so.
*/
sp = &mp->mnt_stat;
sp->f_version = STATFS_VERSION;
sp->f_namemax = NAME_MAX;
sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
error = VFS_STATFS(mp, sp);
if (error)
goto out;
if (priv_check(td, PRIV_VFS_GENERATION)) {
bcopy(sp, &sb, sizeof(sb));
sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
prison_enforce_statfs(td->td_ucred, mp, &sb);
sp = &sb;
}
*buf = *sp;
out:
vfs_unbusy(mp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Get filesystem statistics.
*/
#ifndef _SYS_SYSPROTO_H_
struct fstatfs_args {
int fd;
struct statfs *buf;
};
#endif
int
-fstatfs(td, uap)
+sys_fstatfs(td, uap)
struct thread *td;
register struct fstatfs_args /* {
int fd;
struct statfs *buf;
} */ *uap;
{
struct statfs sf;
int error;
error = kern_fstatfs(td, uap->fd, &sf);
if (error == 0)
error = copyout(&sf, uap->buf, sizeof(sf));
return (error);
}
int
kern_fstatfs(struct thread *td, int fd, struct statfs *buf)
{
struct file *fp;
struct mount *mp;
struct statfs *sp, sb;
int vfslocked;
struct vnode *vp;
int error;
AUDIT_ARG_FD(fd);
error = getvnode(td->td_proc->p_fd, fd, CAP_FSTATFS, &fp);
if (error)
return (error);
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_lock(vp, LK_SHARED | LK_RETRY);
#ifdef AUDIT
AUDIT_ARG_VNODE1(vp);
#endif
mp = vp->v_mount;
if (mp)
vfs_ref(mp);
VOP_UNLOCK(vp, 0);
fdrop(fp, td);
if (mp == NULL) {
error = EBADF;
goto out;
}
error = vfs_busy(mp, 0);
vfs_rel(mp);
if (error) {
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
#ifdef MAC
error = mac_mount_check_stat(td->td_ucred, mp);
if (error)
goto out;
#endif
/*
* Set these in case the underlying filesystem fails to do so.
*/
sp = &mp->mnt_stat;
sp->f_version = STATFS_VERSION;
sp->f_namemax = NAME_MAX;
sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
error = VFS_STATFS(mp, sp);
if (error)
goto out;
if (priv_check(td, PRIV_VFS_GENERATION)) {
bcopy(sp, &sb, sizeof(sb));
sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
prison_enforce_statfs(td->td_ucred, mp, &sb);
sp = &sb;
}
*buf = *sp;
out:
if (mp)
vfs_unbusy(mp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Get statistics on all filesystems.
*/
#ifndef _SYS_SYSPROTO_H_
struct getfsstat_args {
struct statfs *buf;
long bufsize;
int flags;
};
#endif
int
-getfsstat(td, uap)
+sys_getfsstat(td, uap)
struct thread *td;
register struct getfsstat_args /* {
struct statfs *buf;
long bufsize;
int flags;
} */ *uap;
{
return (kern_getfsstat(td, &uap->buf, uap->bufsize, UIO_USERSPACE,
uap->flags));
}
/*
* If (bufsize > 0 && bufseg == UIO_SYSSPACE), the caller is
* responsible for freeing the memory that will be allocated in '*buf'.
*/
int
kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize,
enum uio_seg bufseg, int flags)
{
struct mount *mp, *nmp;
struct statfs *sfsp, *sp, sb;
size_t count, maxcount;
int vfslocked;
int error;
maxcount = bufsize / sizeof(struct statfs);
if (bufsize == 0)
sfsp = NULL;
else if (bufseg == UIO_USERSPACE)
sfsp = *buf;
else /* if (bufseg == UIO_SYSSPACE) */ {
count = 0;
mtx_lock(&mountlist_mtx);
TAILQ_FOREACH(mp, &mountlist, mnt_list) {
count++;
}
mtx_unlock(&mountlist_mtx);
if (maxcount > count)
maxcount = count;
sfsp = *buf = malloc(maxcount * sizeof(struct statfs), M_TEMP,
M_WAITOK);
}
count = 0;
mtx_lock(&mountlist_mtx);
for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
if (prison_canseemount(td->td_ucred, mp) != 0) {
nmp = TAILQ_NEXT(mp, mnt_list);
continue;
}
#ifdef MAC
if (mac_mount_check_stat(td->td_ucred, mp) != 0) {
nmp = TAILQ_NEXT(mp, mnt_list);
continue;
}
#endif
if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
nmp = TAILQ_NEXT(mp, mnt_list);
continue;
}
vfslocked = VFS_LOCK_GIANT(mp);
if (sfsp && count < maxcount) {
sp = &mp->mnt_stat;
/*
* Set these in case the underlying filesystem
* fails to do so.
*/
sp->f_version = STATFS_VERSION;
sp->f_namemax = NAME_MAX;
sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
/*
* If MNT_NOWAIT or MNT_LAZY is specified, do not
* refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
* overrides MNT_WAIT.
*/
if (((flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
(flags & MNT_WAIT)) &&
(error = VFS_STATFS(mp, sp))) {
VFS_UNLOCK_GIANT(vfslocked);
mtx_lock(&mountlist_mtx);
nmp = TAILQ_NEXT(mp, mnt_list);
vfs_unbusy(mp);
continue;
}
if (priv_check(td, PRIV_VFS_GENERATION)) {
bcopy(sp, &sb, sizeof(sb));
sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
prison_enforce_statfs(td->td_ucred, mp, &sb);
sp = &sb;
}
if (bufseg == UIO_SYSSPACE)
bcopy(sp, sfsp, sizeof(*sp));
else /* if (bufseg == UIO_USERSPACE) */ {
error = copyout(sp, sfsp, sizeof(*sp));
if (error) {
vfs_unbusy(mp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
}
sfsp++;
}
VFS_UNLOCK_GIANT(vfslocked);
count++;
mtx_lock(&mountlist_mtx);
nmp = TAILQ_NEXT(mp, mnt_list);
vfs_unbusy(mp);
}
mtx_unlock(&mountlist_mtx);
if (sfsp && count > maxcount)
td->td_retval[0] = maxcount;
else
td->td_retval[0] = count;
return (0);
}
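/*
 * On return td->td_retval[0] holds, in effect, the number of mounted
 * filesystems visible to the caller, clamped to the number of statfs
 * structures that fit in the supplied buffer when one was given; a
 * caller can therefore pass bufsize == 0 to learn how many mounts
 * exist before allocating.
 */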
#ifdef COMPAT_FREEBSD4
/*
* Get old format filesystem statistics.
*/
static void cvtstatfs(struct statfs *, struct ostatfs *);
#ifndef _SYS_SYSPROTO_H_
struct freebsd4_statfs_args {
char *path;
struct ostatfs *buf;
};
#endif
int
freebsd4_statfs(td, uap)
struct thread *td;
struct freebsd4_statfs_args /* {
char *path;
struct ostatfs *buf;
} */ *uap;
{
struct ostatfs osb;
struct statfs sf;
int error;
error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
if (error)
return (error);
cvtstatfs(&sf, &osb);
return (copyout(&osb, uap->buf, sizeof(osb)));
}
/*
* Get filesystem statistics.
*/
#ifndef _SYS_SYSPROTO_H_
struct freebsd4_fstatfs_args {
int fd;
struct ostatfs *buf;
};
#endif
int
freebsd4_fstatfs(td, uap)
struct thread *td;
struct freebsd4_fstatfs_args /* {
int fd;
struct ostatfs *buf;
} */ *uap;
{
struct ostatfs osb;
struct statfs sf;
int error;
error = kern_fstatfs(td, uap->fd, &sf);
if (error)
return (error);
cvtstatfs(&sf, &osb);
return (copyout(&osb, uap->buf, sizeof(osb)));
}
/*
* Get statistics on all filesystems.
*/
#ifndef _SYS_SYSPROTO_H_
struct freebsd4_getfsstat_args {
struct ostatfs *buf;
long bufsize;
int flags;
};
#endif
int
freebsd4_getfsstat(td, uap)
struct thread *td;
register struct freebsd4_getfsstat_args /* {
struct ostatfs *buf;
long bufsize;
int flags;
} */ *uap;
{
struct statfs *buf, *sp;
struct ostatfs osb;
size_t count, size;
int error;
count = uap->bufsize / sizeof(struct ostatfs);
size = count * sizeof(struct statfs);
error = kern_getfsstat(td, &buf, size, UIO_SYSSPACE, uap->flags);
if (size > 0) {
count = td->td_retval[0];
sp = buf;
while (count > 0 && error == 0) {
cvtstatfs(sp, &osb);
error = copyout(&osb, uap->buf, sizeof(osb));
sp++;
uap->buf++;
count--;
}
free(buf, M_TEMP);
}
return (error);
}
/*
* Implement fstatfs() for (NFS) file handles.
*/
#ifndef _SYS_SYSPROTO_H_
struct freebsd4_fhstatfs_args {
struct fhandle *u_fhp;
struct ostatfs *buf;
};
#endif
int
freebsd4_fhstatfs(td, uap)
struct thread *td;
struct freebsd4_fhstatfs_args /* {
struct fhandle *u_fhp;
struct ostatfs *buf;
} */ *uap;
{
struct ostatfs osb;
struct statfs sf;
fhandle_t fh;
int error;
error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
if (error)
return (error);
error = kern_fhstatfs(td, fh, &sf);
if (error)
return (error);
cvtstatfs(&sf, &osb);
return (copyout(&osb, uap->buf, sizeof(osb)));
}
/*
* Convert a new format statfs structure to an old format statfs structure.
*/
static void
cvtstatfs(nsp, osp)
struct statfs *nsp;
struct ostatfs *osp;
{
statfs_scale_blocks(nsp, LONG_MAX);
bzero(osp, sizeof(*osp));
osp->f_bsize = nsp->f_bsize;
osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX);
osp->f_blocks = nsp->f_blocks;
osp->f_bfree = nsp->f_bfree;
osp->f_bavail = nsp->f_bavail;
osp->f_files = MIN(nsp->f_files, LONG_MAX);
osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX);
osp->f_owner = nsp->f_owner;
osp->f_type = nsp->f_type;
osp->f_flags = nsp->f_flags;
osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX);
osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX);
osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX);
osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX);
strlcpy(osp->f_fstypename, nsp->f_fstypename,
MIN(MFSNAMELEN, OMFSNAMELEN));
strlcpy(osp->f_mntonname, nsp->f_mntonname,
MIN(MNAMELEN, OMNAMELEN));
strlcpy(osp->f_mntfromname, nsp->f_mntfromname,
MIN(MNAMELEN, OMNAMELEN));
osp->f_fsid = nsp->f_fsid;
}
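/*
 * The conversion clamps 64-bit counters that no longer fit the old
 * long-sized fields and relies on statfs_scale_blocks() above to
 * pre-scale the block counts so they stay within LONG_MAX.
 */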
#endif /* COMPAT_FREEBSD4 */
/*
* Change current working directory to a given file descriptor.
*/
#ifndef _SYS_SYSPROTO_H_
struct fchdir_args {
int fd;
};
#endif
int
-fchdir(td, uap)
+sys_fchdir(td, uap)
struct thread *td;
struct fchdir_args /* {
int fd;
} */ *uap;
{
register struct filedesc *fdp = td->td_proc->p_fd;
struct vnode *vp, *tdp, *vpold;
struct mount *mp;
struct file *fp;
int vfslocked;
int error;
AUDIT_ARG_FD(uap->fd);
if ((error = getvnode(fdp, uap->fd, CAP_FCHDIR, &fp)) != 0)
return (error);
vp = fp->f_vnode;
VREF(vp);
fdrop(fp, td);
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_lock(vp, LK_SHARED | LK_RETRY);
AUDIT_ARG_VNODE1(vp);
error = change_dir(vp, td);
while (!error && (mp = vp->v_mountedhere) != NULL) {
int tvfslocked;
if (vfs_busy(mp, 0))
continue;
tvfslocked = VFS_LOCK_GIANT(mp);
error = VFS_ROOT(mp, LK_SHARED, &tdp);
vfs_unbusy(mp);
if (error) {
VFS_UNLOCK_GIANT(tvfslocked);
break;
}
vput(vp);
VFS_UNLOCK_GIANT(vfslocked);
vp = tdp;
vfslocked = tvfslocked;
}
if (error) {
vput(vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
FILEDESC_XLOCK(fdp);
vpold = fdp->fd_cdir;
fdp->fd_cdir = vp;
FILEDESC_XUNLOCK(fdp);
vfslocked = VFS_LOCK_GIANT(vpold->v_mount);
vrele(vpold);
VFS_UNLOCK_GIANT(vfslocked);
return (0);
}
/*
* Change current working directory (``.'').
*/
#ifndef _SYS_SYSPROTO_H_
struct chdir_args {
char *path;
};
#endif
int
-chdir(td, uap)
+sys_chdir(td, uap)
struct thread *td;
struct chdir_args /* {
char *path;
} */ *uap;
{
return (kern_chdir(td, uap->path, UIO_USERSPACE));
}
int
kern_chdir(struct thread *td, char *path, enum uio_seg pathseg)
{
register struct filedesc *fdp = td->td_proc->p_fd;
int error;
struct nameidata nd;
struct vnode *vp;
int vfslocked;
NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1 |
MPSAFE, pathseg, path, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
if ((error = change_dir(nd.ni_vp, td)) != 0) {
vput(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
NDFREE(&nd, NDF_ONLY_PNBUF);
return (error);
}
VOP_UNLOCK(nd.ni_vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
NDFREE(&nd, NDF_ONLY_PNBUF);
FILEDESC_XLOCK(fdp);
vp = fdp->fd_cdir;
fdp->fd_cdir = nd.ni_vp;
FILEDESC_XUNLOCK(fdp);
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
return (0);
}
/*
* Helper function for the raised chroot(2) security setting: refuse the
* operation if any file descriptors are open directories.
*/
static int
chroot_refuse_vdir_fds(fdp)
struct filedesc *fdp;
{
struct vnode *vp;
struct file *fp;
int fd;
FILEDESC_LOCK_ASSERT(fdp);
for (fd = 0; fd < fdp->fd_nfiles ; fd++) {
fp = fget_locked(fdp, fd);
if (fp == NULL)
continue;
if (fp->f_type == DTYPE_VNODE) {
vp = fp->f_vnode;
if (vp->v_type == VDIR)
return (EPERM);
}
}
return (0);
}
/*
* This sysctl determines if we will allow a process to chroot(2) if it
* has a directory open:
* 0: disallowed for all processes.
* 1: allowed for processes that were not already chroot(2)'ed.
* 2: allowed for all processes.
*/
static int chroot_allow_open_directories = 1;
SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
&chroot_allow_open_directories, 0, "");
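/*
 * The knob is runtime-tunable, e.g.
 * "sysctl kern.chroot_allow_open_directories=0" tightens the check
 * for every process.
 */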
/*
* Change notion of root (``/'') directory.
*/
#ifndef _SYS_SYSPROTO_H_
struct chroot_args {
char *path;
};
#endif
int
-chroot(td, uap)
+sys_chroot(td, uap)
struct thread *td;
struct chroot_args /* {
char *path;
} */ *uap;
{
int error;
struct nameidata nd;
int vfslocked;
error = priv_check(td, PRIV_VFS_CHROOT);
if (error)
return (error);
NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE |
AUDITVNODE1, UIO_USERSPACE, uap->path, td);
error = namei(&nd);
if (error)
goto error;
vfslocked = NDHASGIANT(&nd);
if ((error = change_dir(nd.ni_vp, td)) != 0)
goto e_vunlock;
#ifdef MAC
if ((error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp)))
goto e_vunlock;
#endif
VOP_UNLOCK(nd.ni_vp, 0);
error = change_root(nd.ni_vp, td);
vrele(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
NDFREE(&nd, NDF_ONLY_PNBUF);
return (error);
e_vunlock:
vput(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
error:
NDFREE(&nd, NDF_ONLY_PNBUF);
return (error);
}
/*
* Common routine for chroot and chdir. Callers must provide a locked vnode
* instance.
*/
int
change_dir(vp, td)
struct vnode *vp;
struct thread *td;
{
int error;
ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked");
if (vp->v_type != VDIR)
return (ENOTDIR);
#ifdef MAC
error = mac_vnode_check_chdir(td->td_ucred, vp);
if (error)
return (error);
#endif
error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
return (error);
}
/*
* Common routine for kern_chroot() and jail_attach(). The caller is
* responsible for invoking priv_check() and mac_vnode_check_chroot() to
* authorize this operation.
*/
int
change_root(vp, td)
struct vnode *vp;
struct thread *td;
{
struct filedesc *fdp;
struct vnode *oldvp;
int vfslocked;
int error;
VFS_ASSERT_GIANT(vp->v_mount);
fdp = td->td_proc->p_fd;
FILEDESC_XLOCK(fdp);
if (chroot_allow_open_directories == 0 ||
(chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
error = chroot_refuse_vdir_fds(fdp);
if (error) {
FILEDESC_XUNLOCK(fdp);
return (error);
}
}
oldvp = fdp->fd_rdir;
fdp->fd_rdir = vp;
VREF(fdp->fd_rdir);
if (!fdp->fd_jdir) {
fdp->fd_jdir = vp;
VREF(fdp->fd_jdir);
}
FILEDESC_XUNLOCK(fdp);
vfslocked = VFS_LOCK_GIANT(oldvp->v_mount);
vrele(oldvp);
VFS_UNLOCK_GIANT(vfslocked);
return (0);
}
static __inline cap_rights_t
flags_to_rights(int flags)
{
cap_rights_t rights = 0;
switch ((flags & O_ACCMODE)) {
case O_RDONLY:
rights |= CAP_READ;
break;
case O_RDWR:
rights |= CAP_READ;
/* fall through */
case O_WRONLY:
rights |= CAP_WRITE;
break;
case O_EXEC:
rights |= CAP_FEXECVE;
break;
}
if (flags & O_CREAT)
rights |= CAP_CREATE;
if (flags & O_TRUNC)
rights |= CAP_FTRUNCATE;
if ((flags & O_EXLOCK) || (flags & O_SHLOCK))
rights |= CAP_FLOCK;
return (rights);
}
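/*
 * Example mapping (follows directly from the switch above): an
 * open(2) with O_RDWR | O_CREAT | O_TRUNC requests
 * CAP_READ | CAP_WRITE | CAP_CREATE | CAP_FTRUNCATE.
 */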
/*
* Check permissions, allocate an open file structure, and call the device
* open routine if any.
*/
#ifndef _SYS_SYSPROTO_H_
struct open_args {
char *path;
int flags;
int mode;
};
#endif
int
-open(td, uap)
+sys_open(td, uap)
struct thread *td;
register struct open_args /* {
char *path;
int flags;
int mode;
} */ *uap;
{
return (kern_open(td, uap->path, UIO_USERSPACE, uap->flags, uap->mode));
}
#ifndef _SYS_SYSPROTO_H_
struct openat_args {
int fd;
char *path;
int flag;
int mode;
};
#endif
int
-openat(struct thread *td, struct openat_args *uap)
+sys_openat(struct thread *td, struct openat_args *uap)
{
return (kern_openat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
uap->mode));
}
int
kern_open(struct thread *td, char *path, enum uio_seg pathseg, int flags,
int mode)
{
return (kern_openat(td, AT_FDCWD, path, pathseg, flags, mode));
}
int
kern_openat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
int flags, int mode)
{
struct proc *p = td->td_proc;
struct filedesc *fdp = p->p_fd;
struct file *fp;
struct vnode *vp;
int cmode;
struct file *nfp;
int type, indx = -1, error, error_open;
struct flock lf;
struct nameidata nd;
int vfslocked;
cap_rights_t rights_needed = CAP_LOOKUP;
AUDIT_ARG_FFLAGS(flags);
AUDIT_ARG_MODE(mode);
/* XXX: audit dirfd */
rights_needed |= flags_to_rights(flags);
/*
* Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
* may be specified.
*/
if (flags & O_EXEC) {
if (flags & O_ACCMODE)
return (EINVAL);
} else if ((flags & O_ACCMODE) == O_ACCMODE)
return (EINVAL);
else
flags = FFLAGS(flags);
/*
* Allocate an open file structure, but don't install a descriptor yet.
*/
error = falloc_noinstall(td, &nfp);
if (error)
return (error);
/* An extra reference on `nfp' has been held for us by falloc_noinstall(). */
fp = nfp;
/* Set the flags early so the finit in devfs can pick them up. */
fp->f_flag = flags & FMASK;
cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT;
NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1 | MPSAFE, pathseg,
path, fd, rights_needed, td);
td->td_dupfd = -1; /* XXX check for fdopen */
error = vn_open(&nd, &flags, cmode, fp);
if (error) {
/*
* If vn_open() replaced the method vector, something wondrous
* happened deep below and we just pass it up, pretending we know
* what we are doing.
*/
if (error == ENXIO && fp->f_ops != &badfileops)
goto success;
/*
* handle special fdopen() case. bleh. dupfdopen() is
* responsible for dropping the old contents of ofiles[indx]
* if it succeeds.
*
* Don't do this for relative (capability) lookups; we don't
* understand exactly what would happen, and we don't think
* that it ever should.
*/
if ((nd.ni_strictrelative == 0) &&
(error == ENODEV || error == ENXIO) &&
(td->td_dupfd >= 0)) {
/* XXX from fdopen */
error_open = error;
if ((error = finstall(td, fp, &indx, flags)) != 0)
goto bad_unlocked;
if ((error = dupfdopen(td, fdp, indx, td->td_dupfd,
flags, error_open)) == 0)
goto success;
}
/*
* Clean up the descriptor, but only if another thread hadn't
* replaced or closed it.
*/
if (indx != -1)
fdclose(fdp, fp, indx, td);
fdrop(fp, td);
if (error == ERESTART)
error = EINTR;
return (error);
}
td->td_dupfd = 0;
vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
vp = nd.ni_vp;
/*
* Store the vnode, for any f_type. Typically, the vnode use
* count is decremented by direct call to vn_closefile() for
* files that switched type in the cdevsw fdopen() method.
*/
fp->f_vnode = vp;
/*
* If the file wasn't claimed by devfs, bind it to the normal
* vnode operations here.
*/
if (fp->f_ops == &badfileops) {
KASSERT(vp->v_type != VFIFO, ("Unexpected fifo."));
fp->f_seqcount = 1;
finit(fp, flags & FMASK, DTYPE_VNODE, vp, &vnops);
}
VOP_UNLOCK(vp, 0);
if (fp->f_type == DTYPE_VNODE && (flags & (O_EXLOCK | O_SHLOCK)) != 0) {
lf.l_whence = SEEK_SET;
lf.l_start = 0;
lf.l_len = 0;
if (flags & O_EXLOCK)
lf.l_type = F_WRLCK;
else
lf.l_type = F_RDLCK;
type = F_FLOCK;
if ((flags & FNONBLOCK) == 0)
type |= F_WAIT;
if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
type)) != 0)
goto bad;
atomic_set_int(&fp->f_flag, FHASLOCK);
}
if (flags & O_TRUNC) {
error = fo_truncate(fp, 0, td->td_ucred, td);
if (error)
goto bad;
}
VFS_UNLOCK_GIANT(vfslocked);
success:
/*
* If we haven't already installed the FD (for dupfdopen), do so now.
*/
if (indx == -1) {
#ifdef CAPABILITIES
if (nd.ni_strictrelative == 1) {
/*
* We are doing a strict relative lookup; wrap the
* result in a capability.
*/
if ((error = kern_capwrap(td, fp, nd.ni_baserights,
&indx)) != 0)
goto bad_unlocked;
} else
#endif
if ((error = finstall(td, fp, &indx, flags)) != 0)
goto bad_unlocked;
}
/*
* Release our private reference, leaving the one associated with
* the descriptor table intact.
*/
fdrop(fp, td);
td->td_retval[0] = indx;
return (0);
bad:
VFS_UNLOCK_GIANT(vfslocked);
bad_unlocked:
if (indx != -1)
fdclose(fdp, fp, indx, td);
fdrop(fp, td);
td->td_retval[0] = -1;
return (error);
}
#ifdef COMPAT_43
/*
* Create a file.
*/
#ifndef _SYS_SYSPROTO_H_
struct ocreat_args {
char *path;
int mode;
};
#endif
int
ocreat(td, uap)
struct thread *td;
register struct ocreat_args /* {
char *path;
int mode;
} */ *uap;
{
return (kern_open(td, uap->path, UIO_USERSPACE,
O_WRONLY | O_CREAT | O_TRUNC, uap->mode));
}
#endif /* COMPAT_43 */
/*
* Create a special file.
*/
#ifndef _SYS_SYSPROTO_H_
struct mknod_args {
char *path;
int mode;
int dev;
};
#endif
int
-mknod(td, uap)
+sys_mknod(td, uap)
struct thread *td;
register struct mknod_args /* {
char *path;
int mode;
int dev;
} */ *uap;
{
return (kern_mknod(td, uap->path, UIO_USERSPACE, uap->mode, uap->dev));
}
#ifndef _SYS_SYSPROTO_H_
struct mknodat_args {
int fd;
char *path;
mode_t mode;
dev_t dev;
};
#endif
int
-mknodat(struct thread *td, struct mknodat_args *uap)
+sys_mknodat(struct thread *td, struct mknodat_args *uap)
{
return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode,
uap->dev));
}
int
kern_mknod(struct thread *td, char *path, enum uio_seg pathseg, int mode,
int dev)
{
return (kern_mknodat(td, AT_FDCWD, path, pathseg, mode, dev));
}
int
kern_mknodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
int mode, int dev)
{
struct vnode *vp;
struct mount *mp;
struct vattr vattr;
int error;
int whiteout = 0;
struct nameidata nd;
int vfslocked;
AUDIT_ARG_MODE(mode);
AUDIT_ARG_DEV(dev);
switch (mode & S_IFMT) {
case S_IFCHR:
case S_IFBLK:
error = priv_check(td, PRIV_VFS_MKNOD_DEV);
break;
case S_IFMT:
error = priv_check(td, PRIV_VFS_MKNOD_BAD);
break;
case S_IFWHT:
error = priv_check(td, PRIV_VFS_MKNOD_WHT);
break;
case S_IFIFO:
if (dev == 0)
return (kern_mkfifoat(td, fd, path, pathseg, mode));
/* FALLTHROUGH */
default:
error = EINVAL;
break;
}
if (error)
return (error);
restart:
bwillwrite();
NDINIT_ATRIGHTS(&nd, CREATE,
LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1, pathseg, path, fd,
CAP_MKFIFO, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
vp = nd.ni_vp;
if (vp != NULL) {
NDFREE(&nd, NDF_ONLY_PNBUF);
if (vp == nd.ni_dvp)
vrele(nd.ni_dvp);
else
vput(nd.ni_dvp);
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
return (EEXIST);
} else {
VATTR_NULL(&vattr);
vattr.va_mode = (mode & ALLPERMS) &
~td->td_proc->p_fd->fd_cmask;
vattr.va_rdev = dev;
whiteout = 0;
switch (mode & S_IFMT) {
case S_IFMT: /* used by badsect to flag bad sectors */
vattr.va_type = VBAD;
break;
case S_IFCHR:
vattr.va_type = VCHR;
break;
case S_IFBLK:
vattr.va_type = VBLK;
break;
case S_IFWHT:
whiteout = 1;
break;
default:
panic("kern_mknod: invalid mode");
}
}
if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_dvp);
VFS_UNLOCK_GIANT(vfslocked);
if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
return (error);
goto restart;
}
#ifdef MAC
if (error == 0 && !whiteout)
error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp,
&nd.ni_cnd, &vattr);
#endif
if (!error) {
if (whiteout)
error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
else {
error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
&nd.ni_cnd, &vattr);
if (error == 0)
vput(nd.ni_vp);
}
}
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_dvp);
vn_finished_write(mp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Create a named pipe.
*/
#ifndef _SYS_SYSPROTO_H_
struct mkfifo_args {
char *path;
int mode;
};
#endif
int
-mkfifo(td, uap)
+sys_mkfifo(td, uap)
struct thread *td;
register struct mkfifo_args /* {
char *path;
int mode;
} */ *uap;
{
return (kern_mkfifo(td, uap->path, UIO_USERSPACE, uap->mode));
}
#ifndef _SYS_SYSPROTO_H_
struct mkfifoat_args {
int fd;
char *path;
mode_t mode;
};
#endif
int
-mkfifoat(struct thread *td, struct mkfifoat_args *uap)
+sys_mkfifoat(struct thread *td, struct mkfifoat_args *uap)
{
return (kern_mkfifoat(td, uap->fd, uap->path, UIO_USERSPACE,
uap->mode));
}
int
kern_mkfifo(struct thread *td, char *path, enum uio_seg pathseg, int mode)
{
return (kern_mkfifoat(td, AT_FDCWD, path, pathseg, mode));
}
int
kern_mkfifoat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
int mode)
{
struct mount *mp;
struct vattr vattr;
int error;
struct nameidata nd;
int vfslocked;
AUDIT_ARG_MODE(mode);
restart:
bwillwrite();
NDINIT_AT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1,
pathseg, path, fd, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
if (nd.ni_vp != NULL) {
NDFREE(&nd, NDF_ONLY_PNBUF);
if (nd.ni_vp == nd.ni_dvp)
vrele(nd.ni_dvp);
else
vput(nd.ni_dvp);
vrele(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
return (EEXIST);
}
if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_dvp);
VFS_UNLOCK_GIANT(vfslocked);
if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
return (error);
goto restart;
}
VATTR_NULL(&vattr);
vattr.va_type = VFIFO;
vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask;
#ifdef MAC
error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
&vattr);
if (error)
goto out;
#endif
error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
if (error == 0)
vput(nd.ni_vp);
#ifdef MAC
out:
#endif
vput(nd.ni_dvp);
vn_finished_write(mp);
VFS_UNLOCK_GIANT(vfslocked);
NDFREE(&nd, NDF_ONLY_PNBUF);
return (error);
}
/*
* Make a hard file link.
*/
#ifndef _SYS_SYSPROTO_H_
struct link_args {
char *path;
char *link;
};
#endif
int
-link(td, uap)
+sys_link(td, uap)
struct thread *td;
register struct link_args /* {
char *path;
char *link;
} */ *uap;
{
return (kern_link(td, uap->path, uap->link, UIO_USERSPACE));
}
#ifndef _SYS_SYSPROTO_H_
struct linkat_args {
int fd1;
char *path1;
int fd2;
char *path2;
int flag;
};
#endif
int
-linkat(struct thread *td, struct linkat_args *uap)
+sys_linkat(struct thread *td, struct linkat_args *uap)
{
int flag;
flag = uap->flag;
if (flag & ~AT_SYMLINK_FOLLOW)
return (EINVAL);
return (kern_linkat(td, uap->fd1, uap->fd2, uap->path1, uap->path2,
UIO_USERSPACE, (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW));
}
int hardlink_check_uid = 0;
SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
&hardlink_check_uid, 0,
"Unprivileged processes cannot create hard links to files owned by other "
"users");
static int hardlink_check_gid = 0;
SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
&hardlink_check_gid, 0,
"Unprivileged processes cannot create hard links to files owned by other "
"groups");
static int
can_hardlink(struct vnode *vp, struct ucred *cred)
{
struct vattr va;
int error;
if (!hardlink_check_uid && !hardlink_check_gid)
return (0);
error = VOP_GETATTR(vp, &va, cred);
if (error != 0)
return (error);
if (hardlink_check_uid && cred->cr_uid != va.va_uid) {
error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
if (error)
return (error);
}
if (hardlink_check_gid && !groupmember(va.va_gid, cred)) {
error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
if (error)
return (error);
}
return (0);
}
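/*
 * With security.bsd.hardlink_check_uid (or _gid) set to non-zero, an
 * unprivileged process attempting to hard-link a file owned by a
 * different uid (or a gid it is not a member of) fails unless
 * priv_check_cred() grants PRIV_VFS_LINK.
 */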
int
kern_link(struct thread *td, char *path, char *link, enum uio_seg segflg)
{
return (kern_linkat(td, AT_FDCWD, AT_FDCWD, path, link, segflg, FOLLOW));
}
int
kern_linkat(struct thread *td, int fd1, int fd2, char *path1, char *path2,
enum uio_seg segflg, int follow)
{
struct vnode *vp;
struct mount *mp;
struct nameidata nd;
int vfslocked;
int lvfslocked;
int error;
bwillwrite();
NDINIT_AT(&nd, LOOKUP, follow | MPSAFE | AUDITVNODE1, segflg, path1,
fd1, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
vp = nd.ni_vp;
if (vp->v_type == VDIR) {
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
return (EPERM); /* POSIX */
}
if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
NDINIT_AT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE2,
segflg, path2, fd2, td);
if ((error = namei(&nd)) == 0) {
lvfslocked = NDHASGIANT(&nd);
if (nd.ni_vp != NULL) {
if (nd.ni_dvp == nd.ni_vp)
vrele(nd.ni_dvp);
else
vput(nd.ni_dvp);
vrele(nd.ni_vp);
error = EEXIST;
} else if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY))
== 0) {
error = can_hardlink(vp, td->td_ucred);
if (error == 0)
#ifdef MAC
error = mac_vnode_check_link(td->td_ucred,
nd.ni_dvp, vp, &nd.ni_cnd);
if (error == 0)
#endif
error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
VOP_UNLOCK(vp, 0);
vput(nd.ni_dvp);
}
NDFREE(&nd, NDF_ONLY_PNBUF);
VFS_UNLOCK_GIANT(lvfslocked);
}
vrele(vp);
vn_finished_write(mp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Make a symbolic link.
*/
#ifndef _SYS_SYSPROTO_H_
struct symlink_args {
char *path;
char *link;
};
#endif
int
-symlink(td, uap)
+sys_symlink(td, uap)
struct thread *td;
register struct symlink_args /* {
char *path;
char *link;
} */ *uap;
{
return (kern_symlink(td, uap->path, uap->link, UIO_USERSPACE));
}
#ifndef _SYS_SYSPROTO_H_
struct symlinkat_args {
char *path;
int fd;
char *path2;
};
#endif
int
-symlinkat(struct thread *td, struct symlinkat_args *uap)
+sys_symlinkat(struct thread *td, struct symlinkat_args *uap)
{
return (kern_symlinkat(td, uap->path1, uap->fd, uap->path2,
UIO_USERSPACE));
}
int
kern_symlink(struct thread *td, char *path, char *link, enum uio_seg segflg)
{
return (kern_symlinkat(td, path, AT_FDCWD, link, segflg));
}
int
kern_symlinkat(struct thread *td, char *path1, int fd, char *path2,
enum uio_seg segflg)
{
struct mount *mp;
struct vattr vattr;
char *syspath;
int error;
struct nameidata nd;
int vfslocked;
if (segflg == UIO_SYSSPACE) {
syspath = path1;
} else {
syspath = uma_zalloc(namei_zone, M_WAITOK);
if ((error = copyinstr(path1, syspath, MAXPATHLEN, NULL)) != 0)
goto out;
}
AUDIT_ARG_TEXT(syspath);
restart:
bwillwrite();
NDINIT_AT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1,
segflg, path2, fd, td);
if ((error = namei(&nd)) != 0)
goto out;
vfslocked = NDHASGIANT(&nd);
if (nd.ni_vp) {
NDFREE(&nd, NDF_ONLY_PNBUF);
if (nd.ni_vp == nd.ni_dvp)
vrele(nd.ni_dvp);
else
vput(nd.ni_dvp);
vrele(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
error = EEXIST;
goto out;
}
if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_dvp);
VFS_UNLOCK_GIANT(vfslocked);
if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
goto out;
goto restart;
}
VATTR_NULL(&vattr);
vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask;
#ifdef MAC
vattr.va_type = VLNK;
error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
&vattr);
if (error)
goto out2;
#endif
error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath);
if (error == 0)
vput(nd.ni_vp);
#ifdef MAC
out2:
#endif
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_dvp);
vn_finished_write(mp);
VFS_UNLOCK_GIANT(vfslocked);
out:
if (segflg != UIO_SYSSPACE)
uma_zfree(namei_zone, syspath);
return (error);
}
/*
* Delete a whiteout from the filesystem.
*/
int
-undelete(td, uap)
+sys_undelete(td, uap)
struct thread *td;
register struct undelete_args /* {
char *path;
} */ *uap;
{
int error;
struct mount *mp;
struct nameidata nd;
int vfslocked;
restart:
bwillwrite();
NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | MPSAFE | AUDITVNODE1,
UIO_USERSPACE, uap->path, td);
error = namei(&nd);
if (error)
return (error);
vfslocked = NDHASGIANT(&nd);
if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
NDFREE(&nd, NDF_ONLY_PNBUF);
if (nd.ni_vp == nd.ni_dvp)
vrele(nd.ni_dvp);
else
vput(nd.ni_dvp);
if (nd.ni_vp)
vrele(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
return (EEXIST);
}
if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_dvp);
VFS_UNLOCK_GIANT(vfslocked);
if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
return (error);
goto restart;
}
error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE);
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_dvp);
vn_finished_write(mp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Delete a name from the filesystem.
*/
#ifndef _SYS_SYSPROTO_H_
struct unlink_args {
char *path;
};
#endif
int
-unlink(td, uap)
+sys_unlink(td, uap)
struct thread *td;
struct unlink_args /* {
char *path;
} */ *uap;
{
return (kern_unlink(td, uap->path, UIO_USERSPACE));
}
#ifndef _SYS_SYSPROTO_H_
struct unlinkat_args {
int fd;
char *path;
int flag;
};
#endif
int
-unlinkat(struct thread *td, struct unlinkat_args *uap)
+sys_unlinkat(struct thread *td, struct unlinkat_args *uap)
{
int flag = uap->flag;
int fd = uap->fd;
char *path = uap->path;
if (flag & ~AT_REMOVEDIR)
return (EINVAL);
if (flag & AT_REMOVEDIR)
return (kern_rmdirat(td, fd, path, UIO_USERSPACE));
else
return (kern_unlinkat(td, fd, path, UIO_USERSPACE, 0));
}
int
kern_unlink(struct thread *td, char *path, enum uio_seg pathseg)
{
return (kern_unlinkat(td, AT_FDCWD, path, pathseg, 0));
}
int
kern_unlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
ino_t oldinum)
{
struct mount *mp;
struct vnode *vp;
int error;
struct nameidata nd;
struct stat sb;
int vfslocked;
restart:
bwillwrite();
NDINIT_AT(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE | AUDITVNODE1,
pathseg, path, fd, td);
if ((error = namei(&nd)) != 0)
return (error == EINVAL ? EPERM : error);
vfslocked = NDHASGIANT(&nd);
vp = nd.ni_vp;
if (vp->v_type == VDIR && oldinum == 0) {
error = EPERM; /* POSIX */
} else if (oldinum != 0 &&
((error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td)) == 0) &&
sb.st_ino != oldinum) {
error = EIDRM; /* Identifier removed */
} else {
/*
* The root of a mounted filesystem cannot be deleted.
*
* XXX: can this only be a VDIR case?
*/
if (vp->v_vflag & VV_ROOT)
error = EBUSY;
}
if (error == 0) {
if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_dvp);
if (vp == nd.ni_dvp)
vrele(vp);
else
vput(vp);
VFS_UNLOCK_GIANT(vfslocked);
if ((error = vn_start_write(NULL, &mp,
V_XSLEEP | PCATCH)) != 0)
return (error);
goto restart;
}
#ifdef MAC
error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
&nd.ni_cnd);
if (error)
goto out;
#endif
error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
#ifdef MAC
out:
#endif
vn_finished_write(mp);
}
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_dvp);
if (vp == nd.ni_dvp)
vrele(vp);
else
vput(vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Reposition read/write file offset.
*/
#ifndef _SYS_SYSPROTO_H_
struct lseek_args {
int fd;
int pad;
off_t offset;
int whence;
};
#endif
int
-lseek(td, uap)
+sys_lseek(td, uap)
struct thread *td;
register struct lseek_args /* {
int fd;
int pad;
off_t offset;
int whence;
} */ *uap;
{
struct ucred *cred = td->td_ucred;
struct file *fp;
struct vnode *vp;
struct vattr vattr;
off_t offset, size;
int error, noneg;
int vfslocked;
AUDIT_ARG_FD(uap->fd);
if ((error = fget(td, uap->fd, CAP_SEEK, &fp)) != 0)
return (error);
if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) {
fdrop(fp, td);
return (ESPIPE);
}
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
noneg = (vp->v_type != VCHR);
offset = uap->offset;
switch (uap->whence) {
case L_INCR:
if (noneg &&
(fp->f_offset < 0 ||
(offset > 0 && fp->f_offset > OFF_MAX - offset))) {
error = EOVERFLOW;
break;
}
offset += fp->f_offset;
break;
case L_XTND:
vn_lock(vp, LK_SHARED | LK_RETRY);
error = VOP_GETATTR(vp, &vattr, cred);
VOP_UNLOCK(vp, 0);
if (error)
break;
/*
* If the file references a disk device, then fetch
* the media size and use that to determine the ending
* offset.
*/
if (vattr.va_size == 0 && vp->v_type == VCHR &&
fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0)
vattr.va_size = size;
if (noneg &&
(vattr.va_size > OFF_MAX ||
(offset > 0 && vattr.va_size > OFF_MAX - offset))) {
error = EOVERFLOW;
break;
}
offset += vattr.va_size;
break;
case L_SET:
break;
case SEEK_DATA:
error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td);
break;
case SEEK_HOLE:
error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td);
break;
default:
error = EINVAL;
}
if (error == 0 && noneg && offset < 0)
error = EINVAL;
if (error != 0)
goto drop;
fp->f_offset = offset;
*(off_t *)(td->td_retval) = fp->f_offset;
drop:
fdrop(fp, td);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
#if defined(COMPAT_43)
/*
* Reposition read/write file offset.
*/
#ifndef _SYS_SYSPROTO_H_
struct olseek_args {
int fd;
long offset;
int whence;
};
#endif
int
olseek(td, uap)
struct thread *td;
register struct olseek_args /* {
int fd;
long offset;
int whence;
} */ *uap;
{
struct lseek_args /* {
int fd;
int pad;
off_t offset;
int whence;
} */ nuap;
nuap.fd = uap->fd;
nuap.offset = uap->offset;
nuap.whence = uap->whence;
- return (lseek(td, &nuap));
+ return (sys_lseek(td, &nuap));
}
#endif /* COMPAT_43 */
/* Version with the 'pad' argument */
int
freebsd6_lseek(td, uap)
struct thread *td;
register struct freebsd6_lseek_args *uap;
{
struct lseek_args ouap;
ouap.fd = uap->fd;
ouap.offset = uap->offset;
ouap.whence = uap->whence;
- return (lseek(td, &ouap));
+ return (sys_lseek(td, &ouap));
}
/*
* Check access permissions using passed credentials.
*/
static int
vn_access(vp, user_flags, cred, td)
struct vnode *vp;
int user_flags;
struct ucred *cred;
struct thread *td;
{
int error;
accmode_t accmode;
/* Flags == 0 means only check for existence. */
error = 0;
if (user_flags) {
accmode = 0;
if (user_flags & R_OK)
accmode |= VREAD;
if (user_flags & W_OK)
accmode |= VWRITE;
if (user_flags & X_OK)
accmode |= VEXEC;
#ifdef MAC
error = mac_vnode_check_access(cred, vp, accmode);
if (error)
return (error);
#endif
if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0)
error = VOP_ACCESS(vp, accmode, cred, td);
}
return (error);
}
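/*
 * The caller-supplied access(2) flags map onto vnode access bits as
 * R_OK -> VREAD, W_OK -> VWRITE and X_OK -> VEXEC; user_flags == 0
 * (F_OK) only checks that the path resolves.
 */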
/*
* Check access permissions using "real" credentials.
*/
#ifndef _SYS_SYSPROTO_H_
struct access_args {
char *path;
int flags;
};
#endif
int
-access(td, uap)
+sys_access(td, uap)
struct thread *td;
register struct access_args /* {
char *path;
int flags;
} */ *uap;
{
return (kern_access(td, uap->path, UIO_USERSPACE, uap->flags));
}
#ifndef _SYS_SYSPROTO_H_
struct faccessat_args {
int dirfd;
char *path;
int mode;
int flag;
}
#endif
int
-faccessat(struct thread *td, struct faccessat_args *uap)
+sys_faccessat(struct thread *td, struct faccessat_args *uap)
{
if (uap->flag & ~AT_EACCESS)
return (EINVAL);
return (kern_accessat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
uap->mode));
}
int
kern_access(struct thread *td, char *path, enum uio_seg pathseg, int mode)
{
return (kern_accessat(td, AT_FDCWD, path, pathseg, 0, mode));
}
int
kern_accessat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
int flags, int mode)
{
struct ucred *cred, *tmpcred;
struct vnode *vp;
struct nameidata nd;
int vfslocked;
int error;
/*
* Create and modify a temporary credential instead of one that
* is potentially shared.
*/
if (!(flags & AT_EACCESS)) {
cred = td->td_ucred;
tmpcred = crdup(cred);
tmpcred->cr_uid = cred->cr_ruid;
tmpcred->cr_groups[0] = cred->cr_rgid;
td->td_ucred = tmpcred;
} else
cred = tmpcred = td->td_ucred;
AUDIT_ARG_VALUE(mode);
NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE |
AUDITVNODE1, pathseg, path, fd, CAP_FSTAT, td);
if ((error = namei(&nd)) != 0)
goto out1;
vfslocked = NDHASGIANT(&nd);
vp = nd.ni_vp;
error = vn_access(vp, mode, tmpcred, td);
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(vp);
VFS_UNLOCK_GIANT(vfslocked);
out1:
if (!(flags & AT_EACCESS)) {
td->td_ucred = cred;
crfree(tmpcred);
}
return (error);
}
/*
* Check access permissions using "effective" credentials.
*/
#ifndef _SYS_SYSPROTO_H_
struct eaccess_args {
char *path;
int flags;
};
#endif
int
-eaccess(td, uap)
+sys_eaccess(td, uap)
struct thread *td;
register struct eaccess_args /* {
char *path;
int flags;
} */ *uap;
{
return (kern_eaccess(td, uap->path, UIO_USERSPACE, uap->flags));
}
int
kern_eaccess(struct thread *td, char *path, enum uio_seg pathseg, int flags)
{
return (kern_accessat(td, AT_FDCWD, path, pathseg, AT_EACCESS, flags));
}
#if defined(COMPAT_43)
/*
* Get file status; this version follows links.
*/
#ifndef _SYS_SYSPROTO_H_
struct ostat_args {
char *path;
struct ostat *ub;
};
#endif
int
ostat(td, uap)
struct thread *td;
register struct ostat_args /* {
char *path;
struct ostat *ub;
} */ *uap;
{
struct stat sb;
struct ostat osb;
int error;
error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
if (error)
return (error);
cvtstat(&sb, &osb);
error = copyout(&osb, uap->ub, sizeof (osb));
return (error);
}
/*
* Get file status; this version does not follow links.
*/
#ifndef _SYS_SYSPROTO_H_
struct olstat_args {
char *path;
struct ostat *ub;
};
#endif
int
olstat(td, uap)
struct thread *td;
register struct olstat_args /* {
char *path;
struct ostat *ub;
} */ *uap;
{
struct stat sb;
struct ostat osb;
int error;
error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
if (error)
return (error);
cvtstat(&sb, &osb);
error = copyout(&osb, uap->ub, sizeof (osb));
return (error);
}
/*
* Convert from an old to a new stat structure.
*/
void
cvtstat(st, ost)
struct stat *st;
struct ostat *ost;
{
ost->st_dev = st->st_dev;
ost->st_ino = st->st_ino;
ost->st_mode = st->st_mode;
ost->st_nlink = st->st_nlink;
ost->st_uid = st->st_uid;
ost->st_gid = st->st_gid;
ost->st_rdev = st->st_rdev;
if (st->st_size < (quad_t)1 << 32)
ost->st_size = st->st_size;
else
ost->st_size = -2;
ost->st_atim = st->st_atim;
ost->st_mtim = st->st_mtim;
ost->st_ctim = st->st_ctim;
ost->st_blksize = st->st_blksize;
ost->st_blocks = st->st_blocks;
ost->st_flags = st->st_flags;
ost->st_gen = st->st_gen;
}
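/*
 * Note that the old stat structure uses 32-bit file sizes; a size
 * that does not fit is reported as -2 rather than being truncated.
 */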
#endif /* COMPAT_43 */
/*
* Get file status; this version follows links.
*/
#ifndef _SYS_SYSPROTO_H_
struct stat_args {
char *path;
struct stat *ub;
};
#endif
int
-stat(td, uap)
+sys_stat(td, uap)
struct thread *td;
register struct stat_args /* {
char *path;
struct stat *ub;
} */ *uap;
{
struct stat sb;
int error;
error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
if (error == 0)
error = copyout(&sb, uap->ub, sizeof (sb));
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct fstatat_args {
int fd;
char *path;
struct stat *buf;
int flag;
}
#endif
int
-fstatat(struct thread *td, struct fstatat_args *uap)
+sys_fstatat(struct thread *td, struct fstatat_args *uap)
{
struct stat sb;
int error;
error = kern_statat(td, uap->flag, uap->fd, uap->path,
UIO_USERSPACE, &sb);
if (error == 0)
error = copyout(&sb, uap->buf, sizeof (sb));
return (error);
}
int
kern_stat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
{
return (kern_statat(td, 0, AT_FDCWD, path, pathseg, sbp));
}
int
kern_statat(struct thread *td, int flag, int fd, char *path,
enum uio_seg pathseg, struct stat *sbp)
{
return (kern_statat_vnhook(td, flag, fd, path, pathseg, sbp, NULL));
}
int
kern_statat_vnhook(struct thread *td, int flag, int fd, char *path,
enum uio_seg pathseg, struct stat *sbp,
void (*hook)(struct vnode *vp, struct stat *sbp))
{
struct nameidata nd;
struct stat sb;
int error, vfslocked;
if (flag & ~AT_SYMLINK_NOFOLLOW)
return (EINVAL);
NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW :
FOLLOW) | LOCKSHARED | LOCKLEAF | AUDITVNODE1 | MPSAFE, pathseg,
path, fd, CAP_FSTAT, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td);
if (!error) {
SDT_PROBE(vfs, , stat, mode, path, sb.st_mode, 0, 0, 0);
if (S_ISREG(sb.st_mode))
SDT_PROBE(vfs, , stat, reg, path, pathseg, 0, 0, 0);
if (__predict_false(hook != NULL))
hook(nd.ni_vp, &sb);
}
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
if (error)
return (error);
*sbp = sb;
#ifdef KTRACE
if (KTRPOINT(td, KTR_STRUCT))
ktrstat(&sb);
#endif
return (0);
}
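/*
 * On success the SDT probes declared at the top of this file fire
 * with the looked-up path and st_mode, and any caller-supplied hook
 * is invoked while the vnode is still held.
 */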
/*
* Get file status; this version does not follow links.
*/
#ifndef _SYS_SYSPROTO_H_
struct lstat_args {
char *path;
struct stat *ub;
};
#endif
int
-lstat(td, uap)
+sys_lstat(td, uap)
struct thread *td;
register struct lstat_args /* {
char *path;
struct stat *ub;
} */ *uap;
{
struct stat sb;
int error;
error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
if (error == 0)
error = copyout(&sb, uap->ub, sizeof (sb));
return (error);
}
int
kern_lstat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
{
return (kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, path, pathseg,
sbp));
}
/*
* Implementation of the NetBSD [l]stat() functions.
*/
void
cvtnstat(sb, nsb)
struct stat *sb;
struct nstat *nsb;
{
bzero(nsb, sizeof *nsb);
nsb->st_dev = sb->st_dev;
nsb->st_ino = sb->st_ino;
nsb->st_mode = sb->st_mode;
nsb->st_nlink = sb->st_nlink;
nsb->st_uid = sb->st_uid;
nsb->st_gid = sb->st_gid;
nsb->st_rdev = sb->st_rdev;
nsb->st_atim = sb->st_atim;
nsb->st_mtim = sb->st_mtim;
nsb->st_ctim = sb->st_ctim;
nsb->st_size = sb->st_size;
nsb->st_blocks = sb->st_blocks;
nsb->st_blksize = sb->st_blksize;
nsb->st_flags = sb->st_flags;
nsb->st_gen = sb->st_gen;
nsb->st_birthtim = sb->st_birthtim;
}
#ifndef _SYS_SYSPROTO_H_
struct nstat_args {
char *path;
struct nstat *ub;
};
#endif
int
-nstat(td, uap)
+sys_nstat(td, uap)
struct thread *td;
register struct nstat_args /* {
char *path;
struct nstat *ub;
} */ *uap;
{
struct stat sb;
struct nstat nsb;
int error;
error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
if (error)
return (error);
cvtnstat(&sb, &nsb);
error = copyout(&nsb, uap->ub, sizeof (nsb));
return (error);
}
/*
* NetBSD lstat. Get file status; this version does not follow links.
*/
#ifndef _SYS_SYSPROTO_H_
struct lstat_args {
char *path;
struct stat *ub;
};
#endif
int
-nlstat(td, uap)
+sys_nlstat(td, uap)
struct thread *td;
register struct nlstat_args /* {
char *path;
struct nstat *ub;
} */ *uap;
{
struct stat sb;
struct nstat nsb;
int error;
error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
if (error)
return (error);
cvtnstat(&sb, &nsb);
error = copyout(&nsb, uap->ub, sizeof (nsb));
return (error);
}
/*
* Get configurable pathname variables.
*/
#ifndef _SYS_SYSPROTO_H_
struct pathconf_args {
char *path;
int name;
};
#endif
int
-pathconf(td, uap)
+sys_pathconf(td, uap)
struct thread *td;
register struct pathconf_args /* {
char *path;
int name;
} */ *uap;
{
return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, FOLLOW));
}
#ifndef _SYS_SYSPROTO_H_
struct lpathconf_args {
char *path;
int name;
};
#endif
int
-lpathconf(td, uap)
+sys_lpathconf(td, uap)
struct thread *td;
register struct lpathconf_args /* {
char *path;
int name;
} */ *uap;
{
return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, NOFOLLOW));
}
int
kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg, int name,
u_long flags)
{
struct nameidata nd;
int error, vfslocked;
NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | MPSAFE | AUDITVNODE1 |
flags, pathseg, path, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
/* If asynchronous I/O is available, it works for all files. */
if (name == _PC_ASYNC_IO)
td->td_retval[0] = async_io_version;
else
error = VOP_PATHCONF(nd.ni_vp, name, td->td_retval);
vput(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Return target name of a symbolic link.
*/
#ifndef _SYS_SYSPROTO_H_
struct readlink_args {
char *path;
char *buf;
size_t count;
};
#endif
int
-readlink(td, uap)
+sys_readlink(td, uap)
struct thread *td;
register struct readlink_args /* {
char *path;
char *buf;
size_t count;
} */ *uap;
{
return (kern_readlink(td, uap->path, UIO_USERSPACE, uap->buf,
UIO_USERSPACE, uap->count));
}
#ifndef _SYS_SYSPROTO_H_
struct readlinkat_args {
int fd;
char *path;
char *buf;
size_t bufsize;
};
#endif
int
-readlinkat(struct thread *td, struct readlinkat_args *uap)
+sys_readlinkat(struct thread *td, struct readlinkat_args *uap)
{
return (kern_readlinkat(td, uap->fd, uap->path, UIO_USERSPACE,
uap->buf, UIO_USERSPACE, uap->bufsize));
}
int
kern_readlink(struct thread *td, char *path, enum uio_seg pathseg, char *buf,
enum uio_seg bufseg, size_t count)
{
return (kern_readlinkat(td, AT_FDCWD, path, pathseg, buf, bufseg,
count));
}
int
kern_readlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
char *buf, enum uio_seg bufseg, size_t count)
{
struct vnode *vp;
struct iovec aiov;
struct uio auio;
int error;
struct nameidata nd;
int vfslocked;
if (count > INT_MAX)
return (EINVAL);
NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE |
AUDITVNODE1, pathseg, path, fd, td);
if ((error = namei(&nd)) != 0)
return (error);
NDFREE(&nd, NDF_ONLY_PNBUF);
vfslocked = NDHASGIANT(&nd);
vp = nd.ni_vp;
#ifdef MAC
error = mac_vnode_check_readlink(td->td_ucred, vp);
if (error) {
vput(vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
#endif
if (vp->v_type != VLNK)
error = EINVAL;
else {
aiov.iov_base = buf;
aiov.iov_len = count;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_offset = 0;
auio.uio_rw = UIO_READ;
auio.uio_segflg = bufseg;
auio.uio_td = td;
auio.uio_resid = count;
error = VOP_READLINK(vp, &auio, td->td_ucred);
}
vput(vp);
VFS_UNLOCK_GIANT(vfslocked);
td->td_retval[0] = count - auio.uio_resid;
return (error);
}
/*
* Common implementation code for chflags() and fchflags().
*/
static int
setfflags(td, vp, flags)
struct thread *td;
struct vnode *vp;
int flags;
{
int error;
struct mount *mp;
struct vattr vattr;
/*
* Prevent non-root users from setting flags on devices. When a
* device is reused, users could otherwise retain ownership of the
* device if they were allowed to set flags, and programs assume
* that chown(2) can't fail when done as root.
*/
if (vp->v_type == VCHR || vp->v_type == VBLK) {
error = priv_check(td, PRIV_VFS_CHFLAGS_DEV);
if (error)
return (error);
}
if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
return (error);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
VATTR_NULL(&vattr);
vattr.va_flags = flags;
#ifdef MAC
error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags);
if (error == 0)
#endif
error = VOP_SETATTR(vp, &vattr, td->td_ucred);
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
return (error);
}
/*
* Change flags of a file given a path name.
*/
#ifndef _SYS_SYSPROTO_H_
struct chflags_args {
char *path;
int flags;
};
#endif
int
-chflags(td, uap)
+sys_chflags(td, uap)
struct thread *td;
register struct chflags_args /* {
char *path;
int flags;
} */ *uap;
{
int error;
struct nameidata nd;
int vfslocked;
AUDIT_ARG_FFLAGS(uap->flags);
NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE,
uap->path, td);
if ((error = namei(&nd)) != 0)
return (error);
NDFREE(&nd, NDF_ONLY_PNBUF);
vfslocked = NDHASGIANT(&nd);
error = setfflags(td, nd.ni_vp, uap->flags);
vrele(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Same as chflags() but doesn't follow symlinks.
*/
int
-lchflags(td, uap)
+sys_lchflags(td, uap)
struct thread *td;
register struct lchflags_args /* {
char *path;
int flags;
} */ *uap;
{
int error;
struct nameidata nd;
int vfslocked;
AUDIT_ARG_FFLAGS(uap->flags);
NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE,
uap->path, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
error = setfflags(td, nd.ni_vp, uap->flags);
vrele(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Change flags of a file given a file descriptor.
*/
#ifndef _SYS_SYSPROTO_H_
struct fchflags_args {
int fd;
int flags;
};
#endif
int
-fchflags(td, uap)
+sys_fchflags(td, uap)
struct thread *td;
register struct fchflags_args /* {
int fd;
int flags;
} */ *uap;
{
struct file *fp;
int vfslocked;
int error;
AUDIT_ARG_FD(uap->fd);
AUDIT_ARG_FFLAGS(uap->flags);
if ((error = getvnode(td->td_proc->p_fd, uap->fd, CAP_FCHFLAGS,
&fp)) != 0)
return (error);
vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
#ifdef AUDIT
vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
AUDIT_ARG_VNODE1(fp->f_vnode);
VOP_UNLOCK(fp->f_vnode, 0);
#endif
error = setfflags(td, fp->f_vnode, uap->flags);
VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
return (error);
}
/*
* Common implementation code for chmod(), lchmod() and fchmod().
*/
int
setfmode(td, cred, vp, mode)
struct thread *td;
struct ucred *cred;
struct vnode *vp;
int mode;
{
int error;
struct mount *mp;
struct vattr vattr;
if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
return (error);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
VATTR_NULL(&vattr);
vattr.va_mode = mode & ALLPERMS;
#ifdef MAC
error = mac_vnode_check_setmode(cred, vp, vattr.va_mode);
if (error == 0)
#endif
error = VOP_SETATTR(vp, &vattr, cred);
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
return (error);
}
/*
* Change mode of a file given path name.
*/
#ifndef _SYS_SYSPROTO_H_
struct chmod_args {
char *path;
int mode;
};
#endif
int
-chmod(td, uap)
+sys_chmod(td, uap)
struct thread *td;
register struct chmod_args /* {
char *path;
int mode;
} */ *uap;
{
return (kern_chmod(td, uap->path, UIO_USERSPACE, uap->mode));
}
#ifndef _SYS_SYSPROTO_H_
struct fchmodat_args {
int dirfd;
char *path;
mode_t mode;
int flag;
}
#endif
int
-fchmodat(struct thread *td, struct fchmodat_args *uap)
+sys_fchmodat(struct thread *td, struct fchmodat_args *uap)
{
int flag = uap->flag;
int fd = uap->fd;
char *path = uap->path;
mode_t mode = uap->mode;
if (flag & ~AT_SYMLINK_NOFOLLOW)
return (EINVAL);
return (kern_fchmodat(td, fd, path, UIO_USERSPACE, mode, flag));
}
int
kern_chmod(struct thread *td, char *path, enum uio_seg pathseg, int mode)
{
return (kern_fchmodat(td, AT_FDCWD, path, pathseg, mode, 0));
}
/*
* Change mode of a file given path name (don't follow links.)
*/
#ifndef _SYS_SYSPROTO_H_
struct lchmod_args {
char *path;
int mode;
};
#endif
int
-lchmod(td, uap)
+sys_lchmod(td, uap)
struct thread *td;
register struct lchmod_args /* {
char *path;
int mode;
} */ *uap;
{
return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
uap->mode, AT_SYMLINK_NOFOLLOW));
}
int
kern_fchmodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
mode_t mode, int flag)
{
int error;
struct nameidata nd;
int vfslocked;
int follow;
AUDIT_ARG_MODE(mode);
follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
NDINIT_ATRIGHTS(&nd, LOOKUP, follow | MPSAFE | AUDITVNODE1, pathseg,
path, fd, CAP_FCHMOD, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
error = setfmode(td, td->td_ucred, nd.ni_vp, mode);
vrele(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Change mode of a file given a file descriptor.
*/
#ifndef _SYS_SYSPROTO_H_
struct fchmod_args {
int fd;
int mode;
};
#endif
int
-fchmod(struct thread *td, struct fchmod_args *uap)
+sys_fchmod(struct thread *td, struct fchmod_args *uap)
{
struct file *fp;
int error;
AUDIT_ARG_FD(uap->fd);
AUDIT_ARG_MODE(uap->mode);
error = fget(td, uap->fd, CAP_FCHMOD, &fp);
if (error != 0)
return (error);
error = fo_chmod(fp, uap->mode, td->td_ucred, td);
fdrop(fp, td);
return (error);
}
/*
* Common implementation for chown(), lchown(), and fchown()
*/
int
setfown(td, cred, vp, uid, gid)
struct thread *td;
struct ucred *cred;
struct vnode *vp;
uid_t uid;
gid_t gid;
{
int error;
struct mount *mp;
struct vattr vattr;
if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
return (error);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
VATTR_NULL(&vattr);
vattr.va_uid = uid;
vattr.va_gid = gid;
#ifdef MAC
error = mac_vnode_check_setowner(cred, vp, vattr.va_uid,
vattr.va_gid);
if (error == 0)
#endif
error = VOP_SETATTR(vp, &vattr, cred);
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
return (error);
}
/*
* Set ownership given a path name.
*/
#ifndef _SYS_SYSPROTO_H_
struct chown_args {
char *path;
int uid;
int gid;
};
#endif
int
-chown(td, uap)
+sys_chown(td, uap)
struct thread *td;
register struct chown_args /* {
char *path;
int uid;
int gid;
} */ *uap;
{
return (kern_chown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid));
}
#ifndef _SYS_SYSPROTO_H_
struct fchownat_args {
int fd;
const char * path;
uid_t uid;
gid_t gid;
int flag;
};
#endif
int
-fchownat(struct thread *td, struct fchownat_args *uap)
+sys_fchownat(struct thread *td, struct fchownat_args *uap)
{
int flag;
flag = uap->flag;
if (flag & ~AT_SYMLINK_NOFOLLOW)
return (EINVAL);
return (kern_fchownat(td, uap->fd, uap->path, UIO_USERSPACE, uap->uid,
uap->gid, uap->flag));
}
int
kern_chown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
int gid)
{
return (kern_fchownat(td, AT_FDCWD, path, pathseg, uid, gid, 0));
}
int
kern_fchownat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
int uid, int gid, int flag)
{
struct nameidata nd;
int error, vfslocked, follow;
AUDIT_ARG_OWNER(uid, gid);
follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
NDINIT_ATRIGHTS(&nd, LOOKUP, follow | MPSAFE | AUDITVNODE1, pathseg,
path, fd, CAP_FCHOWN, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
error = setfown(td, td->td_ucred, nd.ni_vp, uid, gid);
vrele(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Set ownership given a path name, do not cross symlinks.
*/
#ifndef _SYS_SYSPROTO_H_
struct lchown_args {
char *path;
int uid;
int gid;
};
#endif
int
-lchown(td, uap)
+sys_lchown(td, uap)
struct thread *td;
register struct lchown_args /* {
char *path;
int uid;
int gid;
} */ *uap;
{
return (kern_lchown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid));
}
int
kern_lchown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
int gid)
{
return (kern_fchownat(td, AT_FDCWD, path, pathseg, uid, gid,
AT_SYMLINK_NOFOLLOW));
}
/*
* Set ownership given a file descriptor.
*/
#ifndef _SYS_SYSPROTO_H_
struct fchown_args {
int fd;
int uid;
int gid;
};
#endif
int
-fchown(td, uap)
+sys_fchown(td, uap)
struct thread *td;
register struct fchown_args /* {
int fd;
int uid;
int gid;
} */ *uap;
{
struct file *fp;
int error;
AUDIT_ARG_FD(uap->fd);
AUDIT_ARG_OWNER(uap->uid, uap->gid);
error = fget(td, uap->fd, CAP_FCHOWN, &fp);
if (error != 0)
return (error);
error = fo_chown(fp, uap->uid, uap->gid, td->td_ucred, td);
fdrop(fp, td);
return (error);
}
/*
* Common implementation code for utimes(), lutimes(), and futimes().
*/
static int
getutimes(usrtvp, tvpseg, tsp)
const struct timeval *usrtvp;
enum uio_seg tvpseg;
struct timespec *tsp;
{
struct timeval tv[2];
const struct timeval *tvp;
int error;
if (usrtvp == NULL) {
vfs_timestamp(&tsp[0]);
tsp[1] = tsp[0];
} else {
if (tvpseg == UIO_SYSSPACE) {
tvp = usrtvp;
} else {
if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0)
return (error);
tvp = tv;
}
if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 ||
tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000)
return (EINVAL);
TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
}
return (0);
}
/*
* Common implementation code for utimes(), lutimes(), and futimes().
*/
static int
setutimes(td, vp, ts, numtimes, nullflag)
struct thread *td;
struct vnode *vp;
const struct timespec *ts;
int numtimes;
int nullflag;
{
int error, setbirthtime;
struct mount *mp;
struct vattr vattr;
if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
return (error);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
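/*
* If only atime/mtime were supplied (numtimes < 3) and the new
* modification time predates the file's current birthtime, pull the
* birthtime back as well so it never ends up later than the mtime.
*/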
setbirthtime = 0;
if (numtimes < 3 && !VOP_GETATTR(vp, &vattr, td->td_ucred) &&
timespeccmp(&ts[1], &vattr.va_birthtime, < ))
setbirthtime = 1;
VATTR_NULL(&vattr);
vattr.va_atime = ts[0];
vattr.va_mtime = ts[1];
if (setbirthtime)
vattr.va_birthtime = ts[1];
if (numtimes > 2)
vattr.va_birthtime = ts[2];
if (nullflag)
vattr.va_vaflags |= VA_UTIMES_NULL;
#ifdef MAC
error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime,
vattr.va_mtime);
#endif
if (error == 0)
error = VOP_SETATTR(vp, &vattr, td->td_ucred);
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
return (error);
}
/*
* Set the access and modification times of a file.
*/
#ifndef _SYS_SYSPROTO_H_
struct utimes_args {
char *path;
struct timeval *tptr;
};
#endif
int
-utimes(td, uap)
+sys_utimes(td, uap)
struct thread *td;
register struct utimes_args /* {
char *path;
struct timeval *tptr;
} */ *uap;
{
return (kern_utimes(td, uap->path, UIO_USERSPACE, uap->tptr,
UIO_USERSPACE));
}
#ifndef _SYS_SYSPROTO_H_
struct futimesat_args {
int fd;
const char * path;
const struct timeval * times;
};
#endif
int
-futimesat(struct thread *td, struct futimesat_args *uap)
+sys_futimesat(struct thread *td, struct futimesat_args *uap)
{
return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE,
uap->times, UIO_USERSPACE));
}
int
kern_utimes(struct thread *td, char *path, enum uio_seg pathseg,
struct timeval *tptr, enum uio_seg tptrseg)
{
return (kern_utimesat(td, AT_FDCWD, path, pathseg, tptr, tptrseg));
}
int
kern_utimesat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
struct timeval *tptr, enum uio_seg tptrseg)
{
struct nameidata nd;
struct timespec ts[2];
int error, vfslocked;
if ((error = getutimes(tptr, tptrseg, ts)) != 0)
return (error);
NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg,
path, fd, CAP_FUTIMES, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
vrele(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Set the access and modification times of a file.
*/
#ifndef _SYS_SYSPROTO_H_
struct lutimes_args {
char *path;
struct timeval *tptr;
};
#endif
int
-lutimes(td, uap)
+sys_lutimes(td, uap)
struct thread *td;
register struct lutimes_args /* {
char *path;
struct timeval *tptr;
} */ *uap;
{
return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr,
UIO_USERSPACE));
}
int
kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg,
struct timeval *tptr, enum uio_seg tptrseg)
{
struct timespec ts[2];
int error;
struct nameidata nd;
int vfslocked;
if ((error = getutimes(tptr, tptrseg, ts)) != 0)
return (error);
NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
vrele(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Set the access and modification times of a file.
*/
#ifndef _SYS_SYSPROTO_H_
struct futimes_args {
int fd;
struct timeval *tptr;
};
#endif
int
-futimes(td, uap)
+sys_futimes(td, uap)
struct thread *td;
register struct futimes_args /* {
int fd;
struct timeval *tptr;
} */ *uap;
{
return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE));
}
int
kern_futimes(struct thread *td, int fd, struct timeval *tptr,
enum uio_seg tptrseg)
{
struct timespec ts[2];
struct file *fp;
int vfslocked;
int error;
AUDIT_ARG_FD(fd);
if ((error = getutimes(tptr, tptrseg, ts)) != 0)
return (error);
if ((error = getvnode(td->td_proc->p_fd, fd, CAP_FUTIMES, &fp))
!= 0)
return (error);
vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
#ifdef AUDIT
vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
AUDIT_ARG_VNODE1(fp->f_vnode);
VOP_UNLOCK(fp->f_vnode, 0);
#endif
error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL);
VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
return (error);
}
/*
* Truncate a file given its path name.
*/
#ifndef _SYS_SYSPROTO_H_
struct truncate_args {
char *path;
int pad;
off_t length;
};
#endif
int
-truncate(td, uap)
+sys_truncate(td, uap)
struct thread *td;
register struct truncate_args /* {
char *path;
int pad;
off_t length;
} */ *uap;
{
return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
}
int
kern_truncate(struct thread *td, char *path, enum uio_seg pathseg, off_t length)
{
struct mount *mp;
struct vnode *vp;
struct vattr vattr;
int error;
struct nameidata nd;
int vfslocked;
if (length < 0)
return(EINVAL);
NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
vp = nd.ni_vp;
if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
NDFREE(&nd, NDF_ONLY_PNBUF);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
if (vp->v_type == VDIR)
error = EISDIR;
#ifdef MAC
else if ((error = mac_vnode_check_write(td->td_ucred, NOCRED, vp))) {
}
#endif
else if ((error = vn_writechk(vp)) == 0 &&
(error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) {
VATTR_NULL(&vattr);
vattr.va_size = length;
error = VOP_SETATTR(vp, &vattr, td->td_ucred);
}
vput(vp);
vn_finished_write(mp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
#if defined(COMPAT_43)
/*
* Truncate a file given its path name.
*/
#ifndef _SYS_SYSPROTO_H_
struct otruncate_args {
char *path;
long length;
};
#endif
int
otruncate(td, uap)
struct thread *td;
register struct otruncate_args /* {
char *path;
long length;
} */ *uap;
{
struct truncate_args /* {
char *path;
int pad;
off_t length;
} */ nuap;
nuap.path = uap->path;
nuap.length = uap->length;
- return (truncate(td, &nuap));
+ return (sys_truncate(td, &nuap));
}
#endif /* COMPAT_43 */
/* Versions with the pad argument */
int
freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap)
{
struct truncate_args ouap;
ouap.path = uap->path;
ouap.length = uap->length;
- return (truncate(td, &ouap));
+ return (sys_truncate(td, &ouap));
}
int
freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap)
{
struct ftruncate_args ouap;
ouap.fd = uap->fd;
ouap.length = uap->length;
- return (ftruncate(td, &ouap));
+ return (sys_ftruncate(td, &ouap));
}
/*
* Sync an open file.
*/
#ifndef _SYS_SYSPROTO_H_
struct fsync_args {
int fd;
};
#endif
int
-fsync(td, uap)
+sys_fsync(td, uap)
struct thread *td;
struct fsync_args /* {
int fd;
} */ *uap;
{
struct vnode *vp;
struct mount *mp;
struct file *fp;
int vfslocked;
int error, lock_flags;
AUDIT_ARG_FD(uap->fd);
if ((error = getvnode(td->td_proc->p_fd, uap->fd, CAP_FSYNC,
&fp)) != 0)
return (error);
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
goto drop;
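/*
* Filesystems that advertise support for concurrent writes
* (MNT_SHARED_WRITES) can be fsync'ed under a shared vnode lock;
* everything else needs the exclusive lock.
*/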
if (MNT_SHARED_WRITES(mp) ||
((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) {
lock_flags = LK_SHARED;
} else {
lock_flags = LK_EXCLUSIVE;
}
vn_lock(vp, lock_flags | LK_RETRY);
AUDIT_ARG_VNODE1(vp);
if (vp->v_object != NULL) {
VM_OBJECT_LOCK(vp->v_object);
vm_object_page_clean(vp->v_object, 0, 0, 0);
VM_OBJECT_UNLOCK(vp->v_object);
}
error = VOP_FSYNC(vp, MNT_WAIT, td);
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
drop:
VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
return (error);
}
/*
* Rename files. Source and destination must either both be directories, or
* both not be directories. If target is a directory, it must be empty.
*/
#ifndef _SYS_SYSPROTO_H_
struct rename_args {
char *from;
char *to;
};
#endif
int
-rename(td, uap)
+sys_rename(td, uap)
struct thread *td;
register struct rename_args /* {
char *from;
char *to;
} */ *uap;
{
return (kern_rename(td, uap->from, uap->to, UIO_USERSPACE));
}
#ifndef _SYS_SYSPROTO_H_
struct renameat_args {
int oldfd;
char *old;
int newfd;
char *new;
};
#endif
int
-renameat(struct thread *td, struct renameat_args *uap)
+sys_renameat(struct thread *td, struct renameat_args *uap)
{
return (kern_renameat(td, uap->oldfd, uap->old, uap->newfd, uap->new,
UIO_USERSPACE));
}
int
kern_rename(struct thread *td, char *from, char *to, enum uio_seg pathseg)
{
return (kern_renameat(td, AT_FDCWD, from, AT_FDCWD, to, pathseg));
}
int
kern_renameat(struct thread *td, int oldfd, char *old, int newfd, char *new,
enum uio_seg pathseg)
{
struct mount *mp = NULL;
struct vnode *tvp, *fvp, *tdvp;
struct nameidata fromnd, tond;
int tvfslocked;
int fvfslocked;
int error;
bwillwrite();
#ifdef MAC
NDINIT_ATRIGHTS(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART |
MPSAFE | AUDITVNODE1, pathseg, old, oldfd, CAP_DELETE, td);
#else
NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | MPSAFE |
AUDITVNODE1, pathseg, old, oldfd, CAP_DELETE, td);
#endif
if ((error = namei(&fromnd)) != 0)
return (error);
fvfslocked = NDHASGIANT(&fromnd);
tvfslocked = 0;
#ifdef MAC
error = mac_vnode_check_rename_from(td->td_ucred, fromnd.ni_dvp,
fromnd.ni_vp, &fromnd.ni_cnd);
VOP_UNLOCK(fromnd.ni_dvp, 0);
if (fromnd.ni_dvp != fromnd.ni_vp)
VOP_UNLOCK(fromnd.ni_vp, 0);
#endif
fvp = fromnd.ni_vp;
if (error == 0)
error = vn_start_write(fvp, &mp, V_WAIT | PCATCH);
if (error != 0) {
NDFREE(&fromnd, NDF_ONLY_PNBUF);
vrele(fromnd.ni_dvp);
vrele(fvp);
goto out1;
}
NDINIT_ATRIGHTS(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE |
SAVESTART | MPSAFE | AUDITVNODE2, pathseg, new, newfd, CAP_CREATE,
td);
if (fromnd.ni_vp->v_type == VDIR)
tond.ni_cnd.cn_flags |= WILLBEDIR;
if ((error = namei(&tond)) != 0) {
/* Translate error code for rename("dir1", "dir2/."). */
if (error == EISDIR && fvp->v_type == VDIR)
error = EINVAL;
NDFREE(&fromnd, NDF_ONLY_PNBUF);
vrele(fromnd.ni_dvp);
vrele(fvp);
vn_finished_write(mp);
goto out1;
}
tvfslocked = NDHASGIANT(&tond);
tdvp = tond.ni_dvp;
tvp = tond.ni_vp;
if (tvp != NULL) {
if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
error = ENOTDIR;
goto out;
} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
error = EISDIR;
goto out;
}
}
if (fvp == tdvp) {
error = EINVAL;
goto out;
}
/*
* If the source is the same as the destination (that is, if they
* are links to the same vnode), then there is nothing to do.
*/
if (fvp == tvp)
error = -1;
#ifdef MAC
else
error = mac_vnode_check_rename_to(td->td_ucred, tdvp,
tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd);
#endif
out:
if (!error) {
error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
NDFREE(&fromnd, NDF_ONLY_PNBUF);
NDFREE(&tond, NDF_ONLY_PNBUF);
} else {
NDFREE(&fromnd, NDF_ONLY_PNBUF);
NDFREE(&tond, NDF_ONLY_PNBUF);
if (tvp)
vput(tvp);
if (tdvp == tvp)
vrele(tdvp);
else
vput(tdvp);
vrele(fromnd.ni_dvp);
vrele(fvp);
}
vrele(tond.ni_startdir);
vn_finished_write(mp);
out1:
if (fromnd.ni_startdir)
vrele(fromnd.ni_startdir);
VFS_UNLOCK_GIANT(fvfslocked);
VFS_UNLOCK_GIANT(tvfslocked);
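/*
* error == -1 is the internal marker set above for the case where
* source and target are links to the same vnode; report it as success.
*/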
if (error == -1)
return (0);
return (error);
}
/*
* Make a directory file.
*/
#ifndef _SYS_SYSPROTO_H_
struct mkdir_args {
char *path;
int mode;
};
#endif
int
-mkdir(td, uap)
+sys_mkdir(td, uap)
struct thread *td;
register struct mkdir_args /* {
char *path;
int mode;
} */ *uap;
{
return (kern_mkdir(td, uap->path, UIO_USERSPACE, uap->mode));
}
#ifndef _SYS_SYSPROTO_H_
struct mkdirat_args {
int fd;
char *path;
mode_t mode;
};
#endif
int
-mkdirat(struct thread *td, struct mkdirat_args *uap)
+sys_mkdirat(struct thread *td, struct mkdirat_args *uap)
{
return (kern_mkdirat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode));
}
int
kern_mkdir(struct thread *td, char *path, enum uio_seg segflg, int mode)
{
return (kern_mkdirat(td, AT_FDCWD, path, segflg, mode));
}
int
kern_mkdirat(struct thread *td, int fd, char *path, enum uio_seg segflg,
int mode)
{
struct mount *mp;
struct vnode *vp;
struct vattr vattr;
int error;
struct nameidata nd;
int vfslocked;
AUDIT_ARG_MODE(mode);
restart:
bwillwrite();
NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE |
AUDITVNODE1, segflg, path, fd, CAP_MKDIR, td);
nd.ni_cnd.cn_flags |= WILLBEDIR;
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
vp = nd.ni_vp;
if (vp != NULL) {
NDFREE(&nd, NDF_ONLY_PNBUF);
/*
* XXX namei called with LOCKPARENT but not LOCKLEAF has
* the strange behaviour of leaving the vnode unlocked
* if the target is the same vnode as the parent.
*/
if (vp == nd.ni_dvp)
vrele(nd.ni_dvp);
else
vput(nd.ni_dvp);
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
return (EEXIST);
}
if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_dvp);
VFS_UNLOCK_GIANT(vfslocked);
if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
return (error);
goto restart;
}
VATTR_NULL(&vattr);
vattr.va_type = VDIR;
vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask;
#ifdef MAC
error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
&vattr);
if (error)
goto out;
#endif
error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
#ifdef MAC
out:
#endif
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_dvp);
if (!error)
vput(nd.ni_vp);
vn_finished_write(mp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Remove a directory file.
*/
#ifndef _SYS_SYSPROTO_H_
struct rmdir_args {
char *path;
};
#endif
int
-rmdir(td, uap)
+sys_rmdir(td, uap)
struct thread *td;
struct rmdir_args /* {
char *path;
} */ *uap;
{
return (kern_rmdir(td, uap->path, UIO_USERSPACE));
}
int
kern_rmdir(struct thread *td, char *path, enum uio_seg pathseg)
{
return (kern_rmdirat(td, AT_FDCWD, path, pathseg));
}
int
kern_rmdirat(struct thread *td, int fd, char *path, enum uio_seg pathseg)
{
struct mount *mp;
struct vnode *vp;
int error;
struct nameidata nd;
int vfslocked;
restart:
bwillwrite();
NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE |
AUDITVNODE1, pathseg, path, fd, CAP_RMDIR, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
vp = nd.ni_vp;
if (vp->v_type != VDIR) {
error = ENOTDIR;
goto out;
}
/*
* No rmdir "." please.
*/
if (nd.ni_dvp == vp) {
error = EINVAL;
goto out;
}
/*
* The root of a mounted filesystem cannot be deleted.
*/
if (vp->v_vflag & VV_ROOT) {
error = EBUSY;
goto out;
}
#ifdef MAC
error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
&nd.ni_cnd);
if (error)
goto out;
#endif
if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(vp);
if (nd.ni_dvp == vp)
vrele(nd.ni_dvp);
else
vput(nd.ni_dvp);
VFS_UNLOCK_GIANT(vfslocked);
if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
return (error);
goto restart;
}
error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
vn_finished_write(mp);
out:
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(vp);
if (nd.ni_dvp == vp)
vrele(nd.ni_dvp);
else
vput(nd.ni_dvp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
#ifdef COMPAT_43
/*
* Read a block of directory entries in a filesystem independent format.
*/
#ifndef _SYS_SYSPROTO_H_
struct ogetdirentries_args {
int fd;
char *buf;
u_int count;
long *basep;
};
#endif
int
ogetdirentries(struct thread *td, struct ogetdirentries_args *uap)
{
long loff;
int error;
error = kern_ogetdirentries(td, uap, &loff);
if (error == 0)
error = copyout(&loff, uap->basep, sizeof(long));
return (error);
}
int
kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
long *ploff)
{
struct vnode *vp;
struct file *fp;
struct uio auio, kuio;
struct iovec aiov, kiov;
struct dirent *dp, *edp;
caddr_t dirbuf;
int error, eofflag, readcnt, vfslocked;
long loff;
/* XXX arbitrary sanity limit on `count'. */
if (uap->count > 64 * 1024)
return (EINVAL);
if ((error = getvnode(td->td_proc->p_fd, uap->fd, CAP_READ,
&fp)) != 0)
return (error);
if ((fp->f_flag & FREAD) == 0) {
fdrop(fp, td);
return (EBADF);
}
vp = fp->f_vnode;
unionread:
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
if (vp->v_type != VDIR) {
VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
return (EINVAL);
}
aiov.iov_base = uap->buf;
aiov.iov_len = uap->count;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_rw = UIO_READ;
auio.uio_segflg = UIO_USERSPACE;
auio.uio_td = td;
auio.uio_resid = uap->count;
vn_lock(vp, LK_SHARED | LK_RETRY);
loff = auio.uio_offset = fp->f_offset;
#ifdef MAC
error = mac_vnode_check_readdir(td->td_ucred, vp);
if (error) {
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
return (error);
}
#endif
# if (BYTE_ORDER != LITTLE_ENDIAN)
if (vp->v_mount->mnt_maxsymlinklen <= 0) {
error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
NULL, NULL);
fp->f_offset = auio.uio_offset;
} else
# endif
{
kuio = auio;
kuio.uio_iov = &kiov;
kuio.uio_segflg = UIO_SYSSPACE;
kiov.iov_len = uap->count;
dirbuf = malloc(uap->count, M_TEMP, M_WAITOK);
kiov.iov_base = dirbuf;
error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag,
NULL, NULL);
fp->f_offset = kuio.uio_offset;
if (error == 0) {
readcnt = uap->count - kuio.uio_resid;
edp = (struct dirent *)&dirbuf[readcnt];
for (dp = (struct dirent *)dirbuf; dp < edp; ) {
# if (BYTE_ORDER == LITTLE_ENDIAN)
/*
* The expected low byte of
* dp->d_namlen is our dp->d_type.
* The high MBZ byte of dp->d_namlen
* is our dp->d_namlen.
*/
dp->d_type = dp->d_namlen;
dp->d_namlen = 0;
# else
/*
* The dp->d_type is the high byte
* of the expected dp->d_namlen,
* so must be zero'ed.
*/
dp->d_type = 0;
# endif
if (dp->d_reclen > 0) {
dp = (struct dirent *)
((char *)dp + dp->d_reclen);
} else {
error = EIO;
break;
}
}
if (dp >= edp)
error = uiomove(dirbuf, readcnt, &auio);
}
free(dirbuf, M_TEMP);
}
if (error) {
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
return (error);
}
if (uap->count == auio.uio_resid &&
(vp->v_vflag & VV_ROOT) &&
(vp->v_mount->mnt_flag & MNT_UNION)) {
struct vnode *tvp = vp;
vp = vp->v_mount->mnt_vnodecovered;
VREF(vp);
fp->f_vnode = vp;
fp->f_data = vp;
fp->f_offset = 0;
vput(tvp);
VFS_UNLOCK_GIANT(vfslocked);
goto unionread;
}
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
td->td_retval[0] = uap->count - auio.uio_resid;
if (error == 0)
*ploff = loff;
return (error);
}
#endif /* COMPAT_43 */
/*
* Read a block of directory entries in a filesystem independent format.
*/
#ifndef _SYS_SYSPROTO_H_
struct getdirentries_args {
int fd;
char *buf;
u_int count;
long *basep;
};
#endif
int
-getdirentries(td, uap)
+sys_getdirentries(td, uap)
struct thread *td;
register struct getdirentries_args /* {
int fd;
char *buf;
u_int count;
long *basep;
} */ *uap;
{
long base;
int error;
error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base);
if (error)
return (error);
if (uap->basep != NULL)
error = copyout(&base, uap->basep, sizeof(long));
return (error);
}
int
kern_getdirentries(struct thread *td, int fd, char *buf, u_int count,
long *basep)
{
struct vnode *vp;
struct file *fp;
struct uio auio;
struct iovec aiov;
int vfslocked;
long loff;
int error, eofflag;
AUDIT_ARG_FD(fd);
if (count > INT_MAX)
return (EINVAL);
if ((error = getvnode(td->td_proc->p_fd, fd, CAP_READ | CAP_SEEK,
&fp)) != 0)
return (error);
if ((fp->f_flag & FREAD) == 0) {
fdrop(fp, td);
return (EBADF);
}
vp = fp->f_vnode;
unionread:
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
if (vp->v_type != VDIR) {
VFS_UNLOCK_GIANT(vfslocked);
error = EINVAL;
goto fail;
}
aiov.iov_base = buf;
aiov.iov_len = count;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_rw = UIO_READ;
auio.uio_segflg = UIO_USERSPACE;
auio.uio_td = td;
auio.uio_resid = count;
vn_lock(vp, LK_SHARED | LK_RETRY);
AUDIT_ARG_VNODE1(vp);
loff = auio.uio_offset = fp->f_offset;
#ifdef MAC
error = mac_vnode_check_readdir(td->td_ucred, vp);
if (error == 0)
#endif
error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL,
NULL);
fp->f_offset = auio.uio_offset;
if (error) {
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
goto fail;
}
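/*
* If nothing was transferred and this is the root of a union mount,
* retry the read against the vnode covered by the mount so entries
* from the lower layer show through.
*/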
if (count == auio.uio_resid &&
(vp->v_vflag & VV_ROOT) &&
(vp->v_mount->mnt_flag & MNT_UNION)) {
struct vnode *tvp = vp;
vp = vp->v_mount->mnt_vnodecovered;
VREF(vp);
fp->f_vnode = vp;
fp->f_data = vp;
fp->f_offset = 0;
vput(tvp);
VFS_UNLOCK_GIANT(vfslocked);
goto unionread;
}
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
*basep = loff;
td->td_retval[0] = count - auio.uio_resid;
fail:
fdrop(fp, td);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct getdents_args {
int fd;
char *buf;
size_t count;
};
#endif
int
-getdents(td, uap)
+sys_getdents(td, uap)
struct thread *td;
register struct getdents_args /* {
int fd;
char *buf;
u_int count;
} */ *uap;
{
struct getdirentries_args ap;
ap.fd = uap->fd;
ap.buf = uap->buf;
ap.count = uap->count;
ap.basep = NULL;
- return (getdirentries(td, &ap));
+ return (sys_getdirentries(td, &ap));
}
/*
* Set the mode mask for creation of filesystem nodes.
*/
#ifndef _SYS_SYSPROTO_H_
struct umask_args {
int newmask;
};
#endif
int
-umask(td, uap)
+sys_umask(td, uap)
struct thread *td;
struct umask_args /* {
int newmask;
} */ *uap;
{
register struct filedesc *fdp;
FILEDESC_XLOCK(td->td_proc->p_fd);
fdp = td->td_proc->p_fd;
td->td_retval[0] = fdp->fd_cmask;
fdp->fd_cmask = uap->newmask & ALLPERMS;
FILEDESC_XUNLOCK(td->td_proc->p_fd);
return (0);
}
/*
* Void all references to file by ripping underlying filesystem away from
* vnode.
*/
#ifndef _SYS_SYSPROTO_H_
struct revoke_args {
char *path;
};
#endif
int
-revoke(td, uap)
+sys_revoke(td, uap)
struct thread *td;
register struct revoke_args /* {
char *path;
} */ *uap;
{
struct vnode *vp;
struct vattr vattr;
int error;
struct nameidata nd;
int vfslocked;
NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
UIO_USERSPACE, uap->path, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
vp = nd.ni_vp;
NDFREE(&nd, NDF_ONLY_PNBUF);
if (vp->v_type != VCHR || vp->v_rdev == NULL) {
error = EINVAL;
goto out;
}
#ifdef MAC
error = mac_vnode_check_revoke(td->td_ucred, vp);
if (error)
goto out;
#endif
error = VOP_GETATTR(vp, &vattr, td->td_ucred);
if (error)
goto out;
if (td->td_ucred->cr_uid != vattr.va_uid) {
error = priv_check(td, PRIV_VFS_ADMIN);
if (error)
goto out;
}
if (vcount(vp) > 1)
VOP_REVOKE(vp, REVOKEALL);
out:
vput(vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Convert a user file descriptor to a kernel file entry and check that, if it
* is a capability, the correct rights are present. A reference on the file
* entry is held upon returning.
*/
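/*
* The reference is not released here; callers must fdrop() the file
* once they are done with its vnode.
*/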
int
getvnode(struct filedesc *fdp, int fd, cap_rights_t rights,
struct file **fpp)
{
struct file *fp;
#ifdef CAPABILITIES
struct file *fp_fromcap;
#endif
int error;
error = 0;
fp = NULL;
if ((fdp == NULL) || (fp = fget_unlocked(fdp, fd)) == NULL)
return (EBADF);
#ifdef CAPABILITIES
/*
* If the file descriptor is for a capability, test rights and use the
* file descriptor referenced by the capability.
*/
error = cap_funwrap(fp, rights, &fp_fromcap);
if (error) {
fdrop(fp, curthread);
return (error);
}
if (fp != fp_fromcap) {
fhold(fp_fromcap);
fdrop(fp, curthread);
fp = fp_fromcap;
}
#endif /* CAPABILITIES */
if (fp->f_vnode == NULL) {
fdrop(fp, curthread);
return (EINVAL);
}
*fpp = fp;
return (0);
}
/*
* Get an (NFS) file handle.
*/
#ifndef _SYS_SYSPROTO_H_
struct lgetfh_args {
char *fname;
fhandle_t *fhp;
};
#endif
int
-lgetfh(td, uap)
+sys_lgetfh(td, uap)
struct thread *td;
register struct lgetfh_args *uap;
{
struct nameidata nd;
fhandle_t fh;
register struct vnode *vp;
int vfslocked;
int error;
error = priv_check(td, PRIV_VFS_GETFH);
if (error)
return (error);
NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
UIO_USERSPACE, uap->fname, td);
error = namei(&nd);
if (error)
return (error);
vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
vp = nd.ni_vp;
bzero(&fh, sizeof(fh));
fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
error = VOP_VPTOFH(vp, &fh.fh_fid);
vput(vp);
VFS_UNLOCK_GIANT(vfslocked);
if (error)
return (error);
error = copyout(&fh, uap->fhp, sizeof (fh));
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct getfh_args {
char *fname;
fhandle_t *fhp;
};
#endif
int
-getfh(td, uap)
+sys_getfh(td, uap)
struct thread *td;
register struct getfh_args *uap;
{
struct nameidata nd;
fhandle_t fh;
register struct vnode *vp;
int vfslocked;
int error;
error = priv_check(td, PRIV_VFS_GETFH);
if (error)
return (error);
NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
UIO_USERSPACE, uap->fname, td);
error = namei(&nd);
if (error)
return (error);
vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
vp = nd.ni_vp;
bzero(&fh, sizeof(fh));
fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
error = VOP_VPTOFH(vp, &fh.fh_fid);
vput(vp);
VFS_UNLOCK_GIANT(vfslocked);
if (error)
return (error);
error = copyout(&fh, uap->fhp, sizeof (fh));
return (error);
}
/*
* syscall for the rpc.lockd to use to translate a NFS file handle into an
* open descriptor.
*
* warning: do not remove the priv_check() call or this becomes one giant
* security hole.
*/
#ifndef _SYS_SYSPROTO_H_
struct fhopen_args {
const struct fhandle *u_fhp;
int flags;
};
#endif
int
-fhopen(td, uap)
+sys_fhopen(td, uap)
struct thread *td;
struct fhopen_args /* {
const struct fhandle *u_fhp;
int flags;
} */ *uap;
{
struct proc *p = td->td_proc;
struct mount *mp;
struct vnode *vp;
struct fhandle fhp;
struct vattr vat;
struct vattr *vap = &vat;
struct flock lf;
struct file *fp;
register struct filedesc *fdp = p->p_fd;
int fmode, error, type;
accmode_t accmode;
struct file *nfp;
int vfslocked;
int indx;
error = priv_check(td, PRIV_VFS_FHOPEN);
if (error)
return (error);
fmode = FFLAGS(uap->flags);
/* why not allow a non-read/write open for our lockd? */
if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
return (EINVAL);
error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
if (error)
return(error);
/* find the mount point */
mp = vfs_busyfs(&fhp.fh_fsid);
if (mp == NULL)
return (ESTALE);
vfslocked = VFS_LOCK_GIANT(mp);
/* now give me my vnode, it gets returned to me locked */
error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp);
vfs_unbusy(mp);
if (error)
goto out;
/*
* From now on we have to make sure not to forget about the vnode;
* any error that causes an abort must vput(vp), so just set
* error = err and 'goto bad;'.
*/
/*
* from vn_open
*/
if (vp->v_type == VLNK) {
error = EMLINK;
goto bad;
}
if (vp->v_type == VSOCK) {
error = EOPNOTSUPP;
goto bad;
}
if (vp->v_type != VDIR && fmode & O_DIRECTORY) {
error = ENOTDIR;
goto bad;
}
accmode = 0;
if (fmode & (FWRITE | O_TRUNC)) {
if (vp->v_type == VDIR) {
error = EISDIR;
goto bad;
}
error = vn_writechk(vp);
if (error)
goto bad;
accmode |= VWRITE;
}
if (fmode & FREAD)
accmode |= VREAD;
if ((fmode & O_APPEND) && (fmode & FWRITE))
accmode |= VAPPEND;
#ifdef MAC
error = mac_vnode_check_open(td->td_ucred, vp, accmode);
if (error)
goto bad;
#endif
if (accmode) {
error = VOP_ACCESS(vp, accmode, td->td_ucred, td);
if (error)
goto bad;
}
if (fmode & O_TRUNC) {
vfs_ref(mp);
VOP_UNLOCK(vp, 0); /* XXX */
if ((error = vn_start_write(NULL, &mp, V_WAIT | PCATCH)) != 0) {
vrele(vp);
vfs_rel(mp);
goto out;
}
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); /* XXX */
vfs_rel(mp);
#ifdef MAC
/*
* We don't yet have fp->f_cred, so use td->td_ucred, which
* should be right.
*/
error = mac_vnode_check_write(td->td_ucred, td->td_ucred, vp);
if (error == 0) {
#endif
VATTR_NULL(vap);
vap->va_size = 0;
error = VOP_SETATTR(vp, vap, td->td_ucred);
#ifdef MAC
}
#endif
vn_finished_write(mp);
if (error)
goto bad;
}
error = VOP_OPEN(vp, fmode, td->td_ucred, td, NULL);
if (error)
goto bad;
if (fmode & FWRITE)
vp->v_writecount++;
/*
* end of vn_open code
*/
if ((error = falloc(td, &nfp, &indx, fmode)) != 0) {
if (fmode & FWRITE)
vp->v_writecount--;
goto bad;
}
/* An extra reference on `nfp' has been held for us by falloc(). */
fp = nfp;
nfp->f_vnode = vp;
finit(nfp, fmode & FMASK, DTYPE_VNODE, vp, &vnops);
if (fmode & (O_EXLOCK | O_SHLOCK)) {
lf.l_whence = SEEK_SET;
lf.l_start = 0;
lf.l_len = 0;
if (fmode & O_EXLOCK)
lf.l_type = F_WRLCK;
else
lf.l_type = F_RDLCK;
type = F_FLOCK;
if ((fmode & FNONBLOCK) == 0)
type |= F_WAIT;
VOP_UNLOCK(vp, 0);
if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
type)) != 0) {
/*
* The lock request failed. Normally close the
* descriptor but handle the case where someone might
* have dup()d or close()d it when we weren't looking.
*/
fdclose(fdp, fp, indx, td);
/*
* release our private reference
*/
fdrop(fp, td);
goto out;
}
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
atomic_set_int(&fp->f_flag, FHASLOCK);
}
VOP_UNLOCK(vp, 0);
fdrop(fp, td);
VFS_UNLOCK_GIANT(vfslocked);
td->td_retval[0] = indx;
return (0);
bad:
vput(vp);
out:
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
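/*
* Rough userland sketch of how the file handle syscalls pair up
* (illustration only; error handling omitted):
*
*	fhandle_t fh;
*	if (getfh("/export/some/file", &fh) == 0) {
*		int fd = fhopen(&fh, O_RDWR);
*		...
*	}
*
* Only privileged processes such as rpc.lockd can do this, since both
* calls are guarded by priv_check() above.
*/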
/*
* Stat an (NFS) file handle.
*/
#ifndef _SYS_SYSPROTO_H_
struct fhstat_args {
struct fhandle *u_fhp;
struct stat *sb;
};
#endif
int
-fhstat(td, uap)
+sys_fhstat(td, uap)
struct thread *td;
register struct fhstat_args /* {
struct fhandle *u_fhp;
struct stat *sb;
} */ *uap;
{
struct stat sb;
fhandle_t fh;
struct mount *mp;
struct vnode *vp;
int vfslocked;
int error;
error = priv_check(td, PRIV_VFS_FHSTAT);
if (error)
return (error);
error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
if (error)
return (error);
if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
return (ESTALE);
vfslocked = VFS_LOCK_GIANT(mp);
error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
vfs_unbusy(mp);
if (error) {
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td);
vput(vp);
VFS_UNLOCK_GIANT(vfslocked);
if (error)
return (error);
error = copyout(&sb, uap->sb, sizeof(sb));
return (error);
}
/*
* Implement fstatfs() for (NFS) file handles.
*/
#ifndef _SYS_SYSPROTO_H_
struct fhstatfs_args {
struct fhandle *u_fhp;
struct statfs *buf;
};
#endif
int
-fhstatfs(td, uap)
+sys_fhstatfs(td, uap)
struct thread *td;
struct fhstatfs_args /* {
struct fhandle *u_fhp;
struct statfs *buf;
} */ *uap;
{
struct statfs sf;
fhandle_t fh;
int error;
error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
if (error)
return (error);
error = kern_fhstatfs(td, fh, &sf);
if (error)
return (error);
return (copyout(&sf, uap->buf, sizeof(sf)));
}
int
kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf)
{
struct statfs *sp;
struct mount *mp;
struct vnode *vp;
int vfslocked;
int error;
error = priv_check(td, PRIV_VFS_FHSTATFS);
if (error)
return (error);
if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
return (ESTALE);
vfslocked = VFS_LOCK_GIANT(mp);
error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
if (error) {
vfs_unbusy(mp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
vput(vp);
error = prison_canseemount(td->td_ucred, mp);
if (error)
goto out;
#ifdef MAC
error = mac_mount_check_stat(td->td_ucred, mp);
if (error)
goto out;
#endif
/*
* Set these in case the underlying filesystem fails to do so.
*/
sp = &mp->mnt_stat;
sp->f_version = STATFS_VERSION;
sp->f_namemax = NAME_MAX;
sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
error = VFS_STATFS(mp, sp);
if (error == 0)
*buf = *sp;
out:
vfs_unbusy(mp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
static int
kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len)
{
struct file *fp;
struct mount *mp;
struct vnode *vp;
off_t olen, ooffset;
int error, vfslocked;
fp = NULL;
vfslocked = 0;
error = fget(td, fd, CAP_WRITE, &fp);
if (error != 0)
goto out;
switch (fp->f_type) {
case DTYPE_VNODE:
break;
case DTYPE_PIPE:
case DTYPE_FIFO:
error = ESPIPE;
goto out;
default:
error = ENODEV;
goto out;
}
if ((fp->f_flag & FWRITE) == 0) {
error = EBADF;
goto out;
}
vp = fp->f_vnode;
if (vp->v_type != VREG) {
error = ENODEV;
goto out;
}
if (offset < 0 || len <= 0) {
error = EINVAL;
goto out;
}
/* Check for wrap. */
if (offset > OFF_MAX - len) {
error = EFBIG;
goto out;
}
/* Allocating blocks may take a long time, so iterate. */
for (;;) {
olen = len;
ooffset = offset;
bwillwrite();
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
mp = NULL;
error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
if (error != 0) {
VFS_UNLOCK_GIANT(vfslocked);
break;
}
error = vn_lock(vp, LK_EXCLUSIVE);
if (error != 0) {
vn_finished_write(mp);
VFS_UNLOCK_GIANT(vfslocked);
break;
}
#ifdef MAC
error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp);
if (error == 0)
#endif
error = VOP_ALLOCATE(vp, &offset, &len);
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
VFS_UNLOCK_GIANT(vfslocked);
if (olen + ooffset != offset + len) {
panic("offset + len changed from %jx/%jx to %jx/%jx",
ooffset, olen, offset, len);
}
if (error != 0 || len == 0)
break;
KASSERT(olen > len, ("Iteration did not make progress?"));
maybe_yield();
}
out:
if (fp != NULL)
fdrop(fp, td);
return (error);
}
int
-posix_fallocate(struct thread *td, struct posix_fallocate_args *uap)
+sys_posix_fallocate(struct thread *td, struct posix_fallocate_args *uap)
{
return (kern_posix_fallocate(td, uap->fd, uap->offset, uap->len));
}
Index: head/sys/kern/vfs_vnops.c
===================================================================
--- head/sys/kern/vfs_vnops.c (revision 225616)
+++ head/sys/kern/vfs_vnops.c (revision 225617)
@@ -1,1415 +1,1415 @@
/*-
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/kdb.h>
#include <sys/stat.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/filio.h>
#include <sys/resourcevar.h>
#include <sys/sx.h>
#include <sys/ttycom.h>
#include <sys/conf.h>
#include <sys/syslog.h>
#include <sys/unistd.h>
#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
static fo_rdwr_t vn_read;
static fo_rdwr_t vn_write;
static fo_truncate_t vn_truncate;
static fo_ioctl_t vn_ioctl;
static fo_poll_t vn_poll;
static fo_kqfilter_t vn_kqfilter;
static fo_stat_t vn_statfile;
static fo_close_t vn_closefile;
struct fileops vnops = {
.fo_read = vn_read,
.fo_write = vn_write,
.fo_truncate = vn_truncate,
.fo_ioctl = vn_ioctl,
.fo_poll = vn_poll,
.fo_kqfilter = vn_kqfilter,
.fo_stat = vn_statfile,
.fo_close = vn_closefile,
.fo_chmod = vn_chmod,
.fo_chown = vn_chown,
.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
};
int
vn_open(ndp, flagp, cmode, fp)
struct nameidata *ndp;
int *flagp, cmode;
struct file *fp;
{
struct thread *td = ndp->ni_cnd.cn_thread;
return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp));
}
/*
* Common code for vnode open operations.
* Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
*
* Note that this does NOT free nameidata for the successful case,
* due to the NDINIT being done elsewhere.
*/
int
vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags,
struct ucred *cred, struct file *fp)
{
struct vnode *vp;
struct mount *mp;
struct thread *td = ndp->ni_cnd.cn_thread;
struct vattr vat;
struct vattr *vap = &vat;
int fmode, error;
accmode_t accmode;
int vfslocked, mpsafe;
mpsafe = ndp->ni_cnd.cn_flags & MPSAFE;
restart:
vfslocked = 0;
fmode = *flagp;
if (fmode & O_CREAT) {
ndp->ni_cnd.cn_nameiop = CREATE;
ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF |
MPSAFE;
if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
ndp->ni_cnd.cn_flags |= FOLLOW;
if (!(vn_open_flags & VN_OPEN_NOAUDIT))
ndp->ni_cnd.cn_flags |= AUDITVNODE1;
bwillwrite();
if ((error = namei(ndp)) != 0)
return (error);
vfslocked = NDHASGIANT(ndp);
if (!mpsafe)
ndp->ni_cnd.cn_flags &= ~MPSAFE;
if (ndp->ni_vp == NULL) {
VATTR_NULL(vap);
vap->va_type = VREG;
vap->va_mode = cmode;
if (fmode & O_EXCL)
vap->va_vaflags |= VA_EXCLUSIVE;
if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
NDFREE(ndp, NDF_ONLY_PNBUF);
vput(ndp->ni_dvp);
VFS_UNLOCK_GIANT(vfslocked);
if ((error = vn_start_write(NULL, &mp,
V_XSLEEP | PCATCH)) != 0)
return (error);
goto restart;
}
#ifdef MAC
error = mac_vnode_check_create(cred, ndp->ni_dvp,
&ndp->ni_cnd, vap);
if (error == 0)
#endif
error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
&ndp->ni_cnd, vap);
vput(ndp->ni_dvp);
vn_finished_write(mp);
if (error) {
VFS_UNLOCK_GIANT(vfslocked);
NDFREE(ndp, NDF_ONLY_PNBUF);
return (error);
}
fmode &= ~O_TRUNC;
vp = ndp->ni_vp;
} else {
if (ndp->ni_dvp == ndp->ni_vp)
vrele(ndp->ni_dvp);
else
vput(ndp->ni_dvp);
ndp->ni_dvp = NULL;
vp = ndp->ni_vp;
if (fmode & O_EXCL) {
error = EEXIST;
goto bad;
}
fmode &= ~O_CREAT;
}
} else {
ndp->ni_cnd.cn_nameiop = LOOKUP;
ndp->ni_cnd.cn_flags = ISOPEN |
((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) |
LOCKLEAF | MPSAFE;
if (!(fmode & FWRITE))
ndp->ni_cnd.cn_flags |= LOCKSHARED;
if (!(vn_open_flags & VN_OPEN_NOAUDIT))
ndp->ni_cnd.cn_flags |= AUDITVNODE1;
if ((error = namei(ndp)) != 0)
return (error);
if (!mpsafe)
ndp->ni_cnd.cn_flags &= ~MPSAFE;
vfslocked = NDHASGIANT(ndp);
vp = ndp->ni_vp;
}
if (vp->v_type == VLNK) {
error = EMLINK;
goto bad;
}
if (vp->v_type == VSOCK) {
error = EOPNOTSUPP;
goto bad;
}
if (vp->v_type != VDIR && fmode & O_DIRECTORY) {
error = ENOTDIR;
goto bad;
}
accmode = 0;
if (fmode & (FWRITE | O_TRUNC)) {
if (vp->v_type == VDIR) {
error = EISDIR;
goto bad;
}
accmode |= VWRITE;
}
if (fmode & FREAD)
accmode |= VREAD;
if (fmode & FEXEC)
accmode |= VEXEC;
if ((fmode & O_APPEND) && (fmode & FWRITE))
accmode |= VAPPEND;
#ifdef MAC
error = mac_vnode_check_open(cred, vp, accmode);
if (error)
goto bad;
#endif
if ((fmode & O_CREAT) == 0) {
if (accmode & VWRITE) {
error = vn_writechk(vp);
if (error)
goto bad;
}
if (accmode) {
error = VOP_ACCESS(vp, accmode, cred, td);
if (error)
goto bad;
}
}
if ((error = VOP_OPEN(vp, fmode, cred, td, fp)) != 0)
goto bad;
if (fmode & FWRITE)
vp->v_writecount++;
*flagp = fmode;
ASSERT_VOP_LOCKED(vp, "vn_open_cred");
if (!mpsafe)
VFS_UNLOCK_GIANT(vfslocked);
return (0);
bad:
NDFREE(ndp, NDF_ONLY_PNBUF);
vput(vp);
VFS_UNLOCK_GIANT(vfslocked);
*flagp = fmode;
ndp->ni_vp = NULL;
return (error);
}
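/*
* Typical caller pattern, roughly as used on the open(2) path (audit,
* capability and MPSAFE details omitted):
*
*	NDINIT(&nd, LOOKUP, FOLLOW, pathseg, path, td);
*	error = vn_open(&nd, &flags, cmode, fp);
*	...
*	NDFREE(&nd, NDF_ONLY_PNBUF);
*
* On success the vnode is returned locked and the nameidata path
* buffer is still owned by the caller, as noted above.
*/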
/*
* Check for write permissions on the specified vnode.
* Prototype text segments cannot be written.
*/
int
vn_writechk(vp)
register struct vnode *vp;
{
ASSERT_VOP_LOCKED(vp, "vn_writechk");
/*
* If there's shared text associated with
* the vnode, try to free it up once. If
* we fail, we can't allow writing.
*/
if (vp->v_vflag & VV_TEXT)
return (ETXTBSY);
return (0);
}
/*
* Vnode close call
*/
int
vn_close(vp, flags, file_cred, td)
register struct vnode *vp;
int flags;
struct ucred *file_cred;
struct thread *td;
{
struct mount *mp;
int error, lock_flags;
if (!(flags & FWRITE) && vp->v_mount != NULL &&
vp->v_mount->mnt_kern_flag & MNTK_EXTENDED_SHARED)
lock_flags = LK_SHARED;
else
lock_flags = LK_EXCLUSIVE;
VFS_ASSERT_GIANT(vp->v_mount);
vn_start_write(vp, &mp, V_WAIT);
vn_lock(vp, lock_flags | LK_RETRY);
if (flags & FWRITE) {
VNASSERT(vp->v_writecount > 0, vp,
("vn_close: negative writecount"));
vp->v_writecount--;
}
error = VOP_CLOSE(vp, flags, file_cred, td);
vput(vp);
vn_finished_write(mp);
return (error);
}
/*
* Heuristic to detect sequential operation.
*/
static int
sequential_heuristic(struct uio *uio, struct file *fp)
{
if (atomic_load_acq_int(&(fp->f_flag)) & FRDAHEAD)
return (fp->f_seqcount << IO_SEQSHIFT);
/*
* Offset 0 is handled specially. open() sets f_seqcount to 1 so
* that the first I/O is normally considered to be slightly
* sequential. Seeking to offset 0 doesn't change sequentiality
* unless previous seeks have reduced f_seqcount to 0, in which
* case offset 0 is not special.
*/
if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
uio->uio_offset == fp->f_nextoff) {
/*
* f_seqcount is in units of fixed-size blocks so that it
* depends mainly on the amount of sequential I/O and not
* much on the number of sequential I/O's. The fixed size
* of 16384 is hard-coded here since it is (not quite) just
* a magic size that works well here. This size is more
* closely related to the best I/O size for real disks than
* to any block size used by software.
*/
fp->f_seqcount += howmany(uio->uio_resid, 16384);
if (fp->f_seqcount > IO_SEQMAX)
fp->f_seqcount = IO_SEQMAX;
return (fp->f_seqcount << IO_SEQSHIFT);
}
/* Not sequential. Quickly draw-down sequentiality. */
if (fp->f_seqcount > 1)
fp->f_seqcount = 1;
else
fp->f_seqcount = 0;
return (0);
}
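/*
* For example, a process issuing back-to-back 64kB reads advances
* f_seqcount by 4 per call (64kB / 16kB), capped at IO_SEQMAX; the
* value returned is that count shifted by IO_SEQSHIFT, which
* filesystems typically use to scale read-ahead and clustering.
*/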
/*
* Package up an I/O request on a vnode into a uio and do it.
*/
int
vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred,
aresid, td)
enum uio_rw rw;
struct vnode *vp;
void *base;
int len;
off_t offset;
enum uio_seg segflg;
int ioflg;
struct ucred *active_cred;
struct ucred *file_cred;
int *aresid;
struct thread *td;
{
struct uio auio;
struct iovec aiov;
struct mount *mp;
struct ucred *cred;
int error, lock_flags;
VFS_ASSERT_GIANT(vp->v_mount);
if ((ioflg & IO_NODELOCKED) == 0) {
mp = NULL;
if (rw == UIO_WRITE) {
if (vp->v_type != VCHR &&
(error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
!= 0)
return (error);
if (MNT_SHARED_WRITES(mp) ||
((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) {
lock_flags = LK_SHARED;
} else {
lock_flags = LK_EXCLUSIVE;
}
vn_lock(vp, lock_flags | LK_RETRY);
} else
vn_lock(vp, LK_SHARED | LK_RETRY);
}
ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
aiov.iov_base = base;
aiov.iov_len = len;
auio.uio_resid = len;
auio.uio_offset = offset;
auio.uio_segflg = segflg;
auio.uio_rw = rw;
auio.uio_td = td;
error = 0;
#ifdef MAC
if ((ioflg & IO_NOMACCHECK) == 0) {
if (rw == UIO_READ)
error = mac_vnode_check_read(active_cred, file_cred,
vp);
else
error = mac_vnode_check_write(active_cred, file_cred,
vp);
}
#endif
if (error == 0) {
if (file_cred)
cred = file_cred;
else
cred = active_cred;
if (rw == UIO_READ)
error = VOP_READ(vp, &auio, ioflg, cred);
else
error = VOP_WRITE(vp, &auio, ioflg, cred);
}
if (aresid)
*aresid = auio.uio_resid;
else
if (auio.uio_resid && error == 0)
error = EIO;
if ((ioflg & IO_NODELOCKED) == 0) {
if (rw == UIO_WRITE && vp->v_type != VCHR)
vn_finished_write(mp);
VOP_UNLOCK(vp, 0);
}
return (error);
}
/*
* Package up an I/O request on a vnode into a uio and do it. The I/O
* request is split up into smaller chunks and we try to avoid saturating
* the buffer cache while potentially holding a vnode locked, so we
* check bwillwrite() before calling vn_rdwr(). We also call kern_yield()
* to give other processes a chance to lock the vnode (either other processes
* core'ing the same binary, or unrelated processes scanning the directory).
*/
int
vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred,
file_cred, aresid, td)
enum uio_rw rw;
struct vnode *vp;
void *base;
size_t len;
off_t offset;
enum uio_seg segflg;
int ioflg;
struct ucred *active_cred;
struct ucred *file_cred;
size_t *aresid;
struct thread *td;
{
int error = 0;
int iaresid;
VFS_ASSERT_GIANT(vp->v_mount);
do {
int chunk;
/*
* Force `offset' to a multiple of MAXBSIZE except possibly
* for the first chunk, so that filesystems only need to
* write full blocks except possibly for the first and last
* chunks.
*/
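/*
* For example, with a MAXBSIZE of 64kB and a starting offset of 1000,
* the first chunk is 65536 - 1000 = 64536 bytes, after which every
* subsequent chunk begins on a MAXBSIZE boundary.
*/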
chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;
if (chunk > len)
chunk = len;
if (rw != UIO_READ && vp->v_type == VREG)
bwillwrite();
iaresid = 0;
error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
ioflg, active_cred, file_cred, &iaresid, td);
len -= chunk; /* aresid calc already includes length */
if (error)
break;
offset += chunk;
base = (char *)base + chunk;
kern_yield(PRI_USER);
} while (len);
if (aresid)
*aresid = len + iaresid;
return (error);
}
/*
* File table vnode read routine.
*/
static int
vn_read(fp, uio, active_cred, flags, td)
struct file *fp;
struct uio *uio;
struct ucred *active_cred;
int flags;
struct thread *td;
{
struct vnode *vp;
int error, ioflag;
struct mtx *mtxp;
int vfslocked;
KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
uio->uio_td, td));
mtxp = NULL;
vp = fp->f_vnode;
ioflag = 0;
if (fp->f_flag & FNONBLOCK)
ioflag |= IO_NDELAY;
if (fp->f_flag & O_DIRECT)
ioflag |= IO_DIRECT;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
/*
* According to McKusick the vn lock was protecting f_offset here.
* It is now protected by the FOFFSET_LOCKED flag.
*/
if ((flags & FOF_OFFSET) == 0) {
mtxp = mtx_pool_find(mtxpool_sleep, fp);
mtx_lock(mtxp);
while(fp->f_vnread_flags & FOFFSET_LOCKED) {
fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
"vnread offlock", 0);
}
fp->f_vnread_flags |= FOFFSET_LOCKED;
mtx_unlock(mtxp);
vn_lock(vp, LK_SHARED | LK_RETRY);
uio->uio_offset = fp->f_offset;
} else
vn_lock(vp, LK_SHARED | LK_RETRY);
ioflag |= sequential_heuristic(uio, fp);
#ifdef MAC
error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
if (error == 0)
#endif
error = VOP_READ(vp, uio, ioflag, fp->f_cred);
if ((flags & FOF_OFFSET) == 0) {
fp->f_offset = uio->uio_offset;
mtx_lock(mtxp);
if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
wakeup(&fp->f_vnread_flags);
fp->f_vnread_flags = 0;
mtx_unlock(mtxp);
}
fp->f_nextoff = uio->uio_offset;
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* File table vnode write routine.
*/
static int
vn_write(fp, uio, active_cred, flags, td)
struct file *fp;
struct uio *uio;
struct ucred *active_cred;
int flags;
struct thread *td;
{
struct vnode *vp;
struct mount *mp;
int error, ioflag, lock_flags;
int vfslocked;
KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
uio->uio_td, td));
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
if (vp->v_type == VREG)
bwillwrite();
ioflag = IO_UNIT;
if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
ioflag |= IO_APPEND;
if (fp->f_flag & FNONBLOCK)
ioflag |= IO_NDELAY;
if (fp->f_flag & O_DIRECT)
ioflag |= IO_DIRECT;
if ((fp->f_flag & O_FSYNC) ||
(vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
ioflag |= IO_SYNC;
mp = NULL;
if (vp->v_type != VCHR &&
(error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
goto unlock;
if ((MNT_SHARED_WRITES(mp) ||
((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) &&
(flags & FOF_OFFSET) != 0) {
lock_flags = LK_SHARED;
} else {
lock_flags = LK_EXCLUSIVE;
}
vn_lock(vp, lock_flags | LK_RETRY);
if ((flags & FOF_OFFSET) == 0)
uio->uio_offset = fp->f_offset;
ioflag |= sequential_heuristic(uio, fp);
#ifdef MAC
error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
if (error == 0)
#endif
error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
if ((flags & FOF_OFFSET) == 0)
fp->f_offset = uio->uio_offset;
fp->f_nextoff = uio->uio_offset;
VOP_UNLOCK(vp, 0);
if (vp->v_type != VCHR)
vn_finished_write(mp);
unlock:
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* File table truncate routine.
*/
static int
vn_truncate(fp, length, active_cred, td)
struct file *fp;
off_t length;
struct ucred *active_cred;
struct thread *td;
{
struct vattr vattr;
struct mount *mp;
struct vnode *vp;
int vfslocked;
int error;
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
if (error) {
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
if (vp->v_type == VDIR) {
error = EISDIR;
goto out;
}
#ifdef MAC
error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
if (error)
goto out;
#endif
error = vn_writechk(vp);
if (error == 0) {
VATTR_NULL(&vattr);
vattr.va_size = length;
error = VOP_SETATTR(vp, &vattr, fp->f_cred);
}
out:
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* File table vnode stat routine.
*/
static int
vn_statfile(fp, sb, active_cred, td)
struct file *fp;
struct stat *sb;
struct ucred *active_cred;
struct thread *td;
{
struct vnode *vp = fp->f_vnode;
int vfslocked;
int error;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_lock(vp, LK_SHARED | LK_RETRY);
error = vn_stat(vp, sb, active_cred, fp->f_cred, td);
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Stat a vnode; implementation for the stat syscall
*/
int
vn_stat(vp, sb, active_cred, file_cred, td)
struct vnode *vp;
register struct stat *sb;
struct ucred *active_cred;
struct ucred *file_cred;
struct thread *td;
{
struct vattr vattr;
register struct vattr *vap;
int error;
u_short mode;
#ifdef MAC
error = mac_vnode_check_stat(active_cred, file_cred, vp);
if (error)
return (error);
#endif
vap = &vattr;
/*
* Initialize defaults for new and unusual fields, so that file
* systems which don't support these fields don't need to know
* about them.
*/
vap->va_birthtime.tv_sec = -1;
vap->va_birthtime.tv_nsec = 0;
vap->va_fsid = VNOVAL;
vap->va_rdev = NODEV;
error = VOP_GETATTR(vp, vap, active_cred);
if (error)
return (error);
/*
* Zero the spare stat fields
*/
bzero(sb, sizeof *sb);
/*
* Copy from vattr table
*/
if (vap->va_fsid != VNOVAL)
sb->st_dev = vap->va_fsid;
else
sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
sb->st_ino = vap->va_fileid;
mode = vap->va_mode;
switch (vap->va_type) {
case VREG:
mode |= S_IFREG;
break;
case VDIR:
mode |= S_IFDIR;
break;
case VBLK:
mode |= S_IFBLK;
break;
case VCHR:
mode |= S_IFCHR;
break;
case VLNK:
mode |= S_IFLNK;
break;
case VSOCK:
mode |= S_IFSOCK;
break;
case VFIFO:
mode |= S_IFIFO;
break;
default:
return (EBADF);
};
sb->st_mode = mode;
sb->st_nlink = vap->va_nlink;
sb->st_uid = vap->va_uid;
sb->st_gid = vap->va_gid;
sb->st_rdev = vap->va_rdev;
if (vap->va_size > OFF_MAX)
return (EOVERFLOW);
sb->st_size = vap->va_size;
sb->st_atim = vap->va_atime;
sb->st_mtim = vap->va_mtime;
sb->st_ctim = vap->va_ctime;
sb->st_birthtim = vap->va_birthtime;
/*
* According to www.opengroup.org, the meaning of st_blksize is
* "a filesystem-specific preferred I/O block size for this
* object. In some filesystem types, this may vary from file
* to file"
* Use minimum/default of PAGE_SIZE (e.g. for VCHR).
*/
sb->st_blksize = max(PAGE_SIZE, vap->va_blocksize);
sb->st_flags = vap->va_flags;
if (priv_check(td, PRIV_VFS_GENERATION))
sb->st_gen = 0;
else
sb->st_gen = vap->va_gen;
sb->st_blocks = vap->va_bytes / S_BLKSIZE;
return (0);
}
/*
* File table vnode ioctl routine.
*/
static int
vn_ioctl(fp, com, data, active_cred, td)
struct file *fp;
u_long com;
void *data;
struct ucred *active_cred;
struct thread *td;
{
struct vnode *vp = fp->f_vnode;
struct vattr vattr;
int vfslocked;
int error;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
error = ENOTTY;
switch (vp->v_type) {
case VREG:
case VDIR:
if (com == FIONREAD) {
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_GETATTR(vp, &vattr, active_cred);
VOP_UNLOCK(vp, 0);
if (!error)
*(int *)data = vattr.va_size - fp->f_offset;
}
if (com == FIONBIO || com == FIOASYNC) /* XXX */
error = 0;
else
error = VOP_IOCTL(vp, com, data, fp->f_flag,
active_cred, td);
break;
default:
break;
}
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* File table vnode poll routine.
*/
static int
vn_poll(fp, events, active_cred, td)
struct file *fp;
int events;
struct ucred *active_cred;
struct thread *td;
{
struct vnode *vp;
int vfslocked;
int error;
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
#ifdef MAC
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
error = mac_vnode_check_poll(active_cred, fp->f_cred, vp);
VOP_UNLOCK(vp, 0);
if (!error)
#endif
error = VOP_POLL(vp, events, fp->f_cred, td);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Acquire the requested lock and then check for validity. LK_RETRY
* permits vn_lock to return doomed vnodes.
*/
int
_vn_lock(struct vnode *vp, int flags, char *file, int line)
{
int error;
VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
("vn_lock called with no locktype."));
do {
#ifdef DEBUG_VFS_LOCKS
KASSERT(vp->v_holdcnt != 0,
("vn_lock %p: zero hold count", vp));
#endif
error = VOP_LOCK1(vp, flags, file, line);
flags &= ~LK_INTERLOCK; /* Interlock is always dropped. */
KASSERT((flags & LK_RETRY) == 0 || error == 0,
("LK_RETRY set with incompatible flags (0x%x) or an error occured (%d)",
flags, error));
/*
* Callers specify LK_RETRY if they wish to get dead vnodes.
* If RETRY is not set, we return ENOENT instead.
*/
if (error == 0 && vp->v_iflag & VI_DOOMED &&
(flags & LK_RETRY) == 0) {
VOP_UNLOCK(vp, 0);
error = ENOENT;
break;
}
} while (flags & LK_RETRY && error != 0);
return (error);
}
/*
* File table vnode close routine.
*/
static int
vn_closefile(fp, td)
struct file *fp;
struct thread *td;
{
struct vnode *vp;
struct flock lf;
int vfslocked;
int error;
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
if (fp->f_type == DTYPE_VNODE && fp->f_flag & FHASLOCK) {
lf.l_whence = SEEK_SET;
lf.l_start = 0;
lf.l_len = 0;
lf.l_type = F_UNLCK;
(void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
}
fp->f_ops = &badfileops;
error = vn_close(vp, fp->f_flag, fp->f_cred, td);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Preparing to start a filesystem write operation. If the operation is
* permitted, then we bump the count of operations in progress and
* proceed. If a suspend request is in progress, we wait until the
* suspension is over, and then proceed.
*/
int
vn_start_write(vp, mpp, flags)
struct vnode *vp;
struct mount **mpp;
int flags;
{
struct mount *mp;
int error;
error = 0;
/*
* If a vnode is provided, get and return the mount point to
* which it will write.
*/
if (vp != NULL) {
if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
*mpp = NULL;
if (error != EOPNOTSUPP)
return (error);
return (0);
}
}
if ((mp = *mpp) == NULL)
return (0);
/*
* VOP_GETWRITEMOUNT() returns with the mp refcount held through
* a vfs_ref().
* If a vnode is not provided, acquire a reference on the passed-in
* mountpoint ourselves, in order to emulate a vfs_ref().
*/
MNT_ILOCK(mp);
if (vp == NULL)
MNT_REF(mp);
/*
* Check on status of suspension.
*/
if ((curthread->td_pflags & TDP_IGNSUSP) == 0 ||
mp->mnt_susp_owner != curthread) {
while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
if (flags & V_NOWAIT) {
error = EWOULDBLOCK;
goto unlock;
}
error = msleep(&mp->mnt_flag, MNT_MTX(mp),
(PUSER - 1) | (flags & PCATCH), "suspfs", 0);
if (error)
goto unlock;
}
}
if (flags & V_XSLEEP)
goto unlock;
mp->mnt_writeopcount++;
unlock:
if (error != 0 || (flags & V_XSLEEP) != 0)
MNT_REL(mp);
MNT_IUNLOCK(mp);
return (error);
}
/*
* Secondary suspension. Used by operations such as vop_inactive
* routines that are needed by the higher level functions. These
* are allowed to proceed until all the higher level functions have
* completed (indicated by mnt_writeopcount dropping to zero). At that
* time, these operations are halted until the suspension is over.
*/
int
vn_start_secondary_write(vp, mpp, flags)
struct vnode *vp;
struct mount **mpp;
int flags;
{
struct mount *mp;
int error;
retry:
if (vp != NULL) {
if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
*mpp = NULL;
if (error != EOPNOTSUPP)
return (error);
return (0);
}
}
/*
* If we are not suspended or have not yet reached suspended
* mode, then let the operation proceed.
*/
if ((mp = *mpp) == NULL)
return (0);
/*
* VOP_GETWRITEMOUNT() returns with the mp refcount held through
* a vfs_ref().
* If a vnode is not provided, acquire a reference on the passed-in
* mountpoint ourselves, in order to emulate a vfs_ref().
*/
MNT_ILOCK(mp);
if (vp == NULL)
MNT_REF(mp);
if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
mp->mnt_secondary_writes++;
mp->mnt_secondary_accwrites++;
MNT_IUNLOCK(mp);
return (0);
}
if (flags & V_NOWAIT) {
MNT_REL(mp);
MNT_IUNLOCK(mp);
return (EWOULDBLOCK);
}
/*
* Wait for the suspension to finish.
*/
error = msleep(&mp->mnt_flag, MNT_MTX(mp),
(PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0);
vfs_rel(mp);
if (error == 0)
goto retry;
return (error);
}
/*
* Filesystem write operation has completed. If we are suspending and this
* operation is the last one, notify the suspender that the suspension is
* now in effect.
*/
void
vn_finished_write(mp)
struct mount *mp;
{
if (mp == NULL)
return;
MNT_ILOCK(mp);
MNT_REL(mp);
mp->mnt_writeopcount--;
if (mp->mnt_writeopcount < 0)
panic("vn_finished_write: neg cnt");
if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
mp->mnt_writeopcount <= 0)
wakeup(&mp->mnt_writeopcount);
MNT_IUNLOCK(mp);
}
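/*
 * Illustrative sketch (not part of this revision): the usual bracketing
 * of a modification with the suspension counters maintained above.  A
 * caller takes the write count with vn_start_write() before locking the
 * vnode and drops it with vn_finished_write() afterwards, so that a
 * pending filesystem suspension can drain all writers.  The function
 * name example_write_op() is made up for illustration.
 */
static int
example_write_op(struct vnode *vp, struct thread *td)
{
	struct mount *mp;
	int error;

	/* Wait out any suspension in progress, then count ourselves in. */
	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
	if (error != 0)
		return (error);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	/* ... perform the modification (VOP_SETATTR(), VOP_WRITE(), ...) ... */
	VOP_UNLOCK(vp, 0);
	vn_finished_write(mp);
	return (error);
}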
/*
* Filesystem secondary write operation has completed. If we are
* suspending and this operation is the last one, notify the suspender
* that the suspension is now in effect.
*/
void
vn_finished_secondary_write(mp)
struct mount *mp;
{
if (mp == NULL)
return;
MNT_ILOCK(mp);
MNT_REL(mp);
mp->mnt_secondary_writes--;
if (mp->mnt_secondary_writes < 0)
panic("vn_finished_secondary_write: neg cnt");
if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
mp->mnt_secondary_writes <= 0)
wakeup(&mp->mnt_secondary_writes);
MNT_IUNLOCK(mp);
}
/*
* Request a filesystem to suspend write operations.
*/
int
vfs_write_suspend(mp)
struct mount *mp;
{
int error;
MNT_ILOCK(mp);
if (mp->mnt_susp_owner == curthread) {
MNT_IUNLOCK(mp);
return (EALREADY);
}
while (mp->mnt_kern_flag & MNTK_SUSPEND)
msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0);
mp->mnt_kern_flag |= MNTK_SUSPEND;
mp->mnt_susp_owner = curthread;
if (mp->mnt_writeopcount > 0)
(void) msleep(&mp->mnt_writeopcount,
MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
else
MNT_IUNLOCK(mp);
if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0)
vfs_write_resume(mp);
return (error);
}
/*
* Request a filesystem to resume write operations.
*/
void
vfs_write_resume(mp)
struct mount *mp;
{
MNT_ILOCK(mp);
if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner"));
mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 |
MNTK_SUSPENDED);
mp->mnt_susp_owner = NULL;
wakeup(&mp->mnt_writeopcount);
wakeup(&mp->mnt_flag);
curthread->td_pflags &= ~TDP_IGNSUSP;
MNT_IUNLOCK(mp);
VFS_SUSP_CLEAN(mp);
} else
MNT_IUNLOCK(mp);
}
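/*
 * Illustrative sketch (not part of this revision): a hypothetical caller
 * that suspends a filesystem, works while all writes are quiesced, and
 * then resumes it.  The function name example_suspend_fs() is made up
 * for illustration.
 */
static int
example_suspend_fs(struct mount *mp)
{
	int error;

	error = vfs_write_suspend(mp);
	if (error != 0)
		return (error);		/* EALREADY if we already own the suspension */
	/* ... the filesystem is now quiescent; take a snapshot, etc. ... */
	vfs_write_resume(mp);
	return (0);
}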
/*
* Implement kqueues for files by translating them to vnode operations.
*/
static int
vn_kqfilter(struct file *fp, struct knote *kn)
{
int vfslocked;
int error;
vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
error = VOP_KQFILTER(fp->f_vnode, kn);
VFS_UNLOCK_GIANT(vfslocked);
return error;
}
/*
* Simplified in-kernel wrapper calls for extended attribute access.
* Both calls pass in a NULL credential, authorizing as "kernel" access.
* Set IO_NODELOCKED in ioflg if the vnode is already locked.
*/
int
vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
const char *attrname, int *buflen, char *buf, struct thread *td)
{
struct uio auio;
struct iovec iov;
int error;
iov.iov_len = *buflen;
iov.iov_base = buf;
auio.uio_iov = &iov;
auio.uio_iovcnt = 1;
auio.uio_rw = UIO_READ;
auio.uio_segflg = UIO_SYSSPACE;
auio.uio_td = td;
auio.uio_offset = 0;
auio.uio_resid = *buflen;
if ((ioflg & IO_NODELOCKED) == 0)
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
/* authorize attribute retrieval as kernel */
error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
td);
if ((ioflg & IO_NODELOCKED) == 0)
VOP_UNLOCK(vp, 0);
if (error == 0) {
*buflen = *buflen - auio.uio_resid;
}
return (error);
}
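/*
 * Illustrative sketch (not part of this revision): fetching an extended
 * attribute into a caller-supplied buffer with the wrapper above.  The
 * attribute name "example.attr" and the buffer size are made up for
 * illustration.
 */
static int
example_read_extattr(struct vnode *vp, struct thread *td)
{
	char buf[64];
	int buflen, error;

	buflen = sizeof(buf);
	/* The wrapper locks the vnode for us since IO_NODELOCKED is not set. */
	error = vn_extattr_get(vp, 0, EXTATTR_NAMESPACE_SYSTEM,
	    "example.attr", &buflen, buf, td);
	if (error == 0) {
		/* buflen now holds the number of bytes actually read. */
	}
	return (error);
}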
/*
* XXX failure mode if partially written?
*/
int
vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
const char *attrname, int buflen, char *buf, struct thread *td)
{
struct uio auio;
struct iovec iov;
struct mount *mp;
int error;
iov.iov_len = buflen;
iov.iov_base = buf;
auio.uio_iov = &iov;
auio.uio_iovcnt = 1;
auio.uio_rw = UIO_WRITE;
auio.uio_segflg = UIO_SYSSPACE;
auio.uio_td = td;
auio.uio_offset = 0;
auio.uio_resid = buflen;
if ((ioflg & IO_NODELOCKED) == 0) {
if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
return (error);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
}
ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
/* authorize attribute setting as kernel */
error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);
if ((ioflg & IO_NODELOCKED) == 0) {
vn_finished_write(mp);
VOP_UNLOCK(vp, 0);
}
return (error);
}
int
vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
const char *attrname, struct thread *td)
{
struct mount *mp;
int error;
if ((ioflg & IO_NODELOCKED) == 0) {
if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
return (error);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
}
ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
/* authorize attribute removal as kernel */
error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
if (error == EOPNOTSUPP)
error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
NULL, td);
if ((ioflg & IO_NODELOCKED) == 0) {
vn_finished_write(mp);
VOP_UNLOCK(vp, 0);
}
return (error);
}
int
vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp)
{
struct mount *mp;
int ltype, error;
mp = vp->v_mount;
ltype = VOP_ISLOCKED(vp);
KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED,
("vn_vget_ino: vp not locked"));
error = vfs_busy(mp, MBF_NOWAIT);
if (error != 0) {
vfs_ref(mp);
VOP_UNLOCK(vp, 0);
error = vfs_busy(mp, 0);
vn_lock(vp, ltype | LK_RETRY);
vfs_rel(mp);
if (error != 0)
return (ENOENT);
if (vp->v_iflag & VI_DOOMED) {
vfs_unbusy(mp);
return (ENOENT);
}
}
VOP_UNLOCK(vp, 0);
error = VFS_VGET(mp, ino, lkflags, rvp);
vfs_unbusy(mp);
vn_lock(vp, ltype | LK_RETRY);
if (vp->v_iflag & VI_DOOMED) {
if (error == 0)
vput(*rvp);
error = ENOENT;
}
return (error);
}
int
vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio,
const struct thread *td)
{
if (vp->v_type != VREG || td == NULL)
return (0);
PROC_LOCK(td->td_proc);
if ((uoff_t)uio->uio_offset + uio->uio_resid >
lim_cur(td->td_proc, RLIMIT_FSIZE)) {
- psignal(td->td_proc, SIGXFSZ);
+ kern_psignal(td->td_proc, SIGXFSZ);
PROC_UNLOCK(td->td_proc);
return (EFBIG);
}
PROC_UNLOCK(td->td_proc);
return (0);
}
int
vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
struct thread *td)
{
struct vnode *vp;
int error, vfslocked;
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
#ifdef AUDIT
vn_lock(vp, LK_SHARED | LK_RETRY);
AUDIT_ARG_VNODE1(vp);
VOP_UNLOCK(vp, 0);
#endif
error = setfmode(td, active_cred, vp, mode);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
int
vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
struct thread *td)
{
struct vnode *vp;
int error, vfslocked;
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
#ifdef AUDIT
vn_lock(vp, LK_SHARED | LK_RETRY);
AUDIT_ARG_VNODE1(vp);
VOP_UNLOCK(vp, 0);
#endif
error = setfown(td, active_cred, vp, uid, gid);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
void
vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end)
{
vm_object_t object;
if ((object = vp->v_object) == NULL)
return;
VM_OBJECT_LOCK(object);
vm_object_page_remove(object, start, end, 0);
VM_OBJECT_UNLOCK(object);
}
Index: head/sys/kgssapi/gss_impl.c
===================================================================
--- head/sys/kgssapi/gss_impl.c (revision 225616)
+++ head/sys/kgssapi/gss_impl.c (revision 225617)
@@ -1,303 +1,303 @@
/*-
* Copyright (c) 2008 Isilon Inc http://www.isilon.com/
* Authors: Doug Rabson <dfr@rabson.org>
* Developed with Red Inc: Alfred Perlstein <alfred@freebsd.org>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/priv.h>
#include <sys/syscall.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <kgssapi/gssapi.h>
#include <kgssapi/gssapi_impl.h>
#include <rpc/rpc.h>
#include <rpc/rpc_com.h>
#include <rpc/rpcsec_gss.h>
#include "gssd.h"
#include "kgss_if.h"
MALLOC_DEFINE(M_GSSAPI, "GSS-API", "GSS-API");
/*
* Syscall hooks
*/
static int gssd_syscall_offset = SYS_gssd_syscall;
static struct sysent gssd_syscall_prev_sysent;
MAKE_SYSENT(gssd_syscall);
static bool_t gssd_syscall_registered = FALSE;
struct kgss_mech_list kgss_mechs;
CLIENT *kgss_gssd_handle;
static void
kgss_init(void *dummy)
{
int error;
LIST_INIT(&kgss_mechs);
error = syscall_register(&gssd_syscall_offset, &gssd_syscall_sysent,
&gssd_syscall_prev_sysent);
if (error)
printf("Can't register GSSD syscall\n");
else
gssd_syscall_registered = TRUE;
}
SYSINIT(kgss_init, SI_SUB_LOCK, SI_ORDER_FIRST, kgss_init, NULL);
static void
kgss_uninit(void *dummy)
{
if (gssd_syscall_registered)
syscall_deregister(&gssd_syscall_offset,
&gssd_syscall_prev_sysent);
}
SYSUNINIT(kgss_uninit, SI_SUB_LOCK, SI_ORDER_FIRST, kgss_uninit, NULL);
int
-gssd_syscall(struct thread *td, struct gssd_syscall_args *uap)
+sys_gssd_syscall(struct thread *td, struct gssd_syscall_args *uap)
{
struct sockaddr_un sun;
struct netconfig *nconf;
char path[MAXPATHLEN];
int error;
error = priv_check(td, PRIV_NFS_DAEMON);
if (error)
return (error);
if (kgss_gssd_handle)
CLNT_DESTROY(kgss_gssd_handle);
error = copyinstr(uap->path, path, sizeof(path), NULL);
if (error)
return (error);
sun.sun_family = AF_LOCAL;
strcpy(sun.sun_path, path);
sun.sun_len = SUN_LEN(&sun);
nconf = getnetconfigent("local");
kgss_gssd_handle = clnt_reconnect_create(nconf,
(struct sockaddr *) &sun, GSSD, GSSDVERS,
RPC_MAXDATASIZE, RPC_MAXDATASIZE);
return (0);
}
int
kgss_oid_equal(const gss_OID oid1, const gss_OID oid2)
{
if (oid1 == oid2)
return (1);
if (!oid1 || !oid2)
return (0);
if (oid1->length != oid2->length)
return (0);
if (memcmp(oid1->elements, oid2->elements, oid1->length))
return (0);
return (1);
}
void
kgss_install_mech(gss_OID mech_type, const char *name, struct kobj_class *cls)
{
struct kgss_mech *km;
km = malloc(sizeof(struct kgss_mech), M_GSSAPI, M_WAITOK);
km->km_mech_type = mech_type;
km->km_mech_name = name;
km->km_class = cls;
LIST_INSERT_HEAD(&kgss_mechs, km, km_link);
}
void
kgss_uninstall_mech(gss_OID mech_type)
{
struct kgss_mech *km;
LIST_FOREACH(km, &kgss_mechs, km_link) {
if (kgss_oid_equal(km->km_mech_type, mech_type)) {
LIST_REMOVE(km, km_link);
free(km, M_GSSAPI);
return;
}
}
}
gss_OID
kgss_find_mech_by_name(const char *name)
{
struct kgss_mech *km;
LIST_FOREACH(km, &kgss_mechs, km_link) {
if (!strcmp(km->km_mech_name, name)) {
return (km->km_mech_type);
}
}
return (GSS_C_NO_OID);
}
const char *
kgss_find_mech_by_oid(const gss_OID oid)
{
struct kgss_mech *km;
LIST_FOREACH(km, &kgss_mechs, km_link) {
if (kgss_oid_equal(km->km_mech_type, oid)) {
return (km->km_mech_name);
}
}
return (NULL);
}
gss_ctx_id_t
kgss_create_context(gss_OID mech_type)
{
struct kgss_mech *km;
gss_ctx_id_t ctx;
LIST_FOREACH(km, &kgss_mechs, km_link) {
if (kgss_oid_equal(km->km_mech_type, mech_type))
break;
}
if (!km)
return (NULL);
ctx = (gss_ctx_id_t) kobj_create(km->km_class, M_GSSAPI, M_WAITOK);
KGSS_INIT(ctx);
return (ctx);
}
void
kgss_delete_context(gss_ctx_id_t ctx, gss_buffer_t output_token)
{
KGSS_DELETE(ctx, output_token);
kobj_delete((kobj_t) ctx, M_GSSAPI);
}
OM_uint32
kgss_transfer_context(gss_ctx_id_t ctx)
{
struct export_sec_context_res res;
struct export_sec_context_args args;
enum clnt_stat stat;
OM_uint32 maj_stat;
if (!kgss_gssd_handle)
return (GSS_S_FAILURE);
args.ctx = ctx->handle;
bzero(&res, sizeof(res));
stat = gssd_export_sec_context_1(&args, &res, kgss_gssd_handle);
if (stat != RPC_SUCCESS) {
return (GSS_S_FAILURE);
}
maj_stat = KGSS_IMPORT(ctx, res.format, &res.interprocess_token);
ctx->handle = 0;
xdr_free((xdrproc_t) xdr_export_sec_context_res, &res);
return (maj_stat);
}
void
kgss_copy_buffer(const gss_buffer_t from, gss_buffer_t to)
{
to->length = from->length;
if (from->length) {
to->value = malloc(from->length, M_GSSAPI, M_WAITOK);
bcopy(from->value, to->value, from->length);
} else {
to->value = NULL;
}
}
/*
* Kernel module glue
*/
static int
kgssapi_modevent(module_t mod, int type, void *data)
{
int error = 0;
switch (type) {
case MOD_LOAD:
rpc_gss_entries.rpc_gss_secfind = rpc_gss_secfind;
rpc_gss_entries.rpc_gss_secpurge = rpc_gss_secpurge;
rpc_gss_entries.rpc_gss_seccreate = rpc_gss_seccreate;
rpc_gss_entries.rpc_gss_set_defaults = rpc_gss_set_defaults;
rpc_gss_entries.rpc_gss_max_data_length =
rpc_gss_max_data_length;
rpc_gss_entries.rpc_gss_get_error = rpc_gss_get_error;
rpc_gss_entries.rpc_gss_mech_to_oid = rpc_gss_mech_to_oid;
rpc_gss_entries.rpc_gss_oid_to_mech = rpc_gss_oid_to_mech;
rpc_gss_entries.rpc_gss_qop_to_num = rpc_gss_qop_to_num;
rpc_gss_entries.rpc_gss_get_mechanisms = rpc_gss_get_mechanisms;
rpc_gss_entries.rpc_gss_get_versions = rpc_gss_get_versions;
rpc_gss_entries.rpc_gss_is_installed = rpc_gss_is_installed;
rpc_gss_entries.rpc_gss_set_svc_name = rpc_gss_set_svc_name;
rpc_gss_entries.rpc_gss_clear_svc_name = rpc_gss_clear_svc_name;
rpc_gss_entries.rpc_gss_getcred = rpc_gss_getcred;
rpc_gss_entries.rpc_gss_set_callback = rpc_gss_set_callback;
rpc_gss_entries.rpc_gss_clear_callback = rpc_gss_clear_callback;
rpc_gss_entries.rpc_gss_get_principal_name =
rpc_gss_get_principal_name;
rpc_gss_entries.rpc_gss_svc_max_data_length =
rpc_gss_svc_max_data_length;
break;
case MOD_UNLOAD:
/*
* Unloading of the kgssapi module is not currently supported.
* If somebody wants this, we would need to keep track of
* currently executing threads and make sure the count is 0.
*/
/* FALLTHROUGH */
default:
error = EOPNOTSUPP;
};
return (error);
}
static moduledata_t kgssapi_mod = {
"kgssapi",
kgssapi_modevent,
NULL,
};
DECLARE_MODULE(kgssapi, kgssapi_mod, SI_SUB_VFS, SI_ORDER_ANY);
MODULE_DEPEND(kgssapi, krpc, 1, 1, 1);
MODULE_VERSION(kgssapi, 1);
Index: head/sys/mips/mips/pm_machdep.c
===================================================================
--- head/sys/mips/mips/pm_machdep.c (revision 225616)
+++ head/sys/mips/mips/pm_machdep.c (revision 225617)
@@ -1,576 +1,576 @@
/*-
* Copyright (c) 1992 Terrence R. Lambert.
* Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* William Jolitz.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
* from: src/sys/i386/i386/machdep.c,v 1.385.2.3 2000/05/10 02:04:46 obrien
* JNPR: pm_machdep.c,v 1.9.2.1 2007/08/16 15:59:10 girish
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_cputype.h"
#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysent.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/ucontext.h>
#include <sys/lock.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>
#include <sys/ptrace.h>
#include <sys/syslog.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <sys/user.h>
#include <sys/uio.h>
#include <machine/reg.h>
#include <machine/md_var.h>
#include <machine/sigframe.h>
#include <machine/vmparam.h>
#include <sys/vnode.h>
#include <fs/pseudofs/pseudofs.h>
#include <fs/procfs/procfs.h>
#define UCONTEXT_MAGIC 0xACEDBADE
/*
* Send an interrupt to a process.
*
* Stack is set up to allow sigcode stored
* at top to call routine, followed by kcall
* to sigreturn routine below. After sigreturn
* resets the signal mask, the stack, and the
* frame pointer, it returns to the user
* specified pc, psl.
*/
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
struct proc *p;
struct thread *td;
struct trapframe *regs;
struct sigacts *psp;
struct sigframe sf, *sfp;
int sig;
int oonstack;
td = curthread;
p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
sig = ksi->ksi_signo;
psp = p->p_sigacts;
mtx_assert(&psp->ps_mtx, MA_OWNED);
regs = td->td_frame;
oonstack = sigonstack(regs->sp);
/* save user context */
bzero(&sf, sizeof(struct sigframe));
sf.sf_uc.uc_sigmask = *mask;
sf.sf_uc.uc_stack = td->td_sigstk;
sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
sf.sf_uc.uc_mcontext.mc_pc = regs->pc;
sf.sf_uc.uc_mcontext.mullo = regs->mullo;
sf.sf_uc.uc_mcontext.mulhi = regs->mulhi;
sf.sf_uc.uc_mcontext.mc_regs[0] = UCONTEXT_MAGIC; /* magic number */
bcopy((void *)&regs->ast, (void *)&sf.sf_uc.uc_mcontext.mc_regs[1],
sizeof(sf.sf_uc.uc_mcontext.mc_regs) - sizeof(register_t));
sf.sf_uc.uc_mcontext.mc_fpused = td->td_md.md_flags & MDTD_FPUSED;
if (sf.sf_uc.uc_mcontext.mc_fpused) {
/* if FPU has current state, save it first */
if (td == PCPU_GET(fpcurthread))
MipsSaveCurFPState(td);
bcopy((void *)&td->td_frame->f0,
(void *)sf.sf_uc.uc_mcontext.mc_fpregs,
sizeof(sf.sf_uc.uc_mcontext.mc_fpregs));
}
/* Allocate and validate space for the signal handler context. */
if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
SIGISMEMBER(psp->ps_sigonstack, sig)) {
sfp = (struct sigframe *)((vm_offset_t)(td->td_sigstk.ss_sp +
td->td_sigstk.ss_size - sizeof(struct sigframe))
& ~(sizeof(__int64_t) - 1));
} else
sfp = (struct sigframe *)((vm_offset_t)(regs->sp -
sizeof(struct sigframe)) & ~(sizeof(__int64_t) - 1));
/* Translate the signal if appropriate */
if (p->p_sysent->sv_sigtbl) {
if (sig <= p->p_sysent->sv_sigsize)
sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
}
/* Build the argument list for the signal handler. */
regs->a0 = sig;
regs->a2 = (register_t)(intptr_t)&sfp->sf_uc;
if (SIGISMEMBER(psp->ps_siginfo, sig)) {
/* Signal handler installed with SA_SIGINFO. */
regs->a1 = (register_t)(intptr_t)&sfp->sf_si;
/* sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; */
/* fill siginfo structure */
sf.sf_si.si_signo = sig;
sf.sf_si.si_code = ksi->ksi_code;
sf.sf_si.si_addr = (void*)(intptr_t)regs->badvaddr;
} else {
/* Old FreeBSD-style arguments. */
regs->a1 = ksi->ksi_code;
regs->a3 = regs->badvaddr;
/* sf.sf_ahu.sf_handler = catcher; */
}
mtx_unlock(&psp->ps_mtx);
PROC_UNLOCK(p);
/*
* Copy the sigframe out to the user's stack.
*/
if (copyout(&sf, sfp, sizeof(struct sigframe)) != 0) {
/*
* Something is wrong with the stack pointer.
* ...Kill the process.
*/
PROC_LOCK(p);
sigexit(td, SIGILL);
}
regs->pc = (register_t)(intptr_t)catcher;
regs->t9 = (register_t)(intptr_t)catcher;
regs->sp = (register_t)(intptr_t)sfp;
/*
* Signal trampoline code is at base of user stack.
*/
regs->ra = (register_t)(intptr_t)PS_STRINGS - *(p->p_sysent->sv_szsigcode);
PROC_LOCK(p);
mtx_lock(&psp->ps_mtx);
}
#ifdef GONE_IN_7
/*
* Build siginfo_t for SA thread
*/
void
cpu_thread_siginfo(int sig, u_long code, siginfo_t *si)
{
struct proc *p;
struct thread *td;
td = curthread;
p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
bzero(si, sizeof(*si));
si->si_signo = sig;
si->si_code = code;
/* XXXKSE fill other fields */
}
#endif
/*
* System call to cleanup state after a signal
* has been taken. Reset signal mask and
* stack state from context left by sendsig (above).
* Return to previous pc as specified by
* context left by sendsig.
*/
int
-sigreturn(struct thread *td, struct sigreturn_args *uap)
+sys_sigreturn(struct thread *td, struct sigreturn_args *uap)
{
struct trapframe *regs;
ucontext_t *ucp;
ucontext_t uc;
int error;
ucp = &uc;
error = copyin(uap->sigcntxp, &uc, sizeof(uc));
if (error != 0)
return (error);
regs = td->td_frame;
/* #ifdef DEBUG */
if (ucp->uc_mcontext.mc_regs[ZERO] != UCONTEXT_MAGIC) {
printf("sigreturn: pid %d, ucp %p\n", td->td_proc->p_pid, ucp);
printf(" old sp %p ra %p pc %p\n",
(void *)(intptr_t)regs->sp, (void *)(intptr_t)regs->ra, (void *)(intptr_t)regs->pc);
printf(" new sp %p ra %p pc %p z %p\n",
(void *)(intptr_t)ucp->uc_mcontext.mc_regs[SP],
(void *)(intptr_t)ucp->uc_mcontext.mc_regs[RA],
(void *)(intptr_t)ucp->uc_mcontext.mc_regs[PC],
(void *)(intptr_t)ucp->uc_mcontext.mc_regs[ZERO]);
return EINVAL;
}
/* #endif */
bcopy((const void *)&ucp->uc_mcontext.mc_regs[1], (void *)&regs->ast,
sizeof(ucp->uc_mcontext.mc_regs) - sizeof(register_t));
if (ucp->uc_mcontext.mc_fpused)
bcopy((const void *)ucp->uc_mcontext.mc_fpregs,
(void *)&td->td_frame->f0,
sizeof(ucp->uc_mcontext.mc_fpregs));
regs->pc = ucp->uc_mcontext.mc_pc;
regs->mullo = ucp->uc_mcontext.mullo;
regs->mulhi = ucp->uc_mcontext.mulhi;
kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
return(EJUSTRETURN);
}
int
ptrace_set_pc(struct thread *td, unsigned long addr)
{
td->td_frame->pc = (register_t) addr;
return 0;
}
static int
ptrace_read_int(struct thread *td, off_t addr, int *v)
{
struct iovec iov;
struct uio uio;
PROC_LOCK_ASSERT(td->td_proc, MA_NOTOWNED);
iov.iov_base = (caddr_t) v;
iov.iov_len = sizeof(int);
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = (off_t)addr;
uio.uio_resid = sizeof(int);
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_rw = UIO_READ;
uio.uio_td = td;
return proc_rwmem(td->td_proc, &uio);
}
static int
ptrace_write_int(struct thread *td, off_t addr, int v)
{
struct iovec iov;
struct uio uio;
PROC_LOCK_ASSERT(td->td_proc, MA_NOTOWNED);
iov.iov_base = (caddr_t) &v;
iov.iov_len = sizeof(int);
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = (off_t)addr;
uio.uio_resid = sizeof(int);
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_rw = UIO_WRITE;
uio.uio_td = td;
return proc_rwmem(td->td_proc, &uio);
}
int
ptrace_single_step(struct thread *td)
{
unsigned va;
struct trapframe *locr0 = td->td_frame;
int i;
int bpinstr = MIPS_BREAK_SSTEP;
int curinstr;
struct proc *p;
p = td->td_proc;
PROC_UNLOCK(p);
/*
* Fetch what's at the current location.
*/
ptrace_read_int(td, (off_t)locr0->pc, &curinstr);
/* compute next address after current location */
if(curinstr != 0) {
va = MipsEmulateBranch(locr0, locr0->pc, locr0->fsr,
(uintptr_t)&curinstr);
} else {
va = locr0->pc + 4;
}
if (td->td_md.md_ss_addr) {
printf("SS %s (%d): breakpoint already set at %x (va %x)\n",
p->p_comm, p->p_pid, td->td_md.md_ss_addr, va); /* XXX */
return (EFAULT);
}
td->td_md.md_ss_addr = va;
/*
* Fetch what's at the next location, so it can be restored later.
*/
ptrace_read_int(td, (off_t)va, &td->td_md.md_ss_instr);
/*
* Store breakpoint instruction at the "next" location now.
*/
i = ptrace_write_int (td, va, bpinstr);
/*
* The sync'ing of I & D caches is done by procfs_domem()
* through procfs_rwmem().
*/
PROC_LOCK(p);
if (i < 0)
return (EFAULT);
#if 0
printf("SS %s (%d): breakpoint set at %x: %x (pc %x) br %x\n",
p->p_comm, p->p_pid, p->p_md.md_ss_addr,
p->p_md.md_ss_instr, locr0->pc, curinstr); /* XXX */
#endif
return (0);
}
void
makectx(struct trapframe *tf, struct pcb *pcb)
{
pcb->pcb_regs.ra = tf->ra;
pcb->pcb_regs.pc = tf->pc;
pcb->pcb_regs.sp = tf->sp;
}
int
fill_regs(struct thread *td, struct reg *regs)
{
memcpy(regs, td->td_frame, sizeof(struct reg));
return (0);
}
int
set_regs(struct thread *td, struct reg *regs)
{
struct trapframe *f;
register_t sr;
f = (struct trapframe *) td->td_frame;
/*
* Don't allow the user to change SR
*/
sr = f->sr;
memcpy(td->td_frame, regs, sizeof(struct reg));
f->sr = sr;
return (0);
}
int
get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
{
struct trapframe *tp;
tp = td->td_frame;
PROC_LOCK(curthread->td_proc);
mcp->mc_onstack = sigonstack(tp->sp);
PROC_UNLOCK(curthread->td_proc);
bcopy((void *)&td->td_frame->zero, (void *)&mcp->mc_regs,
sizeof(mcp->mc_regs));
mcp->mc_fpused = td->td_md.md_flags & MDTD_FPUSED;
if (mcp->mc_fpused) {
bcopy((void *)&td->td_frame->f0, (void *)&mcp->mc_fpregs,
sizeof(mcp->mc_fpregs));
}
if (flags & GET_MC_CLEAR_RET) {
mcp->mc_regs[V0] = 0;
mcp->mc_regs[V1] = 0;
mcp->mc_regs[A3] = 0;
}
mcp->mc_pc = td->td_frame->pc;
mcp->mullo = td->td_frame->mullo;
mcp->mulhi = td->td_frame->mulhi;
mcp->mc_tls = td->td_md.md_tls;
return (0);
}
int
set_mcontext(struct thread *td, const mcontext_t *mcp)
{
struct trapframe *tp;
tp = td->td_frame;
bcopy((void *)&mcp->mc_regs, (void *)&td->td_frame->zero,
sizeof(mcp->mc_regs));
td->td_md.md_flags = mcp->mc_fpused & MDTD_FPUSED;
if (mcp->mc_fpused) {
bcopy((void *)&mcp->mc_fpregs, (void *)&td->td_frame->f0,
sizeof(mcp->mc_fpregs));
}
td->td_frame->pc = mcp->mc_pc;
td->td_frame->mullo = mcp->mullo;
td->td_frame->mulhi = mcp->mulhi;
td->td_md.md_tls = mcp->mc_tls;
/* Don't let the user set any bits in the Status and Cause registers. */
return (0);
}
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{
if (td == PCPU_GET(fpcurthread))
MipsSaveCurFPState(td);
memcpy(fpregs, &td->td_frame->f0, sizeof(struct fpreg));
return 0;
}
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{
if (PCPU_GET(fpcurthread) == td)
PCPU_SET(fpcurthread, (struct thread *)0);
memcpy(&td->td_frame->f0, fpregs, sizeof(struct fpreg));
return 0;
}
/*
* Clear registers on exec
* $sp is set to the stack pointer passed in. $pc is set to the entry
* point given by the exec_package passed in, as is $t9 (used for PIC
* code by the MIPS elf abi).
*/
void
exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
bzero((caddr_t)td->td_frame, sizeof(struct trapframe));
/*
* The stack pointer has to be aligned to accommodate the largest
* datatype at minimum. This probably means it should be 16-byte
* aligned, but for now we're 8-byte aligning it.
*/
td->td_frame->sp = ((register_t) stack) & ~(sizeof(__int64_t) - 1);
/*
* If we're running o32 or n32 programs but have 64-bit registers,
* GCC may use stack-relative addressing near the top of user
* address space that, due to sign extension, will yield an
* invalid address. For instance, if sp is 0x7fffff00 then GCC
* might do something like this to load a word from 0x7ffffff0:
*
* addu sp, sp, 32768
* lw t0, -32528(sp)
*
* On systems with 64-bit registers, sp is sign-extended to
* 0xffffffff80007f00 and the load is instead done from
* 0xffffffff7ffffff0.
*
* To prevent this, we subtract 64K from the stack pointer here.
*
* For consistency, we should just always do this unless we're
* running n64 programs. For now, since we don't support
* COMPAT_FREEBSD32 on n64 kernels, we just do it unless we're
* running n64 kernels.
*/
#if !defined(__mips_n64)
td->td_frame->sp -= 65536;
#endif
td->td_frame->pc = imgp->entry_addr & ~3;
td->td_frame->t9 = imgp->entry_addr & ~3; /* abicall req */
td->td_frame->sr = MIPS_SR_KSU_USER | MIPS_SR_EXL | MIPS_SR_INT_IE |
(mips_rd_status() & MIPS_SR_INT_MASK);
#if defined(__mips_n32)
td->td_frame->sr |= MIPS_SR_PX;
#elif defined(__mips_n64)
td->td_frame->sr |= MIPS_SR_PX | MIPS_SR_UX | MIPS_SR_KX;
#endif
#ifdef CPU_CNMIPS
td->td_frame->sr |= MIPS_SR_COP_2_BIT | MIPS_SR_PX | MIPS_SR_UX |
MIPS_SR_KX | MIPS_SR_SX;
#endif
/*
* FREEBSD_DEVELOPERS_FIXME:
* Setup any other CPU-Specific registers (Not MIPS Standard)
* and/or bits in other standard MIPS registers (if CPU-Specific)
* that are needed.
*/
/*
* Set up arguments for the rtld-capable crt0:
* a0 stack pointer
* a1 rtld cleanup (filled in by dynamic loader)
* a2 rtld object (filled in by dynamic loader)
* a3 ps_strings
*/
td->td_frame->a0 = (register_t) stack;
td->td_frame->a1 = 0;
td->td_frame->a2 = 0;
td->td_frame->a3 = (register_t)imgp->ps_strings;
td->td_md.md_flags &= ~MDTD_FPUSED;
if (PCPU_GET(fpcurthread) == td)
PCPU_SET(fpcurthread, (struct thread *)0);
td->td_md.md_ss_addr = 0;
}
int
ptrace_clear_single_step(struct thread *td)
{
int i;
struct proc *p;
p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
if (!td->td_md.md_ss_addr)
return EINVAL;
/*
* Restore original instruction and clear BP
*/
i = ptrace_write_int (td, td->td_md.md_ss_addr, td->td_md.md_ss_instr);
/* The sync'ing of I & D caches is done by procfs_domem(). */
if (i < 0) {
log(LOG_ERR, "SS %s %d: can't restore instruction at %x: %x\n",
p->p_comm, p->p_pid, td->td_md.md_ss_addr,
td->td_md.md_ss_instr);
}
td->td_md.md_ss_addr = 0;
return 0;
}
Index: head/sys/net/route.c
===================================================================
--- head/sys/net/route.c (revision 225616)
+++ head/sys/net/route.c (revision 225617)
@@ -1,1593 +1,1593 @@
/*-
* Copyright (c) 1980, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)route.c 8.3.1.1 (Berkeley) 2/23/95
* $FreeBSD$
*/
/************************************************************************
* Note: In this file a 'fib' is a "forwarding information base" *
* which is the new name for an in-kernel routing (next hop) table. *
***********************************************************************/
#include "opt_inet.h"
#include "opt_route.h"
#include "opt_mrouting.h"
#include "opt_mpath.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/syslog.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/sysproto.h>
#include <sys/proc.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <net/vnet.h>
#include <net/flowtable.h>
#ifdef RADIX_MPATH
#include <net/radix_mpath.h>
#endif
#include <netinet/in.h>
#include <netinet/ip_mroute.h>
#include <vm/uma.h>
u_int rt_numfibs = RT_NUMFIBS;
SYSCTL_UINT(_net, OID_AUTO, fibs, CTLFLAG_RD, &rt_numfibs, 0, "");
/*
* Allow the boot code to select fewer than RT_MAXFIBS fibs to be used.
* We can't do more because storage is statically allocated for now.
* (for compatibility reasons.. this will change).
*/
TUNABLE_INT("net.fibs", &rt_numfibs);
/*
* By default add routes to all fibs for new interfaces.
* Once this is set to 0, only allocate routes on interface changes
* for the FIB of the caller when adding a new set of addresses to
* an interface.  XXX this is a shotgun approach to a problem that
* needs a more fine-grained solution.. that will come.
*/
u_int rt_add_addr_allfibs = 1;
SYSCTL_UINT(_net, OID_AUTO, add_addr_allfibs, CTLFLAG_RW,
&rt_add_addr_allfibs, 0, "");
TUNABLE_INT("net.add_addr_allfibs", &rt_add_addr_allfibs);
VNET_DEFINE(struct rtstat, rtstat);
#define V_rtstat VNET(rtstat)
VNET_DEFINE(struct radix_node_head *, rt_tables);
#define V_rt_tables VNET(rt_tables)
VNET_DEFINE(int, rttrash); /* routes not in table but not freed */
#define V_rttrash VNET(rttrash)
/* compare two sockaddr structures */
#define sa_equal(a1, a2) (bcmp((a1), (a2), (a1)->sa_len) == 0)
/*
* Convert a 'struct radix_node *' to a 'struct rtentry *'.
* The operation can be done safely (in this code) because a
* 'struct rtentry' starts with two 'struct radix_node''s, the first
* one representing leaf nodes in the routing tree, which is
* what the code in radix.c passes us as a 'struct radix_node'.
*
* But because there are a lot of assumptions in this conversion,
* do not cast explicitly, but always use the macro below.
*/
#define RNTORT(p) ((struct rtentry *)(p))
static VNET_DEFINE(uma_zone_t, rtzone); /* Routing table UMA zone. */
#define V_rtzone VNET(rtzone)
/*
* handler for net.my_fibnum
*/
static int
sysctl_my_fibnum(SYSCTL_HANDLER_ARGS)
{
int fibnum;
int error;
fibnum = curthread->td_proc->p_fibnum;
error = sysctl_handle_int(oidp, &fibnum, 0, req);
return (error);
}
SYSCTL_PROC(_net, OID_AUTO, my_fibnum, CTLTYPE_INT|CTLFLAG_RD,
NULL, 0, &sysctl_my_fibnum, "I", "default FIB of caller");
static __inline struct radix_node_head **
rt_tables_get_rnh_ptr(int table, int fam)
{
struct radix_node_head **rnh;
KASSERT(table >= 0 && table < rt_numfibs, ("%s: table out of bounds.",
__func__));
KASSERT(fam >= 0 && fam < (AF_MAX+1), ("%s: fam out of bounds.",
__func__));
/* rnh is [fib=0][af=0]. */
rnh = (struct radix_node_head **)V_rt_tables;
/* Get the offset to the requested table and fam. */
rnh += table * (AF_MAX+1) + fam;
return (rnh);
}
struct radix_node_head *
rt_tables_get_rnh(int table, int fam)
{
return (*rt_tables_get_rnh_ptr(table, fam));
}
/*
* route initialization must occur before ip6_init2(), which happens at
* SI_ORDER_MIDDLE.
*/
static void
route_init(void)
{
struct domain *dom;
int max_keylen = 0;
/* whack the tunable ints into line. */
if (rt_numfibs > RT_MAXFIBS)
rt_numfibs = RT_MAXFIBS;
if (rt_numfibs == 0)
rt_numfibs = 1;
for (dom = domains; dom; dom = dom->dom_next)
if (dom->dom_maxrtkey > max_keylen)
max_keylen = dom->dom_maxrtkey;
rn_init(max_keylen); /* init all zeroes, all ones, mask table */
}
SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, 0);
static void
vnet_route_init(const void *unused __unused)
{
struct domain *dom;
struct radix_node_head **rnh;
int table;
int fam;
V_rt_tables = malloc(rt_numfibs * (AF_MAX+1) *
sizeof(struct radix_node_head *), M_RTABLE, M_WAITOK|M_ZERO);
V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, 0);
for (dom = domains; dom; dom = dom->dom_next) {
if (dom->dom_rtattach) {
for (table = 0; table < rt_numfibs; table++) {
if ( (fam = dom->dom_family) == AF_INET ||
table == 0) {
/* for now only AF_INET has > 1 table */
/* XXX MRT
* rtattach will be also called
* from vfs_export.c but the
* offset will be 0
* (only for AF_INET and AF_INET6
* which don't need it anyhow)
*/
rnh = rt_tables_get_rnh_ptr(table, fam);
if (rnh == NULL)
panic("%s: rnh NULL", __func__);
dom->dom_rtattach((void **)rnh,
dom->dom_rtoffset);
} else {
break;
}
}
}
}
}
VNET_SYSINIT(vnet_route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH,
vnet_route_init, 0);
#ifdef VIMAGE
static void
vnet_route_uninit(const void *unused __unused)
{
int table;
int fam;
struct domain *dom;
struct radix_node_head **rnh;
for (dom = domains; dom; dom = dom->dom_next) {
if (dom->dom_rtdetach) {
for (table = 0; table < rt_numfibs; table++) {
if ( (fam = dom->dom_family) == AF_INET ||
table == 0) {
/* For now only AF_INET has > 1 tbl. */
rnh = rt_tables_get_rnh_ptr(table, fam);
if (rnh == NULL)
panic("%s: rnh NULL", __func__);
dom->dom_rtdetach((void **)rnh,
dom->dom_rtoffset);
} else {
break;
}
}
}
}
}
VNET_SYSUNINIT(vnet_route_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
vnet_route_uninit, 0);
#endif
#ifndef _SYS_SYSPROTO_H_
struct setfib_args {
int fibnum;
};
#endif
int
-setfib(struct thread *td, struct setfib_args *uap)
+sys_setfib(struct thread *td, struct setfib_args *uap)
{
if (uap->fibnum < 0 || uap->fibnum >= rt_numfibs)
return EINVAL;
td->td_proc->p_fibnum = uap->fibnum;
return (0);
}
/*
* Packet routing routines.
*/
void
rtalloc(struct route *ro)
{
rtalloc_ign_fib(ro, 0UL, 0);
}
void
rtalloc_fib(struct route *ro, u_int fibnum)
{
rtalloc_ign_fib(ro, 0UL, fibnum);
}
void
rtalloc_ign(struct route *ro, u_long ignore)
{
struct rtentry *rt;
if ((rt = ro->ro_rt) != NULL) {
if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP)
return;
RTFREE(rt);
ro->ro_rt = NULL;
}
ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, 0);
if (ro->ro_rt)
RT_UNLOCK(ro->ro_rt);
}
void
rtalloc_ign_fib(struct route *ro, u_long ignore, u_int fibnum)
{
struct rtentry *rt;
if ((rt = ro->ro_rt) != NULL) {
if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP)
return;
RTFREE(rt);
ro->ro_rt = NULL;
}
ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, fibnum);
if (ro->ro_rt)
RT_UNLOCK(ro->ro_rt);
}
/*
* Look up the route that matches the address given
* Or, at least try.. Create a cloned route if needed.
*
* The returned route, if any, is locked.
*/
struct rtentry *
rtalloc1(struct sockaddr *dst, int report, u_long ignflags)
{
return (rtalloc1_fib(dst, report, ignflags, 0));
}
struct rtentry *
rtalloc1_fib(struct sockaddr *dst, int report, u_long ignflags,
u_int fibnum)
{
struct radix_node_head *rnh;
struct radix_node *rn;
struct rtentry *newrt;
struct rt_addrinfo info;
int err = 0, msgtype = RTM_MISS;
int needlock;
KASSERT((fibnum < rt_numfibs), ("rtalloc1_fib: bad fibnum"));
if (dst->sa_family != AF_INET) /* Only INET supports > 1 fib now */
fibnum = 0;
rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
newrt = NULL;
if (rnh == NULL)
goto miss;
/*
* Look up the address in the table for that Address Family
*/
needlock = !(ignflags & RTF_RNH_LOCKED);
if (needlock)
RADIX_NODE_HEAD_RLOCK(rnh);
#ifdef INVARIANTS
else
RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
#endif
rn = rnh->rnh_matchaddr(dst, rnh);
if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) {
newrt = RNTORT(rn);
RT_LOCK(newrt);
RT_ADDREF(newrt);
if (needlock)
RADIX_NODE_HEAD_RUNLOCK(rnh);
goto done;
} else if (needlock)
RADIX_NODE_HEAD_RUNLOCK(rnh);
/*
* Either we hit the root or couldn't find any match,
* which basically means
* "caint get there frm here"
*/
miss:
V_rtstat.rts_unreach++;
if (report) {
/*
* If required, report the failure to the supervising
* Authorities.
* For a delete, this is not an error. (report == 0)
*/
bzero(&info, sizeof(info));
info.rti_info[RTAX_DST] = dst;
rt_missmsg(msgtype, &info, 0, err);
}
done:
if (newrt)
RT_LOCK_ASSERT(newrt);
return (newrt);
}
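/*
 * Illustrative sketch (not part of this revision): a hypothetical lookup
 * through rtalloc1_fib() and release of the reference it returns.  As
 * noted above, the returned rtentry is locked and referenced, so the
 * caller must drop both; RTFREE_LOCKED() does exactly that.  The
 * function name example_lookup() is made up for illustration.
 */
static void
example_lookup(struct sockaddr *dst, u_int fibnum)
{
	struct rtentry *rt;

	rt = rtalloc1_fib(dst, 1, 0UL, fibnum);
	if (rt == NULL)
		return;			/* no route; an RTM_MISS was reported */
	/* ... examine rt->rt_gateway, rt->rt_ifp, etc. ... */
	RTFREE_LOCKED(rt);		/* drop the reference and the lock */
}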
/*
* Remove a reference count from an rtentry.
* If the count gets low enough, take it out of the routing table
*/
void
rtfree(struct rtentry *rt)
{
struct radix_node_head *rnh;
KASSERT(rt != NULL,("%s: NULL rt", __func__));
rnh = rt_tables_get_rnh(rt->rt_fibnum, rt_key(rt)->sa_family);
KASSERT(rnh != NULL,("%s: NULL rnh", __func__));
RT_LOCK_ASSERT(rt);
/*
* The callers should use RTFREE_LOCKED() or RTFREE(), so
* we should come here exactly with the last reference.
*/
RT_REMREF(rt);
if (rt->rt_refcnt > 0) {
log(LOG_DEBUG, "%s: %p has %d refs\n", __func__, rt, rt->rt_refcnt);
goto done;
}
/*
* On last reference give the "close method" a chance
* to cleanup private state. This also permits (for
* IPv4 and IPv6) a chance to decide if the routing table
* entry should be purged immediately or at a later time.
* When an immediate purge is to happen the close routine
* typically calls rtexpunge which clears the RTF_UP flag
* on the entry so that the code below reclaims the storage.
*/
if (rt->rt_refcnt == 0 && rnh->rnh_close)
rnh->rnh_close((struct radix_node *)rt, rnh);
/*
* If we are no longer "up" (and ref == 0)
* then we can free the resources associated
* with the route.
*/
if ((rt->rt_flags & RTF_UP) == 0) {
if (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT))
panic("rtfree 2");
/*
* the rtentry must have been removed from the routing table
* so it is represented in rttrash.. remove that now.
*/
V_rttrash--;
#ifdef DIAGNOSTIC
if (rt->rt_refcnt < 0) {
printf("rtfree: %p not freed (neg refs)\n", rt);
goto done;
}
#endif
/*
* release references on items we hold, e.g. other routes and ifaddrs.
*/
if (rt->rt_ifa)
ifa_free(rt->rt_ifa);
/*
* The key is separately alloc'd so free it (see rt_setgate()).
* This also frees the gateway, as they are always malloc'd
* together.
*/
Free(rt_key(rt));
/*
* and the rtentry itself of course
*/
RT_LOCK_DESTROY(rt);
uma_zfree(V_rtzone, rt);
return;
}
done:
RT_UNLOCK(rt);
}
/*
* Force a routing table entry to the specified
* destination to go through the given gateway.
* Normally called as a result of a routing redirect
* message from the network layer.
*/
void
rtredirect(struct sockaddr *dst,
struct sockaddr *gateway,
struct sockaddr *netmask,
int flags,
struct sockaddr *src)
{
rtredirect_fib(dst, gateway, netmask, flags, src, 0);
}
void
rtredirect_fib(struct sockaddr *dst,
struct sockaddr *gateway,
struct sockaddr *netmask,
int flags,
struct sockaddr *src,
u_int fibnum)
{
struct rtentry *rt, *rt0 = NULL;
int error = 0;
short *stat = NULL;
struct rt_addrinfo info;
struct ifaddr *ifa;
struct radix_node_head *rnh;
ifa = NULL;
rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
if (rnh == NULL) {
error = EAFNOSUPPORT;
goto out;
}
/* verify the gateway is directly reachable */
if ((ifa = ifa_ifwithnet(gateway, 0)) == NULL) {
error = ENETUNREACH;
goto out;
}
rt = rtalloc1_fib(dst, 0, 0UL, fibnum); /* NB: rt is locked */
/*
* If the redirect isn't from our current router for this dst,
* it's either old or wrong. If it redirects us to ourselves,
* we have a routing loop, perhaps as a result of an interface
* going down recently.
*/
if (!(flags & RTF_DONE) && rt &&
(!sa_equal(src, rt->rt_gateway) || rt->rt_ifa != ifa))
error = EINVAL;
else if (ifa_ifwithaddr_check(gateway))
error = EHOSTUNREACH;
if (error)
goto done;
/*
* Create a new entry if we just got back a wildcard entry
* or the lookup failed. This is necessary for hosts
* which use routing redirects generated by smart gateways
* to dynamically build the routing tables.
*/
if (rt == NULL || (rt_mask(rt) && rt_mask(rt)->sa_len < 2))
goto create;
/*
* Don't listen to the redirect if it's
* for a route to an interface.
*/
if (rt->rt_flags & RTF_GATEWAY) {
if (((rt->rt_flags & RTF_HOST) == 0) && (flags & RTF_HOST)) {
/*
* Changing from route to net => route to host.
* Create new route, rather than smashing route to net.
*/
create:
rt0 = rt;
rt = NULL;
flags |= RTF_GATEWAY | RTF_DYNAMIC;
bzero((caddr_t)&info, sizeof(info));
info.rti_info[RTAX_DST] = dst;
info.rti_info[RTAX_GATEWAY] = gateway;
info.rti_info[RTAX_NETMASK] = netmask;
info.rti_ifa = ifa;
info.rti_flags = flags;
if (rt0 != NULL)
RT_UNLOCK(rt0); /* drop lock to avoid LOR with RNH */
error = rtrequest1_fib(RTM_ADD, &info, &rt, fibnum);
if (rt != NULL) {
RT_LOCK(rt);
if (rt0 != NULL)
EVENTHANDLER_INVOKE(route_redirect_event, rt0, rt, dst);
flags = rt->rt_flags;
}
if (rt0 != NULL)
RTFREE(rt0);
stat = &V_rtstat.rts_dynamic;
} else {
struct rtentry *gwrt;
/*
* Smash the current notion of the gateway to
* this destination. Should check about netmask!!!
*/
rt->rt_flags |= RTF_MODIFIED;
flags |= RTF_MODIFIED;
stat = &V_rtstat.rts_newgateway;
/*
* add the key and gateway (in one malloc'd chunk).
*/
RT_UNLOCK(rt);
RADIX_NODE_HEAD_LOCK(rnh);
RT_LOCK(rt);
rt_setgate(rt, rt_key(rt), gateway);
gwrt = rtalloc1(gateway, 1, RTF_RNH_LOCKED);
RADIX_NODE_HEAD_UNLOCK(rnh);
EVENTHANDLER_INVOKE(route_redirect_event, rt, gwrt, dst);
RTFREE_LOCKED(gwrt);
}
} else
error = EHOSTUNREACH;
done:
if (rt)
RTFREE_LOCKED(rt);
out:
if (error)
V_rtstat.rts_badredirect++;
else if (stat != NULL)
(*stat)++;
bzero((caddr_t)&info, sizeof(info));
info.rti_info[RTAX_DST] = dst;
info.rti_info[RTAX_GATEWAY] = gateway;
info.rti_info[RTAX_NETMASK] = netmask;
info.rti_info[RTAX_AUTHOR] = src;
rt_missmsg(RTM_REDIRECT, &info, flags, error);
if (ifa != NULL)
ifa_free(ifa);
}
int
rtioctl(u_long req, caddr_t data)
{
return (rtioctl_fib(req, data, 0));
}
/*
* Routing table ioctl interface.
*/
int
rtioctl_fib(u_long req, caddr_t data, u_int fibnum)
{
/*
* If more ioctl commands are added here, make sure the proper
* super-user checks are being performed because it is possible for
* prison-root to make it this far if raw sockets have been enabled
* in jails.
*/
#ifdef INET
/* Multicast goop, grrr... */
return mrt_ioctl ? mrt_ioctl(req, data, fibnum) : EOPNOTSUPP;
#else /* INET */
return ENXIO;
#endif /* INET */
}
/*
* For both ifa_ifwithroute() routines, 'ifa' is returned referenced.
*/
struct ifaddr *
ifa_ifwithroute(int flags, struct sockaddr *dst, struct sockaddr *gateway)
{
return (ifa_ifwithroute_fib(flags, dst, gateway, 0));
}
struct ifaddr *
ifa_ifwithroute_fib(int flags, struct sockaddr *dst, struct sockaddr *gateway,
u_int fibnum)
{
register struct ifaddr *ifa;
int not_found = 0;
if ((flags & RTF_GATEWAY) == 0) {
/*
* If we are adding a route to an interface,
* and the interface is a pt to pt link
* we should search for the destination
* as our clue to the interface. Otherwise
* we can use the local address.
*/
ifa = NULL;
if (flags & RTF_HOST)
ifa = ifa_ifwithdstaddr(dst);
if (ifa == NULL)
ifa = ifa_ifwithaddr(gateway);
} else {
/*
* If we are adding a route to a remote net
* or host, the gateway may still be on the
* other end of a pt to pt link.
*/
ifa = ifa_ifwithdstaddr(gateway);
}
if (ifa == NULL)
ifa = ifa_ifwithnet(gateway, 0);
if (ifa == NULL) {
struct rtentry *rt = rtalloc1_fib(gateway, 0, RTF_RNH_LOCKED, fibnum);
if (rt == NULL)
return (NULL);
/*
* dismiss a gateway that is reachable only
* through the default router
*/
switch (gateway->sa_family) {
case AF_INET:
if (satosin(rt_key(rt))->sin_addr.s_addr == INADDR_ANY)
not_found = 1;
break;
case AF_INET6:
if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(rt))->sin6_addr))
not_found = 1;
break;
default:
break;
}
if (!not_found && rt->rt_ifa != NULL) {
ifa = rt->rt_ifa;
ifa_ref(ifa);
}
RT_REMREF(rt);
RT_UNLOCK(rt);
if (not_found || ifa == NULL)
return (NULL);
}
if (ifa->ifa_addr->sa_family != dst->sa_family) {
struct ifaddr *oifa = ifa;
ifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp);
if (ifa == NULL)
ifa = oifa;
else
ifa_free(oifa);
}
return (ifa);
}
/*
* Do appropriate manipulations of a routing tree given
* all the bits of info needed
*/
int
rtrequest(int req,
struct sockaddr *dst,
struct sockaddr *gateway,
struct sockaddr *netmask,
int flags,
struct rtentry **ret_nrt)
{
return (rtrequest_fib(req, dst, gateway, netmask, flags, ret_nrt, 0));
}
int
rtrequest_fib(int req,
struct sockaddr *dst,
struct sockaddr *gateway,
struct sockaddr *netmask,
int flags,
struct rtentry **ret_nrt,
u_int fibnum)
{
struct rt_addrinfo info;
if (dst->sa_len == 0)
return(EINVAL);
bzero((caddr_t)&info, sizeof(info));
info.rti_flags = flags;
info.rti_info[RTAX_DST] = dst;
info.rti_info[RTAX_GATEWAY] = gateway;
info.rti_info[RTAX_NETMASK] = netmask;
return rtrequest1_fib(req, &info, ret_nrt, fibnum);
}
/*
* These (questionable) definitions of apparent local variables apply
* to the next two functions. XXXXXX!!!
*/
#define dst info->rti_info[RTAX_DST]
#define gateway info->rti_info[RTAX_GATEWAY]
#define netmask info->rti_info[RTAX_NETMASK]
#define ifaaddr info->rti_info[RTAX_IFA]
#define ifpaddr info->rti_info[RTAX_IFP]
#define flags info->rti_flags
int
rt_getifa(struct rt_addrinfo *info)
{
return (rt_getifa_fib(info, 0));
}
/*
* Look up rt_addrinfo for a specific fib. Note that if rti_ifa is defined,
* it will be referenced so the caller must free it.
*/
int
rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum)
{
struct ifaddr *ifa;
int error = 0;
/*
* ifp may be specified by sockaddr_dl
* when protocol address is ambiguous.
*/
if (info->rti_ifp == NULL && ifpaddr != NULL &&
ifpaddr->sa_family == AF_LINK &&
(ifa = ifa_ifwithnet(ifpaddr, 0)) != NULL) {
info->rti_ifp = ifa->ifa_ifp;
ifa_free(ifa);
}
if (info->rti_ifa == NULL && ifaaddr != NULL)
info->rti_ifa = ifa_ifwithaddr(ifaaddr);
if (info->rti_ifa == NULL) {
struct sockaddr *sa;
sa = ifaaddr != NULL ? ifaaddr :
(gateway != NULL ? gateway : dst);
if (sa != NULL && info->rti_ifp != NULL)
info->rti_ifa = ifaof_ifpforaddr(sa, info->rti_ifp);
else if (dst != NULL && gateway != NULL)
info->rti_ifa = ifa_ifwithroute_fib(flags, dst, gateway,
fibnum);
else if (sa != NULL)
info->rti_ifa = ifa_ifwithroute_fib(flags, sa, sa,
fibnum);
}
if ((ifa = info->rti_ifa) != NULL) {
if (info->rti_ifp == NULL)
info->rti_ifp = ifa->ifa_ifp;
} else
error = ENETUNREACH;
return (error);
}
/*
* Expunges references to a route that's about to be reclaimed.
* The route must be locked.
*/
int
rtexpunge(struct rtentry *rt)
{
#if !defined(RADIX_MPATH)
struct radix_node *rn;
#else
struct rt_addrinfo info;
int fib;
struct rtentry *rt0;
#endif
struct radix_node_head *rnh;
struct ifaddr *ifa;
int error = 0;
/*
* Find the correct routing tree to use for this Address Family
*/
rnh = rt_tables_get_rnh(rt->rt_fibnum, rt_key(rt)->sa_family);
RT_LOCK_ASSERT(rt);
if (rnh == NULL)
return (EAFNOSUPPORT);
RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
#ifdef RADIX_MPATH
fib = rt->rt_fibnum;
bzero(&info, sizeof(info));
info.rti_ifp = rt->rt_ifp;
info.rti_flags = RTF_RNH_LOCKED;
info.rti_info[RTAX_DST] = rt_key(rt);
info.rti_info[RTAX_GATEWAY] = rt->rt_ifa->ifa_addr;
RT_UNLOCK(rt);
error = rtrequest1_fib(RTM_DELETE, &info, &rt0, fib);
if (error == 0 && rt0 != NULL) {
rt = rt0;
RT_LOCK(rt);
} else if (error != 0) {
RT_LOCK(rt);
return (error);
}
#else
/*
* Remove the item from the tree; it should be there,
* but when callers invoke us blindly it may not (sigh).
*/
rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), rnh);
if (rn == NULL) {
error = ESRCH;
goto bad;
}
KASSERT((rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) == 0,
("unexpected flags 0x%x", rn->rn_flags));
KASSERT(rt == RNTORT(rn),
("lookup mismatch, rt %p rn %p", rt, rn));
#endif /* RADIX_MPATH */
rt->rt_flags &= ~RTF_UP;
/*
* Give the protocol a chance to keep things in sync.
*/
if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest) {
struct rt_addrinfo info;
bzero((caddr_t)&info, sizeof(info));
info.rti_flags = rt->rt_flags;
info.rti_info[RTAX_DST] = rt_key(rt);
info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
info.rti_info[RTAX_NETMASK] = rt_mask(rt);
ifa->ifa_rtrequest(RTM_DELETE, rt, &info);
}
/*
* one more rtentry floating around that is not
* linked to the routing table.
*/
V_rttrash++;
#if !defined(RADIX_MPATH)
bad:
#endif
return (error);
}
#ifdef RADIX_MPATH
static int
rn_mpath_update(int req, struct rt_addrinfo *info,
struct radix_node_head *rnh, struct rtentry **ret_nrt)
{
/*
* if we got multipath routes, we require users to specify
* a matching RTAX_GATEWAY.
*/
struct rtentry *rt, *rto = NULL;
register struct radix_node *rn;
int error = 0;
rn = rnh->rnh_matchaddr(dst, rnh);
if (rn == NULL)
return (ESRCH);
rto = rt = RNTORT(rn);
rt = rt_mpath_matchgate(rt, gateway);
if (rt == NULL)
return (ESRCH);
/*
* this is the first entry in the chain
*/
if (rto == rt) {
rn = rn_mpath_next((struct radix_node *)rt);
/*
* there is another entry, now it's active
*/
if (rn) {
rto = RNTORT(rn);
RT_LOCK(rto);
rto->rt_flags |= RTF_UP;
RT_UNLOCK(rto);
} else if (rt->rt_flags & RTF_GATEWAY) {
/*
* For gateway routes, we need to
* make sure that we are deleting
* the correct gateway.
* rt_mpath_matchgate() does not
* check the case when there is only
* one route in the chain.
*/
if (gateway &&
(rt->rt_gateway->sa_len != gateway->sa_len ||
memcmp(rt->rt_gateway, gateway, gateway->sa_len)))
error = ESRCH;
else {
/*
* remove from tree before returning it
* to the caller
*/
rn = rnh->rnh_deladdr(dst, netmask, rnh);
KASSERT(rt == RNTORT(rn), ("radix node disappeared"));
goto gwdelete;
}
}
/*
* use the normal delete code to remove
* the first entry
*/
if (req != RTM_DELETE)
goto nondelete;
error = ENOENT;
goto done;
}
/*
* if the entry is 2nd and on up
*/
if ((req == RTM_DELETE) && !rt_mpath_deldup(rto, rt))
panic ("rtrequest1: rt_mpath_deldup");
gwdelete:
RT_LOCK(rt);
RT_ADDREF(rt);
if (req == RTM_DELETE) {
rt->rt_flags &= ~RTF_UP;
/*
* One more rtentry floating around that is not
* linked to the routing table. rttrash will be decremented
* when RTFREE(rt) is eventually called.
*/
V_rttrash++;
}
nondelete:
if (req != RTM_DELETE)
panic("unrecognized request %d", req);
/*
* If the caller wants it, then it can have it,
* but it's up to it to free the rtentry as we won't be
* doing it.
*/
if (ret_nrt) {
*ret_nrt = rt;
RT_UNLOCK(rt);
} else
RTFREE_LOCKED(rt);
done:
return (error);
}
#endif
int
rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt,
u_int fibnum)
{
int error = 0, needlock = 0;
register struct rtentry *rt;
#ifdef FLOWTABLE
register struct rtentry *rt0;
#endif
register struct radix_node *rn;
register struct radix_node_head *rnh;
struct ifaddr *ifa;
struct sockaddr *ndst;
#define senderr(x) { error = x ; goto bad; }
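/*
* senderr(x) records the error and jumps to the common unlock/return
* path at "bad" below.
*/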
KASSERT((fibnum < rt_numfibs), ("rtrequest1_fib: bad fibnum"));
if (dst->sa_family != AF_INET) /* Only INET supports > 1 fib now */
fibnum = 0;
/*
* Find the correct routing tree to use for this Address Family
*/
rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
if (rnh == NULL)
return (EAFNOSUPPORT);
needlock = ((flags & RTF_RNH_LOCKED) == 0);
flags &= ~RTF_RNH_LOCKED;
if (needlock)
RADIX_NODE_HEAD_LOCK(rnh);
else
RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
/*
* If we are adding a host route then we don't want to put
* a netmask in the tree, nor do we want to clone it.
*/
if (flags & RTF_HOST)
netmask = NULL;
switch (req) {
case RTM_DELETE:
#ifdef RADIX_MPATH
if (rn_mpath_capable(rnh)) {
error = rn_mpath_update(req, info, rnh, ret_nrt);
/*
* "bad" holds true for the success case
* as well
*/
if (error != ENOENT)
goto bad;
error = 0;
}
#endif
/*
* Remove the item from the tree and return it.
* Complain if it is not there and do no more processing.
*/
rn = rnh->rnh_deladdr(dst, netmask, rnh);
if (rn == NULL)
senderr(ESRCH);
if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT))
panic ("rtrequest delete");
rt = RNTORT(rn);
RT_LOCK(rt);
RT_ADDREF(rt);
rt->rt_flags &= ~RTF_UP;
/*
* give the protocol a chance to keep things in sync.
*/
if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest)
ifa->ifa_rtrequest(RTM_DELETE, rt, info);
/*
* One more rtentry floating around that is not
* linked to the routing table. rttrash will be decremented
* when RTFREE(rt) is eventually called.
*/
V_rttrash++;
/*
* If the caller wants it, then it can have it,
* but it's up to it to free the rtentry as we won't be
* doing it.
*/
if (ret_nrt) {
*ret_nrt = rt;
RT_UNLOCK(rt);
} else
RTFREE_LOCKED(rt);
break;
case RTM_RESOLVE:
/*
* resolve was only used for route cloning;
* kept here for compatibility
*/
break;
case RTM_ADD:
if ((flags & RTF_GATEWAY) && !gateway)
senderr(EINVAL);
if (dst && gateway && (dst->sa_family != gateway->sa_family) &&
(gateway->sa_family != AF_UNSPEC) && (gateway->sa_family != AF_LINK))
senderr(EINVAL);
if (info->rti_ifa == NULL) {
error = rt_getifa_fib(info, fibnum);
if (error)
senderr(error);
} else
ifa_ref(info->rti_ifa);
ifa = info->rti_ifa;
rt = uma_zalloc(V_rtzone, M_NOWAIT | M_ZERO);
if (rt == NULL) {
if (ifa != NULL)
ifa_free(ifa);
senderr(ENOBUFS);
}
RT_LOCK_INIT(rt);
rt->rt_flags = RTF_UP | flags;
rt->rt_fibnum = fibnum;
/*
* Add the gateway. Possibly re-malloc-ing the storage for it.
*/
RT_LOCK(rt);
if ((error = rt_setgate(rt, dst, gateway)) != 0) {
RT_LOCK_DESTROY(rt);
if (ifa != NULL)
ifa_free(ifa);
uma_zfree(V_rtzone, rt);
senderr(error);
}
/*
* point to the (possibly newly malloc'd) dest address.
*/
ndst = (struct sockaddr *)rt_key(rt);
/*
* make sure it contains the value we want (masked if needed).
*/
if (netmask) {
rt_maskedcopy(dst, ndst, netmask);
} else
bcopy(dst, ndst, dst->sa_len);
/*
* We use the ifa reference returned by rt_getifa_fib().
* This moved from below so that rnh->rnh_addaddr() can
* examine the ifa and ifa->ifa_ifp if it so desires.
*/
rt->rt_ifa = ifa;
rt->rt_ifp = ifa->ifa_ifp;
rt->rt_rmx.rmx_weight = 1;
#ifdef RADIX_MPATH
/* do not permit exactly the same dst/mask/gw pair */
if (rn_mpath_capable(rnh) &&
rt_mpath_conflict(rnh, rt, netmask)) {
if (rt->rt_ifa) {
ifa_free(rt->rt_ifa);
}
Free(rt_key(rt));
RT_LOCK_DESTROY(rt);
uma_zfree(V_rtzone, rt);
senderr(EEXIST);
}
#endif
#ifdef FLOWTABLE
rt0 = NULL;
/* XXX
* "flow-table" only support IPv4 at the moment.
* XXX-BZ as of r205066 it would support IPv6.
*/
#ifdef INET
if (dst->sa_family == AF_INET) {
rn = rnh->rnh_matchaddr(dst, rnh);
if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) {
struct sockaddr *mask;
u_char *m, *n;
int len;
/*
* compare mask to see if the new route is
* more specific than the existing one
*/
rt0 = RNTORT(rn);
RT_LOCK(rt0);
RT_ADDREF(rt0);
RT_UNLOCK(rt0);
/*
* A host route is already present, so
* leave the flow-table entries as is.
*/
if (rt0->rt_flags & RTF_HOST) {
RTFREE(rt0);
rt0 = NULL;
} else if (!(flags & RTF_HOST) && netmask) {
mask = rt_mask(rt0);
len = mask->sa_len;
m = (u_char *)mask;
n = (u_char *)netmask;
while (len-- > 0) {
if (*n != *m)
break;
n++;
m++;
}
if (len == 0 || (*n < *m)) {
RTFREE(rt0);
rt0 = NULL;
}
}
}
}
#endif
#endif
/* XXX mtu manipulation will be done in rnh_addaddr -- itojun */
rn = rnh->rnh_addaddr(ndst, netmask, rnh, rt->rt_nodes);
/*
* If it still failed to go into the tree,
* then un-make it (this should be a function)
*/
if (rn == NULL) {
if (rt->rt_ifa)
ifa_free(rt->rt_ifa);
Free(rt_key(rt));
RT_LOCK_DESTROY(rt);
uma_zfree(V_rtzone, rt);
#ifdef FLOWTABLE
if (rt0 != NULL)
RTFREE(rt0);
#endif
senderr(EEXIST);
}
#ifdef FLOWTABLE
else if (rt0 != NULL) {
#ifdef INET
flowtable_route_flush(V_ip_ft, rt0);
#endif
RTFREE(rt0);
}
#endif
/*
* If this protocol has something to add to this then
* allow it to do that as well.
*/
if (ifa->ifa_rtrequest)
ifa->ifa_rtrequest(req, rt, info);
/*
* actually return a resultant rtentry and
* give the caller a single reference.
*/
if (ret_nrt) {
*ret_nrt = rt;
RT_ADDREF(rt);
}
RT_UNLOCK(rt);
break;
default:
error = EOPNOTSUPP;
}
bad:
if (needlock)
RADIX_NODE_HEAD_UNLOCK(rnh);
return (error);
#undef senderr
}
#undef dst
#undef gateway
#undef netmask
#undef ifaaddr
#undef ifpaddr
#undef flags
int
rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate)
{
/* XXX dst may be overwritten, can we move this to below */
int dlen = SA_SIZE(dst), glen = SA_SIZE(gate);
#ifdef INVARIANTS
struct radix_node_head *rnh;
rnh = rt_tables_get_rnh(rt->rt_fibnum, dst->sa_family);
#endif
RT_LOCK_ASSERT(rt);
RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
/*
* Prepare to store the gateway in rt->rt_gateway.
* Both dst and gateway are stored one after the other in the same
* malloc'd chunk. If we have room, we can reuse the old buffer,
* rt_gateway already points to the right place.
* Otherwise, malloc a new block and update the 'dst' address.
*/
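/*
* Rough sketch of the shared allocation: [ dst (dlen bytes) | gateway
* (glen bytes) ], with rt_key(rt) pointing at offset 0 and
* rt->rt_gateway at offset dlen.
*/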
if (rt->rt_gateway == NULL || glen > SA_SIZE(rt->rt_gateway)) {
caddr_t new;
R_Malloc(new, caddr_t, dlen + glen);
if (new == NULL)
return ENOBUFS;
/*
* XXX note, we copy from *dst and not *rt_key(rt) because
* rt_setgate() can be called to initialize a newly
* allocated route entry, in which case rt_key(rt) == NULL
* (and also rt->rt_gateway == NULL).
* Free()/free() handle a NULL argument just fine.
*/
bcopy(dst, new, dlen);
Free(rt_key(rt)); /* free old block, if any */
rt_key(rt) = (struct sockaddr *)new;
rt->rt_gateway = (struct sockaddr *)(new + dlen);
}
/*
* Copy the new gateway value into the memory chunk.
*/
bcopy(gate, rt->rt_gateway, glen);
return (0);
}
void
rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst, struct sockaddr *netmask)
{
register u_char *cp1 = (u_char *)src;
register u_char *cp2 = (u_char *)dst;
register u_char *cp3 = (u_char *)netmask;
u_char *cplim = cp2 + *cp3;
u_char *cplim2 = cp2 + *cp1;
*cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */
cp3 += 2;
if (cplim > cplim2)
cplim = cplim2;
while (cp2 < cplim)
*cp2++ = *cp1++ & *cp3++;
if (cp2 < cplim2)
bzero((caddr_t)cp2, (unsigned)(cplim2 - cp2));
}
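/*
* Example (AF_INET): masking 192.0.2.77 with a 255.255.255.0 netmask
* leaves 192.0.2.0 in dst's address bytes, and anything in dst beyond
* the netmask length is zeroed.
*/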
/*
* Set up a routing table entry, normally
* for an interface.
*/
#define _SOCKADDR_TMPSIZE 128 /* Not too big.. kernel stack size is limited */
static inline int
rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum)
{
struct sockaddr *dst;
struct sockaddr *netmask;
struct rtentry *rt = NULL;
struct rt_addrinfo info;
int error = 0;
int startfib, endfib;
char tempbuf[_SOCKADDR_TMPSIZE];
int didwork = 0;
int a_failure = 0;
static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK};
if (flags & RTF_HOST) {
dst = ifa->ifa_dstaddr;
netmask = NULL;
} else {
dst = ifa->ifa_addr;
netmask = ifa->ifa_netmask;
}
if (dst->sa_family != AF_INET)
fibnum = 0;
if (fibnum == -1) {
if (rt_add_addr_allfibs == 0 && cmd == (int)RTM_ADD) {
startfib = endfib = curthread->td_proc->p_fibnum;
} else {
startfib = 0;
endfib = rt_numfibs - 1;
}
} else {
KASSERT((fibnum < rt_numfibs), ("rtinit1: bad fibnum"));
startfib = fibnum;
endfib = fibnum;
}
if (dst->sa_len == 0)
return(EINVAL);
/*
* If it's a delete, check that if it exists,
* it's on the correct interface or we might scrub
* a route to another ifa which would
* be confusing at best and possibly worse.
*/
if (cmd == RTM_DELETE) {
/*
* It's a delete, so it should already exist..
* If it's a net, mask off the host bits
* (Assuming we have a mask)
* XXX this is kinda inet specific..
*/
if (netmask != NULL) {
rt_maskedcopy(dst, (struct sockaddr *)tempbuf, netmask);
dst = (struct sockaddr *)tempbuf;
}
}
/*
* Now go through all the requested tables (fibs) and do the
* requested action. Realistically, this will either be fib 0
* for protocols that don't do multiple tables or all the
* tables for those that do. XXX For this version only AF_INET.
* When that changes code should be refactored to protocol
* independent parts and protocol dependent parts.
*/
for (fibnum = startfib; fibnum <= endfib; fibnum++) {
if (cmd == RTM_DELETE) {
struct radix_node_head *rnh;
struct radix_node *rn;
/*
* Look up an rtentry that is in the routing tree and
* contains the correct info.
*/
rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
if (rnh == NULL)
/* this table doesn't exist but others might */
continue;
RADIX_NODE_HEAD_LOCK(rnh);
#ifdef RADIX_MPATH
if (rn_mpath_capable(rnh)) {
rn = rnh->rnh_matchaddr(dst, rnh);
if (rn == NULL)
error = ESRCH;
else {
rt = RNTORT(rn);
/*
* for interface route the
* rt->rt_gateway is sockaddr_intf
* for cloning ARP entries, so
* rt_mpath_matchgate must use the
* interface address
*/
rt = rt_mpath_matchgate(rt,
ifa->ifa_addr);
if (!rt)
error = ESRCH;
}
}
else
#endif
rn = rnh->rnh_lookup(dst, netmask, rnh);
error = (rn == NULL ||
(rn->rn_flags & RNF_ROOT) ||
RNTORT(rn)->rt_ifa != ifa ||
!sa_equal((struct sockaddr *)rn->rn_key, dst));
RADIX_NODE_HEAD_UNLOCK(rnh);
if (error) {
/* this is only an error if bad on ALL tables */
continue;
}
}
/*
* Do the actual request
*/
bzero((caddr_t)&info, sizeof(info));
info.rti_ifa = ifa;
info.rti_flags = flags | (ifa->ifa_flags & ~IFA_RTSELF);
info.rti_info[RTAX_DST] = dst;
/*
* doing this for compatibility reasons
*/
if (cmd == RTM_ADD)
info.rti_info[RTAX_GATEWAY] =
(struct sockaddr *)&null_sdl;
else
info.rti_info[RTAX_GATEWAY] = ifa->ifa_addr;
info.rti_info[RTAX_NETMASK] = netmask;
error = rtrequest1_fib(cmd, &info, &rt, fibnum);
if (error == 0 && rt != NULL) {
/*
* notify any listening routing agents of the change
*/
RT_LOCK(rt);
#ifdef RADIX_MPATH
/*
* in case address alias finds the first address
* e.g. ifconfig bge0 192.103.54.246/24
* e.g. ifconfig bge0 192.103.54.247/24
* the address set in the route is 192.103.54.246
* so we need to replace it with 192.103.54.247
*/
if (memcmp(rt->rt_ifa->ifa_addr,
ifa->ifa_addr, ifa->ifa_addr->sa_len)) {
ifa_free(rt->rt_ifa);
ifa_ref(ifa);
rt->rt_ifp = ifa->ifa_ifp;
rt->rt_ifa = ifa;
}
#endif
/*
* doing this for compatibility reasons
*/
if (cmd == RTM_ADD) {
((struct sockaddr_dl *)rt->rt_gateway)->sdl_type =
rt->rt_ifp->if_type;
((struct sockaddr_dl *)rt->rt_gateway)->sdl_index =
rt->rt_ifp->if_index;
}
RT_ADDREF(rt);
RT_UNLOCK(rt);
rt_newaddrmsg(cmd, ifa, error, rt);
RT_LOCK(rt);
RT_REMREF(rt);
if (cmd == RTM_DELETE) {
/*
* If we are deleting, and we found an entry,
* then it's been removed from the tree..
* now throw it away.
*/
RTFREE_LOCKED(rt);
} else {
if (cmd == RTM_ADD) {
/*
* We just wanted to add it..
* we don't actually need a reference.
*/
RT_REMREF(rt);
}
RT_UNLOCK(rt);
}
didwork = 1;
}
if (error)
a_failure = error;
}
if (cmd == RTM_DELETE) {
if (didwork) {
error = 0;
} else {
/* we only give an error if it wasn't in any table */
error = ((flags & RTF_HOST) ?
EHOSTUNREACH : ENETUNREACH);
}
} else {
if (a_failure) {
/* return an error if any of them failed */
error = a_failure;
}
}
return (error);
}
/* special one for inet internal use; other code may not use it. */
int
rtinit_fib(struct ifaddr *ifa, int cmd, int flags)
{
return (rtinit1(ifa, cmd, flags, -1));
}
/*
* Set up a routing table entry, normally
* for an interface.
*/
int
rtinit(struct ifaddr *ifa, int cmd, int flags)
{
struct sockaddr *dst;
int fib = 0;
if (flags & RTF_HOST) {
dst = ifa->ifa_dstaddr;
} else {
dst = ifa->ifa_addr;
}
if (dst->sa_family == AF_INET)
fib = -1;
return (rtinit1(ifa, cmd, flags, fib));
}
Index: head/sys/nfs/nfs_nfssvc.c
===================================================================
--- head/sys/nfs/nfs_nfssvc.c (revision 225616)
+++ head/sys/nfs/nfs_nfssvc.c (revision 225617)
@@ -1,156 +1,156 @@
/*-
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Rick Macklem at The University of Guelph.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_nfs.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/module.h>
#include <sys/sysent.h>
#include <sys/syscall.h>
#include <sys/sysproto.h>
#include <security/audit/audit.h>
#include <nfs/nfssvc.h>
static int nfssvc_offset = SYS_nfssvc;
static struct sysent nfssvc_prev_sysent;
MAKE_SYSENT(nfssvc);
/*
* This tiny module simply handles the nfssvc() system call. The other
* nfs modules that use the system call register themselves by setting
* the nfsd_call_xxx function pointers non-NULL.
*/
int (*nfsd_call_nfsserver)(struct thread *, struct nfssvc_args *) = NULL;
int (*nfsd_call_nfscommon)(struct thread *, struct nfssvc_args *) = NULL;
int (*nfsd_call_nfscl)(struct thread *, struct nfssvc_args *) = NULL;
int (*nfsd_call_nfsd)(struct thread *, struct nfssvc_args *) = NULL;
/*
* NFS server pseudo system call for the nfsd's
*/
int
-nfssvc(struct thread *td, struct nfssvc_args *uap)
+sys_nfssvc(struct thread *td, struct nfssvc_args *uap)
{
int error;
KASSERT(!mtx_owned(&Giant), ("nfssvc(): called with Giant"));
AUDIT_ARG_CMD(uap->flag);
/* Allow anyone to get the stats. */
if ((uap->flag & ~NFSSVC_GETSTATS) != 0) {
error = priv_check(td, PRIV_NFS_DAEMON);
if (error != 0)
return (error);
}
error = EINVAL;
if ((uap->flag & (NFSSVC_ADDSOCK | NFSSVC_OLDNFSD | NFSSVC_NFSD)) &&
nfsd_call_nfsserver != NULL)
error = (*nfsd_call_nfsserver)(td, uap);
else if ((uap->flag & (NFSSVC_CBADDSOCK | NFSSVC_NFSCBD)) &&
nfsd_call_nfscl != NULL)
error = (*nfsd_call_nfscl)(td, uap);
else if ((uap->flag & (NFSSVC_IDNAME | NFSSVC_GETSTATS |
NFSSVC_GSSDADDPORT | NFSSVC_GSSDADDFIRST | NFSSVC_GSSDDELETEALL |
NFSSVC_NFSUSERDPORT | NFSSVC_NFSUSERDDELPORT)) &&
nfsd_call_nfscommon != NULL)
error = (*nfsd_call_nfscommon)(td, uap);
else if ((uap->flag & (NFSSVC_NFSDNFSD | NFSSVC_NFSDADDSOCK |
NFSSVC_PUBLICFH | NFSSVC_V4ROOTEXPORT | NFSSVC_NOPUBLICFH |
NFSSVC_STABLERESTART | NFSSVC_ADMINREVOKE |
NFSSVC_DUMPCLIENTS | NFSSVC_DUMPLOCKS | NFSSVC_BACKUPSTABLE)) &&
nfsd_call_nfsd != NULL)
error = (*nfsd_call_nfsd)(td, uap);
if (error == EINTR || error == ERESTART)
error = 0;
return (error);
}
/*
* Called once to initialize data structures...
*/
static int
nfssvc_modevent(module_t mod, int type, void *data)
{
static int registered;
int error = 0;
switch (type) {
case MOD_LOAD:
error = syscall_register(&nfssvc_offset, &nfssvc_sysent,
&nfssvc_prev_sysent);
if (error)
break;
registered = 1;
break;
case MOD_UNLOAD:
if (nfsd_call_nfsserver != NULL || nfsd_call_nfscommon != NULL
|| nfsd_call_nfscl != NULL || nfsd_call_nfsd != NULL) {
error = EBUSY;
break;
}
if (registered)
syscall_deregister(&nfssvc_offset, &nfssvc_prev_sysent);
registered = 0;
break;
default:
error = EOPNOTSUPP;
break;
}
return error;
}
static moduledata_t nfssvc_mod = {
"nfssvc",
nfssvc_modevent,
NULL,
};
DECLARE_MODULE(nfssvc, nfssvc_mod, SI_SUB_VFS, SI_ORDER_ANY);
/* So that loader and kldload(2) can find us, wherever we are.. */
MODULE_VERSION(nfssvc, 1);
Index: head/sys/nlm/nlm_prot_impl.c
===================================================================
--- head/sys/nlm/nlm_prot_impl.c (revision 225616)
+++ head/sys/nlm/nlm_prot_impl.c (revision 225617)
@@ -1,2434 +1,2434 @@
/*-
* Copyright (c) 2008 Isilon Inc http://www.isilon.com/
* Authors: Doug Rabson <dfr@rabson.org>
* Developed with Red Inc: Alfred Perlstein <alfred@freebsd.org>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "opt_inet6.h"
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/fail.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/lockf.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#if __FreeBSD_version >= 700000
#include <sys/priv.h>
#endif
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syscall.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/syslog.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <sys/unistd.h>
#include <sys/vnode.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs_lock.h>
#include <nlm/nlm_prot.h>
#include <nlm/sm_inter.h>
#include <nlm/nlm.h>
#include <rpc/rpc_com.h>
#include <rpc/rpcb_prot.h>
MALLOC_DEFINE(M_NLM, "NLM", "Network Lock Manager");
/*
* If a host is inactive (and holds no locks) for this many
* seconds, we consider it idle and stop tracking it.
*/
#define NLM_IDLE_TIMEOUT 30
/*
* We check the host list for idle every few seconds.
*/
#define NLM_IDLE_PERIOD 5
/*
* We only look for GRANTED_RES messages for a little while.
*/
#define NLM_EXPIRE_TIMEOUT 10
/*
* Support for sysctl vfs.nlm.sysid
*/
SYSCTL_NODE(_vfs, OID_AUTO, nlm, CTLFLAG_RW, NULL, "Network Lock Manager");
SYSCTL_NODE(_vfs_nlm, OID_AUTO, sysid, CTLFLAG_RW, NULL, "");
/*
* Syscall hooks
*/
static int nlm_syscall_offset = SYS_nlm_syscall;
static struct sysent nlm_syscall_prev_sysent;
#if __FreeBSD_version < 700000
static struct sysent nlm_syscall_sysent = {
(sizeof(struct nlm_syscall_args) / sizeof(register_t)) | SYF_MPSAFE,
(sy_call_t *) nlm_syscall
};
#else
MAKE_SYSENT(nlm_syscall);
#endif
static bool_t nlm_syscall_registered = FALSE;
/*
* Debug level passed in from userland. We also support a sysctl hook
* so that it can be changed on a live system.
*/
static int nlm_debug_level;
SYSCTL_INT(_debug, OID_AUTO, nlm_debug, CTLFLAG_RW, &nlm_debug_level, 0, "");
#define NLM_DEBUG(_level, args...) \
do { \
if (nlm_debug_level >= (_level)) \
log(LOG_DEBUG, args); \
} while(0)
#define NLM_ERR(args...) \
do { \
log(LOG_ERR, args); \
} while(0)
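/*
* For example, NLM_DEBUG(2, "NLM: ...\n", ...) only logs when the
* debug.nlm_debug sysctl is set to 2 or higher, while NLM_ERR()
* always logs at LOG_ERR.
*/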
/*
* Grace period handling. The value of nlm_grace_threshold is the
* value of time_uptime after which we are serving requests normally.
*/
static time_t nlm_grace_threshold;
/*
* We check for idle hosts if time_uptime is greater than
* nlm_next_idle_check.
*/
static time_t nlm_next_idle_check;
/*
* A socket to use for RPC - shared by all IPv4 RPC clients.
*/
static struct socket *nlm_socket;
#ifdef INET6
/*
* A socket to use for RPC - shared by all IPv6 RPC clients.
*/
static struct socket *nlm_socket6;
#endif
/*
* An RPC client handle that can be used to communicate with the local
* NSM.
*/
static CLIENT *nlm_nsm;
/*
* An AUTH handle for the server's creds.
*/
static AUTH *nlm_auth;
/*
* A zero timeval for sending async RPC messages.
*/
struct timeval nlm_zero_tv = { 0, 0 };
/*
* The local NSM state number
*/
int nlm_nsm_state;
/*
* A lock to protect the host list and waiting lock list.
*/
static struct mtx nlm_global_lock;
/*
* Locks:
* (l) locked by nh_lock
* (s) only accessed via server RPC which is single threaded
* (g) locked by nlm_global_lock
* (c) const until freeing
* (a) modified using atomic ops
*/
/*
* A pending client-side lock request, stored on the nlm_waiting_locks
* list.
*/
struct nlm_waiting_lock {
TAILQ_ENTRY(nlm_waiting_lock) nw_link; /* (g) */
bool_t nw_waiting; /* (g) */
nlm4_lock nw_lock; /* (c) */
union nfsfh nw_fh; /* (c) */
struct vnode *nw_vp; /* (c) */
};
TAILQ_HEAD(nlm_waiting_lock_list, nlm_waiting_lock);
struct nlm_waiting_lock_list nlm_waiting_locks; /* (g) */
/*
* A pending server-side asynchronous lock request, stored on the
* nh_pending list of the NLM host.
*/
struct nlm_async_lock {
TAILQ_ENTRY(nlm_async_lock) af_link; /* (l) host's list of locks */
struct task af_task; /* (c) async callback details */
void *af_cookie; /* (l) lock manager cancel token */
struct vnode *af_vp; /* (l) vnode to lock */
struct flock af_fl; /* (c) lock details */
struct nlm_host *af_host; /* (c) host which is locking */
CLIENT *af_rpc; /* (c) rpc client to send message */
nlm4_testargs af_granted; /* (c) notification details */
time_t af_expiretime; /* (c) notification time */
};
TAILQ_HEAD(nlm_async_lock_list, nlm_async_lock);
/*
* NLM host.
*/
enum nlm_host_state {
NLM_UNMONITORED,
NLM_MONITORED,
NLM_MONITOR_FAILED,
NLM_RECOVERING
};
struct nlm_rpc {
CLIENT *nr_client; /* (l) RPC client handle */
time_t nr_create_time; /* (l) when client was created */
};
struct nlm_host {
struct mtx nh_lock;
volatile u_int nh_refs; /* (a) reference count */
TAILQ_ENTRY(nlm_host) nh_link; /* (g) global list of hosts */
char nh_caller_name[MAXNAMELEN]; /* (c) printable name of host */
uint32_t nh_sysid; /* (c) our allocated system ID */
char nh_sysid_string[10]; /* (c) string rep. of sysid */
struct sockaddr_storage nh_addr; /* (s) remote address of host */
struct nlm_rpc nh_srvrpc; /* (l) RPC for server replies */
struct nlm_rpc nh_clntrpc; /* (l) RPC for client requests */
rpcvers_t nh_vers; /* (s) NLM version of host */
int nh_state; /* (s) last seen NSM state of host */
enum nlm_host_state nh_monstate; /* (l) local NSM monitoring state */
time_t nh_idle_timeout; /* (s) Time at which host is idle */
struct sysctl_ctx_list nh_sysctl; /* (c) vfs.nlm.sysid nodes */
uint32_t nh_grantcookie; /* (l) grant cookie counter */
struct nlm_async_lock_list nh_pending; /* (l) pending async locks */
struct nlm_async_lock_list nh_granted; /* (l) granted locks */
struct nlm_async_lock_list nh_finished; /* (l) finished async locks */
};
TAILQ_HEAD(nlm_host_list, nlm_host);
static struct nlm_host_list nlm_hosts; /* (g) */
static uint32_t nlm_next_sysid = 1; /* (g) */
static void nlm_host_unmonitor(struct nlm_host *);
struct nlm_grantcookie {
uint32_t ng_sysid;
uint32_t ng_cookie;
};
static inline uint32_t
ng_sysid(struct netobj *src)
{
return ((struct nlm_grantcookie *)src->n_bytes)->ng_sysid;
}
static inline uint32_t
ng_cookie(struct netobj *src)
{
return ((struct nlm_grantcookie *)src->n_bytes)->ng_cookie;
}
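/*
* A grant cookie combines the host's sysid with a per-host counter
* (nh_grantcookie), so granted-lock replies and debug output can be
* traced back to the originating host and request.
*/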
/**********************************************************************/
/*
* Initialise NLM globals.
*/
static void
nlm_init(void *dummy)
{
int error;
mtx_init(&nlm_global_lock, "nlm_global_lock", NULL, MTX_DEF);
TAILQ_INIT(&nlm_waiting_locks);
TAILQ_INIT(&nlm_hosts);
error = syscall_register(&nlm_syscall_offset, &nlm_syscall_sysent,
&nlm_syscall_prev_sysent);
if (error)
NLM_ERR("Can't register NLM syscall\n");
else
nlm_syscall_registered = TRUE;
}
SYSINIT(nlm_init, SI_SUB_LOCK, SI_ORDER_FIRST, nlm_init, NULL);
static void
nlm_uninit(void *dummy)
{
if (nlm_syscall_registered)
syscall_deregister(&nlm_syscall_offset,
&nlm_syscall_prev_sysent);
}
SYSUNINIT(nlm_uninit, SI_SUB_LOCK, SI_ORDER_FIRST, nlm_uninit, NULL);
/*
* Create a netobj from an arbitrary source.
*/
void
nlm_make_netobj(struct netobj *dst, caddr_t src, size_t srcsize,
struct malloc_type *type)
{
dst->n_len = srcsize;
dst->n_bytes = malloc(srcsize, type, M_WAITOK);
memcpy(dst->n_bytes, src, srcsize);
}
/*
* Copy a struct netobj.
*/
void
nlm_copy_netobj(struct netobj *dst, struct netobj *src,
struct malloc_type *type)
{
nlm_make_netobj(dst, src->n_bytes, src->n_len, type);
}
/*
* Create an RPC client handle for the given (address,prog,vers)
* triple using UDP.
*/
static CLIENT *
nlm_get_rpc(struct sockaddr *sa, rpcprog_t prog, rpcvers_t vers)
{
char *wchan = "nlmrcv";
const char* protofmly;
struct sockaddr_storage ss;
struct socket *so;
CLIENT *rpcb;
struct timeval timo;
RPCB parms;
char *uaddr;
enum clnt_stat stat = RPC_SUCCESS;
int rpcvers = RPCBVERS4;
bool_t do_tcp = FALSE;
bool_t tryagain = FALSE;
struct portmap mapping;
u_short port = 0;
/*
* First we need to contact the remote RPCBIND service to find
* the right port.
*/
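/*
* The lookup below falls back from RPCBIND version 4 to version 3 and
* finally to the old portmap protocol; if the UDP service turns out
* not to be registered, the whole query is retried asking for TCP.
*/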
memcpy(&ss, sa, sa->sa_len);
switch (ss.ss_family) {
case AF_INET:
((struct sockaddr_in *)&ss)->sin_port = htons(111);
protofmly = "inet";
so = nlm_socket;
break;
#ifdef INET6
case AF_INET6:
((struct sockaddr_in6 *)&ss)->sin6_port = htons(111);
protofmly = "inet6";
so = nlm_socket6;
break;
#endif
default:
/*
* Unsupported address family - fail.
*/
return (NULL);
}
rpcb = clnt_dg_create(so, (struct sockaddr *)&ss,
RPCBPROG, rpcvers, 0, 0);
if (!rpcb)
return (NULL);
try_tcp:
parms.r_prog = prog;
parms.r_vers = vers;
if (do_tcp)
parms.r_netid = "tcp";
else
parms.r_netid = "udp";
parms.r_addr = "";
parms.r_owner = "";
/*
* Use the default timeout.
*/
timo.tv_sec = 25;
timo.tv_usec = 0;
again:
switch (rpcvers) {
case RPCBVERS4:
case RPCBVERS:
/*
* Try RPCBIND 4 then 3.
*/
uaddr = NULL;
stat = CLNT_CALL(rpcb, (rpcprog_t) RPCBPROC_GETADDR,
(xdrproc_t) xdr_rpcb, &parms,
(xdrproc_t) xdr_wrapstring, &uaddr, timo);
if (stat == RPC_SUCCESS) {
/*
* We have a reply from the remote RPCBIND - turn it
* into an appropriate address and make a new client
* that can talk to the remote NLM.
*
* XXX fixup IPv6 scope ID.
*/
struct netbuf *a;
a = __rpc_uaddr2taddr_af(ss.ss_family, uaddr);
if (!a) {
tryagain = TRUE;
} else {
tryagain = FALSE;
memcpy(&ss, a->buf, a->len);
free(a->buf, M_RPC);
free(a, M_RPC);
xdr_free((xdrproc_t) xdr_wrapstring, &uaddr);
}
}
if (tryagain || stat == RPC_PROGVERSMISMATCH) {
if (rpcvers == RPCBVERS4)
rpcvers = RPCBVERS;
else if (rpcvers == RPCBVERS)
rpcvers = PMAPVERS;
CLNT_CONTROL(rpcb, CLSET_VERS, &rpcvers);
goto again;
}
break;
case PMAPVERS:
/*
* Try portmap.
*/
mapping.pm_prog = parms.r_prog;
mapping.pm_vers = parms.r_vers;
mapping.pm_prot = do_tcp ? IPPROTO_TCP : IPPROTO_UDP;
mapping.pm_port = 0;
stat = CLNT_CALL(rpcb, (rpcprog_t) PMAPPROC_GETPORT,
(xdrproc_t) xdr_portmap, &mapping,
(xdrproc_t) xdr_u_short, &port, timo);
if (stat == RPC_SUCCESS) {
switch (ss.ss_family) {
case AF_INET:
((struct sockaddr_in *)&ss)->sin_port =
htons(port);
break;
#ifdef INET6
case AF_INET6:
((struct sockaddr_in6 *)&ss)->sin6_port =
htons(port);
break;
#endif
}
}
break;
default:
panic("invalid rpcvers %d", rpcvers);
}
/*
* We may have a positive response from the portmapper, but the NLM
* service was not found. Make sure we received a valid port.
*/
switch (ss.ss_family) {
case AF_INET:
port = ((struct sockaddr_in *)&ss)->sin_port;
break;
#ifdef INET6
case AF_INET6:
port = ((struct sockaddr_in6 *)&ss)->sin6_port;
break;
#endif
}
if (stat != RPC_SUCCESS || !port) {
/*
* If we were able to talk to rpcbind or portmap, but the udp
* variant wasn't available, ask about tcp.
*
* XXX - We could also check for a TCP portmapper, but
* if the host is running a portmapper at all, we should be able
* to hail it over UDP.
*/
if (stat == RPC_SUCCESS && !do_tcp) {
do_tcp = TRUE;
goto try_tcp;
}
/* Otherwise, bad news. */
NLM_ERR("NLM: failed to contact remote rpcbind, "
"stat = %d, port = %d\n", (int) stat, port);
CLNT_DESTROY(rpcb);
return (NULL);
}
if (do_tcp) {
/*
* Destroy the UDP client we used to speak to rpcbind and
* recreate as a TCP client.
*/
struct netconfig *nconf = NULL;
CLNT_DESTROY(rpcb);
switch (ss.ss_family) {
case AF_INET:
nconf = getnetconfigent("tcp");
break;
#ifdef INET6
case AF_INET6:
nconf = getnetconfigent("tcp6");
break;
#endif
}
rpcb = clnt_reconnect_create(nconf, (struct sockaddr *)&ss,
prog, vers, 0, 0);
CLNT_CONTROL(rpcb, CLSET_WAITCHAN, wchan);
rpcb->cl_auth = nlm_auth;
} else {
/*
* Re-use the client we used to speak to rpcbind.
*/
CLNT_CONTROL(rpcb, CLSET_SVC_ADDR, &ss);
CLNT_CONTROL(rpcb, CLSET_PROG, &prog);
CLNT_CONTROL(rpcb, CLSET_VERS, &vers);
CLNT_CONTROL(rpcb, CLSET_WAITCHAN, wchan);
rpcb->cl_auth = nlm_auth;
}
return (rpcb);
}
/*
* This async callback runs after an async lock request has been
* granted. We notify the host which initiated the request.
*/
static void
nlm_lock_callback(void *arg, int pending)
{
struct nlm_async_lock *af = (struct nlm_async_lock *) arg;
struct rpc_callextra ext;
NLM_DEBUG(2, "NLM: async lock %p for %s (sysid %d) granted,"
" cookie %d:%d\n", af, af->af_host->nh_caller_name,
af->af_host->nh_sysid, ng_sysid(&af->af_granted.cookie),
ng_cookie(&af->af_granted.cookie));
/*
* Send the results back to the host.
*
* Note: there is a possible race here with nlm_host_notify
* destroying the RPC client. To avoid problems, the first
* thing nlm_host_notify does is to cancel pending async lock
* requests.
*/
memset(&ext, 0, sizeof(ext));
ext.rc_auth = nlm_auth;
if (af->af_host->nh_vers == NLM_VERS4) {
nlm4_granted_msg_4(&af->af_granted,
NULL, af->af_rpc, &ext, nlm_zero_tv);
} else {
/*
* Back-convert to legacy protocol
*/
nlm_testargs granted;
granted.cookie = af->af_granted.cookie;
granted.exclusive = af->af_granted.exclusive;
granted.alock.caller_name =
af->af_granted.alock.caller_name;
granted.alock.fh = af->af_granted.alock.fh;
granted.alock.oh = af->af_granted.alock.oh;
granted.alock.svid = af->af_granted.alock.svid;
granted.alock.l_offset =
af->af_granted.alock.l_offset;
granted.alock.l_len =
af->af_granted.alock.l_len;
nlm_granted_msg_1(&granted,
NULL, af->af_rpc, &ext, nlm_zero_tv);
}
/*
* Move this entry to the nh_granted list.
*/
af->af_expiretime = time_uptime + NLM_EXPIRE_TIMEOUT;
mtx_lock(&af->af_host->nh_lock);
TAILQ_REMOVE(&af->af_host->nh_pending, af, af_link);
TAILQ_INSERT_TAIL(&af->af_host->nh_granted, af, af_link);
mtx_unlock(&af->af_host->nh_lock);
}
/*
* Free an async lock request. The request must have been removed from
* any list.
*/
static void
nlm_free_async_lock(struct nlm_async_lock *af)
{
/*
* Free an async lock.
*/
if (af->af_rpc)
CLNT_RELEASE(af->af_rpc);
xdr_free((xdrproc_t) xdr_nlm4_testargs, &af->af_granted);
if (af->af_vp)
vrele(af->af_vp);
free(af, M_NLM);
}
/*
* Cancel our async request - this must be called with
* af->nh_host->nh_lock held. This is slightly complicated by a
* potential race with our own callback. If we fail to cancel the
* lock, it must already have been granted - we make sure our async
* task has completed by calling taskqueue_drain in this case.
*/
static int
nlm_cancel_async_lock(struct nlm_async_lock *af)
{
struct nlm_host *host = af->af_host;
int error;
mtx_assert(&host->nh_lock, MA_OWNED);
mtx_unlock(&host->nh_lock);
error = VOP_ADVLOCKASYNC(af->af_vp, NULL, F_CANCEL, &af->af_fl,
F_REMOTE, NULL, &af->af_cookie);
if (error) {
/*
* We failed to cancel - make sure our callback has
* completed before we continue.
*/
taskqueue_drain(taskqueue_thread, &af->af_task);
}
mtx_lock(&host->nh_lock);
if (!error) {
NLM_DEBUG(2, "NLM: async lock %p for %s (sysid %d) "
"cancelled\n", af, host->nh_caller_name, host->nh_sysid);
/*
* Remove from the nh_pending list and free now that
* we are safe from the callback.
*/
TAILQ_REMOVE(&host->nh_pending, af, af_link);
mtx_unlock(&host->nh_lock);
nlm_free_async_lock(af);
mtx_lock(&host->nh_lock);
}
return (error);
}
static void
nlm_check_expired_locks(struct nlm_host *host)
{
struct nlm_async_lock *af;
time_t uptime = time_uptime;
mtx_lock(&host->nh_lock);
while ((af = TAILQ_FIRST(&host->nh_granted)) != NULL
&& uptime >= af->af_expiretime) {
NLM_DEBUG(2, "NLM: async lock %p for %s (sysid %d) expired,"
" cookie %d:%d\n", af, af->af_host->nh_caller_name,
af->af_host->nh_sysid, ng_sysid(&af->af_granted.cookie),
ng_cookie(&af->af_granted.cookie));
TAILQ_REMOVE(&host->nh_granted, af, af_link);
mtx_unlock(&host->nh_lock);
nlm_free_async_lock(af);
mtx_lock(&host->nh_lock);
}
while ((af = TAILQ_FIRST(&host->nh_finished)) != NULL) {
TAILQ_REMOVE(&host->nh_finished, af, af_link);
mtx_unlock(&host->nh_lock);
nlm_free_async_lock(af);
mtx_lock(&host->nh_lock);
}
mtx_unlock(&host->nh_lock);
}
/*
* Free resources used by a host. This is called after the reference
* count has reached zero so it doesn't need to worry about locks.
*/
static void
nlm_host_destroy(struct nlm_host *host)
{
mtx_lock(&nlm_global_lock);
TAILQ_REMOVE(&nlm_hosts, host, nh_link);
mtx_unlock(&nlm_global_lock);
if (host->nh_srvrpc.nr_client)
CLNT_RELEASE(host->nh_srvrpc.nr_client);
if (host->nh_clntrpc.nr_client)
CLNT_RELEASE(host->nh_clntrpc.nr_client);
mtx_destroy(&host->nh_lock);
sysctl_ctx_free(&host->nh_sysctl);
free(host, M_NLM);
}
/*
* Thread start callback for client lock recovery
*/
static void
nlm_client_recovery_start(void *arg)
{
struct nlm_host *host = (struct nlm_host *) arg;
NLM_DEBUG(1, "NLM: client lock recovery for %s started\n",
host->nh_caller_name);
nlm_client_recovery(host);
NLM_DEBUG(1, "NLM: client lock recovery for %s completed\n",
host->nh_caller_name);
host->nh_monstate = NLM_MONITORED;
nlm_host_release(host);
kthread_exit();
}
/*
* This is called when we receive a host state change notification. We
* unlock any active locks owned by the host. When rpc.lockd is
* shutting down, this function is called with newstate set to zero
* which allows us to cancel any pending async locks and clear the
* locking state.
*/
static void
nlm_host_notify(struct nlm_host *host, int newstate)
{
struct nlm_async_lock *af;
if (newstate) {
NLM_DEBUG(1, "NLM: host %s (sysid %d) rebooted, new "
"state is %d\n", host->nh_caller_name,
host->nh_sysid, newstate);
}
/*
* Cancel any pending async locks for this host.
*/
mtx_lock(&host->nh_lock);
while ((af = TAILQ_FIRST(&host->nh_pending)) != NULL) {
/*
* nlm_cancel_async_lock will remove the entry from
* nh_pending and free it.
*/
nlm_cancel_async_lock(af);
}
mtx_unlock(&host->nh_lock);
nlm_check_expired_locks(host);
/*
* The host just rebooted - trash its locks.
*/
lf_clearremotesys(host->nh_sysid);
host->nh_state = newstate;
/*
* If we have any remote locks for this host (i.e. it
* represents a remote NFS server that our local NFS client
* has locks for), start a recovery thread.
*/
if (newstate != 0
&& host->nh_monstate != NLM_RECOVERING
&& lf_countlocks(NLM_SYSID_CLIENT | host->nh_sysid) > 0) {
struct thread *td;
host->nh_monstate = NLM_RECOVERING;
refcount_acquire(&host->nh_refs);
kthread_add(nlm_client_recovery_start, host, curproc, &td, 0, 0,
"NFS lock recovery for %s", host->nh_caller_name);
}
}
/*
* Sysctl handler to count the number of locks for a sysid.
*/
static int
nlm_host_lock_count_sysctl(SYSCTL_HANDLER_ARGS)
{
struct nlm_host *host;
int count;
host = oidp->oid_arg1;
count = lf_countlocks(host->nh_sysid);
return sysctl_handle_int(oidp, &count, 0, req);
}
/*
* Sysctl handler to count the number of client locks for a sysid.
*/
static int
nlm_host_client_lock_count_sysctl(SYSCTL_HANDLER_ARGS)
{
struct nlm_host *host;
int count;
host = oidp->oid_arg1;
count = lf_countlocks(NLM_SYSID_CLIENT | host->nh_sysid);
return sysctl_handle_int(oidp, &count, 0, req);
}
/*
* Create a new NLM host.
*/
static struct nlm_host *
nlm_create_host(const char* caller_name)
{
struct nlm_host *host;
struct sysctl_oid *oid;
mtx_assert(&nlm_global_lock, MA_OWNED);
NLM_DEBUG(1, "NLM: new host %s (sysid %d)\n",
caller_name, nlm_next_sysid);
host = malloc(sizeof(struct nlm_host), M_NLM, M_NOWAIT|M_ZERO);
if (!host)
return (NULL);
mtx_init(&host->nh_lock, "nh_lock", NULL, MTX_DEF);
host->nh_refs = 1;
strlcpy(host->nh_caller_name, caller_name, MAXNAMELEN);
host->nh_sysid = nlm_next_sysid++;
snprintf(host->nh_sysid_string, sizeof(host->nh_sysid_string),
"%d", host->nh_sysid);
host->nh_vers = 0;
host->nh_state = 0;
host->nh_monstate = NLM_UNMONITORED;
host->nh_grantcookie = 1;
TAILQ_INIT(&host->nh_pending);
TAILQ_INIT(&host->nh_granted);
TAILQ_INIT(&host->nh_finished);
TAILQ_INSERT_TAIL(&nlm_hosts, host, nh_link);
mtx_unlock(&nlm_global_lock);
sysctl_ctx_init(&host->nh_sysctl);
oid = SYSCTL_ADD_NODE(&host->nh_sysctl,
SYSCTL_STATIC_CHILDREN(_vfs_nlm_sysid),
OID_AUTO, host->nh_sysid_string, CTLFLAG_RD, NULL, "");
SYSCTL_ADD_STRING(&host->nh_sysctl, SYSCTL_CHILDREN(oid), OID_AUTO,
"hostname", CTLFLAG_RD, host->nh_caller_name, 0, "");
SYSCTL_ADD_UINT(&host->nh_sysctl, SYSCTL_CHILDREN(oid), OID_AUTO,
"version", CTLFLAG_RD, &host->nh_vers, 0, "");
SYSCTL_ADD_UINT(&host->nh_sysctl, SYSCTL_CHILDREN(oid), OID_AUTO,
"monitored", CTLFLAG_RD, &host->nh_monstate, 0, "");
SYSCTL_ADD_PROC(&host->nh_sysctl, SYSCTL_CHILDREN(oid), OID_AUTO,
"lock_count", CTLTYPE_INT | CTLFLAG_RD, host, 0,
nlm_host_lock_count_sysctl, "I", "");
SYSCTL_ADD_PROC(&host->nh_sysctl, SYSCTL_CHILDREN(oid), OID_AUTO,
"client_lock_count", CTLTYPE_INT | CTLFLAG_RD, host, 0,
nlm_host_client_lock_count_sysctl, "I", "");
mtx_lock(&nlm_global_lock);
return (host);
}
/*
* Acquire the next sysid for remote locks not handled by the NLM.
*/
uint32_t
nlm_acquire_next_sysid(void)
{
uint32_t next_sysid;
mtx_lock(&nlm_global_lock);
next_sysid = nlm_next_sysid++;
mtx_unlock(&nlm_global_lock);
return (next_sysid);
}
/*
* Return non-zero if the address parts of the two sockaddrs are the
* same.
*/
static int
nlm_compare_addr(const struct sockaddr *a, const struct sockaddr *b)
{
const struct sockaddr_in *a4, *b4;
#ifdef INET6
const struct sockaddr_in6 *a6, *b6;
#endif
if (a->sa_family != b->sa_family)
return (FALSE);
switch (a->sa_family) {
case AF_INET:
a4 = (const struct sockaddr_in *) a;
b4 = (const struct sockaddr_in *) b;
return !memcmp(&a4->sin_addr, &b4->sin_addr,
sizeof(a4->sin_addr));
#ifdef INET6
case AF_INET6:
a6 = (const struct sockaddr_in6 *) a;
b6 = (const struct sockaddr_in6 *) b;
return !memcmp(&a6->sin6_addr, &b6->sin6_addr,
sizeof(a6->sin6_addr));
#endif
}
return (0);
}
/*
* Check for idle hosts and stop monitoring them. We could also free
* the host structure here, possibly after a larger timeout but that
* would require some care to avoid races with
* e.g. nlm_host_lock_count_sysctl.
*/
static void
nlm_check_idle(void)
{
struct nlm_host *host;
mtx_assert(&nlm_global_lock, MA_OWNED);
if (time_uptime <= nlm_next_idle_check)
return;
nlm_next_idle_check = time_uptime + NLM_IDLE_PERIOD;
TAILQ_FOREACH(host, &nlm_hosts, nh_link) {
if (host->nh_monstate == NLM_MONITORED
&& time_uptime > host->nh_idle_timeout) {
mtx_unlock(&nlm_global_lock);
if (lf_countlocks(host->nh_sysid) > 0
|| lf_countlocks(NLM_SYSID_CLIENT
+ host->nh_sysid)) {
host->nh_idle_timeout =
time_uptime + NLM_IDLE_TIMEOUT;
mtx_lock(&nlm_global_lock);
continue;
}
nlm_host_unmonitor(host);
mtx_lock(&nlm_global_lock);
}
}
}
/*
* Search for an existing NLM host that matches the given name
* (typically the caller_name element of an nlm4_lock). If none is
* found, create a new host. If 'addr' is non-NULL, record the remote
* address of the host so that we can call it back for async
* responses. If 'vers' is greater than zero then record the NLM
* program version to use to communicate with this client.
*/
struct nlm_host *
nlm_find_host_by_name(const char *name, const struct sockaddr *addr,
rpcvers_t vers)
{
struct nlm_host *host;
mtx_lock(&nlm_global_lock);
/*
* The remote host is determined by caller_name.
*/
TAILQ_FOREACH(host, &nlm_hosts, nh_link) {
if (!strcmp(host->nh_caller_name, name))
break;
}
if (!host) {
host = nlm_create_host(name);
if (!host) {
mtx_unlock(&nlm_global_lock);
return (NULL);
}
}
refcount_acquire(&host->nh_refs);
host->nh_idle_timeout = time_uptime + NLM_IDLE_TIMEOUT;
/*
* If we have an address for the host, record it so that we
* can send async replies etc.
*/
if (addr) {
KASSERT(addr->sa_len < sizeof(struct sockaddr_storage),
("Strange remote transport address length"));
/*
* If we have seen an address before and we currently
* have an RPC client handle, make sure the address is
* the same, otherwise discard the client handle.
*/
if (host->nh_addr.ss_len && host->nh_srvrpc.nr_client) {
if (!nlm_compare_addr(
(struct sockaddr *) &host->nh_addr,
addr)
|| host->nh_vers != vers) {
CLIENT *client;
mtx_lock(&host->nh_lock);
client = host->nh_srvrpc.nr_client;
host->nh_srvrpc.nr_client = NULL;
mtx_unlock(&host->nh_lock);
if (client) {
CLNT_RELEASE(client);
}
}
}
memcpy(&host->nh_addr, addr, addr->sa_len);
host->nh_vers = vers;
}
nlm_check_idle();
mtx_unlock(&nlm_global_lock);
return (host);
}
/*
* Search for an existing NLM host that matches the given remote
* address. If none is found, create a new host with the requested
* address and remember 'vers' as the NLM protocol version to use for
* that host.
*/
struct nlm_host *
nlm_find_host_by_addr(const struct sockaddr *addr, int vers)
{
/*
* Fake up a name using inet_ntop. This buffer is
* large enough for an IPv6 address.
*/
char tmp[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255"];
struct nlm_host *host;
switch (addr->sa_family) {
case AF_INET:
inet_ntop(AF_INET,
&((const struct sockaddr_in *) addr)->sin_addr,
tmp, sizeof tmp);
break;
#ifdef INET6
case AF_INET6:
inet_ntop(AF_INET6,
&((const struct sockaddr_in6 *) addr)->sin6_addr,
tmp, sizeof tmp);
break;
#endif
default:
strcmp(tmp, "<unknown>");
}
mtx_lock(&nlm_global_lock);
/*
* The remote host is determined by caller_name.
*/
TAILQ_FOREACH(host, &nlm_hosts, nh_link) {
if (nlm_compare_addr(addr,
(const struct sockaddr *) &host->nh_addr))
break;
}
if (!host) {
host = nlm_create_host(tmp);
if (!host) {
mtx_unlock(&nlm_global_lock);
return (NULL);
}
memcpy(&host->nh_addr, addr, addr->sa_len);
host->nh_vers = vers;
}
refcount_acquire(&host->nh_refs);
host->nh_idle_timeout = time_uptime + NLM_IDLE_TIMEOUT;
nlm_check_idle();
mtx_unlock(&nlm_global_lock);
return (host);
}
/*
* Find the NLM host that matches the value of 'sysid'. If none
* exists, return NULL.
*/
static struct nlm_host *
nlm_find_host_by_sysid(int sysid)
{
struct nlm_host *host;
TAILQ_FOREACH(host, &nlm_hosts, nh_link) {
if (host->nh_sysid == sysid) {
refcount_acquire(&host->nh_refs);
return (host);
}
}
return (NULL);
}
void nlm_host_release(struct nlm_host *host)
{
if (refcount_release(&host->nh_refs)) {
/*
* Free the host
*/
nlm_host_destroy(host);
}
}
/*
* Unregister this NLM host with the local NSM due to idleness.
*/
static void
nlm_host_unmonitor(struct nlm_host *host)
{
mon_id smmonid;
sm_stat_res smstat;
struct timeval timo;
enum clnt_stat stat;
NLM_DEBUG(1, "NLM: unmonitoring %s (sysid %d)\n",
host->nh_caller_name, host->nh_sysid);
/*
* Build a mon_id matching the one used when monitoring this host
* so that the local NSM can find and remove its entry.
*/
smmonid.mon_name = host->nh_caller_name;
smmonid.my_id.my_name = "localhost";
smmonid.my_id.my_prog = NLM_PROG;
smmonid.my_id.my_vers = NLM_SM;
smmonid.my_id.my_proc = NLM_SM_NOTIFY;
timo.tv_sec = 25;
timo.tv_usec = 0;
stat = CLNT_CALL(nlm_nsm, SM_UNMON,
(xdrproc_t) xdr_mon, &smmonid,
(xdrproc_t) xdr_sm_stat, &smstat, timo);
if (stat != RPC_SUCCESS) {
NLM_ERR("Failed to contact local NSM - rpc error %d\n", stat);
return;
}
if (smstat.res_stat == stat_fail) {
NLM_ERR("Local NSM refuses to unmonitor %s\n",
host->nh_caller_name);
return;
}
host->nh_monstate = NLM_UNMONITORED;
}
/*
* Register this NLM host with the local NSM so that we can be
* notified if it reboots.
*/
void
nlm_host_monitor(struct nlm_host *host, int state)
{
mon smmon;
sm_stat_res smstat;
struct timeval timo;
enum clnt_stat stat;
if (state && !host->nh_state) {
/*
* This is the first time we have seen an NSM state
* value for this host. We record it here to help
* detect host reboots.
*/
host->nh_state = state;
NLM_DEBUG(1, "NLM: host %s (sysid %d) has NSM state %d\n",
host->nh_caller_name, host->nh_sysid, state);
}
mtx_lock(&host->nh_lock);
if (host->nh_monstate != NLM_UNMONITORED) {
mtx_unlock(&host->nh_lock);
return;
}
host->nh_monstate = NLM_MONITORED;
mtx_unlock(&host->nh_lock);
NLM_DEBUG(1, "NLM: monitoring %s (sysid %d)\n",
host->nh_caller_name, host->nh_sysid);
/*
* We put our assigned system ID value in the priv field to
* make it simpler to find the host if we are notified of a
* host restart.
*/
smmon.mon_id.mon_name = host->nh_caller_name;
smmon.mon_id.my_id.my_name = "localhost";
smmon.mon_id.my_id.my_prog = NLM_PROG;
smmon.mon_id.my_id.my_vers = NLM_SM;
smmon.mon_id.my_id.my_proc = NLM_SM_NOTIFY;
memcpy(smmon.priv, &host->nh_sysid, sizeof(host->nh_sysid));
timo.tv_sec = 25;
timo.tv_usec = 0;
stat = CLNT_CALL(nlm_nsm, SM_MON,
(xdrproc_t) xdr_mon, &smmon,
(xdrproc_t) xdr_sm_stat, &smstat, timo);
if (stat != RPC_SUCCESS) {
NLM_ERR("Failed to contact local NSM - rpc error %d\n", stat);
return;
}
if (smstat.res_stat == stat_fail) {
NLM_ERR("Local NSM refuses to monitor %s\n",
host->nh_caller_name);
mtx_lock(&host->nh_lock);
host->nh_monstate = NLM_MONITOR_FAILED;
mtx_unlock(&host->nh_lock);
return;
}
host->nh_monstate = NLM_MONITORED;
}
/*
* Return an RPC client handle that can be used to talk to the NLM
* running on the given host.
*/
CLIENT *
nlm_host_get_rpc(struct nlm_host *host, bool_t isserver)
{
struct nlm_rpc *rpc;
CLIENT *client;
mtx_lock(&host->nh_lock);
if (isserver)
rpc = &host->nh_srvrpc;
else
rpc = &host->nh_clntrpc;
/*
* We can't hold onto RPC handles for too long - the async
* call/reply protocol used by some NLM clients makes it hard
* to tell when they change port numbers (e.g. after a
* reboot). Note that if a client reboots while it isn't
* holding any locks, it won't bother to notify us. We
* expire the RPC handles after two minutes.
*/
if (rpc->nr_client && time_uptime > rpc->nr_create_time + 2*60) {
client = rpc->nr_client;
rpc->nr_client = NULL;
mtx_unlock(&host->nh_lock);
CLNT_RELEASE(client);
mtx_lock(&host->nh_lock);
}
if (!rpc->nr_client) {
mtx_unlock(&host->nh_lock);
client = nlm_get_rpc((struct sockaddr *)&host->nh_addr,
NLM_PROG, host->nh_vers);
mtx_lock(&host->nh_lock);
if (client) {
if (rpc->nr_client) {
mtx_unlock(&host->nh_lock);
CLNT_DESTROY(client);
mtx_lock(&host->nh_lock);
} else {
rpc->nr_client = client;
rpc->nr_create_time = time_uptime;
}
}
}
client = rpc->nr_client;
if (client)
CLNT_ACQUIRE(client);
mtx_unlock(&host->nh_lock);
return (client);
}
int nlm_host_get_sysid(struct nlm_host *host)
{
return (host->nh_sysid);
}
int
nlm_host_get_state(struct nlm_host *host)
{
return (host->nh_state);
}
void *
nlm_register_wait_lock(struct nlm4_lock *lock, struct vnode *vp)
{
struct nlm_waiting_lock *nw;
nw = malloc(sizeof(struct nlm_waiting_lock), M_NLM, M_WAITOK);
nw->nw_lock = *lock;
memcpy(&nw->nw_fh.fh_bytes, nw->nw_lock.fh.n_bytes,
nw->nw_lock.fh.n_len);
nw->nw_lock.fh.n_bytes = nw->nw_fh.fh_bytes;
nw->nw_waiting = TRUE;
nw->nw_vp = vp;
mtx_lock(&nlm_global_lock);
TAILQ_INSERT_TAIL(&nlm_waiting_locks, nw, nw_link);
mtx_unlock(&nlm_global_lock);
return nw;
}
void
nlm_deregister_wait_lock(void *handle)
{
struct nlm_waiting_lock *nw = handle;
mtx_lock(&nlm_global_lock);
TAILQ_REMOVE(&nlm_waiting_locks, nw, nw_link);
mtx_unlock(&nlm_global_lock);
free(nw, M_NLM);
}
int
nlm_wait_lock(void *handle, int timo)
{
struct nlm_waiting_lock *nw = handle;
int error;
/*
* If the granted message arrived before we got here,
* nw->nw_waiting will be FALSE - in that case, don't sleep.
*/
mtx_lock(&nlm_global_lock);
error = 0;
if (nw->nw_waiting)
error = msleep(nw, &nlm_global_lock, PCATCH, "nlmlock", timo);
TAILQ_REMOVE(&nlm_waiting_locks, nw, nw_link);
if (error) {
/*
* The granted message may arrive after the
* interrupt/timeout but before we manage to lock the
* mutex. Detect this by examining nw_waiting.
*/
if (!nw->nw_waiting)
error = 0;
} else {
/*
* If nlm_cancel_wait is called, then error will be
* zero but nw_waiting will still be TRUE. We
* translate this into EINTR.
*/
if (nw->nw_waiting)
error = EINTR;
}
mtx_unlock(&nlm_global_lock);
free(nw, M_NLM);
return (error);
}
void
nlm_cancel_wait(struct vnode *vp)
{
struct nlm_waiting_lock *nw;
mtx_lock(&nlm_global_lock);
TAILQ_FOREACH(nw, &nlm_waiting_locks, nw_link) {
if (nw->nw_vp == vp) {
wakeup(nw);
}
}
mtx_unlock(&nlm_global_lock);
}
/**********************************************************************/
/*
* Syscall interface with userland.
*/
extern void nlm_prog_0(struct svc_req *rqstp, SVCXPRT *transp);
extern void nlm_prog_1(struct svc_req *rqstp, SVCXPRT *transp);
extern void nlm_prog_3(struct svc_req *rqstp, SVCXPRT *transp);
extern void nlm_prog_4(struct svc_req *rqstp, SVCXPRT *transp);
static int
nlm_register_services(SVCPOOL *pool, int addr_count, char **addrs)
{
static rpcvers_t versions[] = {
NLM_SM, NLM_VERS, NLM_VERSX, NLM_VERS4
};
static void (*dispatchers[])(struct svc_req *, SVCXPRT *) = {
nlm_prog_0, nlm_prog_1, nlm_prog_3, nlm_prog_4
};
static const int version_count = sizeof(versions) / sizeof(versions[0]);
SVCXPRT **xprts;
char netid[16];
char uaddr[128];
struct netconfig *nconf;
int i, j, error;
if (!addr_count) {
NLM_ERR("NLM: no service addresses given - can't start server");
return (EINVAL);
}
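/*
* The userland address list is laid out as consecutive pairs of
* (netid, uaddr) string pointers, copied in below during the first
* pass over the NLM versions.
*/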
xprts = malloc(addr_count * sizeof(SVCXPRT *), M_NLM, M_WAITOK|M_ZERO);
for (i = 0; i < version_count; i++) {
for (j = 0; j < addr_count; j++) {
/*
* Create transports for the first version and
* then just register everything else to the
* same transports.
*/
if (i == 0) {
char *up;
error = copyin(&addrs[2*j], &up,
sizeof(char*));
if (error)
goto out;
error = copyinstr(up, netid, sizeof(netid),
NULL);
if (error)
goto out;
error = copyin(&addrs[2*j+1], &up,
sizeof(char*));
if (error)
goto out;
error = copyinstr(up, uaddr, sizeof(uaddr),
NULL);
if (error)
goto out;
nconf = getnetconfigent(netid);
if (!nconf) {
NLM_ERR("Can't lookup netid %s\n",
netid);
error = EINVAL;
goto out;
}
xprts[j] = svc_tp_create(pool, dispatchers[i],
NLM_PROG, versions[i], uaddr, nconf);
if (!xprts[j]) {
NLM_ERR("NLM: unable to create "
"(NLM_PROG, %d).\n", versions[i]);
error = EINVAL;
goto out;
}
freenetconfigent(nconf);
} else {
nconf = getnetconfigent(xprts[j]->xp_netid);
rpcb_unset(NLM_PROG, versions[i], nconf);
if (!svc_reg(xprts[j], NLM_PROG, versions[i],
dispatchers[i], nconf)) {
NLM_ERR("NLM: can't register "
"(NLM_PROG, %d)\n", versions[i]);
error = EINVAL;
goto out;
}
}
}
}
error = 0;
out:
for (j = 0; j < addr_count; j++) {
if (xprts[j])
SVC_RELEASE(xprts[j]);
}
free(xprts, M_NLM);
return (error);
}
/*
* Main server entry point. Contacts the local NSM to get its current
* state and send SM_UNMON_ALL. Registers the NLM services and then
* services requests. Does not return until the server is interrupted
* by a signal.
*/
static int
nlm_server_main(int addr_count, char **addrs)
{
struct thread *td = curthread;
int error;
SVCPOOL *pool = NULL;
struct sockopt opt;
int portlow;
#ifdef INET6
struct sockaddr_in6 sin6;
#endif
struct sockaddr_in sin;
my_id id;
sm_stat smstat;
struct timeval timo;
enum clnt_stat stat;
struct nlm_host *host, *nhost;
struct nlm_waiting_lock *nw;
vop_advlock_t *old_nfs_advlock;
vop_reclaim_t *old_nfs_reclaim;
int v4_used;
#ifdef INET6
int v6_used;
#endif
if (nlm_socket) {
NLM_ERR("NLM: can't start server - "
"it appears to be running already\n");
return (EPERM);
}
memset(&opt, 0, sizeof(opt));
nlm_socket = NULL;
error = socreate(AF_INET, &nlm_socket, SOCK_DGRAM, 0,
td->td_ucred, td);
if (error) {
NLM_ERR("NLM: can't create IPv4 socket - error %d\n", error);
return (error);
}
opt.sopt_dir = SOPT_SET;
opt.sopt_level = IPPROTO_IP;
opt.sopt_name = IP_PORTRANGE;
portlow = IP_PORTRANGE_LOW;
opt.sopt_val = &portlow;
opt.sopt_valsize = sizeof(portlow);
sosetopt(nlm_socket, &opt);
#ifdef INET6
nlm_socket6 = NULL;
error = socreate(AF_INET6, &nlm_socket6, SOCK_DGRAM, 0,
td->td_ucred, td);
if (error) {
NLM_ERR("NLM: can't create IPv6 socket - error %d\n", error);
goto out;
}
opt.sopt_dir = SOPT_SET;
opt.sopt_level = IPPROTO_IPV6;
opt.sopt_name = IPV6_PORTRANGE;
portlow = IPV6_PORTRANGE_LOW;
opt.sopt_val = &portlow;
opt.sopt_valsize = sizeof(portlow);
sosetopt(nlm_socket6, &opt);
#endif
nlm_auth = authunix_create(curthread->td_ucred);
#ifdef INET6
memset(&sin6, 0, sizeof(sin6));
sin6.sin6_len = sizeof(sin6);
sin6.sin6_family = AF_INET6;
sin6.sin6_addr = in6addr_loopback;
nlm_nsm = nlm_get_rpc((struct sockaddr *) &sin6, SM_PROG, SM_VERS);
if (!nlm_nsm) {
#endif
memset(&sin, 0, sizeof(sin));
sin.sin_len = sizeof(sin);
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
nlm_nsm = nlm_get_rpc((struct sockaddr *) &sin, SM_PROG,
SM_VERS);
#ifdef INET6
}
#endif
if (!nlm_nsm) {
NLM_ERR("Can't start NLM - unable to contact NSM\n");
error = EINVAL;
goto out;
}
pool = svcpool_create("NLM", NULL);
error = nlm_register_services(pool, addr_count, addrs);
if (error)
goto out;
memset(&id, 0, sizeof(id));
id.my_name = "NFS NLM";
timo.tv_sec = 25;
timo.tv_usec = 0;
stat = CLNT_CALL(nlm_nsm, SM_UNMON_ALL,
(xdrproc_t) xdr_my_id, &id,
(xdrproc_t) xdr_sm_stat, &smstat, timo);
if (stat != RPC_SUCCESS) {
struct rpc_err err;
CLNT_GETERR(nlm_nsm, &err);
NLM_ERR("NLM: unexpected error contacting NSM, "
"stat=%d, errno=%d\n", stat, err.re_errno);
error = EINVAL;
goto out;
}
NLM_DEBUG(1, "NLM: local NSM state is %d\n", smstat.state);
nlm_nsm_state = smstat.state;
old_nfs_advlock = nfs_advlock_p;
nfs_advlock_p = nlm_advlock;
old_nfs_reclaim = nfs_reclaim_p;
nfs_reclaim_p = nlm_reclaim;
svc_run(pool);
error = 0;
nfs_advlock_p = old_nfs_advlock;
nfs_reclaim_p = old_nfs_reclaim;
out:
if (pool)
svcpool_destroy(pool);
/*
* We are finished communicating with the NSM.
*/
if (nlm_nsm) {
CLNT_RELEASE(nlm_nsm);
nlm_nsm = NULL;
}
/*
* Trash all the existing state so that if the server
* restarts, it gets a clean slate. This is complicated by the
* possibility that there may be other threads trying to make
* client locking requests.
*
* First we fake a client reboot notification which will
* cancel any pending async locks and purge remote lock state
* from the local lock manager. We release the reference from
* nlm_hosts to the host (which may remove it from the list
* and free it). After this phase, the only entries in the
* nlm_host list should be from other threads performing
* client lock requests. We arrange to defer closing the
* sockets until the last RPC client handle is released.
*/
v4_used = 0;
#ifdef INET6
v6_used = 0;
#endif
mtx_lock(&nlm_global_lock);
TAILQ_FOREACH(nw, &nlm_waiting_locks, nw_link) {
wakeup(nw);
}
TAILQ_FOREACH_SAFE(host, &nlm_hosts, nh_link, nhost) {
mtx_unlock(&nlm_global_lock);
nlm_host_notify(host, 0);
nlm_host_release(host);
mtx_lock(&nlm_global_lock);
}
TAILQ_FOREACH_SAFE(host, &nlm_hosts, nh_link, nhost) {
mtx_lock(&host->nh_lock);
if (host->nh_srvrpc.nr_client
|| host->nh_clntrpc.nr_client) {
if (host->nh_addr.ss_family == AF_INET)
v4_used++;
#ifdef INET6
if (host->nh_addr.ss_family == AF_INET6)
v6_used++;
#endif
/*
* Note that the rpc over udp code copes
* correctly with the fact that a socket may
* be used by many rpc handles.
*/
if (host->nh_srvrpc.nr_client)
CLNT_CONTROL(host->nh_srvrpc.nr_client,
CLSET_FD_CLOSE, 0);
if (host->nh_clntrpc.nr_client)
CLNT_CONTROL(host->nh_clntrpc.nr_client,
CLSET_FD_CLOSE, 0);
}
mtx_unlock(&host->nh_lock);
}
mtx_unlock(&nlm_global_lock);
AUTH_DESTROY(nlm_auth);
if (!v4_used)
soclose(nlm_socket);
nlm_socket = NULL;
#ifdef INET6
if (!v6_used)
soclose(nlm_socket6);
nlm_socket6 = NULL;
#endif
return (error);
}
int
-nlm_syscall(struct thread *td, struct nlm_syscall_args *uap)
+sys_nlm_syscall(struct thread *td, struct nlm_syscall_args *uap)
{
int error;
#if __FreeBSD_version >= 700000
error = priv_check(td, PRIV_NFS_LOCKD);
#else
error = suser(td);
#endif
if (error)
return (error);
nlm_debug_level = uap->debug_level;
nlm_grace_threshold = time_uptime + uap->grace_period;
nlm_next_idle_check = time_uptime + NLM_IDLE_PERIOD;
return nlm_server_main(uap->addr_count, uap->addrs);
}
/**********************************************************************/
/*
* NLM implementation details, called from the RPC stubs.
*/
void
nlm_sm_notify(struct nlm_sm_status *argp)
{
uint32_t sysid;
struct nlm_host *host;
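/*
 * Callback from the local statd: the private cookie we registered
 * with the monitor request carries the remote host's sysid, so use
 * it to look up the host and invalidate its locks.
 */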
NLM_DEBUG(3, "nlm_sm_notify(): mon_name = %s\n", argp->mon_name);
memcpy(&sysid, &argp->priv, sizeof(sysid));
host = nlm_find_host_by_sysid(sysid);
if (host) {
nlm_host_notify(host, argp->state);
nlm_host_release(host);
}
}
static void
nlm_convert_to_fhandle_t(fhandle_t *fhp, struct netobj *p)
{
memcpy(fhp, p->n_bytes, sizeof(fhandle_t));
}
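/*
 * State derived from an NFS file handle: the mount point and vnode,
 * plus flags recording whether Giant and the vnode lock are held, so
 * nlm_release_vfs_state() can undo everything in one place.
 */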
struct vfs_state {
struct mount *vs_mp;
struct vnode *vs_vp;
int vs_vfslocked;
int vs_vnlocked;
};
static int
nlm_get_vfs_state(struct nlm_host *host, struct svc_req *rqstp,
fhandle_t *fhp, struct vfs_state *vs)
{
int error, exflags;
struct ucred *cred = NULL, *credanon;
memset(vs, 0, sizeof(*vs));
vs->vs_mp = vfs_getvfs(&fhp->fh_fsid);
if (!vs->vs_mp) {
return (ESTALE);
}
vs->vs_vfslocked = VFS_LOCK_GIANT(vs->vs_mp);
error = VFS_CHECKEXP(vs->vs_mp, (struct sockaddr *)&host->nh_addr,
&exflags, &credanon, NULL, NULL);
if (error)
goto out;
if (exflags & MNT_EXRDONLY || (vs->vs_mp->mnt_flag & MNT_RDONLY)) {
error = EROFS;
goto out;
}
error = VFS_FHTOVP(vs->vs_mp, &fhp->fh_fid, LK_EXCLUSIVE, &vs->vs_vp);
if (error)
goto out;
vs->vs_vnlocked = TRUE;
if (!svc_getcred(rqstp, &cred, NULL)) {
error = EINVAL;
goto out;
}
if (cred->cr_uid == 0 || (exflags & MNT_EXPORTANON)) {
crfree(cred);
cred = credanon;
credanon = NULL;
}
/*
* Check that the credentials we settled on have write access to the vnode.
*/
error = VOP_ACCESS(vs->vs_vp, VWRITE, cred, curthread);
if (error)
goto out;
#if __FreeBSD_version < 800011
VOP_UNLOCK(vs->vs_vp, 0, curthread);
#else
VOP_UNLOCK(vs->vs_vp, 0);
#endif
vs->vs_vnlocked = FALSE;
out:
if (cred)
crfree(cred);
if (credanon)
crfree(credanon);
return (error);
}
static void
nlm_release_vfs_state(struct vfs_state *vs)
{
if (vs->vs_vp) {
if (vs->vs_vnlocked)
vput(vs->vs_vp);
else
vrele(vs->vs_vp);
}
if (vs->vs_mp)
vfs_rel(vs->vs_mp);
VFS_UNLOCK_GIANT(vs->vs_vfslocked);
}
static nlm4_stats
nlm_convert_error(int error)
{
if (error == ESTALE)
return nlm4_stale_fh;
else if (error == EROFS)
return nlm4_rofs;
else
return nlm4_failed;
}
int
nlm_do_test(nlm4_testargs *argp, nlm4_testres *result, struct svc_req *rqstp,
CLIENT **rpcp)
{
fhandle_t fh;
struct vfs_state vs;
struct nlm_host *host, *bhost;
int error, sysid;
struct flock fl;
memset(result, 0, sizeof(*result));
memset(&vs, 0, sizeof(vs));
host = nlm_find_host_by_name(argp->alock.caller_name,
svc_getrpccaller(rqstp), rqstp->rq_vers);
if (!host) {
result->stat.stat = nlm4_denied_nolocks;
return (ENOMEM);
}
NLM_DEBUG(3, "nlm_do_test(): caller_name = %s (sysid = %d)\n",
host->nh_caller_name, host->nh_sysid);
nlm_check_expired_locks(host);
sysid = host->nh_sysid;
nlm_convert_to_fhandle_t(&fh, &argp->alock.fh);
nlm_copy_netobj(&result->cookie, &argp->cookie, M_RPC);
if (time_uptime < nlm_grace_threshold) {
result->stat.stat = nlm4_denied_grace_period;
goto out;
}
error = nlm_get_vfs_state(host, rqstp, &fh, &vs);
if (error) {
result->stat.stat = nlm_convert_error(error);
goto out;
}
fl.l_start = argp->alock.l_offset;
fl.l_len = argp->alock.l_len;
fl.l_pid = argp->alock.svid;
fl.l_sysid = sysid;
fl.l_whence = SEEK_SET;
if (argp->exclusive)
fl.l_type = F_WRLCK;
else
fl.l_type = F_RDLCK;
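/*
 * Query the local lock manager for a conflicting lock without
 * actually acquiring one.
 */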
error = VOP_ADVLOCK(vs.vs_vp, NULL, F_GETLK, &fl, F_REMOTE);
if (error) {
result->stat.stat = nlm4_failed;
goto out;
}
if (fl.l_type == F_UNLCK) {
result->stat.stat = nlm4_granted;
} else {
result->stat.stat = nlm4_denied;
result->stat.nlm4_testrply_u.holder.exclusive =
(fl.l_type == F_WRLCK);
result->stat.nlm4_testrply_u.holder.svid = fl.l_pid;
bhost = nlm_find_host_by_sysid(fl.l_sysid);
if (bhost) {
/*
* We don't have any useful way of recording
* the value of oh used in the original lock
* request. Ideally, the test reply would have
* a space for the owning host's name allowing
* our caller's NLM to keep track.
*
* As far as I can see, Solaris uses an eight
* byte structure for oh which contains a four
* byte pid encoded in local byte order and
* the first four bytes of the host
* name. Linux uses a variable length string
* 'pid@hostname' in ascii but doesn't even
* return that in test replies.
*
* For the moment, return nothing in oh
* (already zero'ed above).
*/
nlm_host_release(bhost);
}
result->stat.nlm4_testrply_u.holder.l_offset = fl.l_start;
result->stat.nlm4_testrply_u.holder.l_len = fl.l_len;
}
out:
nlm_release_vfs_state(&vs);
if (rpcp)
*rpcp = nlm_host_get_rpc(host, TRUE);
nlm_host_release(host);
return (0);
}
int
nlm_do_lock(nlm4_lockargs *argp, nlm4_res *result, struct svc_req *rqstp,
bool_t monitor, CLIENT **rpcp)
{
fhandle_t fh;
struct vfs_state vs;
struct nlm_host *host;
int error, sysid;
struct flock fl;
memset(result, 0, sizeof(*result));
memset(&vs, 0, sizeof(vs));
host = nlm_find_host_by_name(argp->alock.caller_name,
svc_getrpccaller(rqstp), rqstp->rq_vers);
if (!host) {
result->stat.stat = nlm4_denied_nolocks;
return (ENOMEM);
}
NLM_DEBUG(3, "nlm_do_lock(): caller_name = %s (sysid = %d)\n",
host->nh_caller_name, host->nh_sysid);
if (monitor && host->nh_state && argp->state
&& host->nh_state != argp->state) {
/*
* The host rebooted without telling us. Trash its
* locks.
*/
nlm_host_notify(host, argp->state);
}
nlm_check_expired_locks(host);
sysid = host->nh_sysid;
nlm_convert_to_fhandle_t(&fh, &argp->alock.fh);
nlm_copy_netobj(&result->cookie, &argp->cookie, M_RPC);
if (time_uptime < nlm_grace_threshold && !argp->reclaim) {
result->stat.stat = nlm4_denied_grace_period;
goto out;
}
error = nlm_get_vfs_state(host, rqstp, &fh, &vs);
if (error) {
result->stat.stat = nlm_convert_error(error);
goto out;
}
fl.l_start = argp->alock.l_offset;
fl.l_len = argp->alock.l_len;
fl.l_pid = argp->alock.svid;
fl.l_sysid = sysid;
fl.l_whence = SEEK_SET;
if (argp->exclusive)
fl.l_type = F_WRLCK;
else
fl.l_type = F_RDLCK;
if (argp->block) {
struct nlm_async_lock *af;
CLIENT *client;
struct nlm_grantcookie cookie;
/*
* First, make sure we can contact the host's NLM.
*/
client = nlm_host_get_rpc(host, TRUE);
if (!client) {
result->stat.stat = nlm4_failed;
goto out;
}
/*
* First we need to check and see if there is an
* existing blocked lock that matches. This could be a
* badly behaved client or an RPC re-send. If we find
* one, just return nlm4_blocked.
*/
mtx_lock(&host->nh_lock);
TAILQ_FOREACH(af, &host->nh_pending, af_link) {
if (af->af_fl.l_start == fl.l_start
&& af->af_fl.l_len == fl.l_len
&& af->af_fl.l_pid == fl.l_pid
&& af->af_fl.l_type == fl.l_type) {
break;
}
}
if (!af) {
cookie.ng_sysid = host->nh_sysid;
cookie.ng_cookie = host->nh_grantcookie++;
}
mtx_unlock(&host->nh_lock);
if (af) {
CLNT_RELEASE(client);
result->stat.stat = nlm4_blocked;
goto out;
}
af = malloc(sizeof(struct nlm_async_lock), M_NLM,
M_WAITOK|M_ZERO);
TASK_INIT(&af->af_task, 0, nlm_lock_callback, af);
af->af_vp = vs.vs_vp;
af->af_fl = fl;
af->af_host = host;
af->af_rpc = client;
/*
* We use M_RPC here so that we can xdr_free the thing
* later.
*/
nlm_make_netobj(&af->af_granted.cookie,
(caddr_t)&cookie, sizeof(cookie), M_RPC);
af->af_granted.exclusive = argp->exclusive;
af->af_granted.alock.caller_name =
strdup(argp->alock.caller_name, M_RPC);
nlm_copy_netobj(&af->af_granted.alock.fh,
&argp->alock.fh, M_RPC);
nlm_copy_netobj(&af->af_granted.alock.oh,
&argp->alock.oh, M_RPC);
af->af_granted.alock.svid = argp->alock.svid;
af->af_granted.alock.l_offset = argp->alock.l_offset;
af->af_granted.alock.l_len = argp->alock.l_len;
/*
* Put the entry on the pending list before calling
* VOP_ADVLOCKASYNC. We do this in case the lock
* request was blocked (returning EINPROGRESS) but
* then granted before we manage to run again. The
* client may receive the granted message before we
* send our blocked reply but that's their problem.
*/
mtx_lock(&host->nh_lock);
TAILQ_INSERT_TAIL(&host->nh_pending, af, af_link);
mtx_unlock(&host->nh_lock);
error = VOP_ADVLOCKASYNC(vs.vs_vp, NULL, F_SETLK, &fl, F_REMOTE,
&af->af_task, &af->af_cookie);
/*
* If the lock completed synchronously, just free the
* tracking structure now.
*/
if (error != EINPROGRESS) {
CLNT_RELEASE(af->af_rpc);
mtx_lock(&host->nh_lock);
TAILQ_REMOVE(&host->nh_pending, af, af_link);
mtx_unlock(&host->nh_lock);
xdr_free((xdrproc_t) xdr_nlm4_testargs,
&af->af_granted);
free(af, M_NLM);
} else {
NLM_DEBUG(2, "NLM: pending async lock %p for %s "
"(sysid %d)\n", af, host->nh_caller_name, sysid);
/*
* Don't vrele the vnode just yet - this must
* wait until either the async callback
* happens or the lock is cancelled.
*/
vs.vs_vp = NULL;
}
} else {
error = VOP_ADVLOCK(vs.vs_vp, NULL, F_SETLK, &fl, F_REMOTE);
}
if (error) {
if (error == EINPROGRESS) {
result->stat.stat = nlm4_blocked;
} else if (error == EDEADLK) {
result->stat.stat = nlm4_deadlck;
} else if (error == EAGAIN) {
result->stat.stat = nlm4_denied;
} else {
result->stat.stat = nlm4_failed;
}
} else {
if (monitor)
nlm_host_monitor(host, argp->state);
result->stat.stat = nlm4_granted;
}
out:
nlm_release_vfs_state(&vs);
if (rpcp)
*rpcp = nlm_host_get_rpc(host, TRUE);
nlm_host_release(host);
return (0);
}
int
nlm_do_cancel(nlm4_cancargs *argp, nlm4_res *result, struct svc_req *rqstp,
CLIENT **rpcp)
{
fhandle_t fh;
struct vfs_state vs;
struct nlm_host *host;
int error, sysid;
struct flock fl;
struct nlm_async_lock *af;
memset(result, 0, sizeof(*result));
memset(&vs, 0, sizeof(vs));
host = nlm_find_host_by_name(argp->alock.caller_name,
svc_getrpccaller(rqstp), rqstp->rq_vers);
if (!host) {
result->stat.stat = nlm4_denied_nolocks;
return (ENOMEM);
}
NLM_DEBUG(3, "nlm_do_cancel(): caller_name = %s (sysid = %d)\n",
host->nh_caller_name, host->nh_sysid);
nlm_check_expired_locks(host);
sysid = host->nh_sysid;
nlm_convert_to_fhandle_t(&fh, &argp->alock.fh);
nlm_copy_netobj(&result->cookie, &argp->cookie, M_RPC);
if (time_uptime < nlm_grace_threshold) {
result->stat.stat = nlm4_denied_grace_period;
goto out;
}
error = nlm_get_vfs_state(host, rqstp, &fh, &vs);
if (error) {
result->stat.stat = nlm_convert_error(error);
goto out;
}
fl.l_start = argp->alock.l_offset;
fl.l_len = argp->alock.l_len;
fl.l_pid = argp->alock.svid;
fl.l_sysid = sysid;
fl.l_whence = SEEK_SET;
if (argp->exclusive)
fl.l_type = F_WRLCK;
else
fl.l_type = F_RDLCK;
/*
* First we need to try to find the async lock request - if
* there isn't one, we give up and return nlm4_denied.
*/
mtx_lock(&host->nh_lock);
TAILQ_FOREACH(af, &host->nh_pending, af_link) {
if (af->af_fl.l_start == fl.l_start
&& af->af_fl.l_len == fl.l_len
&& af->af_fl.l_pid == fl.l_pid
&& af->af_fl.l_type == fl.l_type) {
break;
}
}
if (!af) {
mtx_unlock(&host->nh_lock);
result->stat.stat = nlm4_denied;
goto out;
}
error = nlm_cancel_async_lock(af);
if (error) {
result->stat.stat = nlm4_denied;
} else {
result->stat.stat = nlm4_granted;
}
mtx_unlock(&host->nh_lock);
out:
nlm_release_vfs_state(&vs);
if (rpcp)
*rpcp = nlm_host_get_rpc(host, TRUE);
nlm_host_release(host);
return (0);
}
int
nlm_do_unlock(nlm4_unlockargs *argp, nlm4_res *result, struct svc_req *rqstp,
CLIENT **rpcp)
{
fhandle_t fh;
struct vfs_state vs;
struct nlm_host *host;
int error, sysid;
struct flock fl;
memset(result, 0, sizeof(*result));
memset(&vs, 0, sizeof(vs));
host = nlm_find_host_by_name(argp->alock.caller_name,
svc_getrpccaller(rqstp), rqstp->rq_vers);
if (!host) {
result->stat.stat = nlm4_denied_nolocks;
return (ENOMEM);
}
NLM_DEBUG(3, "nlm_do_unlock(): caller_name = %s (sysid = %d)\n",
host->nh_caller_name, host->nh_sysid);
nlm_check_expired_locks(host);
sysid = host->nh_sysid;
nlm_convert_to_fhandle_t(&fh, &argp->alock.fh);
nlm_copy_netobj(&result->cookie, &argp->cookie, M_RPC);
if (time_uptime < nlm_grace_threshold) {
result->stat.stat = nlm4_denied_grace_period;
goto out;
}
error = nlm_get_vfs_state(host, rqstp, &fh, &vs);
if (error) {
result->stat.stat = nlm_convert_error(error);
goto out;
}
fl.l_start = argp->alock.l_offset;
fl.l_len = argp->alock.l_len;
fl.l_pid = argp->alock.svid;
fl.l_sysid = sysid;
fl.l_whence = SEEK_SET;
fl.l_type = F_UNLCK;
error = VOP_ADVLOCK(vs.vs_vp, NULL, F_UNLCK, &fl, F_REMOTE);
/*
* Ignore the error - there is no result code for failure,
* only for grace period.
*/
result->stat.stat = nlm4_granted;
out:
nlm_release_vfs_state(&vs);
if (rpcp)
*rpcp = nlm_host_get_rpc(host, TRUE);
nlm_host_release(host);
return (0);
}
int
nlm_do_granted(nlm4_testargs *argp, nlm4_res *result, struct svc_req *rqstp,
CLIENT **rpcp)
{
struct nlm_host *host;
struct nlm_waiting_lock *nw;
memset(result, 0, sizeof(*result));
host = nlm_find_host_by_addr(svc_getrpccaller(rqstp), rqstp->rq_vers);
if (!host) {
result->stat.stat = nlm4_denied_nolocks;
return (ENOMEM);
}
nlm_copy_netobj(&result->cookie, &argp->cookie, M_RPC);
result->stat.stat = nlm4_denied;
KFAIL_POINT_CODE(DEBUG_FP, nlm_deny_grant, goto out);
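/*
 * Search our list of waiting client locks for one matching the
 * granted lock and wake the thread sleeping on it.
 */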
mtx_lock(&nlm_global_lock);
TAILQ_FOREACH(nw, &nlm_waiting_locks, nw_link) {
if (!nw->nw_waiting)
continue;
if (argp->alock.svid == nw->nw_lock.svid
&& argp->alock.l_offset == nw->nw_lock.l_offset
&& argp->alock.l_len == nw->nw_lock.l_len
&& argp->alock.fh.n_len == nw->nw_lock.fh.n_len
&& !memcmp(argp->alock.fh.n_bytes, nw->nw_lock.fh.n_bytes,
nw->nw_lock.fh.n_len)) {
nw->nw_waiting = FALSE;
wakeup(nw);
result->stat.stat = nlm4_granted;
break;
}
}
mtx_unlock(&nlm_global_lock);
out:
if (rpcp)
*rpcp = nlm_host_get_rpc(host, TRUE);
nlm_host_release(host);
return (0);
}
void
nlm_do_granted_res(nlm4_res *argp, struct svc_req *rqstp)
{
struct nlm_host *host = NULL;
struct nlm_async_lock *af = NULL;
int error;
if (argp->cookie.n_len != sizeof(struct nlm_grantcookie)) {
NLM_DEBUG(1, "NLM: bogus grant cookie");
goto out;
}
host = nlm_find_host_by_sysid(ng_sysid(&argp->cookie));
if (!host) {
NLM_DEBUG(1, "NLM: Unknown host rejected our grant");
goto out;
}
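/*
 * Match the reply against our pending grants using the cookie we
 * generated when the GRANTED message was sent.
 */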
mtx_lock(&host->nh_lock);
TAILQ_FOREACH(af, &host->nh_granted, af_link)
if (ng_cookie(&argp->cookie) ==
ng_cookie(&af->af_granted.cookie))
break;
if (af)
TAILQ_REMOVE(&host->nh_granted, af, af_link);
mtx_unlock(&host->nh_lock);
if (!af) {
NLM_DEBUG(1, "NLM: host %s (sysid %d) replied to our grant "
"with unrecognized cookie %d:%d", host->nh_caller_name,
host->nh_sysid, ng_sysid(&argp->cookie),
ng_cookie(&argp->cookie));
goto out;
}
if (argp->stat.stat != nlm4_granted) {
af->af_fl.l_type = F_UNLCK;
error = VOP_ADVLOCK(af->af_vp, NULL, F_UNLCK, &af->af_fl, F_REMOTE);
if (error) {
NLM_DEBUG(1, "NLM: host %s (sysid %d) rejected our grant "
"and we failed to unlock (%d)", host->nh_caller_name,
host->nh_sysid, error);
goto out;
}
NLM_DEBUG(5, "NLM: async lock %p rejected by host %s (sysid %d)",
af, host->nh_caller_name, host->nh_sysid);
} else {
NLM_DEBUG(5, "NLM: async lock %p accepted by host %s (sysid %d)",
af, host->nh_caller_name, host->nh_sysid);
}
out:
if (af)
nlm_free_async_lock(af);
if (host)
nlm_host_release(host);
}
void
nlm_do_free_all(nlm4_notify *argp)
{
struct nlm_host *host, *thost;
TAILQ_FOREACH_SAFE(host, &nlm_hosts, nh_link, thost) {
if (!strcmp(host->nh_caller_name, argp->name))
nlm_host_notify(host, argp->state);
}
}
/*
* Kernel module glue
*/
static int
nfslockd_modevent(module_t mod, int type, void *data)
{
return (0);
}
static moduledata_t nfslockd_mod = {
"nfslockd",
nfslockd_modevent,
NULL,
};
DECLARE_MODULE(nfslockd, nfslockd_mod, SI_SUB_VFS, SI_ORDER_ANY);
/* So that loader and kldload(2) can find us, wherever we are.. */
MODULE_DEPEND(nfslockd, krpc, 1, 1, 1);
MODULE_DEPEND(nfslockd, nfslock, 1, 1, 1);
MODULE_VERSION(nfslockd, 1);
Index: head/sys/pc98/pc98/machdep.c
===================================================================
--- head/sys/pc98/pc98/machdep.c (revision 225616)
+++ head/sys/pc98/pc98/machdep.c (revision 225617)
@@ -1,2981 +1,2981 @@
/*-
* Copyright (c) 1992 Terrence R. Lambert.
* Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* William Jolitz.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_atalk.h"
#include "opt_compat.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_ipx.h"
#include "opt_isa.h"
#include "opt_kstack_pages.h"
#include "opt_maxmem.h"
#include "opt_mp_watchdog.h"
#include "opt_npx.h"
#include "opt_perfmon.h"
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/callout.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/reboot.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#ifdef SMP
#include <sys/smp.h>
#endif
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_param.h>
#ifdef DDB
#ifndef KDB
#error KDB must be enabled in order for DDB to work!
#endif
#include <ddb/ddb.h>
#include <ddb/db_sym.h>
#endif
#include <pc98/pc98/pc98_machdep.h>
#include <net/netisr.h>
#include <machine/bootinfo.h>
#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/intr_machdep.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/mp_watchdog.h>
#include <machine/pc/bios.h>
#include <machine/pcb.h>
#include <machine/pcb_ext.h>
#include <machine/proc.h>
#include <machine/reg.h>
#include <machine/sigframe.h>
#include <machine/specialreg.h>
#include <machine/vm86.h>
#ifdef PERFMON
#include <machine/perfmon.h>
#endif
#ifdef SMP
#include <machine/smp.h>
#endif
#ifdef DEV_ISA
#include <x86/isa/icu.h>
#endif
/* Sanity check for __curthread() */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
extern void init386(int first);
extern void dblfault_handler(void);
extern void printcpuinfo(void); /* XXX header file */
extern void finishidentcpu(void);
extern void panicifcpuunsupported(void);
extern void initializecpu(void);
#define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
#define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
#define CPU_ENABLE_SSE
#endif
static void cpu_startup(void *);
static void fpstate_drop(struct thread *td);
static void get_fpcontext(struct thread *td, mcontext_t *mcp);
static int set_fpcontext(struct thread *td, const mcontext_t *mcp);
#ifdef CPU_ENABLE_SSE
static void set_fpregs_xmm(struct save87 *, struct savexmm *);
static void fill_fpregs_xmm(struct savexmm *, struct save87 *);
#endif /* CPU_ENABLE_SSE */
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
int need_pre_dma_flush; /* If 1, use wbinvd before DMA transfer. */
int need_post_dma_flush; /* If 1, use invd after DMA transfer. */
#ifdef DDB
extern vm_offset_t ksym_start, ksym_end;
#endif
int _udatasel, _ucodesel;
u_int basemem;
static int ispc98 = 1;
SYSCTL_INT(_machdep, OID_AUTO, ispc98, CTLFLAG_RD, &ispc98, 0, "");
int cold = 1;
#ifdef COMPAT_43
static void osendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask);
#endif
#ifdef COMPAT_FREEBSD4
static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask);
#endif
long Maxmem = 0;
long realmem = 0;
/*
* The number of PHYSMAP entries must be one less than the number of
* PHYSSEG entries because the PHYSMAP entry that spans the largest
* physical address that is accessible by ISA DMA is split into two
* PHYSSEG entries.
*/
#define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1))
vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];
/* must be 2 less so 0 0 can signal end of chunks */
#define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2)
#define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2)
struct kva_md_info kmi;
static struct trapframe proc0_tf;
struct pcpu __pcpu[MAXCPU];
struct mtx icu_lock;
static void
cpu_startup(dummy)
void *dummy;
{
uintmax_t memsize;
/*
* Good {morning,afternoon,evening,night}.
*/
startrtclock();
printcpuinfo();
panicifcpuunsupported();
#ifdef PERFMON
perfmon_init();
#endif
realmem = Maxmem;
/*
* Display physical memory.
*/
memsize = ptoa((uintmax_t)Maxmem);
printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20);
/*
* Display any holes after the first chunk of extended memory.
*/
if (bootverbose) {
int indx;
printf("Physical memory chunk(s):\n");
for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
vm_paddr_t size;
size = phys_avail[indx + 1] - phys_avail[indx];
printf(
"0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
(uintmax_t)phys_avail[indx],
(uintmax_t)phys_avail[indx + 1] - 1,
(uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
}
}
vm_ksubmap_init(&kmi);
printf("avail memory = %ju (%ju MB)\n",
ptoa((uintmax_t)cnt.v_free_count),
ptoa((uintmax_t)cnt.v_free_count) / 1048576);
/*
* Set up buffers, so they can be used to read disk labels.
*/
bufinit();
vm_pager_bufferinit();
cpu_setregs();
}
/*
* Send an interrupt to process.
*
* Stack is set up to allow sigcode stored
* at top to call routine, followed by kcall
* to sigreturn routine below. After sigreturn
* resets the signal mask, the stack, and the
* frame pointer, it returns to the user
* specified pc, psl.
*/
#ifdef COMPAT_43
static void
osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
struct osigframe sf, *fp;
struct proc *p;
struct thread *td;
struct sigacts *psp;
struct trapframe *regs;
int sig;
int oonstack;
td = curthread;
p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
sig = ksi->ksi_signo;
psp = p->p_sigacts;
mtx_assert(&psp->ps_mtx, MA_OWNED);
regs = td->td_frame;
oonstack = sigonstack(regs->tf_esp);
/* Allocate space for the signal handler context. */
if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
SIGISMEMBER(psp->ps_sigonstack, sig)) {
fp = (struct osigframe *)(td->td_sigstk.ss_sp +
td->td_sigstk.ss_size - sizeof(struct osigframe));
#if defined(COMPAT_43)
td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
} else
fp = (struct osigframe *)regs->tf_esp - 1;
/* Translate the signal if appropriate. */
if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
/* Build the argument list for the signal handler. */
sf.sf_signum = sig;
sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc;
bzero(&sf.sf_siginfo, sizeof(sf.sf_siginfo));
if (SIGISMEMBER(psp->ps_siginfo, sig)) {
/* Signal handler installed with SA_SIGINFO. */
sf.sf_arg2 = (register_t)&fp->sf_siginfo;
sf.sf_siginfo.si_signo = sig;
sf.sf_siginfo.si_code = ksi->ksi_code;
sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher;
sf.sf_addr = 0;
} else {
/* Old FreeBSD-style arguments. */
sf.sf_arg2 = ksi->ksi_code;
sf.sf_addr = (register_t)ksi->ksi_addr;
sf.sf_ahu.sf_handler = catcher;
}
mtx_unlock(&psp->ps_mtx);
PROC_UNLOCK(p);
/* Save most if not all of trap frame. */
sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax;
sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx;
sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx;
sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx;
sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi;
sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi;
sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs;
sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds;
sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss;
sf.sf_siginfo.si_sc.sc_es = regs->tf_es;
sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs;
sf.sf_siginfo.si_sc.sc_gs = rgs();
sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp;
/* Build the signal context to be used by osigreturn(). */
sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0;
SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask);
sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp;
sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp;
sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip;
sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags;
sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno;
sf.sf_siginfo.si_sc.sc_err = regs->tf_err;
/*
* If we're a vm86 process, we want to save the segment registers.
* We also change eflags to be our emulated eflags, not the actual
* eflags.
*/
if (regs->tf_eflags & PSL_VM) {
/* XXX confusing names: `tf' isn't a trapframe; `regs' is. */
struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs;
sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs;
sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es;
sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds;
if (vm86->vm86_has_vme == 0)
sf.sf_siginfo.si_sc.sc_ps =
(tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
(vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
/* See sendsig() for comments. */
tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
}
/*
* Copy the sigframe out to the user's stack.
*/
if (copyout(&sf, fp, sizeof(*fp)) != 0) {
#ifdef DEBUG
printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
PROC_LOCK(p);
sigexit(td, SIGILL);
}
regs->tf_esp = (int)fp;
regs->tf_eip = PS_STRINGS - szosigcode;
regs->tf_eflags &= ~(PSL_T | PSL_D);
regs->tf_cs = _ucodesel;
regs->tf_ds = _udatasel;
regs->tf_es = _udatasel;
regs->tf_fs = _udatasel;
load_gs(_udatasel);
regs->tf_ss = _udatasel;
PROC_LOCK(p);
mtx_lock(&psp->ps_mtx);
}
#endif /* COMPAT_43 */
#ifdef COMPAT_FREEBSD4
static void
freebsd4_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
struct sigframe4 sf, *sfp;
struct proc *p;
struct thread *td;
struct sigacts *psp;
struct trapframe *regs;
int sig;
int oonstack;
td = curthread;
p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
sig = ksi->ksi_signo;
psp = p->p_sigacts;
mtx_assert(&psp->ps_mtx, MA_OWNED);
regs = td->td_frame;
oonstack = sigonstack(regs->tf_esp);
/* Save user context. */
bzero(&sf, sizeof(sf));
sf.sf_uc.uc_sigmask = *mask;
sf.sf_uc.uc_stack = td->td_sigstk;
sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
sf.sf_uc.uc_mcontext.mc_gs = rgs();
bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs));
bzero(sf.sf_uc.uc_mcontext.mc_fpregs,
sizeof(sf.sf_uc.uc_mcontext.mc_fpregs));
bzero(sf.sf_uc.uc_mcontext.__spare__,
sizeof(sf.sf_uc.uc_mcontext.__spare__));
bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
/* Allocate space for the signal handler context. */
if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
SIGISMEMBER(psp->ps_sigonstack, sig)) {
sfp = (struct sigframe4 *)(td->td_sigstk.ss_sp +
td->td_sigstk.ss_size - sizeof(struct sigframe4));
#if defined(COMPAT_43)
td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
} else
sfp = (struct sigframe4 *)regs->tf_esp - 1;
/* Translate the signal if appropriate. */
if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
/* Build the argument list for the signal handler. */
sf.sf_signum = sig;
sf.sf_ucontext = (register_t)&sfp->sf_uc;
bzero(&sf.sf_si, sizeof(sf.sf_si));
if (SIGISMEMBER(psp->ps_siginfo, sig)) {
/* Signal handler installed with SA_SIGINFO. */
sf.sf_siginfo = (register_t)&sfp->sf_si;
sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
/* Fill in POSIX parts */
sf.sf_si.si_signo = sig;
sf.sf_si.si_code = ksi->ksi_code;
sf.sf_si.si_addr = ksi->ksi_addr;
} else {
/* Old FreeBSD-style arguments. */
sf.sf_siginfo = ksi->ksi_code;
sf.sf_addr = (register_t)ksi->ksi_addr;
sf.sf_ahu.sf_handler = catcher;
}
mtx_unlock(&psp->ps_mtx);
PROC_UNLOCK(p);
/*
* If we're a vm86 process, we want to save the segment registers.
* We also change eflags to be our emulated eflags, not the actual
* eflags.
*/
if (regs->tf_eflags & PSL_VM) {
struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;
if (vm86->vm86_has_vme == 0)
sf.sf_uc.uc_mcontext.mc_eflags =
(tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
(vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
/*
* Clear PSL_NT to inhibit T_TSSFLT faults on return from
* syscalls made by the signal handler. This just avoids
* wasting time for our lazy fixup of such faults. PSL_NT
* does nothing in vm86 mode, but vm86 programs can set it
* almost legitimately in probes for old cpu types.
*/
tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
}
/*
* Copy the sigframe out to the user's stack.
*/
if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
#ifdef DEBUG
printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
PROC_LOCK(p);
sigexit(td, SIGILL);
}
regs->tf_esp = (int)sfp;
regs->tf_eip = PS_STRINGS - szfreebsd4_sigcode;
regs->tf_eflags &= ~(PSL_T | PSL_D);
regs->tf_cs = _ucodesel;
regs->tf_ds = _udatasel;
regs->tf_es = _udatasel;
regs->tf_fs = _udatasel;
regs->tf_ss = _udatasel;
PROC_LOCK(p);
mtx_lock(&psp->ps_mtx);
}
#endif /* COMPAT_FREEBSD4 */
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
struct sigframe sf, *sfp;
struct proc *p;
struct thread *td;
struct sigacts *psp;
char *sp;
struct trapframe *regs;
struct segment_descriptor *sdp;
int sig;
int oonstack;
td = curthread;
p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
sig = ksi->ksi_signo;
psp = p->p_sigacts;
mtx_assert(&psp->ps_mtx, MA_OWNED);
#ifdef COMPAT_FREEBSD4
if (SIGISMEMBER(psp->ps_freebsd4, sig)) {
freebsd4_sendsig(catcher, ksi, mask);
return;
}
#endif
#ifdef COMPAT_43
if (SIGISMEMBER(psp->ps_osigset, sig)) {
osendsig(catcher, ksi, mask);
return;
}
#endif
regs = td->td_frame;
oonstack = sigonstack(regs->tf_esp);
/* Save user context. */
bzero(&sf, sizeof(sf));
sf.sf_uc.uc_sigmask = *mask;
sf.sf_uc.uc_stack = td->td_sigstk;
sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
sf.sf_uc.uc_mcontext.mc_gs = rgs();
bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs));
sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
get_fpcontext(td, &sf.sf_uc.uc_mcontext);
fpstate_drop(td);
/*
* Unconditionally fill the fsbase and gsbase into the mcontext.
*/
sdp = &td->td_pcb->pcb_fsd;
sf.sf_uc.uc_mcontext.mc_fsbase = sdp->sd_hibase << 24 |
sdp->sd_lobase;
sdp = &td->td_pcb->pcb_gsd;
sf.sf_uc.uc_mcontext.mc_gsbase = sdp->sd_hibase << 24 |
sdp->sd_lobase;
bzero(sf.sf_uc.uc_mcontext.mc_spare1,
sizeof(sf.sf_uc.uc_mcontext.mc_spare1));
bzero(sf.sf_uc.uc_mcontext.mc_spare2,
sizeof(sf.sf_uc.uc_mcontext.mc_spare2));
bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
/* Allocate space for the signal handler context. */
if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
SIGISMEMBER(psp->ps_sigonstack, sig)) {
sp = td->td_sigstk.ss_sp +
td->td_sigstk.ss_size - sizeof(struct sigframe);
#if defined(COMPAT_43)
td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
} else
sp = (char *)regs->tf_esp - sizeof(struct sigframe);
/* Align to 16 bytes. */
sfp = (struct sigframe *)((unsigned int)sp & ~0xF);
/* Translate the signal if appropriate. */
if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
/* Build the argument list for the signal handler. */
sf.sf_signum = sig;
sf.sf_ucontext = (register_t)&sfp->sf_uc;
bzero(&sf.sf_si, sizeof(sf.sf_si));
if (SIGISMEMBER(psp->ps_siginfo, sig)) {
/* Signal handler installed with SA_SIGINFO. */
sf.sf_siginfo = (register_t)&sfp->sf_si;
sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
/* Fill in POSIX parts */
sf.sf_si = ksi->ksi_info;
sf.sf_si.si_signo = sig; /* maybe a translated signal */
} else {
/* Old FreeBSD-style arguments. */
sf.sf_siginfo = ksi->ksi_code;
sf.sf_addr = (register_t)ksi->ksi_addr;
sf.sf_ahu.sf_handler = catcher;
}
mtx_unlock(&psp->ps_mtx);
PROC_UNLOCK(p);
/*
* If we're a vm86 process, we want to save the segment registers.
* We also change eflags to be our emulated eflags, not the actual
* eflags.
*/
if (regs->tf_eflags & PSL_VM) {
struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;
if (vm86->vm86_has_vme == 0)
sf.sf_uc.uc_mcontext.mc_eflags =
(tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
(vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
/*
* Clear PSL_NT to inhibit T_TSSFLT faults on return from
* syscalls made by the signal handler. This just avoids
* wasting time for our lazy fixup of such faults. PSL_NT
* does nothing in vm86 mode, but vm86 programs can set it
* almost legitimately in probes for old cpu types.
*/
tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
}
/*
* Copy the sigframe out to the user's stack.
*/
if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
#ifdef DEBUG
printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
PROC_LOCK(p);
sigexit(td, SIGILL);
}
regs->tf_esp = (int)sfp;
regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
regs->tf_eflags &= ~(PSL_T | PSL_D);
regs->tf_cs = _ucodesel;
regs->tf_ds = _udatasel;
regs->tf_es = _udatasel;
regs->tf_fs = _udatasel;
regs->tf_ss = _udatasel;
PROC_LOCK(p);
mtx_lock(&psp->ps_mtx);
}
/*
* System call to cleanup state after a signal
* has been taken. Reset signal mask and
* stack state from context left by sendsig (above).
* Return to previous pc and psl as specified by
* context left by sendsig. Check carefully to
* make sure that the user has not modified the
* state to gain improper privileges.
*
* MPSAFE
*/
#ifdef COMPAT_43
int
osigreturn(td, uap)
struct thread *td;
struct osigreturn_args /* {
struct osigcontext *sigcntxp;
} */ *uap;
{
struct osigcontext sc;
struct trapframe *regs;
struct osigcontext *scp;
int eflags, error;
ksiginfo_t ksi;
regs = td->td_frame;
error = copyin(uap->sigcntxp, &sc, sizeof(sc));
if (error != 0)
return (error);
scp = &sc;
eflags = scp->sc_ps;
if (eflags & PSL_VM) {
struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
struct vm86_kernel *vm86;
/*
* if pcb_ext == 0 or vm86_inited == 0, the user hasn't
* set up the vm86 area, and we can't enter vm86 mode.
*/
if (td->td_pcb->pcb_ext == 0)
return (EINVAL);
vm86 = &td->td_pcb->pcb_ext->ext_vm86;
if (vm86->vm86_inited == 0)
return (EINVAL);
/* Go back to user mode if both flags are set. */
if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
ksiginfo_init_trap(&ksi);
ksi.ksi_signo = SIGBUS;
ksi.ksi_code = BUS_OBJERR;
ksi.ksi_addr = (void *)regs->tf_eip;
trapsignal(td, &ksi);
}
if (vm86->vm86_has_vme) {
eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
(eflags & VME_USERCHANGE) | PSL_VM;
} else {
vm86->vm86_eflags = eflags; /* save VIF, VIP */
eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
(eflags & VM_USERCHANGE) | PSL_VM;
}
tf->tf_vm86_ds = scp->sc_ds;
tf->tf_vm86_es = scp->sc_es;
tf->tf_vm86_fs = scp->sc_fs;
tf->tf_vm86_gs = scp->sc_gs;
tf->tf_ds = _udatasel;
tf->tf_es = _udatasel;
tf->tf_fs = _udatasel;
} else {
/*
* Don't allow users to change privileged or reserved flags.
*/
/*
* XXX do allow users to change the privileged flag PSL_RF.
* The cpu sets PSL_RF in tf_eflags for faults. Debuggers
* should sometimes set it there too. tf_eflags is kept in
* the signal context during signal handling and there is no
* other place to remember it, so the PSL_RF bit may be
* corrupted by the signal handler without us knowing.
* Corruption of the PSL_RF bit at worst causes one more or
* one less debugger trap, so allowing it is fairly harmless.
*/
if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
return (EINVAL);
}
/*
* Don't allow users to load a valid privileged %cs. Let the
* hardware check for invalid selectors, excess privilege in
* other selectors, invalid %eip's and invalid %esp's.
*/
if (!CS_SECURE(scp->sc_cs)) {
ksiginfo_init_trap(&ksi);
ksi.ksi_signo = SIGBUS;
ksi.ksi_code = BUS_OBJERR;
ksi.ksi_trapno = T_PROTFLT;
ksi.ksi_addr = (void *)regs->tf_eip;
trapsignal(td, &ksi);
return (EINVAL);
}
regs->tf_ds = scp->sc_ds;
regs->tf_es = scp->sc_es;
regs->tf_fs = scp->sc_fs;
}
/* Restore remaining registers. */
regs->tf_eax = scp->sc_eax;
regs->tf_ebx = scp->sc_ebx;
regs->tf_ecx = scp->sc_ecx;
regs->tf_edx = scp->sc_edx;
regs->tf_esi = scp->sc_esi;
regs->tf_edi = scp->sc_edi;
regs->tf_cs = scp->sc_cs;
regs->tf_ss = scp->sc_ss;
regs->tf_isp = scp->sc_isp;
regs->tf_ebp = scp->sc_fp;
regs->tf_esp = scp->sc_sp;
regs->tf_eip = scp->sc_pc;
regs->tf_eflags = eflags;
#if defined(COMPAT_43)
if (scp->sc_onstack & 1)
td->td_sigstk.ss_flags |= SS_ONSTACK;
else
td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif
kern_sigprocmask(td, SIG_SETMASK, (sigset_t *)&scp->sc_mask, NULL,
SIGPROCMASK_OLD);
return (EJUSTRETURN);
}
#endif /* COMPAT_43 */
#ifdef COMPAT_FREEBSD4
/*
* MPSAFE
*/
int
freebsd4_sigreturn(td, uap)
struct thread *td;
struct freebsd4_sigreturn_args /* {
const ucontext4 *sigcntxp;
} */ *uap;
{
struct ucontext4 uc;
struct trapframe *regs;
struct ucontext4 *ucp;
int cs, eflags, error;
ksiginfo_t ksi;
error = copyin(uap->sigcntxp, &uc, sizeof(uc));
if (error != 0)
return (error);
ucp = &uc;
regs = td->td_frame;
eflags = ucp->uc_mcontext.mc_eflags;
if (eflags & PSL_VM) {
struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
struct vm86_kernel *vm86;
/*
* if pcb_ext == 0 or vm86_inited == 0, the user hasn't
* set up the vm86 area, and we can't enter vm86 mode.
*/
if (td->td_pcb->pcb_ext == 0)
return (EINVAL);
vm86 = &td->td_pcb->pcb_ext->ext_vm86;
if (vm86->vm86_inited == 0)
return (EINVAL);
/* Go back to user mode if both flags are set. */
if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
ksiginfo_init_trap(&ksi);
ksi.ksi_signo = SIGBUS;
ksi.ksi_code = BUS_OBJERR;
ksi.ksi_addr = (void *)regs->tf_eip;
trapsignal(td, &ksi);
}
if (vm86->vm86_has_vme) {
eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
(eflags & VME_USERCHANGE) | PSL_VM;
} else {
vm86->vm86_eflags = eflags; /* save VIF, VIP */
eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
(eflags & VM_USERCHANGE) | PSL_VM;
}
bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe));
tf->tf_eflags = eflags;
tf->tf_vm86_ds = tf->tf_ds;
tf->tf_vm86_es = tf->tf_es;
tf->tf_vm86_fs = tf->tf_fs;
tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs;
tf->tf_ds = _udatasel;
tf->tf_es = _udatasel;
tf->tf_fs = _udatasel;
} else {
/*
* Don't allow users to change privileged or reserved flags.
*/
/*
* XXX do allow users to change the privileged flag PSL_RF.
* The cpu sets PSL_RF in tf_eflags for faults. Debuggers
* should sometimes set it there too. tf_eflags is kept in
* the signal context during signal handling and there is no
* other place to remember it, so the PSL_RF bit may be
* corrupted by the signal handler without us knowing.
* Corruption of the PSL_RF bit at worst causes one more or
* one less debugger trap, so allowing it is fairly harmless.
*/
if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
uprintf("pid %d (%s): freebsd4_sigreturn eflags = 0x%x\n",
td->td_proc->p_pid, td->td_name, eflags);
return (EINVAL);
}
/*
* Don't allow users to load a valid privileged %cs. Let the
* hardware check for invalid selectors, excess privilege in
* other selectors, invalid %eip's and invalid %esp's.
*/
cs = ucp->uc_mcontext.mc_cs;
if (!CS_SECURE(cs)) {
uprintf("pid %d (%s): freebsd4_sigreturn cs = 0x%x\n",
td->td_proc->p_pid, td->td_name, cs);
ksiginfo_init_trap(&ksi);
ksi.ksi_signo = SIGBUS;
ksi.ksi_code = BUS_OBJERR;
ksi.ksi_trapno = T_PROTFLT;
ksi.ksi_addr = (void *)regs->tf_eip;
trapsignal(td, &ksi);
return (EINVAL);
}
bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
}
#if defined(COMPAT_43)
if (ucp->uc_mcontext.mc_onstack & 1)
td->td_sigstk.ss_flags |= SS_ONSTACK;
else
td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif
kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
return (EJUSTRETURN);
}
#endif /* COMPAT_FREEBSD4 */
/*
* MPSAFE
*/
int
-sigreturn(td, uap)
+sys_sigreturn(td, uap)
struct thread *td;
struct sigreturn_args /* {
const struct __ucontext *sigcntxp;
} */ *uap;
{
ucontext_t uc;
struct trapframe *regs;
ucontext_t *ucp;
int cs, eflags, error, ret;
ksiginfo_t ksi;
error = copyin(uap->sigcntxp, &uc, sizeof(uc));
if (error != 0)
return (error);
ucp = &uc;
regs = td->td_frame;
eflags = ucp->uc_mcontext.mc_eflags;
if (eflags & PSL_VM) {
struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
struct vm86_kernel *vm86;
/*
* if pcb_ext == 0 or vm86_inited == 0, the user hasn't
* set up the vm86 area, and we can't enter vm86 mode.
*/
if (td->td_pcb->pcb_ext == 0)
return (EINVAL);
vm86 = &td->td_pcb->pcb_ext->ext_vm86;
if (vm86->vm86_inited == 0)
return (EINVAL);
/* Go back to user mode if both flags are set. */
if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
ksiginfo_init_trap(&ksi);
ksi.ksi_signo = SIGBUS;
ksi.ksi_code = BUS_OBJERR;
ksi.ksi_addr = (void *)regs->tf_eip;
trapsignal(td, &ksi);
}
if (vm86->vm86_has_vme) {
eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
(eflags & VME_USERCHANGE) | PSL_VM;
} else {
vm86->vm86_eflags = eflags; /* save VIF, VIP */
eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
(eflags & VM_USERCHANGE) | PSL_VM;
}
bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe));
tf->tf_eflags = eflags;
tf->tf_vm86_ds = tf->tf_ds;
tf->tf_vm86_es = tf->tf_es;
tf->tf_vm86_fs = tf->tf_fs;
tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs;
tf->tf_ds = _udatasel;
tf->tf_es = _udatasel;
tf->tf_fs = _udatasel;
} else {
/*
* Don't allow users to change privileged or reserved flags.
*/
/*
* XXX do allow users to change the privileged flag PSL_RF.
* The cpu sets PSL_RF in tf_eflags for faults. Debuggers
* should sometimes set it there too. tf_eflags is kept in
* the signal context during signal handling and there is no
* other place to remember it, so the PSL_RF bit may be
* corrupted by the signal handler without us knowing.
* Corruption of the PSL_RF bit at worst causes one more or
* one less debugger trap, so allowing it is fairly harmless.
*/
if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
uprintf("pid %d (%s): sigreturn eflags = 0x%x\n",
td->td_proc->p_pid, td->td_name, eflags);
return (EINVAL);
}
/*
* Don't allow users to load a valid privileged %cs. Let the
* hardware check for invalid selectors, excess privilege in
* other selectors, invalid %eip's and invalid %esp's.
*/
cs = ucp->uc_mcontext.mc_cs;
if (!CS_SECURE(cs)) {
uprintf("pid %d (%s): sigreturn cs = 0x%x\n",
td->td_proc->p_pid, td->td_name, cs);
ksiginfo_init_trap(&ksi);
ksi.ksi_signo = SIGBUS;
ksi.ksi_code = BUS_OBJERR;
ksi.ksi_trapno = T_PROTFLT;
ksi.ksi_addr = (void *)regs->tf_eip;
trapsignal(td, &ksi);
return (EINVAL);
}
ret = set_fpcontext(td, &ucp->uc_mcontext);
if (ret != 0)
return (ret);
bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
}
#if defined(COMPAT_43)
if (ucp->uc_mcontext.mc_onstack & 1)
td->td_sigstk.ss_flags |= SS_ONSTACK;
else
td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif
kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
return (EJUSTRETURN);
}
/*
* Machine dependent boot() routine
*
* I haven't seen anything to put here yet
* Possibly some stuff might be grafted back here from boot()
*/
void
cpu_boot(int howto)
{
}
/*
* Flush the D-cache for non-DMA I/O so that the I-cache can
* be made coherent later.
*/
void
cpu_flush_dcache(void *ptr, size_t len)
{
/* Not applicable */
}
/* Get current clock frequency for the given cpu id. */
int
cpu_est_clockrate(int cpu_id, uint64_t *rate)
{
uint64_t tsc1, tsc2;
register_t reg;
if (pcpu_find(cpu_id) == NULL || rate == NULL)
return (EINVAL);
if ((cpu_feature & CPUID_TSC) == 0)
return (EOPNOTSUPP);
#ifdef SMP
if (smp_cpus > 1) {
/* Schedule ourselves on the indicated cpu. */
thread_lock(curthread);
sched_bind(curthread, cpu_id);
thread_unlock(curthread);
}
#endif
/* Calibrate by measuring a short delay. */
reg = intr_disable();
tsc1 = rdtsc();
DELAY(1000);
tsc2 = rdtsc();
intr_restore(reg);
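/* The delay was ~1000us, so scale the TSC delta up to cycles per second. */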
*rate = (tsc2 - tsc1) * 1000;
#ifdef SMP
if (smp_cpus > 1) {
thread_lock(curthread);
sched_unbind(curthread);
thread_unlock(curthread);
}
#endif
return (0);
}
/*
* Shutdown the CPU as much as possible
*/
void
cpu_halt(void)
{
for (;;)
__asm__ ("hlt");
}
static int idle_mwait = 1; /* Use MONITOR/MWAIT for short idle. */
TUNABLE_INT("machdep.idle_mwait", &idle_mwait);
SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RW, &idle_mwait,
0, "Use MONITOR/MWAIT for short idle");
#define STATE_RUNNING 0x0
#define STATE_MWAIT 0x1
#define STATE_SLEEPING 0x2
static void
cpu_idle_hlt(int busy)
{
int *state;
state = (int *)PCPU_PTR(monitorbuf);
*state = STATE_SLEEPING;
/*
* We must absolutely guarantee that hlt is the next instruction
* after sti or we introduce a timing window.
*/
disable_intr();
if (sched_runnable())
enable_intr();
else
__asm __volatile("sti; hlt");
*state = STATE_RUNNING;
}
/*
* MWAIT cpu power states. Lower 4 bits are sub-states.
*/
#define MWAIT_C0 0xf0
#define MWAIT_C1 0x00
#define MWAIT_C2 0x10
#define MWAIT_C3 0x20
#define MWAIT_C4 0x30
static void
cpu_idle_mwait(int busy)
{
int *state;
state = (int *)PCPU_PTR(monitorbuf);
*state = STATE_MWAIT;
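/*
 * Arm MONITOR on the per-CPU state word; a write to it (see
 * cpu_idle_wakeup()) or an interrupt ends the MWAIT.
 */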
if (!sched_runnable()) {
cpu_monitor(state, 0, 0);
if (*state == STATE_MWAIT)
cpu_mwait(0, MWAIT_C1);
}
*state = STATE_RUNNING;
}
static void
cpu_idle_spin(int busy)
{
int *state;
int i;
state = (int *)PCPU_PTR(monitorbuf);
*state = STATE_RUNNING;
for (i = 0; i < 1000; i++) {
if (sched_runnable())
return;
cpu_spinwait();
}
}
void (*cpu_idle_fn)(int) = cpu_idle_hlt;
void
cpu_idle(int busy)
{
CTR2(KTR_SPARE2, "cpu_idle(%d) at %d",
busy, curcpu);
#ifdef MP_WATCHDOG
ap_watchdog(PCPU_GET(cpuid));
#endif
/* If we are busy - try to use fast methods. */
if (busy) {
if ((cpu_feature2 & CPUID2_MON) && idle_mwait) {
cpu_idle_mwait(busy);
goto out;
}
}
/* If we have time - switch timers into idle mode. */
if (!busy) {
critical_enter();
cpu_idleclock();
}
/* Call main idle method. */
cpu_idle_fn(busy);
/* Switch timers back into active mode. */
if (!busy) {
cpu_activeclock();
critical_exit();
}
out:
CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done",
busy, curcpu);
}
int
cpu_idle_wakeup(int cpu)
{
struct pcpu *pcpu;
int *state;
pcpu = pcpu_find(cpu);
state = (int *)pcpu->pc_monitorbuf;
/*
* This doesn't need to be atomic since missing the race will
* simply result in unnecessary IPIs.
*/
if (*state == STATE_SLEEPING)
return (0);
if (*state == STATE_MWAIT)
*state = STATE_RUNNING;
return (1);
}
/*
* Ordered by speed/power consumption.
*/
struct {
void *id_fn;
char *id_name;
} idle_tbl[] = {
{ cpu_idle_spin, "spin" },
{ cpu_idle_mwait, "mwait" },
{ cpu_idle_hlt, "hlt" },
{ NULL, NULL }
};
static int
idle_sysctl_available(SYSCTL_HANDLER_ARGS)
{
char *avail, *p;
int error;
int i;
avail = malloc(256, M_TEMP, M_WAITOK);
p = avail;
for (i = 0; idle_tbl[i].id_name != NULL; i++) {
if (strstr(idle_tbl[i].id_name, "mwait") &&
(cpu_feature2 & CPUID2_MON) == 0)
continue;
p += sprintf(p, "%s%s", p != avail ? ", " : "",
idle_tbl[i].id_name);
}
error = sysctl_handle_string(oidp, avail, 0, req);
free(avail, M_TEMP);
return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD,
0, 0, idle_sysctl_available, "A", "list of available idle functions");
static int
idle_sysctl(SYSCTL_HANDLER_ARGS)
{
char buf[16];
int error;
char *p;
int i;
p = "unknown";
for (i = 0; idle_tbl[i].id_name != NULL; i++) {
if (idle_tbl[i].id_fn == cpu_idle_fn) {
p = idle_tbl[i].id_name;
break;
}
}
strncpy(buf, p, sizeof(buf));
error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
if (error != 0 || req->newptr == NULL)
return (error);
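/*
 * Look up the requested idle routine by name, skipping mwait when
 * the CPU lacks MONITOR/MWAIT support.
 */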
for (i = 0; idle_tbl[i].id_name != NULL; i++) {
if (strstr(idle_tbl[i].id_name, "mwait") &&
(cpu_feature2 & CPUID2_MON) == 0)
continue;
if (strcmp(idle_tbl[i].id_name, buf))
continue;
cpu_idle_fn = idle_tbl[i].id_fn;
return (0);
}
return (EINVAL);
}
SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0,
idle_sysctl, "A", "currently selected idle function");
uint64_t (*atomic_load_acq_64)(volatile uint64_t *) =
atomic_load_acq_64_i386;
void (*atomic_store_rel_64)(volatile uint64_t *, uint64_t) =
atomic_store_rel_64_i386;
static void
cpu_probe_cmpxchg8b(void)
{
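/*
 * If the CPU implements CMPXCHG8B, use the i586 versions of the
 * 64-bit atomic load/store helpers, which rely on that instruction.
 */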
if ((cpu_feature & CPUID_CX8) != 0) {
atomic_load_acq_64 = atomic_load_acq_64_i586;
atomic_store_rel_64 = atomic_store_rel_64_i586;
}
}
/*
* Reset registers to default values on exec.
*/
void
exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
struct trapframe *regs = td->td_frame;
struct pcb *pcb = td->td_pcb;
/* Reset pc->pcb_gs and %gs before possibly invalidating it. */
pcb->pcb_gs = _udatasel;
load_gs(_udatasel);
mtx_lock_spin(&dt_lock);
if (td->td_proc->p_md.md_ldt)
user_ldt_free(td);
else
mtx_unlock_spin(&dt_lock);
bzero((char *)regs, sizeof(struct trapframe));
regs->tf_eip = imgp->entry_addr;
regs->tf_esp = stack;
regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T);
regs->tf_ss = _udatasel;
regs->tf_ds = _udatasel;
regs->tf_es = _udatasel;
regs->tf_fs = _udatasel;
regs->tf_cs = _ucodesel;
/* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */
regs->tf_ebx = imgp->ps_strings;
/*
* Reset the hardware debug registers if they were in use.
* They won't have any meaning for the newly exec'd process.
*/
if (pcb->pcb_flags & PCB_DBREGS) {
pcb->pcb_dr0 = 0;
pcb->pcb_dr1 = 0;
pcb->pcb_dr2 = 0;
pcb->pcb_dr3 = 0;
pcb->pcb_dr6 = 0;
pcb->pcb_dr7 = 0;
if (pcb == PCPU_GET(curpcb)) {
/*
* Clear the debug registers on the running
* CPU, otherwise they will end up affecting
* the next process we switch to.
*/
reset_dbregs();
}
pcb->pcb_flags &= ~PCB_DBREGS;
}
/*
* Initialize the math emulator (if any) for the current process.
* Actually, just clear the bit that says that the emulator has
* been initialized. Initialization is delayed until the process
* traps to the emulator (if it is done at all) mainly because
* emulators don't provide an entry point for initialization.
*/
td->td_pcb->pcb_flags &= ~FP_SOFTFP;
pcb->pcb_initial_npxcw = __INITIAL_NPXCW__;
/*
* Drop the FP state if we hold it, so that the process gets a
* clean FP state if it uses the FPU again.
*/
fpstate_drop(td);
/*
* XXX - Linux emulator
* Make sure edx is 0x0 on entry. Linux binaries depend
* on it.
*/
td->td_retval[1] = 0;
}
void
cpu_setregs(void)
{
unsigned int cr0;
cr0 = rcr0();
/*
* CR0_MP, CR0_NE and CR0_TS are set for NPX (FPU) support:
*
* Prepare to trap all ESC (i.e., NPX) instructions and all WAIT
* instructions. We must set the CR0_MP bit and use the CR0_TS
* bit to control the trap, because setting the CR0_EM bit does
* not cause WAIT instructions to trap. It's important to trap
* WAIT instructions - otherwise the "wait" variants of no-wait
* control instructions would degenerate to the "no-wait" variants
* after FP context switches but work correctly otherwise. It's
* particularly important to trap WAITs when there is no NPX -
* otherwise the "wait" variants would always degenerate.
*
* Try setting CR0_NE to get correct error reporting on 486DX's.
* Setting it should fail or do nothing on lesser processors.
*/
cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
load_cr0(cr0);
load_gs(_udatasel);
}
u_long bootdev; /* not a struct cdev *- encoding is different */
SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev,
CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)");
/*
* Initialize 386 and configure to run kernel
*/
/*
* Initialize segments & interrupt table
*/
int _default_ldt;
union descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */
union descriptor ldt[NLDT]; /* local descriptor table */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */
struct region_descriptor r_gdt, r_idt; /* table descriptors */
struct mtx dt_lock; /* lock for GDT and LDT */
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
extern int has_f00f_bug;
#endif
static struct i386tss dblfault_tss;
static char dblfault_stack[PAGE_SIZE];
extern vm_offset_t proc0kstack;
/*
* software prototypes -- in more palatable form.
*
* GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret
* GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it)
*/
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL 0 Null Descriptor */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = SEL_KPL,
.ssd_p = 0,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GPRIV_SEL 1 SMP Per-Processor Private Data Descriptor */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_KPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GUFS_SEL 2 %fs Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GUGS_SEL 3 %gs Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GCODE_SEL 4 Code Descriptor for kernel */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = SEL_KPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GDATA_SEL 5 Data Descriptor for kernel */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_KPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GUCODE_SEL 6 Code Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GUDATA_SEL 7 Data Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */
{ .ssd_base = 0x400,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_KPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GPROC0_SEL 9 Proc 0 Tss Descriptor */
{
.ssd_base = 0x0,
.ssd_limit = sizeof(struct i386tss)-1,
.ssd_type = SDT_SYS386TSS,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GLDT_SEL 10 LDT Descriptor */
{ .ssd_base = (int) ldt,
.ssd_limit = sizeof(ldt)-1,
.ssd_type = SDT_SYSLDT,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GUSERLDT_SEL 11 User LDT Descriptor per process */
{ .ssd_base = (int) ldt,
.ssd_limit = (512 * sizeof(union descriptor)-1),
.ssd_type = SDT_SYSLDT,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GPANIC_SEL 12 Panic Tss Descriptor */
{ .ssd_base = (int) &dblfault_tss,
.ssd_limit = sizeof(struct i386tss)-1,
.ssd_type = SDT_SYS386TSS,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */
{ .ssd_base = 0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 1 },
/* GBIOSCODE16_SEL 14 BIOS 32-bit interface (16bit Code) */
{ .ssd_base = 0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 1 },
/* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */
{ .ssd_base = 0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GBIOSUTIL_SEL 16 BIOS 16-bit interface (Utility) */
{ .ssd_base = 0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 1 },
/* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */
{ .ssd_base = 0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 1 },
/* GNDIS_SEL 18 NDIS Descriptor */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
};
static struct soft_segment_descriptor ldt_segs[] = {
/* Null Descriptor - overwritten by call gate */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* Null Descriptor - overwritten by call gate */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* Null Descriptor - overwritten by call gate */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* Code Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* Null Descriptor - overwritten by call gate */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* Data Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
};
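/*
 * Fill in an IDT gate descriptor: point entry 'idx' at handler 'func'
 * with the given gate type, privilege level and code selector.
 */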
void
setidt(idx, func, typ, dpl, selec)
int idx;
inthand_t *func;
int typ;
int dpl;
int selec;
{
struct gate_descriptor *ip;
ip = idt + idx;
ip->gd_looffset = (int)func;
ip->gd_selector = selec;
ip->gd_stkcpy = 0;
ip->gd_xx = 0;
ip->gd_type = typ;
ip->gd_dpl = dpl;
ip->gd_p = 1;
ip->gd_hioffset = ((int)func)>>16 ;
}
extern inthand_t
IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
IDTVEC(xmm), IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall);
#ifdef DDB
/*
* Display the index and function name of any IDT entries that don't use
* the default 'rsvd' entry point.
*/
DB_SHOW_COMMAND(idt, db_show_idt)
{
struct gate_descriptor *ip;
int idx;
uintptr_t func;
ip = idt;
for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
func = (ip->gd_hioffset << 16 | ip->gd_looffset);
if (func != (uintptr_t)&IDTVEC(rsvd)) {
db_printf("%3d\t", idx);
db_printsym(func, DB_STGY_PROC);
db_printf("\n");
}
ip++;
}
}
/* Show privileged registers. */
DB_SHOW_COMMAND(sysregs, db_show_sysregs)
{
uint64_t idtr, gdtr;
idtr = ridt();
db_printf("idtr\t0x%08x/%04x\n",
(u_int)(idtr >> 16), (u_int)idtr & 0xffff);
gdtr = rgdt();
db_printf("gdtr\t0x%08x/%04x\n",
(u_int)(gdtr >> 16), (u_int)gdtr & 0xffff);
db_printf("ldtr\t0x%04x\n", rldt());
db_printf("tr\t0x%04x\n", rtr());
db_printf("cr0\t0x%08x\n", rcr0());
db_printf("cr2\t0x%08x\n", rcr2());
db_printf("cr3\t0x%08x\n", rcr3());
db_printf("cr4\t0x%08x\n", rcr4());
}
#endif
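/*
 * Convert a hardware segment descriptor into the software
 * (soft_segment_descriptor) representation.
 */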
void
sdtossd(sd, ssd)
struct segment_descriptor *sd;
struct soft_segment_descriptor *ssd;
{
ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
ssd->ssd_type = sd->sd_type;
ssd->ssd_dpl = sd->sd_dpl;
ssd->ssd_p = sd->sd_p;
ssd->ssd_def32 = sd->sd_def32;
ssd->ssd_gran = sd->sd_gran;
}
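/*
 * Sanity-check the BIOS-reported base memory size and map the region
 * between the end of base memory and ISA_HOLE_START for both the
 * kernel and the vm86 page table.
 */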
static void
basemem_setup(void)
{
vm_paddr_t pa;
pt_entry_t *pte;
int i;
if (basemem > 640) {
printf("Preposterous BIOS basemem of %uK, truncating to 640K\n",
basemem);
basemem = 640;
}
/*
* XXX if biosbasemem is now < 640, there is a `hole'
* between the end of base memory and the start of
* ISA memory. The hole may be empty or it may
* contain BIOS code or data. Map it read/write so
* that the BIOS can write to it. (Memory from 0 to
* the physical end of the kernel is mapped read-only
* to begin with and then parts of it are remapped.
* The parts that aren't remapped form holes that
* remain read-only and are unused by the kernel.
* The base memory area is below the physical end of
* the kernel and right now forms a read-only hole.
* The part of it from PAGE_SIZE to
* (trunc_page(biosbasemem * 1024) - 1) will be
* remapped and used by the kernel later.)
*
* This code is similar to the code used in
* pmap_mapdev, but since no memory needs to be
* allocated we simply change the mapping.
*/
for (pa = trunc_page(basemem * 1024);
pa < ISA_HOLE_START; pa += PAGE_SIZE)
pmap_kenter(KERNBASE + pa, pa);
/*
* Map pages between basemem and ISA_HOLE_START, if any, r/w into
* the vm86 page table so that vm86 can scribble on them using
* the vm86 map too. XXX: why 2 ways for this and only 1 way for
* page 0, at least as initialized here?
*/
pte = (pt_entry_t *)vm86paddr;
for (i = basemem / 4; i < 160; i++)
pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U;
}
/*
* Populate the (physmap) array with base/bound pairs describing the
* available physical memory in the system, then test this memory and
* build the phys_avail array describing the actually-available memory.
*
* If we cannot accurately determine the physical memory map, then use
* value from the 0xE801 call, and failing that, the RTC.
*
* Total memory size may be set by the kernel environment variable
* hw.physmem or the compile-time define MAXMEM.
*
* XXX first should be vm_paddr_t.
*/
static void
getmemsize(int first)
{
int off, physmap_idx, pa_indx, da_indx;
u_long physmem_tunable, memtest;
vm_paddr_t physmap[PHYSMAP_SIZE];
pt_entry_t *pte;
quad_t dcons_addr, dcons_size;
int i;
int pg_n;
u_int extmem;
u_int under16;
vm_paddr_t pa;
bzero(physmap, sizeof(physmap));
/* XXX - some EPSON machines can't use PG_N */
pg_n = PG_N;
if (pc98_machine_type & M_EPSON_PC98) {
switch (epson_machine_id) {
#ifdef WB_CACHE
default:
#endif
case EPSON_PC486_HX:
case EPSON_PC486_HG:
case EPSON_PC486_HA:
pg_n = 0;
break;
}
}
under16 = pc98_getmemsize(&basemem, &extmem);
basemem_setup();
physmap[0] = 0;
physmap[1] = basemem * 1024;
physmap_idx = 2;
physmap[physmap_idx] = 0x100000;
physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024;
/*
* Now, physmap contains a map of physical memory.
*/
#ifdef SMP
/* make hole for AP bootstrap code */
physmap[1] = mp_bootaddress(physmap[1]);
#endif
/*
* Maxmem isn't the "maximum memory", it's one larger than the
* highest page of the physical address space. It should be
* called something like "Maxphyspage". We may adjust this
* based on ``hw.physmem'' and the results of the memory test.
*/
Maxmem = atop(physmap[physmap_idx + 1]);
#ifdef MAXMEM
Maxmem = MAXMEM / 4;
#endif
if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
Maxmem = atop(physmem_tunable);
/*
* By default keep the memtest enabled. Use a general name so that
* one could eventually do more with the code than just disable it.
*/
memtest = 1;
TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
if (atop(physmap[physmap_idx + 1]) != Maxmem &&
(boothowto & RB_VERBOSE))
printf("Physical memory use set to %ldK\n", Maxmem * 4);
/*
* If Maxmem has been increased beyond what the system has detected,
* extend the last memory segment to the new limit.
*/
if (atop(physmap[physmap_idx + 1]) < Maxmem)
physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem);
/*
* We need to divide the chunk if Maxmem is larger than 16MB and
* the under-16MB area is not full of memory.
* (1) the system area (15-16MB region) is cut off
* (2) extended memory exists only above the 16MB area (e.g. Melco "HYPERMEMORY")
*/
if ((under16 != 16 * 1024) && (extmem > 15 * 1024)) {
/* The 15M-16M region is cut off, so we need to divide the chunk */
physmap[physmap_idx + 1] = under16 * 1024;
physmap_idx += 2;
physmap[physmap_idx] = 0x1000000;
physmap[physmap_idx + 1] = physmap[2] + extmem * 1024;
}
/* call pmap initialization to make new kernel address space */
pmap_bootstrap(first);
/*
* Size up each available chunk of physical memory.
*/
physmap[0] = PAGE_SIZE; /* mask off page 0 */
pa_indx = 0;
da_indx = 1;
phys_avail[pa_indx++] = physmap[0];
phys_avail[pa_indx] = physmap[0];
dump_avail[da_indx] = physmap[0];
pte = CMAP1;
/*
* Get dcons buffer address
*/
if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
getenv_quad("dcons.size", &dcons_size) == 0)
dcons_addr = 0;
/*
* physmap is in bytes, so when converting to page boundaries,
* round up the start address and round down the end address.
*/
for (i = 0; i <= physmap_idx; i += 2) {
vm_paddr_t end;
end = ptoa((vm_paddr_t)Maxmem);
if (physmap[i + 1] < end)
end = trunc_page(physmap[i + 1]);
for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
int tmp, page_bad, full;
int *ptr = (int *)CADDR1;
full = FALSE;
/*
* block out kernel memory as not available.
*/
if (pa >= KERNLOAD && pa < first)
goto do_dump_avail;
/*
* block out dcons buffer
*/
if (dcons_addr > 0
&& pa >= trunc_page(dcons_addr)
&& pa < dcons_addr + dcons_size)
goto do_dump_avail;
page_bad = FALSE;
if (memtest == 0)
goto skip_memtest;
/*
* map page into kernel: valid, read/write, non-cacheable
*/
*pte = pa | PG_V | PG_RW | pg_n;
invltlb();
tmp = *(int *)ptr;
/*
* Test for alternating 1's and 0's
*/
*(volatile int *)ptr = 0xaaaaaaaa;
if (*(volatile int *)ptr != 0xaaaaaaaa)
page_bad = TRUE;
/*
* Test for alternating 0's and 1's
*/
*(volatile int *)ptr = 0x55555555;
if (*(volatile int *)ptr != 0x55555555)
page_bad = TRUE;
/*
* Test for all 1's
*/
*(volatile int *)ptr = 0xffffffff;
if (*(volatile int *)ptr != 0xffffffff)
page_bad = TRUE;
/*
* Test for all 0's
*/
*(volatile int *)ptr = 0x0;
if (*(volatile int *)ptr != 0x0)
page_bad = TRUE;
/*
* Restore original value.
*/
*(int *)ptr = tmp;
skip_memtest:
/*
* Adjust array of valid/good pages.
*/
if (page_bad == TRUE)
continue;
/*
* If this good page is a continuation of the
* previous set of good pages, then just increase
* the end pointer. Otherwise start a new chunk.
* Note that "end" points one higher than end,
* making the range >= start and < end.
* If we're also doing a speculative memory
* test and we at or past the end, bump up Maxmem
* so that we keep going. The first bad page
* will terminate the loop.
*/
if (phys_avail[pa_indx] == pa) {
phys_avail[pa_indx] += PAGE_SIZE;
} else {
pa_indx++;
if (pa_indx == PHYS_AVAIL_ARRAY_END) {
printf(
"Too many holes in the physical address space, giving up\n");
pa_indx--;
full = TRUE;
goto do_dump_avail;
}
phys_avail[pa_indx++] = pa; /* start */
phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
}
physmem++;
do_dump_avail:
if (dump_avail[da_indx] == pa) {
dump_avail[da_indx] += PAGE_SIZE;
} else {
da_indx++;
if (da_indx == DUMP_AVAIL_ARRAY_END) {
da_indx--;
goto do_next;
}
dump_avail[da_indx++] = pa; /* start */
dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
}
do_next:
if (full)
break;
}
}
*pte = 0;
invltlb();
/*
* XXX
* The last chunk must contain at least one page plus the message
* buffer to avoid complicating other code (message buffer address
* calculation, etc.).
*/
while (phys_avail[pa_indx - 1] + PAGE_SIZE +
round_page(msgbufsize) >= phys_avail[pa_indx]) {
physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
phys_avail[pa_indx--] = 0;
phys_avail[pa_indx--] = 0;
}
Maxmem = atop(phys_avail[pa_indx]);
/* Trim off space for the message buffer. */
phys_avail[pa_indx] -= round_page(msgbufsize);
/* Map the message buffer. */
for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE)
pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] +
off);
}
void
init386(first)
int first;
{
struct gate_descriptor *gdp;
int gsel_tss, metadata_missing, x, pa;
size_t kstack0_sz;
struct pcpu *pc;
thread0.td_kstack = proc0kstack;
thread0.td_kstack_pages = KSTACK_PAGES;
kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
thread0.td_pcb = (struct pcb *)(thread0.td_kstack + kstack0_sz) - 1;
/*
* This may be done better later if it gets more high level
* components in it. If so just link td->td_proc here.
*/
proc_linkup0(&proc0, &thread0);
/*
* Initialize DMAC
*/
pc98_init_dmac();
metadata_missing = 0;
if (bootinfo.bi_modulep) {
preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
preload_bootstrap_relocate(KERNBASE);
} else {
metadata_missing = 1;
}
if (envmode == 1)
kern_envp = static_env;
else if (bootinfo.bi_envp)
kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;
/* Init basic tunables, hz etc */
init_param1();
/*
* Make gdt memory segments. All segments cover the full 4GB
* of address space and permissions are enforced at page level.
*/
gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1);
gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1);
gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1);
gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1);
gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1);
gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1);
pc = &__pcpu[0];
gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1);
gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;
for (x = 0; x < NGDT; x++)
ssdtosd(&gdt_segs[x], &gdt[x].sd);
r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
r_gdt.rd_base = (int) gdt;
mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN);
lgdt(&r_gdt);
pcpu_init(pc, 0, sizeof(struct pcpu));
for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE)
pmap_kenter(pa + KERNBASE, pa);
dpcpu_init((void *)(first + KERNBASE), 0);
first += DPCPU_SIZE;
PCPU_SET(prvspace, pc);
PCPU_SET(curthread, &thread0);
PCPU_SET(curpcb, thread0.td_pcb);
/*
* Initialize mutexes.
*
* icu_lock: in order to allow an interrupt to occur in a critical
* section, to set pcpu->ipending (etc...) properly, we
* must be able to get the icu lock, so it can't be
* under witness.
*/
mutex_init();
mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE);
/* make ldt memory segments */
ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1);
ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1);
for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++)
ssdtosd(&ldt_segs[x], &ldt[x].sd);
_default_ldt = GSEL(GLDT_SEL, SEL_KPL);
lldt(_default_ldt);
PCPU_SET(currentldt, _default_ldt);
/* exceptions */
for (x = 0; x < NIDT; x++)
setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_DE, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_DB, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYS386IGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_BP, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL
, GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_DF, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL));
setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_PF, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_AC, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL,
GSEL(GCODE_SEL, SEL_KPL));
r_idt.rd_limit = sizeof(idt0) - 1;
r_idt.rd_base = (int) idt;
lidt(&r_idt);
/*
* Initialize the i8254 before the console so that console
* initialization can use DELAY().
*/
i8254_init();
/*
* Initialize the console before we print anything out.
*/
cninit();
if (metadata_missing)
printf("WARNING: loader(8) metadata is missing!\n");
#ifdef DEV_ISA
atpic_startup();
#endif
#ifdef DDB
ksym_start = bootinfo.bi_symtab;
ksym_end = bootinfo.bi_esymtab;
#endif
kdb_init();
#ifdef KDB
if (boothowto & RB_KDB)
kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
finishidentcpu(); /* Final stage of CPU initialization */
setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
initializecpu(); /* Initialize CPU registers */
/* make an initial tss so cpu can get interrupt stack on syscall! */
/* Note: -16 is so we can grow the trapframe if we came from vm86 */
PCPU_SET(common_tss.tss_esp0, thread0.td_kstack +
kstack0_sz - sizeof(struct pcb) - 16);
PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd);
PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
ltr(gsel_tss);
/* pointer to selector slot for %fs/%gs */
PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);
dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 =
dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)];
dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 =
dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL);
dblfault_tss.tss_cr3 = (int)IdlePTD;
dblfault_tss.tss_eip = (int)dblfault_handler;
dblfault_tss.tss_eflags = PSL_KERNEL;
dblfault_tss.tss_ds = dblfault_tss.tss_es =
dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL);
dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL);
dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL);
dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL);
vm86_initialize();
getmemsize(first);
init_param2(physmem);
/* now running on new page tables, configured, and u/iom is accessible */
msgbufinit(msgbufp, msgbufsize);
/* make a call gate to reenter kernel with */
gdp = &ldt[LSYS5CALLS_SEL].gd;
x = (int) &IDTVEC(lcall_syscall);
gdp->gd_looffset = x;
gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL);
gdp->gd_stkcpy = 1;
gdp->gd_type = SDT_SYS386CGT;
gdp->gd_dpl = SEL_UPL;
gdp->gd_p = 1;
gdp->gd_hioffset = x >> 16;
/* XXX does this work? */
/* XXX yes! */
ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL];
ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL];
/* transfer to user mode */
_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
/* setup proc 0's pcb */
thread0.td_pcb->pcb_flags = 0;
thread0.td_pcb->pcb_cr3 = (int)IdlePTD;
thread0.td_pcb->pcb_ext = 0;
thread0.td_frame = &proc0_tf;
cpu_probe_cmpxchg8b();
}
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{
}
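/*
 * Disable interrupts on the outermost acquisition, keep a nesting
 * count, and enter a critical section.
 */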
void
spinlock_enter(void)
{
struct thread *td;
register_t flags;
td = curthread;
if (td->td_md.md_spinlock_count == 0) {
flags = intr_disable();
td->td_md.md_spinlock_count = 1;
td->td_md.md_saved_flags = flags;
} else
td->td_md.md_spinlock_count++;
critical_enter();
}
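/*
 * Undo spinlock_enter(): leave the critical section and restore the
 * saved interrupt state once the nesting count drops to zero.
 */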
void
spinlock_exit(void)
{
struct thread *td;
register_t flags;
td = curthread;
critical_exit();
flags = td->td_md.md_saved_flags;
td->td_md.md_spinlock_count--;
if (td->td_md.md_spinlock_count == 0)
intr_restore(flags);
}
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
static void f00f_hack(void *unused);
SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL);
static void
f00f_hack(void *unused)
{
struct gate_descriptor *new_idt;
vm_offset_t tmp;
if (!has_f00f_bug)
return;
GIANT_REQUIRED;
printf("Intel Pentium detected, installing workaround for F00F bug\n");
tmp = kmem_alloc(kernel_map, PAGE_SIZE * 2);
if (tmp == 0)
panic("kmem_alloc returned 0");
/* Put the problematic entry (#6) at the end of the lower page. */
new_idt = (struct gate_descriptor*)
(tmp + PAGE_SIZE - 7 * sizeof(struct gate_descriptor));
bcopy(idt, new_idt, sizeof(idt0));
r_idt.rd_base = (u_int)new_idt;
lidt(&r_idt);
idt = new_idt;
if (vm_map_protect(kernel_map, tmp, tmp + PAGE_SIZE,
VM_PROT_READ, FALSE) != KERN_SUCCESS)
panic("vm_map_protect failed");
}
#endif /* defined(I586_CPU) && !NO_F00F_HACK */
/*
* Construct a PCB from a trapframe. This is called from kdb_trap() where
* we want to start a backtrace from the function that caused us to enter
* the debugger. We have the context in the trapframe, but base the trace
* on the PCB. The PCB doesn't have to be perfect, as long as it contains
* enough for a backtrace.
*/
void
makectx(struct trapframe *tf, struct pcb *pcb)
{
pcb->pcb_edi = tf->tf_edi;
pcb->pcb_esi = tf->tf_esi;
pcb->pcb_ebp = tf->tf_ebp;
pcb->pcb_ebx = tf->tf_ebx;
pcb->pcb_eip = tf->tf_eip;
pcb->pcb_esp = (ISPL(tf->tf_cs)) ? tf->tf_esp : (int)(tf + 1) - 8;
}
int
ptrace_set_pc(struct thread *td, u_long addr)
{
td->td_frame->tf_eip = addr;
return (0);
}
int
ptrace_single_step(struct thread *td)
{
td->td_frame->tf_eflags |= PSL_T;
return (0);
}
int
ptrace_clear_single_step(struct thread *td)
{
td->td_frame->tf_eflags &= ~PSL_T;
return (0);
}
int
fill_regs(struct thread *td, struct reg *regs)
{
struct pcb *pcb;
struct trapframe *tp;
tp = td->td_frame;
pcb = td->td_pcb;
regs->r_gs = pcb->pcb_gs;
return (fill_frame_regs(tp, regs));
}
int
fill_frame_regs(struct trapframe *tp, struct reg *regs)
{
regs->r_fs = tp->tf_fs;
regs->r_es = tp->tf_es;
regs->r_ds = tp->tf_ds;
regs->r_edi = tp->tf_edi;
regs->r_esi = tp->tf_esi;
regs->r_ebp = tp->tf_ebp;
regs->r_ebx = tp->tf_ebx;
regs->r_edx = tp->tf_edx;
regs->r_ecx = tp->tf_ecx;
regs->r_eax = tp->tf_eax;
regs->r_eip = tp->tf_eip;
regs->r_cs = tp->tf_cs;
regs->r_eflags = tp->tf_eflags;
regs->r_esp = tp->tf_esp;
regs->r_ss = tp->tf_ss;
return (0);
}
int
set_regs(struct thread *td, struct reg *regs)
{
struct pcb *pcb;
struct trapframe *tp;
tp = td->td_frame;
if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) ||
!CS_SECURE(regs->r_cs))
return (EINVAL);
pcb = td->td_pcb;
tp->tf_fs = regs->r_fs;
tp->tf_es = regs->r_es;
tp->tf_ds = regs->r_ds;
tp->tf_edi = regs->r_edi;
tp->tf_esi = regs->r_esi;
tp->tf_ebp = regs->r_ebp;
tp->tf_ebx = regs->r_ebx;
tp->tf_edx = regs->r_edx;
tp->tf_ecx = regs->r_ecx;
tp->tf_eax = regs->r_eax;
tp->tf_eip = regs->r_eip;
tp->tf_cs = regs->r_cs;
tp->tf_eflags = regs->r_eflags;
tp->tf_esp = regs->r_esp;
tp->tf_ss = regs->r_ss;
pcb->pcb_gs = regs->r_gs;
return (0);
}
#ifdef CPU_ENABLE_SSE
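/*
 * Translate an FXSAVE (XMM) save area into the legacy save87 layout.
 */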
static void
fill_fpregs_xmm(sv_xmm, sv_87)
struct savexmm *sv_xmm;
struct save87 *sv_87;
{
register struct env87 *penv_87 = &sv_87->sv_env;
register struct envxmm *penv_xmm = &sv_xmm->sv_env;
int i;
bzero(sv_87, sizeof(*sv_87));
/* FPU control/status */
penv_87->en_cw = penv_xmm->en_cw;
penv_87->en_sw = penv_xmm->en_sw;
penv_87->en_tw = penv_xmm->en_tw;
penv_87->en_fip = penv_xmm->en_fip;
penv_87->en_fcs = penv_xmm->en_fcs;
penv_87->en_opcode = penv_xmm->en_opcode;
penv_87->en_foo = penv_xmm->en_foo;
penv_87->en_fos = penv_xmm->en_fos;
/* FPU registers */
for (i = 0; i < 8; ++i)
sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;
}
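/*
 * Translate a legacy save87 area back into the FXSAVE (XMM) layout.
 */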
static void
set_fpregs_xmm(sv_87, sv_xmm)
struct save87 *sv_87;
struct savexmm *sv_xmm;
{
register struct env87 *penv_87 = &sv_87->sv_env;
register struct envxmm *penv_xmm = &sv_xmm->sv_env;
int i;
/* FPU control/status */
penv_xmm->en_cw = penv_87->en_cw;
penv_xmm->en_sw = penv_87->en_sw;
penv_xmm->en_tw = penv_87->en_tw;
penv_xmm->en_fip = penv_87->en_fip;
penv_xmm->en_fcs = penv_87->en_fcs;
penv_xmm->en_opcode = penv_87->en_opcode;
penv_xmm->en_foo = penv_87->en_foo;
penv_xmm->en_fos = penv_87->en_fos;
/* FPU registers */
for (i = 0; i < 8; ++i)
sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];
}
#endif /* CPU_ENABLE_SSE */
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{
KASSERT(td == curthread || TD_IS_SUSPENDED(td),
("not suspended thread %p", td));
#ifdef DEV_NPX
npxgetregs(td);
#else
bzero(fpregs, sizeof(*fpregs));
#endif
#ifdef CPU_ENABLE_SSE
if (cpu_fxsr)
fill_fpregs_xmm(&td->td_pcb->pcb_user_save.sv_xmm,
(struct save87 *)fpregs);
else
#endif /* CPU_ENABLE_SSE */
bcopy(&td->td_pcb->pcb_user_save.sv_87, fpregs,
sizeof(*fpregs));
return (0);
}
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{
#ifdef CPU_ENABLE_SSE
if (cpu_fxsr)
set_fpregs_xmm((struct save87 *)fpregs,
&td->td_pcb->pcb_user_save.sv_xmm);
else
#endif /* CPU_ENABLE_SSE */
bcopy(fpregs, &td->td_pcb->pcb_user_save.sv_87,
sizeof(*fpregs));
#ifdef DEV_NPX
npxuserinited(td);
#endif
return (0);
}
/*
* Get machine context.
*/
int
get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
{
struct trapframe *tp;
struct segment_descriptor *sdp;
tp = td->td_frame;
PROC_LOCK(curthread->td_proc);
mcp->mc_onstack = sigonstack(tp->tf_esp);
PROC_UNLOCK(curthread->td_proc);
mcp->mc_gs = td->td_pcb->pcb_gs;
mcp->mc_fs = tp->tf_fs;
mcp->mc_es = tp->tf_es;
mcp->mc_ds = tp->tf_ds;
mcp->mc_edi = tp->tf_edi;
mcp->mc_esi = tp->tf_esi;
mcp->mc_ebp = tp->tf_ebp;
mcp->mc_isp = tp->tf_isp;
mcp->mc_eflags = tp->tf_eflags;
if (flags & GET_MC_CLEAR_RET) {
mcp->mc_eax = 0;
mcp->mc_edx = 0;
mcp->mc_eflags &= ~PSL_C;
} else {
mcp->mc_eax = tp->tf_eax;
mcp->mc_edx = tp->tf_edx;
}
mcp->mc_ebx = tp->tf_ebx;
mcp->mc_ecx = tp->tf_ecx;
mcp->mc_eip = tp->tf_eip;
mcp->mc_cs = tp->tf_cs;
mcp->mc_esp = tp->tf_esp;
mcp->mc_ss = tp->tf_ss;
mcp->mc_len = sizeof(*mcp);
get_fpcontext(td, mcp);
sdp = &td->td_pcb->pcb_fsd;
mcp->mc_fsbase = sdp->sd_hibase << 24 | sdp->sd_lobase;
sdp = &td->td_pcb->pcb_gsd;
mcp->mc_gsbase = sdp->sd_hibase << 24 | sdp->sd_lobase;
bzero(mcp->mc_spare1, sizeof(mcp->mc_spare1));
bzero(mcp->mc_spare2, sizeof(mcp->mc_spare2));
return (0);
}
/*
* Set machine context.
*
* However, we don't set any but the user modifiable flags, and we won't
* touch the cs selector.
*/
int
set_mcontext(struct thread *td, const mcontext_t *mcp)
{
struct trapframe *tp;
int eflags, ret;
tp = td->td_frame;
if (mcp->mc_len != sizeof(*mcp))
return (EINVAL);
eflags = (mcp->mc_eflags & PSL_USERCHANGE) |
(tp->tf_eflags & ~PSL_USERCHANGE);
if ((ret = set_fpcontext(td, mcp)) == 0) {
tp->tf_fs = mcp->mc_fs;
tp->tf_es = mcp->mc_es;
tp->tf_ds = mcp->mc_ds;
tp->tf_edi = mcp->mc_edi;
tp->tf_esi = mcp->mc_esi;
tp->tf_ebp = mcp->mc_ebp;
tp->tf_ebx = mcp->mc_ebx;
tp->tf_edx = mcp->mc_edx;
tp->tf_ecx = mcp->mc_ecx;
tp->tf_eax = mcp->mc_eax;
tp->tf_eip = mcp->mc_eip;
tp->tf_eflags = eflags;
tp->tf_esp = mcp->mc_esp;
tp->tf_ss = mcp->mc_ss;
td->td_pcb->pcb_gs = mcp->mc_gs;
ret = 0;
}
return (ret);
}
static void
get_fpcontext(struct thread *td, mcontext_t *mcp)
{
#ifndef DEV_NPX
mcp->mc_fpformat = _MC_FPFMT_NODEV;
mcp->mc_ownedfp = _MC_FPOWNED_NONE;
bzero(mcp->mc_fpstate, sizeof(mcp->mc_fpstate));
#else
mcp->mc_ownedfp = npxgetregs(td);
bcopy(&td->td_pcb->pcb_user_save, &mcp->mc_fpstate,
sizeof(mcp->mc_fpstate));
mcp->mc_fpformat = npxformat();
#endif
}
static int
set_fpcontext(struct thread *td, const mcontext_t *mcp)
{
if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
return (0);
else if (mcp->mc_fpformat != _MC_FPFMT_387 &&
mcp->mc_fpformat != _MC_FPFMT_XMM)
return (EINVAL);
else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE)
/* We don't care what state is left in the FPU or PCB. */
fpstate_drop(td);
else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
#ifdef DEV_NPX
#ifdef CPU_ENABLE_SSE
if (cpu_fxsr)
((union savefpu *)&mcp->mc_fpstate)->sv_xmm.sv_env.
en_mxcsr &= cpu_mxcsr_mask;
#endif
npxsetregs(td, (union savefpu *)&mcp->mc_fpstate);
#endif
} else
return (EINVAL);
return (0);
}
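/*
 * Discard any FPU state owned by the thread so that fresh state is
 * loaded on its next FPU use.
 */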
static void
fpstate_drop(struct thread *td)
{
critical_enter();
#ifdef DEV_NPX
if (PCPU_GET(fpcurthread) == td)
npxdrop();
#endif
/*
* XXX force a full drop of the npx. The above only drops it if we
* owned it. npxgetregs() has the same bug in the !cpu_fxsr case.
*
* XXX I don't much like npxgetregs()'s semantics of doing a full
* drop. Dropping only to the pcb matches fnsave's behaviour.
* We only need to drop to !PCB_INITDONE in sendsig(). But
* sendsig() is the only caller of npxgetregs()... perhaps we just
* have too many layers.
*/
curthread->td_pcb->pcb_flags &= ~(PCB_NPXINITDONE |
PCB_NPXUSERINITDONE);
critical_exit();
}
int
fill_dbregs(struct thread *td, struct dbreg *dbregs)
{
struct pcb *pcb;
if (td == NULL) {
dbregs->dr[0] = rdr0();
dbregs->dr[1] = rdr1();
dbregs->dr[2] = rdr2();
dbregs->dr[3] = rdr3();
dbregs->dr[4] = rdr4();
dbregs->dr[5] = rdr5();
dbregs->dr[6] = rdr6();
dbregs->dr[7] = rdr7();
} else {
pcb = td->td_pcb;
dbregs->dr[0] = pcb->pcb_dr0;
dbregs->dr[1] = pcb->pcb_dr1;
dbregs->dr[2] = pcb->pcb_dr2;
dbregs->dr[3] = pcb->pcb_dr3;
dbregs->dr[4] = 0;
dbregs->dr[5] = 0;
dbregs->dr[6] = pcb->pcb_dr6;
dbregs->dr[7] = pcb->pcb_dr7;
}
return (0);
}
int
set_dbregs(struct thread *td, struct dbreg *dbregs)
{
struct pcb *pcb;
int i;
if (td == NULL) {
load_dr0(dbregs->dr[0]);
load_dr1(dbregs->dr[1]);
load_dr2(dbregs->dr[2]);
load_dr3(dbregs->dr[3]);
load_dr4(dbregs->dr[4]);
load_dr5(dbregs->dr[5]);
load_dr6(dbregs->dr[6]);
load_dr7(dbregs->dr[7]);
} else {
/*
* Don't let an illegal value for dr7 get set. Specifically,
* check for undefined settings. Setting these bit patterns
* results in undefined behaviour and can lead to an unexpected
* TRCTRAP.
*/
for (i = 0; i < 4; i++) {
if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
return (EINVAL);
if (DBREG_DR7_LEN(dbregs->dr[7], i) == 0x02)
return (EINVAL);
}
pcb = td->td_pcb;
/*
* Don't let a process set a breakpoint that is not within the
* process's address space. If a process could do this, it
* could halt the system by setting a breakpoint in the kernel
* (if ddb was enabled). Thus, we need to check to make sure
* that no breakpoints are being enabled for addresses outside
* the process's address space.
*
* XXX - what about when the watched area of the user's
* address space is written into from within the kernel
* ... wouldn't that still cause a breakpoint to be generated
* from within kernel mode?
*/
if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
/* dr0 is enabled */
if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
return (EINVAL);
}
if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
/* dr1 is enabled */
if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
return (EINVAL);
}
if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
/* dr2 is enabled */
if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
return (EINVAL);
}
if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
/* dr3 is enabled */
if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
return (EINVAL);
}
pcb->pcb_dr0 = dbregs->dr[0];
pcb->pcb_dr1 = dbregs->dr[1];
pcb->pcb_dr2 = dbregs->dr[2];
pcb->pcb_dr3 = dbregs->dr[3];
pcb->pcb_dr6 = dbregs->dr[6];
pcb->pcb_dr7 = dbregs->dr[7];
pcb->pcb_flags |= PCB_DBREGS;
}
return (0);
}
/*
* Return > 0 if a hardware breakpoint has been hit, and the
* breakpoint was in user space. Return 0, otherwise.
*/
int
user_dbreg_trap(void)
{
u_int32_t dr7, dr6; /* debug registers dr6 and dr7 */
u_int32_t bp; /* breakpoint bits extracted from dr6 */
int nbp; /* number of breakpoints that triggered */
caddr_t addr[4]; /* breakpoint addresses */
int i;
dr7 = rdr7();
if ((dr7 & 0x000000ff) == 0) {
/*
* all GE and LE bits in the dr7 register are zero,
* thus the trap couldn't have been caused by the
* hardware debug registers
*/
return 0;
}
nbp = 0;
dr6 = rdr6();
bp = dr6 & 0x0000000f;
if (!bp) {
/*
* None of the breakpoint bits are set, meaning this
* trap was not caused by any of the debug registers
*/
return 0;
}
/*
* at least one of the breakpoints was hit, check to see
* which ones and if any of them are user space addresses
*/
if (bp & 0x01) {
addr[nbp++] = (caddr_t)rdr0();
}
if (bp & 0x02) {
addr[nbp++] = (caddr_t)rdr1();
}
if (bp & 0x04) {
addr[nbp++] = (caddr_t)rdr2();
}
if (bp & 0x08) {
addr[nbp++] = (caddr_t)rdr3();
}
for (i = 0; i < nbp; i++) {
if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
/*
* addr[i] is in user space
*/
return nbp;
}
}
/*
* None of the breakpoints are in user space.
*/
return 0;
}
#ifdef KDB
/*
* Provide inb() and outb() as functions. They are normally only available as
* inline functions, thus cannot be called from the debugger.
*/
/* silence compiler warnings */
u_char inb_(u_short);
void outb_(u_short, u_char);
u_char
inb_(u_short port)
{
return inb(port);
}
void
outb_(u_short port, u_char data)
{
outb(port, data);
}
#endif /* KDB */
Index: head/sys/powerpc/powerpc/exec_machdep.c
===================================================================
--- head/sys/powerpc/powerpc/exec_machdep.c (revision 225616)
+++ head/sys/powerpc/powerpc/exec_machdep.c (revision 225617)
@@ -1,1044 +1,1044 @@
/*-
* Copyright (C) 1995, 1996 Wolfgang Solfrank.
* Copyright (C) 1995, 1996 TooLs GmbH.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by TooLs GmbH.
* 4. The name of TooLs GmbH may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (C) 2001 Benno Rice
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY Benno Rice ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* $NetBSD: machdep.c,v 1.74.2.1 2000/11/01 16:13:48 tv Exp $
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/syscall.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/uio.h>
#include <machine/altivec.h>
#include <machine/cpu.h>
#include <machine/elf.h>
#include <machine/fpu.h>
#include <machine/pcb.h>
#include <machine/reg.h>
#include <machine/sigframe.h>
#include <machine/trap.h>
#include <machine/vmparam.h>
#ifdef COMPAT_FREEBSD32
#include <compat/freebsd32/freebsd32_signal.h>
#include <compat/freebsd32/freebsd32_util.h>
#include <compat/freebsd32/freebsd32_proto.h>
typedef struct __ucontext32 {
sigset_t uc_sigmask;
mcontext32_t uc_mcontext;
uint32_t uc_link;
struct sigaltstack32 uc_stack;
uint32_t uc_flags;
uint32_t __spare__[4];
} ucontext32_t;
struct sigframe32 {
ucontext32_t sf_uc;
struct siginfo32 sf_si;
};
static int grab_mcontext32(struct thread *td, mcontext32_t *, int flags);
#endif
static int grab_mcontext(struct thread *, mcontext_t *, int);
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
struct trapframe *tf;
struct sigacts *psp;
struct sigframe sf;
struct thread *td;
struct proc *p;
#ifdef COMPAT_FREEBSD32
struct siginfo32 siginfo32;
struct sigframe32 sf32;
#endif
size_t sfpsize;
caddr_t sfp, usfp;
int oonstack, rndfsize;
int sig;
int code;
td = curthread;
p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
psp = p->p_sigacts;
mtx_assert(&psp->ps_mtx, MA_OWNED);
tf = td->td_frame;
oonstack = sigonstack(tf->fixreg[1]);
/*
* Fill siginfo structure.
*/
ksi->ksi_info.si_signo = ksi->ksi_signo;
#ifdef AIM
ksi->ksi_info.si_addr = (void *)((tf->exc == EXC_DSI) ?
tf->cpu.aim.dar : tf->srr0);
#else
ksi->ksi_info.si_addr = (void *)((tf->exc == EXC_DSI) ?
tf->cpu.booke.dear : tf->srr0);
#endif
#ifdef COMPAT_FREEBSD32
if (SV_PROC_FLAG(p, SV_ILP32)) {
siginfo_to_siginfo32(&ksi->ksi_info, &siginfo32);
sig = siginfo32.si_signo;
code = siginfo32.si_code;
sfp = (caddr_t)&sf32;
sfpsize = sizeof(sf32);
rndfsize = ((sizeof(sf32) + 15) / 16) * 16;
/*
* Save user context
*/
memset(&sf32, 0, sizeof(sf32));
grab_mcontext32(td, &sf32.sf_uc.uc_mcontext, 0);
sf32.sf_uc.uc_sigmask = *mask;
sf32.sf_uc.uc_stack.ss_sp = (uintptr_t)td->td_sigstk.ss_sp;
sf32.sf_uc.uc_stack.ss_size = (uint32_t)td->td_sigstk.ss_size;
sf32.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
sf32.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
} else {
#endif
sig = ksi->ksi_signo;
code = ksi->ksi_code;
sfp = (caddr_t)&sf;
sfpsize = sizeof(sf);
#ifdef __powerpc64__
/*
* 64-bit PPC defines a 288-byte scratch region
* below the stack.
*/
rndfsize = 288 + ((sizeof(sf) + 47) / 48) * 48;
#else
rndfsize = ((sizeof(sf) + 15) / 16) * 16;
#endif
/*
* Save user context
*/
memset(&sf, 0, sizeof(sf));
grab_mcontext(td, &sf.sf_uc.uc_mcontext, 0);
sf.sf_uc.uc_sigmask = *mask;
sf.sf_uc.uc_stack = td->td_sigstk;
sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
#ifdef COMPAT_FREEBSD32
}
#endif
CTR4(KTR_SIG, "sendsig: td=%p (%s) catcher=%p sig=%d", td, p->p_comm,
catcher, sig);
/*
* Allocate and validate space for the signal handler context.
*/
if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
SIGISMEMBER(psp->ps_sigonstack, sig)) {
usfp = (void *)(td->td_sigstk.ss_sp +
td->td_sigstk.ss_size - rndfsize);
} else {
usfp = (void *)(tf->fixreg[1] - rndfsize);
}
/*
* Translate the signal if appropriate (Linux emu ?)
*/
if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
/*
* Save the floating-point state, if necessary, then copy it.
*/
/* XXX */
/*
* Set up the registers to return to sigcode.
*
* r1/sp - sigframe ptr
* lr - sig function, dispatched to by blrl in trampoline
* r3 - sig number
* r4 - SIGINFO ? &siginfo : exception code
* r5 - user context
* srr0 - trampoline function addr
*/
tf->lr = (register_t)catcher;
tf->fixreg[1] = (register_t)usfp;
tf->fixreg[FIRSTARG] = sig;
#ifdef COMPAT_FREEBSD32
tf->fixreg[FIRSTARG+2] = (register_t)usfp +
((SV_PROC_FLAG(p, SV_ILP32)) ?
offsetof(struct sigframe32, sf_uc) :
offsetof(struct sigframe, sf_uc));
#else
tf->fixreg[FIRSTARG+2] = (register_t)usfp +
offsetof(struct sigframe, sf_uc);
#endif
if (SIGISMEMBER(psp->ps_siginfo, sig)) {
/*
* Signal handler installed with SA_SIGINFO.
*/
#ifdef COMPAT_FREEBSD32
if (SV_PROC_FLAG(p, SV_ILP32)) {
sf32.sf_si = siginfo32;
tf->fixreg[FIRSTARG+1] = (register_t)usfp +
offsetof(struct sigframe32, sf_si);
sf32.sf_si = siginfo32;
} else {
#endif
tf->fixreg[FIRSTARG+1] = (register_t)usfp +
offsetof(struct sigframe, sf_si);
sf.sf_si = ksi->ksi_info;
#ifdef COMPAT_FREEBSD32
}
#endif
} else {
/* Old FreeBSD-style arguments. */
tf->fixreg[FIRSTARG+1] = code;
#ifdef AIM
tf->fixreg[FIRSTARG+3] = (tf->exc == EXC_DSI) ?
tf->cpu.aim.dar : tf->srr0;
#else
tf->fixreg[FIRSTARG+3] = (tf->exc == EXC_DSI) ?
tf->cpu.booke.dear : tf->srr0;
#endif
}
mtx_unlock(&psp->ps_mtx);
PROC_UNLOCK(p);
tf->srr0 = (register_t)p->p_sysent->sv_sigcode_base;
/*
* copy the frame out to userland.
*/
if (copyout(sfp, usfp, sfpsize) != 0) {
/*
* Process has trashed its stack. Kill it.
*/
CTR2(KTR_SIG, "sendsig: sigexit td=%p sfp=%p", td, sfp);
PROC_LOCK(p);
sigexit(td, SIGILL);
}
CTR3(KTR_SIG, "sendsig: return td=%p pc=%#x sp=%#x", td,
tf->srr0, tf->fixreg[1]);
PROC_LOCK(p);
mtx_lock(&psp->ps_mtx);
}
int
-sigreturn(struct thread *td, struct sigreturn_args *uap)
+sys_sigreturn(struct thread *td, struct sigreturn_args *uap)
{
ucontext_t uc;
int error;
CTR2(KTR_SIG, "sigreturn: td=%p ucp=%p", td, uap->sigcntxp);
if (copyin(uap->sigcntxp, &uc, sizeof(uc)) != 0) {
CTR1(KTR_SIG, "sigreturn: efault td=%p", td);
return (EFAULT);
}
error = set_mcontext(td, &uc.uc_mcontext);
if (error != 0)
return (error);
kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0);
CTR3(KTR_SIG, "sigreturn: return td=%p pc=%#x sp=%#x",
td, uc.uc_mcontext.mc_srr0, uc.uc_mcontext.mc_gpr[1]);
return (EJUSTRETURN);
}
#ifdef COMPAT_FREEBSD4
int
freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
{
- return sigreturn(td, (struct sigreturn_args *)uap);
+ return sys_sigreturn(td, (struct sigreturn_args *)uap);
}
#endif
/*
* Construct a PCB from a trapframe. This is called from kdb_trap() where
* we want to start a backtrace from the function that caused us to enter
* the debugger. We have the context in the trapframe, but base the trace
* on the PCB. The PCB doesn't have to be perfect, as long as it contains
* enough for a backtrace.
*/
void
makectx(struct trapframe *tf, struct pcb *pcb)
{
pcb->pcb_lr = tf->srr0;
pcb->pcb_sp = tf->fixreg[1];
}
/*
* get_mcontext/sendsig helper routine that doesn't touch the
* proc lock
*/
static int
grab_mcontext(struct thread *td, mcontext_t *mcp, int flags)
{
struct pcb *pcb;
pcb = td->td_pcb;
memset(mcp, 0, sizeof(mcontext_t));
mcp->mc_vers = _MC_VERSION;
mcp->mc_flags = 0;
memcpy(&mcp->mc_frame, td->td_frame, sizeof(struct trapframe));
if (flags & GET_MC_CLEAR_RET) {
mcp->mc_gpr[3] = 0;
mcp->mc_gpr[4] = 0;
}
#ifdef AIM
/*
* This assumes that floating-point context is *not* lazy,
* so if the thread has used FP there would have been a
* FP-unavailable exception that would have set things up
* correctly.
*/
if (pcb->pcb_flags & PCB_FPU) {
KASSERT(td == curthread,
("get_mcontext: fp save not curthread"));
critical_enter();
save_fpu(td);
critical_exit();
mcp->mc_flags |= _MC_FP_VALID;
memcpy(&mcp->mc_fpscr, &pcb->pcb_fpu.fpscr, sizeof(double));
memcpy(mcp->mc_fpreg, pcb->pcb_fpu.fpr, 32*sizeof(double));
}
/*
* Repeat for Altivec context
*/
if (pcb->pcb_flags & PCB_VEC) {
KASSERT(td == curthread,
("get_mcontext: fp save not curthread"));
critical_enter();
save_vec(td);
critical_exit();
mcp->mc_flags |= _MC_AV_VALID;
mcp->mc_vscr = pcb->pcb_vec.vscr;
mcp->mc_vrsave = pcb->pcb_vec.vrsave;
memcpy(mcp->mc_avec, pcb->pcb_vec.vr, sizeof(mcp->mc_avec));
}
#endif
mcp->mc_len = sizeof(*mcp);
return (0);
}
int
get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
{
int error;
error = grab_mcontext(td, mcp, flags);
if (error == 0) {
PROC_LOCK(curthread->td_proc);
mcp->mc_onstack = sigonstack(td->td_frame->fixreg[1]);
PROC_UNLOCK(curthread->td_proc);
}
return (error);
}
int
set_mcontext(struct thread *td, const mcontext_t *mcp)
{
struct pcb *pcb;
struct trapframe *tf;
pcb = td->td_pcb;
tf = td->td_frame;
if (mcp->mc_vers != _MC_VERSION || mcp->mc_len != sizeof(*mcp))
return (EINVAL);
#ifdef AIM
/*
* Don't let the user set privileged MSR bits
*/
if ((mcp->mc_srr1 & PSL_USERSTATIC) != (tf->srr1 & PSL_USERSTATIC)) {
return (EINVAL);
}
#endif
memcpy(tf, mcp->mc_frame, sizeof(mcp->mc_frame));
#ifdef AIM
if (mcp->mc_flags & _MC_FP_VALID) {
if ((pcb->pcb_flags & PCB_FPU) != PCB_FPU) {
critical_enter();
enable_fpu(td);
critical_exit();
}
memcpy(&pcb->pcb_fpu.fpscr, &mcp->mc_fpscr, sizeof(double));
memcpy(pcb->pcb_fpu.fpr, mcp->mc_fpreg, 32*sizeof(double));
}
if (mcp->mc_flags & _MC_AV_VALID) {
if ((pcb->pcb_flags & PCB_VEC) != PCB_VEC) {
critical_enter();
enable_vec(td);
critical_exit();
}
pcb->pcb_vec.vscr = mcp->mc_vscr;
pcb->pcb_vec.vrsave = mcp->mc_vrsave;
memcpy(pcb->pcb_vec.vr, mcp->mc_avec, sizeof(mcp->mc_avec));
}
#endif
return (0);
}
/*
* Set up registers on exec.
*/
void
exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
struct trapframe *tf;
register_t argc;
#ifdef __powerpc64__
register_t entry_desc[3];
#endif
tf = trapframe(td);
bzero(tf, sizeof *tf);
#ifdef __powerpc64__
tf->fixreg[1] = -roundup(-stack + 48, 16);
#else
tf->fixreg[1] = -roundup(-stack + 8, 16);
#endif
/*
* Set up arguments for _start():
* _start(argc, argv, envp, obj, cleanup, ps_strings);
*
* Notes:
* - obj and cleanup are the auxiliary and termination
* vectors. They are fixed up by ld.elf_so.
* - ps_strings is a NetBSD extension, and will be
* ignored by executables which are strictly
* compliant with the SVR4 ABI.
*
* XXX We have to set both regs and retval here due to different
* XXX calling convention in trap.c and init_main.c.
*/
/* Collect argc from the user stack */
argc = fuword((void *)stack);
/*
* XXX PG: these get overwritten in the syscall return code.
* execve() should return EJUSTRETURN, like it does on NetBSD.
* Emulate by setting the syscall return value cells. The
* registers still have to be set for init's fork trampoline.
*/
td->td_retval[0] = argc;
td->td_retval[1] = stack + sizeof(register_t);
tf->fixreg[3] = argc;
tf->fixreg[4] = stack + sizeof(register_t);
tf->fixreg[5] = stack + (2 + argc)*sizeof(register_t);
tf->fixreg[6] = 0; /* auxiliary vector */
tf->fixreg[7] = 0; /* termination vector */
tf->fixreg[8] = (register_t)imgp->ps_strings; /* NetBSD extension */
#ifdef __powerpc64__
/*
* For 64-bit, we need to disentangle the function descriptor
*
* 0. entry point
* 1. TOC value (r2)
* 2. Environment pointer (r11)
*/
(void)copyin((void *)imgp->entry_addr, entry_desc, sizeof(entry_desc));
tf->srr0 = entry_desc[0] + imgp->reloc_base;
tf->fixreg[2] = entry_desc[1] + imgp->reloc_base;
tf->fixreg[11] = entry_desc[2] + imgp->reloc_base;
tf->srr1 = PSL_SF | PSL_USERSET | PSL_FE_DFLT;
if (mfmsr() & PSL_HV)
tf->srr1 |= PSL_HV;
#else
tf->srr0 = imgp->entry_addr;
tf->srr1 = PSL_USERSET | PSL_FE_DFLT;
#endif
td->td_pcb->pcb_flags = 0;
}
#ifdef COMPAT_FREEBSD32
void
ppc32_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
struct trapframe *tf;
uint32_t argc;
tf = trapframe(td);
bzero(tf, sizeof *tf);
tf->fixreg[1] = -roundup(-stack + 8, 16);
argc = fuword32((void *)stack);
td->td_retval[0] = argc;
td->td_retval[1] = stack + sizeof(uint32_t);
tf->fixreg[3] = argc;
tf->fixreg[4] = stack + sizeof(uint32_t);
tf->fixreg[5] = stack + (2 + argc)*sizeof(uint32_t);
tf->fixreg[6] = 0; /* auxiliary vector */
tf->fixreg[7] = 0; /* termination vector */
tf->fixreg[8] = (register_t)imgp->ps_strings; /* NetBSD extension */
tf->srr0 = imgp->entry_addr;
tf->srr1 = PSL_MBO | PSL_USERSET | PSL_FE_DFLT;
tf->srr1 &= ~PSL_SF;
if (mfmsr() & PSL_HV)
tf->srr1 |= PSL_HV;
td->td_pcb->pcb_flags = 0;
}
#endif
int
fill_regs(struct thread *td, struct reg *regs)
{
struct trapframe *tf;
tf = td->td_frame;
memcpy(regs, tf, sizeof(struct reg));
return (0);
}
int
fill_dbregs(struct thread *td, struct dbreg *dbregs)
{
/* No debug registers on PowerPC */
return (ENOSYS);
}
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{
struct pcb *pcb;
pcb = td->td_pcb;
if ((pcb->pcb_flags & PCB_FPU) == 0)
memset(fpregs, 0, sizeof(struct fpreg));
else
memcpy(fpregs, &pcb->pcb_fpu, sizeof(struct fpreg));
return (0);
}
int
set_regs(struct thread *td, struct reg *regs)
{
struct trapframe *tf;
tf = td->td_frame;
memcpy(tf, regs, sizeof(struct reg));
return (0);
}
int
set_dbregs(struct thread *td, struct dbreg *dbregs)
{
/* No debug registers on PowerPC */
return (ENOSYS);
}
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{
#ifdef AIM
struct pcb *pcb;
pcb = td->td_pcb;
if ((pcb->pcb_flags & PCB_FPU) == 0)
enable_fpu(td);
memcpy(&pcb->pcb_fpu, fpregs, sizeof(struct fpreg));
#endif
return (0);
}
#ifdef COMPAT_FREEBSD32
int
set_regs32(struct thread *td, struct reg32 *regs)
{
struct trapframe *tf;
int i;
tf = td->td_frame;
for (i = 0; i < 32; i++)
tf->fixreg[i] = regs->fixreg[i];
tf->lr = regs->lr;
tf->cr = regs->cr;
tf->xer = regs->xer;
tf->ctr = regs->ctr;
tf->srr0 = regs->pc;
return (0);
}
int
fill_regs32(struct thread *td, struct reg32 *regs)
{
struct trapframe *tf;
int i;
tf = td->td_frame;
for (i = 0; i < 32; i++)
regs->fixreg[i] = tf->fixreg[i];
regs->lr = tf->lr;
regs->cr = tf->cr;
regs->xer = tf->xer;
regs->ctr = tf->ctr;
regs->pc = tf->srr0;
return (0);
}
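/*
 * Narrow a native 64-bit mcontext into the 32-bit compat layout.
 */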
static int
grab_mcontext32(struct thread *td, mcontext32_t *mcp, int flags)
{
mcontext_t mcp64;
int i, error;
error = grab_mcontext(td, &mcp64, flags);
if (error != 0)
return (error);
mcp->mc_vers = mcp64.mc_vers;
mcp->mc_flags = mcp64.mc_flags;
mcp->mc_onstack = mcp64.mc_onstack;
mcp->mc_len = mcp64.mc_len;
memcpy(mcp->mc_avec,mcp64.mc_avec,sizeof(mcp64.mc_avec));
memcpy(mcp->mc_av,mcp64.mc_av,sizeof(mcp64.mc_av));
for (i = 0; i < 42; i++)
mcp->mc_frame[i] = mcp64.mc_frame[i];
memcpy(mcp->mc_fpreg,mcp64.mc_fpreg,sizeof(mcp64.mc_fpreg));
return (0);
}
static int
get_mcontext32(struct thread *td, mcontext32_t *mcp, int flags)
{
int error;
error = grab_mcontext32(td, mcp, flags);
if (error == 0) {
PROC_LOCK(curthread->td_proc);
mcp->mc_onstack = sigonstack(td->td_frame->fixreg[1]);
PROC_UNLOCK(curthread->td_proc);
}
return (error);
}
static int
set_mcontext32(struct thread *td, const mcontext32_t *mcp)
{
mcontext_t mcp64;
int i, error;
mcp64.mc_vers = mcp->mc_vers;
mcp64.mc_flags = mcp->mc_flags;
mcp64.mc_onstack = mcp->mc_onstack;
mcp64.mc_len = mcp->mc_len;
memcpy(mcp64.mc_avec,mcp->mc_avec,sizeof(mcp64.mc_avec));
memcpy(mcp64.mc_av,mcp->mc_av,sizeof(mcp64.mc_av));
for (i = 0; i < 42; i++)
mcp64.mc_frame[i] = mcp->mc_frame[i];
memcpy(mcp64.mc_fpreg,mcp->mc_fpreg,sizeof(mcp64.mc_fpreg));
error = set_mcontext(td, &mcp64);
return (error);
}
#endif
#ifdef COMPAT_FREEBSD32
int
freebsd32_sigreturn(struct thread *td, struct freebsd32_sigreturn_args *uap)
{
ucontext32_t uc;
int error;
CTR2(KTR_SIG, "sigreturn: td=%p ucp=%p", td, uap->sigcntxp);
if (copyin(uap->sigcntxp, &uc, sizeof(uc)) != 0) {
CTR1(KTR_SIG, "sigreturn: efault td=%p", td);
return (EFAULT);
}
error = set_mcontext32(td, &uc.uc_mcontext);
if (error != 0)
return (error);
kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0);
CTR3(KTR_SIG, "sigreturn: return td=%p pc=%#x sp=%#x",
td, uc.uc_mcontext.mc_srr0, uc.uc_mcontext.mc_gpr[1]);
return (EJUSTRETURN);
}
/*
* The first two fields of a ucontext_t are the signal mask and the machine
* context. The next field is uc_link; we want to avoid destroying the link
* when copying out contexts.
*/
#define UC32_COPY_SIZE offsetof(ucontext32_t, uc_link)
int
freebsd32_getcontext(struct thread *td, struct freebsd32_getcontext_args *uap)
{
ucontext32_t uc;
int ret;
if (uap->ucp == NULL)
ret = EINVAL;
else {
get_mcontext32(td, &uc.uc_mcontext, GET_MC_CLEAR_RET);
PROC_LOCK(td->td_proc);
uc.uc_sigmask = td->td_sigmask;
PROC_UNLOCK(td->td_proc);
ret = copyout(&uc, uap->ucp, UC32_COPY_SIZE);
}
return (ret);
}
int
freebsd32_setcontext(struct thread *td, struct freebsd32_setcontext_args *uap)
{
ucontext32_t uc;
int ret;
if (uap->ucp == NULL)
ret = EINVAL;
else {
ret = copyin(uap->ucp, &uc, UC32_COPY_SIZE);
if (ret == 0) {
ret = set_mcontext32(td, &uc.uc_mcontext);
if (ret == 0) {
kern_sigprocmask(td, SIG_SETMASK,
&uc.uc_sigmask, NULL, 0);
}
}
}
return (ret == 0 ? EJUSTRETURN : ret);
}
int
freebsd32_swapcontext(struct thread *td, struct freebsd32_swapcontext_args *uap)
{
ucontext32_t uc;
int ret;
if (uap->oucp == NULL || uap->ucp == NULL)
ret = EINVAL;
else {
get_mcontext32(td, &uc.uc_mcontext, GET_MC_CLEAR_RET);
PROC_LOCK(td->td_proc);
uc.uc_sigmask = td->td_sigmask;
PROC_UNLOCK(td->td_proc);
ret = copyout(&uc, uap->oucp, UC32_COPY_SIZE);
if (ret == 0) {
ret = copyin(uap->ucp, &uc, UC32_COPY_SIZE);
if (ret == 0) {
ret = set_mcontext32(td, &uc.uc_mcontext);
if (ret == 0) {
kern_sigprocmask(td, SIG_SETMASK,
&uc.uc_sigmask, NULL, 0);
}
}
}
}
return (ret == 0 ? EJUSTRETURN : ret);
}
#endif
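/*
 * Copy the syscall return value, or the translated error, into the
 * trapframe registers following the PowerPC syscall convention.
 */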
void
cpu_set_syscall_retval(struct thread *td, int error)
{
struct proc *p;
struct trapframe *tf;
int fixup;
if (error == EJUSTRETURN)
return;
p = td->td_proc;
tf = td->td_frame;
if (tf->fixreg[0] == SYS___syscall &&
(SV_PROC_FLAG(p, SV_ILP32))) {
int code = tf->fixreg[FIRSTARG + 1];
if (p->p_sysent->sv_mask)
code &= p->p_sysent->sv_mask;
fixup = (code != SYS_freebsd6_lseek && code != SYS_lseek) ?
1 : 0;
} else
fixup = 0;
switch (error) {
case 0:
if (fixup) {
/*
* 64-bit return, 32-bit syscall. Fixup byte order
*/
tf->fixreg[FIRSTARG] = 0;
tf->fixreg[FIRSTARG + 1] = td->td_retval[0];
} else {
tf->fixreg[FIRSTARG] = td->td_retval[0];
tf->fixreg[FIRSTARG + 1] = td->td_retval[1];
}
tf->cr &= ~0x10000000; /* Unset summary overflow */
break;
case ERESTART:
/*
* Set user's pc back to redo the system call.
*/
tf->srr0 -= 4;
break;
default:
if (p->p_sysent->sv_errsize) {
error = (error < p->p_sysent->sv_errsize) ?
p->p_sysent->sv_errtbl[error] : -1;
}
tf->fixreg[FIRSTARG] = error;
tf->cr |= 0x10000000; /* Set summary overflow */
break;
}
}
/*
* Threading functions
*/
void
cpu_thread_exit(struct thread *td)
{
}
void
cpu_thread_clean(struct thread *td)
{
}
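/*
 * Carve the pcb and the initial trapframe out of the top of the
 * thread's kernel stack.
 */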
void
cpu_thread_alloc(struct thread *td)
{
struct pcb *pcb;
pcb = (struct pcb *)((td->td_kstack + td->td_kstack_pages * PAGE_SIZE -
sizeof(struct pcb)) & ~0x2fUL);
td->td_pcb = pcb;
td->td_frame = (struct trapframe *)pcb - 1;
}
void
cpu_thread_free(struct thread *td)
{
}
int
cpu_set_user_tls(struct thread *td, void *tls_base)
{
if (SV_PROC_FLAG(td->td_proc, SV_LP64))
td->td_frame->fixreg[13] = (register_t)tls_base + 0x7010;
else
td->td_frame->fixreg[2] = (register_t)tls_base + 0x7008;
return (0);
}
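/*
 * Initialize a new thread's pcb and trapframe from td0 so that it
 * enters user mode through fork_trampoline()/fork_return().
 */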
void
cpu_set_upcall(struct thread *td, struct thread *td0)
{
struct pcb *pcb2;
struct trapframe *tf;
struct callframe *cf;
pcb2 = td->td_pcb;
/* Copy the upcall pcb */
bcopy(td0->td_pcb, pcb2, sizeof(*pcb2));
/* Create a stack for the new thread */
tf = td->td_frame;
bcopy(td0->td_frame, tf, sizeof(struct trapframe));
tf->fixreg[FIRSTARG] = 0;
tf->fixreg[FIRSTARG + 1] = 0;
tf->cr &= ~0x10000000;
/* Set registers for trampoline to user mode. */
cf = (struct callframe *)tf - 1;
memset(cf, 0, sizeof(struct callframe));
cf->cf_func = (register_t)fork_return;
cf->cf_arg0 = (register_t)td;
cf->cf_arg1 = (register_t)tf;
pcb2->pcb_sp = (register_t)cf;
#ifdef __powerpc64__
pcb2->pcb_lr = ((register_t *)fork_trampoline)[0];
pcb2->pcb_toc = ((register_t *)fork_trampoline)[1];
#else
pcb2->pcb_lr = (register_t)fork_trampoline;
#endif
pcb2->pcb_cpu.aim.usr_vsid = 0;
/* Setup to release spin count in fork_exit(). */
td->td_md.md_spinlock_count = 1;
td->td_md.md_saved_msr = PSL_KERNSET;
}
void
cpu_set_upcall_kse(struct thread *td, void (*entry)(void *), void *arg,
stack_t *stack)
{
struct trapframe *tf;
uintptr_t sp;
tf = td->td_frame;
/* align stack and alloc space for frame ptr and saved LR */
#ifdef __powerpc64__
sp = ((uintptr_t)stack->ss_sp + stack->ss_size - 48) &
~0x1f;
#else
sp = ((uintptr_t)stack->ss_sp + stack->ss_size - 8) &
~0x1f;
#endif
bzero(tf, sizeof(struct trapframe));
tf->fixreg[1] = (register_t)sp;
tf->fixreg[3] = (register_t)arg;
if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
tf->srr0 = (register_t)entry;
#ifdef AIM
tf->srr1 = PSL_MBO | PSL_USERSET | PSL_FE_DFLT;
#ifdef __powerpc64__
tf->srr1 &= ~PSL_SF;
#endif
#else
tf->srr1 = PSL_USERSET;
#endif
} else {
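/*
 * On powerpc64 the ELF entry point is a function descriptor;
 * fetch the real entry address, TOC pointer and environment
 * pointer from userspace.
 */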
#ifdef __powerpc64__
register_t entry_desc[3];
(void)copyin((void *)entry, entry_desc, sizeof(entry_desc));
tf->srr0 = entry_desc[0];
tf->fixreg[2] = entry_desc[1];
tf->fixreg[11] = entry_desc[2];
tf->srr1 = PSL_SF | PSL_MBO | PSL_USERSET | PSL_FE_DFLT;
#endif
}
#ifdef __powerpc64__
if (mfmsr() & PSL_HV)
tf->srr1 |= PSL_HV;
#endif
td->td_pcb->pcb_flags = 0;
td->td_retval[0] = (register_t)entry;
td->td_retval[1] = 0;
}
Index: head/sys/security/audit/audit_syscalls.c
===================================================================
--- head/sys/security/audit/audit_syscalls.c (revision 225616)
+++ head/sys/security/audit/audit_syscalls.c (revision 225617)
@@ -1,877 +1,877 @@
/*-
* Copyright (c) 1999-2009 Apple Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of Apple Inc. ("Apple") nor the names of
* its contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/jail.h>
#include <bsm/audit.h>
#include <bsm/audit_kevents.h>
#include <security/audit/audit.h>
#include <security/audit/audit_private.h>
#include <security/mac/mac_framework.h>
#ifdef AUDIT
/*
* System call to allow a user space application to submit a BSM audit record
* to the kernel for inclusion in the audit log. This function does little
* verification on the audit record that is submitted.
*
* XXXAUDIT: Audit preselection for user records does not currently work,
* since we pre-select only based on the AUE_audit event type, not the event
* type submitted as part of the user audit data.
*/
/* ARGSUSED */
int
-audit(struct thread *td, struct audit_args *uap)
+sys_audit(struct thread *td, struct audit_args *uap)
{
int error;
void * rec;
struct kaudit_record *ar;
if (jailed(td->td_ucred))
return (ENOSYS);
error = priv_check(td, PRIV_AUDIT_SUBMIT);
if (error)
return (error);
if ((uap->length <= 0) || (uap->length > audit_qctrl.aq_bufsz))
return (EINVAL);
ar = currecord();
/*
* If there's no current audit record (audit() itself not audited)
* commit the user audit record.
*/
if (ar == NULL) {
/*
* This is not very efficient; we're required to allocate a
* complete kernel audit record just so the user record can
* tag along.
*
* XXXAUDIT: Maybe AUE_AUDIT in the system call context and
* special pre-select handling?
*/
td->td_ar = audit_new(AUE_NULL, td);
if (td->td_ar == NULL)
return (ENOTSUP);
td->td_pflags |= TDP_AUDITREC;
ar = td->td_ar;
}
if (uap->length > MAX_AUDIT_RECORD_SIZE)
return (EINVAL);
rec = malloc(uap->length, M_AUDITDATA, M_WAITOK);
error = copyin(uap->record, rec, uap->length);
if (error)
goto free_out;
/* Verify the record. */
if (bsm_rec_verify(rec) == 0) {
error = EINVAL;
goto free_out;
}
#ifdef MAC
error = mac_system_check_audit(td->td_ucred, rec, uap->length);
if (error)
goto free_out;
#endif
/*
* Attach the user audit record to the kernel audit record. Because
* this system call is an auditable event, we will write the user
* record along with the record for this audit event.
*
* XXXAUDIT: KASSERT appropriate starting values of k_udata, k_ulen,
* k_ar_commit & AR_COMMIT_USER?
*/
ar->k_udata = rec;
ar->k_ulen = uap->length;
ar->k_ar_commit |= AR_COMMIT_USER;
/*
* Currently we assume that all preselection has been performed in
* userspace. We unconditionally set these masks so that the records
* get committed both to the trail and pipe. In the future we will
* want to set up kernel-based preselection.
*/
ar->k_ar_commit |= (AR_PRESELECT_USER_TRAIL | AR_PRESELECT_USER_PIPE);
return (0);
free_out:
/*
* audit_syscall_exit() will free the audit record on the thread even
* if we allocated it above.
*/
free(rec, M_AUDITDATA);
return (error);
}
/*
* System call to manipulate auditing.
*/
/* ARGSUSED */
int
-auditon(struct thread *td, struct auditon_args *uap)
+sys_auditon(struct thread *td, struct auditon_args *uap)
{
struct ucred *cred, *newcred, *oldcred;
int error;
union auditon_udata udata;
struct proc *tp;
if (jailed(td->td_ucred))
return (ENOSYS);
AUDIT_ARG_CMD(uap->cmd);
#ifdef MAC
error = mac_system_check_auditon(td->td_ucred, uap->cmd);
if (error)
return (error);
#endif
error = priv_check(td, PRIV_AUDIT_CONTROL);
if (error)
return (error);
if ((uap->length <= 0) || (uap->length > sizeof(union auditon_udata)))
return (EINVAL);
memset((void *)&udata, 0, sizeof(udata));
/*
* Some of the GET commands use the arguments too.
*/
switch (uap->cmd) {
case A_SETPOLICY:
case A_OLDSETPOLICY:
case A_SETKMASK:
case A_SETQCTRL:
case A_OLDSETQCTRL:
case A_SETSTAT:
case A_SETUMASK:
case A_SETSMASK:
case A_SETCOND:
case A_OLDSETCOND:
case A_SETCLASS:
case A_SETPMASK:
case A_SETFSIZE:
case A_SETKAUDIT:
case A_GETCLASS:
case A_GETPINFO:
case A_GETPINFO_ADDR:
case A_SENDTRIGGER:
error = copyin(uap->data, (void *)&udata, uap->length);
if (error)
return (error);
AUDIT_ARG_AUDITON(&udata);
break;
}
/*
* XXXAUDIT: Locking?
*/
switch (uap->cmd) {
case A_OLDGETPOLICY:
case A_GETPOLICY:
if (uap->length == sizeof(udata.au_policy64)) {
if (!audit_fail_stop)
udata.au_policy64 |= AUDIT_CNT;
if (audit_panic_on_write_fail)
udata.au_policy64 |= AUDIT_AHLT;
if (audit_argv)
udata.au_policy64 |= AUDIT_ARGV;
if (audit_arge)
udata.au_policy64 |= AUDIT_ARGE;
break;
}
if (uap->length != sizeof(udata.au_policy))
return (EINVAL);
if (!audit_fail_stop)
udata.au_policy |= AUDIT_CNT;
if (audit_panic_on_write_fail)
udata.au_policy |= AUDIT_AHLT;
if (audit_argv)
udata.au_policy |= AUDIT_ARGV;
if (audit_arge)
udata.au_policy |= AUDIT_ARGE;
break;
case A_OLDSETPOLICY:
case A_SETPOLICY:
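/*
 * The argument length selects between the 64-bit and the legacy
 * 32-bit form of the policy flags.
 */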
if (uap->length == sizeof(udata.au_policy64)) {
if (udata.au_policy64 & ~(AUDIT_CNT|AUDIT_AHLT|
AUDIT_ARGV|AUDIT_ARGE))
return (EINVAL);
audit_fail_stop = ((udata.au_policy64 & AUDIT_CNT) ==
0);
audit_panic_on_write_fail = (udata.au_policy64 &
AUDIT_AHLT);
audit_argv = (udata.au_policy64 & AUDIT_ARGV);
audit_arge = (udata.au_policy64 & AUDIT_ARGE);
break;
}
if (uap->length != sizeof(udata.au_policy))
return (EINVAL);
if (udata.au_policy & ~(AUDIT_CNT|AUDIT_AHLT|AUDIT_ARGV|
AUDIT_ARGE))
return (EINVAL);
/*
* XXX - Need to wake up waiters if the policy relaxes?
*/
audit_fail_stop = ((udata.au_policy & AUDIT_CNT) == 0);
audit_panic_on_write_fail = (udata.au_policy & AUDIT_AHLT);
audit_argv = (udata.au_policy & AUDIT_ARGV);
audit_arge = (udata.au_policy & AUDIT_ARGE);
break;
case A_GETKMASK:
if (uap->length != sizeof(udata.au_mask))
return (EINVAL);
udata.au_mask = audit_nae_mask;
break;
case A_SETKMASK:
if (uap->length != sizeof(udata.au_mask))
return (EINVAL);
audit_nae_mask = udata.au_mask;
break;
case A_OLDGETQCTRL:
case A_GETQCTRL:
if (uap->length == sizeof(udata.au_qctrl64)) {
udata.au_qctrl64.aq64_hiwater =
(u_int64_t)audit_qctrl.aq_hiwater;
udata.au_qctrl64.aq64_lowater =
(u_int64_t)audit_qctrl.aq_lowater;
udata.au_qctrl64.aq64_bufsz =
(u_int64_t)audit_qctrl.aq_bufsz;
udata.au_qctrl64.aq64_minfree =
(u_int64_t)audit_qctrl.aq_minfree;
break;
}
if (uap->length != sizeof(udata.au_qctrl))
return (EINVAL);
udata.au_qctrl = audit_qctrl;
break;
case A_OLDSETQCTRL:
case A_SETQCTRL:
if (uap->length == sizeof(udata.au_qctrl64)) {
if ((udata.au_qctrl64.aq64_hiwater > AQ_MAXHIGH) ||
(udata.au_qctrl64.aq64_lowater >=
udata.au_qctrl64.aq64_hiwater) ||
(udata.au_qctrl64.aq64_bufsz > AQ_MAXBUFSZ) ||
(udata.au_qctrl64.aq64_minfree < 0) ||
(udata.au_qctrl64.aq64_minfree > 100))
return (EINVAL);
audit_qctrl.aq_hiwater =
(int)udata.au_qctrl64.aq64_hiwater;
audit_qctrl.aq_lowater =
(int)udata.au_qctrl64.aq64_lowater;
audit_qctrl.aq_bufsz =
(int)udata.au_qctrl64.aq64_bufsz;
audit_qctrl.aq_minfree =
(int)udata.au_qctrl64.aq64_minfree;
audit_qctrl.aq_delay = -1; /* Not used. */
break;
}
if (uap->length != sizeof(udata.au_qctrl))
return (EINVAL);
if ((udata.au_qctrl.aq_hiwater > AQ_MAXHIGH) ||
(udata.au_qctrl.aq_lowater >= udata.au_qctrl.aq_hiwater) ||
(udata.au_qctrl.aq_bufsz > AQ_MAXBUFSZ) ||
(udata.au_qctrl.aq_minfree < 0) ||
(udata.au_qctrl.aq_minfree > 100))
return (EINVAL);
audit_qctrl = udata.au_qctrl;
/* XXX The queue delay value isn't used with the kernel. */
audit_qctrl.aq_delay = -1;
break;
case A_GETCWD:
return (ENOSYS);
break;
case A_GETCAR:
return (ENOSYS);
break;
case A_GETSTAT:
return (ENOSYS);
break;
case A_SETSTAT:
return (ENOSYS);
break;
case A_SETUMASK:
return (ENOSYS);
break;
case A_SETSMASK:
return (ENOSYS);
break;
case A_OLDGETCOND:
case A_GETCOND:
if (uap->length == sizeof(udata.au_cond64)) {
if (audit_enabled && !audit_suspended)
udata.au_cond64 = AUC_AUDITING;
else
udata.au_cond64 = AUC_NOAUDIT;
break;
}
if (uap->length != sizeof(udata.au_cond))
return (EINVAL);
if (audit_enabled && !audit_suspended)
udata.au_cond = AUC_AUDITING;
else
udata.au_cond = AUC_NOAUDIT;
break;
case A_OLDSETCOND:
case A_SETCOND:
if (uap->length == sizeof(udata.au_cond64)) {
if (udata.au_cond64 == AUC_NOAUDIT)
audit_suspended = 1;
if (udata.au_cond64 == AUC_AUDITING)
audit_suspended = 0;
if (udata.au_cond64 == AUC_DISABLED) {
audit_suspended = 1;
audit_shutdown(NULL, 0);
}
break;
}
if (uap->length != sizeof(udata.au_cond))
return (EINVAL);
if (udata.au_cond == AUC_NOAUDIT)
audit_suspended = 1;
if (udata.au_cond == AUC_AUDITING)
audit_suspended = 0;
if (udata.au_cond == AUC_DISABLED) {
audit_suspended = 1;
audit_shutdown(NULL, 0);
}
break;
case A_GETCLASS:
if (uap->length != sizeof(udata.au_evclass))
return (EINVAL);
udata.au_evclass.ec_class = au_event_class(
udata.au_evclass.ec_number);
break;
case A_SETCLASS:
if (uap->length != sizeof(udata.au_evclass))
return (EINVAL);
au_evclassmap_insert(udata.au_evclass.ec_number,
udata.au_evclass.ec_class);
break;
case A_GETPINFO:
if (uap->length != sizeof(udata.au_aupinfo))
return (EINVAL);
if (udata.au_aupinfo.ap_pid < 1)
return (ESRCH);
if ((tp = pfind(udata.au_aupinfo.ap_pid)) == NULL)
return (ESRCH);
if ((error = p_cansee(td, tp)) != 0) {
PROC_UNLOCK(tp);
return (error);
}
cred = tp->p_ucred;
if (cred->cr_audit.ai_termid.at_type == AU_IPv6) {
PROC_UNLOCK(tp);
return (EINVAL);
}
udata.au_aupinfo.ap_auid = cred->cr_audit.ai_auid;
udata.au_aupinfo.ap_mask.am_success =
cred->cr_audit.ai_mask.am_success;
udata.au_aupinfo.ap_mask.am_failure =
cred->cr_audit.ai_mask.am_failure;
udata.au_aupinfo.ap_termid.machine =
cred->cr_audit.ai_termid.at_addr[0];
udata.au_aupinfo.ap_termid.port =
(dev_t)cred->cr_audit.ai_termid.at_port;
udata.au_aupinfo.ap_asid = cred->cr_audit.ai_asid;
PROC_UNLOCK(tp);
break;
case A_SETPMASK:
if (uap->length != sizeof(udata.au_aupinfo))
return (EINVAL);
if (udata.au_aupinfo.ap_pid < 1)
return (ESRCH);
newcred = crget();
if ((tp = pfind(udata.au_aupinfo.ap_pid)) == NULL) {
crfree(newcred);
return (ESRCH);
}
if ((error = p_cansee(td, tp)) != 0) {
PROC_UNLOCK(tp);
crfree(newcred);
return (error);
}
oldcred = tp->p_ucred;
crcopy(newcred, oldcred);
newcred->cr_audit.ai_mask.am_success =
udata.au_aupinfo.ap_mask.am_success;
newcred->cr_audit.ai_mask.am_failure =
udata.au_aupinfo.ap_mask.am_failure;
tp->p_ucred = newcred;
PROC_UNLOCK(tp);
crfree(oldcred);
break;
case A_SETFSIZE:
if (uap->length != sizeof(udata.au_fstat))
return (EINVAL);
if ((udata.au_fstat.af_filesz != 0) &&
(udata.au_fstat.af_filesz < MIN_AUDIT_FILE_SIZE))
return (EINVAL);
audit_fstat.af_filesz = udata.au_fstat.af_filesz;
break;
case A_GETFSIZE:
if (uap->length != sizeof(udata.au_fstat))
return (EINVAL);
udata.au_fstat.af_filesz = audit_fstat.af_filesz;
udata.au_fstat.af_currsz = audit_fstat.af_currsz;
break;
case A_GETPINFO_ADDR:
if (uap->length != sizeof(udata.au_aupinfo_addr))
return (EINVAL);
if (udata.au_aupinfo_addr.ap_pid < 1)
return (ESRCH);
if ((tp = pfind(udata.au_aupinfo_addr.ap_pid)) == NULL)
return (ESRCH);
cred = tp->p_ucred;
udata.au_aupinfo_addr.ap_auid = cred->cr_audit.ai_auid;
udata.au_aupinfo_addr.ap_mask.am_success =
cred->cr_audit.ai_mask.am_success;
udata.au_aupinfo_addr.ap_mask.am_failure =
cred->cr_audit.ai_mask.am_failure;
udata.au_aupinfo_addr.ap_termid = cred->cr_audit.ai_termid;
udata.au_aupinfo_addr.ap_asid = cred->cr_audit.ai_asid;
PROC_UNLOCK(tp);
break;
case A_GETKAUDIT:
if (uap->length != sizeof(udata.au_kau_info))
return (EINVAL);
audit_get_kinfo(&udata.au_kau_info);
break;
case A_SETKAUDIT:
if (uap->length != sizeof(udata.au_kau_info))
return (EINVAL);
if (udata.au_kau_info.ai_termid.at_type != AU_IPv4 &&
udata.au_kau_info.ai_termid.at_type != AU_IPv6)
return (EINVAL);
audit_set_kinfo(&udata.au_kau_info);
break;
case A_SENDTRIGGER:
if (uap->length != sizeof(udata.au_trigger))
return (EINVAL);
if ((udata.au_trigger < AUDIT_TRIGGER_MIN) ||
(udata.au_trigger > AUDIT_TRIGGER_MAX))
return (EINVAL);
return (audit_send_trigger(udata.au_trigger));
default:
return (EINVAL);
}
/*
* Copy data back to userspace for the GET commands.
*/
switch (uap->cmd) {
case A_GETPOLICY:
case A_OLDGETPOLICY:
case A_GETKMASK:
case A_GETQCTRL:
case A_OLDGETQCTRL:
case A_GETCWD:
case A_GETCAR:
case A_GETSTAT:
case A_GETCOND:
case A_OLDGETCOND:
case A_GETCLASS:
case A_GETPINFO:
case A_GETFSIZE:
case A_GETPINFO_ADDR:
case A_GETKAUDIT:
error = copyout((void *)&udata, uap->data, uap->length);
if (error)
return (error);
break;
}
return (0);
}
/*
* System calls to manage the user audit information.
*/
/* ARGSUSED */
int
-getauid(struct thread *td, struct getauid_args *uap)
+sys_getauid(struct thread *td, struct getauid_args *uap)
{
int error;
if (jailed(td->td_ucred))
return (ENOSYS);
error = priv_check(td, PRIV_AUDIT_GETAUDIT);
if (error)
return (error);
return (copyout(&td->td_ucred->cr_audit.ai_auid, uap->auid,
sizeof(td->td_ucred->cr_audit.ai_auid)));
}
/* ARGSUSED */
int
-setauid(struct thread *td, struct setauid_args *uap)
+sys_setauid(struct thread *td, struct setauid_args *uap)
{
struct ucred *newcred, *oldcred;
au_id_t id;
int error;
if (jailed(td->td_ucred))
return (ENOSYS);
error = copyin(uap->auid, &id, sizeof(id));
if (error)
return (error);
audit_arg_auid(id);
newcred = crget();
PROC_LOCK(td->td_proc);
oldcred = td->td_proc->p_ucred;
crcopy(newcred, oldcred);
#ifdef MAC
error = mac_cred_check_setauid(oldcred, id);
if (error)
goto fail;
#endif
error = priv_check_cred(oldcred, PRIV_AUDIT_SETAUDIT, 0);
if (error)
goto fail;
newcred->cr_audit.ai_auid = id;
td->td_proc->p_ucred = newcred;
PROC_UNLOCK(td->td_proc);
crfree(oldcred);
return (0);
fail:
PROC_UNLOCK(td->td_proc);
crfree(newcred);
return (error);
}
/*
* System calls to get and set process audit information.
*/
/* ARGSUSED */
int
-getaudit(struct thread *td, struct getaudit_args *uap)
+sys_getaudit(struct thread *td, struct getaudit_args *uap)
{
struct auditinfo ai;
struct ucred *cred;
int error;
cred = td->td_ucred;
if (jailed(cred))
return (ENOSYS);
error = priv_check(td, PRIV_AUDIT_GETAUDIT);
if (error)
return (error);
if (cred->cr_audit.ai_termid.at_type == AU_IPv6)
return (E2BIG);
bzero(&ai, sizeof(ai));
ai.ai_auid = cred->cr_audit.ai_auid;
ai.ai_mask = cred->cr_audit.ai_mask;
ai.ai_asid = cred->cr_audit.ai_asid;
ai.ai_termid.machine = cred->cr_audit.ai_termid.at_addr[0];
ai.ai_termid.port = cred->cr_audit.ai_termid.at_port;
return (copyout(&ai, uap->auditinfo, sizeof(ai)));
}
/* ARGSUSED */
int
-setaudit(struct thread *td, struct setaudit_args *uap)
+sys_setaudit(struct thread *td, struct setaudit_args *uap)
{
struct ucred *newcred, *oldcred;
struct auditinfo ai;
int error;
if (jailed(td->td_ucred))
return (ENOSYS);
error = copyin(uap->auditinfo, &ai, sizeof(ai));
if (error)
return (error);
audit_arg_auditinfo(&ai);
newcred = crget();
PROC_LOCK(td->td_proc);
oldcred = td->td_proc->p_ucred;
crcopy(newcred, oldcred);
#ifdef MAC
error = mac_cred_check_setaudit(oldcred, &ai);
if (error)
goto fail;
#endif
error = priv_check_cred(oldcred, PRIV_AUDIT_SETAUDIT, 0);
if (error)
goto fail;
bzero(&newcred->cr_audit, sizeof(newcred->cr_audit));
newcred->cr_audit.ai_auid = ai.ai_auid;
newcred->cr_audit.ai_mask = ai.ai_mask;
newcred->cr_audit.ai_asid = ai.ai_asid;
newcred->cr_audit.ai_termid.at_addr[0] = ai.ai_termid.machine;
newcred->cr_audit.ai_termid.at_port = ai.ai_termid.port;
newcred->cr_audit.ai_termid.at_type = AU_IPv4;
td->td_proc->p_ucred = newcred;
PROC_UNLOCK(td->td_proc);
crfree(oldcred);
return (0);
fail:
PROC_UNLOCK(td->td_proc);
crfree(newcred);
return (error);
}
/* ARGSUSED */
int
-getaudit_addr(struct thread *td, struct getaudit_addr_args *uap)
+sys_getaudit_addr(struct thread *td, struct getaudit_addr_args *uap)
{
int error;
if (jailed(td->td_ucred))
return (ENOSYS);
if (uap->length < sizeof(*uap->auditinfo_addr))
return (EOVERFLOW);
error = priv_check(td, PRIV_AUDIT_GETAUDIT);
if (error)
return (error);
return (copyout(&td->td_ucred->cr_audit, uap->auditinfo_addr,
sizeof(*uap->auditinfo_addr)));
}
/* ARGSUSED */
int
-setaudit_addr(struct thread *td, struct setaudit_addr_args *uap)
+sys_setaudit_addr(struct thread *td, struct setaudit_addr_args *uap)
{
struct ucred *newcred, *oldcred;
struct auditinfo_addr aia;
int error;
if (jailed(td->td_ucred))
return (ENOSYS);
error = copyin(uap->auditinfo_addr, &aia, sizeof(aia));
if (error)
return (error);
audit_arg_auditinfo_addr(&aia);
if (aia.ai_termid.at_type != AU_IPv6 &&
aia.ai_termid.at_type != AU_IPv4)
return (EINVAL);
newcred = crget();
PROC_LOCK(td->td_proc);
oldcred = td->td_proc->p_ucred;
crcopy(newcred, oldcred);
#ifdef MAC
error = mac_cred_check_setaudit_addr(oldcred, &aia);
if (error)
goto fail;
#endif
error = priv_check_cred(oldcred, PRIV_AUDIT_SETAUDIT, 0);
if (error)
goto fail;
newcred->cr_audit = aia;
td->td_proc->p_ucred = newcred;
PROC_UNLOCK(td->td_proc);
crfree(oldcred);
return (0);
fail:
PROC_UNLOCK(td->td_proc);
crfree(newcred);
return (error);
}
/*
* Syscall to manage audit files.
*/
/* ARGSUSED */
int
-auditctl(struct thread *td, struct auditctl_args *uap)
+sys_auditctl(struct thread *td, struct auditctl_args *uap)
{
struct nameidata nd;
struct ucred *cred;
struct vnode *vp;
int error = 0;
int flags, vfslocked;
if (jailed(td->td_ucred))
return (ENOSYS);
error = priv_check(td, PRIV_AUDIT_CONTROL);
if (error)
return (error);
vp = NULL;
cred = NULL;
/*
* If a path is specified, open the replacement vnode, perform
* validity checks, and grab another reference to the current
* credential.
*
* On Darwin, a NULL path argument is also used to disable audit.
*/
if (uap->path == NULL)
return (EINVAL);
NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
UIO_USERSPACE, uap->path, td);
flags = AUDIT_OPEN_FLAGS;
error = vn_open(&nd, &flags, 0, NULL);
if (error)
return (error);
vfslocked = NDHASGIANT(&nd);
vp = nd.ni_vp;
#ifdef MAC
error = mac_system_check_auditctl(td->td_ucred, vp);
VOP_UNLOCK(vp, 0);
if (error) {
vn_close(vp, AUDIT_CLOSE_FLAGS, td->td_ucred, td);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
#else
VOP_UNLOCK(vp, 0);
#endif
NDFREE(&nd, NDF_ONLY_PNBUF);
if (vp->v_type != VREG) {
vn_close(vp, AUDIT_CLOSE_FLAGS, td->td_ucred, td);
VFS_UNLOCK_GIANT(vfslocked);
return (EINVAL);
}
VFS_UNLOCK_GIANT(vfslocked);
cred = td->td_ucred;
crhold(cred);
/*
* XXXAUDIT: Should audit_suspended actually be cleared by
* audit_worker?
*/
audit_suspended = 0;
audit_rotate_vnode(cred, vp);
return (error);
}
#else /* !AUDIT */
int
-audit(struct thread *td, struct audit_args *uap)
+sys_audit(struct thread *td, struct audit_args *uap)
{
return (ENOSYS);
}
int
-auditon(struct thread *td, struct auditon_args *uap)
+sys_auditon(struct thread *td, struct auditon_args *uap)
{
return (ENOSYS);
}
int
-getauid(struct thread *td, struct getauid_args *uap)
+sys_getauid(struct thread *td, struct getauid_args *uap)
{
return (ENOSYS);
}
int
-setauid(struct thread *td, struct setauid_args *uap)
+sys_setauid(struct thread *td, struct setauid_args *uap)
{
return (ENOSYS);
}
int
-getaudit(struct thread *td, struct getaudit_args *uap)
+sys_getaudit(struct thread *td, struct getaudit_args *uap)
{
return (ENOSYS);
}
int
-setaudit(struct thread *td, struct setaudit_args *uap)
+sys_setaudit(struct thread *td, struct setaudit_args *uap)
{
return (ENOSYS);
}
int
-getaudit_addr(struct thread *td, struct getaudit_addr_args *uap)
+sys_getaudit_addr(struct thread *td, struct getaudit_addr_args *uap)
{
return (ENOSYS);
}
int
-setaudit_addr(struct thread *td, struct setaudit_addr_args *uap)
+sys_setaudit_addr(struct thread *td, struct setaudit_addr_args *uap)
{
return (ENOSYS);
}
int
-auditctl(struct thread *td, struct auditctl_args *uap)
+sys_auditctl(struct thread *td, struct auditctl_args *uap)
{
return (ENOSYS);
}
#endif /* AUDIT */
Index: head/sys/security/mac/mac_syscalls.c
===================================================================
--- head/sys/security/mac/mac_syscalls.c (revision 225616)
+++ head/sys/security/mac/mac_syscalls.c (revision 225617)
@@ -1,731 +1,731 @@
/*-
* Copyright (c) 1999-2002, 2006, 2009 Robert N. M. Watson
* Copyright (c) 2001 Ilmar S. Habibulin
* Copyright (c) 2001-2005 Networks Associates Technology, Inc.
* Copyright (c) 2005-2006 SPARTA, Inc.
* Copyright (c) 2008 Apple Inc.
* All rights reserved.
*
* This software was developed by Robert Watson and Ilmar Habibulin for the
* TrustedBSD Project.
*
* This software was developed for the FreeBSD Project in part by Network
* Associates Laboratories, the Security Research Division of Network
* Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
* as part of the DARPA CHATS research program.
*
* This software was enhanced by SPARTA ISSO under SPAWAR contract
* N66001-04-C-6019 ("SEFOS").
*
* This software was developed at the University of Cambridge Computer
* Laboratory with support from a grant from Google, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_mac.h"
#include <sys/param.h>
#include <sys/capability.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/mac.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/sysent.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/file.h>
#include <sys/namei.h>
#include <sys/socket.h>
#include <sys/pipe.h>
#include <sys/socketvar.h>
#include <security/mac/mac_framework.h>
#include <security/mac/mac_internal.h>
#include <security/mac/mac_policy.h>
#ifdef MAC
FEATURE(security_mac, "Mandatory Access Control Framework support");
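/*
 * Fetch the MAC label of another process: copy in the label element
 * request, look up and visibility-check the target process, then
 * externalize the requested label elements of its credential back to
 * the caller.
 */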
int
-__mac_get_pid(struct thread *td, struct __mac_get_pid_args *uap)
+sys___mac_get_pid(struct thread *td, struct __mac_get_pid_args *uap)
{
char *elements, *buffer;
struct mac mac;
struct proc *tproc;
struct ucred *tcred;
int error;
error = copyin(uap->mac_p, &mac, sizeof(mac));
if (error)
return (error);
error = mac_check_structmac_consistent(&mac);
if (error)
return (error);
tproc = pfind(uap->pid);
if (tproc == NULL)
return (ESRCH);
tcred = NULL; /* Satisfy gcc. */
error = p_cansee(td, tproc);
if (error == 0)
tcred = crhold(tproc->p_ucred);
PROC_UNLOCK(tproc);
if (error)
return (error);
elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
if (error) {
free(elements, M_MACTEMP);
crfree(tcred);
return (error);
}
buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
error = mac_cred_externalize_label(tcred->cr_label, elements,
buffer, mac.m_buflen);
if (error == 0)
error = copyout(buffer, mac.m_string, strlen(buffer)+1);
free(buffer, M_MACTEMP);
free(elements, M_MACTEMP);
crfree(tcred);
return (error);
}
int
-__mac_get_proc(struct thread *td, struct __mac_get_proc_args *uap)
+sys___mac_get_proc(struct thread *td, struct __mac_get_proc_args *uap)
{
char *elements, *buffer;
struct mac mac;
int error;
error = copyin(uap->mac_p, &mac, sizeof(mac));
if (error)
return (error);
error = mac_check_structmac_consistent(&mac);
if (error)
return (error);
elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
if (error) {
free(elements, M_MACTEMP);
return (error);
}
buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
error = mac_cred_externalize_label(td->td_ucred->cr_label,
elements, buffer, mac.m_buflen);
if (error == 0)
error = copyout(buffer, mac.m_string, strlen(buffer)+1);
free(buffer, M_MACTEMP);
free(elements, M_MACTEMP);
return (error);
}
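/*
 * Relabel the calling process: internalize the user-supplied label,
 * ask the policies whether the relabel is permitted, then install a
 * new credential and revoke now-inaccessible memory mappings.
 */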
int
-__mac_set_proc(struct thread *td, struct __mac_set_proc_args *uap)
+sys___mac_set_proc(struct thread *td, struct __mac_set_proc_args *uap)
{
struct ucred *newcred, *oldcred;
struct label *intlabel;
struct proc *p;
struct mac mac;
char *buffer;
int error;
if (!(mac_labeled & MPC_OBJECT_CRED))
return (EINVAL);
error = copyin(uap->mac_p, &mac, sizeof(mac));
if (error)
return (error);
error = mac_check_structmac_consistent(&mac);
if (error)
return (error);
buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL);
if (error) {
free(buffer, M_MACTEMP);
return (error);
}
intlabel = mac_cred_label_alloc();
error = mac_cred_internalize_label(intlabel, buffer);
free(buffer, M_MACTEMP);
if (error)
goto out;
newcred = crget();
p = td->td_proc;
PROC_LOCK(p);
oldcred = p->p_ucred;
error = mac_cred_check_relabel(oldcred, intlabel);
if (error) {
PROC_UNLOCK(p);
crfree(newcred);
goto out;
}
setsugid(p);
crcopy(newcred, oldcred);
mac_cred_relabel(newcred, intlabel);
p->p_ucred = newcred;
PROC_UNLOCK(p);
crfree(oldcred);
mac_proc_vm_revoke(td);
out:
mac_cred_label_free(intlabel);
return (error);
}
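/*
 * Fetch the MAC label of an object referenced by a file descriptor.
 * Vnodes, pipes and sockets are supported; the object's label is
 * copied under the appropriate lock and then externalized without
 * that lock held.
 */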
int
-__mac_get_fd(struct thread *td, struct __mac_get_fd_args *uap)
+sys___mac_get_fd(struct thread *td, struct __mac_get_fd_args *uap)
{
char *elements, *buffer;
struct label *intlabel;
struct file *fp;
struct mac mac;
struct vnode *vp;
struct pipe *pipe;
struct socket *so;
short label_type;
int vfslocked, error;
error = copyin(uap->mac_p, &mac, sizeof(mac));
if (error)
return (error);
error = mac_check_structmac_consistent(&mac);
if (error)
return (error);
elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
if (error) {
free(elements, M_MACTEMP);
return (error);
}
buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
error = fget(td, uap->fd, CAP_MAC_GET, &fp);
if (error)
goto out;
label_type = fp->f_type;
switch (fp->f_type) {
case DTYPE_FIFO:
case DTYPE_VNODE:
if (!(mac_labeled & MPC_OBJECT_VNODE))
return (EINVAL);
vp = fp->f_vnode;
intlabel = mac_vnode_label_alloc();
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
mac_vnode_copy_label(vp->v_label, intlabel);
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
error = mac_vnode_externalize_label(intlabel, elements,
buffer, mac.m_buflen);
mac_vnode_label_free(intlabel);
break;
case DTYPE_PIPE:
if (!(mac_labeled & MPC_OBJECT_PIPE))
return (EINVAL);
pipe = fp->f_data;
intlabel = mac_pipe_label_alloc();
PIPE_LOCK(pipe);
mac_pipe_copy_label(pipe->pipe_pair->pp_label, intlabel);
PIPE_UNLOCK(pipe);
error = mac_pipe_externalize_label(intlabel, elements,
buffer, mac.m_buflen);
mac_pipe_label_free(intlabel);
break;
case DTYPE_SOCKET:
if (!(mac_labeled & MPC_OBJECT_SOCKET))
return (EINVAL);
so = fp->f_data;
intlabel = mac_socket_label_alloc(M_WAITOK);
SOCK_LOCK(so);
mac_socket_copy_label(so->so_label, intlabel);
SOCK_UNLOCK(so);
error = mac_socket_externalize_label(intlabel, elements,
buffer, mac.m_buflen);
mac_socket_label_free(intlabel);
break;
default:
error = EINVAL;
}
fdrop(fp, td);
if (error == 0)
error = copyout(buffer, mac.m_string, strlen(buffer)+1);
out:
free(buffer, M_MACTEMP);
free(elements, M_MACTEMP);
return (error);
}
int
-__mac_get_file(struct thread *td, struct __mac_get_file_args *uap)
+sys___mac_get_file(struct thread *td, struct __mac_get_file_args *uap)
{
char *elements, *buffer;
struct nameidata nd;
struct label *intlabel;
struct mac mac;
int vfslocked, error;
if (!(mac_labeled & MPC_OBJECT_VNODE))
return (EINVAL);
error = copyin(uap->mac_p, &mac, sizeof(mac));
if (error)
return (error);
error = mac_check_structmac_consistent(&mac);
if (error)
return (error);
elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
if (error) {
free(elements, M_MACTEMP);
return (error);
}
buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
NDINIT(&nd, LOOKUP, MPSAFE | LOCKLEAF | FOLLOW, UIO_USERSPACE,
uap->path_p, td);
error = namei(&nd);
if (error)
goto out;
intlabel = mac_vnode_label_alloc();
vfslocked = NDHASGIANT(&nd);
mac_vnode_copy_label(nd.ni_vp->v_label, intlabel);
error = mac_vnode_externalize_label(intlabel, elements, buffer,
mac.m_buflen);
NDFREE(&nd, 0);
VFS_UNLOCK_GIANT(vfslocked);
mac_vnode_label_free(intlabel);
if (error == 0)
error = copyout(buffer, mac.m_string, strlen(buffer)+1);
out:
free(buffer, M_MACTEMP);
free(elements, M_MACTEMP);
return (error);
}
int
-__mac_get_link(struct thread *td, struct __mac_get_link_args *uap)
+sys___mac_get_link(struct thread *td, struct __mac_get_link_args *uap)
{
char *elements, *buffer;
struct nameidata nd;
struct label *intlabel;
struct mac mac;
int vfslocked, error;
if (!(mac_labeled & MPC_OBJECT_VNODE))
return (EINVAL);
error = copyin(uap->mac_p, &mac, sizeof(mac));
if (error)
return (error);
error = mac_check_structmac_consistent(&mac);
if (error)
return (error);
elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
if (error) {
free(elements, M_MACTEMP);
return (error);
}
buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
NDINIT(&nd, LOOKUP, MPSAFE | LOCKLEAF | NOFOLLOW, UIO_USERSPACE,
uap->path_p, td);
error = namei(&nd);
if (error)
goto out;
intlabel = mac_vnode_label_alloc();
vfslocked = NDHASGIANT(&nd);
mac_vnode_copy_label(nd.ni_vp->v_label, intlabel);
error = mac_vnode_externalize_label(intlabel, elements, buffer,
mac.m_buflen);
NDFREE(&nd, 0);
VFS_UNLOCK_GIANT(vfslocked);
mac_vnode_label_free(intlabel);
if (error == 0)
error = copyout(buffer, mac.m_string, strlen(buffer)+1);
out:
free(buffer, M_MACTEMP);
free(elements, M_MACTEMP);
return (error);
}
int
-__mac_set_fd(struct thread *td, struct __mac_set_fd_args *uap)
+sys___mac_set_fd(struct thread *td, struct __mac_set_fd_args *uap)
{
struct label *intlabel;
struct pipe *pipe;
struct socket *so;
struct file *fp;
struct mount *mp;
struct vnode *vp;
struct mac mac;
char *buffer;
int error, vfslocked;
error = copyin(uap->mac_p, &mac, sizeof(mac));
if (error)
return (error);
error = mac_check_structmac_consistent(&mac);
if (error)
return (error);
buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL);
if (error) {
free(buffer, M_MACTEMP);
return (error);
}
error = fget(td, uap->fd, CAP_MAC_SET, &fp);
if (error)
goto out;
switch (fp->f_type) {
case DTYPE_FIFO:
case DTYPE_VNODE:
if (!(mac_labeled & MPC_OBJECT_VNODE))
return (EINVAL);
intlabel = mac_vnode_label_alloc();
error = mac_vnode_internalize_label(intlabel, buffer);
if (error) {
mac_vnode_label_free(intlabel);
break;
}
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
if (error != 0) {
VFS_UNLOCK_GIANT(vfslocked);
mac_vnode_label_free(intlabel);
break;
}
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
error = vn_setlabel(vp, intlabel, td->td_ucred);
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
VFS_UNLOCK_GIANT(vfslocked);
mac_vnode_label_free(intlabel);
break;
case DTYPE_PIPE:
if (!(mac_labeled & MPC_OBJECT_PIPE))
return (EINVAL);
intlabel = mac_pipe_label_alloc();
error = mac_pipe_internalize_label(intlabel, buffer);
if (error == 0) {
pipe = fp->f_data;
PIPE_LOCK(pipe);
error = mac_pipe_label_set(td->td_ucred,
pipe->pipe_pair, intlabel);
PIPE_UNLOCK(pipe);
}
mac_pipe_label_free(intlabel);
break;
case DTYPE_SOCKET:
if (!(mac_labeled & MPC_OBJECT_SOCKET))
return (EINVAL);
intlabel = mac_socket_label_alloc(M_WAITOK);
error = mac_socket_internalize_label(intlabel, buffer);
if (error == 0) {
so = fp->f_data;
error = mac_socket_label_set(td->td_ucred, so,
intlabel);
}
mac_socket_label_free(intlabel);
break;
default:
error = EINVAL;
}
fdrop(fp, td);
out:
free(buffer, M_MACTEMP);
return (error);
}
int
-__mac_set_file(struct thread *td, struct __mac_set_file_args *uap)
+sys___mac_set_file(struct thread *td, struct __mac_set_file_args *uap)
{
struct label *intlabel;
struct nameidata nd;
struct mount *mp;
struct mac mac;
char *buffer;
int vfslocked, error;
if (!(mac_labeled & MPC_OBJECT_VNODE))
return (EINVAL);
error = copyin(uap->mac_p, &mac, sizeof(mac));
if (error)
return (error);
error = mac_check_structmac_consistent(&mac);
if (error)
return (error);
buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL);
if (error) {
free(buffer, M_MACTEMP);
return (error);
}
intlabel = mac_vnode_label_alloc();
error = mac_vnode_internalize_label(intlabel, buffer);
free(buffer, M_MACTEMP);
if (error)
goto out;
NDINIT(&nd, LOOKUP, MPSAFE | LOCKLEAF | FOLLOW, UIO_USERSPACE,
uap->path_p, td);
error = namei(&nd);
vfslocked = NDHASGIANT(&nd);
if (error == 0) {
error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH);
if (error == 0) {
error = vn_setlabel(nd.ni_vp, intlabel,
td->td_ucred);
vn_finished_write(mp);
}
}
NDFREE(&nd, 0);
VFS_UNLOCK_GIANT(vfslocked);
out:
mac_vnode_label_free(intlabel);
return (error);
}
int
-__mac_set_link(struct thread *td, struct __mac_set_link_args *uap)
+sys___mac_set_link(struct thread *td, struct __mac_set_link_args *uap)
{
struct label *intlabel;
struct nameidata nd;
struct mount *mp;
struct mac mac;
char *buffer;
int vfslocked, error;
if (!(mac_labeled & MPC_OBJECT_VNODE))
return (EINVAL);
error = copyin(uap->mac_p, &mac, sizeof(mac));
if (error)
return (error);
error = mac_check_structmac_consistent(&mac);
if (error)
return (error);
buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL);
if (error) {
free(buffer, M_MACTEMP);
return (error);
}
intlabel = mac_vnode_label_alloc();
error = mac_vnode_internalize_label(intlabel, buffer);
free(buffer, M_MACTEMP);
if (error)
goto out;
NDINIT(&nd, LOOKUP, MPSAFE | LOCKLEAF | NOFOLLOW, UIO_USERSPACE,
uap->path_p, td);
error = namei(&nd);
vfslocked = NDHASGIANT(&nd);
if (error == 0) {
error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH);
if (error == 0) {
error = vn_setlabel(nd.ni_vp, intlabel,
td->td_ucred);
vn_finished_write(mp);
}
}
NDFREE(&nd, 0);
VFS_UNLOCK_GIANT(vfslocked);
out:
mac_vnode_label_free(intlabel);
return (error);
}
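/*
 * Multiplexed entry point for policy-specific system calls: find the
 * named policy on the static or dynamic policy list and hand the call
 * off to its mpo_syscall method.
 */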
int
-mac_syscall(struct thread *td, struct mac_syscall_args *uap)
+sys_mac_syscall(struct thread *td, struct mac_syscall_args *uap)
{
struct mac_policy_conf *mpc;
char target[MAC_MAX_POLICY_NAME];
int error;
error = copyinstr(uap->policy, target, sizeof(target), NULL);
if (error)
return (error);
error = ENOSYS;
LIST_FOREACH(mpc, &mac_static_policy_list, mpc_list) {
if (strcmp(mpc->mpc_name, target) == 0 &&
mpc->mpc_ops->mpo_syscall != NULL) {
error = mpc->mpc_ops->mpo_syscall(td,
uap->call, uap->arg);
goto out;
}
}
if (!LIST_EMPTY(&mac_policy_list)) {
mac_policy_slock_sleep();
LIST_FOREACH(mpc, &mac_policy_list, mpc_list) {
if (strcmp(mpc->mpc_name, target) == 0 &&
mpc->mpc_ops->mpo_syscall != NULL) {
error = mpc->mpc_ops->mpo_syscall(td,
uap->call, uap->arg);
break;
}
}
mac_policy_sunlock_sleep();
}
out:
return (error);
}
#else /* !MAC */
int
-__mac_get_pid(struct thread *td, struct __mac_get_pid_args *uap)
+sys___mac_get_pid(struct thread *td, struct __mac_get_pid_args *uap)
{
return (ENOSYS);
}
int
-__mac_get_proc(struct thread *td, struct __mac_get_proc_args *uap)
+sys___mac_get_proc(struct thread *td, struct __mac_get_proc_args *uap)
{
return (ENOSYS);
}
int
-__mac_set_proc(struct thread *td, struct __mac_set_proc_args *uap)
+sys___mac_set_proc(struct thread *td, struct __mac_set_proc_args *uap)
{
return (ENOSYS);
}
int
-__mac_get_fd(struct thread *td, struct __mac_get_fd_args *uap)
+sys___mac_get_fd(struct thread *td, struct __mac_get_fd_args *uap)
{
return (ENOSYS);
}
int
-__mac_get_file(struct thread *td, struct __mac_get_file_args *uap)
+sys___mac_get_file(struct thread *td, struct __mac_get_file_args *uap)
{
return (ENOSYS);
}
int
-__mac_get_link(struct thread *td, struct __mac_get_link_args *uap)
+sys___mac_get_link(struct thread *td, struct __mac_get_link_args *uap)
{
return (ENOSYS);
}
int
-__mac_set_fd(struct thread *td, struct __mac_set_fd_args *uap)
+sys___mac_set_fd(struct thread *td, struct __mac_set_fd_args *uap)
{
return (ENOSYS);
}
int
-__mac_set_file(struct thread *td, struct __mac_set_file_args *uap)
+sys___mac_set_file(struct thread *td, struct __mac_set_file_args *uap)
{
return (ENOSYS);
}
int
-__mac_set_link(struct thread *td, struct __mac_set_link_args *uap)
+sys___mac_set_link(struct thread *td, struct __mac_set_link_args *uap)
{
return (ENOSYS);
}
int
-mac_syscall(struct thread *td, struct mac_syscall_args *uap)
+sys_mac_syscall(struct thread *td, struct mac_syscall_args *uap)
{
return (ENOSYS);
}
#endif /* !MAC */
Index: head/sys/sparc64/sparc64/machdep.c
===================================================================
--- head/sys/sparc64/sparc64/machdep.c (revision 225616)
+++ head/sys/sparc64/sparc64/machdep.c (revision 225617)
@@ -1,1128 +1,1128 @@
/*-
* Copyright (c) 2001 Jake Burkholder.
* Copyright (c) 1992 Terrence R. Lambert.
* Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* William Jolitz.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
* from: FreeBSD: src/sys/i386/i386/machdep.c,v 1.477 2001/08/27
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_ddb.h"
#include "opt_kstack_pages.h"
#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/cpu.h>
#include <sys/cons.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/interrupt.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/reboot.h>
#include <sys/signalvar.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/timetc.h>
#include <sys/ucontext.h>
#include <dev/ofw/openfirm.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_param.h>
#include <ddb/ddb.h>
#include <machine/bus.h>
#include <machine/cache.h>
#include <machine/clock.h>
#include <machine/cmt.h>
#include <machine/cpu.h>
#include <machine/fireplane.h>
#include <machine/fp.h>
#include <machine/fsr.h>
#include <machine/intr_machdep.h>
#include <machine/jbus.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/ofw_machdep.h>
#include <machine/ofw_mem.h>
#include <machine/pcb.h>
#include <machine/pmap.h>
#include <machine/pstate.h>
#include <machine/reg.h>
#include <machine/sigframe.h>
#include <machine/smp.h>
#include <machine/tick.h>
#include <machine/tlb.h>
#include <machine/tstate.h>
#include <machine/upa.h>
#include <machine/ver.h>
typedef int ofw_vec_t(void *);
#ifdef DDB
extern vm_offset_t ksym_start, ksym_end;
#endif
int dtlb_slots;
int itlb_slots;
struct tlb_entry *kernel_tlbs;
int kernel_tlb_slots;
int cold = 1;
long Maxmem;
long realmem;
void *dpcpu0;
char pcpu0[PCPU_PAGES * PAGE_SIZE];
struct trapframe frame0;
vm_offset_t kstack0;
vm_paddr_t kstack0_phys;
struct kva_md_info kmi;
u_long ofw_vec;
u_long ofw_tba;
u_int tba_taken_over;
char sparc64_model[32];
static int cpu_use_vis = 1;
cpu_block_copy_t *cpu_block_copy;
cpu_block_zero_t *cpu_block_zero;
static phandle_t find_bsp(phandle_t node, uint32_t bspid, u_int cpu_impl);
void sparc64_init(caddr_t mdp, u_long o1, u_long o2, u_long o3,
ofw_vec_t *vec);
static void sparc64_shutdown_final(void *dummy, int howto);
static void cpu_startup(void *arg);
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
CTASSERT((1 << INT_SHIFT) == sizeof(int));
CTASSERT((1 << PTR_SHIFT) == sizeof(char *));
CTASSERT(sizeof(struct reg) == 256);
CTASSERT(sizeof(struct fpreg) == 272);
CTASSERT(sizeof(struct __mcontext) == 512);
CTASSERT((sizeof(struct pcb) & (64 - 1)) == 0);
CTASSERT((offsetof(struct pcb, pcb_kfp) & (64 - 1)) == 0);
CTASSERT((offsetof(struct pcb, pcb_ufp) & (64 - 1)) == 0);
CTASSERT(sizeof(struct pcb) <= ((KSTACK_PAGES * PAGE_SIZE) / 8));
CTASSERT(sizeof(struct pcpu) <= ((PCPU_PAGES * PAGE_SIZE) / 2));
static void
cpu_startup(void *arg)
{
vm_paddr_t physsz;
int i;
physsz = 0;
for (i = 0; i < sparc64_nmemreg; i++)
physsz += sparc64_memreg[i].mr_size;
printf("real memory = %lu (%lu MB)\n", physsz,
physsz / (1024 * 1024));
realmem = (long)physsz / PAGE_SIZE;
vm_ksubmap_init(&kmi);
bufinit();
vm_pager_bufferinit();
EVENTHANDLER_REGISTER(shutdown_final, sparc64_shutdown_final, NULL,
SHUTDOWN_PRI_LAST);
printf("avail memory = %lu (%lu MB)\n", cnt.v_free_count * PAGE_SIZE,
cnt.v_free_count / ((1024 * 1024) / PAGE_SIZE));
if (bootverbose)
printf("machine: %s\n", sparc64_model);
cpu_identify(rdpr(ver), PCPU_GET(clock), curcpu);
}
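/*
 * Per-CPU initialization: thread the preallocated interrupt request
 * structures onto this CPU's free list.
 */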
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{
struct intr_request *ir;
int i;
pcpu->pc_irtail = &pcpu->pc_irhead;
for (i = 0; i < IR_FREE; i++) {
ir = &pcpu->pc_irpool[i];
ir->ir_next = pcpu->pc_irfree;
pcpu->pc_irfree = ir;
}
}
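/*
 * Spinlock enter/exit for sparc64: the first spinlock acquired by a
 * thread raises PIL to block interrupts and records the previous
 * level; nested acquisitions only bump the per-thread count.
 */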
void
spinlock_enter(void)
{
struct thread *td;
register_t pil;
td = curthread;
if (td->td_md.md_spinlock_count == 0) {
pil = rdpr(pil);
wrpr(pil, 0, PIL_TICK);
td->td_md.md_spinlock_count = 1;
td->td_md.md_saved_pil = pil;
} else
td->td_md.md_spinlock_count++;
critical_enter();
}
void
spinlock_exit(void)
{
struct thread *td;
register_t pil;
td = curthread;
critical_exit();
pil = td->td_md.md_saved_pil;
td->td_md.md_spinlock_count--;
if (td->td_md.md_spinlock_count == 0)
wrpr(pil, pil, 0);
}
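/*
 * Recursively search the OFW device tree for the CPU node whose ID
 * property matches the boot processor's module ID.
 */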
static phandle_t
find_bsp(phandle_t node, uint32_t bspid, u_int cpu_impl)
{
char type[sizeof("cpu")];
phandle_t child;
uint32_t cpuid;
for (; node != 0; node = OF_peer(node)) {
child = OF_child(node);
if (child > 0) {
child = find_bsp(child, bspid, cpu_impl);
if (child > 0)
return (child);
} else {
if (OF_getprop(node, "device_type", type,
sizeof(type)) <= 0)
continue;
if (strcmp(type, "cpu") != 0)
continue;
if (OF_getprop(node, cpu_cpuid_prop(cpu_impl), &cpuid,
sizeof(cpuid)) <= 0)
continue;
if (cpuid == bspid)
return (node);
}
}
return (0);
}
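/*
 * Name of the OFW property that holds the CPU ID; it differs between
 * UPA, Fireplane/Safari and JBus based implementations.
 */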
const char *
cpu_cpuid_prop(u_int cpu_impl)
{
switch (cpu_impl) {
case CPU_IMPL_SPARC64:
case CPU_IMPL_SPARC64V:
case CPU_IMPL_ULTRASPARCI:
case CPU_IMPL_ULTRASPARCII:
case CPU_IMPL_ULTRASPARCIIi:
case CPU_IMPL_ULTRASPARCIIe:
return ("upa-portid");
case CPU_IMPL_ULTRASPARCIII:
case CPU_IMPL_ULTRASPARCIIIp:
case CPU_IMPL_ULTRASPARCIIIi:
case CPU_IMPL_ULTRASPARCIIIip:
return ("portid");
case CPU_IMPL_ULTRASPARCIV:
case CPU_IMPL_ULTRASPARCIVp:
return ("cpuid");
default:
return ("");
}
}
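/*
 * Read the module/agent ID of the current CPU from the bus
 * configuration register appropriate for its implementation.
 */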
uint32_t
cpu_get_mid(u_int cpu_impl)
{
switch (cpu_impl) {
case CPU_IMPL_SPARC64:
case CPU_IMPL_SPARC64V:
case CPU_IMPL_ULTRASPARCI:
case CPU_IMPL_ULTRASPARCII:
case CPU_IMPL_ULTRASPARCIIi:
case CPU_IMPL_ULTRASPARCIIe:
return (UPA_CR_GET_MID(ldxa(0, ASI_UPA_CONFIG_REG)));
case CPU_IMPL_ULTRASPARCIII:
case CPU_IMPL_ULTRASPARCIIIp:
return (FIREPLANE_CR_GET_AID(ldxa(AA_FIREPLANE_CONFIG,
ASI_FIREPLANE_CONFIG_REG)));
case CPU_IMPL_ULTRASPARCIIIi:
case CPU_IMPL_ULTRASPARCIIIip:
return (JBUS_CR_GET_JID(ldxa(0, ASI_JBUS_CONFIG_REG)));
case CPU_IMPL_ULTRASPARCIV:
case CPU_IMPL_ULTRASPARCIVp:
return (INTR_ID_GET_ID(ldxa(AA_INTR_ID, ASI_INTR_ID)));
default:
return (0);
}
}
void
sparc64_init(caddr_t mdp, u_long o1, u_long o2, u_long o3, ofw_vec_t *vec)
{
char *env;
struct pcpu *pc;
vm_offset_t end;
vm_offset_t va;
caddr_t kmdp;
phandle_t root;
u_int cpu_impl;
end = 0;
kmdp = NULL;
/*
* Find out what kind of CPU we have first, for anything that changes
* behaviour.
*/
cpu_impl = VER_IMPL(rdpr(ver));
/*
* Do CPU-specific initialization.
*/
if (cpu_impl >= CPU_IMPL_ULTRASPARCIII)
cheetah_init(cpu_impl);
else if (cpu_impl == CPU_IMPL_SPARC64V)
zeus_init(cpu_impl);
/*
* Clear (S)TICK timer (including NPT).
*/
tick_clear(cpu_impl);
/*
* UltraSparc II[e,i] based systems come up with the tick interrupt
* enabled and a handler that resets the tick counter, causing DELAY()
* to not work properly when used early in boot.
* UltraSPARC III based systems come up with the system tick interrupt
* enabled, causing an interrupt storm on startup since they are not
* handled.
*/
tick_stop(cpu_impl);
/*
* Set up Open Firmware entry points.
*/
ofw_tba = rdpr(tba);
ofw_vec = (u_long)vec;
/*
* Parse metadata if present and fetch parameters. Must be before the
* console is inited so cninit gets the right value of boothowto.
*/
if (mdp != NULL) {
preload_metadata = mdp;
kmdp = preload_search_by_type("elf kernel");
if (kmdp != NULL) {
boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
end = MD_FETCH(kmdp, MODINFOMD_KERNEND, vm_offset_t);
kernel_tlb_slots = MD_FETCH(kmdp, MODINFOMD_DTLB_SLOTS,
int);
kernel_tlbs = (void *)preload_search_info(kmdp,
MODINFO_METADATA | MODINFOMD_DTLB);
}
}
init_param1();
/*
* Initialize Open Firmware (needed for console).
*/
OF_install(OFW_STD_DIRECT, 0);
OF_init(ofw_entry);
/*
* Prime our per-CPU data page for use. Note, we are using it for
* our stack, so don't pass the real size (PAGE_SIZE) to pcpu_init
* or it'll zero it out from under us.
*/
pc = (struct pcpu *)(pcpu0 + (PCPU_PAGES * PAGE_SIZE)) - 1;
pcpu_init(pc, 0, sizeof(struct pcpu));
pc->pc_addr = (vm_offset_t)pcpu0;
pc->pc_impl = cpu_impl;
pc->pc_mid = cpu_get_mid(cpu_impl);
pc->pc_tlb_ctx = TLB_CTX_USER_MIN;
pc->pc_tlb_ctx_min = TLB_CTX_USER_MIN;
pc->pc_tlb_ctx_max = TLB_CTX_USER_MAX;
/*
* Determine the OFW node and frequency of the BSP (and ensure the
* BSP is in the device tree in the first place).
*/
root = OF_peer(0);
pc->pc_node = find_bsp(root, pc->pc_mid, cpu_impl);
if (pc->pc_node == 0)
OF_exit();
if (OF_getprop(pc->pc_node, "clock-frequency", &pc->pc_clock,
sizeof(pc->pc_clock)) <= 0)
OF_exit();
/*
* Provide a DELAY() that works before PCPU_REG is set. We can't
* set PCPU_REG without also taking over the trap table or the
* firmware will overwrite it. Unfortunately, it's way too early
* to also take over the trap table at this point.
*/
clock_boot = pc->pc_clock;
delay_func = delay_boot;
/*
* Initialize the console before printing anything.
* NB: the low-level console drivers require a working DELAY() at
* this point.
*/
cninit();
/*
* Panic if there is no metadata. Most likely the kernel was booted
* directly, instead of through loader(8).
*/
if (mdp == NULL || kmdp == NULL || end == 0 ||
kernel_tlb_slots == 0 || kernel_tlbs == NULL) {
printf("sparc64_init: missing loader metadata.\n"
"This probably means you are not using loader(8).\n");
panic("sparc64_init");
}
/*
* Work around the broken loader behavior of not demapping
* no-longer-used kernel TLB slots when unloading the kernel or
* modules.
*/
for (va = KERNBASE + (kernel_tlb_slots - 1) * PAGE_SIZE_4M;
va >= roundup2(end, PAGE_SIZE_4M); va -= PAGE_SIZE_4M) {
if (bootverbose)
printf("demapping unused kernel TLB slot "
"(va %#lx - %#lx)\n", va, va + PAGE_SIZE_4M - 1);
stxa(TLB_DEMAP_VA(va) | TLB_DEMAP_PRIMARY | TLB_DEMAP_PAGE,
ASI_DMMU_DEMAP, 0);
stxa(TLB_DEMAP_VA(va) | TLB_DEMAP_PRIMARY | TLB_DEMAP_PAGE,
ASI_IMMU_DEMAP, 0);
flush(KERNBASE);
kernel_tlb_slots--;
}
/*
* Determine the TLB slot maxima, which are expected to be
* equal across all CPUs.
* NB: for cheetah-class CPUs, these properties only refer
* to the t16s.
*/
if (OF_getprop(pc->pc_node, "#dtlb-entries", &dtlb_slots,
sizeof(dtlb_slots)) == -1)
panic("sparc64_init: cannot determine number of dTLB slots");
if (OF_getprop(pc->pc_node, "#itlb-entries", &itlb_slots,
sizeof(itlb_slots)) == -1)
panic("sparc64_init: cannot determine number of iTLB slots");
/*
* Initialize and enable the caches. Note that this may include
* applying workarounds.
*/
cache_init(pc);
cache_enable(cpu_impl);
uma_set_align(pc->pc_cache.dc_linesize - 1);
cpu_block_copy = bcopy;
cpu_block_zero = bzero;
getenv_int("machdep.use_vis", &cpu_use_vis);
if (cpu_use_vis) {
switch (cpu_impl) {
case CPU_IMPL_SPARC64:
case CPU_IMPL_ULTRASPARCI:
case CPU_IMPL_ULTRASPARCII:
case CPU_IMPL_ULTRASPARCIIi:
case CPU_IMPL_ULTRASPARCIIe:
case CPU_IMPL_ULTRASPARCIII: /* NB: we've disabled P$. */
case CPU_IMPL_ULTRASPARCIIIp:
case CPU_IMPL_ULTRASPARCIIIi:
case CPU_IMPL_ULTRASPARCIV:
case CPU_IMPL_ULTRASPARCIVp:
case CPU_IMPL_ULTRASPARCIIIip:
cpu_block_copy = spitfire_block_copy;
cpu_block_zero = spitfire_block_zero;
break;
case CPU_IMPL_SPARC64V:
cpu_block_copy = zeus_block_copy;
cpu_block_zero = zeus_block_zero;
break;
}
}
#ifdef SMP
mp_init(cpu_impl);
#endif
/*
* Initialize virtual memory and calculate physmem.
*/
pmap_bootstrap(cpu_impl);
/*
* Initialize tunables.
*/
init_param2(physmem);
env = getenv("kernelname");
if (env != NULL) {
strlcpy(kernelname, env, sizeof(kernelname));
freeenv(env);
}
/*
* Initialize the interrupt tables.
*/
intr_init1();
/*
* Initialize proc0, set kstack0, frame0, curthread and curpcb.
*/
proc_linkup0(&proc0, &thread0);
proc0.p_md.md_sigtramp = NULL;
proc0.p_md.md_utrap = NULL;
thread0.td_kstack = kstack0;
thread0.td_kstack_pages = KSTACK_PAGES;
thread0.td_pcb = (struct pcb *)
(thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1;
frame0.tf_tstate = TSTATE_IE | TSTATE_PEF | TSTATE_PRIV;
thread0.td_frame = &frame0;
pc->pc_curthread = &thread0;
pc->pc_curpcb = thread0.td_pcb;
/*
* Initialize global registers.
*/
cpu_setregs(pc);
/*
* Take over the trap table via the PROM. Using the PROM for this
* is necessary in order to set obp-control-relinquished to true
* within the PROM so obtaining /virtual-memory/translations doesn't
* trigger a fatal reset error or worse things further down the road.
* XXX it should be possible to use this solely instead of writing
* %tba in cpu_setregs(). Doing so causes a hang however.
*/
sun4u_set_traptable(tl0_base);
/*
* It's now safe to use the real DELAY().
*/
delay_func = delay_tick;
/*
* Initialize the dynamic per-CPU area for the BSP and the message
* buffer (after setting the trap table).
*/
dpcpu_init(dpcpu0, 0);
msgbufinit(msgbufp, msgbufsize);
/*
* Initialize mutexes.
*/
mutex_init();
/*
* Finish the interrupt initialization now that mutexes work and
* enable them.
*/
intr_init2();
wrpr(pil, 0, 0);
wrpr(pstate, 0, PSTATE_KERNEL);
/*
* Finish pmap initialization now that we're ready for mutexes.
*/
PMAP_LOCK_INIT(kernel_pmap);
OF_getprop(root, "name", sparc64_model, sizeof(sparc64_model) - 1);
kdb_init();
#ifdef KDB
if (boothowto & RB_KDB)
kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
struct trapframe *tf;
struct sigframe *sfp;
struct sigacts *psp;
struct sigframe sf;
struct thread *td;
struct frame *fp;
struct proc *p;
u_long sp;
int oonstack;
int sig;
oonstack = 0;
td = curthread;
p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
sig = ksi->ksi_signo;
psp = p->p_sigacts;
mtx_assert(&psp->ps_mtx, MA_OWNED);
tf = td->td_frame;
sp = tf->tf_sp + SPOFF;
oonstack = sigonstack(sp);
CTR4(KTR_SIG, "sendsig: td=%p (%s) catcher=%p sig=%d", td, p->p_comm,
catcher, sig);
/* Make sure we have a signal trampoline to return to. */
if (p->p_md.md_sigtramp == NULL) {
/*
* No signal trampoline... kill the process.
*/
CTR0(KTR_SIG, "sendsig: no sigtramp");
printf("sendsig: %s is too old, rebuild it\n", p->p_comm);
sigexit(td, sig);
/* NOTREACHED */
}
/* Save user context. */
bzero(&sf, sizeof(sf));
get_mcontext(td, &sf.sf_uc.uc_mcontext, 0);
sf.sf_uc.uc_sigmask = *mask;
sf.sf_uc.uc_stack = td->td_sigstk;
sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ?
((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
/* Allocate and validate space for the signal handler context. */
if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
SIGISMEMBER(psp->ps_sigonstack, sig)) {
sfp = (struct sigframe *)(td->td_sigstk.ss_sp +
td->td_sigstk.ss_size - sizeof(struct sigframe));
} else
sfp = (struct sigframe *)sp - 1;
mtx_unlock(&psp->ps_mtx);
PROC_UNLOCK(p);
fp = (struct frame *)sfp - 1;
/* Translate the signal if appropriate. */
if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
/* Build the argument list for the signal handler. */
tf->tf_out[0] = sig;
tf->tf_out[2] = (register_t)&sfp->sf_uc;
tf->tf_out[4] = (register_t)catcher;
if (SIGISMEMBER(psp->ps_siginfo, sig)) {
/* Signal handler installed with SA_SIGINFO. */
tf->tf_out[1] = (register_t)&sfp->sf_si;
/* Fill in POSIX parts. */
sf.sf_si = ksi->ksi_info;
sf.sf_si.si_signo = sig; /* maybe a translated signal */
} else {
/* Old FreeBSD-style arguments. */
tf->tf_out[1] = ksi->ksi_code;
tf->tf_out[3] = (register_t)ksi->ksi_addr;
}
/* Copy the sigframe out to the user's stack. */
if (rwindow_save(td) != 0 || copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
suword(&fp->fr_in[6], tf->tf_out[6]) != 0) {
/*
* Something is wrong with the stack pointer; kill the process.
*/
CTR2(KTR_SIG, "sendsig: sigexit td=%p sfp=%p", td, sfp);
PROC_LOCK(p);
sigexit(td, SIGILL);
/* NOTREACHED */
}
tf->tf_tpc = (u_long)p->p_md.md_sigtramp;
tf->tf_tnpc = tf->tf_tpc + 4;
tf->tf_sp = (u_long)fp - SPOFF;
CTR3(KTR_SIG, "sendsig: return td=%p pc=%#lx sp=%#lx", td, tf->tf_tpc,
tf->tf_sp);
PROC_LOCK(p);
mtx_lock(&psp->ps_mtx);
}
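/*
 * For reference (illustrative sketch, not part of this revision): with the
 * %o registers set up as above, the userland trampoline at md_sigtramp is
 * entered with tf_out[0..4] in %o0-%o4 and invokes the handler roughly as
 *
 *	(*catcher)(sig, siginfo_or_code, &sfp->sf_uc);
 *
 * where the second argument is &sfp->sf_si for SA_SIGINFO handlers and the
 * plain signal code otherwise, and %o4 carries the catcher address for the
 * trampoline itself.
 */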
#ifndef _SYS_SYSPROTO_H_
struct sigreturn_args {
ucontext_t *ucp;
};
#endif
/*
* MPSAFE
*/
int
-sigreturn(struct thread *td, struct sigreturn_args *uap)
+sys_sigreturn(struct thread *td, struct sigreturn_args *uap)
{
struct proc *p;
mcontext_t *mc;
ucontext_t uc;
int error;
p = td->td_proc;
if (rwindow_save(td)) {
PROC_LOCK(p);
sigexit(td, SIGILL);
}
CTR2(KTR_SIG, "sigreturn: td=%p ucp=%p", td, uap->sigcntxp);
if (copyin(uap->sigcntxp, &uc, sizeof(uc)) != 0) {
CTR1(KTR_SIG, "sigreturn: efault td=%p", td);
return (EFAULT);
}
mc = &uc.uc_mcontext;
error = set_mcontext(td, mc);
if (error != 0)
return (error);
kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0);
CTR4(KTR_SIG, "sigreturn: return td=%p pc=%#lx sp=%#lx tstate=%#lx",
td, mc->mc_tpc, mc->mc_sp, mc->mc_tstate);
return (EJUSTRETURN);
}
/*
* Construct a PCB from a trapframe. This is called from kdb_trap() where
* we want to start a backtrace from the function that caused us to enter
* the debugger. We have the context in the trapframe, but base the trace
* on the PCB. The PCB doesn't have to be perfect, as long as it contains
* enough for a backtrace.
*/
void
makectx(struct trapframe *tf, struct pcb *pcb)
{
pcb->pcb_pc = tf->tf_tpc;
pcb->pcb_sp = tf->tf_sp;
}
int
get_mcontext(struct thread *td, mcontext_t *mc, int flags)
{
struct trapframe *tf;
struct pcb *pcb;
tf = td->td_frame;
pcb = td->td_pcb;
/*
* Copy the registers which will be restored by tl0_ret() from the
* trapframe.
* Note that we skip %g7, which is used as the userland TLS register,
* and %wstate.
*/
mc->mc_flags = _MC_VERSION;
mc->mc_global[1] = tf->tf_global[1];
mc->mc_global[2] = tf->tf_global[2];
mc->mc_global[3] = tf->tf_global[3];
mc->mc_global[4] = tf->tf_global[4];
mc->mc_global[5] = tf->tf_global[5];
mc->mc_global[6] = tf->tf_global[6];
if (flags & GET_MC_CLEAR_RET) {
mc->mc_out[0] = 0;
mc->mc_out[1] = 0;
} else {
mc->mc_out[0] = tf->tf_out[0];
mc->mc_out[1] = tf->tf_out[1];
}
mc->mc_out[2] = tf->tf_out[2];
mc->mc_out[3] = tf->tf_out[3];
mc->mc_out[4] = tf->tf_out[4];
mc->mc_out[5] = tf->tf_out[5];
mc->mc_out[6] = tf->tf_out[6];
mc->mc_out[7] = tf->tf_out[7];
mc->mc_fprs = tf->tf_fprs;
mc->mc_fsr = tf->tf_fsr;
mc->mc_gsr = tf->tf_gsr;
mc->mc_tnpc = tf->tf_tnpc;
mc->mc_tpc = tf->tf_tpc;
mc->mc_tstate = tf->tf_tstate;
mc->mc_y = tf->tf_y;
critical_enter();
if ((tf->tf_fprs & FPRS_FEF) != 0) {
savefpctx(pcb->pcb_ufp);
tf->tf_fprs &= ~FPRS_FEF;
pcb->pcb_flags |= PCB_FEF;
}
if ((pcb->pcb_flags & PCB_FEF) != 0) {
bcopy(pcb->pcb_ufp, mc->mc_fp, sizeof(mc->mc_fp));
mc->mc_fprs |= FPRS_FEF;
}
critical_exit();
return (0);
}
int
set_mcontext(struct thread *td, const mcontext_t *mc)
{
struct trapframe *tf;
struct pcb *pcb;
if (!TSTATE_SECURE(mc->mc_tstate) ||
(mc->mc_flags & ((1L << _MC_VERSION_BITS) - 1)) != _MC_VERSION)
return (EINVAL);
tf = td->td_frame;
pcb = td->td_pcb;
/* Make sure the windows are spilled first. */
flushw();
/*
* Copy the registers which will be restored by tl0_ret() to the
* trapframe.
* Note that we skip %g7, which is used as the userland TLS register,
* and %wstate.
*/
tf->tf_global[1] = mc->mc_global[1];
tf->tf_global[2] = mc->mc_global[2];
tf->tf_global[3] = mc->mc_global[3];
tf->tf_global[4] = mc->mc_global[4];
tf->tf_global[5] = mc->mc_global[5];
tf->tf_global[6] = mc->mc_global[6];
tf->tf_out[0] = mc->mc_out[0];
tf->tf_out[1] = mc->mc_out[1];
tf->tf_out[2] = mc->mc_out[2];
tf->tf_out[3] = mc->mc_out[3];
tf->tf_out[4] = mc->mc_out[4];
tf->tf_out[5] = mc->mc_out[5];
tf->tf_out[6] = mc->mc_out[6];
tf->tf_out[7] = mc->mc_out[7];
tf->tf_fprs = mc->mc_fprs;
tf->tf_fsr = mc->mc_fsr;
tf->tf_gsr = mc->mc_gsr;
tf->tf_tnpc = mc->mc_tnpc;
tf->tf_tpc = mc->mc_tpc;
tf->tf_tstate = mc->mc_tstate;
tf->tf_y = mc->mc_y;
if ((mc->mc_fprs & FPRS_FEF) != 0) {
tf->tf_fprs = 0;
bcopy(mc->mc_fp, pcb->pcb_ufp, sizeof(pcb->pcb_ufp));
pcb->pcb_flags |= PCB_FEF;
}
return (0);
}
/*
* Exit the kernel and execute a firmware call that will not return, as
* specified by the arguments.
*/
void
cpu_shutdown(void *args)
{
#ifdef SMP
cpu_mp_shutdown();
#endif
ofw_exit(args);
}
/*
* Flush the D-cache for non-DMA I/O so that the I-cache can
* be made coherent later.
*/
void
cpu_flush_dcache(void *ptr, size_t len)
{
/* TBD */
}
/* Get current clock frequency for the given CPU ID. */
int
cpu_est_clockrate(int cpu_id, uint64_t *rate)
{
struct pcpu *pc;
pc = pcpu_find(cpu_id);
if (pc == NULL || rate == NULL)
return (EINVAL);
*rate = pc->pc_clock;
return (0);
}
/*
* Duplicate OF_exit() with a different firmware call function that restores
* the trap table; otherwise a RED state exception is triggered in at least
* some firmware versions.
*/
void
cpu_halt(void)
{
static struct {
cell_t name;
cell_t nargs;
cell_t nreturns;
} args = {
(cell_t)"exit",
0,
0
};
cpu_shutdown(&args);
}
static void
sparc64_shutdown_final(void *dummy, int howto)
{
static struct {
cell_t name;
cell_t nargs;
cell_t nreturns;
} args = {
(cell_t)"SUNW,power-off",
0,
0
};
/* Turn the power off? */
if ((howto & RB_POWEROFF) != 0)
cpu_shutdown(&args);
/* In case of halt, return to the firmware. */
if ((howto & RB_HALT) != 0)
cpu_halt();
}
void
cpu_idle(int busy)
{
/* Insert code to halt (until next interrupt) for the idle loop. */
}
int
cpu_idle_wakeup(int cpu)
{
return (1);
}
int
ptrace_set_pc(struct thread *td, u_long addr)
{
td->td_frame->tf_tpc = addr;
td->td_frame->tf_tnpc = addr + 4;
return (0);
}
int
ptrace_single_step(struct thread *td)
{
/* TODO. */
return (0);
}
int
ptrace_clear_single_step(struct thread *td)
{
/* TODO. */
return (0);
}
void
exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
struct trapframe *tf;
struct pcb *pcb;
struct proc *p;
u_long sp;
/* XXX no cpu_exec */
p = td->td_proc;
p->p_md.md_sigtramp = NULL;
if (p->p_md.md_utrap != NULL) {
utrap_free(p->p_md.md_utrap);
p->p_md.md_utrap = NULL;
}
pcb = td->td_pcb;
tf = td->td_frame;
sp = rounddown(stack, 16);
bzero(pcb, sizeof(*pcb));
bzero(tf, sizeof(*tf));
tf->tf_out[0] = stack;
tf->tf_out[3] = p->p_sysent->sv_psstrings;
tf->tf_out[6] = sp - SPOFF - sizeof(struct frame);
tf->tf_tnpc = imgp->entry_addr + 4;
tf->tf_tpc = imgp->entry_addr;
tf->tf_tstate = TSTATE_IE | TSTATE_PEF | TSTATE_MM_TSO;
td->td_retval[0] = tf->tf_out[0];
td->td_retval[1] = tf->tf_out[1];
}
int
fill_regs(struct thread *td, struct reg *regs)
{
bcopy(td->td_frame, regs, sizeof(*regs));
return (0);
}
int
set_regs(struct thread *td, struct reg *regs)
{
struct trapframe *tf;
if (!TSTATE_SECURE(regs->r_tstate))
return (EINVAL);
tf = td->td_frame;
regs->r_wstate = tf->tf_wstate;
bcopy(regs, tf, sizeof(*regs));
return (0);
}
int
fill_dbregs(struct thread *td, struct dbreg *dbregs)
{
return (ENOSYS);
}
int
set_dbregs(struct thread *td, struct dbreg *dbregs)
{
return (ENOSYS);
}
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{
struct trapframe *tf;
struct pcb *pcb;
pcb = td->td_pcb;
tf = td->td_frame;
bcopy(pcb->pcb_ufp, fpregs->fr_regs, sizeof(fpregs->fr_regs));
fpregs->fr_fsr = tf->tf_fsr;
fpregs->fr_gsr = tf->tf_gsr;
return (0);
}
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{
struct trapframe *tf;
struct pcb *pcb;
pcb = td->td_pcb;
tf = td->td_frame;
tf->tf_fprs &= ~FPRS_FEF;
bcopy(fpregs->fr_regs, pcb->pcb_ufp, sizeof(pcb->pcb_ufp));
tf->tf_fsr = fpregs->fr_fsr;
tf->tf_gsr = fpregs->fr_gsr;
return (0);
}
struct md_utrap *
utrap_alloc(void)
{
struct md_utrap *ut;
ut = malloc(sizeof(struct md_utrap), M_SUBPROC, M_WAITOK | M_ZERO);
ut->ut_refcnt = 1;
return (ut);
}
void
utrap_free(struct md_utrap *ut)
{
int refcnt;
if (ut == NULL)
return;
mtx_pool_lock(mtxpool_sleep, ut);
ut->ut_refcnt--;
refcnt = ut->ut_refcnt;
mtx_pool_unlock(mtxpool_sleep, ut);
if (refcnt == 0)
free(ut, M_SUBPROC);
}
struct md_utrap *
utrap_hold(struct md_utrap *ut)
{
if (ut == NULL)
return (NULL);
mtx_pool_lock(mtxpool_sleep, ut);
ut->ut_refcnt++;
mtx_pool_unlock(mtxpool_sleep, ut);
return (ut);
}
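/*
 * A minimal sketch of the intended reference counting (illustrative only,
 * assuming the usual fork path): a new process that shares its parent's
 * user trap table takes an extra reference and drops it on exec or exit,
 * e.g.
 *
 *	p2->p_md.md_utrap = utrap_hold(p1->p_md.md_utrap);
 *	...
 *	utrap_free(p2->p_md.md_utrap);
 *	p2->p_md.md_utrap = NULL;
 *
 * exec_setregs() above performs exactly this release when a new image is
 * activated.
 */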
Index: head/sys/sys/posix4.h
===================================================================
--- head/sys/sys/posix4.h (revision 225616)
+++ head/sys/sys/posix4.h (revision 225617)
@@ -1,117 +1,117 @@
#ifndef _P1003_1B_P1003_1B_H_
#define _P1003_1B_P1003_1B_H_
/*-
* Copyright (c) 1996, 1997, 1998
* HD Associates, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by HD Associates, Inc
* 4. Neither the name of the author nor the names of any co-contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/param.h>
#include <sys/ioccom.h>
#include <sys/malloc.h>
#include <sys/sched.h>
/* Generate syscall stubs for when something is optionally
* loadable as a module. References "syscall_not_present".
* XXX Good candidate for sys/syscall.h
*/
struct proc;
struct nosys_args;
extern int syscall_not_present(struct thread *, const char *, struct nosys_args *);
#define SYSCALL_NOT_PRESENT_GEN(SC) \
-int SC (struct thread *td, struct SC##_args *uap) \
+int sys_ ## SC (struct thread *td, struct SC##_args *uap) \
{ \
return syscall_not_present(td, #SC , (struct nosys_args *)uap); \
}
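/*
 * As an illustration of the sys_ prefix introduced above (expansion sketch,
 * not part of this header): SYSCALL_NOT_PRESENT_GEN(sched_yield) now emits
 *
 *	int sys_sched_yield(struct thread *td, struct sched_yield_args *uap)
 *	{
 *		return syscall_not_present(td, "sched_yield",
 *		    (struct nosys_args *)uap);
 *	}
 *
 * so the stub matches the sys_-prefixed names the syscall tables expect.
 */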
MALLOC_DECLARE(M_P31B);
#define p31b_malloc(SIZE) malloc((SIZE), M_P31B, M_WAITOK)
#define p31b_free(P) free((P), M_P31B)
int p31b_proc(struct proc *, pid_t, struct proc **);
void p31b_setcfg(int, int);
int p31b_getcfg(int);
int p31b_iscfg(int);
void p31b_unsetcfg(int);
#ifdef _KPOSIX_PRIORITY_SCHEDULING
/*
* KSCHED_OP_RW is a vector of read/write flags for each entry indexed
* by the enum ksched_op.
*
* 1 means you need write access, 0 means read is sufficient.
*/
enum ksched_op {
#define KSCHED_OP_RW { 1, 0, 1, 0, 0, 0, 0, 0 }
SCHED_SETPARAM,
SCHED_GETPARAM,
SCHED_SETSCHEDULER,
SCHED_GETSCHEDULER,
SCHED_YIELD,
SCHED_GET_PRIORITY_MAX,
SCHED_GET_PRIORITY_MIN,
SCHED_RR_GET_INTERVAL,
SCHED_OP_MAX
};
struct ksched;
int ksched_attach(struct ksched **);
int ksched_detach(struct ksched *);
int ksched_setparam(struct ksched *,
struct thread *, const struct sched_param *);
int ksched_getparam(struct ksched *,
struct thread *, struct sched_param *);
int ksched_setscheduler(struct ksched *,
struct thread *, int, const struct sched_param *);
int ksched_getscheduler(struct ksched *, struct thread *, int *);
int ksched_yield(struct ksched *);
int ksched_get_priority_max(struct ksched *, int, int *);
int ksched_get_priority_min(struct ksched *, int, int *);
int ksched_rr_get_interval(struct ksched *,
struct thread *, struct timespec *);
#endif /* _KPOSIX_PRIORITY_SCHEDULING */
#endif /* _P1003_1B_P1003_1B_H_ */
Index: head/sys/sys/signalvar.h
===================================================================
--- head/sys/sys/signalvar.h (revision 225616)
+++ head/sys/sys/signalvar.h (revision 225617)
@@ -1,367 +1,367 @@
/*-
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)signalvar.h 8.6 (Berkeley) 2/19/95
* $FreeBSD$
*/
#ifndef _SYS_SIGNALVAR_H_
#define _SYS_SIGNALVAR_H_
#include <sys/queue.h>
#include <sys/_lock.h>
#include <sys/_mutex.h>
#include <sys/signal.h>
/*
* Kernel signal definitions and data structures.
*/
/*
* Logical process signal actions and state, needed only within the process.
* The mapping between sigacts and proc structures is 1:1 except for rfork()
* processes masquerading as threads which use one structure for the whole
* group. All members are locked by the included mutex. The reference count
* and mutex must be last for the bcopy in sigacts_copy() to work.
*/
struct sigacts {
sig_t ps_sigact[_SIG_MAXSIG]; /* Disposition of signals. */
sigset_t ps_catchmask[_SIG_MAXSIG]; /* Signals to be blocked. */
sigset_t ps_sigonstack; /* Signals to take on sigstack. */
sigset_t ps_sigintr; /* Signals that interrupt syscalls. */
sigset_t ps_sigreset; /* Signals that reset when caught. */
sigset_t ps_signodefer; /* Signals not masked while handled. */
sigset_t ps_siginfo; /* Signals that want SA_SIGINFO args. */
sigset_t ps_sigignore; /* Signals being ignored. */
sigset_t ps_sigcatch; /* Signals being caught by user. */
sigset_t ps_freebsd4; /* Signals using freebsd4 ucontext. */
sigset_t ps_osigset; /* Signals using <= 3.x osigset_t. */
sigset_t ps_usertramp; /* SunOS compat; libc sigtramp. XXX */
int ps_flag;
int ps_refcnt;
struct mtx ps_mtx;
};
#define PS_NOCLDWAIT 0x0001 /* No zombies if child dies */
#define PS_NOCLDSTOP 0x0002 /* No SIGCHLD when children stop. */
#define PS_CLDSIGIGN 0x0004 /* The SIGCHLD handler is SIG_IGN. */
#ifdef _KERNEL
#ifdef COMPAT_43
typedef struct {
struct osigcontext si_sc;
int si_signo;
int si_code;
union sigval si_value;
} osiginfo_t;
struct osigaction {
union {
void (*__sa_handler)(int);
void (*__sa_sigaction)(int, osiginfo_t *, void *);
} __sigaction_u; /* signal handler */
osigset_t sa_mask; /* signal mask to apply */
int sa_flags; /* see signal options below */
};
typedef void __osiginfohandler_t(int, osiginfo_t *, void *);
#endif /* COMPAT_43 */
/* additional signal action values, used only temporarily/internally */
#define SIG_CATCH ((__sighandler_t *)2)
/* #define SIG_HOLD ((__sighandler_t *)3) See signal.h */
/*
* get signal action for process and signal; currently only for current process
*/
#define SIGACTION(p, sig) (p->p_sigacts->ps_sigact[_SIG_IDX(sig)])
#endif /* _KERNEL */
/*
* sigset_t manipulation macros.
*/
#define SIGADDSET(set, signo) \
((set).__bits[_SIG_WORD(signo)] |= _SIG_BIT(signo))
#define SIGDELSET(set, signo) \
((set).__bits[_SIG_WORD(signo)] &= ~_SIG_BIT(signo))
#define SIGEMPTYSET(set) \
do { \
int __i; \
for (__i = 0; __i < _SIG_WORDS; __i++) \
(set).__bits[__i] = 0; \
} while (0)
#define SIGFILLSET(set) \
do { \
int __i; \
for (__i = 0; __i < _SIG_WORDS; __i++) \
(set).__bits[__i] = ~0U; \
} while (0)
#define SIGISMEMBER(set, signo) \
((set).__bits[_SIG_WORD(signo)] & _SIG_BIT(signo))
#define SIGISEMPTY(set) (__sigisempty(&(set)))
#define SIGNOTEMPTY(set) (!__sigisempty(&(set)))
#define SIGSETEQ(set1, set2) (__sigseteq(&(set1), &(set2)))
#define SIGSETNEQ(set1, set2) (!__sigseteq(&(set1), &(set2)))
#define SIGSETOR(set1, set2) \
do { \
int __i; \
for (__i = 0; __i < _SIG_WORDS; __i++) \
(set1).__bits[__i] |= (set2).__bits[__i]; \
} while (0)
#define SIGSETAND(set1, set2) \
do { \
int __i; \
for (__i = 0; __i < _SIG_WORDS; __i++) \
(set1).__bits[__i] &= (set2).__bits[__i]; \
} while (0)
#define SIGSETNAND(set1, set2) \
do { \
int __i; \
for (__i = 0; __i < _SIG_WORDS; __i++) \
(set1).__bits[__i] &= ~(set2).__bits[__i]; \
} while (0)
#define SIGSETLO(set1, set2) ((set1).__bits[0] = (set2).__bits[0])
#define SIGSETOLD(set, oset) ((set).__bits[0] = (oset))
#define SIG_CANTMASK(set) \
SIGDELSET(set, SIGKILL), SIGDELSET(set, SIGSTOP)
#define SIG_STOPSIGMASK(set) \
SIGDELSET(set, SIGSTOP), SIGDELSET(set, SIGTSTP), \
SIGDELSET(set, SIGTTIN), SIGDELSET(set, SIGTTOU)
#define SIG_CONTSIGMASK(set) \
SIGDELSET(set, SIGCONT)
#define sigcantmask (sigmask(SIGKILL) | sigmask(SIGSTOP))
#define SIG2OSIG(sig, osig) (osig = (sig).__bits[0])
#define OSIG2SIG(osig, sig) SIGEMPTYSET(sig); (sig).__bits[0] = osig
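/*
 * A minimal usage sketch of the set macros above (illustrative only):
 *
 *	sigset_t set;
 *
 *	SIGEMPTYSET(set);
 *	SIGADDSET(set, SIGUSR1);
 *	if (SIGISMEMBER(set, SIGUSR1))
 *		...;
 *	SIG_CANTMASK(set);
 *
 * SIG_CANTMASK() always strips SIGKILL and SIGSTOP, mirroring the rule that
 * those two signals can never be blocked.
 */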
static __inline int
__sigisempty(sigset_t *set)
{
int i;
for (i = 0; i < _SIG_WORDS; i++) {
if (set->__bits[i])
return (0);
}
return (1);
}
static __inline int
__sigseteq(sigset_t *set1, sigset_t *set2)
{
int i;
for (i = 0; i < _SIG_WORDS; i++) {
if (set1->__bits[i] != set2->__bits[i])
return (0);
}
return (1);
}
struct osigevent {
int sigev_notify; /* Notification type */
union {
int __sigev_signo; /* Signal number */
int __sigev_notify_kqueue;
} __sigev_u;
union sigval sigev_value; /* Signal value */
};
typedef struct ksiginfo {
TAILQ_ENTRY(ksiginfo) ksi_link;
siginfo_t ksi_info;
int ksi_flags;
struct sigqueue *ksi_sigq;
} ksiginfo_t;
#define ksi_signo ksi_info.si_signo
#define ksi_errno ksi_info.si_errno
#define ksi_code ksi_info.si_code
#define ksi_pid ksi_info.si_pid
#define ksi_uid ksi_info.si_uid
#define ksi_status ksi_info.si_status
#define ksi_addr ksi_info.si_addr
#define ksi_value ksi_info.si_value
#define ksi_band ksi_info.si_band
#define ksi_trapno ksi_info.si_trapno
#define ksi_overrun ksi_info.si_overrun
#define ksi_timerid ksi_info.si_timerid
#define ksi_mqd ksi_info.si_mqd
/* bits for ksi_flags */
#define KSI_TRAP 0x01 /* Generated by trap. */
#define KSI_EXT 0x02 /* Externally managed ksi. */
#define KSI_INS 0x04 /* Directly insert ksi, not the copy */
#define KSI_SIGQ 0x08 /* Generated by sigqueue, might return EAGAIN. */
#define KSI_HEAD 0x10 /* Insert into head, not tail. */
#define KSI_COPYMASK (KSI_TRAP|KSI_SIGQ)
#define KSI_ONQ(ksi) ((ksi)->ksi_sigq != NULL)
typedef struct sigqueue {
sigset_t sq_signals; /* All pending signals. */
sigset_t sq_kill; /* Legacy depth 1 queue. */
TAILQ_HEAD(, ksiginfo) sq_list;/* Queued signal info. */
struct proc *sq_proc;
int sq_flags;
} sigqueue_t;
/* Flags for sq_flags */
#define SQ_INIT 0x01
#ifdef _KERNEL
/* Return nonzero if thread td has an unmasked pending signal. */
#define SIGPENDING(td) \
((!SIGISEMPTY((td)->td_siglist) && \
!sigsetmasked(&(td)->td_siglist, &(td)->td_sigmask)) || \
(!SIGISEMPTY((td)->td_proc->p_siglist) && \
!sigsetmasked(&(td)->td_proc->p_siglist, &(td)->td_sigmask)))
/*
* Return the value of the pseudo-expression ((*set & ~*mask) == 0). This
* is an optimized version of SIGISEMPTY() on a temporary variable
* containing SIGSETNAND(*set, *mask).
*/
static __inline int
sigsetmasked(sigset_t *set, sigset_t *mask)
{
int i;
for (i = 0; i < _SIG_WORDS; i++) {
if (set->__bits[i] & ~mask->__bits[i])
return (0);
}
return (1);
}
#define ksiginfo_init(ksi) \
do { \
bzero(ksi, sizeof(ksiginfo_t)); \
} while(0)
#define ksiginfo_init_trap(ksi) \
do { \
ksiginfo_t *kp = ksi; \
bzero(kp, sizeof(ksiginfo_t)); \
kp->ksi_flags |= KSI_TRAP; \
} while(0)
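/*
 * Typical use of ksiginfo_init_trap() (sketch with hypothetical values, not
 * part of this header): a machine-dependent trap handler fills a stack
 * ksiginfo and hands it to trapsignal(), e.g.
 *
 *	ksiginfo_t ksi;
 *
 *	ksiginfo_init_trap(&ksi);
 *	ksi.ksi_signo = SIGSEGV;
 *	ksi.ksi_code = SEGV_MAPERR;
 *	ksi.ksi_addr = (void *)va;
 *	trapsignal(td, &ksi);
 *
 * The KSI_TRAP flag set by the macro marks the info as trap-generated for
 * the signal delivery code.
 */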
static __inline void
ksiginfo_copy(ksiginfo_t *src, ksiginfo_t *dst)
{
(dst)->ksi_info = src->ksi_info;
(dst)->ksi_flags = (src->ksi_flags & KSI_COPYMASK);
}
static __inline void
ksiginfo_set_sigev(ksiginfo_t *dst, struct sigevent *sigev)
{
dst->ksi_signo = sigev->sigev_signo;
dst->ksi_value = sigev->sigev_value;
}
struct pgrp;
struct proc;
struct sigio;
struct thread;
/*
* Lock the pointers for a sigio object in the underlying objects of
* a file descriptor.
*/
#define SIGIO_LOCK() mtx_lock(&sigio_lock)
#define SIGIO_TRYLOCK() mtx_trylock(&sigio_lock)
#define SIGIO_UNLOCK() mtx_unlock(&sigio_lock)
#define SIGIO_LOCKED() mtx_owned(&sigio_lock)
#define SIGIO_ASSERT(type) mtx_assert(&sigio_lock, type)
extern struct mtx sigio_lock;
/* Values for stop_allowed parameter for cursig(). */
#define SIG_STOP_ALLOWED 100
#define SIG_STOP_NOT_ALLOWED 101
/* Flags for kern_sigprocmask(). */
#define SIGPROCMASK_OLD 0x0001
#define SIGPROCMASK_PROC_LOCKED 0x0002
#define SIGPROCMASK_PS_LOCKED 0x0004
int cursig(struct thread *td, int stop_allowed);
void execsigs(struct proc *p);
void gsignal(int pgid, int sig, ksiginfo_t *ksi);
void killproc(struct proc *p, char *why);
ksiginfo_t * ksiginfo_alloc(int wait);
void ksiginfo_free(ksiginfo_t *ksi);
int pksignal(struct proc *p, int sig, ksiginfo_t *ksi);
void pgsigio(struct sigio **sigiop, int sig, int checkctty);
void pgsignal(struct pgrp *pgrp, int sig, int checkctty, ksiginfo_t *ksi);
int postsig(int sig);
-void psignal(struct proc *p, int sig);
+void kern_psignal(struct proc *p, int sig);
int ptracestop(struct thread *td, int sig);
void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *retmask);
struct sigacts *sigacts_alloc(void);
void sigacts_copy(struct sigacts *dest, struct sigacts *src);
void sigacts_free(struct sigacts *ps);
struct sigacts *sigacts_hold(struct sigacts *ps);
int sigacts_shared(struct sigacts *ps);
void sigexit(struct thread *td, int sig) __dead2;
int sigev_findtd(struct proc *p, struct sigevent *sigev, struct thread **);
int sig_ffs(sigset_t *set);
void siginit(struct proc *p);
void signotify(struct thread *td);
void sigqueue_delete(struct sigqueue *queue, int sig);
void sigqueue_delete_proc(struct proc *p, int sig);
void sigqueue_flush(struct sigqueue *queue);
void sigqueue_init(struct sigqueue *queue, struct proc *p);
void sigqueue_take(ksiginfo_t *ksi);
void tdksignal(struct thread *td, int sig, ksiginfo_t *ksi);
int tdsendsignal(struct proc *p, struct thread *td, int sig,
ksiginfo_t *ksi);
void tdsigcleanup(struct thread *td);
void tdsignal(struct thread *td, int sig);
void trapsignal(struct thread *td, ksiginfo_t *ksi);
#endif /* _KERNEL */
#endif /* !_SYS_SIGNALVAR_H_ */
Index: head/sys/sys/sysent.h
===================================================================
--- head/sys/sys/sysent.h (revision 225616)
+++ head/sys/sys/sysent.h (revision 225617)
@@ -1,247 +1,264 @@
/*-
* Copyright (c) 1982, 1988, 1991 The Regents of the University of California.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _SYS_SYSENT_H_
#define _SYS_SYSENT_H_
#include <bsm/audit.h>
struct rlimit;
struct sysent;
struct thread;
struct ksiginfo;
typedef int sy_call_t(struct thread *, void *);
/* Used by the machine dependent syscall() code. */
typedef void (*systrace_probe_func_t)(u_int32_t, int, struct sysent *, void *,
int);
/*
* Used by loaded syscalls to convert arguments to a DTrace array
* of 64-bit arguments.
*/
typedef void (*systrace_args_func_t)(int, void *, u_int64_t *, int *);
extern systrace_probe_func_t systrace_probe_func;
struct sysent { /* system call table */
int sy_narg; /* number of arguments */
sy_call_t *sy_call; /* implementing function */
au_event_t sy_auevent; /* audit event associated with syscall */
systrace_args_func_t sy_systrace_args_func;
/* optional argument conversion function. */
u_int32_t sy_entry; /* DTrace entry ID for systrace. */
u_int32_t sy_return; /* DTrace return ID for systrace. */
u_int32_t sy_flags; /* General flags for system calls. */
u_int32_t sy_thrcnt;
};
/*
* A system call is permitted in capability mode.
*/
#define SYF_CAPENABLED 0x00000001
#define SY_THR_FLAGMASK 0x7
#define SY_THR_STATIC 0x1
#define SY_THR_DRAINING 0x2
#define SY_THR_ABSENT 0x4
#define SY_THR_INCR 0x8
struct image_params;
struct __sigset;
struct syscall_args;
struct trapframe;
struct vnode;
struct sysentvec {
int sv_size; /* number of entries */
struct sysent *sv_table; /* pointer to sysent */
u_int sv_mask; /* optional mask to index */
int sv_sigsize; /* size of signal translation table */
int *sv_sigtbl; /* signal translation table */
int sv_errsize; /* size of errno translation table */
int *sv_errtbl; /* errno translation table */
int (*sv_transtrap)(int, int);
/* translate trap-to-signal mapping */
int (*sv_fixup)(register_t **, struct image_params *);
/* stack fixup function */
void (*sv_sendsig)(void (*)(int), struct ksiginfo *, struct __sigset *);
/* send signal */
char *sv_sigcode; /* start of sigtramp code */
int *sv_szsigcode; /* size of sigtramp code */
void (*sv_prepsyscall)(struct trapframe *, int *, u_int *,
caddr_t *);
char *sv_name; /* name of binary type */
int (*sv_coredump)(struct thread *, struct vnode *, off_t, int);
/* function to dump core, or NULL */
int (*sv_imgact_try)(struct image_params *);
int sv_minsigstksz; /* minimum signal stack size */
int sv_pagesize; /* pagesize */
vm_offset_t sv_minuser; /* VM_MIN_ADDRESS */
vm_offset_t sv_maxuser; /* VM_MAXUSER_ADDRESS */
vm_offset_t sv_usrstack; /* USRSTACK */
vm_offset_t sv_psstrings; /* PS_STRINGS */
int sv_stackprot; /* vm protection for stack */
register_t *(*sv_copyout_strings)(struct image_params *);
void (*sv_setregs)(struct thread *, struct image_params *,
u_long);
void (*sv_fixlimit)(struct rlimit *, int);
u_long *sv_maxssiz;
u_int sv_flags;
void (*sv_set_syscall_retval)(struct thread *, int);
int (*sv_fetch_syscall_args)(struct thread *, struct
syscall_args *);
const char **sv_syscallnames;
vm_offset_t sv_shared_page_base;
vm_offset_t sv_shared_page_len;
vm_offset_t sv_sigcode_base;
void *sv_shared_page_obj;
void (*sv_schedtail)(struct thread *);
};
#define SV_ILP32 0x000100
#define SV_LP64 0x000200
#define SV_IA32 0x004000
#define SV_AOUT 0x008000
#define SV_SHP 0x010000
#define SV_ABI_MASK 0xff
#define SV_PROC_FLAG(p, x) ((p)->p_sysent->sv_flags & (x))
#define SV_PROC_ABI(p) ((p)->p_sysent->sv_flags & SV_ABI_MASK)
#define SV_CURPROC_FLAG(x) SV_PROC_FLAG(curproc, x)
#define SV_CURPROC_ABI() SV_PROC_ABI(curproc)
/* same as ELFOSABI_XXX, to prevent header pollution */
#define SV_ABI_LINUX 3
#define SV_ABI_FREEBSD 9
#define SV_ABI_UNDEF 255
#ifdef _KERNEL
extern struct sysentvec aout_sysvec;
extern struct sysentvec elf_freebsd_sysvec;
extern struct sysentvec null_sysvec;
extern struct sysent sysent[];
extern const char *syscallnames[];
#define NO_SYSCALL (-1)
struct module;
struct syscall_module_data {
int (*chainevh)(struct module *, int, void *); /* next handler */
void *chainarg; /* arg for next event handler */
int *offset; /* offset into sysent */
struct sysent *new_sysent; /* new sysent */
struct sysent old_sysent; /* old sysent */
};
#define MAKE_SYSENT(syscallname) \
static struct sysent syscallname##_sysent = { \
(sizeof(struct syscallname ## _args ) \
/ sizeof(register_t)), \
+ (sy_call_t *)& sys_##syscallname, \
+ SYS_AUE_##syscallname \
+}
+
+#define MAKE_SYSENT_COMPAT(syscallname) \
+static struct sysent syscallname##_sysent = { \
+ (sizeof(struct syscallname ## _args ) \
+ / sizeof(register_t)), \
(sy_call_t *)& syscallname, \
SYS_AUE_##syscallname \
}
#define SYSCALL_MODULE(name, offset, new_sysent, evh, arg) \
static struct syscall_module_data name##_syscall_mod = { \
evh, arg, offset, new_sysent, { 0, NULL, AUE_NULL } \
}; \
\
static moduledata_t name##_mod = { \
"sys/" #name, \
syscall_module_handler, \
&name##_syscall_mod \
}; \
DECLARE_MODULE(name, name##_mod, SI_SUB_SYSCALLS, SI_ORDER_MIDDLE)
#define SYSCALL_MODULE_HELPER(syscallname) \
static int syscallname##_syscall = SYS_##syscallname; \
MAKE_SYSENT(syscallname); \
SYSCALL_MODULE(syscallname, \
& syscallname##_syscall, & syscallname##_sysent, \
NULL, NULL)
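/*
 * Illustrative use (hypothetical syscall name, not part of this header):
 * a module whose handler follows the new sys_ naming registers itself with
 *
 *	SYSCALL_MODULE_HELPER(mycall);
 *
 * which expands via MAKE_SYSENT() above and therefore resolves the handler
 * as sys_mycall. Code whose handlers keep their historical unprefixed names
 * builds the sysent with MAKE_SYSENT_COMPAT() instead.
 */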
#define SYSCALL_MODULE_PRESENT(syscallname) \
(sysent[SYS_##syscallname].sy_call != (sy_call_t *)lkmnosys && \
sysent[SYS_##syscallname].sy_call != (sy_call_t *)lkmressys)
/*
* Syscall registration helpers with resource allocation handling.
*/
struct syscall_helper_data {
struct sysent new_sysent;
struct sysent old_sysent;
int syscall_no;
int registered;
};
#define SYSCALL_INIT_HELPER(syscallname) { \
+ .new_sysent = { \
+ .sy_narg = (sizeof(struct syscallname ## _args ) \
+ / sizeof(register_t)), \
+ .sy_call = (sy_call_t *)& sys_ ## syscallname, \
+ .sy_auevent = SYS_AUE_##syscallname \
+ }, \
+ .syscall_no = SYS_##syscallname \
+}
+#define SYSCALL_INIT_HELPER_COMPAT(syscallname) { \
.new_sysent = { \
.sy_narg = (sizeof(struct syscallname ## _args ) \
/ sizeof(register_t)), \
.sy_call = (sy_call_t *)& syscallname, \
.sy_auevent = SYS_AUE_##syscallname \
}, \
.syscall_no = SYS_##syscallname \
}
#define SYSCALL_INIT_LAST { \
.syscall_no = NO_SYSCALL \
}
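/*
 * A rough sketch of how the initializers above are combined (hypothetical
 * syscalls, for illustration only):
 *
 *	static struct syscall_helper_data mymod_syscalls[] = {
 *		SYSCALL_INIT_HELPER(mynewcall),
 *		SYSCALL_INIT_HELPER_COMPAT(myoldcall),
 *		SYSCALL_INIT_LAST
 *	};
 *
 * The array is handed to syscall_helper_register() at module load time and
 * to syscall_helper_unregister() on unload; the first entry expects a
 * handler named sys_mynewcall, the second an unprefixed myoldcall.
 */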
int syscall_register(int *offset, struct sysent *new_sysent,
struct sysent *old_sysent);
int syscall_deregister(int *offset, struct sysent *old_sysent);
int syscall_module_handler(struct module *mod, int what, void *arg);
int syscall_helper_register(struct syscall_helper_data *sd);
int syscall_helper_unregister(struct syscall_helper_data *sd);
struct proc;
const char *syscallname(struct proc *p, u_int code);
/* Special purpose system call functions. */
struct nosys_args;
int lkmnosys(struct thread *, struct nosys_args *);
int lkmressys(struct thread *, struct nosys_args *);
int syscall_thread_enter(struct thread *td, struct sysent *se);
void syscall_thread_exit(struct thread *td, struct sysent *se);
int shared_page_fill(int size, int align, const char *data);
void exec_sysvec_init(void *param);
#define INIT_SYSENTVEC(name, sv) \
SYSINIT(name, SI_SUB_EXEC, SI_ORDER_ANY, \
(sysinit_cfunc_t)exec_sysvec_init, sv);
#endif /* _KERNEL */
#endif /* !_SYS_SYSENT_H_ */
Index: head/sys/vm/swap_pager.c
===================================================================
--- head/sys/vm/swap_pager.c (revision 225616)
+++ head/sys/vm/swap_pager.c (revision 225617)
@@ -1,2694 +1,2694 @@
/*-
* Copyright (c) 1998 Matthew Dillon,
* Copyright (c) 1994 John S. Dyson
* Copyright (c) 1990 University of Utah.
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* New Swap System
* Matthew Dillon
*
* Radix Bitmap 'blists'.
*
* - The new swapper uses the new radix bitmap code. This should scale
* to arbitrarily small or arbitrarily large swap spaces and an almost
* arbitrary degree of fragmentation.
*
* Features:
*
* - on the fly reallocation of swap during putpages. The new system
* does not try to keep previously allocated swap blocks for dirty
* pages.
*
* - on the fly deallocation of swap
*
* - No more garbage collection required. Unnecessarily allocated swap
* blocks only exist for dirty vm_page_t's now and these are already
* cycled (in a high-load system) by the pager. We also do on-the-fly
* removal of invalidated swap blocks when a page is destroyed
* or renamed.
*
* from: Utah $Hdr: swap_pager.c 1.4 91/04/30$
*
* @(#)swap_pager.c 8.9 (Berkeley) 3/21/94
* @(#)vm_swap.c 8.5 (Berkeley) 2/17/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_swap.h"
#include "opt_vm.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/kernel.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/disk.h>
#include <sys/fcntl.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/racct.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/blist.h>
#include <sys/lock.h>
#include <sys/sx.h>
#include <sys/vmmeter.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_param.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
#include <geom/geom.h>
/*
* SWB_NPAGES must be a power of 2. It may be set to 1, 2, 4, 8, 16
* or 32 pages per allocation.
* The 32-page limit is due to the radix code (kern/subr_blist.c).
*/
#ifndef MAX_PAGEOUT_CLUSTER
#define MAX_PAGEOUT_CLUSTER 16
#endif
#if !defined(SWB_NPAGES)
#define SWB_NPAGES MAX_PAGEOUT_CLUSTER
#endif
/*
* The swblock structure maps an object and a small, fixed-size range
* of page indices to disk addresses within a swap area.
* The collection of these mappings is implemented as a hash table.
* Unused disk addresses within a swap area are allocated and managed
* using a blist.
*/
#define SWCORRECT(n) (sizeof(void *) * (n) / sizeof(daddr_t))
#define SWAP_META_PAGES (SWB_NPAGES * 2)
#define SWAP_META_MASK (SWAP_META_PAGES - 1)
struct swblock {
struct swblock *swb_hnext;
vm_object_t swb_object;
vm_pindex_t swb_index;
int swb_count;
daddr_t swb_pages[SWAP_META_PAGES];
};
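/*
 * As a concrete reading of the mapping above (illustrative numbers): with
 * SWB_NPAGES at its default of 16, SWAP_META_PAGES is 32, so page index 70
 * of an object is covered by the swblock whose swb_index is 70 & ~31 == 64,
 * and its disk address is stored in swb_pages[70 & 31], i.e. swb_pages[6].
 */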
static MALLOC_DEFINE(M_VMPGDATA, "vm_pgdata", "swap pager private data");
static struct mtx sw_dev_mtx;
static TAILQ_HEAD(, swdevt) swtailq = TAILQ_HEAD_INITIALIZER(swtailq);
static struct swdevt *swdevhd; /* Allocate from here next */
static int nswapdev; /* Number of swap devices */
int swap_pager_avail;
static int swdev_syscall_active = 0; /* serialize swap(on|off) */
static vm_ooffset_t swap_total;
SYSCTL_QUAD(_vm, OID_AUTO, swap_total, CTLFLAG_RD, &swap_total, 0,
"Total amount of available swap storage.");
static vm_ooffset_t swap_reserved;
SYSCTL_QUAD(_vm, OID_AUTO, swap_reserved, CTLFLAG_RD, &swap_reserved, 0,
"Amount of swap storage needed to back all allocated anonymous memory.");
static int overcommit = 0;
SYSCTL_INT(_vm, OID_AUTO, overcommit, CTLFLAG_RW, &overcommit, 0,
"Configure virtual memory overcommit behavior. See tuning(7) "
"for details.");
/* bits from overcommit */
#define SWAP_RESERVE_FORCE_ON (1 << 0)
#define SWAP_RESERVE_RLIMIT_ON (1 << 1)
#define SWAP_RESERVE_ALLOW_NONWIRED (1 << 2)
int
swap_reserve(vm_ooffset_t incr)
{
return (swap_reserve_by_cred(incr, curthread->td_ucred));
}
int
swap_reserve_by_cred(vm_ooffset_t incr, struct ucred *cred)
{
vm_ooffset_t r, s;
int res, error;
static int curfail;
static struct timeval lastfail;
struct uidinfo *uip;
uip = cred->cr_ruidinfo;
if (incr & PAGE_MASK)
panic("swap_reserve: & PAGE_MASK");
#ifdef RACCT
PROC_LOCK(curproc);
error = racct_add(curproc, RACCT_SWAP, incr);
PROC_UNLOCK(curproc);
if (error != 0)
return (0);
#endif
res = 0;
mtx_lock(&sw_dev_mtx);
r = swap_reserved + incr;
if (overcommit & SWAP_RESERVE_ALLOW_NONWIRED) {
s = cnt.v_page_count - cnt.v_free_reserved - cnt.v_wire_count;
s *= PAGE_SIZE;
} else
s = 0;
s += swap_total;
if ((overcommit & SWAP_RESERVE_FORCE_ON) == 0 || r <= s ||
(error = priv_check(curthread, PRIV_VM_SWAP_NOQUOTA)) == 0) {
res = 1;
swap_reserved = r;
}
mtx_unlock(&sw_dev_mtx);
if (res) {
PROC_LOCK(curproc);
UIDINFO_VMSIZE_LOCK(uip);
if ((overcommit & SWAP_RESERVE_RLIMIT_ON) != 0 &&
uip->ui_vmsize + incr > lim_cur(curproc, RLIMIT_SWAP) &&
priv_check(curthread, PRIV_VM_SWAP_NORLIMIT))
res = 0;
else
uip->ui_vmsize += incr;
UIDINFO_VMSIZE_UNLOCK(uip);
PROC_UNLOCK(curproc);
if (!res) {
mtx_lock(&sw_dev_mtx);
swap_reserved -= incr;
mtx_unlock(&sw_dev_mtx);
}
}
if (!res && ppsratecheck(&lastfail, &curfail, 1)) {
printf("uid %d, pid %d: swap reservation for %jd bytes failed\n",
curproc->p_pid, uip->ui_uid, incr);
}
#ifdef RACCT
if (!res) {
PROC_LOCK(curproc);
racct_sub(curproc, RACCT_SWAP, incr);
PROC_UNLOCK(curproc);
}
#endif
return (res);
}
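/*
 * For illustration (sketch, not part of this revision): the vm.overcommit
 * bits declared above combine, e.g. setting vm.overcommit to
 * SWAP_RESERVE_FORCE_ON | SWAP_RESERVE_RLIMIT_ON (== 3) makes the check
 * above refuse unprivileged reservations that would exceed swap_total and
 * additionally enforces the per-uid RLIMIT_SWAP limit, while
 * SWAP_RESERVE_ALLOW_NONWIRED (== 4) also counts non-wired physical memory
 * toward the permitted total.
 */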
void
swap_reserve_force(vm_ooffset_t incr)
{
struct uidinfo *uip;
mtx_lock(&sw_dev_mtx);
swap_reserved += incr;
mtx_unlock(&sw_dev_mtx);
#ifdef RACCT
PROC_LOCK(curproc);
racct_add_force(curproc, RACCT_SWAP, incr);
PROC_UNLOCK(curproc);
#endif
uip = curthread->td_ucred->cr_ruidinfo;
PROC_LOCK(curproc);
UIDINFO_VMSIZE_LOCK(uip);
uip->ui_vmsize += incr;
UIDINFO_VMSIZE_UNLOCK(uip);
PROC_UNLOCK(curproc);
}
void
swap_release(vm_ooffset_t decr)
{
struct ucred *cred;
PROC_LOCK(curproc);
cred = curthread->td_ucred;
swap_release_by_cred(decr, cred);
PROC_UNLOCK(curproc);
}
void
swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred)
{
struct uidinfo *uip;
uip = cred->cr_ruidinfo;
if (decr & PAGE_MASK)
panic("swap_release: & PAGE_MASK");
mtx_lock(&sw_dev_mtx);
if (swap_reserved < decr)
panic("swap_reserved < decr");
swap_reserved -= decr;
mtx_unlock(&sw_dev_mtx);
UIDINFO_VMSIZE_LOCK(uip);
if (uip->ui_vmsize < decr)
printf("negative vmsize for uid = %d\n", uip->ui_uid);
uip->ui_vmsize -= decr;
UIDINFO_VMSIZE_UNLOCK(uip);
racct_sub_cred(cred, RACCT_SWAP, decr);
}
static void swapdev_strategy(struct buf *, struct swdevt *sw);
#define SWM_FREE 0x02 /* free, period */
#define SWM_POP 0x04 /* pop out */
int swap_pager_full = 2; /* swap space exhaustion (task killing) */
static int swap_pager_almost_full = 1; /* swap space exhaustion (w/hysteresis)*/
static int nsw_rcount; /* free read buffers */
static int nsw_wcount_sync; /* limit write buffers / synchronous */
static int nsw_wcount_async; /* limit write buffers / asynchronous */
static int nsw_wcount_async_max;/* assigned maximum */
static int nsw_cluster_max; /* maximum VOP I/O allowed */
static struct swblock **swhash;
static int swhash_mask;
static struct mtx swhash_mtx;
static int swap_async_max = 4; /* maximum in-progress async I/O's */
static struct sx sw_alloc_sx;
SYSCTL_INT(_vm, OID_AUTO, swap_async_max,
CTLFLAG_RW, &swap_async_max, 0, "Maximum running async swap ops");
/*
* "named" and "unnamed" anon region objects. Try to reduce the overhead
* of searching a named list by hashing it just a little.
*/
#define NOBJLISTS 8
#define NOBJLIST(handle) \
(&swap_pager_object_list[((int)(intptr_t)handle >> 4) & (NOBJLISTS-1)])
static struct mtx sw_alloc_mtx; /* protect list manipulation */
static struct pagerlst swap_pager_object_list[NOBJLISTS];
static uma_zone_t swap_zone;
static struct vm_object swap_zone_obj;
/*
* pagerops for OBJT_SWAP - "swap pager". Some ops are also global procedure
* calls hooked from other parts of the VM system and do not appear here.
* (see vm/swap_pager.h).
*/
static vm_object_t
swap_pager_alloc(void *handle, vm_ooffset_t size,
vm_prot_t prot, vm_ooffset_t offset, struct ucred *);
static void swap_pager_dealloc(vm_object_t object);
static int swap_pager_getpages(vm_object_t, vm_page_t *, int, int);
static void swap_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *);
static boolean_t
swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after);
static void swap_pager_init(void);
static void swap_pager_unswapped(vm_page_t);
static void swap_pager_swapoff(struct swdevt *sp);
struct pagerops swappagerops = {
.pgo_init = swap_pager_init, /* early system initialization of pager */
.pgo_alloc = swap_pager_alloc, /* allocate an OBJT_SWAP object */
.pgo_dealloc = swap_pager_dealloc, /* deallocate an OBJT_SWAP object */
.pgo_getpages = swap_pager_getpages, /* pagein */
.pgo_putpages = swap_pager_putpages, /* pageout */
.pgo_haspage = swap_pager_haspage, /* get backing store status for page */
.pgo_pageunswapped = swap_pager_unswapped, /* remove swap related to page */
};
/*
* dmmax is in page-sized chunks with the new swap system. It was
* dev-bsized chunks in the old. dmmax is always a power of 2.
*
* swap_*() routines are externally accessible. swp_*() routines are
* internal.
*/
static int dmmax;
static int nswap_lowat = 128; /* in pages, swap_pager_almost_full warn */
static int nswap_hiwat = 512; /* in pages, swap_pager_almost_full warn */
SYSCTL_INT(_vm, OID_AUTO, dmmax,
CTLFLAG_RD, &dmmax, 0, "Maximum size of a swap block");
static void swp_sizecheck(void);
static void swp_pager_async_iodone(struct buf *bp);
static int swapongeom(struct thread *, struct vnode *);
static int swaponvp(struct thread *, struct vnode *, u_long);
static int swapoff_one(struct swdevt *sp, struct ucred *cred);
/*
* Swap bitmap functions
*/
static void swp_pager_freeswapspace(daddr_t blk, int npages);
static daddr_t swp_pager_getswapspace(int npages);
/*
* Metadata functions
*/
static struct swblock **swp_pager_hash(vm_object_t object, vm_pindex_t index);
static void swp_pager_meta_build(vm_object_t, vm_pindex_t, daddr_t);
static void swp_pager_meta_free(vm_object_t, vm_pindex_t, daddr_t);
static void swp_pager_meta_free_all(vm_object_t);
static daddr_t swp_pager_meta_ctl(vm_object_t, vm_pindex_t, int);
static void
swp_pager_free_nrpage(vm_page_t m)
{
vm_page_lock(m);
if (m->wire_count == 0)
vm_page_free(m);
vm_page_unlock(m);
}
/*
* SWP_SIZECHECK() - update swap_pager_full indication
*
* update the swap_pager_almost_full indication and warn when we are
* about to run out of swap space, using lowat/hiwat hysteresis.
*
* Clear the swap_pager_full (task killing) indication when lowat is met.
*
* No restrictions on call.
* This routine may not block.
*/
static void
swp_sizecheck(void)
{
if (swap_pager_avail < nswap_lowat) {
if (swap_pager_almost_full == 0) {
printf("swap_pager: out of swap space\n");
swap_pager_almost_full = 1;
}
} else {
swap_pager_full = 0;
if (swap_pager_avail > nswap_hiwat)
swap_pager_almost_full = 0;
}
}
/*
* SWP_PAGER_HASH() - hash swap meta data
*
* This is a helper function which hashes the swapblk given
* the object and page index. It returns a pointer to a pointer
* to the swblock, or a pointer to a NULL pointer if it could not
* find a swapblk.
*/
static struct swblock **
swp_pager_hash(vm_object_t object, vm_pindex_t index)
{
struct swblock **pswap;
struct swblock *swap;
index &= ~(vm_pindex_t)SWAP_META_MASK;
pswap = &swhash[(index ^ (int)(intptr_t)object) & swhash_mask];
while ((swap = *pswap) != NULL) {
if (swap->swb_object == object &&
swap->swb_index == index
) {
break;
}
pswap = &swap->swb_hnext;
}
return (pswap);
}
/*
* SWAP_PAGER_INIT() - initialize the swap pager!
*
* Expected to be started from system init. NOTE: This code is run
* before much else so be careful what you depend on. Most of the VM
* system has yet to be initialized at this point.
*/
static void
swap_pager_init(void)
{
/*
* Initialize object lists
*/
int i;
for (i = 0; i < NOBJLISTS; ++i)
TAILQ_INIT(&swap_pager_object_list[i]);
mtx_init(&sw_alloc_mtx, "swap_pager list", NULL, MTX_DEF);
mtx_init(&sw_dev_mtx, "swapdev", NULL, MTX_DEF);
/*
* Device Stripe, in PAGE_SIZE'd blocks
*/
dmmax = SWB_NPAGES * 2;
}
/*
* SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process
*
* Expected to be started from pageout process once, prior to entering
* its main loop.
*/
void
swap_pager_swap_init(void)
{
int n, n2;
/*
* Number of in-transit swap bp operations. Don't
* exhaust the pbufs completely. Make sure we
* initialize workable values (0 will work for hysteresis
* but it isn't very efficient).
*
* The nsw_cluster_max is constrained by the bp->b_pages[]
* array (MAXPHYS/PAGE_SIZE) and our locally defined
* MAX_PAGEOUT_CLUSTER. Also be aware that swap ops are
* constrained by the swap device interleave stripe size.
*
* Currently we hardwire nsw_wcount_async to 4. This limit is
* designed to prevent other I/O from having high latencies due to
* our pageout I/O. The value 4 works well for one or two active swap
* devices but is probably a little low if you have more. Even so,
* a higher value would probably generate only a limited improvement
* with three or four active swap devices since the system does not
* typically have to pageout at extreme bandwidths. We will want
* at least 2 per swap device, and 4 is a pretty good value if you
* have one NFS swap device due to the command/ack latency over NFS.
* So it all works out pretty well.
*/
nsw_cluster_max = min((MAXPHYS/PAGE_SIZE), MAX_PAGEOUT_CLUSTER);
mtx_lock(&pbuf_mtx);
nsw_rcount = (nswbuf + 1) / 2;
nsw_wcount_sync = (nswbuf + 3) / 4;
nsw_wcount_async = 4;
nsw_wcount_async_max = nsw_wcount_async;
mtx_unlock(&pbuf_mtx);
/*
* Initialize our zone. Right now I'm just guessing on the number
* we need based on the number of pages in the system. Each swblock
* can hold SWAP_META_PAGES (32 by default) pages, so this is probably
* overkill. This reservation
* is typically limited to around 32MB by default.
*/
n = cnt.v_page_count / 2;
if (maxswzone && n > maxswzone / sizeof(struct swblock))
n = maxswzone / sizeof(struct swblock);
n2 = n;
swap_zone = uma_zcreate("SWAPMETA", sizeof(struct swblock), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM);
if (swap_zone == NULL)
panic("failed to create swap_zone.");
do {
if (uma_zone_set_obj(swap_zone, &swap_zone_obj, n))
break;
/*
* if the allocation failed, try a zone two thirds the
* size of the previous attempt.
*/
n -= ((n + 2) / 3);
} while (n > 0);
if (n2 != n)
printf("Swap zone entries reduced from %d to %d.\n", n2, n);
n2 = n;
/*
* Initialize our meta-data hash table. The swapper does not need to
* be quite as efficient as the VM system, so we do not use an
* oversized hash table.
*
* n: size of hash table, must be power of 2
* swhash_mask: hash table index mask
*/
for (n = 1; n < n2 / 8; n *= 2)
;
swhash = malloc(sizeof(struct swblock *) * n, M_VMPGDATA, M_WAITOK | M_ZERO);
swhash_mask = n - 1;
mtx_init(&swhash_mtx, "swap_pager swhash", NULL, MTX_DEF);
}
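/*
 * Sizing sketch for the hash loop above (illustrative numbers): with
 * n2 == 50000 swblock entries, n doubles 1, 2, 4, ... until it is no longer
 * below n2 / 8 == 6250, ending at 8192; the hash table then has 8192
 * buckets and swhash_mask == 8191.
 */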
/*
* SWAP_PAGER_ALLOC() - allocate a new OBJT_SWAP VM object and instantiate
* its metadata structures.
*
* This routine is called from the mmap and fork code to create a new
* OBJT_SWAP object. We do this by creating an OBJT_DEFAULT object
* and then converting it with swp_pager_meta_build().
*
* This routine may block in vm_object_allocate() and create a named
* object lookup race, so we must interlock.
*
* MPSAFE
*/
static vm_object_t
swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
vm_ooffset_t offset, struct ucred *cred)
{
vm_object_t object;
vm_pindex_t pindex;
pindex = OFF_TO_IDX(offset + PAGE_MASK + size);
if (handle) {
mtx_lock(&Giant);
/*
* Reference existing named region or allocate new one. There
* should not be a race here against swp_pager_meta_build()
* as called from vm_page_remove() in regards to the lookup
* of the handle.
*/
sx_xlock(&sw_alloc_sx);
object = vm_pager_object_lookup(NOBJLIST(handle), handle);
if (object == NULL) {
if (cred != NULL) {
if (!swap_reserve_by_cred(size, cred)) {
sx_xunlock(&sw_alloc_sx);
mtx_unlock(&Giant);
return (NULL);
}
crhold(cred);
}
object = vm_object_allocate(OBJT_DEFAULT, pindex);
VM_OBJECT_LOCK(object);
object->handle = handle;
if (cred != NULL) {
object->cred = cred;
object->charge = size;
}
swp_pager_meta_build(object, 0, SWAPBLK_NONE);
VM_OBJECT_UNLOCK(object);
}
sx_xunlock(&sw_alloc_sx);
mtx_unlock(&Giant);
} else {
if (cred != NULL) {
if (!swap_reserve_by_cred(size, cred))
return (NULL);
crhold(cred);
}
object = vm_object_allocate(OBJT_DEFAULT, pindex);
VM_OBJECT_LOCK(object);
if (cred != NULL) {
object->cred = cred;
object->charge = size;
}
swp_pager_meta_build(object, 0, SWAPBLK_NONE);
VM_OBJECT_UNLOCK(object);
}
return (object);
}
/*
* SWAP_PAGER_DEALLOC() - remove swap metadata from object
*
* The swap backing for the object is destroyed. The code is
* designed such that we can reinstantiate it later, but this
* routine is typically called only when the entire object is
* about to be destroyed.
*
* The object must be locked.
*/
static void
swap_pager_dealloc(vm_object_t object)
{
/*
* Remove from list right away so lookups will fail if we block for
* pageout completion.
*/
if (object->handle != NULL) {
mtx_lock(&sw_alloc_mtx);
TAILQ_REMOVE(NOBJLIST(object->handle), object, pager_object_list);
mtx_unlock(&sw_alloc_mtx);
}
VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
vm_object_pip_wait(object, "swpdea");
/*
* Free all remaining metadata. We only bother to free it from
* the swap meta data. We do not attempt to free swapblk's still
* associated with vm_page_t's for this object. We do not care
* if paging is still in progress on some objects.
*/
swp_pager_meta_free_all(object);
}
/************************************************************************
* SWAP PAGER BITMAP ROUTINES *
************************************************************************/
/*
* SWP_PAGER_GETSWAPSPACE() - allocate raw swap space
*
* Allocate swap for the requested number of pages. The starting
* swap block number (a page index) is returned or SWAPBLK_NONE
* if the allocation failed.
*
* Also has the side effect of advising that somebody made a mistake
* when they configured swap and didn't configure enough.
*
* This routine may not sleep.
*
* We allocate in round-robin fashion from the configured devices.
*/
static daddr_t
swp_pager_getswapspace(int npages)
{
daddr_t blk;
struct swdevt *sp;
int i;
blk = SWAPBLK_NONE;
mtx_lock(&sw_dev_mtx);
sp = swdevhd;
for (i = 0; i < nswapdev; i++) {
if (sp == NULL)
sp = TAILQ_FIRST(&swtailq);
if (!(sp->sw_flags & SW_CLOSING)) {
blk = blist_alloc(sp->sw_blist, npages);
if (blk != SWAPBLK_NONE) {
blk += sp->sw_first;
sp->sw_used += npages;
swap_pager_avail -= npages;
swp_sizecheck();
swdevhd = TAILQ_NEXT(sp, sw_list);
goto done;
}
}
sp = TAILQ_NEXT(sp, sw_list);
}
if (swap_pager_full != 2) {
printf("swap_pager_getswapspace(%d): failed\n", npages);
swap_pager_full = 2;
swap_pager_almost_full = 1;
}
swdevhd = NULL;
done:
mtx_unlock(&sw_dev_mtx);
return (blk);
}
static int
swp_pager_isondev(daddr_t blk, struct swdevt *sp)
{
return (blk >= sp->sw_first && blk < sp->sw_end);
}
static void
swp_pager_strategy(struct buf *bp)
{
struct swdevt *sp;
mtx_lock(&sw_dev_mtx);
TAILQ_FOREACH(sp, &swtailq, sw_list) {
if (bp->b_blkno >= sp->sw_first && bp->b_blkno < sp->sw_end) {
mtx_unlock(&sw_dev_mtx);
sp->sw_strategy(bp, sp);
return;
}
}
panic("Swapdev not found");
}
/*
* SWP_PAGER_FREESWAPSPACE() - free raw swap space
*
* This routine returns the specified swap blocks back to the bitmap.
*
* This routine may not sleep.
*/
static void
swp_pager_freeswapspace(daddr_t blk, int npages)
{
struct swdevt *sp;
mtx_lock(&sw_dev_mtx);
TAILQ_FOREACH(sp, &swtailq, sw_list) {
if (blk >= sp->sw_first && blk < sp->sw_end) {
sp->sw_used -= npages;
/*
* If we are attempting to stop swapping on
* this device, we don't want to mark any
* blocks free lest they be reused.
*/
if ((sp->sw_flags & SW_CLOSING) == 0) {
blist_free(sp->sw_blist, blk - sp->sw_first,
npages);
swap_pager_avail += npages;
swp_sizecheck();
}
mtx_unlock(&sw_dev_mtx);
return;
}
}
panic("Swapdev not found");
}
/*
* SWAP_PAGER_FREESPACE() - frees swap blocks associated with a page
* range within an object.
*
* This is a globally accessible routine.
*
* This routine removes swapblk assignments from swap metadata.
*
* The external callers of this routine typically have already destroyed
* or renamed vm_page_t's associated with this range in the object so
* we should be ok.
*/
void
swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_size_t size)
{
VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
swp_pager_meta_free(object, start, size);
}
/*
* SWAP_PAGER_RESERVE() - reserve swap blocks in object
*
* Assigns swap blocks to the specified range within the object. The
* swap blocks are not zeroed. Any previous swap assignment is destroyed.
*
* Returns 0 on success, -1 on failure.
*/
int
swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size)
{
int n = 0;
daddr_t blk = SWAPBLK_NONE;
vm_pindex_t beg = start; /* save start index */
VM_OBJECT_LOCK(object);
while (size) {
if (n == 0) {
n = BLIST_MAX_ALLOC;
while ((blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE) {
n >>= 1;
if (n == 0) {
swp_pager_meta_free(object, beg, start - beg);
VM_OBJECT_UNLOCK(object);
return (-1);
}
}
}
swp_pager_meta_build(object, start, blk);
--size;
++start;
++blk;
--n;
}
swp_pager_meta_free(object, start, n);
VM_OBJECT_UNLOCK(object);
return (0);
}
/*
* SWAP_PAGER_COPY() - copy blocks from source pager to destination pager
* and destroy the source.
*
* Copy any valid swapblks from the source to the destination. In
* cases where both the source and destination have a valid swapblk,
* we keep the destination's.
*
* This routine is allowed to sleep. It may sleep allocating metadata
* indirectly through swp_pager_meta_build() or if paging is still in
* progress on the source.
*
* The source object contains no vm_page_t's (which is just as well)
*
* The source object is of type OBJT_SWAP.
*
* The source and destination objects must be locked.
* Both object locks may temporarily be released.
*/
void
swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject,
vm_pindex_t offset, int destroysource)
{
vm_pindex_t i;
VM_OBJECT_LOCK_ASSERT(srcobject, MA_OWNED);
VM_OBJECT_LOCK_ASSERT(dstobject, MA_OWNED);
/*
* If destroysource is set, we remove the source object from the
* swap_pager internal queue now.
*/
if (destroysource) {
if (srcobject->handle != NULL) {
mtx_lock(&sw_alloc_mtx);
TAILQ_REMOVE(
NOBJLIST(srcobject->handle),
srcobject,
pager_object_list
);
mtx_unlock(&sw_alloc_mtx);
}
}
/*
* transfer source to destination.
*/
for (i = 0; i < dstobject->size; ++i) {
daddr_t dstaddr;
/*
* Locate (without changing) the swapblk on the destination,
* unless it is invalid in which case free it silently, or
* if the destination is a resident page, in which case the
* source is thrown away.
*/
dstaddr = swp_pager_meta_ctl(dstobject, i, 0);
if (dstaddr == SWAPBLK_NONE) {
/*
* Destination has no swapblk and is not resident,
* copy source.
*/
daddr_t srcaddr;
srcaddr = swp_pager_meta_ctl(
srcobject,
i + offset,
SWM_POP
);
if (srcaddr != SWAPBLK_NONE) {
/*
* swp_pager_meta_build() can sleep.
*/
vm_object_pip_add(srcobject, 1);
VM_OBJECT_UNLOCK(srcobject);
vm_object_pip_add(dstobject, 1);
swp_pager_meta_build(dstobject, i, srcaddr);
vm_object_pip_wakeup(dstobject);
VM_OBJECT_LOCK(srcobject);
vm_object_pip_wakeup(srcobject);
}
} else {
/*
* Destination has valid swapblk or it is represented
* by a resident page. We destroy the sourceblock.
*/
swp_pager_meta_ctl(srcobject, i + offset, SWM_FREE);
}
}
/*
* Free left over swap blocks in source.
*
* We have to revert the type to OBJT_DEFAULT so we do not accidentally
* double-remove the object from the swap queues.
*/
if (destroysource) {
swp_pager_meta_free_all(srcobject);
/*
* Reverting the type is not necessary, the caller is going
* to destroy srcobject directly, but I'm doing it here
* for consistency since we've removed the object from its
* queues.
*/
srcobject->type = OBJT_DEFAULT;
}
}
/*
* SWAP_PAGER_HASPAGE() - determine if we have good backing store for
* the requested page.
*
* We determine whether good backing store exists for the requested
* page and return TRUE if it does, FALSE if it doesn't.
*
* If TRUE, we also try to determine how much valid, contiguous backing
* store exists before and after the requested page within a reasonable
* distance. We do not try to restrict it to the swap device stripe
* (that is handled in getpages/putpages). It probably isn't worth
* doing here.
*/
static boolean_t
swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after)
{
daddr_t blk0;
VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
/*
* do we have good backing store at the requested index ?
*/
blk0 = swp_pager_meta_ctl(object, pindex, 0);
if (blk0 == SWAPBLK_NONE) {
if (before)
*before = 0;
if (after)
*after = 0;
return (FALSE);
}
/*
* find backwards-looking contiguous good backing store
*/
if (before != NULL) {
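/*
 * Count how many immediately preceding pages have swap blocks
 * numerically contiguous with blk0; the loop exits with i one past
 * the last match.
 */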
int i;
for (i = 1; i < (SWB_NPAGES/2); ++i) {
daddr_t blk;
if (i > pindex)
break;
blk = swp_pager_meta_ctl(object, pindex - i, 0);
if (blk != blk0 - i)
break;
}
*before = (i - 1);
}
/*
* find forward-looking contiguous good backing store
*/
if (after != NULL) {
int i;
for (i = 1; i < (SWB_NPAGES/2); ++i) {
daddr_t blk;
blk = swp_pager_meta_ctl(object, pindex + i, 0);
if (blk != blk0 + i)
break;
}
*after = (i - 1);
}
return (TRUE);
}
/*
* SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page
*
* This removes any associated swap backing store, whether valid or
* not, from the page.
*
* This routine is typically called when a page is made dirty, at
* which point any associated swap can be freed. MADV_FREE also
* calls us in a special-case situation.
*
* NOTE!!! If the page is clean and the swap was valid, the caller
* should make the page dirty before calling this routine. This routine
* does NOT change the m->dirty status of the page. Also: MADV_FREE
* depends on it.
*
* This routine may not sleep.
*/
static void
swap_pager_unswapped(vm_page_t m)
{
VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
swp_pager_meta_ctl(m->object, m->pindex, SWM_FREE);
}
/*
* SWAP_PAGER_GETPAGES() - bring pages in from swap
*
* Attempt to retrieve (m, count) pages from backing store, but make
* sure we retrieve at least m[reqpage]. We try to load in as large
* a chunk surrounding m[reqpage] as is contiguous in swap and which
* belongs to the same object.
*
* The code is designed for asynchronous operation and
* immediate-notification of 'reqpage' but tends not to be
* used that way. Please do not optimize-out this algorithmic
* feature, I intend to improve on it in the future.
*
* The parent has a single vm_object_pip_add() reference prior to
* calling us and we should return with the same.
*
* The parent has BUSY'd the pages. We should return with 'm'
* left busy, but the others adjusted.
*/
static int
swap_pager_getpages(vm_object_t object, vm_page_t *m, int count, int reqpage)
{
struct buf *bp;
vm_page_t mreq;
int i;
int j;
daddr_t blk;
mreq = m[reqpage];
KASSERT(mreq->object == object,
("swap_pager_getpages: object mismatch %p/%p",
object, mreq->object));
/*
* Calculate range to retrieve. The pages have already been assigned
* their swapblks. We require a *contiguous* range but we know it to
* not span devices. If we do not supply it, bad things
* happen. Note that blk, iblk & jblk can be SWAPBLK_NONE, but the
* loops are set up such that the case(s) are handled implicitly.
*
* The swp_*() calls must be made with the object locked.
*/
blk = swp_pager_meta_ctl(mreq->object, mreq->pindex, 0);
for (i = reqpage - 1; i >= 0; --i) {
daddr_t iblk;
iblk = swp_pager_meta_ctl(m[i]->object, m[i]->pindex, 0);
if (blk != iblk + (reqpage - i))
break;
}
++i;
for (j = reqpage + 1; j < count; ++j) {
daddr_t jblk;
jblk = swp_pager_meta_ctl(m[j]->object, m[j]->pindex, 0);
if (blk != jblk - (j - reqpage))
break;
}
/*
* free pages outside our collection range. Note: we never free
* mreq, it must remain busy throughout.
*/
if (0 < i || j < count) {
int k;
for (k = 0; k < i; ++k)
swp_pager_free_nrpage(m[k]);
for (k = j; k < count; ++k)
swp_pager_free_nrpage(m[k]);
}
/*
* Return VM_PAGER_FAIL if we have nothing to do. Return mreq
* still busy, but the others unbusied.
*/
if (blk == SWAPBLK_NONE)
return (VM_PAGER_FAIL);
/*
* Getpbuf() can sleep.
*/
VM_OBJECT_UNLOCK(object);
/*
* Get a swap buffer header to perform the IO
*/
bp = getpbuf(&nsw_rcount);
bp->b_flags |= B_PAGING;
/*
* map our page(s) into kva for input
*/
pmap_qenter((vm_offset_t)bp->b_data, m + i, j - i);
bp->b_iocmd = BIO_READ;
bp->b_iodone = swp_pager_async_iodone;
bp->b_rcred = crhold(thread0.td_ucred);
bp->b_wcred = crhold(thread0.td_ucred);
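/*
 * blk is the swap block of the requested page; back it up to the
 * block of the first page in the cluster being read.
 */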
bp->b_blkno = blk - (reqpage - i);
bp->b_bcount = PAGE_SIZE * (j - i);
bp->b_bufsize = PAGE_SIZE * (j - i);
bp->b_pager.pg_reqpage = reqpage - i;
VM_OBJECT_LOCK(object);
{
int k;
for (k = i; k < j; ++k) {
bp->b_pages[k - i] = m[k];
m[k]->oflags |= VPO_SWAPINPROG;
}
}
bp->b_npages = j - i;
PCPU_INC(cnt.v_swapin);
PCPU_ADD(cnt.v_swappgsin, bp->b_npages);
/*
* We still hold the lock on mreq, and our automatic completion routine
* does not remove it.
*/
vm_object_pip_add(object, bp->b_npages);
VM_OBJECT_UNLOCK(object);
/*
* perform the I/O. NOTE!!! bp cannot be considered valid after
* this point because we automatically release it on completion.
* Instead, we look at the one page we are interested in which we
* still hold a lock on even through the I/O completion.
*
* The other pages in our m[] array are also released on completion,
* so we cannot assume they are valid anymore either.
*
* NOTE: b_blkno is destroyed by the call to swapdev_strategy
*/
BUF_KERNPROC(bp);
swp_pager_strategy(bp);
/*
* wait for the page we want to complete. VPO_SWAPINPROG is always
* cleared on completion. If an I/O error occurs, SWAPBLK_NONE
* is set in the meta-data.
*/
VM_OBJECT_LOCK(object);
while ((mreq->oflags & VPO_SWAPINPROG) != 0) {
mreq->oflags |= VPO_WANTED;
PCPU_INC(cnt.v_intrans);
if (msleep(mreq, VM_OBJECT_MTX(object), PSWP, "swread", hz*20)) {
printf(
"swap_pager: indefinite wait buffer: bufobj: %p, blkno: %jd, size: %ld\n",
bp->b_bufobj, (intmax_t)bp->b_blkno, bp->b_bcount);
}
}
/*
* mreq is left busied after completion, but all the other pages
* are freed. If we had an unrecoverable read error the page will
* not be valid.
*/
if (mreq->valid != VM_PAGE_BITS_ALL) {
return (VM_PAGER_ERROR);
} else {
return (VM_PAGER_OK);
}
/*
* A final note: in a low swap situation, we cannot deallocate swap
* and mark a page dirty here because the caller is likely to mark
* the page clean when we return, causing the page to possibly revert
* to all-zero's later.
*/
}
/*
* swap_pager_putpages:
*
* Assign swap (if necessary) and initiate I/O on the specified pages.
*
* We support both OBJT_DEFAULT and OBJT_SWAP objects. DEFAULT objects
* are automatically converted to SWAP objects.
*
* In a low memory situation we may block in VOP_STRATEGY(), but the new
* vm_page reservation system coupled with properly written VFS devices
* should ensure that no low-memory deadlock occurs. This is an area
* which needs work.
*
* The parent has N vm_object_pip_add() references prior to
* calling us and will remove references for rtvals[] that are
* not set to VM_PAGER_PEND. We need to remove the rest on I/O
* completion.
*
* The parent has soft-busy'd the pages it passes us and will unbusy
* those whose rtvals[] entry is not set to VM_PAGER_PEND on return.
* We need to unbusy the rest on I/O completion.
*/
void
swap_pager_putpages(vm_object_t object, vm_page_t *m, int count,
boolean_t sync, int *rtvals)
{
int i;
int n = 0;
if (count && m[0]->object != object) {
panic("swap_pager_putpages: object mismatch %p/%p",
object,
m[0]->object
);
}
/*
* Step 1
*
* Turn object into OBJT_SWAP
* check for bogus sysops
* force sync if not pageout process
*/
if (object->type != OBJT_SWAP)
swp_pager_meta_build(object, 0, SWAPBLK_NONE);
VM_OBJECT_UNLOCK(object);
if (curproc != pageproc)
sync = TRUE;
/*
* Step 2
*
* Update nsw parameters from swap_async_max sysctl values.
* Do not let the sysop crash the machine with bogus numbers.
*/
mtx_lock(&pbuf_mtx);
if (swap_async_max != nsw_wcount_async_max) {
int n;
/*
* limit range
*/
if ((n = swap_async_max) > nswbuf / 2)
n = nswbuf / 2;
if (n < 1)
n = 1;
swap_async_max = n;
/*
* Adjust difference ( if possible ). If the current async
* count is too low, we may not be able to make the adjustment
* at this time.
*/
n -= nsw_wcount_async_max;
if (nsw_wcount_async + n >= 0) {
nsw_wcount_async += n;
nsw_wcount_async_max += n;
wakeup(&nsw_wcount_async);
}
}
mtx_unlock(&pbuf_mtx);
/*
* Step 3
*
* Assign swap blocks and issue I/O. We reallocate swap on the fly.
* The page is left dirty until the pageout operation completes
* successfully.
*/
for (i = 0; i < count; i += n) {
int j;
struct buf *bp;
daddr_t blk;
/*
* Maximum I/O size is limited by a number of factors.
*/
n = min(BLIST_MAX_ALLOC, count - i);
n = min(n, nsw_cluster_max);
/*
* Get biggest block of swap we can. If we fail, fall
* back and try to allocate a smaller block. Don't go
* overboard trying to allocate space if it would overly
* fragment swap.
*/
while (
(blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE &&
n > 4
) {
n >>= 1;
}
if (blk == SWAPBLK_NONE) {
for (j = 0; j < n; ++j)
rtvals[i+j] = VM_PAGER_FAIL;
continue;
}
/*
* All I/O parameters have been satisfied, build the I/O
* request and assign the swap space.
*/
if (sync == TRUE) {
bp = getpbuf(&nsw_wcount_sync);
} else {
bp = getpbuf(&nsw_wcount_async);
bp->b_flags = B_ASYNC;
}
bp->b_flags |= B_PAGING;
bp->b_iocmd = BIO_WRITE;
pmap_qenter((vm_offset_t)bp->b_data, &m[i], n);
bp->b_rcred = crhold(thread0.td_ucred);
bp->b_wcred = crhold(thread0.td_ucred);
bp->b_bcount = PAGE_SIZE * n;
bp->b_bufsize = PAGE_SIZE * n;
bp->b_blkno = blk;
VM_OBJECT_LOCK(object);
for (j = 0; j < n; ++j) {
vm_page_t mreq = m[i+j];
swp_pager_meta_build(
mreq->object,
mreq->pindex,
blk + j
);
vm_page_dirty(mreq);
rtvals[i+j] = VM_PAGER_OK;
mreq->oflags |= VPO_SWAPINPROG;
bp->b_pages[j] = mreq;
}
VM_OBJECT_UNLOCK(object);
bp->b_npages = n;
/*
* Must set dirty range for NFS to work.
*/
bp->b_dirtyoff = 0;
bp->b_dirtyend = bp->b_bcount;
PCPU_INC(cnt.v_swapout);
PCPU_ADD(cnt.v_swappgsout, bp->b_npages);
/*
* asynchronous
*
* NOTE: b_blkno is destroyed by the call to swapdev_strategy
*/
if (sync == FALSE) {
bp->b_iodone = swp_pager_async_iodone;
BUF_KERNPROC(bp);
swp_pager_strategy(bp);
for (j = 0; j < n; ++j)
rtvals[i+j] = VM_PAGER_PEND;
/* restart outer loop */
continue;
}
/*
* synchronous
*
* NOTE: b_blkno is destroyed by the call to swapdev_strategy
*/
bp->b_iodone = bdone;
swp_pager_strategy(bp);
/*
* Wait for the sync I/O to complete, then update rtvals.
* We just set the rtvals[] to VM_PAGER_PEND so we can call
* our async completion routine at the end, thus avoiding a
* double-free.
*/
bwait(bp, PVM, "swwrt");
for (j = 0; j < n; ++j)
rtvals[i+j] = VM_PAGER_PEND;
/*
* Now that we are through with the bp, we can call the
* normal async completion, which frees everything up.
*/
swp_pager_async_iodone(bp);
}
VM_OBJECT_LOCK(object);
}
/*
* swp_pager_async_iodone:
*
* Completion routine for asynchronous reads and writes from/to swap.
* Also called manually by synchronous code to finish up a bp.
*
* For READ operations, the pages are VPO_BUSY'd. For WRITE operations,
* the pages are vm_page_t->busy'd. For READ operations, we VPO_BUSY
* unbusy all pages except the 'main' request page. For WRITE
* operations, we vm_page_t->busy unbusy all pages ( we can do this
* because we marked them all VM_PAGER_PEND on return from putpages ).
*
* This routine may not sleep.
*/
static void
swp_pager_async_iodone(struct buf *bp)
{
int i;
vm_object_t object = NULL;
/*
* report error
*/
if (bp->b_ioflags & BIO_ERROR) {
printf(
"swap_pager: I/O error - %s failed; blkno %ld,"
"size %ld, error %d\n",
((bp->b_iocmd == BIO_READ) ? "pagein" : "pageout"),
(long)bp->b_blkno,
(long)bp->b_bcount,
bp->b_error
);
}
/*
* remove the mapping for kernel virtual
*/
pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages);
if (bp->b_npages) {
object = bp->b_pages[0]->object;
VM_OBJECT_LOCK(object);
}
/*
* cleanup pages. If an error occurs writing to swap, we are in
* very serious trouble. If it happens to be a disk error, though,
* we may be able to recover by reassigning the swap later on. So
* in this case we remove the m->swapblk assignment for the page
* but do not free it in the rlist. The erroneous block(s) are thus
* never reallocated as swap. Redirty the page and continue.
*/
for (i = 0; i < bp->b_npages; ++i) {
vm_page_t m = bp->b_pages[i];
m->oflags &= ~VPO_SWAPINPROG;
if (bp->b_ioflags & BIO_ERROR) {
/*
* If an error occurs I'd love to throw the swapblk
* away without freeing it back to swapspace, so it
* can never be used again. But I can't from an
* interrupt.
*/
if (bp->b_iocmd == BIO_READ) {
/*
* When reading, reqpage needs to stay
* locked for the parent, but all other
* pages can be freed. We still want to
* wakeup the parent waiting on the page,
* though. ( also: pg_reqpage can be -1 and
* not match anything ).
*
* We have to wake specifically requested pages
* up too because we cleared VPO_SWAPINPROG and
* someone may be waiting for that.
*
* NOTE: for reads, m->dirty will probably
* be overridden by the original caller of
* getpages so don't play cute tricks here.
*/
m->valid = 0;
if (i != bp->b_pager.pg_reqpage)
swp_pager_free_nrpage(m);
else
vm_page_flash(m);
/*
* If i == bp->b_pager.pg_reqpage, do not wake
* the page up. The caller needs to.
*/
} else {
/*
* If a write error occurs, reactivate page
* so it doesn't clog the inactive list,
* then finish the I/O.
*/
vm_page_dirty(m);
vm_page_lock(m);
vm_page_activate(m);
vm_page_unlock(m);
vm_page_io_finish(m);
}
} else if (bp->b_iocmd == BIO_READ) {
/*
* NOTE: for reads, m->dirty will probably be
* overridden by the original caller of getpages so
* we cannot set them in order to free the underlying
* swap in a low-swap situation. I don't think we'd
* want to do that anyway, but it was an optimization
* that existed in the old swapper for a time before
* it got ripped out due to precisely this problem.
*
* If not the requested page then deactivate it.
*
* Note that the requested page, reqpage, is left
* busied, but we still have to wake it up. The
* other pages are released (unbusied) by
* vm_page_wakeup().
*/
KASSERT(!pmap_page_is_mapped(m),
("swp_pager_async_iodone: page %p is mapped", m));
m->valid = VM_PAGE_BITS_ALL;
KASSERT(m->dirty == 0,
("swp_pager_async_iodone: page %p is dirty", m));
/*
* We have to wake specifically requested pages
* up too because we cleared VPO_SWAPINPROG and
* getpages could be waiting for it. However,
* be sure not to unbusy the page specifically
* requested by getpages - getpages expects it to be
* left busy.
*/
if (i != bp->b_pager.pg_reqpage) {
vm_page_lock(m);
vm_page_deactivate(m);
vm_page_unlock(m);
vm_page_wakeup(m);
} else
vm_page_flash(m);
} else {
/*
* For write success, clear the dirty
* status, then finish the I/O ( which decrements the
* busy count and possibly wakes waiters up ).
*/
KASSERT((m->aflags & PGA_WRITEABLE) == 0,
("swp_pager_async_iodone: page %p is not write"
" protected", m));
vm_page_undirty(m);
vm_page_io_finish(m);
if (vm_page_count_severe()) {
vm_page_lock(m);
vm_page_try_to_cache(m);
vm_page_unlock(m);
}
}
}
/*
* adjust pip. NOTE: the original parent may still have its own
* pip refs on the object.
*/
if (object != NULL) {
vm_object_pip_wakeupn(object, bp->b_npages);
VM_OBJECT_UNLOCK(object);
}
/*
* swapdev_strategy() manually sets b_vp and b_bufobj before calling
* bstrategy(). Set them back to NULL now we're done with it, or we'll
* trigger a KASSERT in relpbuf().
*/
if (bp->b_vp) {
bp->b_vp = NULL;
bp->b_bufobj = NULL;
}
/*
* release the physical I/O buffer
*/
relpbuf(
bp,
((bp->b_iocmd == BIO_READ) ? &nsw_rcount :
((bp->b_flags & B_ASYNC) ?
&nsw_wcount_async :
&nsw_wcount_sync
)
)
);
}
/*
* swap_pager_isswapped:
*
* Return 1 if at least one page in the given object is paged
* out to the given swap device.
*
* This routine may not sleep.
*/
int
swap_pager_isswapped(vm_object_t object, struct swdevt *sp)
{
daddr_t index = 0;
int bcount;
int i;
VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
if (object->type != OBJT_SWAP)
return (0);
mtx_lock(&swhash_mtx);
for (bcount = 0; bcount < object->un_pager.swp.swp_bcount; bcount++) {
struct swblock *swap;
if ((swap = *swp_pager_hash(object, index)) != NULL) {
for (i = 0; i < SWAP_META_PAGES; ++i) {
if (swp_pager_isondev(swap->swb_pages[i], sp)) {
mtx_unlock(&swhash_mtx);
return (1);
}
}
}
index += SWAP_META_PAGES;
}
mtx_unlock(&swhash_mtx);
return (0);
}
/*
* SWP_PAGER_FORCE_PAGEIN() - force a swap block to be paged in
*
* This routine dissociates the page at the given index within a
* swap block from its backing store, paging it in if necessary.
* If the page is paged in, it is placed in the inactive queue,
* since it had its backing store ripped out from under it.
* We also attempt to swap in all other pages in the swap block,
* but we only guarantee that the one at the specified index is
* paged in.
*
* XXX - The code to page the whole block in doesn't work, so we
* revert to the one-by-one behavior for now. Sigh.
*/
static inline void
swp_pager_force_pagein(vm_object_t object, vm_pindex_t pindex)
{
vm_page_t m;
vm_object_pip_add(object, 1);
m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL|VM_ALLOC_RETRY);
if (m->valid == VM_PAGE_BITS_ALL) {
vm_object_pip_subtract(object, 1);
vm_page_dirty(m);
vm_page_lock(m);
vm_page_activate(m);
vm_page_unlock(m);
vm_page_wakeup(m);
vm_pager_page_unswapped(m);
return;
}
if (swap_pager_getpages(object, &m, 1, 0) != VM_PAGER_OK)
panic("swap_pager_force_pagein: read from swap failed");/*XXX*/
vm_object_pip_subtract(object, 1);
vm_page_dirty(m);
vm_page_lock(m);
vm_page_deactivate(m);
vm_page_unlock(m);
vm_page_wakeup(m);
vm_pager_page_unswapped(m);
}
/*
* swap_pager_swapoff:
*
* Page in all of the pages that have been paged out to the
* given device. The corresponding blocks in the bitmap must be
* marked as allocated and the device must be flagged SW_CLOSING.
* There must be no processes swapped out to the device.
*
* This routine may block.
*/
static void
swap_pager_swapoff(struct swdevt *sp)
{
struct swblock *swap;
int i, j, retries;
GIANT_REQUIRED;
retries = 0;
full_rescan:
mtx_lock(&swhash_mtx);
for (i = 0; i <= swhash_mask; i++) { /* '<=' is correct here */
restart:
for (swap = swhash[i]; swap != NULL; swap = swap->swb_hnext) {
vm_object_t object = swap->swb_object;
vm_pindex_t pindex = swap->swb_index;
for (j = 0; j < SWAP_META_PAGES; ++j) {
if (swp_pager_isondev(swap->swb_pages[j], sp)) {
/* avoid deadlock */
if (!VM_OBJECT_TRYLOCK(object)) {
break;
} else {
mtx_unlock(&swhash_mtx);
swp_pager_force_pagein(object,
pindex + j);
VM_OBJECT_UNLOCK(object);
mtx_lock(&swhash_mtx);
goto restart;
}
}
}
}
}
mtx_unlock(&swhash_mtx);
if (sp->sw_used) {
/*
* Objects may be locked or paging to the device being
* removed, so we will miss their pages and need to
* make another pass. We have marked this device as
* SW_CLOSING, so the activity should finish soon.
*/
retries++;
if (retries > 100) {
panic("swapoff: failed to locate %d swap blocks",
sp->sw_used);
}
pause("swpoff", hz / 20);
goto full_rescan;
}
}
/************************************************************************
* SWAP META DATA *
************************************************************************
*
* These routines manipulate the swap metadata stored in the
* OBJT_SWAP object.
*
* Swap metadata is implemented with a global hash and not directly
* linked into the object. Instead the object simply contains
* appropriate tracking counters.
*/
/*
* SWP_PAGER_META_BUILD() - add swap block to swap meta data for object
*
* We first convert the object to a swap object if it is a default
* object.
*
* The specified swapblk is added to the object's swap metadata. If
* the swapblk is not valid, it is freed instead. Any previously
* assigned swapblk is freed.
*/
static void
swp_pager_meta_build(vm_object_t object, vm_pindex_t pindex, daddr_t swapblk)
{
struct swblock *swap;
struct swblock **pswap;
int idx;
VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
/*
* Convert default object to swap object if necessary
*/
if (object->type != OBJT_SWAP) {
object->type = OBJT_SWAP;
object->un_pager.swp.swp_bcount = 0;
if (object->handle != NULL) {
mtx_lock(&sw_alloc_mtx);
TAILQ_INSERT_TAIL(
NOBJLIST(object->handle),
object,
pager_object_list
);
mtx_unlock(&sw_alloc_mtx);
}
}
/*
* Locate the hash entry. If not found, create one, but if we aren't adding
* anything just return. If we run out of space in the map we wait
* and, since the hash table may have changed, retry.
*/
retry:
mtx_lock(&swhash_mtx);
pswap = swp_pager_hash(object, pindex);
if ((swap = *pswap) == NULL) {
int i;
if (swapblk == SWAPBLK_NONE)
goto done;
swap = *pswap = uma_zalloc(swap_zone, M_NOWAIT);
if (swap == NULL) {
mtx_unlock(&swhash_mtx);
VM_OBJECT_UNLOCK(object);
if (uma_zone_exhausted(swap_zone)) {
printf("swap zone exhausted, increase kern.maxswzone\n");
vm_pageout_oom(VM_OOM_SWAPZ);
pause("swzonex", 10);
} else
VM_WAIT;
VM_OBJECT_LOCK(object);
goto retry;
}
swap->swb_hnext = NULL;
swap->swb_object = object;
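/* swb_index is the chunk-aligned base page index covered by this swblock. */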
swap->swb_index = pindex & ~(vm_pindex_t)SWAP_META_MASK;
swap->swb_count = 0;
++object->un_pager.swp.swp_bcount;
for (i = 0; i < SWAP_META_PAGES; ++i)
swap->swb_pages[i] = SWAPBLK_NONE;
}
/*
* Delete prior contents of metadata
*/
idx = pindex & SWAP_META_MASK;
if (swap->swb_pages[idx] != SWAPBLK_NONE) {
swp_pager_freeswapspace(swap->swb_pages[idx], 1);
--swap->swb_count;
}
/*
* Enter block into metadata
*/
swap->swb_pages[idx] = swapblk;
if (swapblk != SWAPBLK_NONE)
++swap->swb_count;
done:
mtx_unlock(&swhash_mtx);
}
/*
* SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata
*
* The requested range of blocks is freed, with any associated swap
* returned to the swap bitmap.
*
* This routine will free swap metadata structures as they are cleaned
* out. This routine does *NOT* operate on swap metadata associated
* with resident pages.
*/
static void
swp_pager_meta_free(vm_object_t object, vm_pindex_t index, daddr_t count)
{
VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
if (object->type != OBJT_SWAP)
return;
while (count > 0) {
struct swblock **pswap;
struct swblock *swap;
mtx_lock(&swhash_mtx);
pswap = swp_pager_hash(object, index);
if ((swap = *pswap) != NULL) {
daddr_t v = swap->swb_pages[index & SWAP_META_MASK];
if (v != SWAPBLK_NONE) {
swp_pager_freeswapspace(v, 1);
swap->swb_pages[index & SWAP_META_MASK] =
SWAPBLK_NONE;
if (--swap->swb_count == 0) {
*pswap = swap->swb_hnext;
uma_zfree(swap_zone, swap);
--object->un_pager.swp.swp_bcount;
}
}
--count;
++index;
} else {
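/*
 * No swblock covers this index; skip ahead to the start of the
 * next SWAP_META_PAGES-sized chunk.
 */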
int n = SWAP_META_PAGES - (index & SWAP_META_MASK);
count -= n;
index += n;
}
mtx_unlock(&swhash_mtx);
}
}
/*
* SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object
*
* This routine locates and destroys all swap metadata associated with
* an object.
*/
static void
swp_pager_meta_free_all(vm_object_t object)
{
daddr_t index = 0;
VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
if (object->type != OBJT_SWAP)
return;
while (object->un_pager.swp.swp_bcount) {
struct swblock **pswap;
struct swblock *swap;
mtx_lock(&swhash_mtx);
pswap = swp_pager_hash(object, index);
if ((swap = *pswap) != NULL) {
int i;
for (i = 0; i < SWAP_META_PAGES; ++i) {
daddr_t v = swap->swb_pages[i];
if (v != SWAPBLK_NONE) {
--swap->swb_count;
swp_pager_freeswapspace(v, 1);
}
}
if (swap->swb_count != 0)
panic("swap_pager_meta_free_all: swb_count != 0");
*pswap = swap->swb_hnext;
uma_zfree(swap_zone, swap);
--object->un_pager.swp.swp_bcount;
}
mtx_unlock(&swhash_mtx);
index += SWAP_META_PAGES;
}
}
/*
* SWP_PAGER_METACTL() - misc control of swap and vm_page_t meta data.
*
* This routine is capable of looking up, popping, or freeing
* swapblk assignments in the swap meta data or in the vm_page_t.
* The routine typically returns the swapblk being looked-up, or popped,
* or SWAPBLK_NONE if the block was freed, or SWAPBLK_NONE if the block
* was invalid. This routine will automatically free any invalid
* meta-data swapblks.
*
* It is not possible to store invalid swapblks in the swap meta data
* (other than a literal 'SWAPBLK_NONE'), so we don't bother checking.
*
* When acting on a busy resident page and paging is in progress, we
* have to wait until paging is complete but otherwise can act on the
* busy page.
*
* SWM_FREE remove and free swap block from metadata
* SWM_POP remove from meta data but do not free.. pop it out
*/
static daddr_t
swp_pager_meta_ctl(vm_object_t object, vm_pindex_t pindex, int flags)
{
struct swblock **pswap;
struct swblock *swap;
daddr_t r1;
int idx;
VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
/*
* The meta data only exists if the object is OBJT_SWAP
* and even then might not be allocated yet.
*/
if (object->type != OBJT_SWAP)
return (SWAPBLK_NONE);
r1 = SWAPBLK_NONE;
mtx_lock(&swhash_mtx);
pswap = swp_pager_hash(object, pindex);
if ((swap = *pswap) != NULL) {
idx = pindex & SWAP_META_MASK;
r1 = swap->swb_pages[idx];
if (r1 != SWAPBLK_NONE) {
if (flags & SWM_FREE) {
swp_pager_freeswapspace(r1, 1);
r1 = SWAPBLK_NONE;
}
if (flags & (SWM_FREE|SWM_POP)) {
swap->swb_pages[idx] = SWAPBLK_NONE;
if (--swap->swb_count == 0) {
*pswap = swap->swb_hnext;
uma_zfree(swap_zone, swap);
--object->un_pager.swp.swp_bcount;
}
}
}
}
mtx_unlock(&swhash_mtx);
return (r1);
}
/*
* System call swapon(name) enables swapping on device name,
* which must be in the swdevsw. Return EBUSY
* if already swapping on this device.
*/
#ifndef _SYS_SYSPROTO_H_
struct swapon_args {
char *name;
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
int
-swapon(struct thread *td, struct swapon_args *uap)
+sys_swapon(struct thread *td, struct swapon_args *uap)
{
struct vattr attr;
struct vnode *vp;
struct nameidata nd;
int error;
error = priv_check(td, PRIV_SWAPON);
if (error)
return (error);
mtx_lock(&Giant);
while (swdev_syscall_active)
tsleep(&swdev_syscall_active, PUSER - 1, "swpon", 0);
swdev_syscall_active = 1;
/*
* Swap metadata may not fit in the KVM if we have physical
* memory of >1GB.
*/
if (swap_zone == NULL) {
error = ENOMEM;
goto done;
}
NDINIT(&nd, LOOKUP, ISOPEN | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
uap->name, td);
error = namei(&nd);
if (error)
goto done;
NDFREE(&nd, NDF_ONLY_PNBUF);
vp = nd.ni_vp;
if (vn_isdisk(vp, &error)) {
error = swapongeom(td, vp);
} else if (vp->v_type == VREG &&
(vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
(error = VOP_GETATTR(vp, &attr, td->td_ucred)) == 0) {
/*
* Allow direct swapping to NFS regular files in the same
* way that nfs_mountroot() sets up diskless swapping.
*/
error = swaponvp(td, vp, attr.va_size / DEV_BSIZE);
}
if (error)
vrele(vp);
done:
swdev_syscall_active = 0;
wakeup_one(&swdev_syscall_active);
mtx_unlock(&Giant);
return (error);
}
static void
swaponsomething(struct vnode *vp, void *id, u_long nblks, sw_strategy_t *strategy, sw_close_t *close, dev_t dev)
{
struct swdevt *sp, *tsp;
swblk_t dvbase;
u_long mblocks;
/*
* nblks is in DEV_BSIZE'd chunks, convert to PAGE_SIZE'd chunks.
* First chop nblks off to page-align it, then convert.
*
* sw->sw_nblks is in page-sized chunks now too.
*/
nblks &= ~(ctodb(1) - 1);
nblks = dbtoc(nblks);
/*
* If we go beyond this, we get overflows in the radix
* tree bitmap code.
*/
mblocks = 0x40000000 / BLIST_META_RADIX;
if (nblks > mblocks) {
printf(
"WARNING: reducing swap size to maximum of %luMB per unit\n",
mblocks / 1024 / 1024 * PAGE_SIZE);
nblks = mblocks;
}
sp = malloc(sizeof *sp, M_VMPGDATA, M_WAITOK | M_ZERO);
sp->sw_vp = vp;
sp->sw_id = id;
sp->sw_dev = dev;
sp->sw_flags = 0;
sp->sw_nblks = nblks;
sp->sw_used = 0;
sp->sw_strategy = strategy;
sp->sw_close = close;
sp->sw_blist = blist_create(nblks, M_WAITOK);
/*
* Do not free the first two blocks in order to avoid overwriting
* any BSD label at the front of the partition.
*/
blist_free(sp->sw_blist, 2, nblks - 2);
dvbase = 0;
mtx_lock(&sw_dev_mtx);
TAILQ_FOREACH(tsp, &swtailq, sw_list) {
if (tsp->sw_end >= dvbase) {
/*
* We put one uncovered page between the devices
* in order to definitively prevent any cross-device
* I/O requests
*/
dvbase = tsp->sw_end + 1;
}
}
sp->sw_first = dvbase;
sp->sw_end = dvbase + nblks;
TAILQ_INSERT_TAIL(&swtailq, sp, sw_list);
nswapdev++;
swap_pager_avail += nblks;
swap_total += (vm_ooffset_t)nblks * PAGE_SIZE;
swp_sizecheck();
mtx_unlock(&sw_dev_mtx);
}
/*
* SYSCALL: swapoff(devname)
*
* Disable swapping on the given device.
*
* XXX: Badly designed system call: it should use a device index
* rather than filename as specification. We keep sw_vp around
* only to make this work.
*/
#ifndef _SYS_SYSPROTO_H_
struct swapoff_args {
char *name;
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
int
-swapoff(struct thread *td, struct swapoff_args *uap)
+sys_swapoff(struct thread *td, struct swapoff_args *uap)
{
struct vnode *vp;
struct nameidata nd;
struct swdevt *sp;
int error;
error = priv_check(td, PRIV_SWAPOFF);
if (error)
return (error);
mtx_lock(&Giant);
while (swdev_syscall_active)
tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0);
swdev_syscall_active = 1;
NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->name,
td);
error = namei(&nd);
if (error)
goto done;
NDFREE(&nd, NDF_ONLY_PNBUF);
vp = nd.ni_vp;
mtx_lock(&sw_dev_mtx);
TAILQ_FOREACH(sp, &swtailq, sw_list) {
if (sp->sw_vp == vp)
break;
}
mtx_unlock(&sw_dev_mtx);
if (sp == NULL) {
error = EINVAL;
goto done;
}
error = swapoff_one(sp, td->td_ucred);
done:
swdev_syscall_active = 0;
wakeup_one(&swdev_syscall_active);
mtx_unlock(&Giant);
return (error);
}
static int
swapoff_one(struct swdevt *sp, struct ucred *cred)
{
u_long nblks, dvbase;
#ifdef MAC
int error;
#endif
mtx_assert(&Giant, MA_OWNED);
#ifdef MAC
(void) vn_lock(sp->sw_vp, LK_EXCLUSIVE | LK_RETRY);
error = mac_system_check_swapoff(cred, sp->sw_vp);
(void) VOP_UNLOCK(sp->sw_vp, 0);
if (error != 0)
return (error);
#endif
nblks = sp->sw_nblks;
/*
* We can turn off this swap device safely only if the
* available virtual memory in the system will fit the amount
* of data we will have to page back in, plus an epsilon so
* the system doesn't become critically low on swap space.
*/
if (cnt.v_free_count + cnt.v_cache_count + swap_pager_avail <
nblks + nswap_lowat) {
return (ENOMEM);
}
/*
* Prevent further allocations on this device.
*/
mtx_lock(&sw_dev_mtx);
sp->sw_flags |= SW_CLOSING;
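/*
 * Mark every remaining free block on the device as allocated so no
 * new allocations land on it; blist_fill() returns the number of
 * previously free blocks it claimed, which we subtract from the
 * global free count.
 */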
for (dvbase = 0; dvbase < sp->sw_end; dvbase += dmmax) {
swap_pager_avail -= blist_fill(sp->sw_blist,
dvbase, dmmax);
}
swap_total -= (vm_ooffset_t)nblks * PAGE_SIZE;
mtx_unlock(&sw_dev_mtx);
/*
* Page in the contents of the device and close it.
*/
swap_pager_swapoff(sp);
sp->sw_close(curthread, sp);
sp->sw_id = NULL;
mtx_lock(&sw_dev_mtx);
TAILQ_REMOVE(&swtailq, sp, sw_list);
nswapdev--;
if (nswapdev == 0) {
swap_pager_full = 2;
swap_pager_almost_full = 1;
}
if (swdevhd == sp)
swdevhd = NULL;
mtx_unlock(&sw_dev_mtx);
blist_destroy(sp->sw_blist);
free(sp, M_VMPGDATA);
return (0);
}
void
swapoff_all(void)
{
struct swdevt *sp, *spt;
const char *devname;
int error;
mtx_lock(&Giant);
while (swdev_syscall_active)
tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0);
swdev_syscall_active = 1;
mtx_lock(&sw_dev_mtx);
TAILQ_FOREACH_SAFE(sp, &swtailq, sw_list, spt) {
mtx_unlock(&sw_dev_mtx);
if (vn_isdisk(sp->sw_vp, NULL))
devname = sp->sw_vp->v_rdev->si_name;
else
devname = "[file]";
error = swapoff_one(sp, thread0.td_ucred);
if (error != 0) {
printf("Cannot remove swap device %s (error=%d), "
"skipping.\n", devname, error);
} else if (bootverbose) {
printf("Swap device %s removed.\n", devname);
}
mtx_lock(&sw_dev_mtx);
}
mtx_unlock(&sw_dev_mtx);
swdev_syscall_active = 0;
wakeup_one(&swdev_syscall_active);
mtx_unlock(&Giant);
}
void
swap_pager_status(int *total, int *used)
{
struct swdevt *sp;
*total = 0;
*used = 0;
mtx_lock(&sw_dev_mtx);
TAILQ_FOREACH(sp, &swtailq, sw_list) {
*total += sp->sw_nblks;
*used += sp->sw_used;
}
mtx_unlock(&sw_dev_mtx);
}
int
swap_dev_info(int name, struct xswdev *xs, char *devname, size_t len)
{
struct swdevt *sp;
char *tmp_devname;
int error, n;
n = 0;
error = ENOENT;
mtx_lock(&sw_dev_mtx);
TAILQ_FOREACH(sp, &swtailq, sw_list) {
if (n != name) {
n++;
continue;
}
xs->xsw_version = XSWDEV_VERSION;
xs->xsw_dev = sp->sw_dev;
xs->xsw_flags = sp->sw_flags;
xs->xsw_nblks = sp->sw_nblks;
xs->xsw_used = sp->sw_used;
if (devname != NULL) {
if (vn_isdisk(sp->sw_vp, NULL))
tmp_devname = sp->sw_vp->v_rdev->si_name;
else
tmp_devname = "[file]";
strncpy(devname, tmp_devname, len);
}
error = 0;
break;
}
mtx_unlock(&sw_dev_mtx);
return (error);
}
static int
sysctl_vm_swap_info(SYSCTL_HANDLER_ARGS)
{
struct xswdev xs;
int error;
if (arg2 != 1) /* name length */
return (EINVAL);
error = swap_dev_info(*(int *)arg1, &xs, NULL, 0);
if (error != 0)
return (error);
error = SYSCTL_OUT(req, &xs, sizeof(xs));
return (error);
}
SYSCTL_INT(_vm, OID_AUTO, nswapdev, CTLFLAG_RD, &nswapdev, 0,
"Number of swap devices");
SYSCTL_NODE(_vm, OID_AUTO, swap_info, CTLFLAG_RD, sysctl_vm_swap_info,
"Swap statistics by device");
/*
* vmspace_swap_count() - count the approximate swap usage in pages for a
* vmspace.
*
* The map must be locked.
*
* Swap usage is determined by taking the proportional swap used by
* VM objects backing the VM map. To make up for fractional losses,
* if the VM object has any swap use at all the associated map entries
* count for at least 1 swap page.
*/
long
vmspace_swap_count(struct vmspace *vmspace)
{
vm_map_t map;
vm_map_entry_t cur;
vm_object_t object;
long count, n;
map = &vmspace->vm_map;
count = 0;
for (cur = map->header.next; cur != &map->header; cur = cur->next) {
if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 &&
(object = cur->object.vm_object) != NULL) {
VM_OBJECT_LOCK(object);
if (object->type == OBJT_SWAP &&
object->un_pager.swp.swp_bcount != 0) {
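/*
 * Charge this entry its proportional share of the object's swap:
 * swp_bcount chunks of SWAP_META_PAGES pages, scaled by the
 * fraction of the object this entry maps, plus one page to
 * absorb rounding.
 */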
n = (cur->end - cur->start) / PAGE_SIZE;
count += object->un_pager.swp.swp_bcount *
SWAP_META_PAGES * n / object->size + 1;
}
VM_OBJECT_UNLOCK(object);
}
}
return (count);
}
/*
* GEOM backend
*
* Swapping onto disk devices.
*
*/
static g_orphan_t swapgeom_orphan;
static struct g_class g_swap_class = {
.name = "SWAP",
.version = G_VERSION,
.orphan = swapgeom_orphan,
};
DECLARE_GEOM_CLASS(g_swap_class, g_class);
static void
swapgeom_done(struct bio *bp2)
{
struct buf *bp;
bp = bp2->bio_caller2;
bp->b_ioflags = bp2->bio_flags;
if (bp2->bio_error)
bp->b_ioflags |= BIO_ERROR;
bp->b_resid = bp->b_bcount - bp2->bio_completed;
bp->b_error = bp2->bio_error;
bufdone(bp);
g_destroy_bio(bp2);
}
static void
swapgeom_strategy(struct buf *bp, struct swdevt *sp)
{
struct bio *bio;
struct g_consumer *cp;
cp = sp->sw_id;
if (cp == NULL) {
bp->b_error = ENXIO;
bp->b_ioflags |= BIO_ERROR;
bufdone(bp);
return;
}
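/*
 * g_new_bio() uses M_NOWAIT and may fail, while g_alloc_bio() sleeps
 * until memory is available; presumably a failed pageout is
 * recoverable (the pages stay dirty and are retried), so only writes
 * take the fallible path.
 */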
if (bp->b_iocmd == BIO_WRITE)
bio = g_new_bio();
else
bio = g_alloc_bio();
if (bio == NULL) {
bp->b_error = ENOMEM;
bp->b_ioflags |= BIO_ERROR;
bufdone(bp);
return;
}
bio->bio_caller2 = bp;
bio->bio_cmd = bp->b_iocmd;
bio->bio_data = bp->b_data;
bio->bio_offset = (bp->b_blkno - sp->sw_first) * PAGE_SIZE;
bio->bio_length = bp->b_bcount;
bio->bio_done = swapgeom_done;
g_io_request(bio, cp);
return;
}
static void
swapgeom_orphan(struct g_consumer *cp)
{
struct swdevt *sp;
mtx_lock(&sw_dev_mtx);
TAILQ_FOREACH(sp, &swtailq, sw_list)
if (sp->sw_id == cp)
sp->sw_id = NULL;
mtx_unlock(&sw_dev_mtx);
}
static void
swapgeom_close_ev(void *arg, int flags)
{
struct g_consumer *cp;
cp = arg;
g_access(cp, -1, -1, 0);
g_detach(cp);
g_destroy_consumer(cp);
}
static void
swapgeom_close(struct thread *td, struct swdevt *sw)
{
/* XXX: direct call when Giant untangled */
g_waitfor_event(swapgeom_close_ev, sw->sw_id, M_WAITOK, NULL);
}
struct swh0h0 {
struct cdev *dev;
struct vnode *vp;
int error;
};
static void
swapongeom_ev(void *arg, int flags)
{
struct swh0h0 *swh;
struct g_provider *pp;
struct g_consumer *cp;
static struct g_geom *gp;
struct swdevt *sp;
u_long nblks;
int error;
swh = arg;
swh->error = 0;
pp = g_dev_getprovider(swh->dev);
if (pp == NULL) {
swh->error = ENODEV;
return;
}
mtx_lock(&sw_dev_mtx);
TAILQ_FOREACH(sp, &swtailq, sw_list) {
cp = sp->sw_id;
if (cp != NULL && cp->provider == pp) {
mtx_unlock(&sw_dev_mtx);
swh->error = EBUSY;
return;
}
}
mtx_unlock(&sw_dev_mtx);
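/*
 * All swap devices share a single, lazily created "swap" geom; each
 * device gets its own consumer attached below.
 */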
if (gp == NULL)
gp = g_new_geomf(&g_swap_class, "swap", NULL);
cp = g_new_consumer(gp);
g_attach(cp, pp);
/*
* XXX: Every time you think you can improve the margin for
* footshooting, somebody depends on the ability to do so:
* savecore(8) wants to write to our swapdev so we cannot
* set an exclusive count :-(
*/
error = g_access(cp, 1, 1, 0);
if (error) {
g_detach(cp);
g_destroy_consumer(cp);
swh->error = error;
return;
}
nblks = pp->mediasize / DEV_BSIZE;
swaponsomething(swh->vp, cp, nblks, swapgeom_strategy,
swapgeom_close, dev2udev(swh->dev));
swh->error = 0;
return;
}
static int
swapongeom(struct thread *td, struct vnode *vp)
{
int error;
struct swh0h0 swh;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
swh.dev = vp->v_rdev;
swh.vp = vp;
swh.error = 0;
/* XXX: direct call when Giant untangled */
error = g_waitfor_event(swapongeom_ev, &swh, M_WAITOK, NULL);
if (!error)
error = swh.error;
VOP_UNLOCK(vp, 0);
return (error);
}
/*
* VNODE backend
*
* This is used mainly for network filesystem (read: probably only tested
* with NFS) swapfiles.
*
*/
static void
swapdev_strategy(struct buf *bp, struct swdevt *sp)
{
struct vnode *vp2;
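/*
 * Make the block number device-relative and convert it from
 * page-sized units to DEV_BSIZE units for the vnode I/O below.
 */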
bp->b_blkno = ctodb(bp->b_blkno - sp->sw_first);
vp2 = sp->sw_id;
vhold(vp2);
if (bp->b_iocmd == BIO_WRITE) {
if (bp->b_bufobj)
bufobj_wdrop(bp->b_bufobj);
bufobj_wref(&vp2->v_bufobj);
}
if (bp->b_bufobj != &vp2->v_bufobj)
bp->b_bufobj = &vp2->v_bufobj;
bp->b_vp = vp2;
bp->b_iooffset = dbtob(bp->b_blkno);
bstrategy(bp);
return;
}
static void
swapdev_close(struct thread *td, struct swdevt *sp)
{
VOP_CLOSE(sp->sw_vp, FREAD | FWRITE, td->td_ucred, td);
vrele(sp->sw_vp);
}
static int
swaponvp(struct thread *td, struct vnode *vp, u_long nblks)
{
struct swdevt *sp;
int error;
if (nblks == 0)
return (ENXIO);
mtx_lock(&sw_dev_mtx);
TAILQ_FOREACH(sp, &swtailq, sw_list) {
if (sp->sw_id == vp) {
mtx_unlock(&sw_dev_mtx);
return (EBUSY);
}
}
mtx_unlock(&sw_dev_mtx);
(void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
#ifdef MAC
error = mac_system_check_swapon(td->td_ucred, vp);
if (error == 0)
#endif
error = VOP_OPEN(vp, FREAD | FWRITE, td->td_ucred, td, NULL);
(void) VOP_UNLOCK(vp, 0);
if (error)
return (error);
swaponsomething(vp, vp, nblks, swapdev_strategy, swapdev_close,
NODEV);
return (0);
}
Index: head/sys/vm/vm_mmap.c
===================================================================
--- head/sys/vm/vm_mmap.c (revision 225616)
+++ head/sys/vm/vm_mmap.c (revision 225617)
@@ -1,1574 +1,1574 @@
/*-
* Copyright (c) 1988 University of Utah.
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
*
* @(#)vm_mmap.c 8.4 (Berkeley) 1/12/94
*/
/*
* Mapped file (mmap) interface to VM
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_hwpmc_hooks.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capability.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/racct.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/sysent.h>
#include <sys/vmmeter.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif
#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
int incr;
};
#endif
static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
int *, struct vnode *, vm_ooffset_t *, vm_object_t *);
static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
int *, struct cdev *, vm_ooffset_t *, vm_object_t *);
static int vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
int *, struct shmfd *, vm_ooffset_t, vm_object_t *);
/*
* MPSAFE
*/
/* ARGSUSED */
int
-sbrk(td, uap)
+sys_sbrk(td, uap)
struct thread *td;
struct sbrk_args *uap;
{
/* Not yet implemented */
return (EOPNOTSUPP);
}
#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
int incr;
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
int
-sstk(td, uap)
+sys_sstk(td, uap)
struct thread *td;
struct sstk_args *uap;
{
/* Not yet implemented */
return (EOPNOTSUPP);
}
#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct getpagesize_args {
int dummy;
};
#endif
/* ARGSUSED */
int
ogetpagesize(td, uap)
struct thread *td;
struct getpagesize_args *uap;
{
/* MP SAFE */
td->td_retval[0] = PAGE_SIZE;
return (0);
}
#endif /* COMPAT_43 */
/*
* Memory Map (mmap) system call. Note that the file offset
* and address are allowed to be NOT page aligned, though if
* the MAP_FIXED flag is set, both must have the same remainder
* modulo the PAGE_SIZE (POSIX 1003.1b). If the address is not
* page-aligned, the actual mapping starts at trunc_page(addr)
* and the return value is adjusted up by the page offset.
*
* Generally speaking, only character devices which are themselves
* memory-based, such as a video framebuffer, can be mmap'd. Otherwise
* there would be no cache coherency between a descriptor and a VM mapping
* both to the same character device.
*/
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
void *addr;
size_t len;
int prot;
int flags;
int fd;
long pad;
off_t pos;
};
#endif
/*
* MPSAFE
*/
int
-mmap(td, uap)
+sys_mmap(td, uap)
struct thread *td;
struct mmap_args *uap;
{
#ifdef HWPMC_HOOKS
struct pmckern_map_in pkm;
#endif
struct file *fp;
struct vnode *vp;
vm_offset_t addr;
vm_size_t size, pageoff;
vm_prot_t cap_maxprot, prot, maxprot;
void *handle;
objtype_t handle_type;
int flags, error;
off_t pos;
struct vmspace *vms = td->td_proc->p_vmspace;
cap_rights_t rights;
addr = (vm_offset_t) uap->addr;
size = uap->len;
prot = uap->prot & VM_PROT_ALL;
flags = uap->flags;
pos = uap->pos;
fp = NULL;
/* Make sure mapping fits into numeric range, etc. */
if ((uap->len == 0 && !SV_CURPROC_FLAG(SV_AOUT) &&
curproc->p_osrel >= P_OSREL_MAP_ANON) ||
((flags & MAP_ANON) && (uap->fd != -1 || pos != 0)))
return (EINVAL);
if (flags & MAP_STACK) {
if ((uap->fd != -1) ||
((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
return (EINVAL);
flags |= MAP_ANON;
pos = 0;
}
/*
* Align the file position to a page boundary,
* and save its page offset component.
*/
pageoff = (pos & PAGE_MASK);
pos -= pageoff;
/* Adjust size for rounding (on both ends). */
size += pageoff; /* low end... */
size = (vm_size_t) round_page(size); /* hi end */
/*
* Check for illegal addresses. Watch out for address wrap... Note
* that VM_*_ADDRESS are not constants due to casts (argh).
*/
if (flags & MAP_FIXED) {
/*
* The specified address must have the same remainder
* as the file offset taken modulo PAGE_SIZE, so it
* should be aligned after adjustment by pageoff.
*/
addr -= pageoff;
if (addr & PAGE_MASK)
return (EINVAL);
/* Address range must be all in user VM space. */
if (addr < vm_map_min(&vms->vm_map) ||
addr + size > vm_map_max(&vms->vm_map))
return (EINVAL);
if (addr + size < addr)
return (EINVAL);
} else {
/*
* XXX for non-fixed mappings where no hint is provided or
* the hint would fall in the potential heap space,
* place it after the end of the largest possible heap.
*
* There should really be a pmap call to determine a reasonable
* location.
*/
PROC_LOCK(td->td_proc);
if (addr == 0 ||
(addr >= round_page((vm_offset_t)vms->vm_taddr) &&
addr < round_page((vm_offset_t)vms->vm_daddr +
lim_max(td->td_proc, RLIMIT_DATA))))
addr = round_page((vm_offset_t)vms->vm_daddr +
lim_max(td->td_proc, RLIMIT_DATA));
PROC_UNLOCK(td->td_proc);
}
if (flags & MAP_ANON) {
/*
* Mapping blank space is trivial.
*/
handle = NULL;
handle_type = OBJT_DEFAULT;
maxprot = VM_PROT_ALL;
cap_maxprot = VM_PROT_ALL;
} else {
/*
* Mapping file, get fp for validation and don't let the
* descriptor disappear on us if we block. Check capability
* rights, but also return the maximum rights to be combined
* with maxprot later.
*/
rights = CAP_MMAP;
if (prot & PROT_READ)
rights |= CAP_READ;
if ((flags & MAP_SHARED) != 0) {
if (prot & PROT_WRITE)
rights |= CAP_WRITE;
}
if (prot & PROT_EXEC)
rights |= CAP_MAPEXEC;
if ((error = fget_mmap(td, uap->fd, rights, &cap_maxprot,
&fp)) != 0)
goto done;
if (fp->f_type == DTYPE_SHM) {
handle = fp->f_data;
handle_type = OBJT_SWAP;
maxprot = VM_PROT_NONE;
/* FREAD should always be set. */
if (fp->f_flag & FREAD)
maxprot |= VM_PROT_EXECUTE | VM_PROT_READ;
if (fp->f_flag & FWRITE)
maxprot |= VM_PROT_WRITE;
goto map;
}
if (fp->f_type != DTYPE_VNODE) {
error = ENODEV;
goto done;
}
#if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
/*
* POSIX shared-memory objects are defined to have
* kernel persistence, and are not defined to support
* read(2)/write(2) -- or even open(2). Thus, we can
* use MAP_ASYNC to trade on-disk coherence for speed.
* The shm_open(3) library routine turns on the FPOSIXSHM
* flag to request this behavior.
*/
if (fp->f_flag & FPOSIXSHM)
flags |= MAP_NOSYNC;
#endif
vp = fp->f_vnode;
/*
* Ensure that file and memory protections are
* compatible. Note that we only worry about
* writability if mapping is shared; in this case,
* current and max prot are dictated by the open file.
* XXX use the vnode instead? Problem is: what
* credentials do we use for determination? What if
* proc does a setuid?
*/
if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC)
maxprot = VM_PROT_NONE;
else
maxprot = VM_PROT_EXECUTE;
if (fp->f_flag & FREAD) {
maxprot |= VM_PROT_READ;
} else if (prot & PROT_READ) {
error = EACCES;
goto done;
}
/*
* If we are sharing potential changes (either via
* MAP_SHARED or via the implicit sharing of character
* device mappings), and we are trying to get write
* permission although we opened it without asking
* for it, bail out.
*/
if ((flags & MAP_SHARED) != 0) {
if ((fp->f_flag & FWRITE) != 0) {
maxprot |= VM_PROT_WRITE;
} else if ((prot & PROT_WRITE) != 0) {
error = EACCES;
goto done;
}
} else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) {
maxprot |= VM_PROT_WRITE;
cap_maxprot |= VM_PROT_WRITE;
}
handle = (void *)vp;
handle_type = OBJT_VNODE;
}
map:
td->td_fpop = fp;
maxprot &= cap_maxprot;
error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
flags, handle_type, handle, pos);
td->td_fpop = NULL;
#ifdef HWPMC_HOOKS
/* inform hwpmc(4) if an executable is being mapped */
if (error == 0 && handle_type == OBJT_VNODE &&
(prot & PROT_EXEC)) {
pkm.pm_file = handle;
pkm.pm_address = (uintptr_t) addr;
PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm);
}
#endif
if (error == 0)
td->td_retval[0] = (register_t) (addr + pageoff);
done:
if (fp)
fdrop(fp, td);
return (error);
}
int
freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
{
struct mmap_args oargs;
oargs.addr = uap->addr;
oargs.len = uap->len;
oargs.prot = uap->prot;
oargs.flags = uap->flags;
oargs.fd = uap->fd;
oargs.pos = uap->pos;
- return (mmap(td, &oargs));
+ return (sys_mmap(td, &oargs));
}
#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
caddr_t addr;
int len;
int prot;
int flags;
int fd;
long pos;
};
#endif
int
ommap(td, uap)
struct thread *td;
struct ommap_args *uap;
{
struct mmap_args nargs;
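/*
 * Translate the historic 3-bit 4.3BSD protection value into modern
 * PROT_* flags; the table is indexed by the old prot argument.
 */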
static const char cvtbsdprot[8] = {
0,
PROT_EXEC,
PROT_WRITE,
PROT_EXEC | PROT_WRITE,
PROT_READ,
PROT_EXEC | PROT_READ,
PROT_WRITE | PROT_READ,
PROT_EXEC | PROT_WRITE | PROT_READ,
};
#define OMAP_ANON 0x0002
#define OMAP_COPY 0x0020
#define OMAP_SHARED 0x0010
#define OMAP_FIXED 0x0100
nargs.addr = uap->addr;
nargs.len = uap->len;
nargs.prot = cvtbsdprot[uap->prot & 0x7];
nargs.flags = 0;
if (uap->flags & OMAP_ANON)
nargs.flags |= MAP_ANON;
if (uap->flags & OMAP_COPY)
nargs.flags |= MAP_COPY;
if (uap->flags & OMAP_SHARED)
nargs.flags |= MAP_SHARED;
else
nargs.flags |= MAP_PRIVATE;
if (uap->flags & OMAP_FIXED)
nargs.flags |= MAP_FIXED;
nargs.fd = uap->fd;
nargs.pos = uap->pos;
- return (mmap(td, &nargs));
+ return (sys_mmap(td, &nargs));
}
#endif /* COMPAT_43 */
#ifndef _SYS_SYSPROTO_H_
struct msync_args {
void *addr;
size_t len;
int flags;
};
#endif
/*
* MPSAFE
*/
int
-msync(td, uap)
+sys_msync(td, uap)
struct thread *td;
struct msync_args *uap;
{
vm_offset_t addr;
vm_size_t size, pageoff;
int flags;
vm_map_t map;
int rv;
addr = (vm_offset_t) uap->addr;
size = uap->len;
flags = uap->flags;
pageoff = (addr & PAGE_MASK);
addr -= pageoff;
size += pageoff;
size = (vm_size_t) round_page(size);
if (addr + size < addr)
return (EINVAL);
if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
return (EINVAL);
map = &td->td_proc->p_vmspace->vm_map;
/*
* Clean the pages and interpret the return value.
*/
rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
(flags & MS_INVALIDATE) != 0);
switch (rv) {
case KERN_SUCCESS:
return (0);
case KERN_INVALID_ADDRESS:
return (EINVAL); /* Sun returns ENOMEM? */
case KERN_INVALID_ARGUMENT:
return (EBUSY);
default:
return (EINVAL);
}
}
#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
void *addr;
size_t len;
};
#endif
/*
* MPSAFE
*/
int
-munmap(td, uap)
+sys_munmap(td, uap)
struct thread *td;
struct munmap_args *uap;
{
#ifdef HWPMC_HOOKS
struct pmckern_map_out pkm;
vm_map_entry_t entry;
#endif
vm_offset_t addr;
vm_size_t size, pageoff;
vm_map_t map;
addr = (vm_offset_t) uap->addr;
size = uap->len;
if (size == 0)
return (EINVAL);
pageoff = (addr & PAGE_MASK);
addr -= pageoff;
size += pageoff;
size = (vm_size_t) round_page(size);
if (addr + size < addr)
return (EINVAL);
/*
* Check for illegal addresses. Watch out for address wrap...
*/
map = &td->td_proc->p_vmspace->vm_map;
if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
return (EINVAL);
vm_map_lock(map);
#ifdef HWPMC_HOOKS
/*
* Inform hwpmc if the address range being unmapped contains
* an executable region.
*/
pkm.pm_address = (uintptr_t) NULL;
if (vm_map_lookup_entry(map, addr, &entry)) {
for (;
entry != &map->header && entry->start < addr + size;
entry = entry->next) {
if (vm_map_check_protection(map, entry->start,
entry->end, VM_PROT_EXECUTE) == TRUE) {
pkm.pm_address = (uintptr_t) addr;
pkm.pm_size = (size_t) size;
break;
}
}
}
#endif
vm_map_delete(map, addr, addr + size);
#ifdef HWPMC_HOOKS
/* downgrade the lock to prevent a LOR with the pmc-sx lock */
vm_map_lock_downgrade(map);
if (pkm.pm_address != (uintptr_t) NULL)
PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
vm_map_unlock_read(map);
#else
vm_map_unlock(map);
#endif
/* vm_map_delete returns nothing but KERN_SUCCESS anyway */
return (0);
}
#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
const void *addr;
size_t len;
int prot;
};
#endif
/*
* MPSAFE
*/
int
-mprotect(td, uap)
+sys_mprotect(td, uap)
struct thread *td;
struct mprotect_args *uap;
{
vm_offset_t addr;
vm_size_t size, pageoff;
vm_prot_t prot;
addr = (vm_offset_t) uap->addr;
size = uap->len;
prot = uap->prot & VM_PROT_ALL;
pageoff = (addr & PAGE_MASK);
addr -= pageoff;
size += pageoff;
size = (vm_size_t) round_page(size);
if (addr + size < addr)
return (EINVAL);
switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
addr + size, prot, FALSE)) {
case KERN_SUCCESS:
return (0);
case KERN_PROTECTION_FAILURE:
return (EACCES);
case KERN_RESOURCE_SHORTAGE:
return (ENOMEM);
}
return (EINVAL);
}
#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
void *addr;
size_t len;
int inherit;
};
#endif
/*
* MPSAFE
*/
int
-minherit(td, uap)
+sys_minherit(td, uap)
struct thread *td;
struct minherit_args *uap;
{
vm_offset_t addr;
vm_size_t size, pageoff;
vm_inherit_t inherit;
addr = (vm_offset_t)uap->addr;
size = uap->len;
inherit = uap->inherit;
pageoff = (addr & PAGE_MASK);
addr -= pageoff;
size += pageoff;
size = (vm_size_t) round_page(size);
if (addr + size < addr)
return (EINVAL);
switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
addr + size, inherit)) {
case KERN_SUCCESS:
return (0);
case KERN_PROTECTION_FAILURE:
return (EACCES);
}
return (EINVAL);
}
#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
void *addr;
size_t len;
int behav;
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
int
-madvise(td, uap)
+sys_madvise(td, uap)
struct thread *td;
struct madvise_args *uap;
{
vm_offset_t start, end;
vm_map_t map;
struct proc *p;
int error;
/*
* Check for our special case, advising the swap pager we are
* "immortal."
*/
if (uap->behav == MADV_PROTECT) {
error = priv_check(td, PRIV_VM_MADV_PROTECT);
if (error == 0) {
p = td->td_proc;
PROC_LOCK(p);
p->p_flag |= P_PROTECTED;
PROC_UNLOCK(p);
}
return (error);
}
/*
* Check for illegal behavior
*/
if (uap->behav < 0 || uap->behav > MADV_CORE)
return (EINVAL);
/*
* Check for illegal addresses. Watch out for address wrap... Note
* that VM_*_ADDRESS are not constants due to casts (argh).
*/
map = &td->td_proc->p_vmspace->vm_map;
if ((vm_offset_t)uap->addr < vm_map_min(map) ||
(vm_offset_t)uap->addr + uap->len > vm_map_max(map))
return (EINVAL);
if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
return (EINVAL);
/*
* Since this routine is only advisory, we default to conservative
* behavior.
*/
start = trunc_page((vm_offset_t) uap->addr);
end = round_page((vm_offset_t) uap->addr + uap->len);
if (vm_map_madvise(map, start, end, uap->behav))
return (EINVAL);
return (0);
}
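/*
 * Illustrative sketch, not part of the original change: ordinary
 * advice is passed straight to vm_map_madvise(), e.g.
 *
 *	(void)madvise(p, len, MADV_DONTNEED);
 *
 * MADV_PROTECT never reaches vm_map_madvise(): it requires the
 * PRIV_VM_MADV_PROTECT privilege and only sets P_PROTECTED on the
 * calling process, which exempts it from the out-of-swap process
 * killer.
 */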
#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
const void *addr;
size_t len;
char *vec;
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
int
-mincore(td, uap)
+sys_mincore(td, uap)
struct thread *td;
struct mincore_args *uap;
{
vm_offset_t addr, first_addr;
vm_offset_t end, cend;
pmap_t pmap;
vm_map_t map;
char *vec;
int error = 0;
int vecindex, lastvecindex;
vm_map_entry_t current;
vm_map_entry_t entry;
vm_object_t object;
vm_paddr_t locked_pa;
vm_page_t m;
vm_pindex_t pindex;
int mincoreinfo;
unsigned int timestamp;
boolean_t locked;
/*
* Make sure that the addresses presented are valid for user
* mode.
*/
first_addr = addr = trunc_page((vm_offset_t) uap->addr);
end = addr + (vm_size_t)round_page(uap->len);
map = &td->td_proc->p_vmspace->vm_map;
if (end > vm_map_max(map) || end < addr)
return (ENOMEM);
/*
* Address of byte vector
*/
vec = uap->vec;
pmap = vmspace_pmap(td->td_proc->p_vmspace);
vm_map_lock_read(map);
RestartScan:
timestamp = map->timestamp;
if (!vm_map_lookup_entry(map, addr, &entry)) {
vm_map_unlock_read(map);
return (ENOMEM);
}
/*
* Do this on a map entry basis so that if the pages are not
* in the current process's address space, we can easily look
* up the pages elsewhere.
*/
lastvecindex = -1;
for (current = entry;
(current != &map->header) && (current->start < end);
current = current->next) {
/*
* check for contiguity
*/
if (current->end < end &&
(entry->next == &map->header ||
current->next->start > current->end)) {
vm_map_unlock_read(map);
return (ENOMEM);
}
/*
* ignore submaps (for now) or null objects
*/
if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
current->object.vm_object == NULL)
continue;
/*
* limit this scan to the current map entry and the
* limits for the mincore call
*/
if (addr < current->start)
addr = current->start;
cend = current->end;
if (cend > end)
cend = end;
/*
* scan this entry one page at a time
*/
while (addr < cend) {
/*
* Check pmap first, it is likely faster, also
* it can provide info as to whether we are the
* one referencing or modifying the page.
*/
object = NULL;
locked_pa = 0;
retry:
m = NULL;
mincoreinfo = pmap_mincore(pmap, addr, &locked_pa);
if (locked_pa != 0) {
/*
* The page is mapped by this process but not
* both accessed and modified. It is also
* managed. Acquire the object lock so that
* other mappings might be examined.
*/
m = PHYS_TO_VM_PAGE(locked_pa);
if (m->object != object) {
if (object != NULL)
VM_OBJECT_UNLOCK(object);
object = m->object;
locked = VM_OBJECT_TRYLOCK(object);
vm_page_unlock(m);
if (!locked) {
VM_OBJECT_LOCK(object);
vm_page_lock(m);
goto retry;
}
} else
vm_page_unlock(m);
KASSERT(m->valid == VM_PAGE_BITS_ALL,
("mincore: page %p is mapped but invalid",
m));
} else if (mincoreinfo == 0) {
/*
* The page is not mapped by this process. If
* the object implements managed pages, then
* determine if the page is resident so that
* the mappings might be examined.
*/
if (current->object.vm_object != object) {
if (object != NULL)
VM_OBJECT_UNLOCK(object);
object = current->object.vm_object;
VM_OBJECT_LOCK(object);
}
if (object->type == OBJT_DEFAULT ||
object->type == OBJT_SWAP ||
object->type == OBJT_VNODE) {
pindex = OFF_TO_IDX(current->offset +
(addr - current->start));
m = vm_page_lookup(object, pindex);
if (m != NULL && m->valid == 0)
m = NULL;
if (m != NULL)
mincoreinfo = MINCORE_INCORE;
}
}
if (m != NULL) {
/* Examine other mappings to the page. */
if (m->dirty == 0 && pmap_is_modified(m))
vm_page_dirty(m);
if (m->dirty != 0)
mincoreinfo |= MINCORE_MODIFIED_OTHER;
/*
* The first test for PGA_REFERENCED is an
* optimization. The second test is
* required because a concurrent pmap
* operation could clear the last reference
* and set PGA_REFERENCED before the call to
* pmap_is_referenced().
*/
if ((m->aflags & PGA_REFERENCED) != 0 ||
pmap_is_referenced(m) ||
(m->aflags & PGA_REFERENCED) != 0)
mincoreinfo |= MINCORE_REFERENCED_OTHER;
}
if (object != NULL)
VM_OBJECT_UNLOCK(object);
/*
* subyte may page fault. In case it needs to modify
* the map, we release the lock.
*/
vm_map_unlock_read(map);
/*
* calculate index into user supplied byte vector
*/
vecindex = OFF_TO_IDX(addr - first_addr);
/*
* If we have skipped map entries, we need to make sure that
* the byte vector is zeroed for those skipped entries.
*/
while ((lastvecindex + 1) < vecindex) {
error = subyte(vec + lastvecindex, 0);
if (error) {
error = EFAULT;
goto done2;
}
++lastvecindex;
}
/*
* Pass the page information to the user
*/
error = subyte(vec + vecindex, mincoreinfo);
if (error) {
error = EFAULT;
goto done2;
}
/*
* If the map has changed, due to the subyte, the previous
* output may be invalid.
*/
vm_map_lock_read(map);
if (timestamp != map->timestamp)
goto RestartScan;
lastvecindex = vecindex;
addr += PAGE_SIZE;
}
}
/*
* subyte may page fault. In case it needs to modify
* the map, we release the lock.
*/
vm_map_unlock_read(map);
/*
* Zero the last entries in the byte vector.
*/
vecindex = OFF_TO_IDX(end - first_addr);
while ((lastvecindex + 1) < vecindex) {
error = subyte(vec + lastvecindex, 0);
if (error) {
error = EFAULT;
goto done2;
}
++lastvecindex;
}
/*
* If the map has changed, due to the subyte, the previous
* output may be invalid.
*/
vm_map_lock_read(map);
if (timestamp != map->timestamp)
goto RestartScan;
vm_map_unlock_read(map);
done2:
return (error);
}
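/*
 * Illustrative sketch, not part of the original change: the vector
 * filled in with subyte() above holds one status byte per page, so a
 * caller sizes it accordingly:
 *
 *	size_t npages = (len + PAGE_SIZE - 1) / PAGE_SIZE;
 *	char *vec = malloc(npages);
 *
 *	if (mincore(p, len, vec) == 0 && (vec[0] & MINCORE_INCORE))
 *		printf("first page is resident\n");
 */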
#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
const void *addr;
size_t len;
};
#endif
/*
* MPSAFE
*/
int
-mlock(td, uap)
+sys_mlock(td, uap)
struct thread *td;
struct mlock_args *uap;
{
struct proc *proc;
vm_offset_t addr, end, last, start;
vm_size_t npages, size;
unsigned long nsize;
int error;
error = priv_check(td, PRIV_VM_MLOCK);
if (error)
return (error);
addr = (vm_offset_t)uap->addr;
size = uap->len;
last = addr + size;
start = trunc_page(addr);
end = round_page(last);
if (last < addr || end < addr)
return (EINVAL);
npages = atop(end - start);
if (npages > vm_page_max_wired)
return (ENOMEM);
proc = td->td_proc;
PROC_LOCK(proc);
nsize = ptoa(npages +
pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map)));
if (nsize > lim_cur(proc, RLIMIT_MEMLOCK)) {
PROC_UNLOCK(proc);
return (ENOMEM);
}
PROC_UNLOCK(proc);
if (npages + cnt.v_wire_count > vm_page_max_wired)
return (EAGAIN);
#ifdef RACCT
PROC_LOCK(proc);
error = racct_set(proc, RACCT_MEMLOCK, nsize);
PROC_UNLOCK(proc);
if (error != 0)
return (ENOMEM);
#endif
error = vm_map_wire(&proc->p_vmspace->vm_map, start, end,
VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
if (error != KERN_SUCCESS) {
PROC_LOCK(proc);
racct_set(proc, RACCT_MEMLOCK,
ptoa(pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map))));
PROC_UNLOCK(proc);
}
#endif
return (error == KERN_SUCCESS ? 0 : ENOMEM);
}
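/*
 * Illustrative sketch, not part of the original change: a caller
 * wiring a buffer for predictable latency does
 *
 *	if (mlock(buf, buflen) == -1)
 *		err(1, "mlock");
 *
 * The request must pass the PRIV_VM_MLOCK check above, stay within
 * RLIMIT_MEMLOCK and vm_page_max_wired, and is rounded out to whole
 * pages before vm_map_wire() is called.
 */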
#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
int how;
};
#endif
/*
* MPSAFE
*/
int
-mlockall(td, uap)
+sys_mlockall(td, uap)
struct thread *td;
struct mlockall_args *uap;
{
vm_map_t map;
int error;
map = &td->td_proc->p_vmspace->vm_map;
error = 0;
if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
return (EINVAL);
#if 0
/*
* If wiring all pages in the process would cause it to exceed
* a hard resource limit, return ENOMEM.
*/
PROC_LOCK(td->td_proc);
if (map->size > lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
PROC_UNLOCK(td->td_proc);
return (ENOMEM);
}
PROC_UNLOCK(td->td_proc);
#else
error = priv_check(td, PRIV_VM_MLOCK);
if (error)
return (error);
#endif
#ifdef RACCT
PROC_LOCK(td->td_proc);
error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
PROC_UNLOCK(td->td_proc);
if (error != 0)
return (ENOMEM);
#endif
if (uap->how & MCL_FUTURE) {
vm_map_lock(map);
vm_map_modflags(map, MAP_WIREFUTURE, 0);
vm_map_unlock(map);
error = 0;
}
if (uap->how & MCL_CURRENT) {
/*
* P1003.1-2001 mandates that all currently mapped pages
* will be memory resident and locked (wired) upon return
* from mlockall(). vm_map_wire() will wire pages, by
* calling vm_fault_wire() for each page in the region.
*/
error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
error = (error == KERN_SUCCESS ? 0 : EAGAIN);
}
#ifdef RACCT
if (error != KERN_SUCCESS) {
PROC_LOCK(td->td_proc);
racct_set(td->td_proc, RACCT_MEMLOCK,
ptoa(pmap_wired_count(vm_map_pmap(&td->td_proc->p_vmspace->vm_map))));
PROC_UNLOCK(td->td_proc);
}
#endif
return (error);
}
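/*
 * Illustrative sketch, not part of the original change: a realtime
 * process usually wires everything it has and everything it will map,
 *
 *	if (mlockall(MCL_CURRENT | MCL_FUTURE) == -1)
 *		err(1, "mlockall");
 *
 * MCL_FUTURE only sets MAP_WIREFUTURE on the map here; the wiring of
 * later mappings is performed by vm_mmap() when it sees that flag.
 */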
#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
register_t dummy;
};
#endif
/*
* MPSAFE
*/
int
-munlockall(td, uap)
+sys_munlockall(td, uap)
struct thread *td;
struct munlockall_args *uap;
{
vm_map_t map;
int error;
map = &td->td_proc->p_vmspace->vm_map;
error = priv_check(td, PRIV_VM_MUNLOCK);
if (error)
return (error);
/* Clear the MAP_WIREFUTURE flag from this vm_map. */
vm_map_lock(map);
vm_map_modflags(map, 0, MAP_WIREFUTURE);
vm_map_unlock(map);
/* Forcibly unwire all pages. */
error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
#ifdef RACCT
if (error == KERN_SUCCESS) {
PROC_LOCK(td->td_proc);
racct_set(td->td_proc, RACCT_MEMLOCK, 0);
PROC_UNLOCK(td->td_proc);
}
#endif
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
const void *addr;
size_t len;
};
#endif
/*
* MPSAFE
*/
int
-munlock(td, uap)
+sys_munlock(td, uap)
struct thread *td;
struct munlock_args *uap;
{
vm_offset_t addr, end, last, start;
vm_size_t size;
int error;
error = priv_check(td, PRIV_VM_MUNLOCK);
if (error)
return (error);
addr = (vm_offset_t)uap->addr;
size = uap->len;
last = addr + size;
start = trunc_page(addr);
end = round_page(last);
if (last < addr || end < addr)
return (EINVAL);
error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
if (error == KERN_SUCCESS) {
PROC_LOCK(td->td_proc);
racct_sub(td->td_proc, RACCT_MEMLOCK, ptoa(end - start));
PROC_UNLOCK(td->td_proc);
}
#endif
return (error == KERN_SUCCESS ? 0 : ENOMEM);
}
/*
* vm_mmap_vnode()
*
* MPSAFE
*
* Helper function for vm_mmap. Perform sanity check specific for mmap
* operations on vnodes.
*/
int
vm_mmap_vnode(struct thread *td, vm_size_t objsize,
vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp)
{
struct vattr va;
vm_object_t obj;
vm_offset_t foff;
struct mount *mp;
struct ucred *cred;
int error, flags;
int vfslocked;
mp = vp->v_mount;
cred = td->td_ucred;
vfslocked = VFS_LOCK_GIANT(mp);
if ((error = vget(vp, LK_SHARED, td)) != 0) {
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
foff = *foffp;
flags = *flagsp;
obj = vp->v_object;
if (vp->v_type == VREG) {
/*
* Get the proper underlying object
*/
if (obj == NULL) {
error = EINVAL;
goto done;
}
if (obj->handle != vp) {
vput(vp);
vp = (struct vnode*)obj->handle;
vget(vp, LK_SHARED, td);
}
} else if (vp->v_type == VCHR) {
error = vm_mmap_cdev(td, objsize, prot, maxprotp, flagsp,
vp->v_rdev, foffp, objp);
if (error == 0)
goto mark_atime;
goto done;
} else {
error = EINVAL;
goto done;
}
if ((error = VOP_GETATTR(vp, &va, cred)))
goto done;
#ifdef MAC
error = mac_vnode_check_mmap(cred, vp, prot, flags);
if (error != 0)
goto done;
#endif
if ((flags & MAP_SHARED) != 0) {
if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
if (prot & PROT_WRITE) {
error = EPERM;
goto done;
}
*maxprotp &= ~VM_PROT_WRITE;
}
}
/*
* If it is a regular file without any references
* we do not need to sync it.
* Adjust object size to be the size of the actual file.
*/
objsize = round_page(va.va_size);
if (va.va_nlink == 0)
flags |= MAP_NOSYNC;
obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff, td->td_ucred);
if (obj == NULL) {
error = ENOMEM;
goto done;
}
*objp = obj;
*flagsp = flags;
mark_atime:
vfs_mark_atime(vp, cred);
done:
vput(vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* vm_mmap_cdev()
*
* MPSAFE
*
* Helper function for vm_mmap. Perform sanity check specific for mmap
* operations on cdevs.
*/
int
vm_mmap_cdev(struct thread *td, vm_size_t objsize,
vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
struct cdev *cdev, vm_ooffset_t *foff, vm_object_t *objp)
{
vm_object_t obj;
struct cdevsw *dsw;
int error, flags, ref;
flags = *flagsp;
dsw = dev_refthread(cdev, &ref);
if (dsw == NULL)
return (ENXIO);
if (dsw->d_flags & D_MMAP_ANON) {
dev_relthread(cdev, ref);
*maxprotp = VM_PROT_ALL;
*flagsp |= MAP_ANON;
return (0);
}
/*
* cdevs do not provide private mappings of any kind.
*/
if ((*maxprotp & VM_PROT_WRITE) == 0 &&
(prot & PROT_WRITE) != 0) {
dev_relthread(cdev, ref);
return (EACCES);
}
if (flags & (MAP_PRIVATE|MAP_COPY)) {
dev_relthread(cdev, ref);
return (EINVAL);
}
/*
* Force device mappings to be shared.
*/
flags |= MAP_SHARED;
#ifdef MAC_XXX
error = mac_cdev_check_mmap(td->td_ucred, cdev, prot);
if (error != 0) {
dev_relthread(cdev, ref);
return (error);
}
#endif
/*
* First, try d_mmap_single(). If that is not implemented
* (returns ENODEV), fall back to using the device pager.
* Note that d_mmap_single() must return a reference to the
* object (it needs to bump the reference count of the object
* it returns somehow).
*
* XXX assumes VM_PROT_* == PROT_*
*/
error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
dev_relthread(cdev, ref);
if (error != ENODEV)
return (error);
obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
td->td_ucred);
if (obj == NULL)
return (EINVAL);
*objp = obj;
*flagsp = flags;
return (0);
}
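/*
 * Illustrative sketch with a hypothetical driver, not part of the
 * original change: a character device that simply wants anonymous
 * memory behind its mappings advertises D_MMAP_ANON in its cdevsw,
 *
 *	static struct cdevsw foo_cdevsw = {
 *		.d_version = D_VERSION,
 *		.d_flags = D_MMAP_ANON,
 *		.d_name = "foo",
 *	};
 *
 * and the D_MMAP_ANON branch above then turns the request into an
 * ordinary MAP_ANON mapping with VM_PROT_ALL.
 */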
/*
* vm_mmap_shm()
*
* MPSAFE
*
* Helper function for vm_mmap. Perform sanity check specific for mmap
* operations on shm file descriptors.
*/
int
vm_mmap_shm(struct thread *td, vm_size_t objsize,
vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
struct shmfd *shmfd, vm_ooffset_t foff, vm_object_t *objp)
{
int error;
if ((*flagsp & MAP_SHARED) != 0 &&
(*maxprotp & VM_PROT_WRITE) == 0 &&
(prot & PROT_WRITE) != 0)
return (EACCES);
#ifdef MAC
error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, *flagsp);
if (error != 0)
return (error);
#endif
error = shm_mmap(shmfd, objsize, foff, objp);
if (error)
return (error);
return (0);
}
/*
* vm_mmap()
*
* MPSAFE
*
* Internal version of mmap. Currently used by mmap, exec, and sys5
* shared memory. Handle is either a vnode pointer or NULL for MAP_ANON.
*/
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
vm_prot_t maxprot, int flags,
objtype_t handle_type, void *handle,
vm_ooffset_t foff)
{
boolean_t fitit;
vm_object_t object = NULL;
int rv = KERN_SUCCESS;
int docow, error;
struct thread *td = curthread;
if (size == 0)
return (0);
size = round_page(size);
PROC_LOCK(td->td_proc);
if (td->td_proc->p_vmspace->vm_map.size + size >
lim_cur(td->td_proc, RLIMIT_VMEM)) {
PROC_UNLOCK(td->td_proc);
return (ENOMEM);
}
if (racct_set(td->td_proc, RACCT_VMEM,
td->td_proc->p_vmspace->vm_map.size + size)) {
PROC_UNLOCK(td->td_proc);
return (ENOMEM);
}
PROC_UNLOCK(td->td_proc);
/*
* We currently can only deal with page aligned file offsets.
* The check is here rather than in the syscall because the
* kernel calls this function internally for other mmapping
* operations (such as in exec) and non-aligned offsets will
* cause pmap inconsistencies...so we want to be sure to
* disallow this in all cases.
*/
if (foff & PAGE_MASK)
return (EINVAL);
if ((flags & MAP_FIXED) == 0) {
fitit = TRUE;
*addr = round_page(*addr);
} else {
if (*addr != trunc_page(*addr))
return (EINVAL);
fitit = FALSE;
}
/*
* Lookup/allocate object.
*/
switch (handle_type) {
case OBJT_DEVICE:
error = vm_mmap_cdev(td, size, prot, &maxprot, &flags,
handle, &foff, &object);
break;
case OBJT_VNODE:
error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
handle, &foff, &object);
break;
case OBJT_SWAP:
error = vm_mmap_shm(td, size, prot, &maxprot, &flags,
handle, foff, &object);
break;
case OBJT_DEFAULT:
if (handle == NULL) {
error = 0;
break;
}
/* FALLTHROUGH */
default:
error = EINVAL;
break;
}
if (error)
return (error);
if (flags & MAP_ANON) {
object = NULL;
docow = 0;
/*
* Unnamed anonymous regions always start at 0.
*/
if (handle == 0)
foff = 0;
} else if (flags & MAP_PREFAULT_READ)
docow = MAP_PREFAULT;
else
docow = MAP_PREFAULT_PARTIAL;
if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
docow |= MAP_COPY_ON_WRITE;
if (flags & MAP_NOSYNC)
docow |= MAP_DISABLE_SYNCER;
if (flags & MAP_NOCORE)
docow |= MAP_DISABLE_COREDUMP;
if (flags & MAP_STACK)
rv = vm_map_stack(map, *addr, size, prot, maxprot,
docow | MAP_STACK_GROWS_DOWN);
else if (fitit)
rv = vm_map_find(map, object, foff, addr, size,
object != NULL && object->type == OBJT_DEVICE ?
VMFS_ALIGNED_SPACE : VMFS_ANY_SPACE, prot, maxprot, docow);
else
rv = vm_map_fixed(map, object, foff, *addr, size,
prot, maxprot, docow);
if (rv != KERN_SUCCESS) {
/*
* Lose the object reference. Will destroy the
* object if it's an unnamed anonymous mapping
* or named anonymous without other references.
*/
vm_object_deallocate(object);
} else if (flags & MAP_SHARED) {
/*
* Shared memory is also shared with children.
*/
rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE);
if (rv != KERN_SUCCESS)
(void) vm_map_remove(map, *addr, *addr + size);
}
/*
* If the process has requested that all future mappings
* be wired, then heed this.
*/
if ((rv == KERN_SUCCESS) && (map->flags & MAP_WIREFUTURE))
vm_map_wire(map, *addr, *addr + size,
VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);
return (vm_mmap_to_errno(rv));
}
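/*
 * Illustrative trace, not part of the original change: for a typical
 * private file mapping, mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0),
 * the code above ends up with fitit == TRUE, an OBJT_VNODE-backed
 * object from vm_mmap_vnode(), and
 *
 *	docow = MAP_PREFAULT_PARTIAL | MAP_COPY_ON_WRITE;
 *
 * so placement is chosen by vm_map_find() with VMFS_ANY_SPACE and the
 * pages are copy-on-write with respect to the file.
 */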
int
vm_mmap_to_errno(int rv)
{
switch (rv) {
case KERN_SUCCESS:
return (0);
case KERN_INVALID_ADDRESS:
case KERN_NO_SPACE:
return (ENOMEM);
case KERN_PROTECTION_FAILURE:
return (EACCES);
default:
return (EINVAL);
}
}
Index: head/sys/vm/vm_unix.c
===================================================================
--- head/sys/vm/vm_unix.c (revision 225616)
+++ head/sys/vm/vm_unix.c (revision 225617)
@@ -1,207 +1,207 @@
/*-
* Copyright (c) 1988 University of Utah.
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Utah $Hdr: vm_unix.c 1.1 89/11/07$
*
* @(#)vm_unix.c 8.1 (Berkeley) 6/11/93
*/
/*
* Traditional sbrk/grow interface to VM
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#ifndef _SYS_SYSPROTO_H_
struct obreak_args {
char *nsize;
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
int
-obreak(td, uap)
+sys_obreak(td, uap)
struct thread *td;
struct obreak_args *uap;
{
struct vmspace *vm = td->td_proc->p_vmspace;
vm_offset_t new, old, base;
rlim_t datalim, vmemlim;
int rv;
int error = 0;
boolean_t do_map_wirefuture;
PROC_LOCK(td->td_proc);
datalim = lim_cur(td->td_proc, RLIMIT_DATA);
vmemlim = lim_cur(td->td_proc, RLIMIT_VMEM);
PROC_UNLOCK(td->td_proc);
do_map_wirefuture = FALSE;
new = round_page((vm_offset_t)uap->nsize);
vm_map_lock(&vm->vm_map);
base = round_page((vm_offset_t) vm->vm_daddr);
old = base + ctob(vm->vm_dsize);
if (new > base) {
/*
* Check the resource limit, but allow a process to reduce
* its usage, even if it remains over the limit.
*/
if (new - base > datalim && new > old) {
error = ENOMEM;
goto done;
}
if (new > vm_map_max(&vm->vm_map)) {
error = ENOMEM;
goto done;
}
} else if (new < base) {
/*
* This is simply an invalid value. If someone wants to
* do fancy address space manipulations, mmap and munmap
* can do most of what the user would want.
*/
error = EINVAL;
goto done;
}
if (new > old) {
if (vm->vm_map.size + (new - old) > vmemlim) {
error = ENOMEM;
goto done;
}
#ifdef RACCT
PROC_LOCK(td->td_proc);
error = racct_set(td->td_proc, RACCT_DATA, new - base);
if (error != 0) {
PROC_UNLOCK(td->td_proc);
error = ENOMEM;
goto done;
}
error = racct_set(td->td_proc, RACCT_VMEM,
vm->vm_map.size + (new - old));
if (error != 0) {
racct_set_force(td->td_proc, RACCT_DATA, old - base);
PROC_UNLOCK(td->td_proc);
error = ENOMEM;
goto done;
}
PROC_UNLOCK(td->td_proc);
#endif
rv = vm_map_insert(&vm->vm_map, NULL, 0, old, new,
VM_PROT_RW, VM_PROT_ALL, 0);
if (rv != KERN_SUCCESS) {
#ifdef RACCT
PROC_LOCK(td->td_proc);
racct_set_force(td->td_proc, RACCT_DATA, old - base);
racct_set_force(td->td_proc, RACCT_VMEM, vm->vm_map.size);
PROC_UNLOCK(td->td_proc);
#endif
error = ENOMEM;
goto done;
}
vm->vm_dsize += btoc(new - old);
/*
* Handle the MAP_WIREFUTURE case for legacy applications,
* by marking the newly mapped range of pages as wired.
* We are not required to perform a corresponding
* vm_map_unwire() before vm_map_delete() below, as
* it will forcibly unwire the pages in the range.
*
* XXX If the pages cannot be wired, no error is returned.
*/
if ((vm->vm_map.flags & MAP_WIREFUTURE) == MAP_WIREFUTURE) {
if (bootverbose)
printf("obreak: MAP_WIREFUTURE set\n");
do_map_wirefuture = TRUE;
}
} else if (new < old) {
rv = vm_map_delete(&vm->vm_map, new, old);
if (rv != KERN_SUCCESS) {
error = ENOMEM;
goto done;
}
vm->vm_dsize -= btoc(old - new);
#ifdef RACCT
PROC_LOCK(td->td_proc);
racct_set_force(td->td_proc, RACCT_DATA, new - base);
racct_set_force(td->td_proc, RACCT_VMEM, vm->vm_map.size);
PROC_UNLOCK(td->td_proc);
#endif
}
done:
vm_map_unlock(&vm->vm_map);
if (do_map_wirefuture)
(void) vm_map_wire(&vm->vm_map, old, new,
VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);
return (error);
}
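/*
 * Illustrative sketch, not part of the original change: this is the
 * kernel side of the traditional break(2) interface, so a libc
 * sbrk(incr) call amounts to roughly
 *
 *	old = current break;
 *	break(old + incr);
 *
 * with the data segment limit (RLIMIT_DATA) and vm_dsize accounting
 * handled here rather than in userland.
 */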
#ifndef _SYS_SYSPROTO_H_
struct ovadvise_args {
int anom;
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
int
-ovadvise(td, uap)
+sys_ovadvise(td, uap)
struct thread *td;
struct ovadvise_args *uap;
{
/* START_GIANT_OPTIONAL */
/* END_GIANT_OPTIONAL */
return (EINVAL);
}
