Index: head/sys/alpha/osf1/osf1_misc.c =================================================================== --- head/sys/alpha/osf1/osf1_misc.c (revision 89305) +++ head/sys/alpha/osf1/osf1_misc.c (revision 89306) @@ -1,1822 +1,1821 @@ /* $NetBSD: osf1_misc.c,v 1.14 1998/05/20 16:34:29 chs Exp $ */ /* * Copyright (c) 1994, 1995 Carnegie-Mellon University. * All rights reserved. * * Author: Chris G. Demetriou * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. 
*/ /* * Additional Copyright (c) 1999 by Andrew Gallatin * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include /* Must come after sys/malloc.h */ #include #include #include #include #include #include #include #include #include #include #include #include /* Must come after sys/selinfo.h */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void cvtstat2osf1 __P((struct stat *, struct osf1_stat *)); static int osf2bsd_pathconf __P((int *)); static const char osf1_emul_path[] = "/compat/osf1"; /* * [ taken from the linux emulator ] * Search an alternate path before passing pathname arguments on * to system calls. Useful for keeping a separate 'emulation tree'. * * If cflag is set, we check if an attempt can be made to create * the named file, i.e. we check if the directory it should * be in exists. */ int osf1_emul_find(td, sgp, prefix, path, pbuf, cflag) struct thread *td; caddr_t *sgp; /* Pointer to stackgap memory */ const char *prefix; char *path; char **pbuf; int cflag; { int error; size_t len, sz; char *buf, *cp, *ptr; struct ucred *ucred; struct nameidata nd; struct nameidata ndroot; struct vattr vat; struct vattr vatroot; buf = (char *) malloc(MAXPATHLEN, M_TEMP, M_WAITOK); *pbuf = path; for (ptr = buf; (*ptr = *prefix) != '\0'; ptr++, prefix++) continue; sz = MAXPATHLEN - (ptr - buf); /* * If sgp is not given then the path is already in kernel space */ if (sgp == NULL) error = copystr(path, ptr, sz, &len); else error = copyinstr(path, ptr, sz, &len); if (error) { free(buf, M_TEMP); return error; } if (*ptr != '/') { free(buf, M_TEMP); return EINVAL; } /* * We know that there is a / somewhere in this pathname. 
* Search backwards for it, to find the file's parent dir * to see if it exists in the alternate tree. If it does, * and we want to create a file (cflag is set). We don't * need to worry about the root comparison in this case. */ if (cflag) { for (cp = &ptr[len] - 1; *cp != '/'; cp--) ; *cp = '\0'; NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, buf, td); if ((error = namei(&nd)) != 0) { free(buf, M_TEMP); return error; } *cp = '/'; } else { NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, buf, td); if ((error = namei(&nd)) != 0) { free(buf, M_TEMP); return error; } /* * We now compare the vnode of the osf1_root to the one * vnode asked. If they resolve to be the same, then we * ignore the match so that the real root gets used. * This avoids the problem of traversing "../.." to find the * root directory and never finding it, because "/" resolves * to the emulation root directory. This is expensive :-( */ NDINIT(&ndroot, LOOKUP, FOLLOW, UIO_SYSSPACE, osf1_emul_path, td); if ((error = namei(&ndroot)) != 0) { /* Cannot happen! 
*/ free(buf, M_TEMP); vrele(nd.ni_vp); return error; } ucred = td->td_proc->p_ucred; if ((error = VOP_GETATTR(nd.ni_vp, &vat, ucred, td)) != 0) { goto bad; } if ((error = VOP_GETATTR(ndroot.ni_vp, &vatroot, ucred, td)) != 0) { goto bad; } if (vat.va_fsid == vatroot.va_fsid && vat.va_fileid == vatroot.va_fileid) { error = ENOENT; goto bad; } } if (sgp == NULL) *pbuf = buf; else { sz = &ptr[len] - buf; *pbuf = stackgap_alloc(sgp, sz + 1); error = copyout(buf, *pbuf, sz); free(buf, M_TEMP); } vrele(nd.ni_vp); if (!cflag) vrele(ndroot.ni_vp); return error; bad: vrele(ndroot.ni_vp); vrele(nd.ni_vp); free(buf, M_TEMP); return error; } int osf1_open(td, uap) struct thread *td; struct osf1_open_args *uap; { struct open_args /* { syscallarg(char *) path; syscallarg(int) flags; syscallarg(int) mode; } */ a; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, uap->path); SCARG(&a, path) = SCARG(uap, path); SCARG(&a, flags) = SCARG(uap, flags); /* XXX translate */ SCARG(&a, mode) = SCARG(uap, mode); return open(td, &a); } extern int totalphysmem; int osf1_getsysinfo(td, uap) struct thread *td; struct osf1_getsysinfo_args *uap; { int error, retval; int ncpus = 1; /* XXX until SMP */ int ophysmem; int unit; long percpu; long proctype; struct osf1_cpu_info cpuinfo; error = retval = 0; switch(uap->op) { case OSF_GET_MAX_UPROCS: error = copyout(&maxprocperuid, uap->buffer, sizeof(maxprocperuid)); retval = 1; break; case OSF_GET_PHYSMEM: ophysmem = totalphysmem * (PAGE_SIZE >> 10); error = copyout(&ophysmem, uap->buffer, sizeof(ophysmem)); retval = 1; break; case OSF_GET_MAX_CPU: case OSF_GET_CPUS_IN_BOX: error = copyout(&ncpus, uap->buffer, sizeof(ncpus)); retval = 1; break; case OSF_GET_IEEE_FP_CONTROL: error = copyout(&td->td_pcb->pcb_fp_control,uap->buffer, sizeof(td->td_pcb->pcb_fp_control)); retval = 1; break; case OSF_GET_CPU_INFO: if (uap->nbytes < sizeof(cpuinfo)) error = EINVAL; else { bzero(&cpuinfo, sizeof(cpuinfo)); unit = alpha_pal_whami(); cpuinfo.current_cpu = 
unit; cpuinfo.cpus_in_box = ncpus; cpuinfo.cpu_type = LOCATE_PCS(hwrpb, unit)->pcs_proc_type; cpuinfo.ncpus = ncpus; cpuinfo.cpus_present = ncpus; cpuinfo.cpus_running = ncpus; cpuinfo.cpu_binding = 1; cpuinfo.cpu_ex_binding = 0; cpuinfo.mhz = hwrpb->rpb_cc_freq / 1000000; error = copyout(&cpuinfo, uap->buffer, sizeof(cpuinfo)); retval = 1; } break; case OSF_GET_PROC_TYPE: if(uap->nbytes < sizeof(proctype)) error = EINVAL; else { unit = alpha_pal_whami(); proctype = LOCATE_PCS(hwrpb, unit)->pcs_proc_type; error = copyout (&proctype, uap->buffer, sizeof(percpu)); retval = 1; } break; case OSF_GET_HWRPB: { /* note -- osf/1 doesn't have rpb_tbhint[8] */ unsigned long rpb_size; rpb_size = (unsigned long)&hwrpb->rpb_tbhint - (unsigned long)hwrpb; if(uap->nbytes < rpb_size){ uprintf("nbytes = %ld, sizeof(struct rpb) = %ld\n", uap->nbytes, rpb_size); error = EINVAL; } else { error = copyout(hwrpb, uap->buffer, rpb_size); retval = 1; } } break; case OSF_GET_PLATFORM_NAME: error = copyout(platform.model, uap->buffer, strlen(platform.model)); retval = 1; break; default: printf("osf1_getsysinfo called with unknown op=%ld\n", uap->op); return EINVAL; } td->td_retval[0] = retval; return(error); } int osf1_setsysinfo(td, uap) struct thread *td; struct osf1_setsysinfo_args *uap; { int error; error = 0; switch(uap->op) { case OSF_SET_IEEE_FP_CONTROL: { u_int64_t temp, *fp_control; if ((error = copyin(uap->buffer, &temp, sizeof(temp)))) break; fp_control = &td->td_pcb->pcb_fp_control; *fp_control = temp & IEEE_TRAP_ENABLE_MASK; break; } default: uprintf("osf1_setsysinfo called with op=%ld\n", uap->op); /*error = EINVAL;*/ } return (error); } int osf1_getrlimit(td, uap) struct thread *td; struct osf1_getrlimit_args *uap; { struct __getrlimit_args /* { syscallarg(u_int) which; syscallarg(struct rlimit *) rlp; } */ a; if (SCARG(uap, which) >= OSF1_RLIMIT_NLIMITS) return (EINVAL); if (SCARG(uap, which) <= OSF1_RLIMIT_LASTCOMMON) SCARG(&a, which) = SCARG(uap, which); else if (SCARG(uap, 
which) == OSF1_RLIMIT_NOFILE) SCARG(&a, which) = RLIMIT_NOFILE; else return (0); SCARG(&a, rlp) = (struct rlimit *)SCARG(uap, rlp); return getrlimit(td, &a); } int osf1_setrlimit(td, uap) struct thread *td; struct osf1_setrlimit_args *uap; { struct __setrlimit_args /* { syscallarg(u_int) which; syscallarg(struct rlimit *) rlp; } */ a; if (SCARG(uap, which) >= OSF1_RLIMIT_NLIMITS) return (EINVAL); if (SCARG(uap, which) <= OSF1_RLIMIT_LASTCOMMON) SCARG(&a, which) = SCARG(uap, which); else if (SCARG(uap, which) == OSF1_RLIMIT_NOFILE) SCARG(&a, which) = RLIMIT_NOFILE; else return (0); SCARG(&a, rlp) = (struct rlimit *)SCARG(uap, rlp); return setrlimit(td, &a); } /* * As linux says, this is a total guess. */ int osf1_set_program_attributes(td, uap) struct thread *td; struct osf1_set_program_attributes_args *uap; { struct vmspace *vm = td->td_proc->p_vmspace; vm->vm_taddr = (caddr_t)uap->text_start; vm->vm_tsize = btoc(round_page(uap->text_len)); vm->vm_daddr = (caddr_t)uap->bss_start; vm->vm_dsize = btoc(round_page(uap->bss_len)); return(KERN_SUCCESS); } int osf1_mmap(td, uap) struct thread *td; struct osf1_mmap_args *uap; { struct mmap_args /* { syscallarg(caddr_t) addr; syscallarg(size_t) len; syscallarg(int) prot; syscallarg(int) flags; syscallarg(int) fd; syscallarg(long) pad; syscallarg(off_t) pos; } */ a; int retval; vm_map_t map; vm_offset_t addr, len, newaddr; GIANT_REQUIRED; SCARG(&a, addr) = SCARG(uap, addr); SCARG(&a, len) = SCARG(uap, len); SCARG(&a, prot) = SCARG(uap, prot); SCARG(&a, fd) = SCARG(uap, fd); SCARG(&a, pad) = 0; SCARG(&a, pos) = SCARG(uap, pos); SCARG(&a, flags) = 0; /* * OSF/1's mmap, unlike FreeBSD's, does its best to map memory at the * user's requested address, even if MAP_FIXED is not set. Here we * try to replicate this behaviour as much as we can because some * applications (like /sbin/loader) depend on having things put as * close to where they've requested as possible. 
*/ if (SCARG(uap, addr) != NULL) addr = round_page((vm_offset_t)SCARG(&a,addr)); else /* * Try to use the apparent OSF/1 default placement of 0x10000 for * NULL addrs, this helps to prevent non-64 bit clean binaries from * SEGV'ing. */ addr = round_page((vm_offset_t)0x10000UL); len = (vm_offset_t)SCARG(&a, len); map = &td->td_proc->p_vmspace->vm_map; if (!vm_map_findspace(map, addr, len, &newaddr)) { SCARG(&a,addr) = (caddr_t) newaddr; SCARG(&a, flags) |= (MAP_FIXED); } #ifdef DEBUG else uprintf("osf1_mmap:vm_map_findspace failed for: %p 0x%lx\n", (caddr_t)addr, len); #endif if (SCARG(uap, flags) & OSF1_MAP_SHARED) SCARG(&a, flags) |= MAP_SHARED; if (SCARG(uap, flags) & OSF1_MAP_PRIVATE) SCARG(&a, flags) |= MAP_PRIVATE; switch (SCARG(uap, flags) & OSF1_MAP_TYPE) { case OSF1_MAP_ANONYMOUS: SCARG(&a, flags) |= MAP_ANON; break; case OSF1_MAP_FILE: SCARG(&a, flags) |= MAP_FILE; break; default: return (EINVAL); } if (SCARG(uap, flags) & OSF1_MAP_FIXED) SCARG(&a, flags) |= MAP_FIXED; if (SCARG(uap, flags) & OSF1_MAP_HASSEMAPHORE) SCARG(&a, flags) |= MAP_HASSEMAPHORE; if (SCARG(uap, flags) & OSF1_MAP_INHERIT) return (EINVAL); if (SCARG(uap, flags) & OSF1_MAP_UNALIGNED) return (EINVAL); /* * Emulate an osf/1 bug: Apparently, mmap'ed segments are always * readable even if the user doesn't or in PROT_READ. This causes * some buggy programs to segv. 
*/ SCARG(&a, prot) |= PROT_READ; retval = mmap(td, &a); #ifdef DEBUG uprintf( "\nosf1_mmap: addr=%p (%p), len = 0x%lx, prot=0x%x, fd=%d, pad=0, pos=0x%lx", SCARG(uap, addr), SCARG(&a, addr),SCARG(uap, len), SCARG(uap, prot), SCARG(uap, fd), SCARG(uap, pos)); printf(" flags = 0x%x\n",SCARG(uap, flags)); #endif return (retval); } int osf1_msync(td, uap) struct thread *td; struct osf1_msync_args *uap; { struct msync_args a; a.addr = SCARG(uap, addr); a.len = SCARG(uap, len); a.flags = 0; if(SCARG(uap, flags) & OSF1_MS_ASYNC) SCARG(&a, flags) |= MS_ASYNC; if(SCARG(uap, flags) & OSF1_MS_SYNC) SCARG(&a, flags) |= MS_SYNC; if(SCARG(uap, flags) & OSF1_MS_INVALIDATE) SCARG(&a, flags) |= MS_INVALIDATE; return(msync(td, &a)); } struct osf1_stat { int32_t st_dev; u_int32_t st_ino; u_int32_t st_mode; u_int16_t st_nlink; u_int32_t st_uid; u_int32_t st_gid; int32_t st_rdev; u_int64_t st_size; int32_t st_atime_sec; int32_t st_spare1; int32_t st_mtime_sec; int32_t st_spare2; int32_t st_ctime_sec; int32_t st_spare3; u_int32_t st_blksize; int32_t st_blocks; u_int32_t st_flags; u_int32_t st_gen; }; /* * Get file status; this version follows links. */ /* ARGSUSED */ int osf1_stat(td, uap) struct thread *td; struct osf1_stat_args *uap; { int error; struct stat sb; struct osf1_stat osb; struct nameidata nd; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, uap->path); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd))) return (error); error = vn_stat(nd.ni_vp, &sb, td); vput(nd.ni_vp); if (error) return (error); cvtstat2osf1(&sb, &osb); error = copyout((caddr_t)&osb, (caddr_t)SCARG(uap, ub), sizeof (osb)); return (error); } /* * Get file status; this version does not follow links. 
*/ /* ARGSUSED */ int osf1_lstat(td, uap) struct thread *td; register struct osf1_lstat_args *uap; { struct stat sb; struct osf1_stat osb; int error; struct nameidata nd; caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, uap->path); NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd))) return (error); error = vn_stat(nd.ni_vp, &sb, td); vput(nd.ni_vp); if (error) return (error); cvtstat2osf1(&sb, &osb); error = copyout((caddr_t)&osb, (caddr_t)SCARG(uap, ub), sizeof (osb)); return (error); } /* * Return status information about a file descriptor. */ int osf1_fstat(td, uap) struct thread *td; register struct osf1_fstat_args *uap; { - register struct filedesc *fdp; register struct file *fp; struct stat ub; struct osf1_stat oub; int error; - fdp = td->td_proc->p_fd; - if ((unsigned)SCARG(uap, fd) >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL) + fp = ffind_hold(td, uap->fd); + if (fp == NULL) return (EBADF); error = fo_stat(fp, &ub, td); + fdrop(fp, td); cvtstat2osf1(&ub, &oub); if (error == 0) error = copyout((caddr_t)&oub, (caddr_t)SCARG(uap, sb), sizeof (oub)); return (error); } #if 1 #define bsd2osf_dev(dev) (umajor(dev) << 20 | uminor(dev)) #define osf2bsd_dev(dev) umakedev((umajor(dev) >> 20) & 0xfff, uminor(dev) & 0xfffff) #else #define minor(x) ((int)((x)&0xffff00ff)) #define major(x) ((int)(((u_int)(x) >> 8)&0xff)) #define makedev(x,y) ((dev_t)(((x) << 8) | (y))) #define bsd2osf_dev(dev) (major(dev) << 20 | minor(dev)) #define osf2bsd_dev(dev) makedev(((dev) >> 20) & 0xfff, (dev) & 0xfffff) #endif /* * Convert from a stat structure to an osf1 stat structure. */ static void cvtstat2osf1(st, ost) struct stat *st; struct osf1_stat *ost; { ost->st_dev = bsd2osf_dev(st->st_dev); ost->st_ino = st->st_ino; ost->st_mode = st->st_mode; ost->st_nlink = st->st_nlink; ost->st_uid = st->st_uid == -2 ? (u_int16_t) -2 : st->st_uid; ost->st_gid = st->st_gid == -2 ? 
(u_int16_t) -2 : st->st_gid; ost->st_rdev = bsd2osf_dev(st->st_rdev); ost->st_size = st->st_size; ost->st_atime_sec = st->st_atime; ost->st_spare1 = 0; ost->st_mtime_sec = st->st_mtime; ost->st_spare2 = 0; ost->st_ctime_sec = st->st_ctime; ost->st_spare3 = 0; ost->st_blksize = st->st_blksize; ost->st_blocks = st->st_blocks; ost->st_flags = st->st_flags; ost->st_gen = st->st_gen; } int osf1_mknod(td, uap) struct thread *td; struct osf1_mknod_args *uap; { #if notanymore struct mknod_args a; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, uap->path); SCARG(&a, path) = SCARG(uap, path); SCARG(&a, mode) = SCARG(uap, mode); SCARG(&a, dev) = osf2bsd_dev(SCARG(uap, dev)); return mknod(td, &a); #endif printf("osf1_mknod no longer implemented\n"); return ENOSYS; } int osf1_access(td, uap) struct thread *td; struct osf1_access_args *uap; { caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, uap->path); return access(td, (struct access_args *)uap); } struct osf1_flock { short l_type; short l_whence; off_t l_start; off_t l_len; pid_t l_pid; }; int osf1_fcntl(td, uap) struct thread *td; struct osf1_fcntl_args *uap; { int error; long tmp; caddr_t oarg, sg; struct fcntl_args a; struct osf1_flock osf_flock; struct flock bsd_flock; struct flock *nflock; error = 0; switch (SCARG(uap, cmd)) { case F_SETFL: SCARG(&a, fd) = SCARG(uap, fd); SCARG(&a, cmd) = F_SETFL; /* need to translate flags here */ tmp = 0; if ((long)SCARG(uap, arg) & OSF1_FNONBLOCK) tmp |= FNONBLOCK; if ((long)SCARG(uap, arg) & OSF1_FAPPEND) tmp |= FAPPEND; if ((long)SCARG(uap, arg) & OSF1_FDEFER) tmp |= FDEFER; if ((long)SCARG(uap, arg) & OSF1_FASYNC) tmp |= FASYNC; if ((long)SCARG(uap, arg) & OSF1_FCREAT) tmp |= O_CREAT; if ((long)SCARG(uap, arg) & OSF1_FTRUNC) tmp |= O_TRUNC; if ((long)SCARG(uap, arg) & OSF1_FEXCL) tmp |= O_EXCL; if ((long)SCARG(uap, arg) & OSF1_FNDELAY) tmp |= FNDELAY; if ((long)SCARG(uap, arg) & OSF1_FSYNC) tmp |= FFSYNC; SCARG(&a, arg) = tmp; error = fcntl(td, &a); break; case 
F_SETLK: case F_SETLKW: case F_GETLK: /* * The OSF/1 flock stucture has a different order than * the BSD one, but all else is the same. We must * reorder the one we've gotten so that flock() groks it. */ if ((error = copyin(uap->arg, &osf_flock, sizeof(osf_flock)))) return error; bsd_flock.l_type = osf_flock.l_type; bsd_flock.l_whence = osf_flock.l_whence; bsd_flock.l_start = osf_flock.l_start; bsd_flock.l_len = osf_flock.l_len; bsd_flock.l_pid = osf_flock.l_pid; sg = stackgap_init(); nflock = stackgap_alloc(&sg, sizeof(struct flock)); if ((error = copyout(&bsd_flock, nflock, sizeof(bsd_flock))) != 0) return error; oarg = uap->arg; uap->arg = nflock; error = fcntl(td, (struct fcntl_args *) uap); /* if (error) { printf("fcntl called with cmd=%d, args=0x%lx\n returns %d\n",uap->cmd,(long)uap->arg,error); printf("bsd_flock.l_type = 0x%x\n", bsd_flock.l_type); printf("bsd_flock.l_whence = 0x%x\n", bsd_flock.l_whence); printf("bsd_flock.l_start = 0x%lx\n", bsd_flock.l_start); printf("bsd_flock.l_len = 0x%lx\n", bsd_flock.l_len); printf("bsd_flock.l_pid = 0x%x\n", bsd_flock.l_pid); } */ if ((uap->cmd == F_GETLK) && !error) { osf_flock.l_type = F_UNLCK; if ((error = copyout(&osf_flock, oarg, sizeof(osf_flock)))) return error; } break; default: error = fcntl(td, (struct fcntl_args *) uap); if ((uap->cmd == OSF1_F_GETFL) && !error ) { tmp = td->td_retval[0] & O_ACCMODE; if (td->td_retval[0] & FNONBLOCK) tmp |= OSF1_FNONBLOCK; if (td->td_retval[0] & FAPPEND) tmp |= OSF1_FAPPEND; if (td->td_retval[0] & FDEFER) tmp |= OSF1_FDEFER; if (td->td_retval[0] & FASYNC) tmp |= OSF1_FASYNC; if (td->td_retval[0] & O_CREAT) tmp |= OSF1_FCREAT; if (td->td_retval[0] & O_TRUNC) tmp |= OSF1_FTRUNC; if (td->td_retval[0] & O_EXCL) tmp |= OSF1_FEXCL; if (td->td_retval[0] & FNDELAY) tmp |= OSF1_FNDELAY; if (td->td_retval[0] & FFSYNC) tmp |= OSF1_FSYNC; td->td_retval[0] = tmp; } } return (error); } #if 0 int osf1_fcntl(td, uap) struct thread *td; struct osf1_fcntl_args *uap; { struct fcntl_args a; 
long tmp; int error; SCARG(&a, fd) = SCARG(uap, fd); switch (SCARG(uap, cmd)) { case OSF1_F_DUPFD: SCARG(&a, cmd) = F_DUPFD; SCARG(&a, arg) = (long)SCARG(uap, arg); break; case OSF1_F_GETFD: SCARG(&a, cmd) = F_GETFD; SCARG(&a, arg) = (long)SCARG(uap, arg); break; case OSF1_F_SETFD: SCARG(&a, cmd) = F_SETFD; SCARG(&a, arg) = (long)SCARG(uap, arg); break; case OSF1_F_GETFL: SCARG(&a, cmd) = F_GETFL; SCARG(&a, arg) = (long)SCARG(uap, arg); /* ignored */ break; case OSF1_F_SETFL: SCARG(&a, cmd) = F_SETFL; tmp = 0; if ((long)SCARG(uap, arg) & OSF1_FAPPEND) tmp |= FAPPEND; if ((long)SCARG(uap, arg) & OSF1_FNONBLOCK) tmp |= FNONBLOCK; if ((long)SCARG(uap, arg) & OSF1_FASYNC) tmp |= FASYNC; if ((long)SCARG(uap, arg) & OSF1_FSYNC) tmp |= FFSYNC; SCARG(&a, arg) = tmp; break; default: /* XXX other cases */ return (EINVAL); } error = fcntl(td, &a); if (error) return error; switch (SCARG(uap, cmd)) { case OSF1_F_GETFL: /* XXX */ break; } return error; } #endif int osf1_socket(td, uap) struct thread *td; struct osf1_socket_args *uap; { struct socket_args a; if (SCARG(uap, type) > AF_LINK) return (EINVAL); /* XXX After AF_LINK, divergence. 
*/ SCARG(&a, domain) = SCARG(uap, domain); SCARG(&a, type) = SCARG(uap, type); SCARG(&a, protocol) = SCARG(uap, protocol); return socket(td, &a); } int osf1_sendto(td, uap) struct thread *td; register struct osf1_sendto_args *uap; { struct sendto_args a; if (SCARG(uap, flags) & ~0x7f) /* unsupported flags */ return (EINVAL); SCARG(&a, s) = SCARG(uap, s); SCARG(&a, buf) = SCARG(uap, buf); SCARG(&a, len) = SCARG(uap, len); SCARG(&a, flags) = SCARG(uap, flags); SCARG(&a, to) = (caddr_t)SCARG(uap, to); SCARG(&a, tolen) = SCARG(uap, tolen); return sendto(td, &a); } int osf1_reboot(td, uap) struct thread *td; struct osf1_reboot_args *uap; { struct reboot_args a; if (SCARG(uap, opt) & ~OSF1_RB_ALLFLAGS && SCARG(uap, opt) & (OSF1_RB_ALTBOOT|OSF1_RB_UNIPROC)) return (EINVAL); SCARG(&a, opt) = 0; if (SCARG(uap, opt) & OSF1_RB_ASKNAME) SCARG(&a, opt) |= RB_ASKNAME; if (SCARG(uap, opt) & OSF1_RB_SINGLE) SCARG(&a, opt) |= RB_SINGLE; if (SCARG(uap, opt) & OSF1_RB_NOSYNC) SCARG(&a, opt) |= RB_NOSYNC; if (SCARG(uap, opt) & OSF1_RB_HALT) SCARG(&a, opt) |= RB_HALT; if (SCARG(uap, opt) & OSF1_RB_INITNAME) SCARG(&a, opt) |= RB_INITNAME; if (SCARG(uap, opt) & OSF1_RB_DFLTROOT) SCARG(&a, opt) |= RB_DFLTROOT; return reboot(td, &a); } int osf1_lseek(td, uap) struct thread *td; struct osf1_lseek_args *uap; { struct lseek_args a; SCARG(&a, fd) = SCARG(uap, fd); SCARG(&a, pad) = 0; SCARG(&a, offset) = SCARG(uap, offset); SCARG(&a, whence) = SCARG(uap, whence); return lseek(td, &a); } /* * OSF/1 defines _POSIX_SAVED_IDS, which means that our normal * setuid() won't work. * * Instead, by P1003.1b-1993, setuid() is supposed to work like: * If the process has appropriate [super-user] priviledges, the * setuid() function sets the real user ID, effective user * ID, and the saved set-user-ID to uid. 
* If the process does not have appropriate priviledges, but uid * is equal to the real user ID or the saved set-user-ID, the * setuid() function sets the effective user ID to uid; the * real user ID and saved set-user-ID remain unchanged by * this function call. */ int osf1_setuid(td, uap) struct thread *td; struct osf1_setuid_args *uap; { struct proc *p; int error; uid_t uid; struct ucred *newcred, *oldcred; p = td->td_proc; uid = SCARG(uap, uid); oldcred = p->p_ucred; if ((error = suser_xxx(p->p_ucred, NULL, PRISON_ROOT)) != 0 && uid != oldcred->cr_ruid && uid != oldcred->cr_svuid) return (error); newcred = crdup(oldcred); if (error == 0) { if (uid != oldcred->cr_ruid) { change_ruid(newcred, uid); setsugid(p); } if (oldcred->cr_svuid != uid) { change_svuid(newcred, uid); setsugid(p); } } if (newcred->cr_uid != uid) { change_euid(newcred, uid); setsugid(p); } p->p_ucred = newcred; crfree(oldcred); return (0); } /* * OSF/1 defines _POSIX_SAVED_IDS, which means that our normal * setgid() won't work. * * If you change "uid" to "gid" in the discussion, above, about * setuid(), you'll get a correct description of setgid(). */ int osf1_setgid(td, uap) struct thread *td; struct osf1_setgid_args *uap; { struct proc *p; int error; gid_t gid; struct ucred *newcred, *oldcred; p = td->td_proc; gid = SCARG(uap, gid); oldcred = p->p_ucred; if (((error = suser_xxx(p->p_ucred, NULL, PRISON_ROOT)) != 0 ) && gid != oldcred->cr_rgid && gid != oldcred->cr_svgid) return (error); newcred = crdup(oldcred); if (error == 0) { if (gid != oldcred->cr_rgid) { change_rgid(newcred, gid); setsugid(p); } if (oldcred->cr_svgid != gid) { change_svgid(newcred, gid); setsugid(p); } } if (newcred->cr_groups[0] != gid) { change_egid(newcred, gid); setsugid(p); } p->p_ucred = newcred; crfree(oldcred); return (0); } /* * The structures end up being the same... but we can't be sure that * the other word of our iov_len is zero! 
*/ struct osf1_iovec { char *iov_base; int iov_len; }; #define STACKGAPLEN 400 int osf1_readv(td, uap) struct thread *td; struct osf1_readv_args *uap; { int error, osize, nsize, i; caddr_t sg; struct readv_args /* { syscallarg(int) fd; syscallarg(struct iovec *) iovp; syscallarg(u_int) iovcnt; } */ a; struct osf1_iovec *oio; struct iovec *nio; sg = stackgap_init(); if (SCARG(uap, iovcnt) > (STACKGAPLEN / sizeof (struct iovec))) return (EINVAL); osize = SCARG(uap, iovcnt) * sizeof (struct osf1_iovec); nsize = SCARG(uap, iovcnt) * sizeof (struct iovec); oio = malloc(osize, M_TEMP, M_WAITOK); nio = malloc(nsize, M_TEMP, M_WAITOK); error = 0; if ((error = copyin(SCARG(uap, iovp), oio, osize))) goto punt; for (i = 0; i < SCARG(uap, iovcnt); i++) { nio[i].iov_base = oio[i].iov_base; nio[i].iov_len = oio[i].iov_len; } SCARG(&a, fd) = SCARG(uap, fd); SCARG(&a, iovp) = stackgap_alloc(&sg, nsize); SCARG(&a, iovcnt) = SCARG(uap, iovcnt); if ((error = copyout(nio, (caddr_t)SCARG(&a, iovp), nsize))) goto punt; error = readv(td, &a); punt: free(oio, M_TEMP); free(nio, M_TEMP); return (error); } int osf1_writev(td, uap) struct thread *td; struct osf1_writev_args *uap; { int error, i, nsize, osize; caddr_t sg; struct writev_args /* { syscallarg(int) fd; syscallarg(struct iovec *) iovp; syscallarg(u_int) iovcnt; } */ a; struct osf1_iovec *oio; struct iovec *nio; sg = stackgap_init(); if (SCARG(uap, iovcnt) > (STACKGAPLEN / sizeof (struct iovec))) return (EINVAL); osize = SCARG(uap, iovcnt) * sizeof (struct osf1_iovec); nsize = SCARG(uap, iovcnt) * sizeof (struct iovec); oio = malloc(osize, M_TEMP, M_WAITOK); nio = malloc(nsize, M_TEMP, M_WAITOK); error = 0; if ((error = copyin(SCARG(uap, iovp), oio, osize))) goto punt; for (i = 0; i < SCARG(uap, iovcnt); i++) { nio[i].iov_base = oio[i].iov_base; nio[i].iov_len = oio[i].iov_len; } SCARG(&a, fd) = SCARG(uap, fd); SCARG(&a, iovp) = stackgap_alloc(&sg, nsize); SCARG(&a, iovcnt) = SCARG(uap, iovcnt); if ((error = copyout(nio, 
(caddr_t)SCARG(&a, iovp), nsize))) goto punt; error = writev(td, &a); punt: free(oio, M_TEMP); free(nio, M_TEMP); return (error); } /* * More of the stupid off_t padding! */ int osf1_truncate(td, uap) struct thread *td; struct osf1_truncate_args *uap; { caddr_t sg; struct truncate_args a; sg = stackgap_init(); CHECKALTEXIST(td, &sg, uap->path); SCARG(&a, path) = SCARG(uap, path); SCARG(&a, pad) = 0; SCARG(&a, length) = SCARG(uap, length); return truncate(td, &a); } int osf1_ftruncate(td, uap) struct thread *td; struct osf1_ftruncate_args *uap; { struct ftruncate_args a; SCARG(&a, fd) = SCARG(uap, fd); SCARG(&a, pad) = 0; SCARG(&a, length) = SCARG(uap, length); return ftruncate(td, &a); } static int osf2bsd_pathconf(name) int *name; { switch (*name) { case _OSF1_PC_LINK_MAX: case _OSF1_PC_MAX_CANON: case _OSF1_PC_MAX_INPUT: case _OSF1_PC_NAME_MAX: *name -= 10; break; case _OSF1_PC_PATH_MAX: case _OSF1_PC_PIPE_BUF: *name -= 9; case _OSF1_PC_NO_TRUNC: *name = _PC_NO_TRUNC; break; case _OSF1_PC_CHOWN_RESTRICTED: *name = _PC_CHOWN_RESTRICTED; break; case _OSF1_PC_VDISABLE: *name = _PC_VDISABLE; break; default: return (EINVAL); } return 0; } int osf1_pathconf(td, uap) struct thread *td; struct osf1_pathconf_args *uap; { if (osf2bsd_pathconf(&uap->name)) return (EINVAL); else return (pathconf(td, (void *)uap)); } int osf1_fpathconf(td, uap) struct thread *td; struct osf1_fpathconf_args *uap; { if (osf2bsd_pathconf(&uap->name)) return (EINVAL); else return (fpathconf(td, (void *)uap)); } int osf1_getrusage(td, uap) struct thread *td; struct osf1_getrusage_args *uap; { struct proc *p; struct rusage *rup; struct osf1_rusage oru; p = td->td_proc; switch (uap->who) { case RUSAGE_SELF: rup = &p->p_stats->p_ru; mtx_lock_spin(&sched_lock); calcru(p, &rup->ru_utime, &rup->ru_stime, NULL); mtx_unlock_spin(&sched_lock); break; case RUSAGE_CHILDREN: rup = &p->p_stats->p_cru; break; default: return (EINVAL); } TV_CP(rup->ru_utime, oru.ru_utime); TV_CP(rup->ru_stime, oru.ru_stime); 
bcopy(&(rup->ru_first), &(oru.ru_first), (&(oru.ru_last) - &(oru.ru_first))); return (copyout((caddr_t)&oru, (caddr_t)uap->rusage, sizeof (struct osf1_rusage))); } int osf1_wait4(td, uap) struct thread *td; struct osf1_wait4_args *uap; { int error; caddr_t sg; struct osf1_rusage *orusage, oru; struct rusage *rusage = NULL, ru; orusage = SCARG(uap, rusage); if (orusage) { sg = stackgap_init(); rusage = stackgap_alloc(&sg, sizeof(struct rusage)); SCARG(uap, rusage) = (struct osf1_rusage *)rusage; } if ((error = wait4(td, (struct wait_args *)uap))) return error; if (orusage && (error = copyin(rusage, &ru, sizeof(ru)) == 0)){ TV_CP(ru.ru_utime, oru.ru_utime); TV_CP(ru.ru_stime, oru.ru_stime); bcopy(&ru.ru_first, &oru.ru_first, (&(oru.ru_last) - &(oru.ru_first))); copyout(&oru, orusage, sizeof (struct osf1_rusage)); } return (0); } int osf1_madvise(td, uap) struct thread *td; struct osf1_madvise_args *uap; { /* XXX */ return EINVAL; } int osf1_execve(td, uap) struct thread *td; struct osf1_execve_args *uap; { caddr_t sg; struct execve_args ap; sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, path)); SCARG(&ap, fname) = SCARG(uap, path); SCARG(&ap, argv) = SCARG(uap, argp); SCARG(&ap, envv) = SCARG(uap, envp); return execve(td, &ap); } int osf1_usleep_thread(td, uap) struct thread *td; struct osf1_usleep_thread_args *uap; { int error, s, timo; struct osf1_timeval time; struct timeval difftv, endtv, sleeptv, tv; if ((error = copyin(SCARG(uap, sleep), &time, sizeof time))) return (error); sleeptv.tv_sec = (u_long)time.tv_sec; sleeptv.tv_usec = (u_long)time.tv_usec; timo = tvtohz(&sleeptv); /* * Some callers use usleep(0) as a sort of thread-yield so make * sure that the timeout is non-zero. 
*/ if (timo == 0) timo = 1; s = splclock(); microtime(&tv); splx(s); tsleep(td, PUSER|PCATCH, "OSF/1", timo); if (SCARG(uap, slept) != NULL) { s = splclock(); microtime(&endtv); timersub(&time, &endtv, &difftv); splx(s); if (tv.tv_sec < 0 || tv.tv_usec < 0) tv.tv_sec = tv.tv_usec = 0; TV_CP(difftv, time) error = copyout(&time, SCARG(uap, slept), sizeof time); } return (error); } int osf1_gettimeofday(td, uap) struct thread *td; register struct osf1_gettimeofday_args *uap; { int error; struct timeval atv; struct osf1_timeval otv; error = 0; if (uap->tp) { microtime(&atv); otv.tv_sec = atv.tv_sec; otv.tv_usec = atv.tv_usec; if ((error = copyout((caddr_t)&otv, (caddr_t)uap->tp, sizeof (otv)))) return (error); } if (uap->tzp) error = copyout((caddr_t)&tz, (caddr_t)uap->tzp, sizeof (tz)); return (error); } int osf1_select(td, uap) struct thread *td; register struct osf1_select_args *uap; { if (uap->tv) { int error; caddr_t sg; struct osf1_timeval otv; struct timeval tv; sg = stackgap_init(); if ((error=copyin((caddr_t)uap->tv,(caddr_t)&otv,sizeof(otv)))) return(error); TV_CP(otv,tv); uap->tv = stackgap_alloc(&sg, sizeof(struct timeval)); if ((error=copyout((caddr_t)&tv, (caddr_t)uap->tv,sizeof(tv)))) return(error); } return(select(td, (struct select_args *)uap)); } int osf1_setitimer(td, uap) struct thread *td; struct osf1_setitimer_args *uap; { int error; caddr_t old_oitv, sg; struct itimerval itv; struct osf1_itimerval otv; error = 0; old_oitv = (caddr_t)uap->oitv; sg = stackgap_init(); if ((error = copyin((caddr_t)uap->itv,(caddr_t)&otv,sizeof(otv)))) { printf("%s(%d): error = %d\n", __FILE__, __LINE__, error); return error; } TV_CP(otv.it_interval,itv.it_interval); TV_CP(otv.it_value,itv.it_value); uap->itv = stackgap_alloc(&sg, sizeof(struct itimerval)); if ((error = copyout((caddr_t)&itv,(caddr_t)uap->itv,sizeof(itv)))) { printf("%s(%d): error = %d\n", __FILE__, __LINE__, error); return error; } uap->oitv = stackgap_alloc(&sg, sizeof(struct itimerval)); if ((error 
= setitimer(td, (struct setitimer_args *)uap))) { printf("%s(%d): error = %d\n", __FILE__, __LINE__, error); return error; } if ((error = copyin((caddr_t)uap->oitv,(caddr_t)&itv,sizeof(itv)))) { printf("%s(%d): error = %d\n", __FILE__, __LINE__, error); return error; } TV_CP(itv.it_interval,otv.it_interval); TV_CP(itv.it_value,otv.it_value); if (old_oitv && (error = copyout((caddr_t)&otv, old_oitv, sizeof(otv)))) { printf("%s(%d): error = %d\n", __FILE__, __LINE__, error); } return error; } int osf1_getitimer(td, uap) struct thread *td; struct osf1_getitimer_args *uap; { int error; caddr_t old_itv, sg; struct itimerval itv; struct osf1_itimerval otv; error = 0; old_itv = (caddr_t)uap->itv; sg = stackgap_init(); uap->itv = stackgap_alloc(&sg, sizeof(struct itimerval)); if ((error = getitimer(td, (struct getitimer_args *)uap))) { printf("%s(%d): error = %d\n", __FILE__, __LINE__, error); return error; } if ((error = copyin((caddr_t)uap->itv,(caddr_t)&itv,sizeof(itv)))) { printf("%s(%d): error = %d\n", __FILE__, __LINE__, error); return error; } TV_CP(itv.it_interval,otv.it_interval); TV_CP(itv.it_value,otv.it_value); if ((error = copyout((caddr_t)&otv, old_itv, sizeof(otv)))) { printf("%s(%d): error = %d\n", __FILE__, __LINE__, error); } return error; } int osf1_proplist_syscall(td, uap) struct thread *td; struct osf1_proplist_syscall_args *uap; { return(EOPNOTSUPP); } int osf1_ntpgettime(td, uap) struct thread *td; struct osf1_ntpgettime_args *uap; { return(ENOSYS); } int osf1_ntpadjtime(td, uap) struct thread *td; struct osf1_ntpadjtime_args *uap; { return(ENOSYS); } int osf1_setpgrp(td, uap) struct thread *td; struct osf1_setpgrp_args *uap; { return(setpgid(td, (struct setpgid_args *)uap)); } int osf1_uswitch(td, uap) struct thread *td; struct osf1_uswitch_args *uap; { struct proc *p; int rv; vm_map_entry_t entry; vm_offset_t zero; GIANT_REQUIRED; p = td->td_proc; zero = 0; if (uap->cmd == OSF1_USC_GET) { if (vm_map_lookup_entry(&(p->p_vmspace->vm_map), 0, 
&entry)) td->td_retval[0] = OSF1_USW_NULLP; else td->td_retval[0] = 0; return(KERN_SUCCESS); } else if (uap->cmd == OSF1_USC_SET) if (uap->mask & OSF1_USW_NULLP) { rv = vm_mmap(&(p->p_vmspace->vm_map), &zero, PAGE_SIZE, VM_PROT_READ, VM_PROT_ALL, MAP_PRIVATE | MAP_FIXED | MAP_ANON, NULL, 0); if (!rv) return(KERN_SUCCESS); else { printf( "osf1_uswitch:vm_mmap of zero page failed with status %d\n", rv); return(rv); } } return(EINVAL); } int osf1_classcntl(td, uap) struct thread *td; struct osf1_classcntl_args *uap; { return(EACCES); /* class scheduling not enabled */ } struct osf1_tbl_loadavg { union { long l[3]; double d[3]; } tl_avenrun; int tl_lscale; long tl_mach_factor[3]; /* ???? */ }; struct osf1_tbl_sysinfo { long si_user; long si_nice; long si_sys; long si_idle; long si_hz; long si_phz; long si_boottime; long wait; }; #define TBL_LOADAVG 3 #define TBL_SYSINFO 12 int osf1_table(td, uap) struct thread *td; struct osf1_table_args /*{ long id; long index; void *addr; long nel; u_long lel; }*/ *uap; { int retval; struct osf1_tbl_loadavg ld; struct osf1_tbl_sysinfo si; retval = 0; switch(uap->id) { case TBL_LOADAVG: /* xemacs wants this */ if ((uap->index != 0) || (uap->nel != 1)) retval = EINVAL; bcopy(&averunnable, &ld, sizeof(averunnable)); ld.tl_lscale = (u_int)averunnable.fscale; retval = copyout(&ld, uap->addr, sizeof(ld)); break; case TBL_SYSINFO: if ((uap->index != 0) || (uap->nel != 1)) retval = EINVAL; bzero(&si, sizeof(si)); #if 0 si.si_user = cp_time[CP_USER]; si.si_nice = cp_time[CP_NICE]; si.si_sys = cp_time[CP_SYS]; si.si_idle = cp_time[CP_IDLE]; si.wait = cp_time[CP_INTR]; #endif si.si_hz = hz; si.si_phz = profhz; si.si_boottime = boottime.tv_sec; retval = copyout(&si, uap->addr, sizeof(si)); break; default: printf("osf1_table: %ld, %ld, %p, %ld %ld\n", uap->id, uap->index, uap->addr, uap->nel, uap->lel); retval = EINVAL; } return retval; } int osf1_sysinfo(td, uap) struct thread *td; struct osf1_sysinfo_args /*{ int cmd; char *buf; long count; }*/ 
*uap; { int name[2], retval; size_t bytes, len; char *string; string = NULL; switch(uap->cmd) { case 1: /* OS */ string = "OSF1"; break; case 2: /* hostname, from ogethostname */ len = uap->count; name[0] = CTL_KERN; name[1] = KERN_HOSTNAME; retval = userland_sysctl(td, name, 2, uap->buf, &len, 1, 0, 0, &bytes); td->td_retval[0] = bytes; return(retval); break; case 3: /* release of osf1 */ string = "V4.0"; break; case 4: /* minor version of osf1 */ string = "878"; break; case 5: /* machine or arch */ case 6: string = "alpha"; break; case 7: /* serial number, real osf1 returns 0! */ string = "0"; break; case 8: /* HW vendor */ string = "Digital"; break; case 9: /* dunno, this is what du does.. */ return(ENOSYS); break; default: return(EINVAL); } bytes = min(uap->count, strlen(string)+1); copyout(string, uap->buf, bytes); td->td_retval[0] = bytes; return(0); } Index: head/sys/alpha/osf1/osf1_mount.c =================================================================== --- head/sys/alpha/osf1/osf1_mount.c (revision 89305) +++ head/sys/alpha/osf1/osf1_mount.c (revision 89306) @@ -1,383 +1,385 @@ /* $NetBSD: osf1_mount.c,v 1.7 1998/05/20 16:34:29 chs Exp $ */ /* * Copyright (c) 1994, 1995 Carnegie-Mellon University. * All rights reserved. * * Author: Chris G. Demetriou * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
* * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Additional Copyright (c) 1999 by Andrew Gallatin * $FreeBSD$ */ #include "opt_nfs.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include void bsd2osf_statfs __P((struct statfs *, struct osf1_statfs *)); int osf1_mount_mfs __P((struct thread *, struct osf1_mount_args *, struct mount_args *)); int osf1_mount_nfs __P((struct thread *, struct osf1_mount_args *, struct mount_args *)); #ifdef notanymore static const char *fsnames[OSF1_MOUNT_MAXTYPE+2] = INITMOUNTNAMES; #endif void bsd2osf_statfs(bsfs, osfs) struct statfs *bsfs; struct osf1_statfs *osfs; { #ifdef notanymore bzero(osfs, sizeof (struct osf1_statfs)); if (!strncmp(fsnames[MOUNT_UFS], bsfs->f_fstypename, MFSNAMELEN)) osfs->f_type = OSF1_MOUNT_UFS; else if (!strncmp(fsnames[MOUNT_NFS], bsfs->f_fstypename, MFSNAMELEN)) osfs->f_type = OSF1_MOUNT_NFS; else if (!strncmp(fsnames[MOUNT_MFS], bsfs->f_fstypename, MFSNAMELEN)) osfs->f_type = OSF1_MOUNT_MFS; else /* uh oh... XXX = PC, CDFS, PROCFS, etc. 
*/ osfs->f_type = OSF1_MOUNT_ADDON; osfs->f_flags = bsfs->f_flags; /* XXX translate */ osfs->f_fsize = bsfs->f_bsize; osfs->f_bsize = bsfs->f_iosize; osfs->f_blocks = bsfs->f_blocks; osfs->f_bfree = bsfs->f_bfree; osfs->f_bavail = bsfs->f_bavail; osfs->f_files = bsfs->f_files; osfs->f_ffree = bsfs->f_ffree; bcopy(&bsfs->f_fsid, &osfs->f_fsid, max(sizeof bsfs->f_fsid, sizeof osfs->f_fsid)); /* osfs->f_spare zeroed above */ bcopy(bsfs->f_mntonname, osfs->f_mntonname, max(sizeof bsfs->f_mntonname, sizeof osfs->f_mntonname)); bcopy(bsfs->f_mntfromname, osfs->f_mntfromname, max(sizeof bsfs->f_mntfromname, sizeof osfs->f_mntfromname)); /* XXX osfs->f_xxx should be filled in... */ #endif } int osf1_statfs(td, uap) struct thread *td; struct osf1_statfs_args *uap; { int error; struct mount *mp; struct statfs *sp; struct osf1_statfs osfs; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd))) return (error); mp = nd.ni_vp->v_mount; sp = &mp->mnt_stat; vrele(nd.ni_vp); if ((error = VFS_STATFS(mp, sp, td))) return (error); sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; bsd2osf_statfs(sp, &osfs); return copyout(&osfs, SCARG(uap, buf), min(sizeof osfs, SCARG(uap, len))); } int osf1_fstatfs(td, uap) struct thread *td; struct osf1_fstatfs_args *uap; { int error; struct file *fp; struct mount *mp; struct statfs *sp; struct osf1_statfs osfs; if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp))) return (error); mp = ((struct vnode *)fp->f_data)->v_mount; sp = &mp->mnt_stat; - if ((error = VFS_STATFS(mp, sp, td))) + error = VFS_STATFS(mp, sp, td); + fdrop(fp, td); + if (error) return (error); sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; bsd2osf_statfs(sp, &osfs); return copyout(&osfs, SCARG(uap, buf), min(sizeof osfs, SCARG(uap, len))); } int osf1_getfsstat(td, uap) struct thread *td; register struct osf1_getfsstat_args *uap; { long count, error, maxcount; caddr_t osf_sfsp; struct mount *mp, *nmp; struct statfs *sp; 
struct osf1_statfs osfs; if (SCARG(uap, flags) & ~OSF1_GETFSSTAT_FLAGS) return (EINVAL); maxcount = SCARG(uap, bufsize) / sizeof(struct osf1_statfs); osf_sfsp = (caddr_t)SCARG(uap, buf); for (count = 0, mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { nmp = TAILQ_NEXT(mp, mnt_list); if (osf_sfsp && count < maxcount) { sp = &mp->mnt_stat; /* * If OSF1_MNT_NOWAIT is specified, do not refresh the * fsstat cache. OSF1_MNT_WAIT overrides * OSF1_MNT_NOWAIT. */ if (((SCARG(uap, flags) & OSF1_MNT_NOWAIT) == 0 || (SCARG(uap, flags) & OSF1_MNT_WAIT)) && (error = VFS_STATFS(mp, sp, td))) continue; sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; bsd2osf_statfs(sp, &osfs); if ((error = copyout(&osfs, osf_sfsp, sizeof (struct osf1_statfs)))) return (error); osf_sfsp += sizeof (struct osf1_statfs); } count++; } if (osf_sfsp && count > maxcount) td->td_retval[0] = maxcount; else td->td_retval[0] = count; return (0); } int osf1_unmount(td, uap) struct thread *td; struct osf1_unmount_args *uap; { struct unmount_args a; SCARG(&a, path) = SCARG(uap, path); if (SCARG(uap, flags) & ~OSF1_UNMOUNT_FLAGS) return (EINVAL); SCARG(&a, flags) = 0; if ((SCARG(uap, flags) & OSF1_MNT_FORCE) && (SCARG(uap, flags) & OSF1_MNT_NOFORCE) == 0) SCARG(&a, flags) |= MNT_FORCE; return unmount(td, &a); } int osf1_mount(td, uap) struct thread *td; struct osf1_mount_args *uap; { int error; struct mount_args a; SCARG(&a, path) = SCARG(uap, path); if (SCARG(uap, flags) & ~OSF1_MOUNT_FLAGS) return (EINVAL); SCARG(&a, flags) = SCARG(uap, flags); /* XXX - xlate */ switch (SCARG(uap, type)) { case OSF1_MOUNT_UFS: /* XXX */ return (EINVAL); break; case OSF1_MOUNT_NFS: /* XXX */ if ((error = osf1_mount_nfs(td, uap, &a))) return error; break; case OSF1_MOUNT_MFS: /* XXX */ #ifdef notyet if ((error = osf1_mount_mfs(td, uap, &a))) return error; #endif return EINVAL; break; case OSF1_MOUNT_CDFS: /* XXX */ return (EINVAL); break; case OSF1_MOUNT_PROCFS: /* XXX */ return (EINVAL); break; case OSF1_MOUNT_NONE: case 
OSF1_MOUNT_PC: case OSF1_MOUNT_S5FS: case OSF1_MOUNT_DFS: case OSF1_MOUNT_EFS: case OSF1_MOUNT_MSFS: case OSF1_MOUNT_FFM: case OSF1_MOUNT_FDFS: case OSF1_MOUNT_ADDON: default: return (EINVAL); } return mount(td, &a); } int osf1_mount_mfs(td, osf_argp, bsd_argp) struct thread *td; struct osf1_mount_args *osf_argp; struct mount_args *bsd_argp; { #ifdef notyet int error, len; caddr_t sg; static const char mfs_name[] = "mfs"; struct osf1_mfs_args osf_ma; struct mfs_args bsd_ma; sg = stackgap_init(); if ((error = copyin(SCARG(osf_argp, data), &osf_ma, sizeof osf_ma))) return error; bzero(&bsd_ma, sizeof bsd_ma); bsd_ma.fspec = osf_ma.name; /* XXX export args */ bsd_ma.base = osf_ma.base; bsd_ma.size = osf_ma.size; SCARG(bsd_argp, data) = stackgap_alloc(&sg, sizeof bsd_ma); if ((error = copyout(&bsd_ma, SCARG(bsd_argp, data), sizeof bsd_ma))) return error; len = strlen(mfs_name) + 1; SCARG(bsd_argp, type) = stackgap_alloc(&sg, len); if ((error = copyout(mfs_name, (void *)SCARG(bsd_argp, type), len))) return error; #endif return 0; } int osf1_mount_nfs(td, osf_argp, bsd_argp) struct thread *td; struct osf1_mount_args *osf_argp; struct mount_args *bsd_argp; { int error, len; caddr_t sg; static const char nfs_name[] = "nfs"; struct osf1_nfs_args osf_na; struct nfs_args bsd_na; sg = stackgap_init(); if ((error = copyin(SCARG(osf_argp, data), &osf_na, sizeof osf_na))) return error; bzero(&bsd_na, sizeof bsd_na); bsd_na.addr = (struct sockaddr *)osf_na.addr; bsd_na.addrlen = sizeof (struct sockaddr_in); bsd_na.sotype = SOCK_DGRAM; bsd_na.proto = 0; bsd_na.fh = osf_na.fh; if (osf_na.flags & ~OSF1_NFSMNT_FLAGS) return EINVAL; if (osf_na.flags & OSF1_NFSMNT_SOFT) bsd_na.flags |= NFSMNT_SOFT; if (osf_na.flags & OSF1_NFSMNT_WSIZE) { bsd_na.wsize = osf_na.wsize; bsd_na.flags |= NFSMNT_WSIZE; } if (osf_na.flags & OSF1_NFSMNT_RSIZE) { bsd_na.rsize = osf_na.rsize; bsd_na.flags |= NFSMNT_RSIZE; } if (osf_na.flags & OSF1_NFSMNT_TIMEO) { bsd_na.timeo = osf_na.timeo; bsd_na.flags |= 
NFSMNT_TIMEO; } if (osf_na.flags & OSF1_NFSMNT_RETRANS) { bsd_na.retrans = osf_na.retrans; bsd_na.flags |= NFSMNT_RETRANS; } if (osf_na.flags & OSF1_NFSMNT_HOSTNAME) bsd_na.hostname = osf_na.hostname; if (osf_na.flags & OSF1_NFSMNT_INT) bsd_na.flags |= NFSMNT_INT; if (osf_na.flags & OSF1_NFSMNT_NOCONN) bsd_na.flags |= NFSMNT_NOCONN; SCARG(bsd_argp, data) = stackgap_alloc(&sg, sizeof bsd_na); if ((error = copyout(&bsd_na, SCARG(bsd_argp, data), sizeof bsd_na))) return error; len = strlen(nfs_name) + 1; SCARG(bsd_argp, type) = stackgap_alloc(&sg, len); if ((error = copyout(nfs_name, (void *)SCARG(bsd_argp, type), len))) return error; return 0; } Index: head/sys/compat/linux/linux_file.c =================================================================== --- head/sys/compat/linux/linux_file.c (revision 89305) +++ head/sys/compat/linux/linux_file.c (revision 89306) @@ -1,1171 +1,1183 @@ /*- * Copyright (c) 1994-1995 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software withough specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef __alpha__ int linux_creat(struct thread *td, struct linux_creat_args *args) { struct open_args /* { char *path; int flags; int mode; } */ bsd_open_args; caddr_t sg; sg = stackgap_init(); CHECKALTCREAT(td, &sg, args->path); #ifdef DEBUG if (ldebug(creat)) printf(ARGS(creat, "%s, %d"), args->path, args->mode); #endif bsd_open_args.path = args->path; bsd_open_args.mode = args->mode; bsd_open_args.flags = O_WRONLY | O_CREAT | O_TRUNC; return open(td, &bsd_open_args); } #endif /*!__alpha__*/ int linux_open(struct thread *td, struct linux_open_args *args) { struct open_args /* { char *path; int flags; int mode; } */ bsd_open_args; struct proc *p = td->td_proc; int error; caddr_t sg; sg = stackgap_init(); if (args->flags & LINUX_O_CREAT) CHECKALTCREAT(td, &sg, args->path); else CHECKALTEXIST(td, &sg, args->path); #ifdef DEBUG if (ldebug(open)) printf(ARGS(open, "%s, 0x%x, 0x%x"), args->path, args->flags, args->mode); #endif bsd_open_args.flags = 0; if (args->flags & LINUX_O_RDONLY) bsd_open_args.flags |= O_RDONLY; if (args->flags & LINUX_O_WRONLY) bsd_open_args.flags |= O_WRONLY; if (args->flags & LINUX_O_RDWR) bsd_open_args.flags |= O_RDWR; if (args->flags & LINUX_O_NDELAY) bsd_open_args.flags |= O_NONBLOCK; if (args->flags & LINUX_O_APPEND) 
bsd_open_args.flags |= O_APPEND; if (args->flags & LINUX_O_SYNC) bsd_open_args.flags |= O_FSYNC; if (args->flags & LINUX_O_NONBLOCK) bsd_open_args.flags |= O_NONBLOCK; if (args->flags & LINUX_FASYNC) bsd_open_args.flags |= O_ASYNC; if (args->flags & LINUX_O_CREAT) bsd_open_args.flags |= O_CREAT; if (args->flags & LINUX_O_TRUNC) bsd_open_args.flags |= O_TRUNC; if (args->flags & LINUX_O_EXCL) bsd_open_args.flags |= O_EXCL; if (args->flags & LINUX_O_NOCTTY) bsd_open_args.flags |= O_NOCTTY; bsd_open_args.path = args->path; bsd_open_args.mode = args->mode; error = open(td, &bsd_open_args); PROC_LOCK(p); if (!error && !(bsd_open_args.flags & O_NOCTTY) && SESS_LEADER(p) && !(p->p_flag & P_CONTROLT)) { - struct filedesc *fdp = p->p_fd; - struct file *fp = fdp->fd_ofiles[td->td_retval[0]]; + struct file *fp; + fp = ffind_hold(td, td->td_retval[0]); PROC_UNLOCK(p); if (fp->f_type == DTYPE_VNODE) fo_ioctl(fp, TIOCSCTTY, (caddr_t) 0, td); + fdrop(fp, td); } else PROC_UNLOCK(p); #ifdef DEBUG if (ldebug(open)) printf(LMSG("open returns error %d"), error); #endif return error; } int linux_lseek(struct thread *td, struct linux_lseek_args *args) { struct lseek_args /* { int fd; int pad; off_t offset; int whence; } */ tmp_args; int error; #ifdef DEBUG if (ldebug(lseek)) printf(ARGS(lseek, "%d, %ld, %d"), args->fdes, (long)args->off, args->whence); #endif tmp_args.fd = args->fdes; tmp_args.offset = (off_t)args->off; tmp_args.whence = args->whence; error = lseek(td, &tmp_args); return error; } #ifndef __alpha__ int linux_llseek(struct thread *td, struct linux_llseek_args *args) { struct lseek_args bsd_args; int error; off_t off; #ifdef DEBUG if (ldebug(llseek)) printf(ARGS(llseek, "%d, %d:%d, %d"), args->fd, args->ohigh, args->olow, args->whence); #endif off = (args->olow) | (((off_t) args->ohigh) << 32); bsd_args.fd = args->fd; bsd_args.offset = off; bsd_args.whence = args->whence; if ((error = lseek(td, &bsd_args))) return error; if ((error = copyout(td->td_retval, 
(caddr_t)args->res, sizeof (off_t)))) return error; td->td_retval[0] = 0; return 0; } #endif /*!__alpha__*/ #ifndef __alpha__ int linux_readdir(struct thread *td, struct linux_readdir_args *args) { struct linux_getdents_args lda; lda.fd = args->fd; lda.dent = args->dent; lda.count = 1; return linux_getdents(td, &lda); } #endif /*!__alpha__*/ /* * Note that linux_getdents(2) and linux_getdents64(2) have the same * arguments. They only differ in the definition of struct dirent they * operate on. We use this to common the code, with the exception of * accessing struct dirent. Note that linux_readdir(2) is implemented * by means of linux_getdents(2). In this case we never operate on * struct dirent64 and thus don't need to handle it... */ struct l_dirent { l_long d_ino; l_off_t d_off; l_ushort d_reclen; char d_name[LINUX_NAME_MAX + 1]; }; struct l_dirent64 { uint64_t d_ino; int64_t d_off; l_ushort d_reclen; u_char d_type; char d_name[LINUX_NAME_MAX + 1]; }; #define LINUX_RECLEN(de,namlen) \ ALIGN((((char *)&(de)->d_name - (char *)de) + (namlen) + 1)) #define LINUX_DIRBLKSIZ 512 static int getdents_common(struct thread *td, struct linux_getdents64_args *args, int is64bit) { register struct dirent *bdp; struct vnode *vp; caddr_t inp, buf; /* BSD-format */ int len, reclen; /* BSD-format */ caddr_t outp; /* Linux-format */ int resid, linuxreclen=0; /* Linux-format */ struct file *fp; struct uio auio; struct iovec aiov; struct vattr va; off_t off; struct l_dirent linux_dirent; struct l_dirent64 linux_dirent64; int buflen, error, eofflag, nbytes, justone; u_long *cookies = NULL, *cookiep; int ncookies; if ((error = getvnode(td->td_proc->p_fd, args->fd, &fp)) != 0) return (error); - if ((fp->f_flag & FREAD) == 0) + if ((fp->f_flag & FREAD) == 0) { + fdrop(fp, td); return (EBADF); + } vp = (struct vnode *) fp->f_data; - if (vp->v_type != VDIR) + if (vp->v_type != VDIR) { + fdrop(fp, td); return (EINVAL); + } - if ((error = VOP_GETATTR(vp, &va, td->td_proc->p_ucred, td))) + if 
((error = VOP_GETATTR(vp, &va, td->td_proc->p_ucred, td))) { + fdrop(fp, td); return (error); + } nbytes = args->count; if (nbytes == 1) { /* readdir(2) case. Always struct dirent. */ - if (is64bit) + if (is64bit) { + fdrop(fp, td); return (EINVAL); + } nbytes = sizeof(linux_dirent); justone = 1; } else justone = 0; off = fp->f_offset; buflen = max(LINUX_DIRBLKSIZ, nbytes); buflen = min(buflen, MAXBSIZE); buf = malloc(buflen, M_TEMP, M_WAITOK); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); again: aiov.iov_base = buf; aiov.iov_len = buflen; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_resid = buflen; auio.uio_offset = off; if (cookies) { free(cookies, M_TEMP); cookies = NULL; } if ((error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, &ncookies, &cookies))) goto out; inp = buf; outp = (caddr_t)args->dirent; resid = nbytes; if ((len = buflen - auio.uio_resid) <= 0) goto eof; cookiep = cookies; if (cookies) { /* * When using cookies, the vfs has the option of reading from * a different offset than that supplied (UFS truncates the * offset to a block boundary to make sure that it never reads * partway through a directory entry, even if the directory * has been compacted). */ while (len > 0 && ncookies > 0 && *cookiep <= off) { bdp = (struct dirent *) inp; len -= bdp->d_reclen; inp += bdp->d_reclen; cookiep++; ncookies--; } } while (len > 0) { if (cookiep && ncookies == 0) break; bdp = (struct dirent *) inp; reclen = bdp->d_reclen; if (reclen & 3) { error = EFAULT; goto out; } if (bdp->d_fileno == 0) { inp += reclen; if (cookiep) { off = *cookiep++; ncookies--; } else off += reclen; len -= reclen; continue; } linuxreclen = (is64bit) ? LINUX_RECLEN(&linux_dirent64, bdp->d_namlen) : LINUX_RECLEN(&linux_dirent, bdp->d_namlen); if (reclen > len || resid < linuxreclen) { outp++; break; } if (justone) { /* readdir(2) case. 
*/ linux_dirent.d_ino = (l_long)bdp->d_fileno; linux_dirent.d_off = (l_off_t)linuxreclen; linux_dirent.d_reclen = (l_ushort)bdp->d_namlen; strcpy(linux_dirent.d_name, bdp->d_name); error = copyout(&linux_dirent, outp, linuxreclen); } else { if (is64bit) { linux_dirent64.d_ino = bdp->d_fileno; linux_dirent64.d_off = (cookiep) ? (l_off_t)*cookiep : (l_off_t)(off + reclen); linux_dirent64.d_reclen = (l_ushort)linuxreclen; linux_dirent64.d_type = bdp->d_type; strcpy(linux_dirent64.d_name, bdp->d_name); error = copyout(&linux_dirent64, outp, linuxreclen); } else { linux_dirent.d_ino = bdp->d_fileno; linux_dirent.d_off = (cookiep) ? (l_off_t)*cookiep : (l_off_t)(off + reclen); linux_dirent.d_reclen = (l_ushort)linuxreclen; strcpy(linux_dirent.d_name, bdp->d_name); error = copyout(&linux_dirent, outp, linuxreclen); } } if (error) goto out; inp += reclen; if (cookiep) { off = *cookiep++; ncookies--; } else off += reclen; outp += linuxreclen; resid -= linuxreclen; len -= reclen; if (justone) break; } if (outp == (caddr_t)args->dirent) goto again; fp->f_offset = off; if (justone) nbytes = resid + linuxreclen; eof: td->td_retval[0] = nbytes - resid; out: if (cookies) free(cookies, M_TEMP); VOP_UNLOCK(vp, 0, td); + fdrop(fp, td); free(buf, M_TEMP); return (error); } int linux_getdents(struct thread *td, struct linux_getdents_args *args) { #ifdef DEBUG if (ldebug(getdents)) printf(ARGS(getdents, "%d, *, %d"), args->fd, args->count); #endif return (getdents_common(td, (struct linux_getdents64_args*)args, 0)); } int linux_getdents64(struct thread *td, struct linux_getdents64_args *args) { #ifdef DEBUG if (ldebug(getdents64)) printf(ARGS(getdents64, "%d, *, %d"), args->fd, args->count); #endif return (getdents_common(td, args, 1)); } /* * These exist mainly for hooks for doing /compat/linux translation. 
*/ int linux_access(struct thread *td, struct linux_access_args *args) { struct access_args bsd; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, args->path); #ifdef DEBUG if (ldebug(access)) printf(ARGS(access, "%s, %d"), args->path, args->flags); #endif bsd.path = args->path; bsd.flags = args->flags; return access(td, &bsd); } int linux_unlink(struct thread *td, struct linux_unlink_args *args) { struct unlink_args bsd; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, args->path); #ifdef DEBUG if (ldebug(unlink)) printf(ARGS(unlink, "%s"), args->path); #endif bsd.path = args->path; return unlink(td, &bsd); } int linux_chdir(struct thread *td, struct linux_chdir_args *args) { struct chdir_args bsd; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, args->path); #ifdef DEBUG if (ldebug(chdir)) printf(ARGS(chdir, "%s"), args->path); #endif bsd.path = args->path; return chdir(td, &bsd); } int linux_chmod(struct thread *td, struct linux_chmod_args *args) { struct chmod_args bsd; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, args->path); #ifdef DEBUG if (ldebug(chmod)) printf(ARGS(chmod, "%s, %d"), args->path, args->mode); #endif bsd.path = args->path; bsd.mode = args->mode; return chmod(td, &bsd); } int linux_mkdir(struct thread *td, struct linux_mkdir_args *args) { struct mkdir_args bsd; caddr_t sg; sg = stackgap_init(); CHECKALTCREAT(td, &sg, args->path); #ifdef DEBUG if (ldebug(mkdir)) printf(ARGS(mkdir, "%s, %d"), args->path, args->mode); #endif bsd.path = args->path; bsd.mode = args->mode; return mkdir(td, &bsd); } int linux_rmdir(struct thread *td, struct linux_rmdir_args *args) { struct rmdir_args bsd; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, args->path); #ifdef DEBUG if (ldebug(rmdir)) printf(ARGS(rmdir, "%s"), args->path); #endif bsd.path = args->path; return rmdir(td, &bsd); } int linux_rename(struct thread *td, struct linux_rename_args *args) { struct rename_args bsd; caddr_t sg; sg = stackgap_init(); 
CHECKALTEXIST(td, &sg, args->from); CHECKALTCREAT(td, &sg, args->to); #ifdef DEBUG if (ldebug(rename)) printf(ARGS(rename, "%s, %s"), args->from, args->to); #endif bsd.from = args->from; bsd.to = args->to; return rename(td, &bsd); } int linux_symlink(struct thread *td, struct linux_symlink_args *args) { struct symlink_args bsd; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, args->path); CHECKALTCREAT(td, &sg, args->to); #ifdef DEBUG if (ldebug(symlink)) printf(ARGS(symlink, "%s, %s"), args->path, args->to); #endif bsd.path = args->path; bsd.link = args->to; return symlink(td, &bsd); } int linux_readlink(struct thread *td, struct linux_readlink_args *args) { struct readlink_args bsd; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, args->name); #ifdef DEBUG if (ldebug(readlink)) printf(ARGS(readlink, "%s, %p, %d"), args->name, (void *)args->buf, args->count); #endif bsd.path = args->name; bsd.buf = args->buf; bsd.count = args->count; return readlink(td, &bsd); } int linux_truncate(struct thread *td, struct linux_truncate_args *args) { struct truncate_args bsd; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, args->path); #ifdef DEBUG if (ldebug(truncate)) printf(ARGS(truncate, "%s, %ld"), args->path, (long)args->length); #endif bsd.path = args->path; bsd.length = args->length; return truncate(td, &bsd); } int linux_link(struct thread *td, struct linux_link_args *args) { struct link_args bsd; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, args->path); CHECKALTCREAT(td, &sg, args->to); #ifdef DEBUG if (ldebug(link)) printf(ARGS(link, "%s, %s"), args->path, args->to); #endif bsd.path = args->path; bsd.link = args->to; return link(td, &bsd); } #ifndef __alpha__ int linux_fdatasync(td, uap) struct thread *td; struct linux_fdatasync_args *uap; { struct fsync_args bsd; bsd.fd = uap->fd; return fsync(td, &bsd); } #endif /*!__alpha__*/ int linux_pread(td, uap) struct thread *td; struct linux_pread_args *uap; { struct pread_args bsd; bsd.fd 
= uap->fd; bsd.buf = uap->buf; bsd.nbyte = uap->nbyte; bsd.offset = uap->offset; return pread(td, &bsd); } int linux_pwrite(td, uap) struct thread *td; struct linux_pwrite_args *uap; { struct pwrite_args bsd; bsd.fd = uap->fd; bsd.buf = uap->buf; bsd.nbyte = uap->nbyte; bsd.offset = uap->offset; return pwrite(td, &bsd); } int linux_mount(struct thread *td, struct linux_mount_args *args) { struct ufs_args ufs; char fstypename[MFSNAMELEN]; char mntonname[MNAMELEN], mntfromname[MNAMELEN]; int error; int fsflags; const char *fstype; void *fsdata; error = copyinstr(args->filesystemtype, fstypename, MFSNAMELEN - 1, NULL); if (error) return (error); error = copyinstr(args->specialfile, mntfromname, MFSNAMELEN - 1, NULL); if (error) return (error); error = copyinstr(args->dir, mntonname, MFSNAMELEN - 1, NULL); if (error) return (error); #ifdef DEBUG if (ldebug(mount)) printf(ARGS(mount, "%s, %s, %s"), fstypename, mntfromname, mntonname); #endif if (strcmp(fstypename, "ext2") == 0) { fstype = "ext2fs"; fsdata = &ufs; ufs.fspec = mntfromname; #define DEFAULT_ROOTID -2 ufs.export.ex_root = DEFAULT_ROOTID; ufs.export.ex_flags = args->rwflag & LINUX_MS_RDONLY ? MNT_EXRDONLY : 0; } else if (strcmp(fstypename, "proc") == 0) { fstype = "linprocfs"; fsdata = NULL; } else { return (ENODEV); } fsflags = 0; if ((args->rwflag & 0xffff0000) == 0xc0ed0000) { /* * Linux SYNC flag is not included; the closest equivalent * FreeBSD has is !ASYNC, which is our default. 
*/ if (args->rwflag & LINUX_MS_RDONLY) fsflags |= MNT_RDONLY; if (args->rwflag & LINUX_MS_NOSUID) fsflags |= MNT_NOSUID; if (args->rwflag & LINUX_MS_NODEV) fsflags |= MNT_NODEV; if (args->rwflag & LINUX_MS_NOEXEC) fsflags |= MNT_NOEXEC; if (args->rwflag & LINUX_MS_REMOUNT) fsflags |= MNT_UPDATE; } return (vfs_mount(td, fstype, mntonname, fsflags, fsdata)); } int linux_oldumount(struct thread *td, struct linux_oldumount_args *args) { struct linux_umount_args args2; args2.path = args->path; args2.flags = 0; return (linux_umount(td, &args2)); } int linux_umount(struct thread *td, struct linux_umount_args *args) { struct unmount_args bsd; bsd.path = args->path; bsd.flags = args->flags; /* XXX correct? */ return (unmount(td, &bsd)); } /* * fcntl family of syscalls */ struct l_flock { l_short l_type; l_short l_whence; l_off_t l_start; l_off_t l_len; l_pid_t l_pid; }; static void linux_to_bsd_flock(struct l_flock *linux_flock, struct flock *bsd_flock) { switch (linux_flock->l_type) { case LINUX_F_RDLCK: bsd_flock->l_type = F_RDLCK; break; case LINUX_F_WRLCK: bsd_flock->l_type = F_WRLCK; break; case LINUX_F_UNLCK: bsd_flock->l_type = F_UNLCK; break; default: bsd_flock->l_type = -1; break; } bsd_flock->l_whence = linux_flock->l_whence; bsd_flock->l_start = (off_t)linux_flock->l_start; bsd_flock->l_len = (off_t)linux_flock->l_len; bsd_flock->l_pid = (pid_t)linux_flock->l_pid; } static void bsd_to_linux_flock(struct flock *bsd_flock, struct l_flock *linux_flock) { switch (bsd_flock->l_type) { case F_RDLCK: linux_flock->l_type = LINUX_F_RDLCK; break; case F_WRLCK: linux_flock->l_type = LINUX_F_WRLCK; break; case F_UNLCK: linux_flock->l_type = LINUX_F_UNLCK; break; } linux_flock->l_whence = bsd_flock->l_whence; linux_flock->l_start = (l_off_t)bsd_flock->l_start; linux_flock->l_len = (l_off_t)bsd_flock->l_len; linux_flock->l_pid = (l_pid_t)bsd_flock->l_pid; } #if defined(__i386__) struct l_flock64 { l_short l_type; l_short l_whence; l_loff_t l_start; l_loff_t l_len; l_pid_t 
l_pid; }; static void linux_to_bsd_flock64(struct l_flock64 *linux_flock, struct flock *bsd_flock) { switch (linux_flock->l_type) { case LINUX_F_RDLCK: bsd_flock->l_type = F_RDLCK; break; case LINUX_F_WRLCK: bsd_flock->l_type = F_WRLCK; break; case LINUX_F_UNLCK: bsd_flock->l_type = F_UNLCK; break; default: bsd_flock->l_type = -1; break; } bsd_flock->l_whence = linux_flock->l_whence; bsd_flock->l_start = (off_t)linux_flock->l_start; bsd_flock->l_len = (off_t)linux_flock->l_len; bsd_flock->l_pid = (pid_t)linux_flock->l_pid; } static void bsd_to_linux_flock64(struct flock *bsd_flock, struct l_flock64 *linux_flock) { switch (bsd_flock->l_type) { case F_RDLCK: linux_flock->l_type = LINUX_F_RDLCK; break; case F_WRLCK: linux_flock->l_type = LINUX_F_WRLCK; break; case F_UNLCK: linux_flock->l_type = LINUX_F_UNLCK; break; } linux_flock->l_whence = bsd_flock->l_whence; linux_flock->l_start = (l_loff_t)bsd_flock->l_start; linux_flock->l_len = (l_loff_t)bsd_flock->l_len; linux_flock->l_pid = (l_pid_t)bsd_flock->l_pid; } #endif /* __i386__ */ #if defined(__alpha__) #define linux_fcntl64_args linux_fcntl_args #endif static int fcntl_common(struct thread *td, struct linux_fcntl64_args *args) { struct fcntl_args fcntl_args; struct filedesc *fdp; struct file *fp; int error, result; fcntl_args.fd = args->fd; switch (args->cmd) { case LINUX_F_DUPFD: fcntl_args.cmd = F_DUPFD; fcntl_args.arg = args->arg; return (fcntl(td, &fcntl_args)); case LINUX_F_GETFD: fcntl_args.cmd = F_GETFD; return (fcntl(td, &fcntl_args)); case LINUX_F_SETFD: fcntl_args.cmd = F_SETFD; fcntl_args.arg = args->arg; return (fcntl(td, &fcntl_args)); case LINUX_F_GETFL: fcntl_args.cmd = F_GETFL; error = fcntl(td, &fcntl_args); result = td->td_retval[0]; td->td_retval[0] = 0; if (result & O_RDONLY) td->td_retval[0] |= LINUX_O_RDONLY; if (result & O_WRONLY) td->td_retval[0] |= LINUX_O_WRONLY; if (result & O_RDWR) td->td_retval[0] |= LINUX_O_RDWR; if (result & O_NDELAY) td->td_retval[0] |= LINUX_O_NONBLOCK; if (result & 
O_APPEND) td->td_retval[0] |= LINUX_O_APPEND; if (result & O_FSYNC) td->td_retval[0] |= LINUX_O_SYNC; if (result & O_ASYNC) td->td_retval[0] |= LINUX_FASYNC; return (error); case LINUX_F_SETFL: fcntl_args.arg = 0; if (args->arg & LINUX_O_NDELAY) fcntl_args.arg |= O_NONBLOCK; if (args->arg & LINUX_O_APPEND) fcntl_args.arg |= O_APPEND; if (args->arg & LINUX_O_SYNC) fcntl_args.arg |= O_FSYNC; if (args->arg & LINUX_FASYNC) fcntl_args.arg |= O_ASYNC; fcntl_args.cmd = F_SETFL; return (fcntl(td, &fcntl_args)); case LINUX_F_GETOWN: fcntl_args.cmd = F_GETOWN; return (fcntl(td, &fcntl_args)); case LINUX_F_SETOWN: /* * XXX some Linux applications depend on F_SETOWN having no * significant effect for pipes (SIGIO is not delivered for * pipes under Linux-2.2.35 at least). */ - fdp = td->td_proc->p_fd; - if ((u_int)args->fd >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[args->fd]) == NULL) - return (EBADF); - if (fp->f_type == DTYPE_PIPE) + fp = ffind_hold(td, args->fd); + if (fp == NULL) + return EBADF; + if (fp->f_type == DTYPE_PIPE) { + fdrop(fp, td); return (EINVAL); + } + fdrop(fp, td); fcntl_args.cmd = F_SETOWN; fcntl_args.arg = args->arg; return (fcntl(td, &fcntl_args)); } return (EINVAL); } int linux_fcntl(struct thread *td, struct linux_fcntl_args *args) { struct linux_fcntl64_args args64; struct fcntl_args fcntl_args; struct l_flock linux_flock; struct flock *bsd_flock; int error; caddr_t sg; sg = stackgap_init(); bsd_flock = (struct flock *)stackgap_alloc(&sg, sizeof(bsd_flock)); #ifdef DEBUG if (ldebug(fcntl)) printf(ARGS(fcntl, "%d, %08x, *"), args->fd, args->cmd); #endif switch (args->cmd) { case LINUX_F_GETLK: error = copyin((caddr_t)args->arg, &linux_flock, sizeof(linux_flock)); if (error) return (error); linux_to_bsd_flock(&linux_flock, bsd_flock); fcntl_args.fd = args->fd; fcntl_args.cmd = F_GETLK; fcntl_args.arg = (long)bsd_flock; error = fcntl(td, &fcntl_args); if (error) return (error); bsd_to_linux_flock(bsd_flock, &linux_flock); return (copyout(&linux_flock, 
(caddr_t)args->arg, sizeof(linux_flock))); case LINUX_F_SETLK: error = copyin((caddr_t)args->arg, &linux_flock, sizeof(linux_flock)); if (error) return (error); linux_to_bsd_flock(&linux_flock, bsd_flock); fcntl_args.fd = args->fd; fcntl_args.cmd = F_SETLK; fcntl_args.arg = (long)bsd_flock; return (fcntl(td, &fcntl_args)); case LINUX_F_SETLKW: error = copyin((caddr_t)args->arg, &linux_flock, sizeof(linux_flock)); if (error) return (error); linux_to_bsd_flock(&linux_flock, bsd_flock); fcntl_args.fd = args->fd; fcntl_args.cmd = F_SETLKW; fcntl_args.arg = (long)bsd_flock; return (fcntl(td, &fcntl_args)); } args64.fd = args->fd; args64.cmd = args->cmd; args64.arg = args->arg; return (fcntl_common(td, &args64)); } #if defined(__i386__) int linux_fcntl64(struct thread *td, struct linux_fcntl64_args *args) { struct fcntl_args fcntl_args; struct l_flock64 linux_flock; struct flock *bsd_flock; int error; caddr_t sg; sg = stackgap_init(); bsd_flock = (struct flock *)stackgap_alloc(&sg, sizeof(bsd_flock)); #ifdef DEBUG if (ldebug(fcntl64)) printf(ARGS(fcntl64, "%d, %08x, *"), args->fd, args->cmd); #endif switch (args->cmd) { case LINUX_F_GETLK: error = copyin((caddr_t)args->arg, &linux_flock, sizeof(linux_flock)); if (error) return (error); linux_to_bsd_flock64(&linux_flock, bsd_flock); fcntl_args.fd = args->fd; fcntl_args.cmd = F_GETLK; fcntl_args.arg = (long)bsd_flock; error = fcntl(td, &fcntl_args); if (error) return (error); bsd_to_linux_flock64(bsd_flock, &linux_flock); return (copyout(&linux_flock, (caddr_t)args->arg, sizeof(linux_flock))); case LINUX_F_SETLK: error = copyin((caddr_t)args->arg, &linux_flock, sizeof(linux_flock)); if (error) return (error); linux_to_bsd_flock64(&linux_flock, bsd_flock); fcntl_args.fd = args->fd; fcntl_args.cmd = F_SETLK; fcntl_args.arg = (long)bsd_flock; return (fcntl(td, &fcntl_args)); case LINUX_F_SETLKW: error = copyin((caddr_t)args->arg, &linux_flock, sizeof(linux_flock)); if (error) return (error); linux_to_bsd_flock64(&linux_flock, 
bsd_flock); fcntl_args.fd = args->fd; fcntl_args.cmd = F_SETLKW; fcntl_args.arg = (long)bsd_flock; return (fcntl(td, &fcntl_args)); } return (fcntl_common(td, args)); } #endif /* __i386__ */ int linux_chown(struct thread *td, struct linux_chown_args *args) { struct chown_args bsd; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, args->path); #ifdef DEBUG if (ldebug(chown)) printf(ARGS(chown, "%s, %d, %d"), args->path, args->uid, args->gid); #endif bsd.path = args->path; bsd.uid = args->uid; bsd.gid = args->gid; return (chown(td, &bsd)); } int linux_lchown(struct thread *td, struct linux_lchown_args *args) { struct lchown_args bsd; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, args->path); #ifdef DEBUG if (ldebug(lchown)) printf(ARGS(lchown, "%s, %d, %d"), args->path, args->uid, args->gid); #endif bsd.path = args->path; bsd.uid = args->uid; bsd.gid = args->gid; return (lchown(td, &bsd)); } Index: head/sys/compat/linux/linux_ioctl.c =================================================================== --- head/sys/compat/linux/linux_ioctl.c (revision 89305) +++ head/sys/compat/linux/linux_ioctl.c (revision 89306) @@ -1,2256 +1,2346 @@ /* * Copyright (c) 1994-1995 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
The name of the author may not be used to endorse or promote products * derived from this software withough specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static linux_ioctl_function_t linux_ioctl_cdrom; static linux_ioctl_function_t linux_ioctl_console; static linux_ioctl_function_t linux_ioctl_disk; static linux_ioctl_function_t linux_ioctl_socket; static linux_ioctl_function_t linux_ioctl_sound; static linux_ioctl_function_t linux_ioctl_termio; static linux_ioctl_function_t linux_ioctl_private; static linux_ioctl_function_t linux_ioctl_special; static struct linux_ioctl_handler cdrom_handler = { linux_ioctl_cdrom, LINUX_IOCTL_CDROM_MIN, LINUX_IOCTL_CDROM_MAX }; static struct linux_ioctl_handler console_handler = { linux_ioctl_console, LINUX_IOCTL_CONSOLE_MIN, LINUX_IOCTL_CONSOLE_MAX }; static struct linux_ioctl_handler disk_handler = { linux_ioctl_disk, LINUX_IOCTL_DISK_MIN, LINUX_IOCTL_DISK_MAX }; static struct linux_ioctl_handler socket_handler = { linux_ioctl_socket, 
LINUX_IOCTL_SOCKET_MIN, LINUX_IOCTL_SOCKET_MAX }; static struct linux_ioctl_handler sound_handler = { linux_ioctl_sound, LINUX_IOCTL_SOUND_MIN, LINUX_IOCTL_SOUND_MAX }; static struct linux_ioctl_handler termio_handler = { linux_ioctl_termio, LINUX_IOCTL_TERMIO_MIN, LINUX_IOCTL_TERMIO_MAX }; static struct linux_ioctl_handler private_handler = { linux_ioctl_private, LINUX_IOCTL_PRIVATE_MIN, LINUX_IOCTL_PRIVATE_MAX }; DATA_SET(linux_ioctl_handler_set, cdrom_handler); DATA_SET(linux_ioctl_handler_set, console_handler); DATA_SET(linux_ioctl_handler_set, disk_handler); DATA_SET(linux_ioctl_handler_set, socket_handler); DATA_SET(linux_ioctl_handler_set, sound_handler); DATA_SET(linux_ioctl_handler_set, termio_handler); DATA_SET(linux_ioctl_handler_set, private_handler); struct handler_element { TAILQ_ENTRY(handler_element) list; int (*func)(struct thread *, struct linux_ioctl_args *); int low, high, span; }; static TAILQ_HEAD(, handler_element) handlers = TAILQ_HEAD_INITIALIZER(handlers); static int linux_ioctl_disk(struct thread *td, struct linux_ioctl_args *args) { - struct file *fp = td->td_proc->p_fd->fd_ofiles[args->fd]; + struct file *fp; int error; struct disklabel dl; + fp = ffind_hold(td, args->fd); + if (fp == NULL) + return (EBADF); switch (args->cmd & 0xffff) { case LINUX_BLKGETSIZE: error = fo_ioctl(fp, DIOCGDINFO, (caddr_t)&dl, td); + fdrop(fp, td); if (error) return (error); return (copyout(&(dl.d_secperunit), (caddr_t)args->arg, sizeof(dl.d_secperunit))); - break; } + fdrop(fp, td); return (ENOIOCTL); } /* * termio related ioctls */ struct linux_termio { unsigned short c_iflag; unsigned short c_oflag; unsigned short c_cflag; unsigned short c_lflag; unsigned char c_line; unsigned char c_cc[LINUX_NCC]; }; struct linux_termios { unsigned int c_iflag; unsigned int c_oflag; unsigned int c_cflag; unsigned int c_lflag; #ifdef __alpha__ unsigned char c_cc[LINUX_NCCS]; unsigned char c_line; unsigned int c_ispeed; unsigned int c_ospeed; #else unsigned char c_line; 
unsigned char c_cc[LINUX_NCCS]; #endif }; struct linux_winsize { unsigned short ws_row, ws_col; unsigned short ws_xpixel, ws_ypixel; }; static struct speedtab sptab[] = { { B0, LINUX_B0 }, { B50, LINUX_B50 }, { B75, LINUX_B75 }, { B110, LINUX_B110 }, { B134, LINUX_B134 }, { B150, LINUX_B150 }, { B200, LINUX_B200 }, { B300, LINUX_B300 }, { B600, LINUX_B600 }, { B1200, LINUX_B1200 }, { B1800, LINUX_B1800 }, { B2400, LINUX_B2400 }, { B4800, LINUX_B4800 }, { B9600, LINUX_B9600 }, { B19200, LINUX_B19200 }, { B38400, LINUX_B38400 }, { B57600, LINUX_B57600 }, { B115200, LINUX_B115200 }, {-1, -1 } }; struct linux_serial_struct { int type; int line; int port; int irq; int flags; int xmit_fifo_size; int custom_divisor; int baud_base; unsigned short close_delay; char reserved_char[2]; int hub6; unsigned short closing_wait; unsigned short closing_wait2; int reserved[4]; }; static int linux_to_bsd_speed(int code, struct speedtab *table) { for ( ; table->sp_code != -1; table++) if (table->sp_code == code) return (table->sp_speed); return -1; } static int bsd_to_linux_speed(int speed, struct speedtab *table) { for ( ; table->sp_speed != -1; table++) if (table->sp_speed == speed) return (table->sp_code); return -1; } static void bsd_to_linux_termios(struct termios *bios, struct linux_termios *lios) { int i; #ifdef DEBUG if (ldebug(ioctl)) { printf("LINUX: BSD termios structure (input):\n"); printf("i=%08x o=%08x c=%08x l=%08x ispeed=%d ospeed=%d\n", bios->c_iflag, bios->c_oflag, bios->c_cflag, bios->c_lflag, bios->c_ispeed, bios->c_ospeed); printf("c_cc "); for (i=0; ic_cc[i]); printf("\n"); } #endif lios->c_iflag = 0; if (bios->c_iflag & IGNBRK) lios->c_iflag |= LINUX_IGNBRK; if (bios->c_iflag & BRKINT) lios->c_iflag |= LINUX_BRKINT; if (bios->c_iflag & IGNPAR) lios->c_iflag |= LINUX_IGNPAR; if (bios->c_iflag & PARMRK) lios->c_iflag |= LINUX_PARMRK; if (bios->c_iflag & INPCK) lios->c_iflag |= LINUX_INPCK; if (bios->c_iflag & ISTRIP) lios->c_iflag |= LINUX_ISTRIP; if 
(bios->c_iflag & INLCR) lios->c_iflag |= LINUX_INLCR; if (bios->c_iflag & IGNCR) lios->c_iflag |= LINUX_IGNCR; if (bios->c_iflag & ICRNL) lios->c_iflag |= LINUX_ICRNL; if (bios->c_iflag & IXON) lios->c_iflag |= LINUX_IXON; if (bios->c_iflag & IXANY) lios->c_iflag |= LINUX_IXANY; if (bios->c_iflag & IXOFF) lios->c_iflag |= LINUX_IXOFF; if (bios->c_iflag & IMAXBEL) lios->c_iflag |= LINUX_IMAXBEL; lios->c_oflag = 0; if (bios->c_oflag & OPOST) lios->c_oflag |= LINUX_OPOST; if (bios->c_oflag & ONLCR) lios->c_oflag |= LINUX_ONLCR; if (bios->c_oflag & OXTABS) lios->c_oflag |= LINUX_XTABS; lios->c_cflag = bsd_to_linux_speed(bios->c_ispeed, sptab); lios->c_cflag |= (bios->c_cflag & CSIZE) >> 4; if (bios->c_cflag & CSTOPB) lios->c_cflag |= LINUX_CSTOPB; if (bios->c_cflag & CREAD) lios->c_cflag |= LINUX_CREAD; if (bios->c_cflag & PARENB) lios->c_cflag |= LINUX_PARENB; if (bios->c_cflag & PARODD) lios->c_cflag |= LINUX_PARODD; if (bios->c_cflag & HUPCL) lios->c_cflag |= LINUX_HUPCL; if (bios->c_cflag & CLOCAL) lios->c_cflag |= LINUX_CLOCAL; if (bios->c_cflag & CRTSCTS) lios->c_cflag |= LINUX_CRTSCTS; lios->c_lflag = 0; if (bios->c_lflag & ISIG) lios->c_lflag |= LINUX_ISIG; if (bios->c_lflag & ICANON) lios->c_lflag |= LINUX_ICANON; if (bios->c_lflag & ECHO) lios->c_lflag |= LINUX_ECHO; if (bios->c_lflag & ECHOE) lios->c_lflag |= LINUX_ECHOE; if (bios->c_lflag & ECHOK) lios->c_lflag |= LINUX_ECHOK; if (bios->c_lflag & ECHONL) lios->c_lflag |= LINUX_ECHONL; if (bios->c_lflag & NOFLSH) lios->c_lflag |= LINUX_NOFLSH; if (bios->c_lflag & TOSTOP) lios->c_lflag |= LINUX_TOSTOP; if (bios->c_lflag & ECHOCTL) lios->c_lflag |= LINUX_ECHOCTL; if (bios->c_lflag & ECHOPRT) lios->c_lflag |= LINUX_ECHOPRT; if (bios->c_lflag & ECHOKE) lios->c_lflag |= LINUX_ECHOKE; if (bios->c_lflag & FLUSHO) lios->c_lflag |= LINUX_FLUSHO; if (bios->c_lflag & PENDIN) lios->c_lflag |= LINUX_PENDIN; if (bios->c_lflag & IEXTEN) lios->c_lflag |= LINUX_IEXTEN; for (i=0; ic_cc[i] = LINUX_POSIX_VDISABLE; 
lios->c_cc[LINUX_VINTR] = bios->c_cc[VINTR]; lios->c_cc[LINUX_VQUIT] = bios->c_cc[VQUIT]; lios->c_cc[LINUX_VERASE] = bios->c_cc[VERASE]; lios->c_cc[LINUX_VKILL] = bios->c_cc[VKILL]; lios->c_cc[LINUX_VEOF] = bios->c_cc[VEOF]; lios->c_cc[LINUX_VEOL] = bios->c_cc[VEOL]; lios->c_cc[LINUX_VMIN] = bios->c_cc[VMIN]; lios->c_cc[LINUX_VTIME] = bios->c_cc[VTIME]; lios->c_cc[LINUX_VEOL2] = bios->c_cc[VEOL2]; lios->c_cc[LINUX_VSUSP] = bios->c_cc[VSUSP]; lios->c_cc[LINUX_VSTART] = bios->c_cc[VSTART]; lios->c_cc[LINUX_VSTOP] = bios->c_cc[VSTOP]; lios->c_cc[LINUX_VREPRINT] = bios->c_cc[VREPRINT]; lios->c_cc[LINUX_VDISCARD] = bios->c_cc[VDISCARD]; lios->c_cc[LINUX_VWERASE] = bios->c_cc[VWERASE]; lios->c_cc[LINUX_VLNEXT] = bios->c_cc[VLNEXT]; for (i=0; ic_cc[i] == _POSIX_VDISABLE) lios->c_cc[i] = LINUX_POSIX_VDISABLE; } lios->c_line = 0; #ifdef DEBUG if (ldebug(ioctl)) { printf("LINUX: LINUX termios structure (output):\n"); printf("i=%08x o=%08x c=%08x l=%08x line=%d\n", lios->c_iflag, lios->c_oflag, lios->c_cflag, lios->c_lflag, (int)lios->c_line); printf("c_cc "); for (i=0; ic_cc[i]); printf("\n"); } #endif } static void linux_to_bsd_termios(struct linux_termios *lios, struct termios *bios) { int i; #ifdef DEBUG if (ldebug(ioctl)) { printf("LINUX: LINUX termios structure (input):\n"); printf("i=%08x o=%08x c=%08x l=%08x line=%d\n", lios->c_iflag, lios->c_oflag, lios->c_cflag, lios->c_lflag, (int)lios->c_line); printf("c_cc "); for (i=0; ic_cc[i]); printf("\n"); } #endif bios->c_iflag = 0; if (lios->c_iflag & LINUX_IGNBRK) bios->c_iflag |= IGNBRK; if (lios->c_iflag & LINUX_BRKINT) bios->c_iflag |= BRKINT; if (lios->c_iflag & LINUX_IGNPAR) bios->c_iflag |= IGNPAR; if (lios->c_iflag & LINUX_PARMRK) bios->c_iflag |= PARMRK; if (lios->c_iflag & LINUX_INPCK) bios->c_iflag |= INPCK; if (lios->c_iflag & LINUX_ISTRIP) bios->c_iflag |= ISTRIP; if (lios->c_iflag & LINUX_INLCR) bios->c_iflag |= INLCR; if (lios->c_iflag & LINUX_IGNCR) bios->c_iflag |= IGNCR; if (lios->c_iflag & LINUX_ICRNL) 
bios->c_iflag |= ICRNL; if (lios->c_iflag & LINUX_IXON) bios->c_iflag |= IXON; if (lios->c_iflag & LINUX_IXANY) bios->c_iflag |= IXANY; if (lios->c_iflag & LINUX_IXOFF) bios->c_iflag |= IXOFF; if (lios->c_iflag & LINUX_IMAXBEL) bios->c_iflag |= IMAXBEL; bios->c_oflag = 0; if (lios->c_oflag & LINUX_OPOST) bios->c_oflag |= OPOST; if (lios->c_oflag & LINUX_ONLCR) bios->c_oflag |= ONLCR; if (lios->c_oflag & LINUX_XTABS) bios->c_oflag |= OXTABS; bios->c_cflag = (lios->c_cflag & LINUX_CSIZE) << 4; if (lios->c_cflag & LINUX_CSTOPB) bios->c_cflag |= CSTOPB; if (lios->c_cflag & LINUX_CREAD) bios->c_cflag |= CREAD; if (lios->c_cflag & LINUX_PARENB) bios->c_cflag |= PARENB; if (lios->c_cflag & LINUX_PARODD) bios->c_cflag |= PARODD; if (lios->c_cflag & LINUX_HUPCL) bios->c_cflag |= HUPCL; if (lios->c_cflag & LINUX_CLOCAL) bios->c_cflag |= CLOCAL; if (lios->c_cflag & LINUX_CRTSCTS) bios->c_cflag |= CRTSCTS; bios->c_lflag = 0; if (lios->c_lflag & LINUX_ISIG) bios->c_lflag |= ISIG; if (lios->c_lflag & LINUX_ICANON) bios->c_lflag |= ICANON; if (lios->c_lflag & LINUX_ECHO) bios->c_lflag |= ECHO; if (lios->c_lflag & LINUX_ECHOE) bios->c_lflag |= ECHOE; if (lios->c_lflag & LINUX_ECHOK) bios->c_lflag |= ECHOK; if (lios->c_lflag & LINUX_ECHONL) bios->c_lflag |= ECHONL; if (lios->c_lflag & LINUX_NOFLSH) bios->c_lflag |= NOFLSH; if (lios->c_lflag & LINUX_TOSTOP) bios->c_lflag |= TOSTOP; if (lios->c_lflag & LINUX_ECHOCTL) bios->c_lflag |= ECHOCTL; if (lios->c_lflag & LINUX_ECHOPRT) bios->c_lflag |= ECHOPRT; if (lios->c_lflag & LINUX_ECHOKE) bios->c_lflag |= ECHOKE; if (lios->c_lflag & LINUX_FLUSHO) bios->c_lflag |= FLUSHO; if (lios->c_lflag & LINUX_PENDIN) bios->c_lflag |= PENDIN; if (lios->c_lflag & LINUX_IEXTEN) bios->c_lflag |= IEXTEN; for (i=0; ic_cc[i] = _POSIX_VDISABLE; bios->c_cc[VINTR] = lios->c_cc[LINUX_VINTR]; bios->c_cc[VQUIT] = lios->c_cc[LINUX_VQUIT]; bios->c_cc[VERASE] = lios->c_cc[LINUX_VERASE]; bios->c_cc[VKILL] = lios->c_cc[LINUX_VKILL]; bios->c_cc[VEOF] = 
lios->c_cc[LINUX_VEOF]; bios->c_cc[VEOL] = lios->c_cc[LINUX_VEOL]; bios->c_cc[VMIN] = lios->c_cc[LINUX_VMIN]; bios->c_cc[VTIME] = lios->c_cc[LINUX_VTIME]; bios->c_cc[VEOL2] = lios->c_cc[LINUX_VEOL2]; bios->c_cc[VSUSP] = lios->c_cc[LINUX_VSUSP]; bios->c_cc[VSTART] = lios->c_cc[LINUX_VSTART]; bios->c_cc[VSTOP] = lios->c_cc[LINUX_VSTOP]; bios->c_cc[VREPRINT] = lios->c_cc[LINUX_VREPRINT]; bios->c_cc[VDISCARD] = lios->c_cc[LINUX_VDISCARD]; bios->c_cc[VWERASE] = lios->c_cc[LINUX_VWERASE]; bios->c_cc[VLNEXT] = lios->c_cc[LINUX_VLNEXT]; for (i=0; ic_cc[i] == LINUX_POSIX_VDISABLE) bios->c_cc[i] = _POSIX_VDISABLE; } bios->c_ispeed = bios->c_ospeed = linux_to_bsd_speed(lios->c_cflag & LINUX_CBAUD, sptab); #ifdef DEBUG if (ldebug(ioctl)) { printf("LINUX: BSD termios structure (output):\n"); printf("i=%08x o=%08x c=%08x l=%08x ispeed=%d ospeed=%d\n", bios->c_iflag, bios->c_oflag, bios->c_cflag, bios->c_lflag, bios->c_ispeed, bios->c_ospeed); printf("c_cc "); for (i=0; ic_cc[i]); printf("\n"); } #endif } static void bsd_to_linux_termio(struct termios *bios, struct linux_termio *lio) { struct linux_termios lios; bsd_to_linux_termios(bios, &lios); lio->c_iflag = lios.c_iflag; lio->c_oflag = lios.c_oflag; lio->c_cflag = lios.c_cflag; lio->c_lflag = lios.c_lflag; lio->c_line = lios.c_line; #ifdef __alpha__ lio->c_cc[LINUX__VINTR] = lios.c_cc[LINUX_VINTR]; lio->c_cc[LINUX__VQUIT] = lios.c_cc[LINUX_VQUIT]; lio->c_cc[LINUX__VERASE] = lios.c_cc[LINUX_VERASE]; lio->c_cc[LINUX__VKILL] = lios.c_cc[LINUX_VKILL]; lio->c_cc[LINUX__VEOF] = lios.c_cc[(lios.c_lflag & ICANON) ? LINUX_VEOF : LINUX_VMIN]; lio->c_cc[LINUX__VEOL] = lios.c_cc[(lios.c_lflag & ICANON) ? 
LINUX_VEOL : LINUX_VTIME]; lio->c_cc[LINUX__VEOL2] = lios.c_cc[LINUX_VEOL2]; lio->c_cc[LINUX__VSWTC] = lios.c_cc[LINUX_VSWTC]; #else memcpy(lio->c_cc, lios.c_cc, LINUX_NCC); #endif } static void linux_to_bsd_termio(struct linux_termio *lio, struct termios *bios) { struct linux_termios lios; int i; lios.c_iflag = lio->c_iflag; lios.c_oflag = lio->c_oflag; lios.c_cflag = lio->c_cflag; lios.c_lflag = lio->c_lflag; #ifdef __alpha__ for (i=0; ic_cc[LINUX__VINTR]; lios.c_cc[LINUX_VQUIT] = lio->c_cc[LINUX__VQUIT]; lios.c_cc[LINUX_VERASE] = lio->c_cc[LINUX__VERASE]; lios.c_cc[LINUX_VKILL] = lio->c_cc[LINUX__VKILL]; lios.c_cc[LINUX_VEOL2] = lio->c_cc[LINUX__VEOL2]; lios.c_cc[LINUX_VSWTC] = lio->c_cc[LINUX__VSWTC]; lios.c_cc[(lio->c_lflag & ICANON) ? LINUX_VEOF : LINUX_VMIN] = lio->c_cc[LINUX__VEOF]; lios.c_cc[(lio->c_lflag & ICANON) ? LINUX_VEOL : LINUX_VTIME] = lio->c_cc[LINUX__VEOL]; #else for (i=LINUX_NCC; ic_cc, LINUX_NCC); #endif linux_to_bsd_termios(&lios, bios); } static int linux_ioctl_termio(struct thread *td, struct linux_ioctl_args *args) { struct termios bios; struct linux_termios lios; struct linux_termio lio; - struct file *fp = td->td_proc->p_fd->fd_ofiles[args->fd]; + struct file *fp; int error; + fp = ffind_hold(td, args->fd); + if (fp == NULL) + return (EBADF); switch (args->cmd & 0xffff) { case LINUX_TCGETS: error = fo_ioctl(fp, TIOCGETA, (caddr_t)&bios, td); if (error) - return (error); + break; bsd_to_linux_termios(&bios, &lios); - return copyout(&lios, (caddr_t)args->arg, sizeof(lios)); + error = copyout(&lios, (caddr_t)args->arg, sizeof(lios)); + break; case LINUX_TCSETS: error = copyin((caddr_t)args->arg, &lios, sizeof(lios)); if (error) - return (error); + break; linux_to_bsd_termios(&lios, &bios); - return (fo_ioctl(fp, TIOCSETA, (caddr_t)&bios, td)); + error = (fo_ioctl(fp, TIOCSETA, (caddr_t)&bios, td)); + break; case LINUX_TCSETSW: error = copyin((caddr_t)args->arg, &lios, sizeof(lios)); if (error) - return (error); + break; 
linux_to_bsd_termios(&lios, &bios); - return (fo_ioctl(fp, TIOCSETAW, (caddr_t)&bios, td)); + error = (fo_ioctl(fp, TIOCSETAW, (caddr_t)&bios, td)); + break; case LINUX_TCSETSF: error = copyin((caddr_t)args->arg, &lios, sizeof(lios)); if (error) - return (error); + break; linux_to_bsd_termios(&lios, &bios); - return (fo_ioctl(fp, TIOCSETAF, (caddr_t)&bios, td)); + error = (fo_ioctl(fp, TIOCSETAF, (caddr_t)&bios, td)); + break; case LINUX_TCGETA: error = fo_ioctl(fp, TIOCGETA, (caddr_t)&bios, td); if (error) - return (error); + break; bsd_to_linux_termio(&bios, &lio); - return (copyout(&lio, (caddr_t)args->arg, sizeof(lio))); + error = (copyout(&lio, (caddr_t)args->arg, sizeof(lio))); + break; case LINUX_TCSETA: error = copyin((caddr_t)args->arg, &lio, sizeof(lio)); if (error) - return (error); + break; linux_to_bsd_termio(&lio, &bios); - return (fo_ioctl(fp, TIOCSETA, (caddr_t)&bios, td)); + error = (fo_ioctl(fp, TIOCSETA, (caddr_t)&bios, td)); + break; case LINUX_TCSETAW: error = copyin((caddr_t)args->arg, &lio, sizeof(lio)); if (error) - return (error); + break; linux_to_bsd_termio(&lio, &bios); - return (fo_ioctl(fp, TIOCSETAW, (caddr_t)&bios, td)); + error = (fo_ioctl(fp, TIOCSETAW, (caddr_t)&bios, td)); + break; case LINUX_TCSETAF: error = copyin((caddr_t)args->arg, &lio, sizeof(lio)); if (error) - return (error); + break; linux_to_bsd_termio(&lio, &bios); - return (fo_ioctl(fp, TIOCSETAF, (caddr_t)&bios, td)); + error = (fo_ioctl(fp, TIOCSETAF, (caddr_t)&bios, td)); + break; /* LINUX_TCSBRK */ case LINUX_TCXONC: { switch (args->arg) { case LINUX_TCOOFF: args->cmd = TIOCSTOP; break; case LINUX_TCOON: args->cmd = TIOCSTART; break; case LINUX_TCIOFF: case LINUX_TCION: { int c; struct write_args wr; error = fo_ioctl(fp, TIOCGETA, (caddr_t)&bios, td); if (error) - return (error); + break; + fdrop(fp, td); c = (args->arg == LINUX_TCIOFF) ? 
VSTOP : VSTART; c = bios.c_cc[c]; if (c != _POSIX_VDISABLE) { wr.fd = args->fd; wr.buf = &c; wr.nbyte = sizeof(c); return (write(td, &wr)); } else return (0); } default: + fdrop(fp, td); return (EINVAL); } args->arg = 0; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; } case LINUX_TCFLSH: { args->cmd = TIOCFLUSH; switch (args->arg) { case LINUX_TCIFLUSH: args->arg = FREAD; break; case LINUX_TCOFLUSH: args->arg = FWRITE; break; case LINUX_TCIOFLUSH: args->arg = FREAD | FWRITE; break; default: + fdrop(fp, td); return (EINVAL); } - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; } case LINUX_TIOCEXCL: args->cmd = TIOCEXCL; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_TIOCNXCL: args->cmd = TIOCNXCL; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; /* LINUX_TIOCSCTTY */ case LINUX_TIOCGPGRP: args->cmd = TIOCGPGRP; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_TIOCSPGRP: args->cmd = TIOCSPGRP; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; /* LINUX_TIOCOUTQ */ /* LINUX_TIOCSTI */ case LINUX_TIOCGWINSZ: args->cmd = TIOCGWINSZ; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_TIOCSWINSZ: args->cmd = TIOCSWINSZ; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_TIOCMGET: args->cmd = TIOCMGET; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_TIOCMBIS: args->cmd = TIOCMBIS; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case 
LINUX_TIOCMBIC: args->cmd = TIOCMBIC; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_TIOCMSET: args->cmd = TIOCMSET; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; /* TIOCGSOFTCAR */ /* TIOCSSOFTCAR */ case LINUX_FIONREAD: /* LINUX_TIOCINQ */ args->cmd = FIONREAD; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; /* LINUX_TIOCLINUX */ case LINUX_TIOCCONS: args->cmd = TIOCCONS; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_TIOCGSERIAL: { struct linux_serial_struct lss; lss.type = LINUX_PORT_16550A; lss.flags = 0; lss.close_delay = 0; - return copyout(&lss, (caddr_t)args->arg, sizeof(lss)); + error = copyout(&lss, (caddr_t)args->arg, sizeof(lss)); + break; } case LINUX_TIOCSSERIAL: { struct linux_serial_struct lss; error = copyin((caddr_t)args->arg, &lss, sizeof(lss)); if (error) - return (error); + break; /* XXX - It really helps to have an implementation that * does nothing. NOT! 
*/ - return (0); + error = 0; + break; } /* LINUX_TIOCPKT */ case LINUX_FIONBIO: args->cmd = FIONBIO; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_TIOCNOTTY: args->cmd = TIOCNOTTY; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_TIOCSETD: { int line; switch (args->arg) { case LINUX_N_TTY: line = TTYDISC; break; case LINUX_N_SLIP: line = SLIPDISC; break; case LINUX_N_PPP: line = PPPDISC; break; default: + fdrop(fp, td); return (EINVAL); } - return (fo_ioctl(fp, TIOCSETD, (caddr_t)&line, td)); + error = (fo_ioctl(fp, TIOCSETD, (caddr_t)&line, td)); + break; } case LINUX_TIOCGETD: { int linux_line; int bsd_line = TTYDISC; error = fo_ioctl(fp, TIOCGETD, (caddr_t)&bsd_line, td); if (error) return (error); switch (bsd_line) { case TTYDISC: linux_line = LINUX_N_TTY; break; case SLIPDISC: linux_line = LINUX_N_SLIP; break; case PPPDISC: linux_line = LINUX_N_PPP; break; default: + fdrop(fp, td); return (EINVAL); } - return (copyout(&linux_line, (caddr_t)args->arg, sizeof(int))); + error = (copyout(&linux_line, (caddr_t)args->arg, sizeof(int))); + break; } /* LINUX_TCSBRKP */ /* LINUX_TIOCTTYGSTRUCT */ case LINUX_FIONCLEX: args->cmd = FIONCLEX; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_FIOCLEX: args->cmd = FIOCLEX; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_FIOASYNC: args->cmd = FIOASYNC; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; /* LINUX_TIOCSERCONFIG */ /* LINUX_TIOCSERGWILD */ /* LINUX_TIOCSERSWILD */ /* LINUX_TIOCGLCKTRMIOS */ /* LINUX_TIOCSLCKTRMIOS */ + default: + error = ENOIOCTL; + break; } - return (ENOIOCTL); + fdrop(fp, td); + return (error); } /* * CDROM related ioctls */ struct linux_cdrom_msf { u_char 
cdmsf_min0; u_char cdmsf_sec0; u_char cdmsf_frame0; u_char cdmsf_min1; u_char cdmsf_sec1; u_char cdmsf_frame1; }; struct linux_cdrom_tochdr { u_char cdth_trk0; u_char cdth_trk1; }; union linux_cdrom_addr { struct { u_char minute; u_char second; u_char frame; } msf; int lba; }; struct linux_cdrom_tocentry { u_char cdte_track; u_char cdte_adr:4; u_char cdte_ctrl:4; u_char cdte_format; union linux_cdrom_addr cdte_addr; u_char cdte_datamode; }; struct linux_cdrom_subchnl { u_char cdsc_format; u_char cdsc_audiostatus; u_char cdsc_adr:4; u_char cdsc_ctrl:4; u_char cdsc_trk; u_char cdsc_ind; union linux_cdrom_addr cdsc_absaddr; union linux_cdrom_addr cdsc_reladdr; }; struct l_dvd_layer { u_char book_version:4; u_char book_type:4; u_char min_rate:4; u_char disc_size:4; u_char layer_type:4; u_char track_path:1; u_char nlayers:2; u_char track_density:4; u_char linear_density:4; u_char bca:1; u_int32_t start_sector; u_int32_t end_sector; u_int32_t end_sector_l0; }; struct l_dvd_physical { u_char type; u_char layer_num; struct l_dvd_layer layer[4]; }; struct l_dvd_copyright { u_char type; u_char layer_num; u_char cpst; u_char rmi; }; struct l_dvd_disckey { u_char type; l_uint agid:2; u_char value[2048]; }; struct l_dvd_bca { u_char type; l_int len; u_char value[188]; }; struct l_dvd_manufact { u_char type; u_char layer_num; l_int len; u_char value[2048]; }; typedef union { u_char type; struct l_dvd_physical physical; struct l_dvd_copyright copyright; struct l_dvd_disckey disckey; struct l_dvd_bca bca; struct l_dvd_manufact manufact; } l_dvd_struct; typedef u_char l_dvd_key[5]; typedef u_char l_dvd_challenge[10]; struct l_dvd_lu_send_agid { u_char type; l_uint agid:2; }; struct l_dvd_host_send_challenge { u_char type; l_uint agid:2; l_dvd_challenge chal; }; struct l_dvd_send_key { u_char type; l_uint agid:2; l_dvd_key key; }; struct l_dvd_lu_send_challenge { u_char type; l_uint agid:2; l_dvd_challenge chal; }; struct l_dvd_lu_send_title_key { u_char type; l_uint agid:2; 
l_dvd_key title_key; l_int lba; l_uint cpm:1; l_uint cp_sec:1; l_uint cgms:2; }; struct l_dvd_lu_send_asf { u_char type; l_uint agid:2; l_uint asf:1; }; struct l_dvd_host_send_rpcstate { u_char type; u_char pdrc; }; struct l_dvd_lu_send_rpcstate { u_char type:2; u_char vra:3; u_char ucca:3; u_char region_mask; u_char rpc_scheme; }; typedef union { u_char type; struct l_dvd_lu_send_agid lsa; struct l_dvd_host_send_challenge hsc; struct l_dvd_send_key lsk; struct l_dvd_lu_send_challenge lsc; struct l_dvd_send_key hsk; struct l_dvd_lu_send_title_key lstk; struct l_dvd_lu_send_asf lsasf; struct l_dvd_host_send_rpcstate hrpcs; struct l_dvd_lu_send_rpcstate lrpcs; } l_dvd_authinfo; static void bsd_to_linux_msf_lba(u_char af, union msf_lba *bp, union linux_cdrom_addr *lp) { if (af == CD_LBA_FORMAT) lp->lba = bp->lba; else { lp->msf.minute = bp->msf.minute; lp->msf.second = bp->msf.second; lp->msf.frame = bp->msf.frame; } } static void set_linux_cdrom_addr(union linux_cdrom_addr *addr, int format, int lba) { if (format == LINUX_CDROM_MSF) { addr->msf.frame = lba % 75; lba /= 75; lba += 2; addr->msf.second = lba % 60; addr->msf.minute = lba / 60; } else addr->lba = lba; } static int linux_to_bsd_dvd_struct(l_dvd_struct *lp, struct dvd_struct *bp) { bp->format = lp->type; switch (bp->format) { case DVD_STRUCT_PHYSICAL: if (bp->layer_num >= 4) return (EINVAL); bp->layer_num = lp->physical.layer_num; break; case DVD_STRUCT_COPYRIGHT: bp->layer_num = lp->copyright.layer_num; break; case DVD_STRUCT_DISCKEY: bp->agid = lp->disckey.agid; break; case DVD_STRUCT_BCA: case DVD_STRUCT_MANUFACT: break; default: return (EINVAL); } return (0); } static int bsd_to_linux_dvd_struct(struct dvd_struct *bp, l_dvd_struct *lp) { switch (bp->format) { case DVD_STRUCT_PHYSICAL: { struct dvd_layer *blp = (struct dvd_layer *)bp->data; struct l_dvd_layer *llp = &lp->physical.layer[bp->layer_num]; memset(llp, 0, sizeof(*llp)); llp->book_version = blp->book_version; llp->book_type = blp->book_type; 
llp->min_rate = blp->max_rate; llp->disc_size = blp->disc_size; llp->layer_type = blp->layer_type; llp->track_path = blp->track_path; llp->nlayers = blp->nlayers; llp->track_density = blp->track_density; llp->linear_density = blp->linear_density; llp->bca = blp->bca; llp->start_sector = blp->start_sector; llp->end_sector = blp->end_sector; llp->end_sector_l0 = blp->end_sector_l0; break; } case DVD_STRUCT_COPYRIGHT: lp->copyright.cpst = bp->cpst; lp->copyright.rmi = bp->rmi; break; case DVD_STRUCT_DISCKEY: memcpy(lp->disckey.value, bp->data, sizeof(lp->disckey.value)); break; case DVD_STRUCT_BCA: lp->bca.len = bp->length; memcpy(lp->bca.value, bp->data, sizeof(lp->bca.value)); break; case DVD_STRUCT_MANUFACT: lp->manufact.len = bp->length; memcpy(lp->manufact.value, bp->data, sizeof(lp->manufact.value)); /* lp->manufact.layer_num is unused in linux (redhat 7.0) */ break; default: return (EINVAL); } return (0); } static int linux_to_bsd_dvd_authinfo(l_dvd_authinfo *lp, int *bcode, struct dvd_authinfo *bp) { switch (lp->type) { case LINUX_DVD_LU_SEND_AGID: *bcode = DVDIOCREPORTKEY; bp->format = DVD_REPORT_AGID; bp->agid = lp->lsa.agid; break; case LINUX_DVD_HOST_SEND_CHALLENGE: *bcode = DVDIOCSENDKEY; bp->format = DVD_SEND_CHALLENGE; bp->agid = lp->hsc.agid; memcpy(bp->keychal, lp->hsc.chal, 10); break; case LINUX_DVD_LU_SEND_KEY1: *bcode = DVDIOCREPORTKEY; bp->format = DVD_REPORT_KEY1; bp->agid = lp->lsk.agid; break; case LINUX_DVD_LU_SEND_CHALLENGE: *bcode = DVDIOCREPORTKEY; bp->format = DVD_REPORT_CHALLENGE; bp->agid = lp->lsc.agid; break; case LINUX_DVD_HOST_SEND_KEY2: *bcode = DVDIOCSENDKEY; bp->format = DVD_SEND_KEY2; bp->agid = lp->hsk.agid; memcpy(bp->keychal, lp->hsk.key, 5); break; case LINUX_DVD_LU_SEND_TITLE_KEY: *bcode = DVDIOCREPORTKEY; bp->format = DVD_REPORT_TITLE_KEY; bp->agid = lp->lstk.agid; bp->lba = lp->lstk.lba; break; case LINUX_DVD_LU_SEND_ASF: *bcode = DVDIOCREPORTKEY; bp->format = DVD_REPORT_ASF; bp->agid = lp->lsasf.agid; break; case 
LINUX_DVD_INVALIDATE_AGID: *bcode = DVDIOCREPORTKEY; bp->format = DVD_INVALIDATE_AGID; bp->agid = lp->lsa.agid; break; case LINUX_DVD_LU_SEND_RPC_STATE: *bcode = DVDIOCREPORTKEY; bp->format = DVD_REPORT_RPC; break; case LINUX_DVD_HOST_SEND_RPC_STATE: *bcode = DVDIOCSENDKEY; bp->format = DVD_SEND_RPC; bp->region = lp->hrpcs.pdrc; break; default: return (EINVAL); } return (0); } static int bsd_to_linux_dvd_authinfo(struct dvd_authinfo *bp, l_dvd_authinfo *lp) { switch (lp->type) { case LINUX_DVD_LU_SEND_AGID: lp->lsa.agid = bp->agid; break; case LINUX_DVD_HOST_SEND_CHALLENGE: lp->type = LINUX_DVD_LU_SEND_KEY1; break; case LINUX_DVD_LU_SEND_KEY1: memcpy(lp->lsk.key, bp->keychal, sizeof(lp->lsk.key)); break; case LINUX_DVD_LU_SEND_CHALLENGE: memcpy(lp->lsc.chal, bp->keychal, sizeof(lp->lsc.chal)); break; case LINUX_DVD_HOST_SEND_KEY2: lp->type = LINUX_DVD_AUTH_ESTABLISHED; break; case LINUX_DVD_LU_SEND_TITLE_KEY: memcpy(lp->lstk.title_key, bp->keychal, sizeof(lp->lstk.title_key)); lp->lstk.cpm = bp->cpm; lp->lstk.cp_sec = bp->cp_sec; lp->lstk.cgms = bp->cgms; break; case LINUX_DVD_LU_SEND_ASF: lp->lsasf.asf = bp->asf; break; case LINUX_DVD_INVALIDATE_AGID: break; case LINUX_DVD_LU_SEND_RPC_STATE: lp->lrpcs.type = bp->reg_type; lp->lrpcs.vra = bp->vend_rsts; lp->lrpcs.ucca = bp->user_rsts; lp->lrpcs.region_mask = bp->region; lp->lrpcs.rpc_scheme = bp->rpc_scheme; break; case LINUX_DVD_HOST_SEND_RPC_STATE: break; default: return (EINVAL); } return (0); } static int linux_ioctl_cdrom(struct thread *td, struct linux_ioctl_args *args) { - struct file *fp = td->td_proc->p_fd->fd_ofiles[args->fd]; + struct file *fp; int error; + fp = ffind_hold(td, args->fd); + if (fp == NULL) + return (EBADF); switch (args->cmd & 0xffff) { case LINUX_CDROMPAUSE: args->cmd = CDIOCPAUSE; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_CDROMRESUME: args->cmd = CDIOCRESUME; - return (ioctl(td, (struct ioctl_args *)args)); + 
error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_CDROMPLAYMSF: args->cmd = CDIOCPLAYMSF; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_CDROMPLAYTRKIND: args->cmd = CDIOCPLAYTRACKS; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_CDROMREADTOCHDR: { struct ioc_toc_header th; struct linux_cdrom_tochdr lth; error = fo_ioctl(fp, CDIOREADTOCHEADER, (caddr_t)&th, td); if (!error) { lth.cdth_trk0 = th.starting_track; lth.cdth_trk1 = th.ending_track; copyout(&lth, (caddr_t)args->arg, sizeof(lth)); } - return (error); + break; } case LINUX_CDROMREADTOCENTRY: { struct linux_cdrom_tocentry lte, *ltep = (struct linux_cdrom_tocentry *)args->arg; struct ioc_read_toc_single_entry irtse; irtse.address_format = ltep->cdte_format; irtse.track = ltep->cdte_track; error = fo_ioctl(fp, CDIOREADTOCENTRY, (caddr_t)&irtse, td); if (!error) { lte = *ltep; lte.cdte_ctrl = irtse.entry.control; lte.cdte_adr = irtse.entry.addr_type; bsd_to_linux_msf_lba(irtse.address_format, &irtse.entry.addr, &lte.cdte_addr); copyout(&lte, (caddr_t)args->arg, sizeof(lte)); } - return (error); + break; } case LINUX_CDROMSTOP: args->cmd = CDIOCSTOP; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_CDROMSTART: args->cmd = CDIOCSTART; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_CDROMEJECT: args->cmd = CDIOCEJECT; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; /* LINUX_CDROMVOLCTRL */ case LINUX_CDROMSUBCHNL: { struct linux_cdrom_subchnl sc; struct ioc_read_subchannel bsdsc; struct cd_sub_channel_info *bsdinfo; caddr_t sg = stackgap_init(); bsdinfo = (struct cd_sub_channel_info*)stackgap_alloc(&sg, sizeof(struct cd_sub_channel_info)); bsdsc.address_format =
CD_LBA_FORMAT; bsdsc.data_format = CD_CURRENT_POSITION; bsdsc.track = 0; bsdsc.data_len = sizeof(struct cd_sub_channel_info); bsdsc.data = bsdinfo; error = fo_ioctl(fp, CDIOCREADSUBCHANNEL, (caddr_t)&bsdsc, td); if (error) - return (error); + break; error = copyin((caddr_t)args->arg, &sc, sizeof(struct linux_cdrom_subchnl)); if (error) - return (error); + break; sc.cdsc_audiostatus = bsdinfo->header.audio_status; sc.cdsc_adr = bsdinfo->what.position.addr_type; sc.cdsc_ctrl = bsdinfo->what.position.control; sc.cdsc_trk = bsdinfo->what.position.track_number; sc.cdsc_ind = bsdinfo->what.position.index_number; set_linux_cdrom_addr(&sc.cdsc_absaddr, sc.cdsc_format, bsdinfo->what.position.absaddr.lba); set_linux_cdrom_addr(&sc.cdsc_reladdr, sc.cdsc_format, bsdinfo->what.position.reladdr.lba); error = copyout(&sc, (caddr_t)args->arg, sizeof(struct linux_cdrom_subchnl)); - return (error); + break; } /* LINUX_CDROMREADMODE2 */ /* LINUX_CDROMREADMODE1 */ /* LINUX_CDROMREADAUDIO */ /* LINUX_CDROMEJECT_SW */ /* LINUX_CDROMMULTISESSION */ /* LINUX_CDROM_GET_UPC */ case LINUX_CDROMRESET: args->cmd = CDIOCRESET; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; /* LINUX_CDROMVOLREAD */ /* LINUX_CDROMREADRAW */ /* LINUX_CDROMREADCOOKED */ /* LINUX_CDROMSEEK */ /* LINUX_CDROMPLAYBLK */ /* LINUX_CDROMREADALL */ /* LINUX_CDROMCLOSETRAY */ /* LINUX_CDROMLOADFROMSLOT */ /* LINUX_CDROMGETSPINDOWN */ /* LINUX_CDROMSETSPINDOWN */ /* LINUX_CDROM_SET_OPTIONS */ /* LINUX_CDROM_CLEAR_OPTIONS */ /* LINUX_CDROM_SELECT_SPEED */ /* LINUX_CDROM_SELECT_DISC */ /* LINUX_CDROM_MEDIA_CHANGED */ /* LINUX_CDROM_DRIVE_STATUS */ /* LINUX_CDROM_DISC_STATUS */ /* LINUX_CDROM_CHANGER_NSLOTS */ /* LINUX_CDROM_LOCKDOOR */ /* LINUX_CDROM_DEBUG */ /* LINUX_CDROM_GET_CAPABILITY */ /* LINUX_CDROMAUDIOBUFSIZ */ case LINUX_DVD_READ_STRUCT: { l_dvd_struct lds; struct dvd_struct bds; error = copyin((caddr_t)args->arg, &lds, sizeof(l_dvd_struct)); if (error) - 
return (error); + break; error = linux_to_bsd_dvd_struct(&lds, &bds); if (error) - return (error); + break; error = fo_ioctl(fp, DVDIOCREADSTRUCTURE, (caddr_t)&bds, td); if (error) - return (error); + break; error = bsd_to_linux_dvd_struct(&bds, &lds); if (error) - return (error); - return (copyout(&lds, (caddr_t)args->arg, - sizeof(l_dvd_struct))); + break; + error = copyout(&lds, (caddr_t)args->arg, + sizeof(l_dvd_struct)); + break; } /* LINUX_DVD_WRITE_STRUCT */ case LINUX_DVD_AUTH: { l_dvd_authinfo lda; struct dvd_authinfo bda; int bcode; error = copyin((caddr_t)args->arg, &lda, sizeof(l_dvd_authinfo)); if (error) - return (error); + break; error = linux_to_bsd_dvd_authinfo(&lda, &bcode, &bda); if (error) - return (error); + break; error = fo_ioctl(fp, bcode, (caddr_t)&bda, td); if (error) { if (lda.type == LINUX_DVD_HOST_SEND_KEY2) { lda.type = LINUX_DVD_AUTH_FAILURE; copyout(&lda, (caddr_t)args->arg, sizeof(l_dvd_authinfo)); } - return (error); + break; } error = bsd_to_linux_dvd_authinfo(&bda, &lda); if (error) - return (error); - return (copyout(&lda, (caddr_t)args->arg, - sizeof(l_dvd_authinfo))); + break; + error = copyout(&lda, (caddr_t)args->arg, + sizeof(l_dvd_authinfo)); + break; } /* LINUX_CDROM_SEND_PACKET */ /* LINUX_CDROM_NEXT_WRITABLE */ /* LINUX_CDROM_LAST_WRITTEN */ + default: + error = ENOIOCTL; + break; } - return (ENOIOCTL); + fdrop(fp, td); + return (error); } /* * Sound related ioctls */ static u_int32_t dirbits[4] = { IOC_VOID, IOC_IN, IOC_OUT, IOC_INOUT }; #define SETDIR(c) (((c) & ~IOC_DIRMASK) | dirbits[args->cmd >> 30]) static int linux_ioctl_sound(struct thread *td, struct linux_ioctl_args *args) { switch (args->cmd & 0xffff) { case LINUX_SOUND_MIXER_WRITE_VOLUME: args->cmd = SETDIR(SOUND_MIXER_WRITE_VOLUME); return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_BASS: args->cmd = SETDIR(SOUND_MIXER_WRITE_BASS); return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_TREBLE: args->cmd = 
SETDIR(SOUND_MIXER_WRITE_TREBLE); return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_SYNTH: args->cmd = SETDIR(SOUND_MIXER_WRITE_SYNTH); return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_PCM: args->cmd = SETDIR(SOUND_MIXER_WRITE_PCM); return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_SPEAKER: args->cmd = SETDIR(SOUND_MIXER_WRITE_SPEAKER); return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_LINE: args->cmd = SETDIR(SOUND_MIXER_WRITE_LINE); return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_MIC: args->cmd = SETDIR(SOUND_MIXER_WRITE_MIC); return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_CD: args->cmd = SETDIR(SOUND_MIXER_WRITE_CD); return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_IMIX: args->cmd = SETDIR(SOUND_MIXER_WRITE_IMIX); return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_ALTPCM: args->cmd = SETDIR(SOUND_MIXER_WRITE_ALTPCM); return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_RECLEV: args->cmd = SETDIR(SOUND_MIXER_WRITE_RECLEV); return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_IGAIN: args->cmd = SETDIR(SOUND_MIXER_WRITE_IGAIN); return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_OGAIN: args->cmd = SETDIR(SOUND_MIXER_WRITE_OGAIN); return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_LINE1: args->cmd = SETDIR(SOUND_MIXER_WRITE_LINE1); return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_LINE2: args->cmd = SETDIR(SOUND_MIXER_WRITE_LINE2); return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_LINE3: args->cmd = SETDIR(SOUND_MIXER_WRITE_LINE3); return (ioctl(td, (struct ioctl_args *)args)); case LINUX_OSS_GETVERSION: { int version = linux_get_oss_version(td->td_proc); return (copyout(&version, (caddr_t)args->arg, 
sizeof(int))); } case LINUX_SOUND_MIXER_READ_DEVMASK: args->cmd = SOUND_MIXER_READ_DEVMASK; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_RECSRC: args->cmd = SETDIR(SOUND_MIXER_WRITE_RECSRC); return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_RESET: args->cmd = SNDCTL_DSP_RESET; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_SYNC: args->cmd = SNDCTL_DSP_SYNC; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_SPEED: args->cmd = SNDCTL_DSP_SPEED; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_STEREO: args->cmd = SNDCTL_DSP_STEREO; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_GETBLKSIZE: /* LINUX_SNDCTL_DSP_SETBLKSIZE */ args->cmd = SNDCTL_DSP_GETBLKSIZE; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_SETFMT: args->cmd = SNDCTL_DSP_SETFMT; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_PCM_WRITE_CHANNELS: args->cmd = SOUND_PCM_WRITE_CHANNELS; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_PCM_WRITE_FILTER: args->cmd = SOUND_PCM_WRITE_FILTER; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_POST: args->cmd = SNDCTL_DSP_POST; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_SUBDIVIDE: args->cmd = SNDCTL_DSP_SUBDIVIDE; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_SETFRAGMENT: args->cmd = SNDCTL_DSP_SETFRAGMENT; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_GETFMTS: args->cmd = SNDCTL_DSP_GETFMTS; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_GETOSPACE: args->cmd = SNDCTL_DSP_GETOSPACE; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_GETISPACE: args->cmd = SNDCTL_DSP_GETISPACE; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_NONBLOCK: args->cmd = SNDCTL_DSP_NONBLOCK; return (ioctl(td, (struct ioctl_args *)args)); case 
LINUX_SNDCTL_DSP_GETCAPS: args->cmd = SNDCTL_DSP_GETCAPS; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_SETTRIGGER: /* LINUX_SNDCTL_GETTRIGGER */ args->cmd = SNDCTL_DSP_SETTRIGGER; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_GETIPTR: args->cmd = SNDCTL_DSP_GETIPTR; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_GETOPTR: args->cmd = SNDCTL_DSP_GETOPTR; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_GETODELAY: args->cmd = SNDCTL_DSP_GETODELAY; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_RESET: args->cmd = SNDCTL_SEQ_RESET; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_SYNC: args->cmd = SNDCTL_SEQ_SYNC; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SYNTH_INFO: args->cmd = SNDCTL_SYNTH_INFO; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_CTRLRATE: args->cmd = SNDCTL_SEQ_CTRLRATE; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_GETOUTCOUNT: args->cmd = SNDCTL_SEQ_GETOUTCOUNT; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_GETINCOUNT: args->cmd = SNDCTL_SEQ_GETINCOUNT; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_PERCMODE: args->cmd = SNDCTL_SEQ_PERCMODE; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_FM_LOAD_INSTR: args->cmd = SNDCTL_FM_LOAD_INSTR; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_TESTMIDI: args->cmd = SNDCTL_SEQ_TESTMIDI; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_RESETSAMPLES: args->cmd = SNDCTL_SEQ_RESETSAMPLES; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_NRSYNTHS: args->cmd = SNDCTL_SEQ_NRSYNTHS; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_NRMIDIS: args->cmd = SNDCTL_SEQ_NRMIDIS; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_MIDI_INFO: args->cmd = SNDCTL_MIDI_INFO; return 
(ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_TRESHOLD: args->cmd = SNDCTL_SEQ_TRESHOLD; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SYNTH_MEMAVL: args->cmd = SNDCTL_SYNTH_MEMAVL; return (ioctl(td, (struct ioctl_args *)args)); } return (ENOIOCTL); } /* * Console related ioctls */ #define ISSIGVALID(sig) ((sig) > 0 && (sig) < NSIG) static int linux_ioctl_console(struct thread *td, struct linux_ioctl_args *args) { - struct file *fp = td->td_proc->p_fd->fd_ofiles[args->fd]; + struct file *fp; + int error; + fp = ffind_hold(td, args->fd); + if (fp == NULL) + return (EBADF); switch (args->cmd & 0xffff) { case LINUX_KIOCSOUND: args->cmd = KIOCSOUND; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_KDMKTONE: args->cmd = KDMKTONE; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_KDGETLED: args->cmd = KDGETLED; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_KDSETLED: args->cmd = KDSETLED; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_KDSETMODE: args->cmd = KDSETMODE; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_KDGETMODE: args->cmd = KDGETMODE; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_KDGKBMODE: args->cmd = KDGKBMODE; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_KDSKBMODE: { int kbdmode; switch (args->arg) { case LINUX_KBD_RAW: kbdmode = K_RAW; break; case LINUX_KBD_XLATE: kbdmode = K_XLATE; break; case LINUX_KBD_MEDIUMRAW: kbdmode = K_RAW; break; default: + fdrop(fp, td); return (EINVAL); } - return (fo_ioctl(fp, KDSKBMODE, 
(caddr_t)&kbdmode, td)); + error = (fo_ioctl(fp, KDSKBMODE, (caddr_t)&kbdmode, td)); + break; } case LINUX_VT_OPENQRY: args->cmd = VT_OPENQRY; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_VT_GETMODE: args->cmd = VT_GETMODE; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_VT_SETMODE: { struct vt_mode *mode; args->cmd = VT_SETMODE; mode = (struct vt_mode *)args->arg; if (!ISSIGVALID(mode->frsig) && ISSIGVALID(mode->acqsig)) mode->frsig = mode->acqsig; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; } case LINUX_VT_GETSTATE: args->cmd = VT_GETACTIVE; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_VT_RELDISP: args->cmd = VT_RELDISP; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_VT_ACTIVATE: args->cmd = VT_ACTIVATE; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_VT_WAITACTIVE: args->cmd = VT_WAITACTIVE; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; + default: + error = ENOIOCTL; + break; } - return (ENOIOCTL); + fdrop(fp, td); + return (error); } /* * Criteria for interface name translation */ #define IFP_IS_ETH(ifp) (ifp->if_type == IFT_ETHER) /* * Translate a Linux interface name to a FreeBSD interface name, * and return the associated ifnet structure * bsdname and lxname need to be least IFNAMSIZ bytes long, but * can point to the same buffer. 
*/ static struct ifnet * ifname_linux_to_bsd(const char *lxname, char *bsdname) { struct ifnet *ifp; int len, unit; char *ep; int is_eth, index; for (len = 0; len < LINUX_IFNAMSIZ; ++len) if (!isalpha(lxname[len])) break; if (len == 0 || len == LINUX_IFNAMSIZ) return (NULL); unit = (int)strtoul(lxname + len, &ep, 10); if (ep == NULL || ep == lxname + len || ep >= lxname + LINUX_IFNAMSIZ) return (NULL); index = 0; is_eth = (len == 3 && !strncmp(lxname, "eth", len)) ? 1 : 0; TAILQ_FOREACH(ifp, &ifnet, if_link) { /* * Allow Linux programs to use FreeBSD names. Don't presume * we never have an interface named "eth", so don't make * the test optional based on is_eth. */ if (ifp->if_unit == unit && ifp->if_name[len] == '\0' && strncmp(ifp->if_name, lxname, len) == 0) break; if (is_eth && IFP_IS_ETH(ifp) && unit == index++) break; } if (ifp != NULL) snprintf(bsdname, IFNAMSIZ, "%s%d", ifp->if_name, ifp->if_unit); return (ifp); } /* * Implement the SIOCGIFCONF ioctl */ static int linux_ifconf(struct thread *td, struct ifconf *uifc) { struct ifconf ifc; struct l_ifreq ifr; struct ifnet *ifp; struct iovec iov; struct uio uio; int error, ethno; error = copyin(uifc, &ifc, sizeof ifc); if (error != 0) return (error); /* much easier to use uiomove than keep track ourselves */ iov.iov_base = ifc.ifc_buf; iov.iov_len = ifc.ifc_len; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = 0; uio.uio_resid = ifc.ifc_len; uio.uio_segflg = UIO_USERSPACE; uio.uio_rw = UIO_READ; uio.uio_td = td; /* Keep track of eth interfaces */ ethno = 0; /* return interface names but no addresses. 
*/ TAILQ_FOREACH(ifp, &ifnet, if_link) { if (uio.uio_resid <= 0) break; bzero(&ifr, sizeof ifr); if (IFP_IS_ETH(ifp)) snprintf(ifr.ifr_name, LINUX_IFNAMSIZ, "eth%d", ethno++); else snprintf(ifr.ifr_name, LINUX_IFNAMSIZ, "%s%d", ifp->if_name, ifp->if_unit); error = uiomove((caddr_t)&ifr, sizeof ifr, &uio); if (error != 0) return (error); } ifc.ifc_len -= uio.uio_resid; error = copyout(&ifc, uifc, sizeof ifc); return (error); } static int linux_gifflags(struct thread *td, struct ifnet *ifp, struct l_ifreq *ifr) { l_short flags; flags = ifp->if_flags; /* these flags have no Linux equivalent */ flags &= ~(IFF_SMART|IFF_OACTIVE|IFF_SIMPLEX| IFF_LINK0|IFF_LINK1|IFF_LINK2); /* Linux' multicast flag is in a different bit */ if (flags & IFF_MULTICAST) { flags &= ~IFF_MULTICAST; flags |= 0x1000; } return (copyout(&flags, &ifr->ifr_flags, sizeof flags)); } #define ARPHRD_ETHER 1 #define ARPHRD_LOOPBACK 772 static int linux_gifhwaddr(struct ifnet *ifp, struct l_ifreq *ifr) { struct ifaddr *ifa; struct sockaddr_dl *sdl; struct l_sockaddr lsa; if (ifp->if_type == IFT_LOOP) { bzero(&lsa, sizeof lsa); lsa.sa_family = ARPHRD_LOOPBACK; return (copyout(&lsa, &ifr->ifr_hwaddr, sizeof lsa)); } if (ifp->if_type != IFT_ETHER) return (ENOENT); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sdl = (struct sockaddr_dl*)ifa->ifa_addr; if (sdl != NULL && (sdl->sdl_family == AF_LINK) && (sdl->sdl_type == IFT_ETHER)) { bzero(&lsa, sizeof lsa); lsa.sa_family = ARPHRD_ETHER; bcopy(LLADDR(sdl), lsa.sa_data, LINUX_IFHWADDRLEN); return (copyout(&lsa, &ifr->ifr_hwaddr, sizeof lsa)); } } return (ENOENT); } /* * Socket related ioctls */ static int linux_ioctl_socket(struct thread *td, struct linux_ioctl_args *args) { char lifname[LINUX_IFNAMSIZ], ifname[IFNAMSIZ]; struct ifnet *ifp; struct file *fp; int error, type; KASSERT(LINUX_IFNAMSIZ == IFNAMSIZ, ("%s(): LINUX_IFNAMSIZ != IFNAMSIZ", __func__)); ifp = NULL; error = 0; mtx_lock(&Giant); if ((error = fget(td, args->fd, &fp)) != 0) { 
mtx_unlock(&Giant); return (error); } type = fp->f_type; fdrop(fp, td); mtx_unlock(&Giant); if (type != DTYPE_SOCKET) { /* not a socket - probably a tap / vmnet device */ switch (args->cmd) { case LINUX_SIOCGIFADDR: case LINUX_SIOCSIFADDR: case LINUX_SIOCGIFFLAGS: return (linux_ioctl_special(td, args)); default: return (ENOIOCTL); } } switch (args->cmd & 0xffff) { case LINUX_FIOGETOWN: case LINUX_FIOSETOWN: case LINUX_SIOCADDMULTI: case LINUX_SIOCATMARK: case LINUX_SIOCDELMULTI: case LINUX_SIOCGIFCONF: case LINUX_SIOCGPGRP: case LINUX_SIOCSPGRP: /* these ioctls don't take an interface name */ #ifdef DEBUG printf("%s(): ioctl %d\n", __func__, args->cmd & 0xffff); #endif break; case LINUX_SIOCGIFFLAGS: case LINUX_SIOCGIFADDR: case LINUX_SIOCSIFADDR: case LINUX_SIOCGIFDSTADDR: case LINUX_SIOCGIFBRDADDR: case LINUX_SIOCGIFNETMASK: case LINUX_SIOCSIFNETMASK: case LINUX_SIOCGIFMTU: case LINUX_SIOCSIFMTU: case LINUX_SIOCSIFNAME: case LINUX_SIOCGIFHWADDR: case LINUX_SIOCSIFHWADDR: case LINUX_SIOCDEVPRIVATE: case LINUX_SIOCDEVPRIVATE+1: /* copy in the interface name and translate it. */ error = copyin((char *)args->arg, lifname, LINUX_IFNAMSIZ); if (error != 0) return (error); #ifdef DEBUG printf("%s(): ioctl %d on %.*s\n", __func__, args->cmd & 0xffff, LINUX_IFNAMSIZ, lifname); #endif ifp = ifname_linux_to_bsd(lifname, ifname); if (ifp == NULL) return (EINVAL); /* * We need to copy it back out in case we pass the * request on to our native ioctl(), which will expect * the ifreq to be in user space and have the correct * interface name. 
*/ error = copyout(ifname, (char *)args->arg, IFNAMSIZ); if (error != 0) return (error); #ifdef DEBUG printf("%s(): %s translated to %s\n", __func__, lifname, ifname); #endif break; default: return (ENOIOCTL); } switch (args->cmd & 0xffff) { case LINUX_FIOSETOWN: args->cmd = FIOSETOWN; error = ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCSPGRP: args->cmd = SIOCSPGRP; error = ioctl(td, (struct ioctl_args *)args); break; case LINUX_FIOGETOWN: args->cmd = FIOGETOWN; error = ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCGPGRP: args->cmd = SIOCGPGRP; error = ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCATMARK: args->cmd = SIOCATMARK; error = ioctl(td, (struct ioctl_args *)args); break; /* LINUX_SIOCGSTAMP */ case LINUX_SIOCGIFCONF: error = linux_ifconf(td, (struct ifconf *)args->arg); break; case LINUX_SIOCGIFFLAGS: args->cmd = SIOCGIFFLAGS; error = linux_gifflags(td, ifp, (struct l_ifreq *)args->arg); break; case LINUX_SIOCGIFADDR: args->cmd = OSIOCGIFADDR; error = ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCSIFADDR: /* XXX probably doesn't work, included for completeness */ args->cmd = SIOCSIFADDR; error = ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCGIFDSTADDR: args->cmd = OSIOCGIFDSTADDR; error = ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCGIFBRDADDR: args->cmd = OSIOCGIFBRDADDR; error = ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCGIFNETMASK: args->cmd = OSIOCGIFNETMASK; error = ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCSIFNETMASK: error = ENOIOCTL; break; case LINUX_SIOCGIFMTU: args->cmd = SIOCGIFMTU; error = ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCSIFMTU: args->cmd = SIOCSIFMTU; error = ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCSIFNAME: error = ENOIOCTL; break; case LINUX_SIOCGIFHWADDR: error = linux_gifhwaddr(ifp, (struct l_ifreq *)args->arg); break; case LINUX_SIOCSIFHWADDR: error = ENOIOCTL; break; 
case LINUX_SIOCADDMULTI: args->cmd = SIOCADDMULTI; error = ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCDELMULTI: args->cmd = SIOCDELMULTI; error = ioctl(td, (struct ioctl_args *)args); break; /* * XXX This is slightly bogus, but these ioctls are currently * XXX only used by the aironet (if_an) network driver. */ case LINUX_SIOCDEVPRIVATE: args->cmd = SIOCGPRIVATE_0; error = ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCDEVPRIVATE+1: args->cmd = SIOCGPRIVATE_1; error = ioctl(td, (struct ioctl_args *)args); break; } if (ifp != NULL) /* restore the original interface name */ copyout(lifname, (char *)args->arg, LINUX_IFNAMSIZ); #ifdef DEBUG printf("%s(): returning %d\n", __func__, error); #endif return (error); } /* * Device private ioctl handler */ static int linux_ioctl_private(struct thread *td, struct linux_ioctl_args *args) { struct file *fp; int error, type; mtx_lock(&Giant); if ((error = fget(td, args->fd, &fp)) != 0) { mtx_unlock(&Giant); return (error); } type = fp->f_type; fdrop(fp, td); mtx_unlock(&Giant); if (type == DTYPE_SOCKET) return (linux_ioctl_socket(td, args)); return (ENOIOCTL); } /* * Special ioctl handler */ static int linux_ioctl_special(struct thread *td, struct linux_ioctl_args *args) { int error; switch (args->cmd) { case LINUX_SIOCGIFADDR: args->cmd = SIOCGIFADDR; error = ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCSIFADDR: args->cmd = SIOCSIFADDR; error = ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCGIFFLAGS: args->cmd = SIOCGIFFLAGS; error = ioctl(td, (struct ioctl_args *)args); break; default: error = ENOIOCTL; } return (error); } /* * main ioctl syscall function */ int linux_ioctl(struct thread *td, struct linux_ioctl_args *args) { - struct filedesc *fdp; struct file *fp; struct handler_element *he; int error, cmd; #ifdef DEBUG if (ldebug(ioctl)) printf(ARGS(ioctl, "%d, %04lx, *"), args->fd, (unsigned long)args->cmd); #endif - fdp = td->td_proc->p_fd; - if ((unsigned)args->fd 
>= fdp->fd_nfiles) + fp = ffind_hold(td, args->fd); + if (fp == NULL) return (EBADF); - fp = fdp->fd_ofiles[args->fd]; - if (fp == NULL || (fp->f_flag & (FREAD|FWRITE)) == 0) + if ((fp->f_flag & (FREAD|FWRITE)) == 0) { + fdrop(fp, td); return (EBADF); + } /* Iterate over the ioctl handlers */ cmd = args->cmd & 0xffff; TAILQ_FOREACH(he, &handlers, list) { if (cmd >= he->low && cmd <= he->high) { error = (*he->func)(td, args); if (error != ENOIOCTL) + fdrop(fp, td); return (error); } } + fdrop(fp, td); printf("linux: 'ioctl' fd=%d, cmd=0x%x ('%c',%d) not implemented\n", args->fd, (int)(args->cmd & 0xffff), (int)(args->cmd & 0xff00) >> 8, (int)(args->cmd & 0xff)); return (EINVAL); } int linux_ioctl_register_handler(struct linux_ioctl_handler *h) { struct handler_element *he, *cur; if (h == NULL || h->func == NULL) return (EINVAL); /* * Reuse the element if the handler is already on the list, otherwise * create a new element. */ TAILQ_FOREACH(he, &handlers, list) { if (he->func == h->func) break; } if (he == NULL) { MALLOC(he, struct handler_element *, sizeof(*he), M_LINUX, M_WAITOK); he->func = h->func; } else TAILQ_REMOVE(&handlers, he, list); /* Initialize range information. */ he->low = h->low; he->high = h->high; he->span = h->high - h->low + 1; /* Add the element to the list, sorted on span. 
*/ TAILQ_FOREACH(cur, &handlers, list) { if (cur->span > he->span) { TAILQ_INSERT_BEFORE(cur, he, list); return (0); } } TAILQ_INSERT_TAIL(&handlers, he, list); return (0); } int linux_ioctl_unregister_handler(struct linux_ioctl_handler *h) { struct handler_element *he; if (h == NULL || h->func == NULL) return (EINVAL); TAILQ_FOREACH(he, &handlers, list) { if (he->func == h->func) { TAILQ_REMOVE(&handlers, he, list); FREE(he, M_LINUX); return (0); } } return (EINVAL); } Index: head/sys/compat/linux/linux_stats.c =================================================================== --- head/sys/compat/linux/linux_stats.c (revision 89305) +++ head/sys/compat/linux/linux_stats.c (revision 89306) @@ -1,477 +1,480 @@ /*- * Copyright (c) 1994-1995 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software withough specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int newstat_copyout(struct stat *buf, void *ubuf) { struct l_newstat tbuf; struct cdevsw *cdevsw; dev_t dev; tbuf.st_dev = uminor(buf->st_dev) | (umajor(buf->st_dev) << 8); tbuf.st_ino = buf->st_ino; tbuf.st_mode = buf->st_mode; tbuf.st_nlink = buf->st_nlink; tbuf.st_uid = buf->st_uid; tbuf.st_gid = buf->st_gid; tbuf.st_rdev = buf->st_rdev; tbuf.st_size = buf->st_size; tbuf.st_atime = buf->st_atime; tbuf.st_mtime = buf->st_mtime; tbuf.st_ctime = buf->st_ctime; tbuf.st_blksize = buf->st_blksize; tbuf.st_blocks = buf->st_blocks; /* Lie about disk drives which are character devices * in FreeBSD but block devices under Linux. 
*/ if (S_ISCHR(tbuf.st_mode) && (dev = udev2dev(buf->st_rdev, 0)) != NODEV) { cdevsw = devsw(dev); if (cdevsw != NULL && (cdevsw->d_flags & D_DISK)) { tbuf.st_mode &= ~S_IFMT; tbuf.st_mode |= S_IFBLK; /* XXX this may not be quite right */ /* Map major number to 0 */ tbuf.st_dev = uminor(buf->st_dev) & 0xf; tbuf.st_rdev = buf->st_rdev & 0xff; } } return (copyout(&tbuf, ubuf, sizeof(tbuf))); } int linux_newstat(struct thread *td, struct linux_newstat_args *args) { struct stat buf; struct nameidata nd; int error; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, args->path); #ifdef DEBUG if (ldebug(newstat)) printf(ARGS(newstat, "%s, *"), args->path); #endif NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, args->path, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_stat(nd.ni_vp, &buf, td); vput(nd.ni_vp); if (error) return (error); return (newstat_copyout(&buf, args->buf)); } int linux_newlstat(struct thread *td, struct linux_newlstat_args *args) { int error; struct stat sb; struct nameidata nd; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, args->path); #ifdef DEBUG if (ldebug(newlstat)) printf(ARGS(newlstat, "%s, *"), args->path); #endif NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, args->path, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_stat(nd.ni_vp, &sb, td); vput(nd.ni_vp); if (error) return (error); return (newstat_copyout(&sb, args->buf)); } int linux_newfstat(struct thread *td, struct linux_newfstat_args *args) { - struct filedesc *fdp; struct file *fp; struct stat buf; int error; #ifdef DEBUG if (ldebug(newfstat)) printf(ARGS(newfstat, "%d, *"), args->fd); #endif - fdp = td->td_proc->p_fd; - if ((unsigned)args->fd >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[args->fd]) == NULL) + fp = ffind_hold(td, args->fd); + if (fp == NULL) return (EBADF); error = fo_stat(fp, &buf, td); + fdrop(fp, td); if (!error) error = 
newstat_copyout(&buf, args->buf); return (error); } /* XXX - All fields of type l_int are defined as l_long on i386 */ struct l_statfs { l_int f_type; l_int f_bsize; l_int f_blocks; l_int f_bfree; l_int f_bavail; l_int f_files; l_int f_ffree; l_fsid_t f_fsid; l_int f_namelen; l_int f_spare[6]; }; #define LINUX_CODA_SUPER_MAGIC 0x73757245L #define LINUX_EXT2_SUPER_MAGIC 0xEF53L #define LINUX_HPFS_SUPER_MAGIC 0xf995e849L #define LINUX_ISOFS_SUPER_MAGIC 0x9660L #define LINUX_MSDOS_SUPER_MAGIC 0x4d44L #define LINUX_NCP_SUPER_MAGIC 0x564cL #define LINUX_NFS_SUPER_MAGIC 0x6969L #define LINUX_NTFS_SUPER_MAGIC 0x5346544EL #define LINUX_PROC_SUPER_MAGIC 0x9fa0L #define LINUX_UFS_SUPER_MAGIC 0x00011954L /* XXX - UFS_MAGIC in Linux */ static long bsd_to_linux_ftype(const char *fstypename) { int i; static struct {const char *bsd_name; long linux_type;} b2l_tbl[] = { {"ufs", LINUX_UFS_SUPER_MAGIC}, {"cd9660", LINUX_ISOFS_SUPER_MAGIC}, {"nfs", LINUX_NFS_SUPER_MAGIC}, {"ext2fs", LINUX_EXT2_SUPER_MAGIC}, {"procfs", LINUX_PROC_SUPER_MAGIC}, {"msdosfs", LINUX_MSDOS_SUPER_MAGIC}, {"ntfs", LINUX_NTFS_SUPER_MAGIC}, {"nwfs", LINUX_NCP_SUPER_MAGIC}, {"hpfs", LINUX_HPFS_SUPER_MAGIC}, {"coda", LINUX_CODA_SUPER_MAGIC}, {NULL, 0L}}; for (i = 0; b2l_tbl[i].bsd_name != NULL; i++) if (strcmp(b2l_tbl[i].bsd_name, fstypename) == 0) return (b2l_tbl[i].linux_type); return (0L); } int linux_statfs(struct thread *td, struct linux_statfs_args *args) { struct mount *mp; struct nameidata *ndp; struct statfs *bsd_statfs; struct nameidata nd; struct l_statfs linux_statfs; int error; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, args->path); #ifdef DEBUG if (ldebug(statfs)) printf(ARGS(statfs, "%s, *"), args->path); #endif ndp = &nd; NDINIT(ndp, LOOKUP, FOLLOW, UIO_USERSPACE, args->path, curthread); error = namei(ndp); if (error) return error; NDFREE(ndp, NDF_ONLY_PNBUF); mp = ndp->ni_vp->v_mount; bsd_statfs = &mp->mnt_stat; vrele(ndp->ni_vp); error = VFS_STATFS(mp, bsd_statfs, td); if (error) 
return error; bsd_statfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; linux_statfs.f_type = bsd_to_linux_ftype(bsd_statfs->f_fstypename); linux_statfs.f_bsize = bsd_statfs->f_bsize; linux_statfs.f_blocks = bsd_statfs->f_blocks; linux_statfs.f_bfree = bsd_statfs->f_bfree; linux_statfs.f_bavail = bsd_statfs->f_bavail; linux_statfs.f_ffree = bsd_statfs->f_ffree; linux_statfs.f_files = bsd_statfs->f_files; linux_statfs.f_fsid.val[0] = bsd_statfs->f_fsid.val[0]; linux_statfs.f_fsid.val[1] = bsd_statfs->f_fsid.val[1]; linux_statfs.f_namelen = MAXNAMLEN; return copyout((caddr_t)&linux_statfs, (caddr_t)args->buf, sizeof(linux_statfs)); } int linux_fstatfs(struct thread *td, struct linux_fstatfs_args *args) { struct file *fp; struct mount *mp; struct statfs *bsd_statfs; struct l_statfs linux_statfs; int error; #ifdef DEBUG if (ldebug(fstatfs)) printf(ARGS(fstatfs, "%d, *"), args->fd); #endif error = getvnode(td->td_proc->p_fd, args->fd, &fp); if (error) return error; mp = ((struct vnode *)fp->f_data)->v_mount; bsd_statfs = &mp->mnt_stat; error = VFS_STATFS(mp, bsd_statfs, td); - if (error) + if (error) { + fdrop(fp, td); return error; + } bsd_statfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; linux_statfs.f_type = bsd_to_linux_ftype(bsd_statfs->f_fstypename); linux_statfs.f_bsize = bsd_statfs->f_bsize; linux_statfs.f_blocks = bsd_statfs->f_blocks; linux_statfs.f_bfree = bsd_statfs->f_bfree; linux_statfs.f_bavail = bsd_statfs->f_bavail; linux_statfs.f_ffree = bsd_statfs->f_ffree; linux_statfs.f_files = bsd_statfs->f_files; linux_statfs.f_fsid.val[0] = bsd_statfs->f_fsid.val[0]; linux_statfs.f_fsid.val[1] = bsd_statfs->f_fsid.val[1]; linux_statfs.f_namelen = MAXNAMLEN; - return copyout((caddr_t)&linux_statfs, (caddr_t)args->buf, + error = copyout((caddr_t)&linux_statfs, (caddr_t)args->buf, sizeof(linux_statfs)); + fdrop(fp, td); + return error; } struct l_ustat { l_daddr_t f_tfree; l_ino_t f_tinode; char f_fname[6]; char f_fpack[6]; }; int linux_ustat(struct thread *td, struct 
linux_ustat_args *args) { struct l_ustat lu; dev_t dev; struct vnode *vp; struct statfs *stat; int error; #ifdef DEBUG if (ldebug(ustat)) printf(ARGS(ustat, "%d, *"), args->dev); #endif /* * lu.f_fname and lu.f_fpack are not used. They are always zeroed. * lu.f_tinode and lu.f_tfree are set from the device's super block. */ bzero(&lu, sizeof(lu)); /* * XXX - Don't return an error if we can't find a vnode for the * device. Our dev_t is 32-bits whereas Linux only has a 16-bits * dev_t. The dev_t that is used now may as well be a truncated * dev_t returned from previous syscalls. Just return a bzeroed * ustat in that case. */ dev = makedev(args->dev >> 8, args->dev & 0xFF); if (vfinddev(dev, VCHR, &vp)) { if (vp->v_mount == NULL) return (EINVAL); stat = &(vp->v_mount->mnt_stat); error = VFS_STATFS(vp->v_mount, stat, td); if (error) return (error); lu.f_tfree = stat->f_bfree; lu.f_tinode = stat->f_ffree; } return (copyout(&lu, args->ubuf, sizeof(lu))); } #if defined(__i386__) static int stat64_copyout(struct stat *buf, void *ubuf) { struct l_stat64 lbuf; bzero(&lbuf, sizeof(lbuf)); lbuf.st_dev = uminor(buf->st_dev) | (umajor(buf->st_dev) << 8); lbuf.st_ino = buf->st_ino; lbuf.st_mode = buf->st_mode; lbuf.st_nlink = buf->st_nlink; lbuf.st_uid = buf->st_uid; lbuf.st_gid = buf->st_gid; lbuf.st_rdev = buf->st_rdev; lbuf.st_size = buf->st_size; lbuf.st_atime = buf->st_atime; lbuf.st_mtime = buf->st_mtime; lbuf.st_ctime = buf->st_ctime; lbuf.st_blksize = buf->st_blksize; lbuf.st_blocks = buf->st_blocks; /* * The __st_ino field makes all the difference. In the Linux kernel * it is conditionally compiled based on STAT64_HAS_BROKEN_ST_INO, * but without the assignment to __st_ino the runtime linker refuses * to mmap(2) any shared libraries. 
I guess it's broken alright :-) */ lbuf.__st_ino = buf->st_ino; return (copyout(&lbuf, ubuf, sizeof(lbuf))); } int linux_stat64(struct thread *td, struct linux_stat64_args *args) { struct stat buf; struct nameidata nd; int error; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, args->filename); #ifdef DEBUG if (ldebug(stat64)) printf(ARGS(stat64, "%s, *"), args->filename); #endif NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, args->filename, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_stat(nd.ni_vp, &buf, td); vput(nd.ni_vp); if (error) return (error); return (stat64_copyout(&buf, args->statbuf)); } int linux_lstat64(struct thread *td, struct linux_lstat64_args *args) { int error; struct stat sb; struct nameidata nd; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, args->filename); #ifdef DEBUG if (ldebug(lstat64)) printf(ARGS(lstat64, "%s, *"), args->filename); #endif NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, args->filename, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_stat(nd.ni_vp, &sb, td); vput(nd.ni_vp); if (error) return (error); return (stat64_copyout(&sb, args->statbuf)); } int linux_fstat64(struct thread *td, struct linux_fstat64_args *args) { struct filedesc *fdp; struct file *fp; struct stat buf; int error; #ifdef DEBUG if (ldebug(fstat64)) printf(ARGS(fstat64, "%d, *"), args->fd); #endif fdp = td->td_proc->p_fd; if ((unsigned)args->fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[args->fd]) == NULL) return (EBADF); error = fo_stat(fp, &buf, td); if (!error) error = stat64_copyout(&buf, args->statbuf); return (error); } #endif /* __i386__ */ Index: head/sys/compat/svr4/svr4_fcntl.c =================================================================== --- head/sys/compat/svr4/svr4_fcntl.c (revision 89305) +++ head/sys/compat/svr4/svr4_fcntl.c (revision 89306) @@ -1,726 +1,745 @@ /* * Copyright (c) 1998 Mark Newton 
* Copyright (c) 1994, 1997 Christos Zoulas. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Christos Zoulas. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* * $FreeBSD$ */ #include #include #include #include /*#include */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int svr4_to_bsd_flags __P((int)); static u_long svr4_to_bsd_cmd __P((u_long)); static int fd_revoke __P((struct thread *, int)); static int fd_truncate __P((struct thread *, int, struct flock *)); static int bsd_to_svr4_flags __P((int)); static void bsd_to_svr4_flock __P((struct flock *, struct svr4_flock *)); static void svr4_to_bsd_flock __P((struct svr4_flock *, struct flock *)); static void bsd_to_svr4_flock64 __P((struct flock *, struct svr4_flock64 *)); static void svr4_to_bsd_flock64 __P((struct svr4_flock64 *, struct flock *)); static u_long svr4_to_bsd_cmd(cmd) u_long cmd; { switch (cmd) { case SVR4_F_DUPFD: return F_DUPFD; case SVR4_F_GETFD: return F_GETFD; case SVR4_F_SETFD: return F_SETFD; case SVR4_F_GETFL: return F_GETFL; case SVR4_F_SETFL: return F_SETFL; case SVR4_F_GETLK: return F_GETLK; case SVR4_F_SETLK: return F_SETLK; case SVR4_F_SETLKW: return F_SETLKW; default: return -1; } } static int svr4_to_bsd_flags(l) int l; { int r = 0; r |= (l & SVR4_O_RDONLY) ? O_RDONLY : 0; r |= (l & SVR4_O_WRONLY) ? O_WRONLY : 0; r |= (l & SVR4_O_RDWR) ? O_RDWR : 0; r |= (l & SVR4_O_NDELAY) ? O_NONBLOCK : 0; r |= (l & SVR4_O_APPEND) ? O_APPEND : 0; r |= (l & SVR4_O_SYNC) ? O_FSYNC : 0; r |= (l & SVR4_O_NONBLOCK) ? O_NONBLOCK : 0; r |= (l & SVR4_O_PRIV) ? O_EXLOCK : 0; r |= (l & SVR4_O_CREAT) ? O_CREAT : 0; r |= (l & SVR4_O_TRUNC) ? O_TRUNC : 0; r |= (l & SVR4_O_EXCL) ? O_EXCL : 0; r |= (l & SVR4_O_NOCTTY) ? O_NOCTTY : 0; return r; } static int bsd_to_svr4_flags(l) int l; { int r = 0; r |= (l & O_RDONLY) ? SVR4_O_RDONLY : 0; r |= (l & O_WRONLY) ? SVR4_O_WRONLY : 0; r |= (l & O_RDWR) ? SVR4_O_RDWR : 0; r |= (l & O_NDELAY) ? SVR4_O_NONBLOCK : 0; r |= (l & O_APPEND) ? SVR4_O_APPEND : 0; r |= (l & O_FSYNC) ? SVR4_O_SYNC : 0; r |= (l & O_NONBLOCK) ? 
SVR4_O_NONBLOCK : 0; r |= (l & O_EXLOCK) ? SVR4_O_PRIV : 0; r |= (l & O_CREAT) ? SVR4_O_CREAT : 0; r |= (l & O_TRUNC) ? SVR4_O_TRUNC : 0; r |= (l & O_EXCL) ? SVR4_O_EXCL : 0; r |= (l & O_NOCTTY) ? SVR4_O_NOCTTY : 0; return r; } static void bsd_to_svr4_flock(iflp, oflp) struct flock *iflp; struct svr4_flock *oflp; { switch (iflp->l_type) { case F_RDLCK: oflp->l_type = SVR4_F_RDLCK; break; case F_WRLCK: oflp->l_type = SVR4_F_WRLCK; break; case F_UNLCK: oflp->l_type = SVR4_F_UNLCK; break; default: oflp->l_type = -1; break; } oflp->l_whence = (short) iflp->l_whence; oflp->l_start = (svr4_off_t) iflp->l_start; oflp->l_len = (svr4_off_t) iflp->l_len; oflp->l_sysid = 0; oflp->l_pid = (svr4_pid_t) iflp->l_pid; } static void svr4_to_bsd_flock(iflp, oflp) struct svr4_flock *iflp; struct flock *oflp; { switch (iflp->l_type) { case SVR4_F_RDLCK: oflp->l_type = F_RDLCK; break; case SVR4_F_WRLCK: oflp->l_type = F_WRLCK; break; case SVR4_F_UNLCK: oflp->l_type = F_UNLCK; break; default: oflp->l_type = -1; break; } oflp->l_whence = iflp->l_whence; oflp->l_start = (off_t) iflp->l_start; oflp->l_len = (off_t) iflp->l_len; oflp->l_pid = (pid_t) iflp->l_pid; } static void bsd_to_svr4_flock64(iflp, oflp) struct flock *iflp; struct svr4_flock64 *oflp; { switch (iflp->l_type) { case F_RDLCK: oflp->l_type = SVR4_F_RDLCK; break; case F_WRLCK: oflp->l_type = SVR4_F_WRLCK; break; case F_UNLCK: oflp->l_type = SVR4_F_UNLCK; break; default: oflp->l_type = -1; break; } oflp->l_whence = (short) iflp->l_whence; oflp->l_start = (svr4_off64_t) iflp->l_start; oflp->l_len = (svr4_off64_t) iflp->l_len; oflp->l_sysid = 0; oflp->l_pid = (svr4_pid_t) iflp->l_pid; } static void svr4_to_bsd_flock64(iflp, oflp) struct svr4_flock64 *iflp; struct flock *oflp; { switch (iflp->l_type) { case SVR4_F_RDLCK: oflp->l_type = F_RDLCK; break; case SVR4_F_WRLCK: oflp->l_type = F_WRLCK; break; case SVR4_F_UNLCK: oflp->l_type = F_UNLCK; break; default: oflp->l_type = -1; break; } oflp->l_whence = iflp->l_whence; 
oflp->l_start = (off_t) iflp->l_start; oflp->l_len = (off_t) iflp->l_len; oflp->l_pid = (pid_t) iflp->l_pid; } static int fd_revoke(td, fd) struct thread *td; int fd; { - struct filedesc *fdp = td->td_proc->p_fd; struct file *fp; struct vnode *vp; struct mount *mp; struct vattr vattr; int error, *retval; retval = td->td_retval; - if ((u_int)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) + fp = ffind_hold(td, fd); + if (fp == NULL) return EBADF; - if (fp->f_type != DTYPE_VNODE) + if (fp->f_type != DTYPE_VNODE) { + fdrop(fp, td); return EINVAL; + } vp = (struct vnode *) fp->f_data; if (vp->v_type != VCHR && vp->v_type != VBLK) { error = EINVAL; goto out; } if ((error = VOP_GETATTR(vp, &vattr, td->td_proc->p_ucred, td)) != 0) goto out; if (td->td_proc->p_ucred->cr_uid != vattr.va_uid && (error = suser_td(td)) != 0) goto out; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto out; if (vcount(vp) > 1) VOP_REVOKE(vp, REVOKEALL); vn_finished_write(mp); out: vrele(vp); + fdrop(fp, td); return error; } static int fd_truncate(td, fd, flp) struct thread *td; int fd; struct flock *flp; { - struct filedesc *fdp = td->td_proc->p_fd; struct file *fp; off_t start, length; struct vnode *vp; struct vattr vattr; int error, *retval; struct ftruncate_args ft; retval = td->td_retval; /* * We only support truncating the file. 
*/ - if ((u_int)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) + fp = ffind_hold(td, fd); + if (fp == NULL) return EBADF; vp = (struct vnode *)fp->f_data; - if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) + if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) { + fdrop(fp, td); return ESPIPE; + } if ((error = VOP_GETATTR(vp, &vattr, td->td_proc->p_ucred, td)) != 0) + fdrop(fp, td); return error; + } length = vattr.va_size; switch (flp->l_whence) { case SEEK_CUR: start = fp->f_offset + flp->l_start; break; case SEEK_END: start = flp->l_start + length; break; case SEEK_SET: start = flp->l_start; break; default: + fdrop(fp, td); return EINVAL; } if (start + flp->l_len < length) { /* We don't support free'ing in the middle of the file */ return EINVAL; } SCARG(&ft, fd) = fd; SCARG(&ft, length) = start; - return ftruncate(td, &ft); + error = ftruncate(td, &ft); + + fdrop(fp, td); + return (error); } int svr4_sys_open(td, uap) register struct thread *td; struct svr4_sys_open_args *uap; { struct proc *p = td->td_proc; int error, retval; struct open_args cup; caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, path)); (&cup)->path = uap->path; (&cup)->flags = svr4_to_bsd_flags(uap->flags); (&cup)->mode = uap->mode; error = open(td, &cup); if (error) { /* uprintf("svr4_open(%s, 0x%0x, 0%o): %d\n", uap->path, uap->flags, uap->mode, error);*/ return error; } retval = td->td_retval[0]; PROC_LOCK(p); if (!(SCARG(&cup, flags) & O_NOCTTY) && SESS_LEADER(p) && !(td->td_proc->p_flag & P_CONTROLT)) { #if defined(NOTYET) - struct filedesc *fdp = td->td_proc->p_fd; - struct file *fp = fdp->fd_ofiles[retval]; + struct file *fp; + fp = ffind_hold(td, retval); PROC_UNLOCK(p); + /* + * we may have lost a race the above open() and + * another thread issuing a close() + */ + if (fp == NULL) + return (EBADF); /* XXX: correct errno? 
*/ /* ignore any error, just give it a try */ if (fp->f_type == DTYPE_VNODE) fo_ioctl(fp, TIOCSCTTY, (caddr_t) 0, td); - } else + fdrop(fp, td); + } else { PROC_UNLOCK(p); + } #else } PROC_UNLOCK(p); #endif return error; } int svr4_sys_open64(td, uap) register struct thread *td; struct svr4_sys_open64_args *uap; { return svr4_sys_open(td, (struct svr4_sys_open_args *)uap); } int svr4_sys_creat(td, uap) register struct thread *td; struct svr4_sys_creat_args *uap; { struct open_args cup; caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, path)); SCARG(&cup, path) = SCARG(uap, path); SCARG(&cup, mode) = SCARG(uap, mode); SCARG(&cup, flags) = O_WRONLY | O_CREAT | O_TRUNC; return open(td, &cup); } int svr4_sys_creat64(td, uap) register struct thread *td; struct svr4_sys_creat64_args *uap; { return svr4_sys_creat(td, (struct svr4_sys_creat_args *)uap); } int svr4_sys_llseek(td, uap) register struct thread *td; struct svr4_sys_llseek_args *uap; { struct lseek_args ap; SCARG(&ap, fd) = SCARG(uap, fd); #if BYTE_ORDER == BIG_ENDIAN SCARG(&ap, offset) = (((u_int64_t) SCARG(uap, offset1)) << 32) | SCARG(uap, offset2); #else SCARG(&ap, offset) = (((u_int64_t) SCARG(uap, offset2)) << 32) | SCARG(uap, offset1); #endif SCARG(&ap, whence) = SCARG(uap, whence); return lseek(td, &ap); } int svr4_sys_access(td, uap) register struct thread *td; struct svr4_sys_access_args *uap; { struct access_args cup; int *retval; caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, path)); retval = td->td_retval; SCARG(&cup, path) = SCARG(uap, path); SCARG(&cup, flags) = SCARG(uap, flags); return access(td, &cup); } #if defined(NOTYET) int svr4_sys_pread(td, uap) register struct thread *td; struct svr4_sys_pread_args *uap; { struct pread_args pra; /* * Just translate the args structure and call the NetBSD * pread(2) system call (offset type is 64-bit in NetBSD). 
*/ SCARG(&pra, fd) = SCARG(uap, fd); SCARG(&pra, buf) = SCARG(uap, buf); SCARG(&pra, nbyte) = SCARG(uap, nbyte); SCARG(&pra, offset) = SCARG(uap, off); return pread(td, &pra); } #endif #if defined(NOTYET) int svr4_sys_pread64(td, v, retval) register struct thread *td; void *v; register_t *retval; { struct svr4_sys_pread64_args *uap = v; struct sys_pread_args pra; /* * Just translate the args structure and call the NetBSD * pread(2) system call (offset type is 64-bit in NetBSD). */ SCARG(&pra, fd) = SCARG(uap, fd); SCARG(&pra, buf) = SCARG(uap, buf); SCARG(&pra, nbyte) = SCARG(uap, nbyte); SCARG(&pra, offset) = SCARG(uap, off); return (sys_pread(td, &pra, retval)); } #endif /* NOTYET */ #if defined(NOTYET) int svr4_sys_pwrite(td, uap) register struct thread *td; struct svr4_sys_pwrite_args *uap; { struct pwrite_args pwa; /* * Just translate the args structure and call the NetBSD * pwrite(2) system call (offset type is 64-bit in NetBSD). */ SCARG(&pwa, fd) = SCARG(uap, fd); SCARG(&pwa, buf) = SCARG(uap, buf); SCARG(&pwa, nbyte) = SCARG(uap, nbyte); SCARG(&pwa, offset) = SCARG(uap, off); return pwrite(td, &pwa); } #endif #if defined(NOTYET) int svr4_sys_pwrite64(td, v, retval) register struct thread *td; void *v; register_t *retval; { struct svr4_sys_pwrite64_args *uap = v; struct sys_pwrite_args pwa; /* * Just translate the args structure and call the NetBSD * pwrite(2) system call (offset type is 64-bit in NetBSD). 
*/ SCARG(&pwa, fd) = SCARG(uap, fd); SCARG(&pwa, buf) = SCARG(uap, buf); SCARG(&pwa, nbyte) = SCARG(uap, nbyte); SCARG(&pwa, offset) = SCARG(uap, off); return (sys_pwrite(td, &pwa, retval)); } #endif /* NOTYET */ int svr4_sys_fcntl(td, uap) register struct thread *td; struct svr4_sys_fcntl_args *uap; { int error; struct fcntl_args fa; int *retval; retval = td->td_retval; SCARG(&fa, fd) = SCARG(uap, fd); SCARG(&fa, cmd) = svr4_to_bsd_cmd(SCARG(uap, cmd)); switch (SCARG(&fa, cmd)) { case F_DUPFD: case F_GETFD: case F_SETFD: SCARG(&fa, arg) = (long) SCARG(uap, arg); return fcntl(td, &fa); case F_GETFL: SCARG(&fa, arg) = (long) SCARG(uap, arg); error = fcntl(td, &fa); if (error) return error; *retval = bsd_to_svr4_flags(*retval); return error; case F_SETFL: { /* * we must save the O_ASYNC flag, as that is * handled by ioctl(_, I_SETSIG, _) emulation. */ long cmd; int flags; DPRINTF(("Setting flags %p\n", SCARG(uap, arg))); cmd = SCARG(&fa, cmd); /* save it for a while */ SCARG(&fa, cmd) = F_GETFL; if ((error = fcntl(td, &fa)) != 0) return error; flags = *retval; flags &= O_ASYNC; flags |= svr4_to_bsd_flags((u_long) SCARG(uap, arg)); SCARG(&fa, cmd) = cmd; SCARG(&fa, arg) = (long) flags; return fcntl(td, &fa); } case F_GETLK: case F_SETLK: case F_SETLKW: { struct svr4_flock ifl; struct flock *flp, fl; caddr_t sg = stackgap_init(); flp = stackgap_alloc(&sg, sizeof(struct flock)); SCARG(&fa, arg) = (long) flp; error = copyin(SCARG(uap, arg), &ifl, sizeof ifl); if (error) return error; svr4_to_bsd_flock(&ifl, &fl); error = copyout(&fl, flp, sizeof fl); if (error) return error; error = fcntl(td, &fa); if (error || SCARG(&fa, cmd) != F_GETLK) return error; error = copyin(flp, &fl, sizeof fl); if (error) return error; bsd_to_svr4_flock(&fl, &ifl); return copyout(&ifl, SCARG(uap, arg), sizeof ifl); } case -1: switch (SCARG(uap, cmd)) { case SVR4_F_DUP2FD: { struct dup2_args du; SCARG(&du, from) = SCARG(uap, fd); SCARG(&du, to) = (int)SCARG(uap, arg); error = dup2(td, &du); if 
(error) return error; *retval = SCARG(&du, to); return 0; } case SVR4_F_FREESP: { struct svr4_flock ifl; struct flock fl; error = copyin(SCARG(uap, arg), &ifl, sizeof ifl); if (error) return error; svr4_to_bsd_flock(&ifl, &fl); return fd_truncate(td, SCARG(uap, fd), &fl); } case SVR4_F_GETLK64: case SVR4_F_SETLK64: case SVR4_F_SETLKW64: { struct svr4_flock64 ifl; struct flock *flp, fl; caddr_t sg = stackgap_init(); flp = stackgap_alloc(&sg, sizeof(struct flock)); SCARG(&fa, arg) = (long) flp; error = copyin(SCARG(uap, arg), &ifl, sizeof ifl); if (error) return error; svr4_to_bsd_flock64(&ifl, &fl); error = copyout(&fl, flp, sizeof fl); if (error) return error; error = fcntl(td, &fa); if (error || SCARG(&fa, cmd) != F_GETLK) return error; error = copyin(flp, &fl, sizeof fl); if (error) return error; bsd_to_svr4_flock64(&fl, &ifl); return copyout(&ifl, SCARG(uap, arg), sizeof ifl); } case SVR4_F_FREESP64: { struct svr4_flock64 ifl; struct flock fl; error = copyin(SCARG(uap, arg), &ifl, sizeof ifl); if (error) return error; svr4_to_bsd_flock64(&ifl, &fl); return fd_truncate(td, SCARG(uap, fd), &fl); } case SVR4_F_REVOKE: return fd_revoke(td, SCARG(uap, fd)); default: return ENOSYS; } default: return ENOSYS; } } Index: head/sys/compat/svr4/svr4_filio.c =================================================================== --- head/sys/compat/svr4/svr4_filio.c (revision 89305) +++ head/sys/compat/svr4/svr4_filio.c (revision 89306) @@ -1,228 +1,233 @@ /* * Copyright (c) 1998 Mark Newton * Copyright (c) 1994 Christos Zoulas * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /*#define GROTTY_READ_HACK*/ int svr4_sys_poll(td, uap) struct thread *td; struct svr4_sys_poll_args *uap; { int error; struct poll_args pa; struct pollfd *pfd; int idx = 0, cerr; u_long siz; SCARG(&pa, fds) = SCARG(uap, fds); SCARG(&pa, nfds) = SCARG(uap, nfds); SCARG(&pa, timeout) = SCARG(uap, timeout); siz = SCARG(uap, nfds) * sizeof(struct pollfd); pfd = (struct pollfd *)malloc(siz, M_TEMP, M_WAITOK); error = poll(td, (struct poll_args *)uap); if ((cerr = copyin(SCARG(uap, fds), pfd, siz)) != 0) { error = cerr; goto done; } for (idx = 0; idx < SCARG(uap, nfds); idx++) { /* POLLWRNORM already equals POLLOUT, so we don't worry about that */ if (pfd[idx].revents & (POLLOUT | POLLWRNORM | POLLWRBAND)) pfd[idx].revents |= (POLLOUT | POLLWRNORM | POLLWRBAND); } if ((cerr = copyout(pfd, SCARG(uap, fds), siz)) != 0) { error = cerr; goto done; /* yeah, I know it's the next line, but this way I won't forget to update it if I add more code */ } done: free(pfd, M_TEMP); return error; } #if defined(READ_TEST) int svr4_sys_read(td, uap) struct thread *td; struct svr4_sys_read_args *uap; { struct read_args ra; - struct filedesc *fdp = td->td_proc->p_fd; struct file *fp; struct socket *so = NULL; int so_state; sigset_t sigmask; int rv; SCARG(&ra, fd) = SCARG(uap, fd); SCARG(&ra, buf) = SCARG(uap, buf); SCARG(&ra, nbyte) = SCARG(uap, nbyte); - if ((fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL) { + fp = ffind_hold(td, uap->fd); + if (fp == NULL) { DPRINTF(("Something fishy with the user-supplied file descriptor...\n")); return EBADF; } if (fp->f_type == DTYPE_SOCKET) { so = (struct socket *)fp->f_data; DPRINTF(("fd %d is a socket\n", SCARG(uap, fd))); if (so->so_state & SS_ASYNC) { DPRINTF(("fd %d is an ASYNC socket!\n", SCARG(uap, fd))); } DPRINTF(("Here are its flags: 0x%x\n", so->so_state)); #if 
defined(GROTTY_READ_HACK) so_state = so->so_state; so->so_state &= ~SS_NBIO; #endif } rv = read(td, &ra); DPRINTF(("svr4_read(%d, 0x%0x, %d) = %d\n", SCARG(uap, fd), SCARG(uap, buf), SCARG(uap, nbyte), rv)); if (rv == EAGAIN) { DPRINTF(("sigmask = 0x%x\n", td->td_proc->p_sigmask)); DPRINTF(("sigignore = 0x%x\n", td->td_proc->p_sigignore)); DPRINTF(("sigcaught = 0x%x\n", td->td_proc->p_sigcatch)); DPRINTF(("siglist = 0x%x\n", td->td_proc->p_siglist)); } #if defined(GROTTY_READ_HACK) if (so) { /* We've already checked to see if this is a socket */ so->so_state = so_state; } #endif + fdrop(fp, td); return(rv); } #endif /* READ_TEST */ #if defined(BOGUS) int svr4_sys_write(td, uap) struct thread *td; struct svr4_sys_write_args *uap; { struct write_args wa; - struct filedesc *fdp; struct file *fp; int rv; SCARG(&wa, fd) = SCARG(uap, fd); SCARG(&wa, buf) = SCARG(uap, buf); SCARG(&wa, nbyte) = SCARG(uap, nbyte); rv = write(td, &wa); DPRINTF(("svr4_write(%d, 0x%0x, %d) = %d\n", SCARG(uap, fd), SCARG(uap, buf), SCARG(uap, nbyte), rv)); return(rv); } #endif /* BOGUS */ int svr4_fil_ioctl(fp, td, retval, fd, cmd, data) struct file *fp; struct thread *td; register_t *retval; int fd; u_long cmd; caddr_t data; { int error; int num; struct filedesc *fdp = td->td_proc->p_fd; *retval = 0; + FILEDESC_LOCK(fdp); switch (cmd) { case SVR4_FIOCLEX: fdp->fd_ofileflags[fd] |= UF_EXCLOSE; + FILEDESC_UNLOCK(fdp); return 0; case SVR4_FIONCLEX: fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE; + FILEDESC_UNLOCK(fdp); return 0; case SVR4_FIOGETOWN: case SVR4_FIOSETOWN: case SVR4_FIOASYNC: case SVR4_FIONBIO: case SVR4_FIONREAD: + FILEDESC_UNLOCK(fdp); if ((error = copyin(data, &num, sizeof(num))) != 0) return error; switch (cmd) { case SVR4_FIOGETOWN: cmd = FIOGETOWN; break; case SVR4_FIOSETOWN: cmd = FIOSETOWN; break; case SVR4_FIOASYNC: cmd = FIOASYNC; break; case SVR4_FIONBIO: cmd = FIONBIO; break; case SVR4_FIONREAD: cmd = FIONREAD; break; } #ifdef SVR4_DEBUG if (cmd == FIOASYNC) 
DPRINTF(("FIOASYNC\n")); #endif error = fo_ioctl(fp, cmd, (caddr_t) &num, td); if (error) return error; return copyout(&num, data, sizeof(num)); default: + FILEDESC_UNLOCK(fdp); DPRINTF(("Unknown svr4 filio %lx\n", cmd)); return 0; /* ENOSYS really */ } } Index: head/sys/compat/svr4/svr4_ioctl.c =================================================================== --- head/sys/compat/svr4/svr4_ioctl.c (revision 89305) +++ head/sys/compat/svr4/svr4_ioctl.c (revision 89306) @@ -1,161 +1,168 @@ /* * Copyright (c) 1998 Mark Newton * Copyright (c) 1994 Christos Zoulas * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEBUG_SVR4 static void svr4_decode_cmd __P((u_long, char *, char *, int *, int *)); /* * Decode an ioctl command symbolically */ static void svr4_decode_cmd(cmd, dir, c, num, argsiz) u_long cmd; char *dir, *c; int *num, *argsiz; { if (cmd & SVR4_IOC_VOID) *dir++ = 'V'; if (cmd & SVR4_IOC_IN) *dir++ = 'R'; if (cmd & SVR4_IOC_OUT) *dir++ = 'W'; *dir = '\0'; if (cmd & SVR4_IOC_INOUT) *argsiz = (cmd >> 16) & 0xff; else *argsiz = -1; *c = (cmd >> 8) & 0xff; *num = cmd & 0xff; } #endif int svr4_sys_ioctl(td, uap) register struct thread *td; struct svr4_sys_ioctl_args *uap; { int *retval; struct file *fp; - struct filedesc *fdp; u_long cmd; int (*fun) __P((struct file *, struct thread *, register_t *, int, u_long, caddr_t)); + int error; #ifdef DEBUG_SVR4 char dir[4]; char c; int num; int argsiz; svr4_decode_cmd(SCARG(uap, com), dir, &c, &num, &argsiz); DPRINTF(("svr4_ioctl[%lx](%d, _IO%s(%c, %d, %d), %p);\n", SCARG(uap, com), SCARG(uap, fd), dir, c, num, argsiz, SCARG(uap, data))); #endif retval = td->td_retval; - fdp = td->td_proc->p_fd; cmd = SCARG(uap, com); - if ((u_int)SCARG(uap, fd) >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL) + fp = ffind_hold(td, uap->fd); + if (fp == NULL) return EBADF; - if ((fp->f_flag & (FREAD | FWRITE)) == 0) + if ((fp->f_flag & (FREAD | FWRITE)) == 0) { + fdrop(fp, td); return EBADF; + } #if defined(DEBUG_SVR4) if (fp->f_type == DTYPE_SOCKET) { struct socket *so = (struct socket *)fp->f_data; DPRINTF(("<<< IN: so_state = 0x%x\n", so->so_state)); } #endif switch (cmd & 0xff00) { case SVR4_tIOC: DPRINTF(("ttold\n")); fun = svr4_ttold_ioctl; break; case SVR4_TIOC: DPRINTF(("term\n")); fun = svr4_term_ioctl; break; case SVR4_STR: DPRINTF(("stream\n")); fun = svr4_stream_ioctl; break; case SVR4_FIOC: DPRINTF(("file\n")); fun = 
svr4_fil_ioctl; break; case SVR4_SIOC: DPRINTF(("socket\n")); fun = svr4_sock_ioctl; break; case SVR4_XIOC: /* We do not support those */ + fdrop(fp, td); return EINVAL; default: + fdrop(fp, td); DPRINTF(("Unimplemented ioctl %lx\n", cmd)); return 0; /* XXX: really ENOSYS */ } #if defined(DEBUG_SVR4) if (fp->f_type == DTYPE_SOCKET) { - struct socket *so = (struct socket *)fp->f_data; + struct socket *so; + + so = (struct socket *)fp->f_data; DPRINTF((">>> OUT: so_state = 0x%x\n", so->so_state)); } #endif - return (*fun)(fp, td, retval, SCARG(uap, fd), cmd, SCARG(uap, data)); + error = (*fun)(fp, td, retval, SCARG(uap, fd), cmd, SCARG(uap, data)); + fdrop(fp, td); + return (error); } Index: head/sys/compat/svr4/svr4_misc.c =================================================================== --- head/sys/compat/svr4/svr4_misc.c (revision 89305) +++ head/sys/compat/svr4/svr4_misc.c (revision 89306) @@ -1,1715 +1,1734 @@ /* * Copyright (c) 1998 Mark Newton * Copyright (c) 1994 Christos Zoulas * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ /* * SVR4 compatibility module. * * SVR4 system calls that are implemented differently in BSD are * handled here. */ #include #include #include #include #include #include #include #include #include #include /* Must come after sys/malloc.h */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__FreeBSD__) #include #endif #if defined(NetBSD) # if defined(UVM) # include # endif #endif #define BSD_DIRENT(cp) ((struct dirent *)(cp)) static int svr4_mknod __P((struct thread *, register_t *, char *, svr4_mode_t, svr4_dev_t)); static __inline clock_t timeval_to_clock_t __P((struct timeval *)); static int svr4_setinfo __P((struct proc *, int, svr4_siginfo_t *)); struct svr4_hrtcntl_args; static int svr4_hrtcntl __P((struct thread *, struct svr4_hrtcntl_args *, register_t *)); static void bsd_statfs_to_svr4_statvfs __P((const struct statfs *, struct svr4_statvfs *)); static void bsd_statfs_to_svr4_statvfs64 __P((const struct statfs *, struct svr4_statvfs64 *)); static struct proc *svr4_pfind __P((pid_t pid)); /* BOGUS noop */ #if defined(BOGUS) int svr4_sys_setitimer(td, uap) register struct thread *td; struct svr4_sys_setitimer_args *uap; { td->td_retval[0] = 0; return 0; } #endif int 
svr4_sys_wait(td, uap) struct thread *td; struct svr4_sys_wait_args *uap; { struct wait_args w4; int error, *retval = td->td_retval, st, sig; size_t sz = sizeof(*SCARG(&w4, status)); SCARG(&w4, rusage) = NULL; SCARG(&w4, options) = 0; if (SCARG(uap, status) == NULL) { caddr_t sg = stackgap_init(); SCARG(&w4, status) = stackgap_alloc(&sg, sz); } else SCARG(&w4, status) = SCARG(uap, status); SCARG(&w4, pid) = WAIT_ANY; if ((error = wait4(td, &w4)) != 0) return error; if ((error = copyin(SCARG(&w4, status), &st, sizeof(st))) != 0) return error; if (WIFSIGNALED(st)) { sig = WTERMSIG(st); if (sig >= 0 && sig < NSIG) st = (st & ~0177) | SVR4_BSD2SVR4_SIG(sig); } else if (WIFSTOPPED(st)) { sig = WSTOPSIG(st); if (sig >= 0 && sig < NSIG) st = (st & ~0xff00) | (SVR4_BSD2SVR4_SIG(sig) << 8); } /* * It looks like wait(2) on svr4/solaris/2.4 returns * the status in retval[1], and the pid on retval[0]. */ retval[1] = st; if (SCARG(uap, status)) if ((error = copyout(&st, SCARG(uap, status), sizeof(st))) != 0) return error; return 0; } int svr4_sys_execv(td, uap) struct thread *td; struct svr4_sys_execv_args *uap; { struct execve_args ap; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, path)); SCARG(&ap, fname) = SCARG(uap, path); SCARG(&ap, argv) = SCARG(uap, argp); SCARG(&ap, envv) = NULL; return execve(td, &ap); } int svr4_sys_execve(td, uap) struct thread *td; struct svr4_sys_execve_args *uap; { struct execve_args ap; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, uap->path); SCARG(&ap, fname) = SCARG(uap, path); SCARG(&ap, argv) = SCARG(uap, argp); SCARG(&ap, envv) = SCARG(uap, envp); return execve(td, &ap); } int svr4_sys_time(td, v) struct thread *td; struct svr4_sys_time_args *v; { struct svr4_sys_time_args *uap = v; int error = 0; struct timeval tv; microtime(&tv); if (SCARG(uap, t)) error = copyout(&tv.tv_sec, SCARG(uap, t), sizeof(*(SCARG(uap, t)))); td->td_retval[0] = (int) tv.tv_sec; return error; } /* * Read SVR4-style directory 
entries. We suck them into kernel space so * that they can be massaged before being copied out to user code. * * This code is ported from the Linux emulator: Changes to the VFS interface * between FreeBSD and NetBSD have made it simpler to port it from there than * to adapt the NetBSD version. */ int svr4_sys_getdents64(td, uap) struct thread *td; struct svr4_sys_getdents64_args *uap; { register struct dirent *bdp; struct vnode *vp; caddr_t inp, buf; /* BSD-format */ int len, reclen; /* BSD-format */ caddr_t outp; /* SVR4-format */ int resid, svr4reclen=0; /* SVR4-format */ struct file *fp; struct uio auio; struct iovec aiov; struct vattr va; off_t off; struct svr4_dirent64 svr4_dirent; int buflen, error, eofflag, nbytes, justone; u_long *cookies = NULL, *cookiep; int ncookies; DPRINTF(("svr4_sys_getdents64(%d, *, %d)\n", td->td_proc->p_pid, SCARG(uap, fd), SCARG(uap, nbytes))); if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) { return (error); } - if ((fp->f_flag & FREAD) == 0) + if ((fp->f_flag & FREAD) == 0) { + fdrop(fp, td); return (EBADF); + } vp = (struct vnode *) fp->f_data; - if (vp->v_type != VDIR) + if (vp->v_type != VDIR) { + fdrop(fp, td); return (EINVAL); + } if ((error = VOP_GETATTR(vp, &va, td->td_proc->p_ucred, td))) { + fdrop(fp, td); return error; } nbytes = SCARG(uap, nbytes); if (nbytes == 1) { nbytes = sizeof (struct svr4_dirent64); justone = 1; } else justone = 0; off = fp->f_offset; #define DIRBLKSIZ 512 /* XXX we used to use ufs's DIRBLKSIZ */ buflen = max(DIRBLKSIZ, nbytes); buflen = min(buflen, MAXBSIZE); buf = malloc(buflen, M_TEMP, M_WAITOK); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); again: aiov.iov_base = buf; aiov.iov_len = buflen; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_resid = buflen; auio.uio_offset = off; if (cookies) { free(cookies, M_TEMP); cookies = NULL; } error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, &ncookies, 
&cookies); if (error) { goto out; } inp = buf; outp = (caddr_t) SCARG(uap, dp); resid = nbytes; if ((len = buflen - auio.uio_resid) <= 0) { goto eof; } cookiep = cookies; if (cookies) { /* * When using cookies, the vfs has the option of reading from * a different offset than that supplied (UFS truncates the * offset to a block boundary to make sure that it never reads * partway through a directory entry, even if the directory * has been compacted). */ while (len > 0 && ncookies > 0 && *cookiep <= off) { bdp = (struct dirent *) inp; len -= bdp->d_reclen; inp += bdp->d_reclen; cookiep++; ncookies--; } } while (len > 0) { if (cookiep && ncookies == 0) break; bdp = (struct dirent *) inp; reclen = bdp->d_reclen; if (reclen & 3) { DPRINTF(("svr4_readdir: reclen=%d\n", reclen)); error = EFAULT; goto out; } if (bdp->d_fileno == 0) { inp += reclen; if (cookiep) { off = *cookiep++; ncookies--; } else off += reclen; len -= reclen; continue; } svr4reclen = SVR4_RECLEN(&svr4_dirent, bdp->d_namlen); if (reclen > len || resid < svr4reclen) { outp++; break; } svr4_dirent.d_ino = (long) bdp->d_fileno; if (justone) { /* * old svr4-style readdir usage. 
*/ svr4_dirent.d_off = (svr4_off_t) svr4reclen; svr4_dirent.d_reclen = (u_short) bdp->d_namlen; } else { svr4_dirent.d_off = (svr4_off_t)(off + reclen); svr4_dirent.d_reclen = (u_short) svr4reclen; } strcpy(svr4_dirent.d_name, bdp->d_name); if ((error = copyout((caddr_t)&svr4_dirent, outp, svr4reclen))) goto out; inp += reclen; if (cookiep) { off = *cookiep++; ncookies--; } else off += reclen; outp += svr4reclen; resid -= svr4reclen; len -= reclen; if (justone) break; } if (outp == (caddr_t) SCARG(uap, dp)) goto again; fp->f_offset = off; if (justone) nbytes = resid + svr4reclen; eof: td->td_retval[0] = nbytes - resid; out: + VOP_UNLOCK(vp, 0, td); + fdrop(fp, td); if (cookies) free(cookies, M_TEMP); - VOP_UNLOCK(vp, 0, td); free(buf, M_TEMP); return error; } int svr4_sys_getdents(td, uap) struct thread *td; struct svr4_sys_getdents_args *uap; { struct dirent *bdp; struct vnode *vp; caddr_t inp, buf; /* BSD-format */ int len, reclen; /* BSD-format */ caddr_t outp; /* SVR4-format */ int resid, svr4_reclen; /* SVR4-format */ struct file *fp; struct uio auio; struct iovec aiov; struct svr4_dirent idb; off_t off; /* true file offset */ int buflen, error, eofflag; u_long *cookiebuf = NULL, *cookie; int ncookies = 0, *retval = td->td_retval; if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); - if ((fp->f_flag & FREAD) == 0) + if ((fp->f_flag & FREAD) == 0) { + fdrop(fp, td); return (EBADF); + } vp = (struct vnode *)fp->f_data; - if (vp->v_type != VDIR) + if (vp->v_type != VDIR) { + fdrop(fp, td); return (EINVAL); + } buflen = min(MAXBSIZE, SCARG(uap, nbytes)); buf = malloc(buflen, M_TEMP, M_WAITOK); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); off = fp->f_offset; again: aiov.iov_base = buf; aiov.iov_len = buflen; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_resid = buflen; auio.uio_offset = off; /* * First we read into the malloc'ed buffer, then * we massage it 
into user space, one record at a time. */ error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, &ncookies, &cookiebuf); - if (error) + if (error) { goto out; + } inp = buf; outp = SCARG(uap, buf); resid = SCARG(uap, nbytes); if ((len = buflen - auio.uio_resid) == 0) goto eof; for (cookie = cookiebuf; len > 0; len -= reclen) { bdp = (struct dirent *)inp; reclen = bdp->d_reclen; if (reclen & 3) panic("svr4_sys_getdents64: bad reclen"); off = *cookie++; /* each entry points to the next */ if ((off >> 32) != 0) { uprintf("svr4_sys_getdents64: dir offset too large for emulated program"); error = EINVAL; goto out; } if (bdp->d_fileno == 0) { inp += reclen; /* it is a hole; squish it out */ continue; } svr4_reclen = SVR4_RECLEN(&idb, bdp->d_namlen); if (reclen > len || resid < svr4_reclen) { /* entry too big for buffer, so just stop */ outp++; break; } /* * Massage in place to make a SVR4-shaped dirent (otherwise * we have to worry about touching user memory outside of * the copyout() call). */ idb.d_ino = (svr4_ino_t)bdp->d_fileno; idb.d_off = (svr4_off_t)off; idb.d_reclen = (u_short)svr4_reclen; strcpy(idb.d_name, bdp->d_name); if ((error = copyout((caddr_t)&idb, outp, svr4_reclen))) goto out; /* advance past this real entry */ inp += reclen; /* advance output past SVR4-shaped entry */ outp += svr4_reclen; resid -= svr4_reclen; } /* if we squished out the whole block, try again */ if (outp == SCARG(uap, buf)) goto again; fp->f_offset = off; /* update the vnode offset */ eof: *retval = SCARG(uap, nbytes) - resid; out: VOP_UNLOCK(vp, 0, td); + fdrop(fp, td); if (cookiebuf) free(cookiebuf, M_TEMP); free(buf, M_TEMP); return error; } int svr4_sys_mmap(td, uap) struct thread *td; struct svr4_sys_mmap_args *uap; { struct mmap_args mm; int *retval; retval = td->td_retval; #define _MAP_NEW 0x80000000 /* * Verify the arguments. */ if (SCARG(uap, prot) & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) return EINVAL; /* XXX still needed? 
*/ if (SCARG(uap, len) == 0) return EINVAL; SCARG(&mm, prot) = SCARG(uap, prot); SCARG(&mm, len) = SCARG(uap, len); SCARG(&mm, flags) = SCARG(uap, flags) & ~_MAP_NEW; SCARG(&mm, fd) = SCARG(uap, fd); SCARG(&mm, addr) = SCARG(uap, addr); SCARG(&mm, pos) = SCARG(uap, pos); return mmap(td, &mm); } int svr4_sys_mmap64(td, uap) struct thread *td; struct svr4_sys_mmap64_args *uap; { struct mmap_args mm; void *rp; #define _MAP_NEW 0x80000000 /* * Verify the arguments. */ if (SCARG(uap, prot) & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) return EINVAL; /* XXX still needed? */ if (SCARG(uap, len) == 0) return EINVAL; SCARG(&mm, prot) = SCARG(uap, prot); SCARG(&mm, len) = SCARG(uap, len); SCARG(&mm, flags) = SCARG(uap, flags) & ~_MAP_NEW; SCARG(&mm, fd) = SCARG(uap, fd); SCARG(&mm, addr) = SCARG(uap, addr); SCARG(&mm, pos) = SCARG(uap, pos); rp = (void *) round_page((vm_offset_t)(td->td_proc->p_vmspace->vm_daddr + maxdsiz)); if ((SCARG(&mm, flags) & MAP_FIXED) == 0 && SCARG(&mm, addr) != 0 && (void *)SCARG(&mm, addr) < rp) SCARG(&mm, addr) = rp; return mmap(td, &mm); } int svr4_sys_fchroot(td, uap) struct thread *td; struct svr4_sys_fchroot_args *uap; { struct filedesc *fdp = td->td_proc->p_fd; struct vnode *vp; struct file *fp; int error; if ((error = suser_td(td)) != 0) return error; if ((error = getvnode(fdp, SCARG(uap, fd), &fp)) != 0) return error; vp = (struct vnode *) fp->f_data; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (vp->v_type != VDIR) error = ENOTDIR; else error = VOP_ACCESS(vp, VEXEC, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); if (error) + fdrop(fp, td); return error; + } VREF(vp); - if (fdp->fd_rdir != NULL) - vrele(fdp->fd_rdir); + FILEDESC_LOCK(fdp); + vpold = fdp->fd_rdir; fdp->fd_rdir = vp; + FILEDESC_UNLOCK(fdp); + if (vpold != NULL) + vrele(vpold); + fdrop(fp, td); return 0; } static int svr4_mknod(td, retval, path, mode, dev) struct thread *td; register_t *retval; char *path; svr4_mode_t mode; svr4_dev_t dev; { caddr_t sg = stackgap_init(); 
CHECKALTEXIST(td, &sg, path); if (S_ISFIFO(mode)) { struct mkfifo_args ap; SCARG(&ap, path) = path; SCARG(&ap, mode) = mode; return mkfifo(td, &ap); } else { struct mknod_args ap; SCARG(&ap, path) = path; SCARG(&ap, mode) = mode; SCARG(&ap, dev) = dev; return mknod(td, &ap); } } int svr4_sys_mknod(td, uap) register struct thread *td; struct svr4_sys_mknod_args *uap; { int *retval = td->td_retval; return svr4_mknod(td, retval, SCARG(uap, path), SCARG(uap, mode), (svr4_dev_t)svr4_to_bsd_odev_t(SCARG(uap, dev))); } int svr4_sys_xmknod(td, uap) struct thread *td; struct svr4_sys_xmknod_args *uap; { int *retval = td->td_retval; return svr4_mknod(td, retval, SCARG(uap, path), SCARG(uap, mode), (svr4_dev_t)svr4_to_bsd_dev_t(SCARG(uap, dev))); } int svr4_sys_vhangup(td, uap) struct thread *td; struct svr4_sys_vhangup_args *uap; { return 0; } int svr4_sys_sysconfig(td, uap) struct thread *td; struct svr4_sys_sysconfig_args *uap; { int *retval; retval = &(td->td_retval[0]); switch (SCARG(uap, name)) { case SVR4_CONFIG_UNUSED: *retval = 0; break; case SVR4_CONFIG_NGROUPS: *retval = NGROUPS_MAX; break; case SVR4_CONFIG_CHILD_MAX: *retval = maxproc; break; case SVR4_CONFIG_OPEN_FILES: *retval = maxfiles; break; case SVR4_CONFIG_POSIX_VER: *retval = 198808; break; case SVR4_CONFIG_PAGESIZE: *retval = PAGE_SIZE; break; case SVR4_CONFIG_CLK_TCK: *retval = 60; /* should this be `hz', ie. 100? */ break; case SVR4_CONFIG_XOPEN_VER: *retval = 2; /* XXX: What should that be? */ break; case SVR4_CONFIG_PROF_TCK: *retval = 60; /* XXX: What should that be? 
*/ break; case SVR4_CONFIG_NPROC_CONF: *retval = 1; /* Only one processor for now */ break; case SVR4_CONFIG_NPROC_ONLN: *retval = 1; /* And it better be online */ break; case SVR4_CONFIG_AIO_LISTIO_MAX: case SVR4_CONFIG_AIO_MAX: case SVR4_CONFIG_AIO_PRIO_DELTA_MAX: *retval = 0; /* No aio support */ break; case SVR4_CONFIG_DELAYTIMER_MAX: *retval = 0; /* No delaytimer support */ break; case SVR4_CONFIG_MQ_OPEN_MAX: *retval = msginfo.msgmni; break; case SVR4_CONFIG_MQ_PRIO_MAX: *retval = 0; /* XXX: Don't know */ break; case SVR4_CONFIG_RTSIG_MAX: *retval = 0; break; case SVR4_CONFIG_SEM_NSEMS_MAX: *retval = seminfo.semmni; break; case SVR4_CONFIG_SEM_VALUE_MAX: *retval = seminfo.semvmx; break; case SVR4_CONFIG_SIGQUEUE_MAX: *retval = 0; /* XXX: Don't know */ break; case SVR4_CONFIG_SIGRT_MIN: case SVR4_CONFIG_SIGRT_MAX: *retval = 0; /* No real time signals */ break; case SVR4_CONFIG_TIMER_MAX: *retval = 3; /* XXX: real, virtual, profiling */ break; #if defined(NOTYET) case SVR4_CONFIG_PHYS_PAGES: #if defined(UVM) *retval = uvmexp.free; /* XXX: free instead of total */ #else *retval = cnt.v_free_count; /* XXX: free instead of total */ #endif break; case SVR4_CONFIG_AVPHYS_PAGES: #if defined(UVM) *retval = uvmexp.active; /* XXX: active instead of avg */ #else *retval = cnt.v_active_count; /* XXX: active instead of avg */ #endif break; #endif /* NOTYET */ default: return EINVAL; } return 0; } extern int swap_pager_full; /* ARGSUSED */ int svr4_sys_break(td, uap) struct thread *td; struct svr4_sys_break_args *uap; { struct vmspace *vm = td->td_proc->p_vmspace; vm_offset_t new, old, base, ns; int rv; base = round_page((vm_offset_t) vm->vm_daddr); ns = (vm_offset_t)SCARG(uap, nsize); new = round_page(ns); /* For p_rlimit. 
*/ mtx_assert(&Giant, MA_OWNED); if (new > base) { if ((new - base) > (unsigned) td->td_proc->p_rlimit[RLIMIT_DATA].rlim_cur) { return ENOMEM; } if (new >= VM_MAXUSER_ADDRESS) { return (ENOMEM); } } else if (new < base) { /* * This is simply an invalid value. If someone wants to * do fancy address space manipulations, mmap and munmap * can do most of what the user would want. */ return EINVAL; } old = base + ctob(vm->vm_dsize); if (new > old) { vm_size_t diff; if (swap_pager_full) { return (ENOMEM); } diff = new - old; rv = vm_map_find(&vm->vm_map, NULL, 0, &old, diff, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0); if (rv != KERN_SUCCESS) { return (ENOMEM); } vm->vm_dsize += btoc(diff); } else if (new < old) { rv = vm_map_remove(&vm->vm_map, new, old); if (rv != KERN_SUCCESS) { return (ENOMEM); } vm->vm_dsize -= btoc(old - new); } return (0); } static __inline clock_t timeval_to_clock_t(tv) struct timeval *tv; { return tv->tv_sec * hz + tv->tv_usec / (1000000 / hz); } int svr4_sys_times(td, uap) struct thread *td; struct svr4_sys_times_args *uap; { int error, *retval = td->td_retval; struct tms tms; struct timeval t; struct rusage *ru; struct rusage r; struct getrusage_args ga; caddr_t sg = stackgap_init(); ru = stackgap_alloc(&sg, sizeof(struct rusage)); SCARG(&ga, who) = RUSAGE_SELF; SCARG(&ga, rusage) = ru; error = getrusage(td, &ga); if (error) return error; if ((error = copyin(ru, &r, sizeof r)) != 0) return error; tms.tms_utime = timeval_to_clock_t(&r.ru_utime); tms.tms_stime = timeval_to_clock_t(&r.ru_stime); SCARG(&ga, who) = RUSAGE_CHILDREN; error = getrusage(td, &ga); if (error) return error; if ((error = copyin(ru, &r, sizeof r)) != 0) return error; tms.tms_cutime = timeval_to_clock_t(&r.ru_utime); tms.tms_cstime = timeval_to_clock_t(&r.ru_stime); microtime(&t); *retval = timeval_to_clock_t(&t); return copyout(&tms, SCARG(uap, tp), sizeof(tms)); } int svr4_sys_ulimit(td, uap) struct thread *td; struct svr4_sys_ulimit_args *uap; { int *retval = td->td_retval; 
switch (SCARG(uap, cmd)) { case SVR4_GFILLIM: /* For p_rlimit below. */ mtx_assert(&Giant, MA_OWNED); *retval = td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur / 512; if (*retval == -1) *retval = 0x7fffffff; return 0; case SVR4_SFILLIM: { int error; struct __setrlimit_args srl; struct rlimit krl; caddr_t sg = stackgap_init(); struct rlimit *url = (struct rlimit *) stackgap_alloc(&sg, sizeof *url); krl.rlim_cur = SCARG(uap, newlimit) * 512; mtx_assert(&Giant, MA_OWNED); krl.rlim_max = td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_max; error = copyout(&krl, url, sizeof(*url)); if (error) return error; SCARG(&srl, which) = RLIMIT_FSIZE; SCARG(&srl, rlp) = url; error = setrlimit(td, &srl); if (error) return error; mtx_assert(&Giant, MA_OWNED); *retval = td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur; if (*retval == -1) *retval = 0x7fffffff; return 0; } case SVR4_GMEMLIM: { struct vmspace *vm = td->td_proc->p_vmspace; register_t r; mtx_assert(&Giant, MA_OWNED); r = td->td_proc->p_rlimit[RLIMIT_DATA].rlim_cur; if (r == -1) r = 0x7fffffff; r += (long) vm->vm_daddr; if (r < 0) r = 0x7fffffff; *retval = r; return 0; } case SVR4_GDESLIM: mtx_assert(&Giant, MA_OWNED); *retval = td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur; if (*retval == -1) *retval = 0x7fffffff; return 0; default: return EINVAL; } } static struct proc * svr4_pfind(pid) pid_t pid; { struct proc *p; /* look in the live processes */ if ((p = pfind(pid)) == NULL) /* look in the zombies */ p = zpfind(pid); return p; } int svr4_sys_pgrpsys(td, uap) struct thread *td; struct svr4_sys_pgrpsys_args *uap; { int *retval = td->td_retval; struct proc *p = td->td_proc; switch (SCARG(uap, cmd)) { case 1: /* setpgrp() */ /* * SVR4 setpgrp() (which takes no arguments) has the * semantics that the session ID is also created anew, so * in almost every sense, setpgrp() is identical to * setsid() for SVR4. (Under BSD, the difference is that * a setpgid(0,0) will not create a new session.) 
*/ setsid(td, NULL); /*FALLTHROUGH*/ case 0: /* getpgrp() */ *retval = p->p_pgrp->pg_id; return 0; case 2: /* getsid(pid) */ if (SCARG(uap, pid) != 0 && (p = svr4_pfind(SCARG(uap, pid))) == NULL) return ESRCH; /* * This has already been initialized to the pid of * the session leader. */ *retval = (register_t) p->p_session->s_leader->p_pid; PROC_UNLOCK(p); return 0; case 3: /* setsid() */ return setsid(td, NULL); case 4: /* getpgid(pid) */ if (SCARG(uap, pid) != 0 && (p = svr4_pfind(SCARG(uap, pid))) == NULL) return ESRCH; *retval = (int) p->p_pgrp->pg_id; PROC_UNLOCK(p); return 0; case 5: /* setpgid(pid, pgid); */ { struct setpgid_args sa; SCARG(&sa, pid) = SCARG(uap, pid); SCARG(&sa, pgid) = SCARG(uap, pgid); return setpgid(td, &sa); } default: return EINVAL; } } #define syscallarg(x) union { x datum; register_t pad; } struct svr4_hrtcntl_args { int cmd; int fun; int clk; svr4_hrt_interval_t * iv; svr4_hrt_time_t * ti; }; static int svr4_hrtcntl(td, uap, retval) struct thread *td; struct svr4_hrtcntl_args *uap; register_t *retval; { switch (SCARG(uap, fun)) { case SVR4_HRT_CNTL_RES: DPRINTF(("htrcntl(RES)\n")); *retval = SVR4_HRT_USEC; return 0; case SVR4_HRT_CNTL_TOFD: DPRINTF(("htrcntl(TOFD)\n")); { struct timeval tv; svr4_hrt_time_t t; if (SCARG(uap, clk) != SVR4_HRT_CLK_STD) { DPRINTF(("clk == %d\n", SCARG(uap, clk))); return EINVAL; } if (SCARG(uap, ti) == NULL) { DPRINTF(("ti NULL\n")); return EINVAL; } microtime(&tv); t.h_sec = tv.tv_sec; t.h_rem = tv.tv_usec; t.h_res = SVR4_HRT_USEC; return copyout(&t, SCARG(uap, ti), sizeof(t)); } case SVR4_HRT_CNTL_START: DPRINTF(("htrcntl(START)\n")); return ENOSYS; case SVR4_HRT_CNTL_GET: DPRINTF(("htrcntl(GET)\n")); return ENOSYS; default: DPRINTF(("Bad htrcntl command %d\n", SCARG(uap, fun))); return ENOSYS; } } int svr4_sys_hrtsys(td, uap) struct thread *td; struct svr4_sys_hrtsys_args *uap; { int *retval = td->td_retval; switch (SCARG(uap, cmd)) { case SVR4_HRT_CNTL: return svr4_hrtcntl(td, (struct 
svr4_hrtcntl_args *) uap, retval); case SVR4_HRT_ALRM: DPRINTF(("hrtalarm\n")); return ENOSYS; case SVR4_HRT_SLP: DPRINTF(("hrtsleep\n")); return ENOSYS; case SVR4_HRT_CAN: DPRINTF(("hrtcancel\n")); return ENOSYS; default: DPRINTF(("Bad hrtsys command %d\n", SCARG(uap, cmd))); return EINVAL; } } static int svr4_setinfo(p, st, s) struct proc *p; int st; svr4_siginfo_t *s; { svr4_siginfo_t i; int sig; memset(&i, 0, sizeof(i)); i.si_signo = SVR4_SIGCHLD; i.si_errno = 0; /* XXX? */ if (p) { i.si_pid = p->p_pid; mtx_lock_spin(&sched_lock); if (p->p_stat == SZOMB) { i.si_stime = p->p_ru->ru_stime.tv_sec; i.si_utime = p->p_ru->ru_utime.tv_sec; } else { i.si_stime = p->p_stats->p_ru.ru_stime.tv_sec; i.si_utime = p->p_stats->p_ru.ru_utime.tv_sec; } mtx_unlock_spin(&sched_lock); } if (WIFEXITED(st)) { i.si_status = WEXITSTATUS(st); i.si_code = SVR4_CLD_EXITED; } else if (WIFSTOPPED(st)) { sig = WSTOPSIG(st); if (sig >= 0 && sig < NSIG) i.si_status = SVR4_BSD2SVR4_SIG(sig); if (i.si_status == SVR4_SIGCONT) i.si_code = SVR4_CLD_CONTINUED; else i.si_code = SVR4_CLD_STOPPED; } else { sig = WTERMSIG(st); if (sig >= 0 && sig < NSIG) i.si_status = SVR4_BSD2SVR4_SIG(sig); if (WCOREDUMP(st)) i.si_code = SVR4_CLD_DUMPED; else i.si_code = SVR4_CLD_KILLED; } DPRINTF(("siginfo [pid %ld signo %d code %d errno %d status %d]\n", i.si_pid, i.si_signo, i.si_code, i.si_errno, i.si_status)); return copyout(&i, s, sizeof(i)); } int svr4_sys_waitsys(td, uap) struct thread *td; struct svr4_sys_waitsys_args *uap; { int nfound; int error, *retval = td->td_retval; struct proc *q, *t; switch (SCARG(uap, grp)) { case SVR4_P_PID: break; case SVR4_P_PGID: SCARG(uap, id) = -td->td_proc->p_pgid; break; case SVR4_P_ALL: SCARG(uap, id) = WAIT_ANY; break; default: return EINVAL; } DPRINTF(("waitsys(%d, %d, %p, %x)\n", SCARG(uap, grp), SCARG(uap, id), SCARG(uap, info), SCARG(uap, options))); loop: nfound = 0; sx_slock(&proctree_lock); LIST_FOREACH(q, &td->td_proc->p_children, p_sibling) { + PROC_LOCK(q); if 
(SCARG(uap, id) != WAIT_ANY && q->p_pid != SCARG(uap, id) && q->p_pgid != -SCARG(uap, id)) { + PROC_UNLOCK(q); DPRINTF(("pid %d pgid %d != %d\n", q->p_pid, q->p_pgid, SCARG(uap, id))); continue; } nfound++; - PROC_LOCK(q); mtx_lock_spin(&sched_lock); if (q->p_stat == SZOMB && ((SCARG(uap, options) & (SVR4_WEXITED|SVR4_WTRAPPED)))) { mtx_unlock_spin(&sched_lock); PROC_UNLOCK(q); sx_sunlock(&proctree_lock); *retval = 0; DPRINTF(("found %d\n", q->p_pid)); error = svr4_setinfo(q, q->p_xstat, SCARG(uap, info)); if (error != 0) return error; if ((SCARG(uap, options) & SVR4_WNOWAIT)) { DPRINTF(("Don't wait\n")); return 0; } /* * If we got the child via ptrace(2) or procfs, and * the parent is different (meaning the process was * attached, rather than run as a child), then we need * to give it back to the old parent, and send the * parent a SIGCHLD. The rest of the cleanup will be * done when the old parent waits on the child. */ sx_xlock(&proctree_lock); PROC_LOCK(q); if (q->p_flag & P_TRACED) { if (q->p_oppid != q->p_pptr->p_pid) { PROC_UNLOCK(q); t = pfind(q->p_oppid); PROC_LOCK(q); proc_reparent(q, t ? t : initproc); q->p_oppid = 0; q->p_flag &= ~(P_TRACED | P_WAITED); PROC_UNLOCK(q); psignal(t, SIGCHLD); wakeup(t); PROC_UNLOCK(t); sx_xunlock(&proctree_lock); return 0; } } PROC_UNLOCK(q); sx_xunlock(&proctree_lock); q->p_xstat = 0; ruadd(&td->td_proc->p_stats->p_cru, q->p_ru); FREE(q->p_ru, M_ZOMBIE); q->p_ru = 0; /* * Decrement the count of procs running with this uid. */ (void)chgproccnt(q->p_ucred->cr_ruidinfo, -1, 0); /* * Release reference to text vnode. */ if (q->p_textvp) vrele(q->p_textvp); /* * Free up credentials. */ crfree(q->p_ucred); q->p_ucred = NULL; /* * Remove unused arguments */ if (q->p_args && --q->p_args->ar_ref == 0) FREE(q->p_args, M_PARGS); PROC_UNLOCK(q); /* * Finally finished with old proc entry. * Unlink it from its process group and free it. 
*/ leavepgrp(q); sx_xlock(&allproc_lock); LIST_REMOVE(q, p_list); /* off zombproc */ sx_xunlock(&allproc_lock); sx_xlock(&proctree_lock); LIST_REMOVE(q, p_sibling); sx_xunlock(&proctree_lock); PROC_LOCK(q); if (--q->p_procsig->ps_refcnt == 0) { if (q->p_sigacts != &q->p_uarea->u_sigacts) FREE(q->p_sigacts, M_SUBPROC); FREE(q->p_procsig, M_SUBPROC); q->p_procsig = NULL; } PROC_UNLOCK(q); /* * Give machine-dependent layer a chance * to free anything that cpu_exit couldn't * release while still running in process context. */ cpu_wait(q); #if defined(__NetBSD__) pool_put(&proc_pool, q); #endif #ifdef __FreeBSD__ mtx_destroy(&q->p_mtx); zfree(proc_zone, q); #endif nprocs--; return 0; } if (q->p_stat == SSTOP && (q->p_flag & P_WAITED) == 0 && (q->p_flag & P_TRACED || (SCARG(uap, options) & (SVR4_WSTOPPED|SVR4_WCONTINUED)))) { mtx_unlock_spin(&sched_lock); DPRINTF(("jobcontrol %d\n", q->p_pid)); if (((SCARG(uap, options) & SVR4_WNOWAIT)) == 0) q->p_flag |= P_WAITED; PROC_UNLOCK(q); *retval = 0; return svr4_setinfo(q, W_STOPCODE(q->p_xstat), SCARG(uap, info)); } mtx_unlock_spin(&sched_lock); PROC_UNLOCK(q); } if (nfound == 0) return ECHILD; if (SCARG(uap, options) & SVR4_WNOHANG) { *retval = 0; if ((error = svr4_setinfo(NULL, 0, SCARG(uap, info))) != 0) return error; return 0; } if ((error = tsleep((caddr_t)td->td_proc, PWAIT | PCATCH, "svr4_wait", 0)) != 0) return error; goto loop; } static void bsd_statfs_to_svr4_statvfs(bfs, sfs) const struct statfs *bfs; struct svr4_statvfs *sfs; { sfs->f_bsize = bfs->f_iosize; /* XXX */ sfs->f_frsize = bfs->f_bsize; sfs->f_blocks = bfs->f_blocks; sfs->f_bfree = bfs->f_bfree; sfs->f_bavail = bfs->f_bavail; sfs->f_files = bfs->f_files; sfs->f_ffree = bfs->f_ffree; sfs->f_favail = bfs->f_ffree; sfs->f_fsid = bfs->f_fsid.val[0]; memcpy(sfs->f_basetype, bfs->f_fstypename, sizeof(sfs->f_basetype)); sfs->f_flag = 0; if (bfs->f_flags & MNT_RDONLY) sfs->f_flag |= SVR4_ST_RDONLY; if (bfs->f_flags & MNT_NOSUID) sfs->f_flag |= SVR4_ST_NOSUID; 
sfs->f_namemax = MAXNAMLEN; memcpy(sfs->f_fstr, bfs->f_fstypename, sizeof(sfs->f_fstr)); /* XXX */ memset(sfs->f_filler, 0, sizeof(sfs->f_filler)); } static void bsd_statfs_to_svr4_statvfs64(bfs, sfs) const struct statfs *bfs; struct svr4_statvfs64 *sfs; { sfs->f_bsize = bfs->f_iosize; /* XXX */ sfs->f_frsize = bfs->f_bsize; sfs->f_blocks = bfs->f_blocks; sfs->f_bfree = bfs->f_bfree; sfs->f_bavail = bfs->f_bavail; sfs->f_files = bfs->f_files; sfs->f_ffree = bfs->f_ffree; sfs->f_favail = bfs->f_ffree; sfs->f_fsid = bfs->f_fsid.val[0]; memcpy(sfs->f_basetype, bfs->f_fstypename, sizeof(sfs->f_basetype)); sfs->f_flag = 0; if (bfs->f_flags & MNT_RDONLY) sfs->f_flag |= SVR4_ST_RDONLY; if (bfs->f_flags & MNT_NOSUID) sfs->f_flag |= SVR4_ST_NOSUID; sfs->f_namemax = MAXNAMLEN; memcpy(sfs->f_fstr, bfs->f_fstypename, sizeof(sfs->f_fstr)); /* XXX */ memset(sfs->f_filler, 0, sizeof(sfs->f_filler)); } int svr4_sys_statvfs(td, uap) struct thread *td; struct svr4_sys_statvfs_args *uap; { struct statfs_args fs_args; caddr_t sg = stackgap_init(); struct statfs *fs = stackgap_alloc(&sg, sizeof(struct statfs)); struct statfs bfs; struct svr4_statvfs sfs; int error; CHECKALTEXIST(td, &sg, SCARG(uap, path)); SCARG(&fs_args, path) = SCARG(uap, path); SCARG(&fs_args, buf) = fs; if ((error = statfs(td, &fs_args)) != 0) return error; if ((error = copyin(fs, &bfs, sizeof(bfs))) != 0) return error; bsd_statfs_to_svr4_statvfs(&bfs, &sfs); return copyout(&sfs, SCARG(uap, fs), sizeof(sfs)); } int svr4_sys_fstatvfs(td, uap) struct thread *td; struct svr4_sys_fstatvfs_args *uap; { struct fstatfs_args fs_args; caddr_t sg = stackgap_init(); struct statfs *fs = stackgap_alloc(&sg, sizeof(struct statfs)); struct statfs bfs; struct svr4_statvfs sfs; int error; SCARG(&fs_args, fd) = SCARG(uap, fd); SCARG(&fs_args, buf) = fs; if ((error = fstatfs(td, &fs_args)) != 0) return error; if ((error = copyin(fs, &bfs, sizeof(bfs))) != 0) return error; bsd_statfs_to_svr4_statvfs(&bfs, &sfs); return copyout(&sfs, 
SCARG(uap, fs), sizeof(sfs)); } int svr4_sys_statvfs64(td, uap) struct thread *td; struct svr4_sys_statvfs64_args *uap; { struct statfs_args fs_args; caddr_t sg = stackgap_init(); struct statfs *fs = stackgap_alloc(&sg, sizeof(struct statfs)); struct statfs bfs; struct svr4_statvfs64 sfs; int error; CHECKALTEXIST(td, &sg, SCARG(uap, path)); SCARG(&fs_args, path) = SCARG(uap, path); SCARG(&fs_args, buf) = fs; if ((error = statfs(td, &fs_args)) != 0) return error; if ((error = copyin(fs, &bfs, sizeof(bfs))) != 0) return error; bsd_statfs_to_svr4_statvfs64(&bfs, &sfs); return copyout(&sfs, SCARG(uap, fs), sizeof(sfs)); } int svr4_sys_fstatvfs64(td, uap) struct thread *td; struct svr4_sys_fstatvfs64_args *uap; { struct fstatfs_args fs_args; caddr_t sg = stackgap_init(); struct statfs *fs = stackgap_alloc(&sg, sizeof(struct statfs)); struct statfs bfs; struct svr4_statvfs64 sfs; int error; SCARG(&fs_args, fd) = SCARG(uap, fd); SCARG(&fs_args, buf) = fs; if ((error = fstatfs(td, &fs_args)) != 0) return error; if ((error = copyin(fs, &bfs, sizeof(bfs))) != 0) return error; bsd_statfs_to_svr4_statvfs64(&bfs, &sfs); return copyout(&sfs, SCARG(uap, fs), sizeof(sfs)); } int svr4_sys_alarm(td, uap) struct thread *td; struct svr4_sys_alarm_args *uap; { int error; struct itimerval *itp, *oitp; struct setitimer_args sa; caddr_t sg = stackgap_init(); itp = stackgap_alloc(&sg, sizeof(*itp)); oitp = stackgap_alloc(&sg, sizeof(*oitp)); timevalclear(&itp->it_interval); itp->it_value.tv_sec = SCARG(uap, sec); itp->it_value.tv_usec = 0; SCARG(&sa, which) = ITIMER_REAL; SCARG(&sa, itv) = itp; SCARG(&sa, oitv) = oitp; error = setitimer(td, &sa); if (error) return error; if (oitp->it_value.tv_usec) oitp->it_value.tv_sec++; td->td_retval[0] = oitp->it_value.tv_sec; return 0; } int svr4_sys_gettimeofday(td, uap) struct thread *td; struct svr4_sys_gettimeofday_args *uap; { if (SCARG(uap, tp)) { struct timeval atv; microtime(&atv); return copyout(&atv, SCARG(uap, tp), sizeof (atv)); } return 
0; } int svr4_sys_facl(td, uap) struct thread *td; struct svr4_sys_facl_args *uap; { int *retval; retval = td->td_retval; *retval = 0; switch (SCARG(uap, cmd)) { case SVR4_SYS_SETACL: /* We don't support acls on any filesystem */ return ENOSYS; case SVR4_SYS_GETACL: return copyout(retval, &SCARG(uap, num), sizeof(SCARG(uap, num))); case SVR4_SYS_GETACLCNT: return 0; default: return EINVAL; } } int svr4_sys_acl(td, uap) struct thread *td; struct svr4_sys_acl_args *uap; { /* XXX: for now the same */ return svr4_sys_facl(td, (struct svr4_sys_facl_args *)uap); } int svr4_sys_auditsys(td, uap) struct thread *td; struct svr4_sys_auditsys_args *uap; { /* * XXX: Big brother is *not* watching. */ return 0; } int svr4_sys_memcntl(td, uap) struct thread *td; struct svr4_sys_memcntl_args *uap; { switch (SCARG(uap, cmd)) { case SVR4_MC_SYNC: { struct msync_args msa; SCARG(&msa, addr) = SCARG(uap, addr); SCARG(&msa, len) = SCARG(uap, len); SCARG(&msa, flags) = (int)SCARG(uap, arg); return msync(td, &msa); } case SVR4_MC_ADVISE: { struct madvise_args maa; SCARG(&maa, addr) = SCARG(uap, addr); SCARG(&maa, len) = SCARG(uap, len); SCARG(&maa, behav) = (int)SCARG(uap, arg); return madvise(td, &maa); } case SVR4_MC_LOCK: case SVR4_MC_UNLOCK: case SVR4_MC_LOCKAS: case SVR4_MC_UNLOCKAS: return EOPNOTSUPP; default: return ENOSYS; } } int svr4_sys_nice(td, uap) struct thread *td; struct svr4_sys_nice_args *uap; { struct setpriority_args ap; int error; SCARG(&ap, which) = PRIO_PROCESS; SCARG(&ap, who) = 0; SCARG(&ap, prio) = SCARG(uap, prio); if ((error = setpriority(td, &ap)) != 0) return error; /* the cast is stupid, but the structures are the same */ if ((error = getpriority(td, (struct getpriority_args *)&ap)) != 0) return error; return 0; } int svr4_sys_resolvepath(td, uap) struct thread *td; struct svr4_sys_resolvepath_args *uap; { struct nameidata nd; int error, *retval = td->td_retval; NDINIT(&nd, LOOKUP, NOFOLLOW | SAVENAME, UIO_USERSPACE, SCARG(uap, path), td); if ((error = 
namei(&nd)) != 0) return error; if ((error = copyout(nd.ni_cnd.cn_pnbuf, SCARG(uap, buf), SCARG(uap, bufsiz))) != 0) goto bad; *retval = strlen(nd.ni_cnd.cn_pnbuf) < SCARG(uap, bufsiz) ? strlen(nd.ni_cnd.cn_pnbuf) + 1 : SCARG(uap, bufsiz); bad: NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_vp); return error; } Index: head/sys/compat/svr4/svr4_stream.c =================================================================== --- head/sys/compat/svr4/svr4_stream.c (revision 89305) +++ head/sys/compat/svr4/svr4_stream.c (revision 89306) @@ -1,2273 +1,2302 @@ /* * Copyright (c) 1998 Mark Newton. All rights reserved. * Copyright (c) 1994, 1996 Christos Zoulas. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Christos Zoulas. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ /* * Pretend that we have streams... * Yes, this is gross. * * ToDo: The state machine for getmsg needs re-thinking */ #define COMPAT_43 1 #include #include #include #include #include #include #include #include /* Must come after sys/malloc.h */ #include #include #include #include #include #include #include #include #include #include #include #include /* Must come after sys/uio.h */ #include #include #include #include #include #include #include #include #include #include #include #include /* Utils */ static int clean_pipe __P((struct thread *, const char *)); static void getparm __P((struct file *, struct svr4_si_sockparms *)); +static int svr4_do_putmsg __P((struct proc *, struct svr4_sys_putmsg_args *, + struct file *)); +static int svr4_do_getmsg __P((struct proc *, struct svr4_sys_getmsg_args *, + struct file *)); /* Address Conversions */ static void sockaddr_to_netaddr_in __P((struct svr4_strmcmd *, const struct sockaddr_in *)); static void sockaddr_to_netaddr_un __P((struct svr4_strmcmd *, const struct sockaddr_un *)); static void netaddr_to_sockaddr_in __P((struct sockaddr_in *, const struct svr4_strmcmd *)); static void netaddr_to_sockaddr_un __P((struct sockaddr_un *, const struct svr4_strmcmd *)); /* stream ioctls */ static int i_nread __P((struct file *, struct thread *, register_t *, int, u_long, caddr_t)); static int i_fdinsert __P((struct file *, struct thread *, register_t *, int, u_long, caddr_t)); static int i_str __P((struct file *, 
struct thread *, register_t *, int, u_long, caddr_t)); static int i_setsig __P((struct file *, struct thread *, register_t *, int, u_long, caddr_t)); static int i_getsig __P((struct file *, struct thread *, register_t *, int, u_long, caddr_t)); static int _i_bind_rsvd __P((struct file *, struct thread *, register_t *, int, u_long, caddr_t)); static int _i_rele_rsvd __P((struct file *, struct thread *, register_t *, int, u_long, caddr_t)); /* i_str sockmod calls */ static int sockmod __P((struct file *, int, struct svr4_strioctl *, struct thread *)); static int si_listen __P((struct file *, int, struct svr4_strioctl *, struct thread *)); static int si_ogetudata __P((struct file *, int, struct svr4_strioctl *, struct thread *)); static int si_sockparams __P((struct file *, int, struct svr4_strioctl *, struct thread *)); static int si_shutdown __P((struct file *, int, struct svr4_strioctl *, struct thread *)); static int si_getudata __P((struct file *, int, struct svr4_strioctl *, struct thread *)); /* i_str timod calls */ static int timod __P((struct file *, int, struct svr4_strioctl *, struct thread *)); static int ti_getinfo __P((struct file *, int, struct svr4_strioctl *, struct thread *)); static int ti_bind __P((struct file *, int, struct svr4_strioctl *, struct thread *)); /* infrastructure */ static int svr4_sendit __P((struct thread *td, int s, struct msghdr *mp, int flags)); static int svr4_recvit __P((struct thread *td, int s, struct msghdr *mp, caddr_t namelenp)); /* Ok, so we shouldn't use sendit() in uipc_syscalls.c because * it isn't part of a "public" interface; We're supposed to use * pru_sosend instead. Same goes for recvit()/pru_soreceive() for * that matter. Solution: Suck sendit()/recvit() into here where we * can do what we like. * * I hate code duplication. * * I will take out all the #ifdef COMPAT_OLDSOCK gumph, though. 
*/ static int svr4_sendit(td, s, mp, flags) register struct thread *td; int s; register struct msghdr *mp; int flags; { struct uio auio; register struct iovec *iov; register int i; struct mbuf *control; struct sockaddr *to; int len, error; struct socket *so; #ifdef KTRACE struct iovec *ktriov = NULL; struct uio ktruio; #endif if ((error = fgetsock(td, s, &so, NULL)) != 0) return (error); auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { if ((auio.uio_resid += iov->iov_len) < 0) { error = EINVAL; goto done1; } } if (mp->msg_name) { error = getsockaddr(&to, mp->msg_name, mp->msg_namelen); if (error) goto done1; } else { to = 0; } if (mp->msg_control) { if (mp->msg_controllen < sizeof(struct cmsghdr)) { error = EINVAL; goto bad; } error = sockargs(&control, mp->msg_control, mp->msg_controllen, MT_CONTROL); if (error) goto bad; } else { control = 0; } #ifdef KTRACE if (KTRPOINT(td->td_proc, KTR_GENIO)) { int iovlen = auio.uio_iovcnt * sizeof (struct iovec); MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); ktruio = auio; } #endif len = auio.uio_resid; error = so->so_proto->pr_usrreqs->pru_sosend(so, to, &auio, 0, control, flags, td); if (error) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; if (error == EPIPE) { PROC_LOCK(td->td_proc); psignal(td->td_proc, SIGPIPE); PROC_UNLOCK(td->td_proc); } } if (error == 0) td->td_retval[0] = len - auio.uio_resid; #ifdef KTRACE if (ktriov != NULL) { if (error == 0) { ktruio.uio_iov = ktriov; ktruio.uio_resid = td->td_retval[0]; ktrgenio(td->td_proc->p_tracep, s, UIO_WRITE, &ktruio, error); } FREE(ktriov, M_TEMP); } #endif bad: if (to) FREE(to, M_SONAME); done1: + fdrop(fp, td); fputsock(so); return (error); } 
static int svr4_recvit(td, s, mp, namelenp) register struct thread *td; int s; register struct msghdr *mp; caddr_t namelenp; { struct uio auio; register struct iovec *iov; register int i; int len, error; struct mbuf *m, *control = 0; caddr_t ctlbuf; struct socket *so; struct sockaddr *fromsa = 0; #ifdef KTRACE struct iovec *ktriov = NULL; struct uio ktruio; #endif if ((error = fgetsock(td, s, &so, NULL)) != 0) return (error); auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { if ((auio.uio_resid += iov->iov_len) < 0) { error = EINVAL; goto done1; } } #ifdef KTRACE if (KTRPOINT(td->td_proc, KTR_GENIO)) { int iovlen = auio.uio_iovcnt * sizeof (struct iovec); MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); ktruio = auio; } #endif len = auio.uio_resid; error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, &auio, (struct mbuf **)0, mp->msg_control ? 
&control : (struct mbuf **)0, &mp->msg_flags); if (error) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } #ifdef KTRACE if (ktriov != NULL) { if (error == 0) { ktruio.uio_iov = ktriov; ktruio.uio_resid = len - auio.uio_resid; ktrgenio(td->td_proc->p_tracep, s, UIO_READ, &ktruio, error); } FREE(ktriov, M_TEMP); } #endif if (error) goto out; td->td_retval[0] = len - auio.uio_resid; if (mp->msg_name) { len = mp->msg_namelen; if (len <= 0 || fromsa == 0) len = 0; else { #ifndef MIN #define MIN(a,b) ((a)>(b)?(b):(a)) #endif /* save sa_len before it is destroyed by MSG_COMPAT */ len = MIN(len, fromsa->sa_len); error = copyout(fromsa, (caddr_t)mp->msg_name, (unsigned)len); if (error) goto out; } mp->msg_namelen = len; if (namelenp && (error = copyout((caddr_t)&len, namelenp, sizeof (int)))) { goto out; } } if (mp->msg_control) { len = mp->msg_controllen; m = control; mp->msg_controllen = 0; ctlbuf = (caddr_t) mp->msg_control; while (m && len > 0) { unsigned int tocopy; if (len >= m->m_len) tocopy = m->m_len; else { mp->msg_flags |= MSG_CTRUNC; tocopy = len; } if ((error = copyout((caddr_t)mtod(m, caddr_t), ctlbuf, tocopy)) != 0) goto out; ctlbuf += tocopy; len -= tocopy; m = m->m_next; } mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control; } out: if (fromsa) FREE(fromsa, M_SONAME); if (control) m_freem(control); done1: + fdrop(fp, td); fputsock(so); return (error); } #ifdef DEBUG_SVR4 static void bufprint __P((u_char *, size_t)); static int show_ioc __P((const char *, struct svr4_strioctl *)); static int show_strbuf __P((struct svr4_strbuf *)); static void show_msg __P((const char *, int, struct svr4_strbuf *, struct svr4_strbuf *, int)); static void bufprint(buf, len) u_char *buf; size_t len; { size_t i; uprintf("\n\t"); for (i = 0; i < len; i++) { uprintf("%x ", buf[i]); if (i && (i % 16) == 0) uprintf("\n\t"); } } static int show_ioc(str, ioc) const char *str; struct svr4_strioctl *ioc; { u_char *ptr = 
(u_char *) malloc(ioc->len, M_TEMP, M_WAITOK); int error; uprintf("%s cmd = %ld, timeout = %d, len = %d, buf = %p { ", str, ioc->cmd, ioc->timeout, ioc->len, ioc->buf); if ((error = copyin(ioc->buf, ptr, ioc->len)) != 0) { free((char *) ptr, M_TEMP); return error; } bufprint(ptr, ioc->len); uprintf("}\n"); free((char *) ptr, M_TEMP); return 0; } static int show_strbuf(str) struct svr4_strbuf *str; { int error; u_char *ptr = NULL; int maxlen = str->maxlen; int len = str->len; if (maxlen < 0) maxlen = 0; if (len >= maxlen) len = maxlen; if (len > 0) { ptr = (u_char *) malloc(len, M_TEMP, M_WAITOK); if ((error = copyin(str->buf, ptr, len)) != 0) { free((char *) ptr, M_TEMP); return error; } } uprintf(", { %d, %d, %p=[ ", str->maxlen, str->len, str->buf); if (ptr) bufprint(ptr, len); uprintf("]}"); if (ptr) free((char *) ptr, M_TEMP); return 0; } static void show_msg(str, fd, ctl, dat, flags) const char *str; int fd; struct svr4_strbuf *ctl; struct svr4_strbuf *dat; int flags; { struct svr4_strbuf buf; int error; uprintf("%s(%d", str, fd); if (ctl != NULL) { if ((error = copyin(ctl, &buf, sizeof(buf))) != 0) return; show_strbuf(&buf); } else uprintf(", NULL"); if (dat != NULL) { if ((error = copyin(dat, &buf, sizeof(buf))) != 0) return; show_strbuf(&buf); } else uprintf(", NULL"); uprintf(", %x);\n", flags); } #endif /* DEBUG_SVR4 */ /* * We are faced with an interesting situation. On svr4 unix sockets * are really pipes. But we really have sockets, and we might as * well use them. At the point where svr4 calls TI_BIND, it has * already created a named pipe for the socket using mknod(2). * We need to create a socket with the same name when we bind, * so we need to remove the pipe before, otherwise we'll get address * already in use. So we *carefully* remove the pipe, to avoid * using this as a random file removal tool. We use system calls * to avoid code duplication. 
*/ static int clean_pipe(td, path) struct thread *td; const char *path; { struct lstat_args la; struct unlink_args ua; struct stat st; int error; caddr_t sg = stackgap_init(); size_t l = strlen(path) + 1; void *tpath; tpath = stackgap_alloc(&sg, l); SCARG(&la, ub) = stackgap_alloc(&sg, sizeof(struct stat)); if ((error = copyout(path, tpath, l)) != 0) return error; SCARG(&la, path) = tpath; if ((error = lstat(td, &la)) != 0) return 0; if ((error = copyin(SCARG(&la, ub), &st, sizeof(st))) != 0) return 0; /* * Make sure we are dealing with a mode 0 named pipe. */ if ((st.st_mode & S_IFMT) != S_IFIFO) return 0; if ((st.st_mode & ALLPERMS) != 0) return 0; SCARG(&ua, path) = SCARG(&la, path); if ((error = unlink(td, &ua)) != 0) { DPRINTF(("clean_pipe: unlink failed %d\n", error)); return error; } return 0; } static void sockaddr_to_netaddr_in(sc, sain) struct svr4_strmcmd *sc; const struct sockaddr_in *sain; { struct svr4_netaddr_in *na; na = SVR4_ADDROF(sc); na->family = sain->sin_family; na->port = sain->sin_port; na->addr = sain->sin_addr.s_addr; DPRINTF(("sockaddr_in -> netaddr %d %d %lx\n", na->family, na->port, na->addr)); } static void sockaddr_to_netaddr_un(sc, saun) struct svr4_strmcmd *sc; const struct sockaddr_un *saun; { struct svr4_netaddr_un *na; char *dst, *edst = ((char *) sc) + sc->offs + sizeof(na->family) + 1 - sizeof(*sc); const char *src; na = SVR4_ADDROF(sc); na->family = saun->sun_family; for (src = saun->sun_path, dst = na->path; (*dst++ = *src++) != '\0'; ) if (dst == edst) break; DPRINTF(("sockaddr_un -> netaddr %d %s\n", na->family, na->path)); } static void netaddr_to_sockaddr_in(sain, sc) struct sockaddr_in *sain; const struct svr4_strmcmd *sc; { const struct svr4_netaddr_in *na; na = SVR4_C_ADDROF(sc); memset(sain, 0, sizeof(*sain)); sain->sin_len = sizeof(*sain); sain->sin_family = na->family; sain->sin_port = na->port; sain->sin_addr.s_addr = na->addr; DPRINTF(("netaddr -> sockaddr_in %d %d %x\n", sain->sin_family, sain->sin_port, 
sain->sin_addr.s_addr)); } static void netaddr_to_sockaddr_un(saun, sc) struct sockaddr_un *saun; const struct svr4_strmcmd *sc; { const struct svr4_netaddr_un *na; char *dst, *edst = &saun->sun_path[sizeof(saun->sun_path) - 1]; const char *src; na = SVR4_C_ADDROF(sc); memset(saun, 0, sizeof(*saun)); saun->sun_family = na->family; for (src = na->path, dst = saun->sun_path; (*dst++ = *src++) != '\0'; ) if (dst == edst) break; saun->sun_len = dst - saun->sun_path; DPRINTF(("netaddr -> sockaddr_un %d %s\n", saun->sun_family, saun->sun_path)); } static void getparm(fp, pa) struct file *fp; struct svr4_si_sockparms *pa; { - struct svr4_strm *st = svr4_stream_get(fp); - struct socket *so = (struct socket *) fp->f_data; + struct svr4_strm *st; + struct socket *so; + st = svr4_stream_get(fp); if (st == NULL) return; + so = (struct socket *) fp->f_data; + pa->family = st->s_family; switch (so->so_type) { case SOCK_DGRAM: pa->type = SVR4_T_CLTS; pa->protocol = IPPROTO_UDP; DPRINTF(("getparm(dgram)\n")); return; case SOCK_STREAM: pa->type = SVR4_T_COTS; /* What about T_COTS_ORD? 
XXX */ pa->protocol = IPPROTO_IP; DPRINTF(("getparm(stream)\n")); return; case SOCK_RAW: pa->type = SVR4_T_CLTS; pa->protocol = IPPROTO_RAW; DPRINTF(("getparm(raw)\n")); return; default: pa->type = 0; pa->protocol = 0; DPRINTF(("getparm(type %d?)\n", so->so_type)); return; } } static int si_ogetudata(fp, fd, ioc, td) struct file *fp; int fd; struct svr4_strioctl *ioc; struct thread *td; { int error; struct svr4_si_oudata ud; struct svr4_si_sockparms pa; if (ioc->len != sizeof(ud) && ioc->len != sizeof(ud) - sizeof(int)) { DPRINTF(("SI_OGETUDATA: Wrong size %d != %d\n", sizeof(ud), ioc->len)); return EINVAL; } if ((error = copyin(ioc->buf, &ud, sizeof(ud))) != 0) return error; getparm(fp, &pa); switch (pa.family) { case AF_INET: ud.tidusize = 16384; ud.addrsize = sizeof(struct svr4_sockaddr_in); if (pa.type == SVR4_SOCK_STREAM) ud.etsdusize = 1; else ud.etsdusize = 0; break; case AF_LOCAL: ud.tidusize = 65536; ud.addrsize = 128; ud.etsdusize = 128; break; default: DPRINTF(("SI_OGETUDATA: Unsupported address family %d\n", pa.family)); return ENOSYS; } /* I have no idea what these should be! */ ud.optsize = 128; ud.tsdusize = 128; ud.servtype = pa.type; /* XXX: Fixme */ ud.so_state = 0; ud.so_options = 0; return copyout(&ud, ioc->buf, ioc->len); } static int si_sockparams(fp, fd, ioc, td) struct file *fp; int fd; struct svr4_strioctl *ioc; struct thread *td; { struct svr4_si_sockparms pa; getparm(fp, &pa); return copyout(&pa, ioc->buf, sizeof(pa)); } static int si_listen(fp, fd, ioc, td) struct file *fp; int fd; struct svr4_strioctl *ioc; struct thread *td; { int error; struct svr4_strm *st = svr4_stream_get(fp); struct svr4_strmcmd lst; struct listen_args la; if (st == NULL) return EINVAL; if ((error = copyin(ioc->buf, &lst, ioc->len)) != 0) return error; if (lst.cmd != SVR4_TI_OLD_BIND_REQUEST) { DPRINTF(("si_listen: bad request %ld\n", lst.cmd)); return EINVAL; } /* * We are making assumptions again... 
*/ SCARG(&la, s) = fd; DPRINTF(("SI_LISTEN: fileno %d backlog = %d\n", fd, 5)); SCARG(&la, backlog) = 5; if ((error = listen(td, &la)) != 0) { DPRINTF(("SI_LISTEN: listen failed %d\n", error)); return error; } st->s_cmd = SVR4_TI__ACCEPT_WAIT; lst.cmd = SVR4_TI_BIND_REPLY; switch (st->s_family) { case AF_INET: /* XXX: Fill the length here */ break; case AF_LOCAL: lst.len = 140; lst.pad[28] = 0x00000000; /* magic again */ lst.pad[29] = 0x00000800; /* magic again */ lst.pad[30] = 0x80001400; /* magic again */ break; default: DPRINTF(("SI_LISTEN: Unsupported address family %d\n", st->s_family)); return ENOSYS; } if ((error = copyout(&lst, ioc->buf, ioc->len)) != 0) return error; return 0; } static int si_getudata(fp, fd, ioc, td) struct file *fp; int fd; struct svr4_strioctl *ioc; struct thread *td; { int error; struct svr4_si_udata ud; if (sizeof(ud) != ioc->len) { DPRINTF(("SI_GETUDATA: Wrong size %d != %d\n", sizeof(ud), ioc->len)); return EINVAL; } if ((error = copyin(ioc->buf, &ud, sizeof(ud))) != 0) return error; getparm(fp, &ud.sockparms); switch (ud.sockparms.family) { case AF_INET: DPRINTF(("getudata_inet\n")); ud.tidusize = 16384; ud.tsdusize = 16384; ud.addrsize = sizeof(struct svr4_sockaddr_in); if (ud.sockparms.type == SVR4_SOCK_STREAM) ud.etsdusize = 1; else ud.etsdusize = 0; ud.optsize = 0; break; case AF_LOCAL: DPRINTF(("getudata_local\n")); ud.tidusize = 65536; ud.tsdusize = 128; ud.addrsize = 128; ud.etsdusize = 128; ud.optsize = 128; break; default: DPRINTF(("SI_GETUDATA: Unsupported address family %d\n", ud.sockparms.family)); return ENOSYS; } ud.servtype = ud.sockparms.type; DPRINTF(("ud.servtype = %d\n", ud.servtype)); /* XXX: Fixme */ ud.so_state = 0; ud.so_options = 0; return copyout(&ud, ioc->buf, sizeof(ud)); } static int si_shutdown(fp, fd, ioc, td) struct file *fp; int fd; struct svr4_strioctl *ioc; struct thread *td; { int error; struct shutdown_args ap; if (ioc->len != sizeof(SCARG(&ap, how))) { DPRINTF(("SI_SHUTDOWN: Wrong size %d != 
%d\n", sizeof(SCARG(&ap, how)), ioc->len)); return EINVAL; } if ((error = copyin(ioc->buf, &SCARG(&ap, how), ioc->len)) != 0) return error; SCARG(&ap, s) = fd; return shutdown(td, &ap); } static int sockmod(fp, fd, ioc, td) struct file *fp; int fd; struct svr4_strioctl *ioc; struct thread *td; { switch (ioc->cmd) { case SVR4_SI_OGETUDATA: DPRINTF(("SI_OGETUDATA\n")); return si_ogetudata(fp, fd, ioc, td); case SVR4_SI_SHUTDOWN: DPRINTF(("SI_SHUTDOWN\n")); return si_shutdown(fp, fd, ioc, td); case SVR4_SI_LISTEN: DPRINTF(("SI_LISTEN\n")); return si_listen(fp, fd, ioc, td); case SVR4_SI_SETMYNAME: DPRINTF(("SI_SETMYNAME\n")); return 0; case SVR4_SI_SETPEERNAME: DPRINTF(("SI_SETPEERNAME\n")); return 0; case SVR4_SI_GETINTRANSIT: DPRINTF(("SI_GETINTRANSIT\n")); return 0; case SVR4_SI_TCL_LINK: DPRINTF(("SI_TCL_LINK\n")); return 0; case SVR4_SI_TCL_UNLINK: DPRINTF(("SI_TCL_UNLINK\n")); return 0; case SVR4_SI_SOCKPARAMS: DPRINTF(("SI_SOCKPARAMS\n")); return si_sockparams(fp, fd, ioc, td); case SVR4_SI_GETUDATA: DPRINTF(("SI_GETUDATA\n")); return si_getudata(fp, fd, ioc, td); default: DPRINTF(("Unknown sockmod ioctl %lx\n", ioc->cmd)); return 0; } } static int ti_getinfo(fp, fd, ioc, td) struct file *fp; int fd; struct svr4_strioctl *ioc; struct thread *td; { int error; struct svr4_infocmd info; memset(&info, 0, sizeof(info)); if ((error = copyin(ioc->buf, &info, ioc->len)) != 0) return error; if (info.cmd != SVR4_TI_INFO_REQUEST) return EINVAL; info.cmd = SVR4_TI_INFO_REPLY; info.tsdu = 0; info.etsdu = 1; info.cdata = -2; info.ddata = -2; info.addr = 16; info.opt = -1; info.tidu = 16384; info.serv = 2; info.current = 0; info.provider = 2; ioc->len = sizeof(info); if ((error = copyout(&info, ioc->buf, ioc->len)) != 0) return error; return 0; } static int ti_bind(fp, fd, ioc, td) struct file *fp; int fd; struct svr4_strioctl *ioc; struct thread *td; { int error; struct svr4_strm *st = svr4_stream_get(fp); struct sockaddr_in sain; struct sockaddr_un saun; caddr_t sg; void 
*skp, *sup = NULL; int sasize; struct svr4_strmcmd bnd; struct bind_args ba; if (st == NULL) { DPRINTF(("ti_bind: bad file descriptor\n")); return EINVAL; } if ((error = copyin(ioc->buf, &bnd, ioc->len)) != 0) return error; if (bnd.cmd != SVR4_TI_OLD_BIND_REQUEST) { DPRINTF(("ti_bind: bad request %ld\n", bnd.cmd)); return EINVAL; } switch (st->s_family) { case AF_INET: skp = &sain; sasize = sizeof(sain); if (bnd.offs == 0) goto reply; netaddr_to_sockaddr_in(&sain, &bnd); DPRINTF(("TI_BIND: fam %d, port %d, addr %x\n", sain.sin_family, sain.sin_port, sain.sin_addr.s_addr)); break; case AF_LOCAL: skp = &saun; sasize = sizeof(saun); if (bnd.offs == 0) goto reply; netaddr_to_sockaddr_un(&saun, &bnd); if (saun.sun_path[0] == '\0') goto reply; DPRINTF(("TI_BIND: fam %d, path %s\n", saun.sun_family, saun.sun_path)); if ((error = clean_pipe(td, saun.sun_path)) != 0) return error; bnd.pad[28] = 0x00001000; /* magic again */ break; default: DPRINTF(("TI_BIND: Unsupported address family %d\n", st->s_family)); return ENOSYS; } sg = stackgap_init(); sup = stackgap_alloc(&sg, sasize); if ((error = copyout(skp, sup, sasize)) != 0) return error; SCARG(&ba, s) = fd; DPRINTF(("TI_BIND: fileno %d\n", fd)); SCARG(&ba, name) = (void *) sup; SCARG(&ba, namelen) = sasize; if ((error = bind(td, &ba)) != 0) { DPRINTF(("TI_BIND: bind failed %d\n", error)); return error; } reply: if (sup == NULL) { memset(&bnd, 0, sizeof(bnd)); bnd.len = sasize + 4; bnd.offs = 0x10; /* XXX */ } bnd.cmd = SVR4_TI_BIND_REPLY; if ((error = copyout(&bnd, ioc->buf, ioc->len)) != 0) return error; return 0; } static int timod(fp, fd, ioc, td) struct file *fp; int fd; struct svr4_strioctl *ioc; struct thread *td; { switch (ioc->cmd) { case SVR4_TI_GETINFO: DPRINTF(("TI_GETINFO\n")); return ti_getinfo(fp, fd, ioc, td); case SVR4_TI_OPTMGMT: DPRINTF(("TI_OPTMGMT\n")); return 0; case SVR4_TI_BIND: DPRINTF(("TI_BIND\n")); return ti_bind(fp, fd, ioc, td); case SVR4_TI_UNBIND: DPRINTF(("TI_UNBIND\n")); return 0; default: 
DPRINTF(("Unknown timod ioctl %lx\n", ioc->cmd)); return 0; } } int svr4_stream_ti_ioctl(fp, td, retval, fd, cmd, dat) struct file *fp; struct thread *td; register_t *retval; int fd; u_long cmd; caddr_t dat; { struct svr4_strbuf skb, *sub = (struct svr4_strbuf *) dat; struct svr4_strm *st = svr4_stream_get(fp); int error; void *skp, *sup; struct sockaddr_in sain; struct sockaddr_un saun; struct svr4_strmcmd sc; int sasize; caddr_t sg; int *lenp; DPRINTF(("svr4_stream_ti_ioctl\n")); if (st == NULL) return EINVAL; sc.offs = 0x10; if ((error = copyin(sub, &skb, sizeof(skb))) != 0) { DPRINTF(("ti_ioctl: error copying in strbuf\n")); return error; } switch (st->s_family) { case AF_INET: skp = &sain; sasize = sizeof(sain); break; case AF_LOCAL: skp = &saun; sasize = sizeof(saun); break; default: DPRINTF(("ti_ioctl: Unsupported address family %d\n", st->s_family)); return ENOSYS; } sg = stackgap_init(); sup = stackgap_alloc(&sg, sasize); lenp = stackgap_alloc(&sg, sizeof(*lenp)); if ((error = copyout(&sasize, lenp, sizeof(*lenp))) != 0) { DPRINTF(("ti_ioctl: error copying out lenp\n")); return error; } switch (cmd) { case SVR4_TI_GETMYNAME: DPRINTF(("TI_GETMYNAME\n")); { struct getsockname_args ap; SCARG(&ap, fdes) = fd; SCARG(&ap, asa) = sup; SCARG(&ap, alen) = lenp; if ((error = getsockname(td, &ap)) != 0) { DPRINTF(("ti_ioctl: getsockname error\n")); return error; } } break; case SVR4_TI_GETPEERNAME: DPRINTF(("TI_GETPEERNAME\n")); { struct getpeername_args ap; SCARG(&ap, fdes) = fd; SCARG(&ap, asa) = sup; SCARG(&ap, alen) = lenp; if ((error = getpeername(td, &ap)) != 0) { DPRINTF(("ti_ioctl: getpeername error\n")); return error; } } break; case SVR4_TI_SETMYNAME: DPRINTF(("TI_SETMYNAME\n")); return 0; case SVR4_TI_SETPEERNAME: DPRINTF(("TI_SETPEERNAME\n")); return 0; default: DPRINTF(("ti_ioctl: Unknown ioctl %lx\n", cmd)); return ENOSYS; } if ((error = copyin(sup, skp, sasize)) != 0) { DPRINTF(("ti_ioctl: error copying in socket data\n")); return error; } if ((error = 
copyin(lenp, &sasize, sizeof(*lenp))) != 0) { DPRINTF(("ti_ioctl: error copying in socket size\n")); return error; } switch (st->s_family) { case AF_INET: sockaddr_to_netaddr_in(&sc, &sain); skb.len = sasize; break; case AF_LOCAL: sockaddr_to_netaddr_un(&sc, &saun); skb.len = sasize + 4; break; default: return ENOSYS; } if ((error = copyout(SVR4_ADDROF(&sc), skb.buf, sasize)) != 0) { DPRINTF(("ti_ioctl: error copying out socket data\n")); return error; } if ((error = copyout(&skb, sub, sizeof(skb))) != 0) { DPRINTF(("ti_ioctl: error copying out strbuf\n")); return error; } return error; } static int i_nread(fp, td, retval, fd, cmd, dat) struct file *fp; struct thread *td; register_t *retval; int fd; u_long cmd; caddr_t dat; { int error; int nread = 0; /* * We are supposed to return the message length in nread, and the * number of messages in retval. We don't have the notion of number * of stream messages, so we just find out if we have any bytes waiting * for us, and if we do, then we assume that we have at least one * message waiting for us. */ if ((error = fo_ioctl(fp, FIONREAD, (caddr_t) &nread, td)) != 0) return error; if (nread != 0) *retval = 1; else *retval = 0; return copyout(&nread, dat, sizeof(nread)); } static int i_fdinsert(fp, td, retval, fd, cmd, dat) struct file *fp; struct thread *td; register_t *retval; int fd; u_long cmd; caddr_t dat; { /* * Major hack again here. We assume that we are using this to * implement accept(2). If that is the case, we have already * called accept, and we have stored the file descriptor in * afd. We find the file descriptor that the code wants to use * in fd insert, and then we dup2() our accepted file descriptor * to it. 
*/
	int error;
	struct svr4_strm *st = svr4_stream_get(fp);
	struct svr4_strfdinsert fdi;
	struct dup2_args d2p;
	struct close_args clp;

	if (st == NULL) {
		DPRINTF(("fdinsert: bad file type\n"));
		return EINVAL;
	}

	/* No accepted descriptor was stashed by a prior getmsg/accept. */
	if (st->s_afd == -1) {
		DPRINTF(("fdinsert: accept fd not found\n"));
		return ENOENT;
	}

	if ((error = copyin(dat, &fdi, sizeof(fdi))) != 0) {
		DPRINTF(("fdinsert: copyin failed %d\n", error));
		return error;
	}

	/* Move the stashed accept fd onto the descriptor the caller chose. */
	SCARG(&d2p, from) = st->s_afd;
	SCARG(&d2p, to) = fdi.fd;

	if ((error = dup2(td, &d2p)) != 0) {
		DPRINTF(("fdinsert: dup2(%d, %d) failed %d\n",
		    st->s_afd, fdi.fd, error));
		return error;
	}

	/* Close our copy; the caller now owns the descriptor. */
	SCARG(&clp, fd) = st->s_afd;

	if ((error = close(td, &clp)) != 0) {
		DPRINTF(("fdinsert: close(%d) failed %d\n",
		    st->s_afd, error));
		return error;
	}

	st->s_afd = -1;

	*retval = 0;
	return 0;
}

/*
 * Emulate the kernel/library-only _I_BIND_RSVD ioctl by creating a
 * FIFO at the pathname passed in `dat'.
 */
static int
_i_bind_rsvd(fp, td, retval, fd, cmd, dat)
	struct file *fp;
	struct thread *td;
	register_t *retval;
	int fd;
	u_long cmd;
	caddr_t dat;
{
	struct mkfifo_args ap;

	/*
	 * This is a supposed to be a kernel and library only ioctl.
	 * It gets called before ti_bind, when we have a unix
	 * socket, to physically create the socket transport and
	 * ``reserve'' it. I don't know how this get reserved inside
	 * the kernel, but we are going to create it nevertheless.
	 */
	SCARG(&ap, path) = dat;
	SCARG(&ap, mode) = S_IFIFO;

	return mkfifo(td, &ap);
}

/*
 * Counterpart of _i_bind_rsvd: remove the reserved transport node.
 */
static int
_i_rele_rsvd(fp, td, retval, fd, cmd, dat)
	struct file *fp;
	struct thread *td;
	register_t *retval;
	int fd;
	u_long cmd;
	caddr_t dat;
{
	struct unlink_args ap;

	/*
	 * This is a supposed to be a kernel and library only ioctl.
	 * I guess it is supposed to release the socket.
*/
	SCARG(&ap, path) = dat;

	return unlink(td, &ap);
}

/*
 * Emulate the STREAMS I_STR ioctl: copy in the svr4_strioctl request,
 * dispatch it to the sockmod or timod module emulation based on the
 * module id in the high byte of the command, and copy the (possibly
 * updated) request back out.
 */
static int
i_str(fp, td, retval, fd, cmd, dat)
	struct file *fp;
	struct thread *td;
	register_t *retval;
	int fd;
	u_long cmd;
	caddr_t dat;
{
	int error;
	struct svr4_strioctl ioc;

	if ((error = copyin(dat, &ioc, sizeof(ioc))) != 0)
		return error;

#ifdef DEBUG_SVR4
	if ((error = show_ioc(">", &ioc)) != 0)
		return error;
#endif /* DEBUG_SVR4 */

	/* High byte of the embedded command selects the module. */
	switch (ioc.cmd & 0xff00) {
	case SVR4_SIMOD:
		if ((error = sockmod(fp, fd, &ioc, td)) != 0)
			return error;
		break;

	case SVR4_TIMOD:
		if ((error = timod(fp, fd, &ioc, td)) != 0)
			return error;
		break;

	default:
		DPRINTF(("Unimplemented module %c %ld\n",
		    (char) (cmd >> 8), cmd & 0xff));
		return 0;
	}

#ifdef DEBUG_SVR4
	if ((error = show_ioc("<", &ioc)) != 0)
		return error;
#endif /* DEBUG_SVR4 */

	return copyout(&ioc, dat, sizeof(ioc));
}

static int
i_setsig(fp, td, retval, fd, cmd, dat)
	struct file *fp;
	struct thread *td;
	register_t *retval;
	int fd;
	u_long cmd;
	caddr_t dat;
{
	/*
	 * This is the best we can do for now; we cannot generate
	 * signals only for specific events so the signal mask gets
	 * ignored; we save it just to pass it to a possible I_GETSIG...
	 *
	 * We also have to fix the O_ASYNC fcntl bit, so the
	 * process will get SIGPOLLs.
*/
	struct fcntl_args fa;
	int error;
	register_t oflags, flags;
	struct svr4_strm *st = svr4_stream_get(fp);

	if (st == NULL) {
		DPRINTF(("i_setsig: bad file descriptor\n"));
		return EINVAL;
	}

	/* get old status flags */
	SCARG(&fa, fd) = fd;
	SCARG(&fa, cmd) = F_GETFL;

	if ((error = fcntl(td, &fa)) != 0)
		return error;

	oflags = td->td_retval[0];

	/* update the flags */
	if (dat != NULL) {
		int mask;

		/* Caller wants events: turn async I/O on. */
		flags = oflags | O_ASYNC;

		if ((error = copyin(dat, &mask, sizeof(mask))) != 0) {
			DPRINTF(("i_setsig: bad eventmask pointer\n"));
			return error;
		}

		/* Reject bits outside the supported event set. */
		if (mask & SVR4_S_ALLMASK) {
			DPRINTF(("i_setsig: bad eventmask data %x\n", mask));
			return EINVAL;
		}

		/* Remember the mask only so I_GETSIG can return it. */
		st->s_eventmask = mask;
	} else {
		/* NULL mask disables signal delivery. */
		flags = oflags & ~O_ASYNC;
		st->s_eventmask = 0;
	}

	/* set the new flags, if changed */
	if (flags != oflags) {
		SCARG(&fa, cmd) = F_SETFL;
		SCARG(&fa, arg) = (long) flags;

		if ((error = fcntl(td, &fa)) != 0)
			return error;

		flags = td->td_retval[0];
	}

	/* set up SIGIO receiver if needed */
	if (dat != NULL) {
		SCARG(&fa, cmd) = F_SETOWN;
		SCARG(&fa, arg) = (long) td->td_proc->p_pid;

		return fcntl(td, &fa);
	}
	return 0;
}

/*
 * Emulate the STREAMS I_GETSIG ioctl: hand back the event mask that a
 * previous I_SETSIG stashed on the stream (or do nothing for NULL).
 */
static int
i_getsig(fp, td, retval, fd, cmd, dat)
	struct file *fp;
	struct thread *td;
	register_t *retval;
	int fd;
	u_long cmd;
	caddr_t dat;
{
	int error;

	if (dat != NULL) {
		struct svr4_strm *st = svr4_stream_get(fp);

		if (st == NULL) {
			DPRINTF(("i_getsig: bad file descriptor\n"));
			return EINVAL;
		}

		if ((error = copyout(&st->s_eventmask, dat,
		    sizeof(st->s_eventmask))) != 0) {
			DPRINTF(("i_getsig: bad eventmask pointer\n"));
			return error;
		}
	}
	return 0;
}

/*
 * Top-level STREAMS ioctl dispatcher for emulated streams; most of
 * the commands are accepted and quietly ignored.
 */
int
svr4_stream_ioctl(fp, td, retval, fd, cmd, dat)
	struct file *fp;
	struct thread *td;
	register_t *retval;
	int fd;
	u_long cmd;
	caddr_t dat;
{
	*retval = 0;

	/*
	 * All the following stuff assumes "sockmod" is pushed...
*/
	switch (cmd) {
	case SVR4_I_NREAD:
		DPRINTF(("I_NREAD\n"));
		return i_nread(fp, td, retval, fd, cmd, dat);

	case SVR4_I_PUSH:
		DPRINTF(("I_PUSH %p\n", dat));
#if defined(DEBUG_SVR4)
		show_strbuf((struct svr4_strbuf *)dat);
#endif
		return 0;

	case SVR4_I_POP:
		DPRINTF(("I_POP\n"));
		return 0;

	case SVR4_I_LOOK:
		DPRINTF(("I_LOOK\n"));
		return 0;

	case SVR4_I_FLUSH:
		DPRINTF(("I_FLUSH\n"));
		return 0;

	case SVR4_I_SRDOPT:
		DPRINTF(("I_SRDOPT\n"));
		return 0;

	case SVR4_I_GRDOPT:
		DPRINTF(("I_GRDOPT\n"));
		return 0;

	case SVR4_I_STR:
		DPRINTF(("I_STR\n"));
		return i_str(fp, td, retval, fd, cmd, dat);

	case SVR4_I_SETSIG:
		DPRINTF(("I_SETSIG\n"));
		return i_setsig(fp, td, retval, fd, cmd, dat);

	case SVR4_I_GETSIG:
	        DPRINTF(("I_GETSIG\n"));
	        return i_getsig(fp, td, retval, fd, cmd, dat);

	case SVR4_I_FIND:
		DPRINTF(("I_FIND\n"));
		/*
		 * Here we are not pushing modules really, we just
		 * pretend all are present
		 */
		*retval = 0;
		return 0;

	case SVR4_I_LINK:
		DPRINTF(("I_LINK\n"));
		return 0;

	case SVR4_I_UNLINK:
		DPRINTF(("I_UNLINK\n"));
		return 0;

	case SVR4_I_ERECVFD:
		DPRINTF(("I_ERECVFD\n"));
		return 0;

	case SVR4_I_PEEK:
		DPRINTF(("I_PEEK\n"));
		return 0;

	case SVR4_I_FDINSERT:
		DPRINTF(("I_FDINSERT\n"));
		return i_fdinsert(fp, td, retval, fd, cmd, dat);

	case SVR4_I_SENDFD:
		DPRINTF(("I_SENDFD\n"));
		return 0;

	case SVR4_I_RECVFD:
		DPRINTF(("I_RECVFD\n"));
		return 0;

	case SVR4_I_SWROPT:
		DPRINTF(("I_SWROPT\n"));
		return 0;

	case SVR4_I_GWROPT:
		DPRINTF(("I_GWROPT\n"));
		return 0;

	case SVR4_I_LIST:
		DPRINTF(("I_LIST\n"));
		return 0;

	case SVR4_I_PLINK:
		DPRINTF(("I_PLINK\n"));
		return 0;

	case SVR4_I_PUNLINK:
		DPRINTF(("I_PUNLINK\n"));
		return 0;

	case SVR4_I_SETEV:
		DPRINTF(("I_SETEV\n"));
		return 0;

	case SVR4_I_GETEV:
		DPRINTF(("I_GETEV\n"));
		return 0;

	case SVR4_I_STREV:
		DPRINTF(("I_STREV\n"));
		return 0;

	case SVR4_I_UNSTREV:
		DPRINTF(("I_UNSTREV\n"));
		return 0;

	case SVR4_I_FLUSHBAND:
		DPRINTF(("I_FLUSHBAND\n"));
		return 0;

	case SVR4_I_CKBAND:
		DPRINTF(("I_CKBAND\n"));
		return 0;

	case SVR4_I_GETBAND:
		/* XXX debug string misspells I_GETBAND as "I_GETBANK" */
		DPRINTF(("I_GETBANK\n"));
		return 0;

	case
SVR4_I_ATMARK: DPRINTF(("I_ATMARK\n")); return 0; case SVR4_I_SETCLTIME: DPRINTF(("I_SETCLTIME\n")); return 0; case SVR4_I_GETCLTIME: DPRINTF(("I_GETCLTIME\n")); return 0; case SVR4_I_CANPUT: DPRINTF(("I_CANPUT\n")); return 0; case SVR4__I_BIND_RSVD: DPRINTF(("_I_BIND_RSVD\n")); return _i_bind_rsvd(fp, td, retval, fd, cmd, dat); case SVR4__I_RELE_RSVD: DPRINTF(("_I_RELE_RSVD\n")); return _i_rele_rsvd(fp, td, retval, fd, cmd, dat); default: DPRINTF(("unimpl cmd = %lx\n", cmd)); break; } return 0; } int svr4_sys_putmsg(td, uap) register struct thread *td; struct svr4_sys_putmsg_args *uap; { - struct filedesc *fdp = td->td_proc->p_fd; + struct file *fp; + int error; + + fp = ffind_hold(td, uap->fd); + if (fp == NULL) { +#ifdef DEBUG_SVR4 + uprintf("putmsg: bad fp\n"); +#endif + return EBADF; + } + error = svr4_do_putmsg(td, uap, fp); + fdrop(fp, td); + return (error); +} + +static int +svr4_do_putmsg(td, uap, fp) + struct thread *td; + struct svr4_sys_putmsg_args *uap; struct file *fp; +{ struct svr4_strbuf dat, ctl; struct svr4_strmcmd sc; struct sockaddr_in sain; struct sockaddr_un saun; void *skp, *sup; int sasize, *retval; struct svr4_strm *st; int error; caddr_t sg; retval = td->td_retval; - fp = fdp->fd_ofiles[SCARG(uap, fd)]; - if (((u_int)SCARG(uap, fd) >= fdp->fd_nfiles) || (fp == NULL)) { #ifdef DEBUG_SVR4 - uprintf("putmsg: bad fp\n"); -#endif - return EBADF; - } - -#ifdef DEBUG_SVR4 show_msg(">putmsg", SCARG(uap, fd), SCARG(uap, ctl), SCARG(uap, dat), SCARG(uap, flags)); #endif /* DEBUG_SVR4 */ - if (((u_int)SCARG(uap, fd) >= fdp->fd_nfiles) || (fp == NULL)) { -#ifdef DEBUG_SVR4 - uprintf("putmsg: bad fp(2)\n"); -#endif - return EBADF; - } + FILE_LOCK_ASSERT(fp, MA_NOTOWNED); if (SCARG(uap, ctl) != NULL) { if ((error = copyin(SCARG(uap, ctl), &ctl, sizeof(ctl))) != 0) { #ifdef DEBUG_SVR4 uprintf("putmsg: copyin(): %d\n", error); #endif return error; } } else ctl.len = -1; if (SCARG(uap, dat) != NULL) { if ((error = copyin(SCARG(uap, dat), &dat, 
sizeof(dat))) != 0) { #ifdef DEBUG_SVR4 uprintf("putmsg: copyin(): %d (2)\n", error); #endif return error; } } else dat.len = -1; /* * Only for sockets for now. */ if ((st = svr4_stream_get(fp)) == NULL) { DPRINTF(("putmsg: bad file type\n")); return EINVAL; } if (ctl.len > sizeof(sc)) { DPRINTF(("putmsg: Bad control size %d != %d\n", ctl.len, sizeof(struct svr4_strmcmd))); return EINVAL; } if ((error = copyin(ctl.buf, &sc, ctl.len)) != 0) return error; switch (st->s_family) { case AF_INET: if (sc.len != sizeof(sain)) { if (sc.cmd == SVR4_TI_DATA_REQUEST) { struct write_args wa; /* Solaris seems to use sc.cmd = 3 to * send "expedited" data. telnet uses * this for options processing, sending EOF, * etc. I'm sure other things use it too. * I don't have any documentation * on it, so I'm making a guess that this * is how it works. newton@atdot.dotat.org XXX */ DPRINTF(("sending expedited data ??\n")); SCARG(&wa, fd) = SCARG(uap, fd); SCARG(&wa, buf) = dat.buf; SCARG(&wa, nbyte) = dat.len; return write(td, &wa); } DPRINTF(("putmsg: Invalid inet length %ld\n", sc.len)); return EINVAL; } netaddr_to_sockaddr_in(&sain, &sc); skp = &sain; sasize = sizeof(sain); error = sain.sin_family != st->s_family; break; case AF_LOCAL: if (ctl.len == 8) { /* We are doing an accept; succeed */ DPRINTF(("putmsg: Do nothing\n")); *retval = 0; return 0; } else { /* Maybe we've been given a device/inode pair */ udev_t *dev = SVR4_ADDROF(&sc); ino_t *ino = (ino_t *) &dev[1]; skp = svr4_find_socket(td, fp, *dev, *ino); if (skp == NULL) { skp = &saun; /* I guess we have it by name */ netaddr_to_sockaddr_un(skp, &sc); } sasize = sizeof(saun); } break; default: DPRINTF(("putmsg: Unsupported address family %d\n", st->s_family)); return ENOSYS; } sg = stackgap_init(); sup = stackgap_alloc(&sg, sasize); if ((error = copyout(skp, sup, sasize)) != 0) return error; switch (st->s_cmd = sc.cmd) { case SVR4_TI_CONNECT_REQUEST: /* connect */ { struct connect_args co; SCARG(&co, s) = SCARG(uap, fd); 
SCARG(&co, name) = (void *) sup; SCARG(&co, namelen) = (int) sasize; return connect(td, &co); } case SVR4_TI_SENDTO_REQUEST: /* sendto */ { struct msghdr msg; struct iovec aiov; msg.msg_name = (caddr_t) sup; msg.msg_namelen = sasize; msg.msg_iov = &aiov; msg.msg_iovlen = 1; msg.msg_control = 0; msg.msg_flags = 0; aiov.iov_base = dat.buf; aiov.iov_len = dat.len; #if 0 error = so->so_proto->pr_usrreqs->pru_sosend(so, 0, uio, 0, 0, 0, uio->uio_td); #endif error = svr4_sendit(td, SCARG(uap, fd), &msg, SCARG(uap, flags)); DPRINTF(("sendto_request error: %d\n", error)); *retval = 0; return error; } default: DPRINTF(("putmsg: Unimplemented command %lx\n", sc.cmd)); return ENOSYS; } } int +svr4_sys_getmsg(p, uap) + struct proc *p; + struct svr4_sys_getmsg_args *uap; +{ + struct file *fp; + int error; + + fp = ffind_hold(td, uap->fd); + if (fp == NULL) { +#ifdef DEBUG_SVR4 + uprintf("getmsg: bad fp\n"); +#endif + return EBADF; + } + error = svr4_do_getmsg(p, uap, fp); + fdrop(fp, td); + return (error); +} + +int svr4_sys_getmsg(td, uap) register struct thread *td; struct svr4_sys_getmsg_args *uap; + struct file *fp; { - struct filedesc *fdp = td->td_proc->p_fd; - struct file *fp; struct getpeername_args ga; struct accept_args aa; struct svr4_strbuf dat, ctl; struct svr4_strmcmd sc; int error, *retval; struct msghdr msg; struct iovec aiov; struct sockaddr_in sain; struct sockaddr_un saun; void *skp, *sup; int sasize; struct svr4_strm *st; int *flen; int fl; caddr_t sg; retval = td->td_retval; - fp = fdp->fd_ofiles[SCARG(uap, fd)]; - if (((u_int)SCARG(uap, fd) >= fdp->fd_nfiles) || (fp == NULL)) - return EBADF; + FILE_LOCK_ASSERT(fp, MA_NOTOWNED); memset(&sc, 0, sizeof(sc)); #ifdef DEBUG_SVR4 show_msg(">getmsg", SCARG(uap, fd), SCARG(uap, ctl), SCARG(uap, dat), 0); #endif /* DEBUG_SVR4 */ - - if (((u_int)SCARG(uap, fd) >= fdp->fd_nfiles) || (fp == NULL)) - return EBADF; if (SCARG(uap, ctl) != NULL) { if ((error = copyin(SCARG(uap, ctl), &ctl, sizeof(ctl))) != 0) return error; 
} else { ctl.len = -1; ctl.maxlen = 0; } if (SCARG(uap, dat) != NULL) { if ((error = copyin(SCARG(uap, dat), &dat, sizeof(dat))) != 0) return error; } else { dat.len = -1; dat.maxlen = 0; } /* * Only for sockets for now. */ if ((st = svr4_stream_get(fp)) == NULL) { DPRINTF(("getmsg: bad file type\n")); return EINVAL; } if (ctl.maxlen == -1 || dat.maxlen == -1) { DPRINTF(("getmsg: Cannot handle -1 maxlen (yet)\n")); return ENOSYS; } switch (st->s_family) { case AF_INET: skp = &sain; sasize = sizeof(sain); break; case AF_LOCAL: skp = &saun; sasize = sizeof(saun); break; default: DPRINTF(("getmsg: Unsupported address family %d\n", st->s_family)); return ENOSYS; } sg = stackgap_init(); sup = stackgap_alloc(&sg, sasize); flen = (int *) stackgap_alloc(&sg, sizeof(*flen)); fl = sasize; if ((error = copyout(&fl, flen, sizeof(fl))) != 0) return error; switch (st->s_cmd) { case SVR4_TI_CONNECT_REQUEST: DPRINTF(("getmsg: TI_CONNECT_REQUEST\n")); /* * We do the connect in one step, so the putmsg should * have gotten the error. */ sc.cmd = SVR4_TI_OK_REPLY; sc.len = 0; ctl.len = 8; dat.len = -1; fl = 1; st->s_cmd = sc.cmd; break; case SVR4_TI_OK_REPLY: DPRINTF(("getmsg: TI_OK_REPLY\n")); /* * We are immediately after a connect reply, so we send * a connect verification. 
*/ SCARG(&ga, fdes) = SCARG(uap, fd); SCARG(&ga, asa) = (void *) sup; SCARG(&ga, alen) = flen; if ((error = getpeername(td, &ga)) != 0) { DPRINTF(("getmsg: getpeername failed %d\n", error)); return error; } if ((error = copyin(sup, skp, sasize)) != 0) return error; sc.cmd = SVR4_TI_CONNECT_REPLY; sc.pad[0] = 0x4; sc.offs = 0x18; sc.pad[1] = 0x14; sc.pad[2] = 0x04000402; switch (st->s_family) { case AF_INET: sc.len = sasize; sockaddr_to_netaddr_in(&sc, &sain); break; case AF_LOCAL: sc.len = sasize + 4; sockaddr_to_netaddr_un(&sc, &saun); break; default: return ENOSYS; } ctl.len = 40; dat.len = -1; fl = 0; st->s_cmd = sc.cmd; break; case SVR4_TI__ACCEPT_OK: DPRINTF(("getmsg: TI__ACCEPT_OK\n")); /* * We do the connect in one step, so the putmsg should * have gotten the error. */ sc.cmd = SVR4_TI_OK_REPLY; sc.len = 1; ctl.len = 8; dat.len = -1; fl = 1; st->s_cmd = SVR4_TI__ACCEPT_WAIT; break; case SVR4_TI__ACCEPT_WAIT: DPRINTF(("getmsg: TI__ACCEPT_WAIT\n")); /* * We are after a listen, so we try to accept... 
*/ SCARG(&aa, s) = SCARG(uap, fd); SCARG(&aa, name) = (void *) sup; SCARG(&aa, anamelen) = flen; if ((error = accept(td, &aa)) != 0) { DPRINTF(("getmsg: accept failed %d\n", error)); return error; } st->s_afd = *retval; DPRINTF(("getmsg: Accept fd = %d\n", st->s_afd)); if ((error = copyin(sup, skp, sasize)) != 0) return error; sc.cmd = SVR4_TI_ACCEPT_REPLY; sc.offs = 0x18; sc.pad[0] = 0x0; switch (st->s_family) { case AF_INET: sc.pad[1] = 0x28; sockaddr_to_netaddr_in(&sc, &sain); ctl.len = 40; sc.len = sasize; break; case AF_LOCAL: sc.pad[1] = 0x00010000; sc.pad[2] = 0xf6bcdaa0; /* I don't know what that is */ sc.pad[3] = 0x00010000; ctl.len = 134; sc.len = sasize + 4; break; default: return ENOSYS; } dat.len = -1; fl = 0; st->s_cmd = SVR4_TI__ACCEPT_OK; break; case SVR4_TI_SENDTO_REQUEST: DPRINTF(("getmsg: TI_SENDTO_REQUEST\n")); if (ctl.maxlen > 36 && ctl.len < 36) ctl.len = 36; if ((error = copyin(ctl.buf, &sc, ctl.len)) != 0) return error; switch (st->s_family) { case AF_INET: sockaddr_to_netaddr_in(&sc, &sain); break; case AF_LOCAL: sockaddr_to_netaddr_un(&sc, &saun); break; default: return ENOSYS; } msg.msg_name = (caddr_t) sup; msg.msg_namelen = sasize; msg.msg_iov = &aiov; msg.msg_iovlen = 1; msg.msg_control = 0; aiov.iov_base = dat.buf; aiov.iov_len = dat.maxlen; msg.msg_flags = 0; error = svr4_recvit(td, SCARG(uap, fd), &msg, (caddr_t) flen); if (error) { DPRINTF(("getmsg: recvit failed %d\n", error)); return error; } if ((error = copyin(msg.msg_name, skp, sasize)) != 0) return error; sc.cmd = SVR4_TI_RECVFROM_IND; switch (st->s_family) { case AF_INET: sc.len = sasize; sockaddr_to_netaddr_in(&sc, &sain); break; case AF_LOCAL: sc.len = sasize + 4; sockaddr_to_netaddr_un(&sc, &saun); break; default: return ENOSYS; } dat.len = *retval; fl = 0; st->s_cmd = sc.cmd; break; default: st->s_cmd = sc.cmd; if (st->s_cmd == SVR4_TI_CONNECT_REQUEST) { struct read_args ra; /* More weirdness: Again, I can't find documentation * to back this up, but when a process does a 
generic * "getmsg()" call it seems that the command field is * zero and the length of the data area is zero. I * think processes expect getmsg() to fill in dat.len * after reading at most dat.maxlen octets from the * stream. Since we're using sockets I can let * read() look after it and frob return values * appropriately (or inappropriately :-) * -- newton@atdot.dotat.org XXX */ SCARG(&ra, fd) = SCARG(uap, fd); SCARG(&ra, buf) = dat.buf; SCARG(&ra, nbyte) = dat.maxlen; if ((error = read(td, &ra)) != 0) { return error; } dat.len = *retval; *retval = 0; st->s_cmd = SVR4_TI_SENDTO_REQUEST; break; } DPRINTF(("getmsg: Unknown state %x\n", st->s_cmd)); return EINVAL; } if (SCARG(uap, ctl)) { if (ctl.len != -1) if ((error = copyout(&sc, ctl.buf, ctl.len)) != 0) return error; if ((error = copyout(&ctl, SCARG(uap, ctl), sizeof(ctl))) != 0) return error; } if (SCARG(uap, dat)) { if ((error = copyout(&dat, SCARG(uap, dat), sizeof(dat))) != 0) return error; } if (SCARG(uap, flags)) { /* XXX: Need translation */ if ((error = copyout(&fl, SCARG(uap, flags), sizeof(fl))) != 0) return error; } *retval = 0; #ifdef DEBUG_SVR4 show_msg(" */ #include #include #include #include #include #include #include #if __FreeBSD_version >= 500005 #include #else #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void aac_startup(void *arg); static void aac_add_container(struct aac_softc *sc, struct aac_mntinforesponse *mir, int f); /* Command Processing */ static void aac_startio(struct aac_softc *sc); static void aac_timeout(struct aac_softc *sc); static int aac_start(struct aac_command *cm); static void aac_complete(void *context, int pending); static int aac_bio_command(struct aac_softc *sc, struct aac_command **cmp); static void aac_bio_complete(struct aac_command *cm); static int aac_wait_command(struct aac_command *cm, int timeout); static void aac_host_command(struct aac_softc 
*sc); static void aac_host_response(struct aac_softc *sc); /* Command Buffer Management */ static int aac_alloc_command(struct aac_softc *sc, struct aac_command **cmp); static void aac_release_command(struct aac_command *cm); static void aac_map_command_helper(void *arg, bus_dma_segment_t *segs, int nseg, int error); static int aac_alloc_commands(struct aac_softc *sc); static void aac_free_commands(struct aac_softc *sc); static void aac_map_command(struct aac_command *cm); static void aac_unmap_command(struct aac_command *cm); /* Hardware Interface */ static void aac_common_map(void *arg, bus_dma_segment_t *segs, int nseg, int error); static int aac_init(struct aac_softc *sc); static int aac_sync_command(struct aac_softc *sc, u_int32_t command, u_int32_t arg0, u_int32_t arg1, u_int32_t arg2, u_int32_t arg3, u_int32_t *sp); static int aac_sync_fib(struct aac_softc *sc, u_int32_t command, u_int32_t xferstate, void *data, u_int16_t datasize, void *result, u_int16_t *resultsize); static int aac_enqueue_fib(struct aac_softc *sc, int queue, struct aac_command *cm); static int aac_dequeue_fib(struct aac_softc *sc, int queue, u_int32_t *fib_size, struct aac_fib **fib_addr); static int aac_enqueue_response(struct aac_softc *sc, int queue, struct aac_fib *fib); /* Falcon/PPC interface */ static int aac_fa_get_fwstatus(struct aac_softc *sc); static void aac_fa_qnotify(struct aac_softc *sc, int qbit); static int aac_fa_get_istatus(struct aac_softc *sc); static void aac_fa_clear_istatus(struct aac_softc *sc, int mask); static void aac_fa_set_mailbox(struct aac_softc *sc, u_int32_t command, u_int32_t arg0, u_int32_t arg1, u_int32_t arg2, u_int32_t arg3); static int aac_fa_get_mailboxstatus(struct aac_softc *sc); static void aac_fa_set_interrupts(struct aac_softc *sc, int enable); struct aac_interface aac_fa_interface = { aac_fa_get_fwstatus, aac_fa_qnotify, aac_fa_get_istatus, aac_fa_clear_istatus, aac_fa_set_mailbox, aac_fa_get_mailboxstatus, aac_fa_set_interrupts }; /* 
StrongARM interface */ static int aac_sa_get_fwstatus(struct aac_softc *sc); static void aac_sa_qnotify(struct aac_softc *sc, int qbit); static int aac_sa_get_istatus(struct aac_softc *sc); static void aac_sa_clear_istatus(struct aac_softc *sc, int mask); static void aac_sa_set_mailbox(struct aac_softc *sc, u_int32_t command, u_int32_t arg0, u_int32_t arg1, u_int32_t arg2, u_int32_t arg3); static int aac_sa_get_mailboxstatus(struct aac_softc *sc); static void aac_sa_set_interrupts(struct aac_softc *sc, int enable); struct aac_interface aac_sa_interface = { aac_sa_get_fwstatus, aac_sa_qnotify, aac_sa_get_istatus, aac_sa_clear_istatus, aac_sa_set_mailbox, aac_sa_get_mailboxstatus, aac_sa_set_interrupts }; /* i960Rx interface */ static int aac_rx_get_fwstatus(struct aac_softc *sc); static void aac_rx_qnotify(struct aac_softc *sc, int qbit); static int aac_rx_get_istatus(struct aac_softc *sc); static void aac_rx_clear_istatus(struct aac_softc *sc, int mask); static void aac_rx_set_mailbox(struct aac_softc *sc, u_int32_t command, u_int32_t arg0, u_int32_t arg1, u_int32_t arg2, u_int32_t arg3); static int aac_rx_get_mailboxstatus(struct aac_softc *sc); static void aac_rx_set_interrupts(struct aac_softc *sc, int enable); struct aac_interface aac_rx_interface = { aac_rx_get_fwstatus, aac_rx_qnotify, aac_rx_get_istatus, aac_rx_clear_istatus, aac_rx_set_mailbox, aac_rx_get_mailboxstatus, aac_rx_set_interrupts }; /* Debugging and Diagnostics */ static void aac_describe_controller(struct aac_softc *sc); static char *aac_describe_code(struct aac_code_lookup *table, u_int32_t code); /* Management Interface */ static d_open_t aac_open; static d_close_t aac_close; static d_ioctl_t aac_ioctl; static d_poll_t aac_poll; static int aac_ioctl_sendfib(struct aac_softc *sc, caddr_t ufib); static void aac_handle_aif(struct aac_softc *sc, struct aac_fib *fib); static int aac_rev_check(struct aac_softc *sc, caddr_t udata); static int aac_getnext_aif(struct aac_softc *sc, caddr_t arg); 
static int aac_return_aif(struct aac_softc *sc, caddr_t uptr); static int aac_query_disk(struct aac_softc *sc, caddr_t uptr); #define AAC_CDEV_MAJOR 150 static struct cdevsw aac_cdevsw = { aac_open, /* open */ aac_close, /* close */ noread, /* read */ nowrite, /* write */ aac_ioctl, /* ioctl */ aac_poll, /* poll */ nommap, /* mmap */ nostrategy, /* strategy */ "aac", /* name */ AAC_CDEV_MAJOR, /* major */ nodump, /* dump */ nopsize, /* psize */ 0, /* flags */ #if __FreeBSD_version < 500005 -1, /* bmaj */ #endif }; MALLOC_DEFINE(M_AACBUF, "aacbuf", "Buffers for the AAC driver"); /* sysctl node */ SYSCTL_NODE(_hw, OID_AUTO, aac, CTLFLAG_RD, 0, "AAC driver parameters"); /* * Device Interface */ /* * Initialise the controller and softc */ int aac_attach(struct aac_softc *sc) { int error, unit; debug_called(1); /* * Initialise per-controller queues. */ aac_initq_free(sc); aac_initq_ready(sc); aac_initq_busy(sc); aac_initq_complete(sc); aac_initq_bio(sc); #if __FreeBSD_version >= 500005 /* * Initialise command-completion task. */ TASK_INIT(&sc->aac_task_complete, 0, aac_complete, sc); #endif /* disable interrupts before we enable anything */ AAC_MASK_INTERRUPTS(sc); /* mark controller as suspended until we get ourselves organised */ sc->aac_state |= AAC_STATE_SUSPEND; /* * Allocate command structures. */ if ((error = aac_alloc_commands(sc)) != 0) return(error); /* * Initialise the adapter. */ if ((error = aac_init(sc)) != 0) return(error); /* * Print a little information about the controller. */ aac_describe_controller(sc); /* * Register to probe our containers later. 
*/ TAILQ_INIT(&sc->aac_container_tqh); AAC_LOCK_INIT(&sc->aac_container_lock, "AAC container lock"); /* * Lock for the AIF queue */ AAC_LOCK_INIT(&sc->aac_aifq_lock, "AAC AIF lock"); sc->aac_ich.ich_func = aac_startup; sc->aac_ich.ich_arg = sc; if (config_intrhook_establish(&sc->aac_ich) != 0) { device_printf(sc->aac_dev, "can't establish configuration hook\n"); return(ENXIO); } /* * Make the control device. */ unit = device_get_unit(sc->aac_dev); sc->aac_dev_t = make_dev(&aac_cdevsw, unit, UID_ROOT, GID_WHEEL, 0644, "aac%d", unit); #if __FreeBSD_version > 500005 (void)make_dev_alias(sc->aac_dev_t, "afa%d", unit); (void)make_dev_alias(sc->aac_dev_t, "hpn%d", unit); #endif sc->aac_dev_t->si_drv1 = sc; /* Create the AIF thread */ #if __FreeBSD_version > 500005 if (kthread_create((void(*)(void *))aac_host_command, sc, &sc->aifthread, 0, "aac%daif", unit)) #else if (kthread_create((void(*)(void *))aac_host_command, sc, &sc->aifthread, "aac%daif", unit)) #endif panic("Could not create AIF thread\n"); /* Register the shutdown method to only be called post-dump */ if ((EVENTHANDLER_REGISTER(shutdown_final, aac_shutdown, sc->aac_dev, SHUTDOWN_PRI_DEFAULT)) == NULL) device_printf(sc->aac_dev, "shutdown event registration failed\n"); return(0); } /* * Probe for containers, create disks. 
static void
aac_startup(void *arg)
{
	struct aac_softc *sc;
	struct aac_mntinfo mi;
	struct aac_mntinforesponse mir;
	u_int16_t rsize;
	int i = 0;

	debug_called(1);

	sc = (struct aac_softc *)arg;

	/* disconnect ourselves from the intrhook chain */
	config_intrhook_disestablish(&sc->aac_ich);

	/* loop over possible containers */
	mi.Command = VM_NameServe;
	mi.MntType = FT_FILESYS;
	do {
		/* request information on this container */
		mi.MntCount = i;
		rsize = sizeof(mir);
		if (aac_sync_fib(sc, ContainerCommand, 0, &mi,
				 sizeof(struct aac_mntinfo), &mir, &rsize)) {
			/*
			 * XXX `continue' here re-tests the while condition
			 * without incrementing i, and on the very first probe
			 * failure `mir' is still uninitialized — verify.
			 */
			debug(2, "error probing container %d", i);
			continue;
		}
		/* check response size */
		if (rsize != sizeof(mir)) {
			debug(2, "container info response wrong size "
			      "(%d should be %d)", rsize, sizeof(mir));
			continue;
		}

		aac_add_container(sc, &mir, 0);
		i++;
	} while ((i < mir.MntRespCount) && (i < AAC_MAX_CONTAINERS));

	/* poke the bus to actually attach the child devices */
	if (bus_generic_attach(sc->aac_dev))
		device_printf(sc->aac_dev, "bus_generic_attach failed\n");

	/* mark the controller up */
	sc->aac_state &= ~AAC_STATE_SUSPEND;

	/* enable interrupts now */
	AAC_UNMASK_INTERRUPTS(sc);

	/* enable the timeout watchdog */
	timeout((timeout_t*)aac_timeout, sc, AAC_PERIODIC_INTERVAL * hz);
}

/*
 * Create a device to respresent a new container
 */
static void
aac_add_container(struct aac_softc *sc, struct aac_mntinforesponse *mir, int f)
{
	struct aac_container *co;
	device_t child;

	/*
	 * Check container volume type for validity.  Note that many of
	 * the possible types may never show up.
*/
	if ((mir->Status == ST_OK) && (mir->MntTable[0].VolType != CT_NONE)) {
		MALLOC(co, struct aac_container *, sizeof *co, M_AACBUF,
		       M_NOWAIT);
		if (co == NULL)
			panic("Out of memory?!\n");
		debug(1, "id %x  name '%.16s'  size %u  type %d",
		      mir->MntTable[0].ObjectId,
		      mir->MntTable[0].FileSystemName,
		      mir->MntTable[0].Capacity, mir->MntTable[0].VolType);

		if ((child = device_add_child(sc->aac_dev, NULL, -1)) == NULL)
			device_printf(sc->aac_dev, "device_add_child failed\n");
		else
			device_set_ivars(child, co);
		/*
		 * XXX this runs even when device_add_child failed and
		 * child is NULL — verify against newbus semantics.
		 */
		device_set_desc(child, aac_describe_code(aac_container_types,
				mir->MntTable[0].VolType));
		co->co_disk = child;
		co->co_found = f;
		bcopy(&mir->MntTable[0], &co->co_mntobj,
		      sizeof(struct aac_mntobj));
		AAC_LOCK_ACQUIRE(&sc->aac_container_lock);
		TAILQ_INSERT_TAIL(&sc->aac_container_tqh, co, co_link);
		AAC_LOCK_RELEASE(&sc->aac_container_lock);
	}
}

/*
 * Free all of the resources associated with (sc)
 *
 * Should not be called if the controller is active.
 */
void
aac_free(struct aac_softc *sc)
{
	debug_called(1);

	/* remove the control device */
	if (sc->aac_dev_t != NULL)
		destroy_dev(sc->aac_dev_t);

	/* throw away any FIB buffers, discard the FIB DMA tag */
	if (sc->aac_fibs != NULL)
		aac_free_commands(sc);
	if (sc->aac_fib_dmat)
		bus_dma_tag_destroy(sc->aac_fib_dmat);

	/* destroy the common area */
	if (sc->aac_common) {
		bus_dmamap_unload(sc->aac_common_dmat, sc->aac_common_dmamap);
		bus_dmamem_free(sc->aac_common_dmat, sc->aac_common,
				sc->aac_common_dmamap);
	}
	if (sc->aac_common_dmat)
		bus_dma_tag_destroy(sc->aac_common_dmat);

	/* disconnect the interrupt handler */
	if (sc->aac_intr)
		bus_teardown_intr(sc->aac_dev, sc->aac_irq, sc->aac_intr);
	if (sc->aac_irq != NULL)
		bus_release_resource(sc->aac_dev, SYS_RES_IRQ, sc->aac_irq_rid,
				     sc->aac_irq);

	/* destroy data-transfer DMA tag */
	if (sc->aac_buffer_dmat)
		bus_dma_tag_destroy(sc->aac_buffer_dmat);

	/* destroy the parent DMA tag */
	if (sc->aac_parent_dmat)
		bus_dma_tag_destroy(sc->aac_parent_dmat);

	/* release the register window mapping */
	if
(sc->aac_regs_resource != NULL)
		bus_release_resource(sc->aac_dev, SYS_RES_MEMORY,
				     sc->aac_regs_rid, sc->aac_regs_resource);
}

/*
 * Disconnect from the controller completely, in preparation for unload.
 */
int
aac_detach(device_t dev)
{
	struct aac_softc *sc;
#if AAC_BROKEN
	int error;
#endif

	debug_called(1);

	sc = device_get_softc(dev);

	/* Refuse to detach while the control device is open. */
	if (sc->aac_state & AAC_STATE_OPEN)
		return(EBUSY);

#if AAC_BROKEN
	/* Ask the AIF thread to exit, and wait briefly for it. */
	if (sc->aifflags & AAC_AIFFLAGS_RUNNING) {
		sc->aifflags |= AAC_AIFFLAGS_EXIT;
		wakeup(sc->aifthread);
		tsleep(sc->aac_dev, PUSER | PCATCH, "aacdch", 30 * hz);
	}

	if (sc->aifflags & AAC_AIFFLAGS_RUNNING)
		panic("Cannot shutdown AIF thread\n");

	if ((error = aac_shutdown(dev)))
		return(error);

	aac_free(sc);

	return(0);
#else
	/* Detach is disabled while the driver is marked broken. */
	return (EBUSY);
#endif
}

/*
 * Bring the controller down to a dormant state and detach all child devices.
 *
 * This function is called before detach or system shutdown.
 *
 * Note that we can assume that the bioq on the controller is empty, as we won't
 * allow shutdown if any device is open.
 */
int
aac_shutdown(device_t dev)
{
	struct aac_softc *sc;
	struct aac_close_command cc;
	int s, i;

	debug_called(1);

	sc = device_get_softc(dev);

	s = splbio();
	sc->aac_state |= AAC_STATE_SUSPEND;

	/*
	 * Send a Container shutdown followed by a HostShutdown FIB to the
	 * controller to convince it that we don't want to talk to it anymore.
	 * We've been closed and all I/O completed already
	 */
	device_printf(sc->aac_dev, "shutting down controller...");

	cc.Command = VM_CloseAll;
	cc.ContainerId = 0xffffffff;
	if (aac_sync_fib(sc, ContainerCommand, 0, &cc, sizeof(cc),
	    NULL, NULL))
		printf("FAILED.\n");
	else {
		i = 0;
		/*
		 * XXX Issuing this command to the controller makes it shut down
		 * but also keeps it from coming back up without a reset of the
		 * PCI bus.  This is not desirable if you are just unloading the
		 * driver module with the intent to reload it later.
*/
		if (aac_sync_fib(sc, FsaHostShutdown, AAC_FIBSTATE_SHUTDOWN,
		    &i, sizeof(i), NULL, NULL)) {
			printf("FAILED.\n");
		} else {
			printf("done.\n");
		}
	}

	AAC_MASK_INTERRUPTS(sc);
	splx(s);
	return(0);
}

/*
 * Bring the controller to a quiescent state, ready for system suspend.
 */
int
aac_suspend(device_t dev)
{
	struct aac_softc *sc;
	int s;

	debug_called(1);

	sc = device_get_softc(dev);

	s = splbio();
	sc->aac_state |= AAC_STATE_SUSPEND;

	AAC_MASK_INTERRUPTS(sc);
	splx(s);
	return(0);
}

/*
 * Bring the controller back to a state ready for operation.
 */
int
aac_resume(device_t dev)
{
	struct aac_softc *sc;

	debug_called(1);

	sc = device_get_softc(dev);

	sc->aac_state &= ~AAC_STATE_SUSPEND;
	AAC_UNMASK_INTERRUPTS(sc);
	return(0);
}

/*
 * Take an interrupt.
 */
void
aac_intr(void *arg)
{
	struct aac_softc *sc;
	u_int16_t reason;

	debug_called(2);

	sc = (struct aac_softc *)arg;

	reason = AAC_GET_ISTATUS(sc);

	/* controller wants to talk to the log */
	if (reason & AAC_DB_PRINTF) {
		AAC_CLEAR_ISTATUS(sc, AAC_DB_PRINTF);
		aac_print_printf(sc);
	}

	/* controller has a message for us? */
	if (reason & AAC_DB_COMMAND_READY) {
		AAC_CLEAR_ISTATUS(sc, AAC_DB_COMMAND_READY);
		/* XXX What happens if the thread is already awake? */
		if (sc->aifflags & AAC_AIFFLAGS_RUNNING) {
			sc->aifflags |= AAC_AIFFLAGS_PENDING;
			wakeup(sc->aifthread);
		}
	}

	/* controller has a response for us?
*/ if (reason & AAC_DB_RESPONSE_READY) { AAC_CLEAR_ISTATUS(sc, AAC_DB_RESPONSE_READY); aac_host_response(sc); } /* * spurious interrupts that we don't use - reset the mask and clear the * interrupts */ if (reason & (AAC_DB_COMMAND_NOT_FULL | AAC_DB_RESPONSE_NOT_FULL)) { AAC_UNMASK_INTERRUPTS(sc); AAC_CLEAR_ISTATUS(sc, AAC_DB_COMMAND_NOT_FULL | AAC_DB_RESPONSE_NOT_FULL); } }; /* * Command Processing */ /* * Start as much queued I/O as possible on the controller */ static void aac_startio(struct aac_softc *sc) { struct aac_command *cm; debug_called(2); for (;;) { /* * Try to get a command that's been put off for lack of * resources */ cm = aac_dequeue_ready(sc); /* * Try to build a command off the bio queue (ignore error * return) */ if (cm == NULL) aac_bio_command(sc, &cm); /* nothing to do? */ if (cm == NULL) break; /* try to give the command to the controller */ if (aac_start(cm) == EBUSY) { /* put it on the ready queue for later */ aac_requeue_ready(cm); break; } } } /* * Deliver a command to the controller; allocate controller resources at the * last moment when possible. */ static int aac_start(struct aac_command *cm) { struct aac_softc *sc; int error; debug_called(2); sc = cm->cm_sc; /* get the command mapped */ aac_map_command(cm); /* fix up the address values in the FIB */ cm->cm_fib->Header.SenderFibAddress = (u_int32_t)cm->cm_fib; cm->cm_fib->Header.ReceiverFibAddress = cm->cm_fibphys; /* save a pointer to the command for speedy reverse-lookup */ cm->cm_fib->Header.SenderData = (u_int32_t)cm; /* XXX 64-bit physical * address issue */ /* put the FIB on the outbound queue */ error = aac_enqueue_fib(sc, cm->cm_queue, cm); return(error); } /* * Handle notification of one or more FIBs coming from the controller. 
 */
static void
aac_host_command(struct aac_softc *sc)
{
	struct aac_fib *fib;
	u_int32_t fib_size;
	int size;

	debug_called(2);

	/* AIF kernel thread main loop; exits when AAC_AIFFLAGS_EXIT is set */
	sc->aifflags |= AAC_AIFFLAGS_RUNNING;
	while (!(sc->aifflags & AAC_AIFFLAGS_EXIT)) {
		if (!(sc->aifflags & AAC_AIFFLAGS_PENDING))
			tsleep(sc->aifthread, PRIBIO, "aifthd", 15 * hz);
		sc->aifflags &= ~AAC_AIFFLAGS_PENDING;
		/* drain every FIB the adapter has queued for the host */
		for (;;) {
			if (aac_dequeue_fib(sc, AAC_HOST_NORM_CMD_QUEUE,
			    &fib_size, &fib))
				break;	/* nothing to do */
			AAC_PRINT_FIB(sc, fib);
			switch (fib->Header.Command) {
			case AifRequest:
				aac_handle_aif(sc, fib);
				break;
			default:
				device_printf(sc->aac_dev, "unknown command "
				    "from controller\n");
				break;
			}

			/* Return the AIF to the controller. */
			if ((fib->Header.XferState == 0) ||
			    (fib->Header.StructType != AAC_FIBTYPE_TFIB))
				break;

			if (fib->Header.XferState & AAC_FIBSTATE_FROMADAP) {
				fib->Header.XferState |= AAC_FIBSTATE_DONEHOST;
				*(AAC_FSAStatus*)fib->data = ST_OK;

				/* XXX Compute the Size field? */
				size = fib->Header.Size;
				if (size > sizeof(struct aac_fib)) {
					size = sizeof(struct aac_fib);
					fib->Header.Size = size;
				}
				/*
				 * Since we did not generate this command, it
				 * cannot go through the normal
				 * enqueue->startio chain.
				 */
				aac_enqueue_response(sc,
				    AAC_ADAP_NORM_RESP_QUEUE, fib);
			}
		}
	}
	sc->aifflags &= ~AAC_AIFFLAGS_RUNNING;
	/* let aac_detach's tsleep on aac_dev proceed */
	wakeup(sc->aac_dev);

#if __FreeBSD_version > 500005
	mtx_lock(&Giant);
#endif
	kthread_exit(0);
}

/*
 * Handle notification of one or more FIBs completed by the controller
 */
static void
aac_host_response(struct aac_softc *sc)
{
	struct aac_command *cm;
	struct aac_fib *fib;
	u_int32_t fib_size;

	debug_called(2);

	for (;;) {
		/* look for completed FIBs on our queue */
		if (aac_dequeue_fib(sc, AAC_HOST_NORM_RESP_QUEUE, &fib_size,
		    &fib))
			break;	/* nothing to do */

		/* get the command, unmap and queue for later processing */
		cm = (struct aac_command *)fib->Header.SenderData;
		if (cm == NULL) {
			/* no reverse-lookup pointer: orphaned FIB, just dump it */
			AAC_PRINT_FIB(sc, fib);
		} else {
			aac_remove_busy(cm);
			aac_unmap_command(cm);		/* XXX defer?
							 */
			aac_enqueue_complete(cm);
		}
	}

	/* handle completion processing */
#if __FreeBSD_version >= 500005
	taskqueue_enqueue(taskqueue_swi, &sc->aac_task_complete);
#else
	aac_complete(sc, 0);
#endif
}

/*
 * Process completed commands.  Runs from the taskqueue (5.x) or directly
 * from the interrupt path (4.x); 'pending' is unused here.
 */
static void
aac_complete(void *context, int pending)
{
	struct aac_softc *sc;
	struct aac_command *cm;

	debug_called(2);

	sc = (struct aac_softc *)context;

	/* pull completed commands off the queue */
	for (;;) {
		cm = aac_dequeue_complete(sc);
		if (cm == NULL)
			break;
		cm->cm_flags |= AAC_CMD_COMPLETED;

		/* is there a completion handler? */
		if (cm->cm_complete != NULL) {
			cm->cm_complete(cm);
		} else {
			/* assume that someone is sleeping on this command */
			wakeup(cm);
		}
	}

	/* see if we can start some more I/O */
	aac_startio(sc);
}

/*
 * Handle a bio submitted from a disk device.
 */
void
aac_submit_bio(struct bio *bp)
{
	struct aac_disk *ad;
	struct aac_softc *sc;

	debug_called(2);

	ad = (struct aac_disk *)bp->bio_dev->si_drv1;
	sc = ad->ad_controller;

	/* queue the BIO and try to get some work done */
	aac_enqueue_bio(sc, bp);
	aac_startio(sc);
}

/*
 * Get a bio and build a command to go with it.
 */
static int
aac_bio_command(struct aac_softc *sc, struct aac_command **cmp)
{
	struct aac_command *cm;
	struct aac_fib *fib;
	struct aac_blockread *br;
	struct aac_blockwrite *bw;
	struct aac_disk *ad;
	struct bio *bp;

	debug_called(2);

	/* get the resources we will need */
	cm = NULL;
	if ((bp = aac_dequeue_bio(sc)) == NULL)
		goto fail;
	if (aac_alloc_command(sc, &cm))	/* get a command */
		goto fail;

	/* fill out the command */
	cm->cm_data = (void *)bp->bio_data;
	cm->cm_datalen = bp->bio_bcount;
	cm->cm_complete = aac_bio_complete;
	cm->cm_private = bp;
	cm->cm_timestamp = time_second;	/* for aac_timeout's late-command scan */
	cm->cm_queue = AAC_ADAP_NORM_CMD_QUEUE;

	/* build the FIB */
	fib = cm->cm_fib;
	fib->Header.XferState =
		AAC_FIBSTATE_HOSTOWNED |
		AAC_FIBSTATE_INITIALISED |
		AAC_FIBSTATE_FROMHOST |
		AAC_FIBSTATE_REXPECTED |
		AAC_FIBSTATE_NORM;
	fib->Header.Command = ContainerCommand;
	fib->Header.Size = sizeof(struct aac_fib_header);

	/* build the read/write request */
	ad = (struct aac_disk *)bp->bio_dev->si_drv1;
	if (BIO_IS_READ(bp)) {
		br = (struct aac_blockread *)&fib->data[0];
		br->Command = VM_CtBlockRead;
		br->ContainerId = ad->ad_container->co_mntobj.ObjectId;
		br->BlockNumber = bp->bio_pblkno;
		br->ByteCount = bp->bio_bcount;
		fib->Header.Size += sizeof(struct aac_blockread);
		cm->cm_sgtable = &br->SgMap;
		cm->cm_flags |= AAC_CMD_DATAIN;
	} else {
		bw = (struct aac_blockwrite *)&fib->data[0];
		bw->Command = VM_CtBlockWrite;
		bw->ContainerId = ad->ad_container->co_mntobj.ObjectId;
		bw->BlockNumber = bp->bio_pblkno;
		bw->ByteCount = bp->bio_bcount;
		bw->Stable = CUNSTABLE;	/* XXX what's appropriate here? */
		fib->Header.Size += sizeof(struct aac_blockwrite);
		cm->cm_flags |= AAC_CMD_DATAOUT;
		cm->cm_sgtable = &bw->SgMap;
	}

	*cmp = cm;
	return(0);

fail:
	/* put the bio back so nothing is lost; caller will retry later */
	if (bp != NULL)
		aac_enqueue_bio(sc, bp);
	if (cm != NULL)
		aac_release_command(cm);
	return(ENOMEM);
}

/*
 * Handle a bio-instigated command that has been completed.
 */
static void
aac_bio_complete(struct aac_command *cm)
{
	struct aac_blockread_response *brr;
	struct aac_blockwrite_response *bwr;
	struct bio *bp;
	AAC_FSAStatus status;

	/* fetch relevant status and then release the command */
	bp = (struct bio *)cm->cm_private;
	if (BIO_IS_READ(bp)) {
		brr = (struct aac_blockread_response *)&cm->cm_fib->data[0];
		status = brr->Status;
	} else {
		bwr = (struct aac_blockwrite_response *)&cm->cm_fib->data[0];
		status = bwr->Status;
	}
	aac_release_command(cm);

	/* fix up the bio based on status */
	if (status == ST_OK) {
		bp->bio_resid = 0;
	} else {
		bp->bio_error = EIO;
		bp->bio_flags |= BIO_ERROR;
		/* pass an error string out to the disk layer */
		bp->bio_driver1 = aac_describe_code(aac_command_status_table,
		    status);
	}
	aac_biodone(bp);
}

/*
 * Dump a block of data to the controller.  If the queue is full, tell the
 * caller to hold off and wait for the queue to drain.
 */
int
aac_dump_enqueue(struct aac_disk *ad, u_int32_t lba, void *data, int dumppages)
{
	struct aac_softc *sc;
	struct aac_command *cm;
	struct aac_fib *fib;
	struct aac_blockwrite *bw;

	sc = ad->ad_controller;
	cm = NULL;

	if (aac_alloc_command(sc, &cm))
		return (EBUSY);

	/* fill out the command; no completion handler, dump path polls */
	cm->cm_data = data;
	cm->cm_datalen = dumppages * PAGE_SIZE;
	cm->cm_complete = NULL;
	cm->cm_private = NULL;
	cm->cm_timestamp = time_second;
	cm->cm_queue = AAC_ADAP_NORM_CMD_QUEUE;

	/* build the FIB */
	fib = cm->cm_fib;
	fib->Header.XferState =
		AAC_FIBSTATE_HOSTOWNED |
		AAC_FIBSTATE_INITIALISED |
		AAC_FIBSTATE_FROMHOST |
		AAC_FIBSTATE_REXPECTED |
		AAC_FIBSTATE_NORM;
	fib->Header.Command = ContainerCommand;
	fib->Header.Size = sizeof(struct aac_fib_header);

	bw = (struct aac_blockwrite *)&fib->data[0];
	bw->Command = VM_CtBlockWrite;
	bw->ContainerId = ad->ad_container->co_mntobj.ObjectId;
	bw->BlockNumber = lba;
	bw->ByteCount = dumppages * PAGE_SIZE;
	bw->Stable = CUNSTABLE;	/* XXX what's appropriate here?
				 */
	fib->Header.Size += sizeof(struct aac_blockwrite);
	cm->cm_flags |= AAC_CMD_DATAOUT;
	cm->cm_sgtable = &bw->SgMap;

	return (aac_start(cm));
}

/*
 * Wait for the card's queue to drain when dumping.  Also check for monitor
 * printf's.  Polls (no interrupts during a crash dump) until the adapter's
 * normal command queue is empty.
 */
void
aac_dump_complete(struct aac_softc *sc)
{
	struct aac_fib *fib;
	struct aac_command *cm;
	u_int16_t reason;
	u_int32_t pi, ci, fib_size;

	do {
		reason = AAC_GET_ISTATUS(sc);
		if (reason & AAC_DB_RESPONSE_READY) {
			AAC_CLEAR_ISTATUS(sc, AAC_DB_RESPONSE_READY);
			for (;;) {
				if (aac_dequeue_fib(sc,
				    AAC_HOST_NORM_RESP_QUEUE,
				    &fib_size, &fib))
					break;
				cm = (struct aac_command *)
					fib->Header.SenderData;
				if (cm == NULL)
					AAC_PRINT_FIB(sc, fib);
				else {
					aac_remove_busy(cm);
					aac_unmap_command(cm);
					aac_enqueue_complete(cm);
					aac_release_command(cm);
				}
			}
		}
		if (reason & AAC_DB_PRINTF) {
			AAC_CLEAR_ISTATUS(sc, AAC_DB_PRINTF);
			aac_print_printf(sc);
		}
		pi = sc->aac_queues->qt_qindex[AAC_ADAP_NORM_CMD_QUEUE][
			AAC_PRODUCER_INDEX];
		ci = sc->aac_queues->qt_qindex[AAC_ADAP_NORM_CMD_QUEUE][
			AAC_CONSUMER_INDEX];
	} while (ci != pi);

	return;
}

/*
 * Submit a command to the controller, return when it completes.
 * XXX This is very dangerous!  If the card has gone out to lunch, we could
 *     be stuck here forever.  At the same time, signals are not caught
 *     because there is a risk that a signal could wakeup the tsleep before
 *     the card has a chance to complete the command.  The passed in timeout
 *     is ignored for the same reason.  Since there is no way to cancel a
 *     command in progress, we should probably create a 'dead' queue where
 *     commands go that have been interrupted/timed-out/etc, that keeps them
 *     out of the free pool.  That way, if the card is just slow, it won't
 *     spam the memory of a command that has been recycled.
 */
static int
aac_wait_command(struct aac_command *cm, int timeout)
{
	/* NOTE(review): 'timeout' is deliberately ignored — see XXX above */
	int s, error = 0;

	debug_called(2);

	/* Put the command on the ready queue and get things going */
	cm->cm_queue = AAC_ADAP_NORM_CMD_QUEUE;
	aac_enqueue_ready(cm);
	aac_startio(cm->cm_sc);
	s = splbio();
	while (!(cm->cm_flags & AAC_CMD_COMPLETED) && (error != EWOULDBLOCK)) {
		error = tsleep(cm, PRIBIO, "aacwait", 0);
	}
	splx(s);
	return(error);
}

/*
 * Command Buffer Management
 */

/*
 * Allocate a command.  Returns ENOMEM if the freelist is empty.
 */
static int
aac_alloc_command(struct aac_softc *sc, struct aac_command **cmp)
{
	struct aac_command *cm;

	debug_called(3);

	if ((cm = aac_dequeue_free(sc)) == NULL)
		return(ENOMEM);

	*cmp = cm;
	return(0);
}

/*
 * Release a command back to the freelist.
 */
static void
aac_release_command(struct aac_command *cm)
{
	debug_called(3);

	/* (re)initialise the command/FIB */
	cm->cm_sgtable = NULL;
	cm->cm_flags = 0;
	cm->cm_complete = NULL;
	cm->cm_private = NULL;
	cm->cm_fib->Header.XferState = AAC_FIBSTATE_EMPTY;
	cm->cm_fib->Header.StructType = AAC_FIBTYPE_TFIB;
	cm->cm_fib->Header.Flags = 0;
	cm->cm_fib->Header.SenderSize = sizeof(struct aac_fib);

	/*
	 * These are duplicated in aac_start to cover the case where an
	 * intermediate stage may have destroyed them.  They're left
	 * initialised here for debugging purposes only.
	 */
	cm->cm_fib->Header.SenderFibAddress = (u_int32_t)cm->cm_fib;
	cm->cm_fib->Header.ReceiverFibAddress = cm->cm_fibphys;

	aac_enqueue_free(cm);
}

/*
 * Map helper for command/FIB allocation: records the bus address of the
 * FIB array loaded by bus_dmamap_load in aac_alloc_commands.
 */
static void
aac_map_command_helper(void *arg, bus_dma_segment_t *segs, int nseg, int error)
{
	struct aac_softc *sc;

	sc = (struct aac_softc *)arg;

	debug_called(3);

	sc->aac_fibphys = segs[0].ds_addr;
}

/*
 * Allocate and initialise commands/FIBs for this adapter.
 */
static int
aac_alloc_commands(struct aac_softc *sc)
{
	struct aac_command *cm;
	int i;

	debug_called(1);

	/* allocate the FIBs in DMAable memory and load them */
	if (bus_dmamem_alloc(sc->aac_fib_dmat, (void **)&sc->aac_fibs,
	    BUS_DMA_NOWAIT, &sc->aac_fibmap)) {
		return(ENOMEM);
	}
	bus_dmamap_load(sc->aac_fib_dmat, sc->aac_fibmap, sc->aac_fibs,
	    AAC_FIB_COUNT * sizeof(struct aac_fib), aac_map_command_helper,
	    sc, 0);

	/* initialise constant fields in the command structure */
	for (i = 0; i < AAC_FIB_COUNT; i++) {
		cm = &sc->aac_command[i];
		cm->cm_sc = sc;
		cm->cm_fib = sc->aac_fibs + i;
		cm->cm_fibphys = sc->aac_fibphys +
		    (i * sizeof(struct aac_fib));

		/*
		 * bus_dmamap_create returns 0 on success, so only commands
		 * whose data map was created go onto the free list.
		 */
		if (!bus_dmamap_create(sc->aac_buffer_dmat, 0,
		    &cm->cm_datamap))
			aac_release_command(cm);
	}
	return(0);
}

/*
 * Free FIBs owned by this adapter.
 */
static void
aac_free_commands(struct aac_softc *sc)
{
	int i;

	debug_called(1);

	for (i = 0; i < AAC_FIB_COUNT; i++)
		bus_dmamap_destroy(sc->aac_buffer_dmat,
		    sc->aac_command[i].cm_datamap);

	bus_dmamap_unload(sc->aac_fib_dmat, sc->aac_fibmap);
	bus_dmamem_free(sc->aac_fib_dmat, sc->aac_fibs, sc->aac_fibmap);
}

/*
 * Command-mapping helper function - populate this command's s/g table.
 */
static void
aac_map_command_sg(void *arg, bus_dma_segment_t *segs, int nseg, int error)
{
	struct aac_command *cm;
	struct aac_fib *fib;
	struct aac_sg_table *sg;
	int i;

	debug_called(3);

	cm = (struct aac_command *)arg;
	fib = cm->cm_fib;

	/* find the s/g table */
	sg = cm->cm_sgtable;

	/* copy into the FIB */
	if (sg != NULL) {
		sg->SgCount = nseg;
		for (i = 0; i < nseg; i++) {
			sg->SgEntry[i].SgAddress = segs[i].ds_addr;
			sg->SgEntry[i].SgByteCount = segs[i].ds_len;
		}
		/* update the FIB size for the s/g count */
		fib->Header.Size += nseg * sizeof(struct aac_sg_entry);
	}
}

/*
 * Map a command into controller-visible space.
 */
static void
aac_map_command(struct aac_command *cm)
{
	struct aac_softc *sc;

	debug_called(2);

	sc = cm->cm_sc;

	/* don't map more than once */
	if (cm->cm_flags & AAC_CMD_MAPPED)
		return;

	if (cm->cm_datalen != 0) {
		bus_dmamap_load(sc->aac_buffer_dmat, cm->cm_datamap,
		    cm->cm_data, cm->cm_datalen, aac_map_command_sg, cm, 0);

		if (cm->cm_flags & AAC_CMD_DATAIN)
			bus_dmamap_sync(sc->aac_buffer_dmat, cm->cm_datamap,
			    BUS_DMASYNC_PREREAD);
		if (cm->cm_flags & AAC_CMD_DATAOUT)
			bus_dmamap_sync(sc->aac_buffer_dmat, cm->cm_datamap,
			    BUS_DMASYNC_PREWRITE);
	}
	cm->cm_flags |= AAC_CMD_MAPPED;
}

/*
 * Unmap a command from controller-visible space.
 */
static void
aac_unmap_command(struct aac_command *cm)
{
	struct aac_softc *sc;

	debug_called(2);

	sc = cm->cm_sc;

	if (!(cm->cm_flags & AAC_CMD_MAPPED))
		return;

	if (cm->cm_datalen != 0) {
		if (cm->cm_flags & AAC_CMD_DATAIN)
			bus_dmamap_sync(sc->aac_buffer_dmat, cm->cm_datamap,
			    BUS_DMASYNC_POSTREAD);
		if (cm->cm_flags & AAC_CMD_DATAOUT)
			bus_dmamap_sync(sc->aac_buffer_dmat, cm->cm_datamap,
			    BUS_DMASYNC_POSTWRITE);

		bus_dmamap_unload(sc->aac_buffer_dmat, cm->cm_datamap);
	}
	cm->cm_flags &= ~AAC_CMD_MAPPED;
}

/*
 * Hardware Interface
 */

/*
 * Initialise the adapter.
 */
static void
aac_common_map(void *arg, bus_dma_segment_t *segs, int nseg, int error)
{
	struct aac_softc *sc;

	debug_called(1);

	sc = (struct aac_softc *)arg;

	/* record the bus address of the shared 'common' structure */
	sc->aac_common_busaddr = segs[0].ds_addr;
}

static int
aac_init(struct aac_softc *sc)
{
	struct aac_adapter_init *ip;
	time_t then;
	u_int32_t code;
	u_int8_t *qaddr;

	debug_called(1);

	/*
	 * First wait for the adapter to come ready.
	 */
	then = time_second;
	do {
		code = AAC_GET_FWSTATUS(sc);
		if (code & AAC_SELF_TEST_FAILED) {
			device_printf(sc->aac_dev, "FATAL: selftest failed\n");
			return(ENXIO);
		}
		if (code & AAC_KERNEL_PANIC) {
			device_printf(sc->aac_dev,
			    "FATAL: controller kernel panic\n");
			return(ENXIO);
		}
		if (time_second > (then + AAC_BOOT_TIMEOUT)) {
			device_printf(sc->aac_dev,
			    "FATAL: controller not coming ready, "
			    "status %x\n", code);
			return(ENXIO);
		}
	} while (!(code & AAC_UP_AND_RUNNING));

	/*
	 * Create DMA tag for the common structure and allocate it.
	 */
	if (bus_dma_tag_create(sc->aac_parent_dmat,	/* parent */
			       1, 0,			/* algnmnt, boundary */
			       BUS_SPACE_MAXADDR,	/* lowaddr */
			       BUS_SPACE_MAXADDR,	/* highaddr */
			       NULL, NULL,		/* filter, filterarg */
			       sizeof(struct aac_common), /* maxsize */
			       1,			/* nsegments */
			       BUS_SPACE_MAXSIZE_32BIT,	/* maxsegsize */
			       0,			/* flags */
			       &sc->aac_common_dmat)) {
		device_printf(sc->aac_dev,
		    "can't allocate common structure DMA tag\n");
		return(ENOMEM);
	}
	if (bus_dmamem_alloc(sc->aac_common_dmat, (void **)&sc->aac_common,
	    BUS_DMA_NOWAIT, &sc->aac_common_dmamap)) {
		device_printf(sc->aac_dev, "can't allocate common structure\n");
		return(ENOMEM);
	}
	bus_dmamap_load(sc->aac_common_dmat, sc->aac_common_dmamap,
	    sc->aac_common, sizeof(*sc->aac_common), aac_common_map, sc, 0);
	bzero(sc->aac_common, sizeof(*sc->aac_common));

	/*
	 * Fill in the init structure.  This tells the adapter about the
	 * physical location of various important shared data structures.
	 */
	ip = &sc->aac_common->ac_init;
	ip->InitStructRevision = AAC_INIT_STRUCT_REVISION;

	ip->AdapterFibsPhysicalAddress = sc->aac_common_busaddr +
	    offsetof(struct aac_common, ac_fibs);
	ip->AdapterFibsVirtualAddress = &sc->aac_common->ac_fibs[0];
	ip->AdapterFibsSize = AAC_ADAPTER_FIBS * sizeof(struct aac_fib);
	ip->AdapterFibAlign = sizeof(struct aac_fib);

	ip->PrintfBufferAddress = sc->aac_common_busaddr +
	    offsetof(struct aac_common, ac_printf);
	ip->PrintfBufferSize = AAC_PRINTF_BUFSIZE;

	ip->HostPhysMemPages = 0;	/* not used?
					 */
	ip->HostElapsedSeconds = time_second;	/* reset later if invalid */

	/*
	 * Initialise FIB queues.  Note that it appears that the layout of
	 * the indexes and the segmentation of the entries may be mandated by
	 * the adapter, which is only told about the base of the queue index
	 * fields.
	 *
	 * The initial values of the indices are assumed to inform the
	 * adapter of the sizes of the respective queues, and theoretically
	 * it could work out the entire layout of the queue structures from
	 * this.  We take the easy route and just lay this area out like
	 * everyone else does.
	 *
	 * The Linux driver uses a much more complex scheme whereby several
	 * header records are kept for each queue.  We use a couple of
	 * generic list manipulation functions which 'know' the size of each
	 * list by virtue of a table.
	 */
	qaddr = &sc->aac_common->ac_qbuf[0] + AAC_QUEUE_ALIGN;
	qaddr -= (u_int32_t)qaddr % AAC_QUEUE_ALIGN;	/* align down */
	sc->aac_queues = (struct aac_queue_table *)qaddr;
	ip->CommHeaderAddress = sc->aac_common_busaddr +
	    ((u_int32_t)sc->aac_queues - (u_int32_t)sc->aac_common);
	bzero(sc->aac_queues, sizeof(struct aac_queue_table));

	/* seed each queue's producer/consumer index with its size */
	sc->aac_queues->qt_qindex[AAC_HOST_NORM_CMD_QUEUE][AAC_PRODUCER_INDEX] =
		AAC_HOST_NORM_CMD_ENTRIES;
	sc->aac_queues->qt_qindex[AAC_HOST_NORM_CMD_QUEUE][AAC_CONSUMER_INDEX] =
		AAC_HOST_NORM_CMD_ENTRIES;
	sc->aac_queues->qt_qindex[AAC_HOST_HIGH_CMD_QUEUE][AAC_PRODUCER_INDEX] =
		AAC_HOST_HIGH_CMD_ENTRIES;
	sc->aac_queues->qt_qindex[AAC_HOST_HIGH_CMD_QUEUE][AAC_CONSUMER_INDEX] =
		AAC_HOST_HIGH_CMD_ENTRIES;
	sc->aac_queues->qt_qindex[AAC_ADAP_NORM_CMD_QUEUE][AAC_PRODUCER_INDEX] =
		AAC_ADAP_NORM_CMD_ENTRIES;
	sc->aac_queues->qt_qindex[AAC_ADAP_NORM_CMD_QUEUE][AAC_CONSUMER_INDEX] =
		AAC_ADAP_NORM_CMD_ENTRIES;
	sc->aac_queues->qt_qindex[AAC_ADAP_HIGH_CMD_QUEUE][AAC_PRODUCER_INDEX] =
		AAC_ADAP_HIGH_CMD_ENTRIES;
	sc->aac_queues->qt_qindex[AAC_ADAP_HIGH_CMD_QUEUE][AAC_CONSUMER_INDEX] =
		AAC_ADAP_HIGH_CMD_ENTRIES;
	sc->aac_queues->qt_qindex[AAC_HOST_NORM_RESP_QUEUE][AAC_PRODUCER_INDEX]=
		AAC_HOST_NORM_RESP_ENTRIES;
	sc->aac_queues->qt_qindex[AAC_HOST_NORM_RESP_QUEUE][AAC_CONSUMER_INDEX]=
		AAC_HOST_NORM_RESP_ENTRIES;
	sc->aac_queues->qt_qindex[AAC_HOST_HIGH_RESP_QUEUE][AAC_PRODUCER_INDEX]=
		AAC_HOST_HIGH_RESP_ENTRIES;
	sc->aac_queues->qt_qindex[AAC_HOST_HIGH_RESP_QUEUE][AAC_CONSUMER_INDEX]=
		AAC_HOST_HIGH_RESP_ENTRIES;
	sc->aac_queues->qt_qindex[AAC_ADAP_NORM_RESP_QUEUE][AAC_PRODUCER_INDEX]=
		AAC_ADAP_NORM_RESP_ENTRIES;
	sc->aac_queues->qt_qindex[AAC_ADAP_NORM_RESP_QUEUE][AAC_CONSUMER_INDEX]=
		AAC_ADAP_NORM_RESP_ENTRIES;
	sc->aac_queues->qt_qindex[AAC_ADAP_HIGH_RESP_QUEUE][AAC_PRODUCER_INDEX]=
		AAC_ADAP_HIGH_RESP_ENTRIES;
	sc->aac_queues->qt_qindex[AAC_ADAP_HIGH_RESP_QUEUE][AAC_CONSUMER_INDEX]=
		AAC_ADAP_HIGH_RESP_ENTRIES;
	sc->aac_qentries[AAC_HOST_NORM_CMD_QUEUE] =
		&sc->aac_queues->qt_HostNormCmdQueue[0];
	sc->aac_qentries[AAC_HOST_HIGH_CMD_QUEUE] =
		&sc->aac_queues->qt_HostHighCmdQueue[0];
	sc->aac_qentries[AAC_ADAP_NORM_CMD_QUEUE] =
		&sc->aac_queues->qt_AdapNormCmdQueue[0];
	sc->aac_qentries[AAC_ADAP_HIGH_CMD_QUEUE] =
		&sc->aac_queues->qt_AdapHighCmdQueue[0];
	sc->aac_qentries[AAC_HOST_NORM_RESP_QUEUE] =
		&sc->aac_queues->qt_HostNormRespQueue[0];
	sc->aac_qentries[AAC_HOST_HIGH_RESP_QUEUE] =
		&sc->aac_queues->qt_HostHighRespQueue[0];
	sc->aac_qentries[AAC_ADAP_NORM_RESP_QUEUE] =
		&sc->aac_queues->qt_AdapNormRespQueue[0];
	sc->aac_qentries[AAC_ADAP_HIGH_RESP_QUEUE] =
		&sc->aac_queues->qt_AdapHighRespQueue[0];

	/*
	 * Do controller-type-specific initialisation
	 */
	switch (sc->aac_hwif) {
	case AAC_HWIF_I960RX:
		AAC_SETREG4(sc, AAC_RX_ODBR, ~0);
		break;
	}

	/*
	 * Give the init structure to the controller.
	 */
	if (aac_sync_command(sc, AAC_MONKER_INITSTRUCT,
	    sc->aac_common_busaddr + offsetof(struct aac_common, ac_init),
	    0, 0, 0, NULL)) {
		device_printf(sc->aac_dev,
		    "error establishing init structure\n");
		return(EIO);
	}

	return(0);
}

/*
 * Send a synchronous command to the controller and wait for a result.
 */
static int
aac_sync_command(struct aac_softc *sc, u_int32_t command, u_int32_t arg0,
		 u_int32_t arg1, u_int32_t arg2, u_int32_t arg3, u_int32_t *sp)
{
	time_t then;
	u_int32_t status;

	debug_called(3);

	/* populate the mailbox */
	AAC_SET_MAILBOX(sc, command, arg0, arg1, arg2, arg3);

	/* ensure the sync command doorbell flag is cleared */
	AAC_CLEAR_ISTATUS(sc, AAC_DB_SYNC_COMMAND);

	/* then set it to signal the adapter */
	AAC_QNOTIFY(sc, AAC_DB_SYNC_COMMAND);

	/* spin waiting for the command to complete (busy-wait, no sleep) */
	then = time_second;
	do {
		if (time_second > (then + AAC_IMMEDIATE_TIMEOUT)) {
			debug(2, "timed out");
			return(EIO);
		}
	} while (!(AAC_GET_ISTATUS(sc) & AAC_DB_SYNC_COMMAND));

	/* clear the completion flag */
	AAC_CLEAR_ISTATUS(sc, AAC_DB_SYNC_COMMAND);

	/* get the command status */
	status = AAC_GET_MAILBOXSTATUS(sc);
	if (sp != NULL)
		*sp = status;
	return(0);
}

/*
 * Send a synchronous FIB to the controller and wait for a result.
 */
static int
aac_sync_fib(struct aac_softc *sc, u_int32_t command, u_int32_t xferstate,
	     void *data, u_int16_t datasize, void *result,
	     u_int16_t *resultsize)
{
	struct aac_fib *fib;

	debug_called(3);

	/* single pre-allocated sync FIB in the common structure */
	fib = &sc->aac_common->ac_sync_fib;

	if (datasize > AAC_FIB_DATASIZE)
		return(EINVAL);

	/*
	 * Set up the sync FIB
	 */
	fib->Header.XferState = AAC_FIBSTATE_HOSTOWNED |
		AAC_FIBSTATE_INITIALISED | AAC_FIBSTATE_EMPTY;
	fib->Header.XferState |= xferstate;
	fib->Header.Command = command;
	fib->Header.StructType = AAC_FIBTYPE_TFIB;
	fib->Header.Size = sizeof(struct aac_fib) + datasize;
	fib->Header.SenderSize = sizeof(struct aac_fib);
	fib->Header.SenderFibAddress = (u_int32_t)fib;
	fib->Header.ReceiverFibAddress = sc->aac_common_busaddr +
		offsetof(struct aac_common, ac_sync_fib);

	/*
	 * Copy in data.
	 */
	if (data != NULL) {
		KASSERT(datasize <= sizeof(fib->data),
			("aac_sync_fib: datasize to large"));
		bcopy(data, fib->data, datasize);
		fib->Header.XferState |= AAC_FIBSTATE_FROMHOST |
			AAC_FIBSTATE_NORM;
	}

	/*
	 * Give the FIB to the controller, wait for a response.
	 */
	if (aac_sync_command(sc, AAC_MONKER_SYNCFIB,
	    fib->Header.ReceiverFibAddress, 0, 0, 0, NULL)) {
		debug(2, "IO error");
		return(EIO);
	}

	/*
	 * Copy out the result
	 */
	if (result != NULL) {
		u_int copysize;

		/* truncate to the caller's buffer but report the full size */
		copysize = fib->Header.Size - sizeof(struct aac_fib_header);
		if (copysize > *resultsize)
			copysize = *resultsize;
		*resultsize = fib->Header.Size - sizeof(struct aac_fib_header);
		bcopy(fib->data, result, copysize);
	}
	return(0);
}

/*
 * Adapter-space FIB queue manipulation
 *
 * Note that the queue implementation here is a little funky; neither the PI
 * or CI will ever be zero.  This behaviour is a controller feature.
 */
static struct {
	int		size;		/* number of entries in the queue */
	int		notify;		/* doorbell bit to ring, 0 = none */
} aac_qinfo[] = {
	{AAC_HOST_NORM_CMD_ENTRIES, AAC_DB_COMMAND_NOT_FULL},
	{AAC_HOST_HIGH_CMD_ENTRIES, 0},
	{AAC_ADAP_NORM_CMD_ENTRIES, AAC_DB_COMMAND_READY},
	{AAC_ADAP_HIGH_CMD_ENTRIES, 0},
	{AAC_HOST_NORM_RESP_ENTRIES, AAC_DB_RESPONSE_NOT_FULL},
	{AAC_HOST_HIGH_RESP_ENTRIES, 0},
	{AAC_ADAP_NORM_RESP_ENTRIES, AAC_DB_RESPONSE_READY},
	{AAC_ADAP_HIGH_RESP_ENTRIES, 0}
};

/*
 * Atomically insert an entry into the nominated queue, returns 0 on success
 * or EBUSY if the queue is full.
 *
 * Note: it would be more efficient to defer notifying the controller in
 *       the case where we may be inserting several entries in rapid
 *       succession, but implementing this usefully may be difficult (it
 *       would involve a separate queue/notify interface).
 */
static int
aac_enqueue_fib(struct aac_softc *sc, int queue, struct aac_command *cm)
{
	u_int32_t pi, ci;
	int s, error;
	u_int32_t fib_size;
	u_int32_t fib_addr;

	debug_called(3);

	fib_size = cm->cm_fib->Header.Size;
	fib_addr = cm->cm_fib->Header.ReceiverFibAddress;

	s = splbio();

	/* get the producer/consumer indices */
	pi = sc->aac_queues->qt_qindex[queue][AAC_PRODUCER_INDEX];
	ci = sc->aac_queues->qt_qindex[queue][AAC_CONSUMER_INDEX];

	/* wrap the queue?
 */
	if (pi >= aac_qinfo[queue].size)
		pi = 0;

	/* check for queue full */
	if ((pi + 1) == ci) {
		error = EBUSY;
		goto out;
	}

	/* populate queue entry */
	(sc->aac_qentries[queue] + pi)->aq_fib_size = fib_size;
	(sc->aac_qentries[queue] + pi)->aq_fib_addr = fib_addr;

	/* update producer index */
	sc->aac_queues->qt_qindex[queue][AAC_PRODUCER_INDEX] = pi + 1;

	/*
	 * To avoid a race with its completion interrupt, place this command
	 * on the busy queue prior to advertising it to the controller.
	 */
	aac_enqueue_busy(cm);

	/* notify the adapter if we know how */
	if (aac_qinfo[queue].notify != 0)
		AAC_QNOTIFY(sc, aac_qinfo[queue].notify);

	error = 0;

out:
	splx(s);
	return(error);
}

/*
 * Atomically remove one entry from the nominated queue, returns 0 on
 * success or ENOENT if the queue is empty.
 */
static int
aac_dequeue_fib(struct aac_softc *sc, int queue, u_int32_t *fib_size,
		struct aac_fib **fib_addr)
{
	u_int32_t pi, ci;
	int s, error;
	int notify;

	debug_called(3);

	s = splbio();

	/* get the producer/consumer indices */
	pi = sc->aac_queues->qt_qindex[queue][AAC_PRODUCER_INDEX];
	ci = sc->aac_queues->qt_qindex[queue][AAC_CONSUMER_INDEX];

	/* check for queue empty */
	if (ci == pi) {
		error = ENOENT;
		goto out;
	}

	/* taking the last entry un-fills the queue; remember to notify */
	notify = 0;
	if (ci == pi + 1)
		notify++;

	/* wrap the queue?
	 */
	if (ci >= aac_qinfo[queue].size)
		ci = 0;

	/* fetch the entry */
	*fib_size = (sc->aac_qentries[queue] + ci)->aq_fib_size;
	*fib_addr = (struct aac_fib *)(sc->aac_qentries[queue] +
	    ci)->aq_fib_addr;

	/* update consumer index */
	sc->aac_queues->qt_qindex[queue][AAC_CONSUMER_INDEX] = ci + 1;

	/* if we have made the queue un-full, notify the adapter */
	if (notify && (aac_qinfo[queue].notify != 0))
		AAC_QNOTIFY(sc, aac_qinfo[queue].notify);
	error = 0;

out:
	splx(s);
	return(error);
}

/*
 * Put our response to an Adapter Initialed Fib on the response queue
 */
static int
aac_enqueue_response(struct aac_softc *sc, int queue, struct aac_fib *fib)
{
	u_int32_t pi, ci;
	int s, error;
	u_int32_t fib_size;
	u_int32_t fib_addr;

	debug_called(1);

	/* Tell the adapter where the FIB is */
	fib_size = fib->Header.Size;
	fib_addr = fib->Header.SenderFibAddress;
	fib->Header.ReceiverFibAddress = fib_addr;

	s = splbio();

	/* get the producer/consumer indices */
	pi = sc->aac_queues->qt_qindex[queue][AAC_PRODUCER_INDEX];
	ci = sc->aac_queues->qt_qindex[queue][AAC_CONSUMER_INDEX];

	/* wrap the queue? */
	if (pi >= aac_qinfo[queue].size)
		pi = 0;

	/* check for queue full */
	if ((pi + 1) == ci) {
		error = EBUSY;
		goto out;
	}

	/* populate queue entry */
	(sc->aac_qentries[queue] + pi)->aq_fib_size = fib_size;
	(sc->aac_qentries[queue] + pi)->aq_fib_addr = fib_addr;

	/* update producer index */
	sc->aac_queues->qt_qindex[queue][AAC_PRODUCER_INDEX] = pi + 1;

	/* notify the adapter if we know how */
	if (aac_qinfo[queue].notify != 0)
		AAC_QNOTIFY(sc, aac_qinfo[queue].notify);

	error = 0;

out:
	splx(s);
	return(error);
}

/*
 * Check for commands that have been outstanding for a suspiciously long
 * time, and complain about them.
 */
static void
aac_timeout(struct aac_softc *sc)
{
	int s;
	struct aac_command *cm;
	time_t deadline;

#if 0
	/* simulate an interrupt to handle possibly-missed interrupts */
	/*
	 * XXX This was done to work around another bug which has since been
It is dangerous anyways because you don't want multiple * threads in the interrupt handler at the same time! If calling * is deamed neccesary in the future, proper mutexes must be used. */ s = splbio(); aac_intr(sc); splx(s); /* kick the I/O queue to restart it in the case of deadlock */ aac_startio(sc); #endif /* * traverse the busy command list, bitch about late commands once * only. */ deadline = time_second - AAC_CMD_TIMEOUT; s = splbio(); TAILQ_FOREACH(cm, &sc->aac_busy, cm_link) { if ((cm->cm_timestamp < deadline) /* && !(cm->cm_flags & AAC_CMD_TIMEDOUT) */) { cm->cm_flags |= AAC_CMD_TIMEDOUT; device_printf(sc->aac_dev, "COMMAND %p TIMEOUT AFTER %d SECONDS\n", cm, (int)(time_second-cm->cm_timestamp)); AAC_PRINT_FIB(sc, cm->cm_fib); } } splx(s); /* reset the timer for next time */ timeout((timeout_t*)aac_timeout, sc, AAC_PERIODIC_INTERVAL * hz); return; } /* * Interface Function Vectors */ /* * Read the current firmware status word. */ static int aac_sa_get_fwstatus(struct aac_softc *sc) { debug_called(3); return(AAC_GETREG4(sc, AAC_SA_FWSTATUS)); } static int aac_rx_get_fwstatus(struct aac_softc *sc) { debug_called(3); return(AAC_GETREG4(sc, AAC_RX_FWSTATUS)); } static int aac_fa_get_fwstatus(struct aac_softc *sc) { int val; debug_called(3); val = AAC_GETREG4(sc, AAC_FA_FWSTATUS); return (val); } /* * Notify the controller of a change in a given queue */ static void aac_sa_qnotify(struct aac_softc *sc, int qbit) { debug_called(3); AAC_SETREG2(sc, AAC_SA_DOORBELL1_SET, qbit); } static void aac_rx_qnotify(struct aac_softc *sc, int qbit) { debug_called(3); AAC_SETREG4(sc, AAC_RX_IDBR, qbit); } static void aac_fa_qnotify(struct aac_softc *sc, int qbit) { debug_called(3); AAC_SETREG2(sc, AAC_FA_DOORBELL1, qbit); AAC_FA_HACK(sc); } /* * Get the interrupt reason bits */ static int aac_sa_get_istatus(struct aac_softc *sc) { debug_called(3); return(AAC_GETREG2(sc, AAC_SA_DOORBELL0)); } static int aac_rx_get_istatus(struct aac_softc *sc) { debug_called(3); 
return(AAC_GETREG4(sc, AAC_RX_ODBR)); } static int aac_fa_get_istatus(struct aac_softc *sc) { int val; debug_called(3); val = AAC_GETREG2(sc, AAC_FA_DOORBELL0); return (val); } /* * Clear some interrupt reason bits */ static void aac_sa_clear_istatus(struct aac_softc *sc, int mask) { debug_called(3); AAC_SETREG2(sc, AAC_SA_DOORBELL0_CLEAR, mask); } static void aac_rx_clear_istatus(struct aac_softc *sc, int mask) { debug_called(3); AAC_SETREG4(sc, AAC_RX_ODBR, mask); } static void aac_fa_clear_istatus(struct aac_softc *sc, int mask) { debug_called(3); AAC_SETREG2(sc, AAC_FA_DOORBELL0_CLEAR, mask); AAC_FA_HACK(sc); } /* * Populate the mailbox and set the command word */ static void aac_sa_set_mailbox(struct aac_softc *sc, u_int32_t command, u_int32_t arg0, u_int32_t arg1, u_int32_t arg2, u_int32_t arg3) { debug_called(4); AAC_SETREG4(sc, AAC_SA_MAILBOX, command); AAC_SETREG4(sc, AAC_SA_MAILBOX + 4, arg0); AAC_SETREG4(sc, AAC_SA_MAILBOX + 8, arg1); AAC_SETREG4(sc, AAC_SA_MAILBOX + 12, arg2); AAC_SETREG4(sc, AAC_SA_MAILBOX + 16, arg3); } static void aac_rx_set_mailbox(struct aac_softc *sc, u_int32_t command, u_int32_t arg0, u_int32_t arg1, u_int32_t arg2, u_int32_t arg3) { debug_called(4); AAC_SETREG4(sc, AAC_RX_MAILBOX, command); AAC_SETREG4(sc, AAC_RX_MAILBOX + 4, arg0); AAC_SETREG4(sc, AAC_RX_MAILBOX + 8, arg1); AAC_SETREG4(sc, AAC_RX_MAILBOX + 12, arg2); AAC_SETREG4(sc, AAC_RX_MAILBOX + 16, arg3); } static void aac_fa_set_mailbox(struct aac_softc *sc, u_int32_t command, u_int32_t arg0, u_int32_t arg1, u_int32_t arg2, u_int32_t arg3) { debug_called(4); AAC_SETREG4(sc, AAC_FA_MAILBOX, command); AAC_FA_HACK(sc); AAC_SETREG4(sc, AAC_FA_MAILBOX + 4, arg0); AAC_FA_HACK(sc); AAC_SETREG4(sc, AAC_FA_MAILBOX + 8, arg1); AAC_FA_HACK(sc); AAC_SETREG4(sc, AAC_FA_MAILBOX + 12, arg2); AAC_FA_HACK(sc); AAC_SETREG4(sc, AAC_FA_MAILBOX + 16, arg3); AAC_FA_HACK(sc); } /* * Fetch the immediate command status word */ static int aac_sa_get_mailboxstatus(struct aac_softc *sc) { 
	debug_called(4);

	return(AAC_GETREG4(sc, AAC_SA_MAILBOX));
}

static int
aac_rx_get_mailboxstatus(struct aac_softc *sc)
{
	debug_called(4);

	return(AAC_GETREG4(sc, AAC_RX_MAILBOX));
}

static int
aac_fa_get_mailboxstatus(struct aac_softc *sc)
{
	int val;

	debug_called(4);

	val = AAC_GETREG4(sc, AAC_FA_MAILBOX);
	return (val);
}

/*
 * Set/clear interrupt masks
 */
static void
aac_sa_set_interrupts(struct aac_softc *sc, int enable)
{
	debug(2, "%sable interrupts", enable ? "en" : "dis");

	if (enable) {
		/* unmask only the doorbell interrupts we service */
		AAC_SETREG2((sc), AAC_SA_MASK0_CLEAR, AAC_DB_INTERRUPTS);
	} else {
		/* mask everything */
		AAC_SETREG2((sc), AAC_SA_MASK0_SET, ~0);
	}
}

static void
aac_rx_set_interrupts(struct aac_softc *sc, int enable)
{
	debug(2, "%sable interrupts", enable ? "en" : "dis");

	if (enable) {
		AAC_SETREG4(sc, AAC_RX_OIMR, ~AAC_DB_INTERRUPTS);
	} else {
		AAC_SETREG4(sc, AAC_RX_OIMR, ~0);
	}
}

static void
aac_fa_set_interrupts(struct aac_softc *sc, int enable)
{
	debug(2, "%sable interrupts", enable ? "en" : "dis");

	if (enable) {
		AAC_SETREG2((sc), AAC_FA_MASK0_CLEAR, AAC_DB_INTERRUPTS);
		AAC_FA_HACK(sc);
	} else {
		AAC_SETREG2((sc), AAC_FA_MASK0, ~0);
		AAC_FA_HACK(sc);
	}
}

/*
 * Debugging and Diagnostics
 */

/*
 * Print some information about the controller.
 */
static void
aac_describe_controller(struct aac_softc *sc)
{
	u_int8_t buf[AAC_FIB_DATASIZE];	/* XXX really a bit big
					 * for the stack */
	u_int16_t bufsize;
	struct aac_adapter_info *info;
	u_int8_t arg;

	debug_called(2);

	/* ask the adapter for its description via a synchronous FIB */
	arg = 0;
	bufsize = sizeof(buf);
	if (aac_sync_fib(sc, RequestAdapterInfo, 0, &arg, sizeof(arg), &buf,
	    &bufsize)) {
		device_printf(sc->aac_dev, "RequestAdapterInfo failed\n");
		return;
	}
	if (bufsize != sizeof(*info)) {
		/* unexpected size: warn but keep going with what we got */
		device_printf(sc->aac_dev,
			      "RequestAdapterInfo returned wrong data size "
			      "(%d != %d)\n", bufsize, sizeof(*info));
		/*return;*/
	}
	info = (struct aac_adapter_info *)&buf[0];
	device_printf(sc->aac_dev, "%s %dMHz, %dMB cache memory, %s\n",
		      aac_describe_code(aac_cpu_variant, info->CpuVariant),
		      info->ClockSpeed, info->BufferMem / (1024 * 1024),
		      aac_describe_code(aac_battery_platform,
					info->batteryPlatform));

	/* save the kernel revision structure for later use */
	sc->aac_revision = info->KernelRevision;
	device_printf(sc->aac_dev, "Kernel %d.%d-%d, Build %d, S/N %6X\n",
		      info->KernelRevision.external.comp.major,
		      info->KernelRevision.external.comp.minor,
		      info->KernelRevision.external.comp.dash,
		      info->KernelRevision.buildNumber,
		      (u_int32_t)(info->SerialNumber & 0xffffff));
}

/*
 * Look up a text description of a numeric error code and return a pointer to
 * same.
 */
static char *
aac_describe_code(struct aac_code_lookup *table, u_int32_t code)
{
	int i;

	for (i = 0; table[i].string != NULL; i++)
		if (table[i].code == code)
			return(table[i].string);
	/*
	 * Not found: the loop stopped on the NULL-string terminator, so
	 * return the string of the entry just past it (presumably the
	 * table's default "unknown" text -- TODO confirm table layout).
	 */
	return(table[i + 1].string);
}

/*
 * Management Interface
 */

static int
aac_open(dev_t dev, int flags, int fmt, d_thread_t *td)
{
	struct aac_softc *sc;

	debug_called(2);

	sc = dev->si_drv1;

	/* Check to make sure the device isn't already open */
	if (sc->aac_state & AAC_STATE_OPEN) {
		return EBUSY;
	}
	sc->aac_state |= AAC_STATE_OPEN;

	return 0;
}

static int
aac_close(dev_t dev, int flags, int fmt, d_thread_t *td)
{
	struct aac_softc *sc;

	debug_called(2);

	sc = dev->si_drv1;

	/* Mark this unit as no longer open */
	sc->aac_state &= ~AAC_STATE_OPEN;

	return 0;
}

static int
aac_ioctl(dev_t dev, u_long cmd, caddr_t arg, int flag, d_thread_t *td)
{
	union aac_statrequest *as;
	struct aac_softc *sc;
	int error = 0;
	int i;

	debug_called(2);

	as = (union aac_statrequest *)arg;
	sc = dev->si_drv1;

	switch (cmd) {
	case AACIO_STATS:
		switch (as->as_item) {
		case AACQ_FREE:
		case AACQ_BIO:
		case AACQ_READY:
		case AACQ_BUSY:
		case AACQ_COMPLETE:
			bcopy(&sc->aac_qstat[as->as_item], &as->as_qstat,
			      sizeof(struct aac_qstat));
			break;
		default:
			error = ENOENT;
			break;
		}
		break;

	/*
	 * For each native FSACTL_* ioctl below, 'arg' holds a pointer to
	 * the real user argument; dereference it once and then fall
	 * through to the Linux-compatible handler, which takes the
	 * argument directly.
	 */
	case FSACTL_SENDFIB:
		arg = *(caddr_t*)arg;
		/* FALLTHROUGH */
	case FSACTL_LNX_SENDFIB:
		debug(1, "FSACTL_SENDFIB");
		error = aac_ioctl_sendfib(sc, arg);
		break;
	case FSACTL_AIF_THREAD:
	case FSACTL_LNX_AIF_THREAD:
		debug(1, "FSACTL_AIF_THREAD");
		error = EINVAL;
		break;
	case FSACTL_OPEN_GET_ADAPTER_FIB:
		arg = *(caddr_t*)arg;
		/* FALLTHROUGH */
	case FSACTL_LNX_OPEN_GET_ADAPTER_FIB:
		debug(1, "FSACTL_OPEN_GET_ADAPTER_FIB");
		/*
		 * Pass the caller out an AdapterFibContext.
		 *
		 * Note that because we only support one opener, we
		 * basically ignore this.  Set the caller's context to a magic
		 * number just in case.
		 *
		 * The Linux code hands the driver a pointer into kernel space,
		 * and then trusts it when the caller hands it back.  Aiee!
		 * Here, we give it the proc pointer of the per-adapter aif
		 * thread.  It's only used as a sanity check in other calls.
		 */
		i = (int)sc->aifthread;
		error = copyout(&i, arg, sizeof(i));
		break;
	case FSACTL_GET_NEXT_ADAPTER_FIB:
		arg = *(caddr_t*)arg;
		/* FALLTHROUGH */
	case FSACTL_LNX_GET_NEXT_ADAPTER_FIB:
		debug(1, "FSACTL_GET_NEXT_ADAPTER_FIB");
		error = aac_getnext_aif(sc, arg);
		break;
	case FSACTL_CLOSE_GET_ADAPTER_FIB:
	case FSACTL_LNX_CLOSE_GET_ADAPTER_FIB:
		debug(1, "FSACTL_CLOSE_GET_ADAPTER_FIB");
		/* don't do anything here */
		break;
	case FSACTL_MINIPORT_REV_CHECK:
		arg = *(caddr_t*)arg;
		/* FALLTHROUGH */
	case FSACTL_LNX_MINIPORT_REV_CHECK:
		debug(1, "FSACTL_MINIPORT_REV_CHECK");
		error = aac_rev_check(sc, arg);
		break;
	case FSACTL_QUERY_DISK:
		arg = *(caddr_t*)arg;
		/* FALLTHROUGH */
	case FSACTL_LNX_QUERY_DISK:
		debug(1, "FSACTL_QUERY_DISK");
		error = aac_query_disk(sc, arg);
		break;
	case FSACTL_DELETE_DISK:
	case FSACTL_LNX_DELETE_DISK:
		/*
		 * We don't trust userland to tell us when to delete a
		 * container, rather we rely on an AIF coming from the
		 * controller
		 */
		error = 0;
		break;
	default:
		debug(1, "unsupported cmd 0x%lx\n", cmd);
		error = EINVAL;
		break;
	}
	return(error);
}

static int
aac_poll(dev_t dev, int poll_events, d_thread_t *td)
{
	struct aac_softc *sc;
	int revents;

	sc = dev->si_drv1;
	revents = 0;

	/* readable iff the AIF queue is non-empty */
	AAC_LOCK_ACQUIRE(&sc->aac_aifq_lock);
	if ((poll_events & (POLLRDNORM | POLLIN)) != 0) {
		if (sc->aac_aifq_tail != sc->aac_aifq_head)
			revents |= poll_events & (POLLIN | POLLRDNORM);
	}
	AAC_LOCK_RELEASE(&sc->aac_aifq_lock);

	if (revents == 0) {
		/* nothing ready; register for wakeup on arrival */
		if (poll_events & (POLLIN | POLLRDNORM))
			selrecord(td, &sc->rcv_select);
	}

	return (revents);
}

/*
 * Send a FIB supplied from userspace
 */
static int
aac_ioctl_sendfib(struct aac_softc *sc, caddr_t ufib)
{
	struct aac_command *cm;
	int size, error;

	debug_called(2);

	cm = NULL;

	/*
	 * Get a command
	 */
	if (aac_alloc_command(sc, &cm)) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Fetch the FIB header, then re-copy to get data as well.
*/ if ((error = copyin(ufib, cm->cm_fib, sizeof(struct aac_fib_header))) != 0) goto out; size = cm->cm_fib->Header.Size + sizeof(struct aac_fib_header); if (size > sizeof(struct aac_fib)) { device_printf(sc->aac_dev, "incoming FIB oversized (%d > %d)\n", size, sizeof(struct aac_fib)); size = sizeof(struct aac_fib); } if ((error = copyin(ufib, cm->cm_fib, size)) != 0) goto out; cm->cm_fib->Header.Size = size; cm->cm_timestamp = time_second; /* * Pass the FIB to the controller, wait for it to complete. */ if ((error = aac_wait_command(cm, 30)) != 0) { /* XXX user timeout? */ printf("aac_wait_command return %d\n", error); goto out; } /* * Copy the FIB and data back out to the caller. */ size = cm->cm_fib->Header.Size; if (size > sizeof(struct aac_fib)) { device_printf(sc->aac_dev, "outbound FIB oversized (%d > %d)\n", size, sizeof(struct aac_fib)); size = sizeof(struct aac_fib); } error = copyout(cm->cm_fib, ufib, size); out: if (cm != NULL) { aac_release_command(cm); } return(error); } /* * Handle an AIF sent to us by the controller; queue it for later reference. * If the queue fills up, then drop the older entries. */ static void aac_handle_aif(struct aac_softc *sc, struct aac_fib *fib) { struct aac_aif_command *aif; struct aac_container *co, *co_next; struct aac_mntinfo mi; struct aac_mntinforesponse mir; u_int16_t rsize; int next, found; int added = 0, i = 0; debug_called(2); aif = (struct aac_aif_command*)&fib->data[0]; aac_print_aif(sc, aif); /* Is it an event that we should care about? */ switch (aif->command) { case AifCmdEventNotify: switch (aif->data.EN.type) { case AifEnAddContainer: case AifEnDeleteContainer: /* * A container was added or deleted, but the message * doesn't tell us anything else! Re-enumerate the * containers and sort things out. */ mi.Command = VM_NameServe; mi.MntType = FT_FILESYS; do { /* * Ask the controller for its containers one at * a time. * XXX What if the controller's list changes * midway through this enumaration? 
* XXX This should be done async. */ mi.MntCount = i; rsize = sizeof(mir); if (aac_sync_fib(sc, ContainerCommand, 0, &mi, sizeof(mi), &mir, &rsize)) { debug(2, "Error probing container %d\n", i); continue; } if (rsize != sizeof(mir)) { debug(2, "Container response size too " "large\n"); continue; } /* * Check the container against our list. * co->co_found was already set to 0 in a * previous run. */ if ((mir.Status == ST_OK) && (mir.MntTable[0].VolType != CT_NONE)) { found = 0; TAILQ_FOREACH(co, &sc->aac_container_tqh, co_link) { if (co->co_mntobj.ObjectId == mir.MntTable[0].ObjectId) { co->co_found = 1; found = 1; break; } } /* * If the container matched, continue * in the list. */ if (found) { i++; continue; } /* * This is a new container. Do all the * appropriate things to set it up. */ aac_add_container(sc, &mir, 1); added = 1; } i++; } while ((i < mir.MntRespCount) && (i < AAC_MAX_CONTAINERS)); /* * Go through our list of containers and see which ones * were not marked 'found'. Since the controller didn't * list them they must have been deleted. Do the * appropriate steps to destroy the device. Also reset * the co->co_found field. 
*/ co = TAILQ_FIRST(&sc->aac_container_tqh); while (co != NULL) { if (co->co_found == 0) { device_delete_child(sc->aac_dev, co->co_disk); co_next = TAILQ_NEXT(co, co_link); AAC_LOCK_ACQUIRE(&sc-> aac_container_lock); TAILQ_REMOVE(&sc->aac_container_tqh, co, co_link); AAC_LOCK_RELEASE(&sc-> aac_container_lock); FREE(co, M_AACBUF); co = co_next; } else { co->co_found = 0; co = TAILQ_NEXT(co, co_link); } } /* Attach the newly created containers */ if (added) bus_generic_attach(sc->aac_dev); break; default: break; } default: break; } /* Copy the AIF data to the AIF queue for ioctl retrieval */ AAC_LOCK_ACQUIRE(&sc->aac_aifq_lock); next = (sc->aac_aifq_head + 1) % AAC_AIFQ_LENGTH; if (next != sc->aac_aifq_tail) { bcopy(aif, &sc->aac_aifq[next], sizeof(struct aac_aif_command)); sc->aac_aifq_head = next; /* On the off chance that someone is sleeping for an aif... */ if (sc->aac_state & AAC_STATE_AIF_SLEEPER) wakeup(sc->aac_aifq); /* Wakeup any poll()ers */ selwakeup(&sc->rcv_select); } AAC_LOCK_RELEASE(&sc->aac_aifq_lock); return; } /* * Linux Management Interface * This is soon to be removed! 
*/ #ifdef AAC_COMPAT_LINUX #include #include #include #include /* There are multiple ioctl number ranges that need to be handled */ #define AAC_LINUX_IOCTL_MIN 0x0000 #define AAC_LINUX_IOCTL_MAX 0x21ff static linux_ioctl_function_t aac_linux_ioctl; static struct linux_ioctl_handler aac_handler = {aac_linux_ioctl, AAC_LINUX_IOCTL_MIN, AAC_LINUX_IOCTL_MAX}; SYSINIT (aac_register, SI_SUB_KLD, SI_ORDER_MIDDLE, linux_ioctl_register_handler, &aac_handler); SYSUNINIT(aac_unregister, SI_SUB_KLD, SI_ORDER_MIDDLE, linux_ioctl_unregister_handler, &aac_handler); MODULE_DEPEND(aac, linux, 1, 1, 1); static int aac_linux_ioctl(struct thread *td, struct linux_ioctl_args *args) { struct file *fp; u_long cmd; + int error; debug_called(2); - fp = td->td_proc->p_fd->fd_ofiles[args->fd]; + fp = ffind_hold(td, args->fd); + if (fp == NULL) + return (EBADF); cmd = args->cmd; /* * Pass the ioctl off to our standard handler. */ - return(fo_ioctl(fp, cmd, (caddr_t)args->arg, td)); + error = (fo_ioctl(fp, cmd, (caddr_t)args->arg, td)); + fdrop(fp, td); + return (error); } #endif /* * Return the Revision of the driver to userspace and check to see if the * userspace app is possibly compatible. This is extremely bogus since * our driver doesn't follow Adaptec's versioning system. Cheat by just * returning what the card reported. */ static int aac_rev_check(struct aac_softc *sc, caddr_t udata) { struct aac_rev_check rev_check; struct aac_rev_check_resp rev_check_resp; int error = 0; debug_called(2); /* * Copyin the revision struct from userspace */ if ((error = copyin(udata, (caddr_t)&rev_check, sizeof(struct aac_rev_check))) != 0) { return error; } debug(2, "Userland revision= %d\n", rev_check.callingRevision.buildNumber); /* * Doctor up the response struct. 
*/ rev_check_resp.possiblyCompatible = 1; rev_check_resp.adapterSWRevision.external.ul = sc->aac_revision.external.ul; rev_check_resp.adapterSWRevision.buildNumber = sc->aac_revision.buildNumber; return(copyout((caddr_t)&rev_check_resp, udata, sizeof(struct aac_rev_check_resp))); } /* * Pass the caller the next AIF in their queue */ static int aac_getnext_aif(struct aac_softc *sc, caddr_t arg) { struct get_adapter_fib_ioctl agf; int error, s; debug_called(2); if ((error = copyin(arg, &agf, sizeof(agf))) == 0) { /* * Check the magic number that we gave the caller. */ if (agf.AdapterFibContext != (int)sc->aifthread) { error = EFAULT; } else { s = splbio(); error = aac_return_aif(sc, agf.AifFib); if ((error == EAGAIN) && (agf.Wait)) { sc->aac_state |= AAC_STATE_AIF_SLEEPER; while (error == EAGAIN) { error = tsleep(sc->aac_aifq, PRIBIO | PCATCH, "aacaif", 0); if (error == 0) error = aac_return_aif(sc, agf.AifFib); } sc->aac_state &= ~AAC_STATE_AIF_SLEEPER; } splx(s); } } return(error); } /* * Hand the next AIF off the top of the queue out to userspace. */ static int aac_return_aif(struct aac_softc *sc, caddr_t uptr) { int error; debug_called(2); AAC_LOCK_ACQUIRE(&sc->aac_aifq_lock); if (sc->aac_aifq_tail == sc->aac_aifq_head) { error = EAGAIN; } else { error = copyout(&sc->aac_aifq[sc->aac_aifq_tail], uptr, sizeof(struct aac_aif_command)); if (error) printf("aac_return_aif: copyout returned %d\n", error); if (!error) sc->aac_aifq_tail = (sc->aac_aifq_tail + 1) % AAC_AIFQ_LENGTH; } AAC_LOCK_RELEASE(&sc->aac_aifq_lock); return(error); } /* * Give the userland some information about the container. The AAC arch * expects the driver to be a SCSI passthrough type driver, so it expects * the containers to have b:t:l numbers. Fake it. 
*/ static int aac_query_disk(struct aac_softc *sc, caddr_t uptr) { struct aac_query_disk query_disk; struct aac_container *co; struct aac_disk *disk; int error, id; debug_called(2); disk = NULL; error = copyin(uptr, (caddr_t)&query_disk, sizeof(struct aac_query_disk)); if (error) return (error); id = query_disk.ContainerNumber; if (id == -1) return (EINVAL); AAC_LOCK_ACQUIRE(&sc->aac_container_lock); TAILQ_FOREACH(co, &sc->aac_container_tqh, co_link) { if (co->co_mntobj.ObjectId == id) break; } if (co == NULL) { query_disk.Valid = 0; query_disk.Locked = 0; query_disk.Deleted = 1; /* XXX is this right? */ } else { disk = device_get_softc(co->co_disk); query_disk.Valid = 1; query_disk.Locked = (disk->ad_flags & AAC_DISK_OPEN) ? 1 : 0; query_disk.Deleted = 0; query_disk.Bus = device_get_unit(sc->aac_dev); query_disk.Target = disk->unit; query_disk.Lun = 0; query_disk.UnMapped = 0; bcopy(disk->ad_dev_t->si_name, &query_disk.diskDeviceName[0], 10); } AAC_LOCK_RELEASE(&sc->aac_container_lock); error = copyout((caddr_t)&query_disk, uptr, sizeof(struct aac_query_disk)); return (error); } Index: head/sys/dev/streams/streams.c =================================================================== --- head/sys/dev/streams/streams.c (revision 89305) +++ head/sys/dev/streams/streams.c (revision 89306) @@ -1,410 +1,428 @@ /* * Copyright (c) 1998 Mark Newton * Copyright (c) 1994 Christos Zoulas * Copyright (c) 1997 Todd Vierling * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
The names of the authors may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Stolen from NetBSD /sys/compat/svr4/svr4_net.c. Pseudo-device driver * skeleton produced from /usr/share/examples/drivers/make_pseudo_driver.sh * in 3.0-980524-SNAP then hacked a bit (but probably not enough :-). 
* * $FreeBSD$ */ #include #include #include /* SYSINIT stuff */ #include /* cdevsw stuff */ #include /* malloc region definitions */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int svr4_soo_close __P((struct file *, struct thread *)); static int svr4_ptm_alloc __P((struct thread *)); static d_open_t streamsopen; struct svr4_sockcache_entry { struct proc *p; /* Process for the socket */ void *cookie; /* Internal cookie used for matching */ struct sockaddr_un sock;/* Pathname for the socket */ dev_t dev; /* Device where the socket lives on */ ino_t ino; /* Inode where the socket lives on */ TAILQ_ENTRY(svr4_sockcache_entry) entries; }; TAILQ_HEAD(svr4_sockcache_head, svr4_sockcache_entry) svr4_head; /* Initialization flag (set/queried by svr4_mod LKM) */ int svr4_str_initialized = 0; /* * Device minor numbers */ enum { dev_ptm = 10, dev_arp = 26, dev_icmp = 27, dev_ip = 28, dev_tcp = 35, dev_udp = 36, dev_rawip = 37, dev_unix_dgram = 38, dev_unix_stream = 39, dev_unix_ord_stream = 40 }; static dev_t dt_ptm, dt_arp, dt_icmp, dt_ip, dt_tcp, dt_udp, dt_rawip, dt_unix_dgram, dt_unix_stream, dt_unix_ord_stream; static struct fileops svr4_netops = { soo_read, soo_write, soo_ioctl, soo_poll, sokqfilter, soo_stat, svr4_soo_close }; #define CDEV_MAJOR 103 static struct cdevsw streams_cdevsw = { /* open */ streamsopen, /* close */ noclose, /* read */ noread, /* write */ nowrite, /* ioctl */ noioctl, /* poll */ nopoll, /* mmap */ nommap, /* strategy */ nostrategy, /* name */ "streams", /* maj */ CDEV_MAJOR, /* dump */ nodump, /* psize */ nopsize, /* flags */ 0, }; struct streams_softc { struct isa_device *dev; } ; #define UNIT(dev) minor(dev) /* assume one minor number per unit */ typedef struct streams_softc *sc_p; static int streams_modevent(module_t mod, int type, void *unused) { switch (type) { case MOD_LOAD: /* XXX should make sure 
it isn't already loaded first */ dt_ptm = make_dev(&streams_cdevsw, dev_ptm, 0, 0, 0666, "ptm"); dt_arp = make_dev(&streams_cdevsw, dev_arp, 0, 0, 0666, "arp"); dt_icmp = make_dev(&streams_cdevsw, dev_icmp, 0, 0, 0666, "icmp"); dt_ip = make_dev(&streams_cdevsw, dev_ip, 0, 0, 0666, "ip"); dt_tcp = make_dev(&streams_cdevsw, dev_tcp, 0, 0, 0666, "tcp"); dt_udp = make_dev(&streams_cdevsw, dev_udp, 0, 0, 0666, "udp"); dt_rawip = make_dev(&streams_cdevsw, dev_rawip, 0, 0, 0666, "rawip"); dt_unix_dgram = make_dev(&streams_cdevsw, dev_unix_dgram, 0, 0, 0666, "ticlts"); dt_unix_stream = make_dev(&streams_cdevsw, dev_unix_stream, 0, 0, 0666, "ticots"); dt_unix_ord_stream = make_dev(&streams_cdevsw, dev_unix_ord_stream, 0, 0, 0666, "ticotsord"); if (! (dt_ptm && dt_arp && dt_icmp && dt_ip && dt_tcp && dt_udp && dt_rawip && dt_unix_dgram && dt_unix_stream && dt_unix_ord_stream)) { printf("WARNING: device config for STREAMS failed\n"); printf("Suggest unloading streams KLD\n"); } return 0; case MOD_UNLOAD: /* XXX should check to see if it's busy first */ destroy_dev(dt_ptm); destroy_dev(dt_arp); destroy_dev(dt_icmp); destroy_dev(dt_ip); destroy_dev(dt_tcp); destroy_dev(dt_udp); destroy_dev(dt_rawip); destroy_dev(dt_unix_dgram); destroy_dev(dt_unix_stream); destroy_dev(dt_unix_ord_stream); return 0; default: break; } return 0; } static moduledata_t streams_mod = { "streams", streams_modevent, 0 }; DECLARE_MODULE(streams, streams_mod, SI_SUB_DRIVERS, SI_ORDER_ANY); MODULE_VERSION(streams, 1); /* * We only need open() and close() routines. open() calls socreate() * to allocate a "real" object behind the stream and mallocs some state * info for use by the svr4 emulator; close() deallocates the state * information and passes the underlying object to the normal socket close * routine. 
*/ static int streamsopen(dev_t dev, int oflags, int devtype, struct thread *td) { int type, protocol; int fd; struct file *fp; struct socket *so; int error; int family; struct proc *p = td->td_proc; PROC_LOCK(p); if (td->td_dupfd >= 0) { PROC_UNLOCK(p); return ENODEV; } PROC_UNLOCK(p); switch (minor(dev)) { case dev_udp: family = AF_INET; type = SOCK_DGRAM; protocol = IPPROTO_UDP; break; case dev_tcp: family = AF_INET; type = SOCK_STREAM; protocol = IPPROTO_TCP; break; case dev_ip: case dev_rawip: family = AF_INET; type = SOCK_RAW; protocol = IPPROTO_IP; break; case dev_icmp: family = AF_INET; type = SOCK_RAW; protocol = IPPROTO_ICMP; break; case dev_unix_dgram: family = AF_LOCAL; type = SOCK_DGRAM; protocol = 0; break; case dev_unix_stream: case dev_unix_ord_stream: family = AF_LOCAL; type = SOCK_STREAM; protocol = 0; break; case dev_ptm: return svr4_ptm_alloc(td); default: return EOPNOTSUPP; } if ((error = falloc(td, &fp, &fd)) != 0) return error; if ((error = socreate(family, &so, type, protocol, td->td_proc->p_ucred, td)) != 0) { + FILEDESC_LOCK(p->p_fd); p->p_fd->fd_ofiles[fd] = 0; + FILEDESC_UNLOCK(p->p_fd); ffree(fp); return error; } + FILEDESC_LOCK(p->p_fd); fp->f_data = (caddr_t)so; fp->f_flag = FREAD|FWRITE; fp->f_ops = &svr4_netops; fp->f_type = DTYPE_SOCKET; + FILEDESC_UNLOCK(p->p_fd); (void)svr4_stream_get(fp); PROC_LOCK(p); td->td_dupfd = fd; PROC_UNLOCK(p); return ENXIO; } static int svr4_ptm_alloc(td) struct thread *td; { struct proc *p = td->td_proc; /* * XXX this is very, very ugly. But I can't find a better * way that won't duplicate a big amount of code from * sys_open(). Ho hum... * * Fortunately for us, Solaris (at least 2.5.1) makes the * /dev/ptmx open automatically just open a pty, that (after * STREAMS I_PUSHes), is just a plain pty. fstat() is used * to get the minor device number to map to a tty. * * Cycle through the names. If sys_open() returns ENOENT (or * ENXIO), short circuit the cycle and exit. 
*/ static char ptyname[] = "/dev/ptyXX"; static char ttyletters[] = "pqrstuwxyzPQRST"; static char ttynumbers[] = "0123456789abcdef"; caddr_t sg = stackgap_init(); char *path = stackgap_alloc(&sg, sizeof(ptyname)); struct open_args oa; int l = 0, n = 0; register_t fd = -1; int error; SCARG(&oa, path) = path; SCARG(&oa, flags) = O_RDWR; SCARG(&oa, mode) = 0; while (fd == -1) { ptyname[8] = ttyletters[l]; ptyname[9] = ttynumbers[n]; if ((error = copyout(ptyname, path, sizeof(ptyname))) != 0) return error; switch (error = open(td, &oa)) { case ENOENT: case ENXIO: return error; case 0: PROC_LOCK(p); td->td_dupfd = td->td_retval[0]; PROC_UNLOCK(p); return ENXIO; default: if (ttynumbers[++n] == '\0') { if (ttyletters[++l] == '\0') break; n = 0; } } } return ENOENT; } struct svr4_strm * svr4_stream_get(fp) struct file *fp; { struct socket *so; struct svr4_strm *st; if (fp == NULL || fp->f_type != DTYPE_SOCKET) return NULL; so = (struct socket *) fp->f_data; - if (so->so_emuldata) + /* + * mpfixme: lock socketbuffer here + */ + if (so->so_emuldata) { return so->so_emuldata; + } /* Allocate a new one. */ st = malloc(sizeof(struct svr4_strm), M_TEMP, M_WAITOK); st->s_family = so->so_proto->pr_domain->dom_family; st->s_cmd = ~0; st->s_afd = -1; st->s_eventmask = 0; - so->so_emuldata = st; - fp->f_ops = &svr4_netops; + /* + * avoid a race where we loose due to concurrancy issues + * of two threads trying to allocate the so_emuldata. 
+ */ + if (so->so_emuldata) { + /* lost the race, use the existing emuldata */ + FREE(st, M_TEMP); + st = so->so_emuldata; + } else { + /* we won, or there was no race, use our copy */ + so->so_emuldata = st; + fp->f_ops = &svr4_netops; + } return st; } void svr4_delete_socket(p, fp) struct proc *p; struct file *fp; { struct svr4_sockcache_entry *e; void *cookie = ((struct socket *) fp->f_data)->so_emuldata; while (svr4_str_initialized != 2) { if (atomic_cmpset_acq_int(&svr4_str_initialized, 0, 1)) { TAILQ_INIT(&svr4_head); atomic_store_rel_int(&svr4_str_initialized, 2); } return; } TAILQ_FOREACH(e, &svr4_head, entries) if (e->p == p && e->cookie == cookie) { TAILQ_REMOVE(&svr4_head, e, entries); DPRINTF(("svr4_delete_socket: %s [%p,%d,%d]\n", e->sock.sun_path, p, (int)e->dev, e->ino)); free(e, M_TEMP); return; } } static int svr4_soo_close(struct file *fp, struct thread *td) { struct socket *so = (struct socket *)fp->f_data; /* CHECKUNIT_DIAG(ENXIO);*/ svr4_delete_socket(td->td_proc, fp); free(so->so_emuldata, M_TEMP); return soo_close(fp, td); - return (0); } Index: head/sys/dev/tdfx/tdfx_pci.c =================================================================== --- head/sys/dev/tdfx/tdfx_pci.c (revision 89305) +++ head/sys/dev/tdfx/tdfx_pci.c (revision 89306) @@ -1,866 +1,870 @@ /* * Copyright (c) 2000-2001 by Coleman Kane * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Gardner Buchanan. * 4. The name of Gardner Buchanan may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ /* 3dfx driver for FreeBSD 4.x - Finished 11 May 2000, 12:25AM ET * * Copyright (C) 2000-2001, by Coleman Kane , * based upon the 3dfx driver written for linux, by Daryll Straus, Jon Taylor, * and Jens Axboe, located at http://linux.3dfx.com. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* rman.h depends on machine/bus.h */ #include #include #include /* This must come first */ #include "opt_tdfx.h" #ifdef TDFX_LINUX #include #endif #include #include #include static devclass_t tdfx_devclass; static int tdfx_count = 0; /* Set up the boot probe/attach routines */ static device_method_t tdfx_methods[] = { DEVMETHOD(device_probe, tdfx_probe), DEVMETHOD(device_attach, tdfx_attach), DEVMETHOD(device_detach, tdfx_detach), DEVMETHOD(device_shutdown, tdfx_shutdown), { 0, 0 } }; MALLOC_DEFINE(M_TDFX,"TDFX Driver","3DFX Graphics[/2D]/3D Accelerator(s)"); #ifdef TDFX_LINUX MODULE_DEPEND(tdfx, linux, 1, 1, 1); LINUX_IOCTL_SET(tdfx, LINUX_IOCTL_TDFX_MIN, LINUX_IOCTL_TDFX_MAX); #endif /* Char. Dev. file operations structure */ static struct cdevsw tdfx_cdev = { tdfx_open, /* open */ tdfx_close, /* close */ noread, /* read */ nowrite, /* write */ tdfx_ioctl, /* ioctl */ nopoll, /* poll */ tdfx_mmap, /* mmap */ nostrategy, /* strategy */ "tdfx", /* dev name */ CDEV_MAJOR, /* char major */ nodump, /* dump */ nopsize, /* size */ 0, /* flags (no set flags) */ }; static int tdfx_probe(device_t dev) { /* * probe routine called on kernel boot to register supported devices. We get * a device structure to work with, and we can test the VENDOR/DEVICE IDs to * see if this PCI device is one that we support. Return 0 if yes, ENXIO if * not. 
*/ switch(pci_get_devid(dev)) { case PCI_DEVICE_ALLIANCE_AT3D: device_set_desc(dev, "ProMotion At3D 3D Accelerator"); return 0; case PCI_DEVICE_3DFX_VOODOO2: device_set_desc(dev, "3DFX Voodoo II 3D Accelerator"); return 0; /*case PCI_DEVICE_3DFX_BANSHEE: device_set_desc(dev, "3DFX Voodoo Banshee 2D/3D Graphics Accelerator"); return 0; case PCI_DEVICE_3DFX_VOODOO3: device_set_desc(dev, "3DFX Voodoo3 2D/3D Graphics Accelerator"); return 0;*/ case PCI_DEVICE_3DFX_VOODOO1: device_set_desc(dev, "3DFX Voodoo Graphics 3D Accelerator"); return 0;; }; return ENXIO; } static int tdfx_attach(device_t dev) { /* * The attach routine is called after the probe routine successfully says it * supports a given card. We now proceed to initialize this card for use with * the system. I want to map the device memory for userland allocation and * fill an information structure with information on this card. I'd also like * to set Write Combining with the MTRR code so that we can hopefully speed * up memory writes. The last thing is to register the character device * interface to the card, so we can open it from /dev/3dfxN, where N is a * small, whole number. 
*/ struct tdfx_softc *tdfx_info; u_long val; /* rid value tells bus_alloc_resource where to find the addresses of ports or * of memory ranges in the PCI config space*/ int rid = PCIR_MAPS; /* Increment the card counter (for the ioctl code) */ tdfx_count++; /* Enable MemMap on Voodoo */ val = pci_read_config(dev, PCIR_COMMAND, 2); val |= (PCIM_CMD_MEMEN); pci_write_config(dev, PCIR_COMMAND, val, 2); val = pci_read_config(dev, PCIR_COMMAND, 2); /* Fill the soft config struct with info about this device*/ tdfx_info = device_get_softc(dev); tdfx_info->dev = dev; tdfx_info->vendor = pci_get_vendor(dev); tdfx_info->type = pci_get_devid(dev) >> 16; tdfx_info->bus = pci_get_bus(dev); tdfx_info->dv = pci_get_slot(dev); tdfx_info->curFile = NULL; /* * Get the Memory Location from the PCI Config, mask out lower word, since * the config space register is only one word long (this is nicer than a * bitshift). */ tdfx_info->addr0 = (pci_read_config(dev, 0x10, 4) & 0xffff0000); #ifdef DEBUG device_printf(dev, "Base0 @ 0x%x\n", tdfx_info->addr0); #endif /* Notify the VM that we will be mapping some memory later */ tdfx_info->memrange = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0, ~0, 1, RF_ACTIVE | RF_SHAREABLE); if(tdfx_info->memrange == NULL) { #ifdef DEBUG device_printf(dev, "Error mapping mem, won't be able to use mmap()\n"); #endif tdfx_info->memrid = 0; } else { tdfx_info->memrid = rid; #ifdef DEBUG device_printf(dev, "Mapped to: 0x%x\n", (unsigned int)rman_get_start(tdfx_info->memrange)); #endif } /* Setup for Voodoo3 and Banshee, PIO and an extram Memrange */ if(pci_get_devid(dev) == PCI_DEVICE_3DFX_VOODOO3 || pci_get_devid(dev) == PCI_DEVICE_3DFX_BANSHEE) { rid = 0x14; /* 2nd mem map */ tdfx_info->addr1 = (pci_read_config(dev, 0x14, 4) & 0xffff0000); #ifdef DEBUG device_printf(dev, "Base1 @ 0x%x\n", tdfx_info->addr1); #endif tdfx_info->memrange2 = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0, ~0, 1, RF_ACTIVE | RF_SHAREABLE); if(tdfx_info->memrange2 == NULL) { 
#ifdef DEBUG device_printf(dev, "Mem1 couldn't be allocated, glide may not work."); #endif tdfx_info->memrid2 = 0; } else { tdfx_info->memrid2 = rid; } /* Now to map the PIO stuff */ rid = PCIR_IOBASE0_2; tdfx_info->pio0 = pci_read_config(dev, 0x2c, 2); tdfx_info->pio0max = pci_read_config(dev, 0x30, 2) + tdfx_info->pio0; tdfx_info->piorange = bus_alloc_resource(dev, SYS_RES_IOPORT, &rid, 0, ~0, 1, RF_ACTIVE | RF_SHAREABLE); if(tdfx_info->piorange == NULL) { #ifdef DEBUG device_printf(dev, "Couldn't map PIO range."); #endif tdfx_info->piorid = 0; } else { tdfx_info->piorid = rid; } } else { tdfx_info->addr1 = 0; tdfx_info->memrange2 = NULL; tdfx_info->piorange = NULL; } /* * Set Writecombining, or at least Uncacheable for the memory region, if we * are able to */ if(tdfx_setmtrr(dev) != 0) { #ifdef DEBUG device_printf(dev, "Some weird error setting MTRRs"); #endif return -1; } /* * make_dev registers the cdev to access the 3dfx card from /dev * use hex here for the dev num, simply to provide better support if > 10 * voodoo cards, for the mad. The user must set the link, or use MAKEDEV. * Why would we want that many voodoo cards anyhow? 
*/ tdfx_info->devt = make_dev(&tdfx_cdev, dev->unit, 0, 0, 02660, "3dfx%x", dev->unit); return 0; } static int tdfx_detach(device_t dev) { struct tdfx_softc* tdfx_info; int retval; tdfx_info = device_get_softc(dev); /* Delete allocated resource, of course */ bus_release_resource(dev, SYS_RES_MEMORY, tdfx_info->memrid, tdfx_info->memrange); /* Release extended Voodoo3/Banshee resources */ if(pci_get_devid(dev) == PCI_DEVICE_3DFX_BANSHEE || pci_get_devid(dev) == PCI_DEVICE_3DFX_VOODOO3) { if(tdfx_info->memrange2 != NULL) bus_release_resource(dev, SYS_RES_MEMORY, tdfx_info->memrid2, tdfx_info->memrange); /* if(tdfx_info->piorange != NULL) bus_release_resource(dev, SYS_RES_IOPORT, tdfx_info->piorid, tdfx_info->piorange);*/ } /* Though it is safe to leave the WRCOMB support since the mem driver checks for it, we should remove it in order to free an MTRR for another device */ retval = tdfx_clrmtrr(dev); #ifdef DEBUG if(retval != 0) printf("tdfx: For some reason, I couldn't clear the mtrr\n"); #endif /* Remove device entry when it can no longer be accessed */ destroy_dev(tdfx_info->devt); return(0); } static int tdfx_shutdown(device_t dev) { #ifdef DEBUG device_printf(dev, "tdfx: Device Shutdown\n"); #endif return 0; } static int tdfx_clrmtrr(device_t dev) { /* This function removes the MTRR set by the attach call, so it can be used * in the future by other drivers. */ int retval, act; struct tdfx_softc *tdfx_info = device_get_softc(dev); act = MEMRANGE_SET_REMOVE; retval = mem_range_attr_set(&tdfx_info->mrdesc, &act); return retval; } static int tdfx_setmtrr(device_t dev) { /* * This is the MTRR setting function for the 3dfx card. It is called from * tdfx_attach. If we can't set the MTRR properly, it's not the end of the * world. We can still continue, just with slightly (very slightly) degraded * performance. 
 */
	int retval = 0, act;
	struct tdfx_softc *tdfx_info = device_get_softc(dev);

	/* The older Voodoo cards have a shorter memrange than the newer ones */
	if((pci_get_devid(dev) == PCI_DEVICE_3DFX_VOODOO1) ||
	   (pci_get_devid(dev) == PCI_DEVICE_3DFX_VOODOO2)) {
		tdfx_info->mrdesc.mr_len = 0x400000;
		/* The memory descriptor is described as the top 15 bits of the real
		   address */
		tdfx_info->mrdesc.mr_base = tdfx_info->addr0 & 0xfffe0000;
	}
	else if((pci_get_devid(dev) == PCI_DEVICE_3DFX_VOODOO3) ||
	        (pci_get_devid(dev) == PCI_DEVICE_3DFX_BANSHEE)) {
		tdfx_info->mrdesc.mr_len = 0x1000000;
		/* The Voodoo3 and Banshee LFB is the second memory address */
		/* The memory descriptor is described as the top 15 bits of the real
		   address */
		tdfx_info->mrdesc.mr_base = tdfx_info->addr1 & 0xfffe0000;
	}
	else
		return 0;
	/*
	 * The Alliance Pro Motion AT3D was not mentioned in the linux
	 * driver as far as MTRR support goes, so I just won't put the
	 * code in here for it. This is where it should go, though.
	 */

	/* Firstly, try to set write combining */
	tdfx_info->mrdesc.mr_flags = MDF_WRITECOMBINE;
	bcopy("tdfx", &tdfx_info->mrdesc.mr_owner, 4);
	act = MEMRANGE_SET_UPDATE;
	retval = mem_range_attr_set(&tdfx_info->mrdesc, &act);

	if(retval == 0) {
#ifdef DEBUG
		device_printf(dev, "MTRR Set Correctly for tdfx\n");
#endif
	} else if((pci_get_devid(dev) == PCI_DEVICE_3DFX_VOODOO2) ||
	          (pci_get_devid(dev) == PCI_DEVICE_3DFX_VOODOO1)) {
		/* if, for some reason we can't set the WRCOMB range with the V1/V2, we
		 * can still possibly use the UNCACHEABLE region for it instead, and help
		 * out in a small way */
		tdfx_info->mrdesc.mr_flags = MDF_UNCACHEABLE;
		/* This length of 1000h was taken from the linux device driver... */
		tdfx_info->mrdesc.mr_len = 0x1000;
		/*
		 * NOTE(review): no second mem_range_attr_set() call is made
		 * after switching to MDF_UNCACHEABLE, so the fallback
		 * descriptor prepared above is never actually installed and
		 * retval still holds the earlier failure code when the DEBUG
		 * block below tests it -- TODO confirm whether a retry call
		 * was intended here.
		 */
		/*
		 * If, for some reason, we can't set the MTRR (N/A?)
		 * we may still continue
		 */
#ifdef DEBUG
		if(retval == 0) {
			device_printf(dev, "MTRR Set Type Uncacheable %x\n",
			    (u_int32_t)tdfx_info->mrdesc.mr_base);
		} else {
			device_printf(dev, "Couldn't Set MTRR\n");
		}
#endif
	}
	/* NOTE(review): this else is only compiled in under DEBUG, so the
	 * statement structure differs between DEBUG and non-DEBUG builds. */
#ifdef DEBUG
	else {
		device_printf(dev, "Couldn't Set MTRR\n");
		return 0;
	}
#endif
	return 0;
}

static int
tdfx_open(dev_t dev, int flags, int fmt, struct thread *td)
{
	/*
	 * The open cdev method handles open(2) calls to /dev/3dfx[n].
	 * Only one opener is allowed at a time (busy flag).
	 */
	struct tdfx_softc *tdfx_info = devclass_get_softc(tdfx_devclass,
	    UNIT(minor(dev)));
	if(tdfx_info->busy != 0) return EBUSY;
#ifdef DEBUG
	printf("3dfx: Opened by #%d\n", td->td_proc->p_pid);
#endif
	/* Set the driver as busy */
	tdfx_info->busy++;
	return 0;
}

static int
tdfx_close(dev_t dev, int fflag, int devtype, struct thread *td)
{
	/*
	 * The close cdev method handles close(2) calls to /dev/3dfx[n].
	 * We'll always want to close the device when it's called; clears the
	 * busy flag set in tdfx_open().
	 */
	struct tdfx_softc *tdfx_info = devclass_get_softc(tdfx_devclass,
	    UNIT(minor(dev)));
	if(tdfx_info->busy == 0) return EBADF;
	tdfx_info->busy = 0;
#ifdef DEBUG
	printf("Closed by #%d\n", td->td_proc->p_pid);
#endif
	return 0;
}

static int
tdfx_mmap(dev_t dev, vm_offset_t offset, int nprot)
{
	/*
	 * mmap(2) is called by a user process to request that an area of memory
	 * associated with this device be mapped for the process to work with. Nprot
	 * holds the protections requested, PROT_READ, PROT_WRITE, or both.
	 */

	/**** OLD GET CONFIG ****/
	/* struct tdfx_softc* tdfx_info; */

	/* Get the configuration for our card XXX*/
	/*tdfx_info = (struct tdfx_softc*)devclass_get_softc(tdfx_devclass,
	    UNIT(minor(dev)));*/
	/************************/

	/* Hardwired to at most two cards: probe softc unit 0 first, then
	 * unit 1 if a second card was counted at attach time. */
	struct tdfx_softc* tdfx_info[2];
	tdfx_info[0] = (struct tdfx_softc*)devclass_get_softc(tdfx_devclass, 0);

	/* If, for some reason, its not configured, we bail out */
	if(tdfx_info[0] == NULL) {
#ifdef DEBUG
		printf("tdfx: tdfx_info (softc) is NULL\n");
#endif
		return -1;
	}

	/* We must stay within the bound of our address space */
	/* Match the top byte of the requested offset against card 0's BAR0. */
	if((offset & 0xff000000) == tdfx_info[0]->addr0) {
		offset &= 0xffffff;
		return atop(rman_get_start(tdfx_info[0]->memrange) + offset);
	}

	if(tdfx_count > 1) {
		tdfx_info[1] = (struct tdfx_softc*)devclass_get_softc(tdfx_devclass, 1);
		if((offset & 0xff000000) == tdfx_info[1]->addr0) {
			offset &= 0xffffff;
			return atop(rman_get_start(tdfx_info[1]->memrange) + offset);
		}
	}

	/* See if the Banshee/V3 LFB is being requested */
	/*if(tdfx_info->memrange2 != NULL && (offset & 0xff000000) ==
	      tdfx_info->addr1) {
		offset &= 0xffffff;
		return atop(rman_get_start(tdfx_info[1]->memrange2) + offset);
	}*/ /* VoodooNG code */

	/* The ret call */
	/* atop -> address to page
	 * rman_get_start, get the (struct resource*)->r_start member,
	 * the mapping base address.
*/ return -1; } static int tdfx_query_boards(void) { /* * This returns the number of installed tdfx cards, we have been keeping * count, look at tdfx_attach */ return tdfx_count; } static int tdfx_query_fetch(u_int cmd, struct tdfx_pio_data *piod) { /* XXX Comment this later, after careful inspection and spring cleaning :) */ /* Various return values 8bit-32bit */ u_int8_t ret_byte; u_int16_t ret_word; u_int32_t ret_dword; struct tdfx_softc* tdfx_info = NULL; /* This one depend on the tdfx_* structs being properly initialized */ /*piod->device &= 0xf;*/ if((piod == NULL) ||(tdfx_count <= piod->device) || (piod->device < 0)) { #ifdef DEBUG printf("tdfx: Bad device or internal struct in tdfx_query_fetch\n"); #endif return -EINVAL; } tdfx_info = (struct tdfx_softc*)devclass_get_softc(tdfx_devclass, piod->device); if(tdfx_info == NULL) return -ENXIO; /* We must restrict the size reads from the port, since to high or low of a * size witll result in wrong data being passed, and that's bad */ /* A few of these were pulled during the attach phase */ switch(piod->port) { case PCI_VENDOR_ID_FREEBSD: if(piod->size != 2) return -EINVAL; copyout(&tdfx_info->vendor, piod->value, piod->size); return 0; case PCI_DEVICE_ID_FREEBSD: if(piod->size != 2) return -EINVAL; copyout(&tdfx_info->type, piod->value, piod->size); return 0; case PCI_BASE_ADDRESS_0_FREEBSD: if(piod->size != 4) return -EINVAL; copyout(&tdfx_info->addr0, piod->value, piod->size); return 0; case PCI_BASE_ADDRESS_1_FREEBSD: if(piod->size != 4) return -EINVAL; copyout(&tdfx_info->addr1, piod->value, piod->size); return 0; case PCI_PRIBUS_FREEBSD: if(piod->size != 1) return -EINVAL; break; case PCI_IOBASE_0_FREEBSD: if(piod->size != 2) return -EINVAL; break; case PCI_IOLIMIT_0_FREEBSD: if(piod->size != 2) return -EINVAL; break; case SST1_PCI_SPECIAL1_FREEBSD: if(piod->size != 4) return -EINVAL; break; case PCI_REVISION_ID_FREEBSD: if(piod->size != 1) return -EINVAL; break; case SST1_PCI_SPECIAL4_FREEBSD: if(piod->size 
!= 4) return -EINVAL; break; default: return -EINVAL; } /* Read the value and return */ switch(piod->size) { case 1: ret_byte = pci_read_config(tdfx_info[piod->device].dev, piod->port, 1); copyout(&ret_byte, piod->value, 1); break; case 2: ret_word = pci_read_config(tdfx_info[piod->device].dev, piod->port, 2); copyout(&ret_word, piod->value, 2); break; case 4: ret_dword = pci_read_config(tdfx_info[piod->device].dev, piod->port, 4); copyout(&ret_dword, piod->value, 4); break; default: return -EINVAL; } return 0; } static int tdfx_query_update(u_int cmd, struct tdfx_pio_data *piod) { /* XXX Comment this later, after careful inspection and spring cleaning :) */ /* Return vals */ u_int8_t ret_byte; u_int16_t ret_word; u_int32_t ret_dword; /* Port vals, mask */ u_int32_t retval, preval, mask; struct tdfx_softc* tdfx_info = NULL; if((piod == NULL) || (piod->device >= (tdfx_count & 0xf))) { #ifdef DEBUG printf("tdfx: Bad struct or device in tdfx_query_update\n"); #endif return -EINVAL; } tdfx_info = (struct tdfx_softc*)devclass_get_softc(tdfx_devclass, piod->device); if(tdfx_info == NULL) return -ENXIO; /* Code below this line in the fuction was taken from the * Linux driver and converted for freebsd. 
	 */

	/* Check the size for all the ports, to make sure stuff doesn't get messed up
	 * by poorly written clients */
	switch(piod->port) {
	case PCI_COMMAND_FREEBSD:
		if(piod->size != 2) return -EINVAL;
		break;
	case SST1_PCI_SPECIAL1_FREEBSD:
		if(piod->size != 4) return -EINVAL;
		break;
	case SST1_PCI_SPECIAL2_FREEBSD:
		if(piod->size != 4) return -EINVAL;
		break;
	case SST1_PCI_SPECIAL3_FREEBSD:
		if(piod->size != 4) return -EINVAL;
		break;
	case SST1_PCI_SPECIAL4_FREEBSD:
		if(piod->size != 4) return -EINVAL;
		break;
	default:
		return -EINVAL;
	}
	/* Read the current value (the config register containing the
	 * aligned dword that holds piod->port) */
	retval = pci_read_config(tdfx_info->dev, piod->port & ~3, 4);

	/* These set up a mask to use, since apparently they wanted to write 4 bytes
	 * at once to the ports */
	switch (piod->size) {
	case 1:
		copyin(piod->value, &ret_byte, 1);
		/* Shift the byte into its lane within the aligned dword */
		preval = ret_byte << (8 * (piod->port & 0x3));
		mask = 0xff << (8 * (piod->port & 0x3));
		break;
	case 2:
		copyin(piod->value, &ret_word, 2);
		preval = ret_word << (8 * (piod->port & 0x3));
		mask = 0xffff << (8 * (piod->port & 0x3));
		break;
	case 4:
		copyin(piod->value, &ret_dword, 4);
		preval = ret_dword;
		mask = ~0;
		break;
	default:
		return -EINVAL;
	}
	/* Finally, combine the values and write it to the port */
	retval = (retval & ~mask) | preval;
	pci_write_config(tdfx_info->dev, piod->port & ~3, retval, 4);

	return 0;
}

/* For both of these, I added a variable named workport of type u_int so
 * that I could eliminate the warning about my data type size.
The * applications expect the port to be of type short, so I needed to change * this within the function */ static int tdfx_do_pio_rd(struct tdfx_pio_data *piod) { /* Return val */ u_int8_t ret_byte; u_int workport; struct tdfx_softc *tdfx_info = (struct tdfx_softc*)devclass_get_softc(tdfx_devclass, piod->device); /* Restricts the access of ports other than those we use */ if(((piod->port != VGA_INPUT_STATUS_1C) || (piod->port != SC_INDEX) || (piod->port != SC_DATA) || (piod->port != VGA_MISC_OUTPUT_READ)) && (piod->port < tdfx_info->pio0) && (piod->port > tdfx_info->pio0max)) return -EPERM; /* All VGA STATUS REGS are byte registers, size should never be > 1 */ if(piod->size != 1) { return -EINVAL; } /* Write the data to the intended port */ workport = piod->port; ret_byte = inb(workport); copyout(&ret_byte, piod->value, sizeof(u_int8_t)); return 0; } static int tdfx_do_pio_wt(struct tdfx_pio_data *piod) { /* return val */ u_int8_t ret_byte; u_int workport; struct tdfx_softc *tdfx_info = (struct tdfx_softc*)devclass_get_softc(tdfx_devclass, piod->device); /* Replace old switch w/ massive if(...) */ /* Restricts the access of ports other than those we use */ if(((piod->port != SC_INDEX) && (piod->port != SC_DATA) && (piod->port != VGA_MISC_OUTPUT_READ)) /* Can't write VGA_ST_1C */ && (piod->port < tdfx_info->pio0) && (piod->port > tdfx_info->pio0max)) return -EPERM; /* All VGA STATUS REGS are byte registers, size should never be > 1 */ if(piod->size != 1) { return -EINVAL; } /* Write the data to the intended port */ copyin(piod->value, &ret_byte, sizeof(u_int8_t)); workport = piod->port; outb(workport, ret_byte); return 0; } static int tdfx_do_query(u_int cmd, struct tdfx_pio_data *piod) { /* There are three sub-commands to the query 0x33 */ switch(_IOC_NR(cmd)) { case 2: return tdfx_query_boards(); break; case 3: return tdfx_query_fetch(cmd, piod); break; case 4: return tdfx_query_update(cmd, piod); break; default: /* In case we are thrown a bogus sub-command! 
*/ #ifdef DEBUG printf("Bad Sub-cmd: 0x%x\n", _IOC_NR(cmd)); #endif return -EINVAL; }; } static int tdfx_do_pio(u_int cmd, struct tdfx_pio_data *piod) { /* Two types of PIO, INPUT and OUTPUT, as the name suggests */ switch(_IOC_DIR(cmd)) { case IOCV_OUT: return tdfx_do_pio_rd(piod); break; case IOCV_IN: return tdfx_do_pio_wt(piod); break; default: return -EINVAL; }; } /* Calls to ioctl(2) eventually end up here. Unhandled ioctls return an ENXIO, * normally, you would read in the data pointed to by data, then write your * output to it. The ioctl *should* normally return zero if everything is * alright, but 3dfx didn't make it that way... * * For all of the ioctl code, in the event of a real error, * we return -Exxxx rather than simply Exxxx. The reason for this * is that the ioctls actually RET information back to the program * sometimes, rather than filling it in the passed structure. We * want to distinguish errors from useful data, and maintain compatibility. * * There is this portion of the proc struct called p_retval[], we can store a * return value in td->td_retval[0] and place the return value if it is positive * in there, then we can return 0 (good). If the return value is negative, we * can return -retval and the error should be properly handled. 
*/ static int tdfx_ioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct thread *td) { int retval = 0; struct tdfx_pio_data *piod = (struct tdfx_pio_data*)data; #ifdef DEBUG printf("IOCTL'd by #%d, cmd: 0x%x, data: 0x%x\n", td->td_proc->p_pid, (u_int32_t)cmd, (unsigned int)piod); #endif switch(_IOC_TYPE(cmd)) { /* Return the real error if negative, or simply stick the valid return * in td->td_retval */ case 0x33: /* The '3'(0x33) type IOCTL is for querying the installed cards */ if((retval = tdfx_do_query(cmd, piod)) > 0) td->td_retval[0] = retval; else return -retval; break; case 0: /* The 0 type IOCTL is for programmed I/O methods */ if((tdfx_do_pio(cmd, piod)) > 0) td->td_retval[0] = retval; else return -retval; break; default: /* Technically, we won't reach this from linux emu, but when glide * finally gets ported, watch out! */ #ifdef DEBUG printf("Bad IOCTL from #%d\n", td->td_proc->p_pid); #endif return ENXIO; } return 0; } #ifdef TDFX_LINUX /* * Linux emulation IOCTL for /dev/tdfx */ static int linux_ioctl_tdfx(struct thread *td, struct linux_ioctl_args* args) { int error = 0; u_long cmd = args->cmd & 0xffff; /* The structure passed to ioctl has two shorts, one int and one void*. */ char d_pio[2*sizeof(short) + sizeof(int) + sizeof(void*)]; - struct file *fp = td->td_proc->p_fd->fd_ofiles[args->fd]; + struct file *fp; + fp = ffind_hold(td, args->fd); + if (fp == NULL) + return (EBADF); /* We simply copy the data and send it right to ioctl */ copyin((caddr_t)args->arg, &d_pio, sizeof(d_pio)); error = fo_ioctl(fp, cmd, (caddr_t)&d_pio, td); + fdrop(fp, td); return error; } #endif /* TDFX_LINUX */ /* This is the device driver struct. This is sent to the driver subsystem to * register the method structure and the info strcut space for this particular * instance of the driver. */ static driver_t tdfx_driver = { "tdfx", tdfx_methods, sizeof(struct tdfx_softc), }; /* Tell Mr. Kernel about us! 
*/ DRIVER_MODULE(tdfx, pci, tdfx_driver, tdfx_devclass, 0, 0); Index: head/sys/fs/fdescfs/fdesc_vfsops.c =================================================================== --- head/sys/fs/fdescfs/fdesc_vfsops.c (revision 89305) +++ head/sys/fs/fdescfs/fdesc_vfsops.c (revision 89306) @@ -1,225 +1,227 @@ /* * Copyright (c) 1992, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)fdesc_vfsops.c 8.4 (Berkeley) 1/21/94 * * $FreeBSD$ */ /* * /dev/fd Filesystem */ #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_FDESCMNT, "FDESC mount", "FDESC mount structure"); static int fdesc_mount __P((struct mount *mp, char *path, caddr_t data, struct nameidata *ndp, struct thread *td)); static int fdesc_unmount __P((struct mount *mp, int mntflags, struct thread *td)); static int fdesc_statfs __P((struct mount *mp, struct statfs *sbp, struct thread *td)); /* * Mount the per-process file descriptors (/dev/fd) */ static int fdesc_mount(mp, path, data, ndp, td) struct mount *mp; char *path; caddr_t data; struct nameidata *ndp; struct thread *td; { int error = 0; struct fdescmount *fmp; struct vnode *rvp; /* * Update is a no-op */ if (mp->mnt_flag & MNT_UPDATE) return (EOPNOTSUPP); error = fdesc_allocvp(Froot, FD_ROOT, mp, &rvp, td); if (error) return (error); MALLOC(fmp, struct fdescmount *, sizeof(struct fdescmount), M_FDESCMNT, M_WAITOK); /* XXX */ rvp->v_type = VDIR; rvp->v_flag |= VROOT; fmp->f_root = rvp; /* XXX -- don't mark as local to work around fts() problems */ /*mp->mnt_flag |= MNT_LOCAL;*/ mp->mnt_data = (qaddr_t) fmp; vfs_getnewfsid(mp); bzero(mp->mnt_stat.f_mntfromname, MNAMELEN); bcopy("fdesc", mp->mnt_stat.f_mntfromname, sizeof("fdesc")); (void)fdesc_statfs(mp, &mp->mnt_stat, td); return (0); } static int fdesc_unmount(mp, mntflags, td) struct mount 
*mp; int mntflags; struct thread *td; { int error; int flags = 0; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; /* * Clear out buffer cache. I don't think we * ever get anything cached at this level at the * moment, but who knows... * * There is 1 extra root vnode reference corresponding * to f_root. */ if ((error = vflush(mp, 1, flags)) != 0) return (error); /* * Finally, throw away the fdescmount structure */ free(mp->mnt_data, M_FDESCMNT); /* XXX */ mp->mnt_data = 0; return (0); } int fdesc_root(mp, vpp) struct mount *mp; struct vnode **vpp; { struct thread *td = curthread; /* XXX */ struct vnode *vp; /* * Return locked reference to root. */ vp = VFSTOFDESC(mp)->f_root; VREF(vp); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); *vpp = vp; return (0); } static int fdesc_statfs(mp, sbp, td) struct mount *mp; struct statfs *sbp; struct thread *td; { struct filedesc *fdp; int lim; int i; int last; int freefd; /* * Compute number of free file descriptors. * [ Strange results will ensue if the open file * limit is ever reduced below the current number * of open files... ] */ lim = td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur; fdp = td->td_proc->p_fd; + FILEDESC_LOCK(fdp); last = min(fdp->fd_nfiles, lim); freefd = 0; for (i = fdp->fd_freefile; i < last; i++) if (fdp->fd_ofiles[i] == NULL) freefd++; /* * Adjust for the fact that the fdesc array may not * have been fully allocated yet. */ if (fdp->fd_nfiles < lim) freefd += (lim - fdp->fd_nfiles); + FILEDESC_UNLOCK(fdp); sbp->f_flags = 0; sbp->f_bsize = DEV_BSIZE; sbp->f_iosize = DEV_BSIZE; sbp->f_blocks = 2; /* 1K to keep df happy */ sbp->f_bfree = 0; sbp->f_bavail = 0; sbp->f_files = lim + 1; /* Allow for "." 
*/ sbp->f_ffree = freefd; /* See comments above */ if (sbp != &mp->mnt_stat) { sbp->f_type = mp->mnt_vfc->vfc_typenum; bcopy(&mp->mnt_stat.f_fsid, &sbp->f_fsid, sizeof(sbp->f_fsid)); bcopy(mp->mnt_stat.f_mntonname, sbp->f_mntonname, MNAMELEN); bcopy(mp->mnt_stat.f_mntfromname, sbp->f_mntfromname, MNAMELEN); } return (0); } static struct vfsops fdesc_vfsops = { fdesc_mount, vfs_stdstart, fdesc_unmount, fdesc_root, vfs_stdquotactl, fdesc_statfs, vfs_stdsync, vfs_stdvget, vfs_stdfhtovp, vfs_stdcheckexp, vfs_stdvptofh, fdesc_init, vfs_stduninit, vfs_stdextattrctl, }; VFS_SET(fdesc_vfsops, fdescfs, VFCF_SYNTHETIC); Index: head/sys/fs/fdescfs/fdesc_vnops.c =================================================================== --- head/sys/fs/fdescfs/fdesc_vnops.c (revision 89305) +++ head/sys/fs/fdescfs/fdesc_vnops.c (revision 89306) @@ -1,563 +1,575 @@ /* * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)fdesc_vnops.c 8.9 (Berkeley) 1/21/94 * * $FreeBSD$ */ /* * /dev/fd Filesystem */ #include #include #include #include #include #include /* boottime */ #include #include #include /* Must come after sys/malloc.h */ #include #include #include #include #include #include #define FDL_WANT 0x01 #define FDL_LOCKED 0x02 static int fdcache_lock; static vop_t **fdesc_vnodeop_p; #define NFDCACHE 4 #define FD_NHASH(ix) \ (&fdhashtbl[(ix) & fdhash]) static LIST_HEAD(fdhashhead, fdescnode) *fdhashtbl; static u_long fdhash; static int fdesc_getattr __P((struct vop_getattr_args *ap)); static int fdesc_inactive __P((struct vop_inactive_args *ap)); static int fdesc_lookup __P((struct vop_lookup_args *ap)); static int fdesc_open __P((struct vop_open_args *ap)); static int fdesc_print __P((struct vop_print_args *ap)); static int fdesc_readdir __P((struct vop_readdir_args *ap)); static int fdesc_reclaim __P((struct vop_reclaim_args *ap)); static int fdesc_poll __P((struct vop_poll_args *ap)); static int fdesc_setattr __P((struct vop_setattr_args *ap)); /* * Initialise cache headers */ int fdesc_init(vfsp) struct vfsconf *vfsp; { fdhashtbl = hashinit(NFDCACHE, M_CACHE, &fdhash); return (0); } int fdesc_allocvp(ftype, ix, mp, vpp, 
td) fdntype ftype; int ix; struct mount *mp; struct vnode **vpp; struct thread *td; { struct fdhashhead *fc; struct fdescnode *fd; int error = 0; fc = FD_NHASH(ix); loop: LIST_FOREACH(fd, fc, fd_hash) { if (fd->fd_ix == ix && fd->fd_vnode->v_mount == mp) { if (vget(fd->fd_vnode, 0, td)) goto loop; *vpp = fd->fd_vnode; return (error); } } /* * otherwise lock the array while we call getnewvnode * since that can block. */ if (fdcache_lock & FDL_LOCKED) { fdcache_lock |= FDL_WANT; (void) tsleep((caddr_t) &fdcache_lock, PINOD, "fdalvp", 0); goto loop; } fdcache_lock |= FDL_LOCKED; /* * Do the MALLOC before the getnewvnode since doing so afterward * might cause a bogus v_data pointer to get dereferenced * elsewhere if MALLOC should block. */ MALLOC(fd, struct fdescnode *, sizeof(struct fdescnode), M_TEMP, M_WAITOK); error = getnewvnode(VT_FDESC, mp, fdesc_vnodeop_p, vpp); if (error) { FREE(fd, M_TEMP); goto out; } (*vpp)->v_data = fd; fd->fd_vnode = *vpp; fd->fd_type = ftype; fd->fd_fd = -1; fd->fd_ix = ix; LIST_INSERT_HEAD(fc, fd, fd_hash); out: fdcache_lock &= ~FDL_LOCKED; if (fdcache_lock & FDL_WANT) { fdcache_lock &= ~FDL_WANT; wakeup((caddr_t) &fdcache_lock); } return (error); } /* * vp is the current namei directory * ndp is the name to locate in that directory... 
*/ static int fdesc_lookup(ap) struct vop_lookup_args /* { struct vnode * a_dvp; struct vnode ** a_vpp; struct componentname * a_cnp; } */ *ap; { struct vnode **vpp = ap->a_vpp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; char *pname = cnp->cn_nameptr; struct thread *td = cnp->cn_thread; + struct file *fp; int nlen = cnp->cn_namelen; - int nfiles = td->td_proc->p_fd->fd_nfiles; u_int fd; int error; struct vnode *fvp; if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME) { error = EROFS; goto bad; } VOP_UNLOCK(dvp, 0, td); if (cnp->cn_namelen == 1 && *pname == '.') { *vpp = dvp; VREF(dvp); vn_lock(dvp, LK_SHARED | LK_RETRY, td); return (0); } if (VTOFDESC(dvp)->fd_type != Froot) { error = ENOTDIR; goto bad; } fd = 0; /* the only time a leading 0 is acceptable is if it's "0" */ if (*pname == '0' && nlen != 1) { error = ENOENT; goto bad; } while (nlen--) { if (*pname < '0' || *pname > '9') { error = ENOENT; goto bad; } fd = 10 * fd + *pname++ - '0'; } - if (fd >= nfiles || td->td_proc->p_fd->fd_ofiles[fd] == NULL) { + fp = ffind_hold(td, fd); + if (fp == NULL) { error = EBADF; goto bad; } error = fdesc_allocvp(Fdesc, FD_DESC+fd, dvp->v_mount, &fvp, td); + fdrop(fp, td); if (error) goto bad; VTOFDESC(fvp)->fd_fd = fd; vn_lock(fvp, LK_SHARED | LK_RETRY, td); *vpp = fvp; return (0); bad: vn_lock(dvp, LK_SHARED | LK_RETRY, td); *vpp = NULL; return (error); } static int fdesc_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; if (VTOFDESC(vp)->fd_type == Froot) return (0); /* * XXX Kludge: set td->td_proc->p_dupfd to contain the value of the the file * descriptor being sought for duplication. The error return ensures * that the vnode for this device will be released by vn_open. Open * will detect this special error and take the actions in dupfdopen. * Other callers of vn_open or VOP_OPEN will simply report the * error. 
*/ ap->a_td->td_dupfd = VTOFDESC(vp)->fd_fd; /* XXX */ return (ENODEV); } static int fdesc_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; - struct filedesc *fdp = ap->a_td->td_proc->p_fd; struct file *fp; struct stat stb; u_int fd; int error = 0; switch (VTOFDESC(vp)->fd_type) { case Froot: VATTR_NULL(vap); vap->va_mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH; vap->va_type = VDIR; vap->va_nlink = 2; vap->va_size = DEV_BSIZE; vap->va_fileid = VTOFDESC(vp)->fd_ix; vap->va_uid = 0; vap->va_gid = 0; vap->va_blocksize = DEV_BSIZE; vap->va_atime.tv_sec = boottime.tv_sec; vap->va_atime.tv_nsec = 0; vap->va_mtime = vap->va_atime; vap->va_ctime = vap->va_mtime; vap->va_gen = 0; vap->va_flags = 0; vap->va_rdev = 0; vap->va_bytes = 0; break; case Fdesc: fd = VTOFDESC(vp)->fd_fd; - if (fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) + fp = ffind_hold(ap->a_td, fd); + if (fp == NULL) return (EBADF); bzero(&stb, sizeof(stb)); error = fo_stat(fp, &stb, ap->a_td); + fdrop(fp, ap->a_td); if (error == 0) { VATTR_NULL(vap); vap->va_type = IFTOVT(stb.st_mode); vap->va_mode = stb.st_mode; #define FDRX (VREAD|VEXEC) if (vap->va_type == VDIR) vap->va_mode &= ~((FDRX)|(FDRX>>3)|(FDRX>>6)); #undef FDRX vap->va_nlink = 1; vap->va_flags = 0; vap->va_bytes = stb.st_blocks * stb.st_blksize; vap->va_fileid = VTOFDESC(vp)->fd_ix; vap->va_size = stb.st_size; vap->va_blocksize = stb.st_blksize; vap->va_rdev = stb.st_rdev; /* * If no time data is provided, use the current time. 
*/ if (stb.st_atimespec.tv_sec == 0 && stb.st_atimespec.tv_nsec == 0) nanotime(&stb.st_atimespec); if (stb.st_ctimespec.tv_sec == 0 && stb.st_ctimespec.tv_nsec == 0) nanotime(&stb.st_ctimespec); if (stb.st_mtimespec.tv_sec == 0 && stb.st_mtimespec.tv_nsec == 0) nanotime(&stb.st_mtimespec); vap->va_atime = stb.st_atimespec; vap->va_mtime = stb.st_mtimespec; vap->va_ctime = stb.st_ctimespec; vap->va_uid = stb.st_uid; vap->va_gid = stb.st_gid; } break; default: panic("fdesc_getattr"); break; } if (error == 0) vp->v_type = vap->va_type; return (error); } static int fdesc_setattr(ap) struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vattr *vap = ap->a_vap; struct vnode *vp; struct mount *mp; struct file *fp; unsigned fd; int error; /* * Can't mess with the root vnode */ if (VTOFDESC(ap->a_vp)->fd_type == Froot) return (EACCES); fd = VTOFDESC(ap->a_vp)->fd_fd; /* * Allow setattr where there is an underlying vnode. */ error = getvnode(ap->a_td->td_proc->p_fd, fd, &fp); if (error) { /* * getvnode() returns EINVAL if the file descriptor is not * backed by a vnode. Silently drop all changes except * chflags(2) in this case. 
*/ if (error == EINVAL) { if (vap->va_flags != VNOVAL) error = EOPNOTSUPP; else error = 0; } return (error); } vp = (struct vnode *)fp->f_data; - if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { + fdrop(fp, ap->a_td); return (error); + } error = VOP_SETATTR(vp, ap->a_vap, ap->a_cred, ap->a_td); vn_finished_write(mp); + fdrop(fp, ap->a_td); return (error); } #define UIO_MX 16 static int fdesc_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; u_long *a_cookies; int a_ncookies; } */ *ap; { struct uio *uio = ap->a_uio; struct filedesc *fdp; struct dirent d; struct dirent *dp = &d; int error, i, off, fcnt; /* * We don't allow exporting fdesc mounts, and currently local * requests do not need cookies. */ if (ap->a_ncookies) panic("fdesc_readdir: not hungry"); if (VTOFDESC(ap->a_vp)->fd_type != Froot) panic("fdesc_readdir: not dir"); off = (int)uio->uio_offset; if (off != uio->uio_offset || off < 0 || (u_int)off % UIO_MX != 0 || uio->uio_resid < UIO_MX) return (EINVAL); i = (u_int)off / UIO_MX; fdp = uio->uio_td->td_proc->p_fd; error = 0; fcnt = i - 2; /* The first two nodes are `.' and `..' */ + FILEDESC_LOCK(fdp); while (i < fdp->fd_nfiles + 2 && uio->uio_resid >= UIO_MX) { switch (i) { case 0: /* `.' */ case 1: /* `..' 
*/ bzero((caddr_t)dp, UIO_MX); dp->d_fileno = i + FD_ROOT; dp->d_namlen = i + 1; dp->d_reclen = UIO_MX; bcopy("..", dp->d_name, dp->d_namlen); dp->d_name[i + 1] = '\0'; dp->d_type = DT_DIR; break; default: - if (fdp->fd_ofiles[fcnt] == NULL) + if (fdp->fd_ofiles[fcnt] == NULL) { + FILEDESC_UNLOCK(fdp); goto done; + } bzero((caddr_t) dp, UIO_MX); dp->d_namlen = sprintf(dp->d_name, "%d", fcnt); dp->d_reclen = UIO_MX; dp->d_type = DT_UNKNOWN; dp->d_fileno = i + FD_DESC; break; } /* * And ship to userland */ + FILEDESC_UNLOCK(fdp); error = uiomove((caddr_t) dp, UIO_MX, uio); if (error) - break; + goto done; + FILEDESC_LOCK(fdp); i++; fcnt++; } + FILEDESC_UNLOCK(fdp); done: uio->uio_offset = i * UIO_MX; return (error); } static int fdesc_poll(ap) struct vop_poll_args /* { struct vnode *a_vp; int a_events; struct ucred *a_cred; struct thread *a_td; } */ *ap; { return seltrue(0, ap->a_events, ap->a_td); } static int fdesc_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; /* * Clear out the v_type field to avoid * nasty things happening in vgone(). */ VOP_UNLOCK(vp, 0, ap->a_td); vp->v_type = VNON; return (0); } static int fdesc_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; struct fdescnode *fd = VTOFDESC(vp); LIST_REMOVE(fd, fd_hash); FREE(vp->v_data, M_TEMP); vp->v_data = 0; return (0); } /* * Print out the contents of a /dev/fd vnode. 
*/ /* ARGSUSED */ static int fdesc_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { printf("tag VT_NON, fdesc vnode\n"); return (0); } static struct vnodeopv_entry_desc fdesc_vnodeop_entries[] = { { &vop_default_desc, (vop_t *) vop_defaultop }, { &vop_access_desc, (vop_t *) vop_null }, { &vop_getattr_desc, (vop_t *) fdesc_getattr }, { &vop_inactive_desc, (vop_t *) fdesc_inactive }, { &vop_lookup_desc, (vop_t *) fdesc_lookup }, { &vop_open_desc, (vop_t *) fdesc_open }, { &vop_pathconf_desc, (vop_t *) vop_stdpathconf }, { &vop_poll_desc, (vop_t *) fdesc_poll }, { &vop_print_desc, (vop_t *) fdesc_print }, { &vop_readdir_desc, (vop_t *) fdesc_readdir }, { &vop_reclaim_desc, (vop_t *) fdesc_reclaim }, { &vop_setattr_desc, (vop_t *) fdesc_setattr }, { NULL, NULL } }; static struct vnodeopv_desc fdesc_vnodeop_opv_desc = { &fdesc_vnodeop_p, fdesc_vnodeop_entries }; VNODEOP_SET(fdesc_vnodeop_opv_desc); Index: head/sys/fs/fifofs/fifo_vnops.c =================================================================== --- head/sys/fs/fifofs/fifo_vnops.c (revision 89305) +++ head/sys/fs/fifofs/fifo_vnops.c (revision 89306) @@ -1,601 +1,610 @@ /* * Copyright (c) 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)fifo_vnops.c 8.10 (Berkeley) 5/27/95 * $FreeBSD$ */ #include #include #include #include #include #include #include #include /* XXXKSE */ #include #include #include #include #include #include #include #include #include /* * This structure is associated with the FIFO vnode and stores * the state associated with the FIFO. 
*/ struct fifoinfo { struct socket *fi_readsock; struct socket *fi_writesock; long fi_readers; long fi_writers; }; static int fifo_badop __P((void)); static int fifo_print __P((struct vop_print_args *)); static int fifo_lookup __P((struct vop_lookup_args *)); static int fifo_open __P((struct vop_open_args *)); static int fifo_close __P((struct vop_close_args *)); static int fifo_read __P((struct vop_read_args *)); static int fifo_write __P((struct vop_write_args *)); static int fifo_ioctl __P((struct vop_ioctl_args *)); static int fifo_poll __P((struct vop_poll_args *)); static int fifo_kqfilter __P((struct vop_kqfilter_args *)); static int fifo_pathconf __P((struct vop_pathconf_args *)); static int fifo_advlock __P((struct vop_advlock_args *)); static void filt_fifordetach(struct knote *kn); static int filt_fiforead(struct knote *kn, long hint); static void filt_fifowdetach(struct knote *kn); static int filt_fifowrite(struct knote *kn, long hint); static struct filterops fiforead_filtops = { 1, NULL, filt_fifordetach, filt_fiforead }; static struct filterops fifowrite_filtops = { 1, NULL, filt_fifowdetach, filt_fifowrite }; vop_t **fifo_vnodeop_p; static struct vnodeopv_entry_desc fifo_vnodeop_entries[] = { { &vop_default_desc, (vop_t *) vop_defaultop }, { &vop_access_desc, (vop_t *) vop_ebadf }, { &vop_advlock_desc, (vop_t *) fifo_advlock }, { &vop_close_desc, (vop_t *) fifo_close }, { &vop_create_desc, (vop_t *) fifo_badop }, { &vop_getattr_desc, (vop_t *) vop_ebadf }, { &vop_getwritemount_desc, (vop_t *) vop_stdgetwritemount }, { &vop_ioctl_desc, (vop_t *) fifo_ioctl }, { &vop_kqfilter_desc, (vop_t *) fifo_kqfilter }, { &vop_lease_desc, (vop_t *) vop_null }, { &vop_link_desc, (vop_t *) fifo_badop }, { &vop_lookup_desc, (vop_t *) fifo_lookup }, { &vop_mkdir_desc, (vop_t *) fifo_badop }, { &vop_mknod_desc, (vop_t *) fifo_badop }, { &vop_open_desc, (vop_t *) fifo_open }, { &vop_pathconf_desc, (vop_t *) fifo_pathconf }, { &vop_poll_desc, (vop_t *) fifo_poll }, { 
&vop_print_desc, (vop_t *) fifo_print }, { &vop_read_desc, (vop_t *) fifo_read }, { &vop_readdir_desc, (vop_t *) fifo_badop }, { &vop_readlink_desc, (vop_t *) fifo_badop }, { &vop_reallocblks_desc, (vop_t *) fifo_badop }, { &vop_reclaim_desc, (vop_t *) vop_null }, { &vop_remove_desc, (vop_t *) fifo_badop }, { &vop_rename_desc, (vop_t *) fifo_badop }, { &vop_rmdir_desc, (vop_t *) fifo_badop }, { &vop_setattr_desc, (vop_t *) vop_ebadf }, { &vop_symlink_desc, (vop_t *) fifo_badop }, { &vop_write_desc, (vop_t *) fifo_write }, { NULL, NULL } }; static struct vnodeopv_desc fifo_vnodeop_opv_desc = { &fifo_vnodeop_p, fifo_vnodeop_entries }; VNODEOP_SET(fifo_vnodeop_opv_desc); int fifo_vnoperate(ap) struct vop_generic_args /* { struct vnodeop_desc *a_desc; } */ *ap; { return (VOCALL(fifo_vnodeop_p, ap->a_desc->vdesc_offset, ap)); } /* * Trivial lookup routine that always fails. */ /* ARGSUSED */ static int fifo_lookup(ap) struct vop_lookup_args /* { struct vnode * a_dvp; struct vnode ** a_vpp; struct componentname * a_cnp; } */ *ap; { *ap->a_vpp = NULL; return (ENOTDIR); } /* * Open called to set up a new instance of a fifo or * to find an active instance of a fifo. 
*/ /* ARGSUSED */ static int fifo_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct fifoinfo *fip; struct thread *td = ap->a_td; struct socket *rso, *wso; int error; if ((fip = vp->v_fifoinfo) == NULL) { MALLOC(fip, struct fifoinfo *, sizeof(*fip), M_VNODE, M_WAITOK); vp->v_fifoinfo = fip; error = socreate(AF_LOCAL, &rso, SOCK_STREAM, 0, ap->a_td->td_proc->p_ucred, ap->a_td); if (error) { free(fip, M_VNODE); vp->v_fifoinfo = NULL; return (error); } fip->fi_readsock = rso; error = socreate(AF_LOCAL, &wso, SOCK_STREAM, 0, ap->a_td->td_proc->p_ucred, ap->a_td); if (error) { (void)soclose(rso); free(fip, M_VNODE); vp->v_fifoinfo = NULL; return (error); } fip->fi_writesock = wso; error = unp_connect2(wso, rso); if (error) { (void)soclose(wso); (void)soclose(rso); free(fip, M_VNODE); vp->v_fifoinfo = NULL; return (error); } fip->fi_readers = fip->fi_writers = 0; wso->so_snd.sb_lowat = PIPE_BUF; } if (ap->a_mode & FREAD) { fip->fi_readers++; if (fip->fi_readers == 1) { fip->fi_writesock->so_state &= ~SS_CANTSENDMORE; if (fip->fi_writers > 0) { wakeup((caddr_t)&fip->fi_writers); sowwakeup(fip->fi_writesock); } } } if (ap->a_mode & FWRITE) { fip->fi_writers++; if (fip->fi_writers == 1) { fip->fi_readsock->so_state &= ~SS_CANTRCVMORE; if (fip->fi_readers > 0) { wakeup((caddr_t)&fip->fi_readers); sorwakeup(fip->fi_writesock); } } } if ((ap->a_mode & FREAD) && (ap->a_mode & O_NONBLOCK) == 0) { while (fip->fi_writers == 0) { VOP_UNLOCK(vp, 0, td); error = tsleep((caddr_t)&fip->fi_readers, PCATCH | PSOCK, "fifoor", 0); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (error) goto bad; } } if (ap->a_mode & FWRITE) { if (ap->a_mode & O_NONBLOCK) { if (fip->fi_readers == 0) { error = ENXIO; goto bad; } } else { while (fip->fi_readers == 0) { VOP_UNLOCK(vp, 0, td); error = tsleep((caddr_t)&fip->fi_writers, PCATCH | PSOCK, "fifoow", 0); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if 
(error) goto bad; } } } return (0); bad: VOP_CLOSE(vp, ap->a_mode, ap->a_cred, td); return (error); } /* * Vnode op for read */ /* ARGSUSED */ static int fifo_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { struct uio *uio = ap->a_uio; struct socket *rso = ap->a_vp->v_fifoinfo->fi_readsock; struct thread *td = uio->uio_td; int error, startresid; #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_READ) panic("fifo_read mode"); #endif if (uio->uio_resid == 0) return (0); if (ap->a_ioflag & IO_NDELAY) rso->so_state |= SS_NBIO; startresid = uio->uio_resid; VOP_UNLOCK(ap->a_vp, 0, td); error = soreceive(rso, (struct sockaddr **)0, uio, (struct mbuf **)0, (struct mbuf **)0, (int *)0); /* * Clear EOF indication after first such return. */ if (uio->uio_resid == startresid) rso->so_state &= ~SS_CANTRCVMORE; vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY, td); if (ap->a_ioflag & IO_NDELAY) rso->so_state &= ~SS_NBIO; return (error); } /* * Vnode op for write */ /* ARGSUSED */ static int fifo_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { struct socket *wso = ap->a_vp->v_fifoinfo->fi_writesock; struct thread *td = ap->a_uio->uio_td; int error; #ifdef DIAGNOSTIC if (ap->a_uio->uio_rw != UIO_WRITE) panic("fifo_write mode"); #endif if (ap->a_ioflag & IO_NDELAY) wso->so_state |= SS_NBIO; VOP_UNLOCK(ap->a_vp, 0, td); error = sosend(wso, (struct sockaddr *)0, ap->a_uio, 0, (struct mbuf *)0, 0, td); vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY, td); if (ap->a_ioflag & IO_NDELAY) wso->so_state &= ~SS_NBIO; return (error); } /* * Device ioctl operation. 
*/ /* ARGSUSED */ static int fifo_ioctl(ap) struct vop_ioctl_args /* { struct vnode *a_vp; int a_command; caddr_t a_data; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct file filetmp; - int error; + int error = 0; if (ap->a_command == FIONBIO) return (0); + mtx_init(&filetmp.f_mtx, "struct file", MTX_DEF); + filetmp.f_count = 1; if (ap->a_fflag & FREAD) { + /* filetmp is local, hence need not be locked. */ filetmp.f_data = (caddr_t)ap->a_vp->v_fifoinfo->fi_readsock; error = soo_ioctl(&filetmp, ap->a_command, ap->a_data, ap->a_td); if (error) - return (error); + goto err; } if (ap->a_fflag & FWRITE) { + /* filetmp is local, hence need not be locked. */ filetmp.f_data = (caddr_t)ap->a_vp->v_fifoinfo->fi_writesock; error = soo_ioctl(&filetmp, ap->a_command, ap->a_data, ap->a_td); if (error) - return (error); + goto err; } - return (0); +err: + mtx_destroy(&filetmp.f_mtx); + return (error); } /* ARGSUSED */ static int fifo_kqfilter(ap) struct vop_kqfilter_args /* { struct vnode *a_vp; struct knote *a_kn; } */ *ap; { struct fifoinfo *fi = ap->a_vp->v_fifoinfo; struct socket *so; struct sockbuf *sb; switch (ap->a_kn->kn_filter) { case EVFILT_READ: ap->a_kn->kn_fop = &fiforead_filtops; so = fi->fi_readsock; sb = &so->so_rcv; break; case EVFILT_WRITE: ap->a_kn->kn_fop = &fifowrite_filtops; so = fi->fi_writesock; sb = &so->so_snd; break; default: return (1); } ap->a_kn->kn_hook = (caddr_t)so; SLIST_INSERT_HEAD(&sb->sb_sel.si_note, ap->a_kn, kn_selnext); sb->sb_flags |= SB_KNOTE; return (0); } static void filt_fifordetach(struct knote *kn) { struct socket *so = (struct socket *)kn->kn_hook; SLIST_REMOVE(&so->so_rcv.sb_sel.si_note, kn, knote, kn_selnext); if (SLIST_EMPTY(&so->so_rcv.sb_sel.si_note)) so->so_rcv.sb_flags &= ~SB_KNOTE; } static int filt_fiforead(struct knote *kn, long hint) { struct socket *so = (struct socket *)kn->kn_hook; kn->kn_data = so->so_rcv.sb_cc; if (so->so_state & SS_CANTRCVMORE) { kn->kn_flags |= EV_EOF; return (1); } 
kn->kn_flags &= ~EV_EOF; return (kn->kn_data > 0); } static void filt_fifowdetach(struct knote *kn) { struct socket *so = (struct socket *)kn->kn_hook; SLIST_REMOVE(&so->so_snd.sb_sel.si_note, kn, knote, kn_selnext); if (SLIST_EMPTY(&so->so_snd.sb_sel.si_note)) so->so_snd.sb_flags &= ~SB_KNOTE; } static int filt_fifowrite(struct knote *kn, long hint) { struct socket *so = (struct socket *)kn->kn_hook; kn->kn_data = sbspace(&so->so_snd); if (so->so_state & SS_CANTSENDMORE) { kn->kn_flags |= EV_EOF; return (1); } kn->kn_flags &= ~EV_EOF; return (kn->kn_data >= so->so_snd.sb_lowat); } /* ARGSUSED */ static int fifo_poll(ap) struct vop_poll_args /* { struct vnode *a_vp; int a_events; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct file filetmp; int revents = 0; + mtx_init(&filetmp.f_mtx, "struct file", MTX_DEF); + filetmp.f_count = 1; if (ap->a_events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { filetmp.f_data = (caddr_t)ap->a_vp->v_fifoinfo->fi_readsock; if (filetmp.f_data) revents |= soo_poll(&filetmp, ap->a_events, ap->a_cred, ap->a_td); } if (ap->a_events & (POLLOUT | POLLWRNORM | POLLWRBAND)) { filetmp.f_data = (caddr_t)ap->a_vp->v_fifoinfo->fi_writesock; if (filetmp.f_data) revents |= soo_poll(&filetmp, ap->a_events, ap->a_cred, ap->a_td); } + mtx_destroy(&filetmp.f_mtx); return (revents); } /* * Device close routine */ /* ARGSUSED */ static int fifo_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct fifoinfo *fip = vp->v_fifoinfo; int error1, error2; if (ap->a_fflag & FREAD) { fip->fi_readers--; if (fip->fi_readers == 0) socantsendmore(fip->fi_writesock); } if (ap->a_fflag & FWRITE) { fip->fi_writers--; if (fip->fi_writers == 0) socantrcvmore(fip->fi_readsock); } if (vp->v_usecount > 1) return (0); error1 = soclose(fip->fi_readsock); error2 = soclose(fip->fi_writesock); FREE(fip, M_VNODE); vp->v_fifoinfo = NULL; if 
(error1) return (error1); return (error2); } /* * Print out internal contents of a fifo vnode. */ int fifo_printinfo(vp) struct vnode *vp; { register struct fifoinfo *fip = vp->v_fifoinfo; printf(", fifo with %ld readers and %ld writers", fip->fi_readers, fip->fi_writers); return (0); } /* * Print out the contents of a fifo vnode. */ static int fifo_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { printf("tag VT_NON"); fifo_printinfo(ap->a_vp); printf("\n"); return (0); } /* * Return POSIX pathconf information applicable to fifo's. */ int fifo_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; int *a_retval; } */ *ap; { switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = LINK_MAX; return (0); case _PC_PIPE_BUF: *ap->a_retval = PIPE_BUF; return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); default: return (EINVAL); } /* NOTREACHED */ } /* * Fifo advisory byte-level locks. */ /* ARGSUSED */ static int fifo_advlock(ap) struct vop_advlock_args /* { struct vnode *a_vp; caddr_t a_id; int a_op; struct flock *a_fl; int a_flags; } */ *ap; { return (ap->a_flags & F_FLOCK ? EOPNOTSUPP : EINVAL); } /* * Fifo bad operation */ static int fifo_badop() { panic("fifo_badop called"); /* NOTREACHED */ } Index: head/sys/fs/portalfs/portal_vfsops.c =================================================================== --- head/sys/fs/portalfs/portal_vfsops.c (revision 89305) +++ head/sys/fs/portalfs/portal_vfsops.c (revision 89306) @@ -1,260 +1,262 @@ /* * Copyright (c) 1992, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)portal_vfsops.c 8.11 (Berkeley) 5/14/95 * * $FreeBSD$ */ /* * Portal Filesystem */ #include #include #include #include #include #include #include #include /* Must come after sys/malloc.h */ #include #include #include #include #include #include #include static MALLOC_DEFINE(M_PORTALFSMNT, "PORTAL mount", "PORTAL mount structure"); static int portal_mount __P((struct mount *mp, char *path, caddr_t data, struct nameidata *ndp, struct thread *td)); static int portal_unmount __P((struct mount *mp, int mntflags, struct thread *td)); static int portal_root __P((struct mount *mp, struct vnode **vpp)); static int portal_statfs __P((struct mount *mp, struct statfs *sbp, struct thread *td)); /* * Mount the per-process file descriptors (/dev/fd) */ static int portal_mount(mp, path, data, ndp, td) struct mount *mp; char *path; caddr_t data; struct nameidata *ndp; struct thread *td; { struct file *fp; struct portal_args args; struct portalmount *fmp; struct socket *so; struct vnode *rvp; struct portalnode *pn; u_int size; int error; /* * Update is a no-op */ if (mp->mnt_flag & MNT_UPDATE) return (EOPNOTSUPP); error = copyin(data, (caddr_t) &args, sizeof(struct portal_args)); if (error) return (error); if ((error = fget(td, args.pa_socket, &fp)) != 0) return (error); if (fp->f_type != DTYPE_SOCKET) { fdrop(fp, td); return(ENOTSOCK); } so = (struct socket *) fp->f_data; /* XXX race against userland */ if (so->so_proto->pr_domain->dom_family != AF_UNIX) { fdrop(fp, td); return (ESOCKTNOSUPPORT); } MALLOC(pn, struct portalnode *, sizeof(struct portalnode), M_TEMP, M_WAITOK); MALLOC(fmp, struct portalmount *, sizeof(struct portalmount), M_PORTALFSMNT, M_WAITOK); /* XXX */ error = getnewvnode(VT_PORTAL, mp, portal_vnodeop_p, &rvp); /* XXX */ if (error) { FREE(fmp, M_PORTALFSMNT); FREE(pn, M_TEMP); fdrop(fp, td); return (error); } rvp->v_data = pn; rvp->v_type = VDIR; rvp->v_flag |= VROOT; VTOPORTAL(rvp)->pt_arg = 0; VTOPORTAL(rvp)->pt_size = 0; VTOPORTAL(rvp)->pt_fileid = 
PORTAL_ROOTFILEID; fmp->pm_root = rvp; - fmp->pm_server = fp; fp->f_count++; + fhold(fp); + fmp->pm_server = fp; mp->mnt_flag |= MNT_LOCAL; mp->mnt_data = (qaddr_t) fmp; vfs_getnewfsid(mp); (void)copyinstr(args.pa_config, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size); bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); #ifdef notdef bzero(mp->mnt_stat.f_mntfromname, MNAMELEN); bcopy("portal", mp->mnt_stat.f_mntfromname, sizeof("portal")); #endif (void)portal_statfs(mp, &mp->mnt_stat, td); fdrop(fp, td); return (0); } static int portal_unmount(mp, mntflags, td) struct mount *mp; int mntflags; struct thread *td; { int error, flags = 0; + struct socket *so; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; /* * Clear out buffer cache. I don't think we * ever get anything cached at this level at the * moment, but who knows... */ #ifdef notyet mntflushbuf(mp, 0); if (mntinvalbuf(mp, 1)) return (EBUSY); #endif /* There is 1 extra root vnode reference (pm_root). */ error = vflush(mp, 1, flags); if (error) return (error); /* * Shutdown the socket. This will cause the select in the * daemon to wake up, and then the accept will get ECONNABORTED * which it interprets as a request to go and bury itself. */ soshutdown((struct socket *) VFSTOPORTAL(mp)->pm_server->f_data, 2); /* * Discard reference to underlying file. Must call closef because * this may be the last reference. */ closef(VFSTOPORTAL(mp)->pm_server, (struct thread *) 0); /* * Finally, throw away the portalmount structure */ free(mp->mnt_data, M_PORTALFSMNT); /* XXX */ mp->mnt_data = 0; return (0); } static int portal_root(mp, vpp) struct mount *mp; struct vnode **vpp; { struct thread *td = curthread; /* XXX */ struct vnode *vp; /* * Return locked reference to root. 
*/ vp = VFSTOPORTAL(mp)->pm_root; VREF(vp); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); *vpp = vp; return (0); } static int portal_statfs(mp, sbp, td) struct mount *mp; struct statfs *sbp; struct thread *td; { sbp->f_flags = 0; sbp->f_bsize = DEV_BSIZE; sbp->f_iosize = DEV_BSIZE; sbp->f_blocks = 2; /* 1K to keep df happy */ sbp->f_bfree = 0; sbp->f_bavail = 0; sbp->f_files = 1; /* Allow for "." */ sbp->f_ffree = 0; /* See comments above */ if (sbp != &mp->mnt_stat) { sbp->f_type = mp->mnt_vfc->vfc_typenum; bcopy(&mp->mnt_stat.f_fsid, &sbp->f_fsid, sizeof(sbp->f_fsid)); bcopy(mp->mnt_stat.f_mntonname, sbp->f_mntonname, MNAMELEN); bcopy(mp->mnt_stat.f_mntfromname, sbp->f_mntfromname, MNAMELEN); } return (0); } static struct vfsops portal_vfsops = { portal_mount, vfs_stdstart, portal_unmount, portal_root, vfs_stdquotactl, portal_statfs, vfs_stdsync, vfs_stdvget, vfs_stdfhtovp, vfs_stdcheckexp, vfs_stdvptofh, vfs_stdinit, vfs_stduninit, vfs_stdextattrctl, }; VFS_SET(portal_vfsops, portalfs, VFCF_SYNTHETIC); Index: head/sys/fs/portalfs/portal_vnops.c =================================================================== --- head/sys/fs/portalfs/portal_vnops.c (revision 89305) +++ head/sys/fs/portalfs/portal_vnops.c (revision 89306) @@ -1,579 +1,585 @@ /* * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)portal_vnops.c 8.14 (Berkeley) 5/21/95 * * $FreeBSD$ */ /* * Portal Filesystem */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int portal_fileid = PORTAL_ROOTFILEID+1; static void portal_closefd __P((struct thread *td, int fd)); static int portal_connect __P((struct socket *so, struct socket *so2)); static int portal_getattr __P((struct vop_getattr_args *ap)); static int portal_lookup __P((struct vop_lookup_args *ap)); static int portal_open __P((struct vop_open_args *ap)); static int portal_print __P((struct vop_print_args *ap)); static int portal_readdir __P((struct vop_readdir_args *ap)); static int portal_reclaim __P((struct vop_reclaim_args *ap)); static int portal_setattr __P((struct vop_setattr_args *ap)); static void portal_closefd(td, fd) struct thread *td; int fd; { int error; struct close_args ua; ua.fd = fd; error = close(td, &ua); /* * We should never get an error, and there isn't anything * we could do if we got one, so just print a message. */ if (error) printf("portal_closefd: error = %d\n", error); } /* * vp is the current namei directory * cnp is the name to locate in that directory... */ static int portal_lookup(ap) struct vop_lookup_args /* { struct vnode * a_dvp; struct vnode ** a_vpp; struct componentname * a_cnp; } */ *ap; { struct componentname *cnp = ap->a_cnp; struct vnode **vpp = ap->a_vpp; struct vnode *dvp = ap->a_dvp; char *pname = cnp->cn_nameptr; struct portalnode *pt; int error; struct vnode *fvp = 0; char *path; int size; *vpp = NULLVP; if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME) return (EROFS); if (cnp->cn_namelen == 1 && *pname == '.') { *vpp = dvp; VREF(dvp); /*VOP_LOCK(dvp);*/ return (0); } /* * Do the MALLOC before the getnewvnode since doing so afterward * might cause a bogus v_data pointer to get dereferenced * elsewhere if MALLOC should block. 
*/ MALLOC(pt, struct portalnode *, sizeof(struct portalnode), M_TEMP, M_WAITOK); error = getnewvnode(VT_PORTAL, dvp->v_mount, portal_vnodeop_p, &fvp); if (error) { FREE(pt, M_TEMP); goto bad; } fvp->v_type = VREG; fvp->v_data = pt; /* * Save all of the remaining pathname and * advance the namei next pointer to the end * of the string. */ for (size = 0, path = pname; *path; path++) size++; cnp->cn_consume = size - cnp->cn_namelen; pt->pt_arg = malloc(size+1, M_TEMP, M_WAITOK); pt->pt_size = size+1; bcopy(pname, pt->pt_arg, pt->pt_size); pt->pt_fileid = portal_fileid++; *vpp = fvp; /*VOP_LOCK(fvp);*/ return (0); bad:; if (fvp) vrele(fvp); return (error); } static int portal_connect(so, so2) struct socket *so; struct socket *so2; { /* from unp_connect, bypassing the namei stuff... */ struct socket *so3; struct unpcb *unp2; struct unpcb *unp3; if (so2 == 0) return (ECONNREFUSED); if (so->so_type != so2->so_type) return (EPROTOTYPE); if ((so2->so_options & SO_ACCEPTCONN) == 0) return (ECONNREFUSED); if ((so3 = sonewconn(so2, 0)) == 0) return (ECONNREFUSED); unp2 = sotounpcb(so2); unp3 = sotounpcb(so3); if (unp2->unp_addr) unp3->unp_addr = (struct sockaddr_un *) dup_sockaddr((struct sockaddr *)unp2->unp_addr, 0); so2 = so3; return (unp_connect2(so, so2)); } static int portal_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct socket *so = 0; struct portalnode *pt; struct thread *td = ap->a_td; struct vnode *vp = ap->a_vp; int s; struct uio auio; struct iovec aiov[2]; int res; struct mbuf *cm = 0; struct cmsghdr *cmsg; int newfds; int *ip; int fd; int error; int len; struct portalmount *fmp; struct file *fp; struct portal_cred pcred; /* * Nothing to do when opening the root node. */ if (vp->v_flag & VROOT) return (0); /* * Can't be opened unless the caller is set up * to deal with the side effects. Check for this * by testing whether the p_dupfd has been set. 
*/ if (td->td_dupfd >= 0) return (ENODEV); pt = VTOPORTAL(vp); fmp = VFSTOPORTAL(vp->v_mount); /* * Create a new socket. */ error = socreate(AF_UNIX, &so, SOCK_STREAM, 0, ap->a_td->td_proc->p_ucred, ap->a_td); if (error) goto bad; /* * Reserve some buffer space */ res = pt->pt_size + sizeof(pcred) + 512; /* XXX */ error = soreserve(so, res, res); if (error) goto bad; /* * Kick off connection */ error = portal_connect(so, (struct socket *)fmp->pm_server->f_data); if (error) goto bad; /* * Wait for connection to complete */ /* * XXX: Since the mount point is holding a reference on the * underlying server socket, it is not easy to find out whether * the server process is still running. To handle this problem * we loop waiting for the new socket to be connected (something * which will only happen if the server is still running) or for * the reference count on the server socket to drop to 1, which * will happen if the server dies. Sleep for 5 second intervals * and keep polling the reference count. XXX. 
*/ s = splnet(); while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { if (fmp->pm_server->f_count == 1) { error = ECONNREFUSED; splx(s); goto bad; } (void) tsleep((caddr_t) &so->so_timeo, PSOCK, "portalcon", 5 * hz); } splx(s); if (so->so_error) { error = so->so_error; goto bad; } /* * Set miscellaneous flags */ so->so_rcv.sb_timeo = 0; so->so_snd.sb_timeo = 0; so->so_rcv.sb_flags |= SB_NOINTR; so->so_snd.sb_flags |= SB_NOINTR; pcred.pcr_flag = ap->a_mode; pcred.pcr_uid = ap->a_cred->cr_uid; pcred.pcr_ngroups = ap->a_cred->cr_ngroups; bcopy(ap->a_cred->cr_groups, pcred.pcr_groups, NGROUPS * sizeof(gid_t)); aiov[0].iov_base = (caddr_t) &pcred; aiov[0].iov_len = sizeof(pcred); aiov[1].iov_base = pt->pt_arg; aiov[1].iov_len = pt->pt_size; auio.uio_iov = aiov; auio.uio_iovcnt = 2; auio.uio_rw = UIO_WRITE; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_offset = 0; auio.uio_resid = aiov[0].iov_len + aiov[1].iov_len; error = sosend(so, (struct sockaddr *) 0, &auio, (struct mbuf *) 0, (struct mbuf *) 0, 0 , td); if (error) goto bad; len = auio.uio_resid = sizeof(int); do { struct mbuf *m = 0; int flags = MSG_WAITALL; error = soreceive(so, (struct sockaddr **) 0, &auio, &m, &cm, &flags); if (error) goto bad; /* * Grab an error code from the mbuf. */ if (m) { m = m_pullup(m, sizeof(int)); /* Needed? */ if (m) { error = *(mtod(m, int *)); m_freem(m); } else { error = EINVAL; } } else { if (cm == 0) { error = ECONNRESET; /* XXX */ #ifdef notdef break; #endif } } } while (cm == 0 && auio.uio_resid == len && !error); if (cm == 0) goto bad; if (auio.uio_resid) { error = 0; #ifdef notdef error = EMSGSIZE; goto bad; #endif } /* * XXX: Break apart the control message, and retrieve the * received file descriptor. Note that more than one descriptor * may have been received, or that the rights chain may have more * than a single mbuf in it. What to do? 
*/ cmsg = mtod(cm, struct cmsghdr *); newfds = (cmsg->cmsg_len - sizeof(*cmsg)) / sizeof (int); if (newfds == 0) { error = ECONNREFUSED; goto bad; } /* * At this point the rights message consists of a control message * header, followed by a data region containing a vector of * integer file descriptors. The fds were allocated by the action * of receiving the control message. */ ip = (int *) (cmsg + 1); fd = *ip++; if (newfds > 1) { /* * Close extra fds. */ int i; printf("portal_open: %d extra fds\n", newfds - 1); for (i = 1; i < newfds; i++) { portal_closefd(td, *ip); ip++; } } /* * Check that the mode the file is being opened for is a subset * of the mode of the existing descriptor. */ - fp = td->td_proc->p_fd->fd_ofiles[fd]; + fp = ffind_hold(td, fd); + if (fp == NULL) { + error = EBADF; + goto bad; + } if (((ap->a_mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) { + fdrop(fp, td); portal_closefd(td, fd); error = EACCES; goto bad; } + fdrop(fp, td); /* * Save the dup fd in the proc structure then return the * special error code (ENXIO) which causes magic things to * happen in vn_open. The whole concept is, well, hmmm. */ td->td_dupfd = fd; error = ENXIO; bad:; /* * And discard the control message. 
*/ if (cm) { m_freem(cm); } if (so) { soshutdown(so, 2); soclose(so); } return (error); } static int portal_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; bzero(vap, sizeof(*vap)); vattr_null(vap); vap->va_uid = 0; vap->va_gid = 0; vap->va_size = DEV_BSIZE; vap->va_blocksize = DEV_BSIZE; nanotime(&vap->va_atime); vap->va_mtime = vap->va_atime; vap->va_ctime = vap->va_mtime; vap->va_gen = 0; vap->va_flags = 0; vap->va_rdev = 0; /* vap->va_qbytes = 0; */ vap->va_bytes = 0; /* vap->va_qsize = 0; */ if (vp->v_flag & VROOT) { vap->va_type = VDIR; vap->va_mode = S_IRUSR|S_IWUSR|S_IXUSR| S_IRGRP|S_IWGRP|S_IXGRP| S_IROTH|S_IWOTH|S_IXOTH; vap->va_nlink = 2; vap->va_fileid = 2; } else { vap->va_type = VREG; vap->va_mode = S_IRUSR|S_IWUSR| S_IRGRP|S_IWGRP| S_IROTH|S_IWOTH; vap->va_nlink = 1; vap->va_fileid = VTOPORTAL(vp)->pt_fileid; } return (0); } static int portal_setattr(ap) struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { /* * Can't mess with the root vnode */ if (ap->a_vp->v_flag & VROOT) return (EACCES); if (ap->a_vap->va_flags != VNOVAL) return (EOPNOTSUPP); return (0); } /* * Fake readdir, just return empty directory. * It is hard to deal with '.' and '..' so don't bother. */ static int portal_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; u_long *a_cookies; int a_ncookies; } */ *ap; { /* * We don't allow exporting portal mounts, and currently local * requests do not need cookies. 
*/ if (ap->a_ncookies) panic("portal_readdir: not hungry"); return (0); } static int portal_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; } */ *ap; { struct portalnode *pt = VTOPORTAL(ap->a_vp); if (pt->pt_arg) { free((caddr_t) pt->pt_arg, M_TEMP); pt->pt_arg = 0; } FREE(ap->a_vp->v_data, M_TEMP); ap->a_vp->v_data = 0; return (0); } /* * Print out the contents of a Portal vnode. */ /* ARGSUSED */ static int portal_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { printf("tag VT_PORTAL, portal vnode\n"); return (0); } vop_t **portal_vnodeop_p; static struct vnodeopv_entry_desc portal_vnodeop_entries[] = { { &vop_default_desc, (vop_t *) vop_defaultop }, { &vop_access_desc, (vop_t *) vop_null }, { &vop_getattr_desc, (vop_t *) portal_getattr }, { &vop_lookup_desc, (vop_t *) portal_lookup }, { &vop_open_desc, (vop_t *) portal_open }, { &vop_pathconf_desc, (vop_t *) vop_stdpathconf }, { &vop_print_desc, (vop_t *) portal_print }, { &vop_readdir_desc, (vop_t *) portal_readdir }, { &vop_reclaim_desc, (vop_t *) portal_reclaim }, { &vop_setattr_desc, (vop_t *) portal_setattr }, { NULL, NULL } }; static struct vnodeopv_desc portal_vnodeop_opv_desc = { &portal_vnodeop_p, portal_vnodeop_entries }; VNODEOP_SET(portal_vnodeop_opv_desc); Index: head/sys/fs/unionfs/union_subr.c =================================================================== --- head/sys/fs/unionfs/union_subr.c (revision 89305) +++ head/sys/fs/unionfs/union_subr.c (revision 89306) @@ -1,1360 +1,1365 @@ /* * Copyright (c) 1994 Jan-Simon Pendry * Copyright (c) 1994 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)union_subr.c 8.20 (Berkeley) 5/20/95 * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for vnode_pager_setsize */ #include #include /* for vm cache coherency */ #include #include extern int union_init __P((void)); /* must be power of two, otherwise change UNION_HASH() */ #define NHASH 32 /* unsigned int ... 
*/ #define UNION_HASH(u, l) \ (((((uintptr_t) (u)) + ((uintptr_t) l)) >> 8) & (NHASH-1)) static LIST_HEAD(unhead, union_node) unhead[NHASH]; static int unvplock[NHASH]; static void union_dircache_r __P((struct vnode *vp, struct vnode ***vppp, int *cntp)); static int union_list_lock __P((int ix)); static void union_list_unlock __P((int ix)); static int union_relookup __P((struct union_mount *um, struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct componentname *cn, char *path, int pathlen)); static void union_updatevp __P((struct union_node *un, struct vnode *uppervp, struct vnode *lowervp)); static void union_newlower __P((struct union_node *, struct vnode *)); static void union_newupper __P((struct union_node *, struct vnode *)); static int union_copyfile __P((struct vnode *, struct vnode *, struct ucred *, struct thread *)); static int union_vn_create __P((struct vnode **, struct union_node *, struct thread *)); static int union_vn_close __P((struct vnode *, int, struct ucred *, struct thread *)); int union_init() { int i; for (i = 0; i < NHASH; i++) LIST_INIT(&unhead[i]); bzero((caddr_t)unvplock, sizeof(unvplock)); return (0); } static int union_list_lock(ix) int ix; { if (unvplock[ix] & UNVP_LOCKED) { unvplock[ix] |= UNVP_WANT; (void) tsleep((caddr_t) &unvplock[ix], PINOD, "unllck", 0); return (1); } unvplock[ix] |= UNVP_LOCKED; return (0); } static void union_list_unlock(ix) int ix; { unvplock[ix] &= ~UNVP_LOCKED; if (unvplock[ix] & UNVP_WANT) { unvplock[ix] &= ~UNVP_WANT; wakeup((caddr_t) &unvplock[ix]); } } /* * union_updatevp: * * The uppervp, if not NULL, must be referenced and not locked by us * The lowervp, if not NULL, must be referenced. * * if uppervp and lowervp match pointers already installed, nothing * happens. The passed vp's (when matching) are not adjusted. This * routine may only be called by union_newupper() and union_newlower(). 
*/ static void union_updatevp(un, uppervp, lowervp) struct union_node *un; struct vnode *uppervp; struct vnode *lowervp; { int ohash = UNION_HASH(un->un_uppervp, un->un_lowervp); int nhash = UNION_HASH(uppervp, lowervp); int docache = (lowervp != NULLVP || uppervp != NULLVP); int lhash, uhash; /* * Ensure locking is ordered from lower to higher * to avoid deadlocks. */ if (nhash < ohash) { lhash = nhash; uhash = ohash; } else { lhash = ohash; uhash = nhash; } if (lhash != uhash) { while (union_list_lock(lhash)) continue; } while (union_list_lock(uhash)) continue; if (ohash != nhash || !docache) { if (un->un_flags & UN_CACHED) { un->un_flags &= ~UN_CACHED; LIST_REMOVE(un, un_cache); } } if (ohash != nhash) union_list_unlock(ohash); if (un->un_lowervp != lowervp) { if (un->un_lowervp) { vrele(un->un_lowervp); if (un->un_path) { free(un->un_path, M_TEMP); un->un_path = 0; } } un->un_lowervp = lowervp; un->un_lowersz = VNOVAL; } if (un->un_uppervp != uppervp) { if (un->un_uppervp) vrele(un->un_uppervp); un->un_uppervp = uppervp; un->un_uppersz = VNOVAL; } if (docache && (ohash != nhash)) { LIST_INSERT_HEAD(&unhead[nhash], un, un_cache); un->un_flags |= UN_CACHED; } union_list_unlock(nhash); } /* * Set a new lowervp. The passed lowervp must be referenced and will be * stored in the vp in a referenced state. */ static void union_newlower(un, lowervp) struct union_node *un; struct vnode *lowervp; { union_updatevp(un, un->un_uppervp, lowervp); } /* * Set a new uppervp. The passed uppervp must be locked and will be * stored in the vp in a locked state. The caller should not unlock * uppervp. */ static void union_newupper(un, uppervp) struct union_node *un; struct vnode *uppervp; { union_updatevp(un, uppervp, un->un_lowervp); } /* * Keep track of size changes in the underlying vnodes. * If the size changes, then callback to the vm layer * giving priority to the upper layer size. 
*/ void union_newsize(vp, uppersz, lowersz) struct vnode *vp; off_t uppersz, lowersz; { struct union_node *un; off_t sz; /* only interested in regular files */ if (vp->v_type != VREG) return; un = VTOUNION(vp); sz = VNOVAL; if ((uppersz != VNOVAL) && (un->un_uppersz != uppersz)) { un->un_uppersz = uppersz; if (sz == VNOVAL) sz = un->un_uppersz; } if ((lowersz != VNOVAL) && (un->un_lowersz != lowersz)) { un->un_lowersz = lowersz; if (sz == VNOVAL) sz = un->un_lowersz; } if (sz != VNOVAL) { UDEBUG(("union: %s size now %ld\n", (uppersz != VNOVAL ? "upper" : "lower"), (long)sz)); /* * There is no need to change size of non-existent object */ /* vnode_pager_setsize(vp, sz); */ } } /* * union_allocvp: allocate a union_node and associate it with a * parent union_node and one or two vnodes. * * vpp Holds the returned vnode locked and referenced if no * error occurs. * * mp Holds the mount point. mp may or may not be busied. * allocvp makes no changes to mp. * * dvp Holds the parent union_node to the one we wish to create. * XXX may only be used to traverse an uncopied lowervp-based * tree? XXX * * dvp may or may not be locked. allocvp makes no changes * to dvp. * * upperdvp Holds the parent vnode to uppervp, generally used along * with path component information to create a shadow of * lowervp when uppervp does not exist. * * upperdvp is referenced but unlocked on entry, and will be * dereferenced on return. * * uppervp Holds the new uppervp vnode to be stored in the * union_node we are allocating. uppervp is referenced but * not locked, and will be dereferenced on return. * * lowervp Holds the new lowervp vnode to be stored in the * union_node we are allocating. lowervp is referenced but * not locked, and will be dereferenced on return. * * cnp Holds path component information to be coupled with * lowervp and upperdvp to allow unionfs to create an uppervp * later on. Only used if lowervp is valid. The conents * of cnp is only valid for the duration of the call. 
* * docache Determine whether this node should be entered in the * cache or whether it should be destroyed as soon as possible. * * all union_nodes are maintained on a singly-linked * list. new nodes are only allocated when they cannot * be found on this list. entries on the list are * removed when the vfs reclaim entry is called. * * a single lock is kept for the entire list. this is * needed because the getnewvnode() function can block * waiting for a vnode to become free, in which case there * may be more than one process trying to get the same * vnode. this lock is only taken if we are going to * call getnewvnode, since the kernel itself is single-threaded. * * if an entry is found on the list, then call vget() to * take a reference. this is done because there may be * zero references to it and so it needs to removed from * the vnode free list. */ int union_allocvp(vpp, mp, dvp, upperdvp, cnp, uppervp, lowervp, docache) struct vnode **vpp; struct mount *mp; struct vnode *dvp; /* parent union vnode */ struct vnode *upperdvp; /* parent vnode of uppervp */ struct componentname *cnp; /* may be null */ struct vnode *uppervp; /* may be null */ struct vnode *lowervp; /* may be null */ int docache; { int error; struct union_node *un = 0; struct union_mount *um = MOUNTTOUNIONMOUNT(mp); struct thread *td = (cnp) ? 
cnp->cn_thread : curthread; int hash = 0; int vflag; int try; if (uppervp == NULLVP && lowervp == NULLVP) panic("union: unidentifiable allocation"); if (uppervp && lowervp && (uppervp->v_type != lowervp->v_type)) { vrele(lowervp); lowervp = NULLVP; } /* detect the root vnode (and aliases) */ vflag = 0; if ((uppervp == um->um_uppervp) && ((lowervp == NULLVP) || lowervp == um->um_lowervp)) { if (lowervp == NULLVP) { lowervp = um->um_lowervp; if (lowervp != NULLVP) VREF(lowervp); } vflag = VROOT; } loop: if (!docache) { un = 0; } else for (try = 0; try < 3; try++) { switch (try) { case 0: if (lowervp == NULLVP) continue; hash = UNION_HASH(uppervp, lowervp); break; case 1: if (uppervp == NULLVP) continue; hash = UNION_HASH(uppervp, NULLVP); break; case 2: if (lowervp == NULLVP) continue; hash = UNION_HASH(NULLVP, lowervp); break; } while (union_list_lock(hash)) continue; LIST_FOREACH(un, &unhead[hash], un_cache) { if ((un->un_lowervp == lowervp || un->un_lowervp == NULLVP) && (un->un_uppervp == uppervp || un->un_uppervp == NULLVP) && (UNIONTOV(un)->v_mount == mp)) { if (vget(UNIONTOV(un), 0, cnp ? cnp->cn_thread : NULL)) { union_list_unlock(hash); goto loop; } break; } } union_list_unlock(hash); if (un) break; } if (un) { /* * Obtain a lock on the union_node. Everything is unlocked * except for dvp, so check that case. If they match, our * new un is already locked. Otherwise we have to lock our * new un. * * A potential deadlock situation occurs when we are holding * one lock while trying to get another. We must follow * strict ordering rules to avoid it. We try to locate dvp * by scanning up from un_vnode, since the most likely * scenario is un being under dvp. */ if (dvp && un->un_vnode != dvp) { struct vnode *scan = un->un_vnode; do { scan = VTOUNION(scan)->un_pvp; } while (scan && scan->v_tag == VT_UNION && scan != dvp); if (scan != dvp) { /* * our new un is above dvp (we never saw dvp * while moving up the tree). 
*/ VREF(dvp); VOP_UNLOCK(dvp, 0, td); error = vn_lock(un->un_vnode, LK_EXCLUSIVE, td); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td); vrele(dvp); } else { /* * our new un is under dvp */ error = vn_lock(un->un_vnode, LK_EXCLUSIVE, td); } } else if (dvp == NULLVP) { /* * dvp is NULL, we need to lock un. */ error = vn_lock(un->un_vnode, LK_EXCLUSIVE, td); } else { /* * dvp == un->un_vnode, we are already locked. */ error = 0; } if (error) goto loop; /* * At this point, the union_node is locked and referenced. * * uppervp is locked and referenced or NULL, lowervp is * referenced or NULL. */ UDEBUG(("Modify existing un %p vn %p upper %p(refs %d) -> %p(refs %d)\n", un, un->un_vnode, un->un_uppervp, (un->un_uppervp ? un->un_uppervp->v_usecount : -99), uppervp, (uppervp ? uppervp->v_usecount : -99) )); if (uppervp != un->un_uppervp) { KASSERT(uppervp == NULL || uppervp->v_usecount > 0, ("union_allocvp: too few refs %d (at least 1 required) on uppervp", uppervp->v_usecount)); union_newupper(un, uppervp); } else if (uppervp) { KASSERT(uppervp->v_usecount > 1, ("union_allocvp: too few refs %d (at least 2 required) on uppervp", uppervp->v_usecount)); vrele(uppervp); } /* * Save information about the lower layer. * This needs to keep track of pathname * and directory information which union_vn_create * might need. */ if (lowervp != un->un_lowervp) { union_newlower(un, lowervp); if (cnp && (lowervp != NULLVP)) { un->un_path = malloc(cnp->cn_namelen+1, M_TEMP, M_WAITOK); bcopy(cnp->cn_nameptr, un->un_path, cnp->cn_namelen); un->un_path[cnp->cn_namelen] = '\0'; } } else if (lowervp) { vrele(lowervp); } /* * and upperdvp */ if (upperdvp != un->un_dirvp) { if (un->un_dirvp) vrele(un->un_dirvp); un->un_dirvp = upperdvp; } else if (upperdvp) { vrele(upperdvp); } *vpp = UNIONTOV(un); return (0); } if (docache) { /* * otherwise lock the vp list while we call getnewvnode * since that can block. 
*/ hash = UNION_HASH(uppervp, lowervp); if (union_list_lock(hash)) goto loop; } /* * Create new node rather then replace old node */ error = getnewvnode(VT_UNION, mp, union_vnodeop_p, vpp); if (error) { /* * If an error occurs clear out vnodes. */ if (lowervp) vrele(lowervp); if (uppervp) vrele(uppervp); if (upperdvp) vrele(upperdvp); *vpp = NULL; goto out; } MALLOC((*vpp)->v_data, void *, sizeof(struct union_node), M_TEMP, M_WAITOK); (*vpp)->v_flag |= vflag; if (uppervp) (*vpp)->v_type = uppervp->v_type; else (*vpp)->v_type = lowervp->v_type; un = VTOUNION(*vpp); bzero(un, sizeof(*un)); lockinit(&un->un_lock, PVFS, "unlock", VLKTIMEOUT, 0); vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td); un->un_vnode = *vpp; un->un_uppervp = uppervp; un->un_uppersz = VNOVAL; un->un_lowervp = lowervp; un->un_lowersz = VNOVAL; un->un_dirvp = upperdvp; un->un_pvp = dvp; /* only parent dir in new allocation */ if (dvp != NULLVP) VREF(dvp); un->un_dircache = 0; un->un_openl = 0; if (cnp && (lowervp != NULLVP)) { un->un_path = malloc(cnp->cn_namelen+1, M_TEMP, M_WAITOK); bcopy(cnp->cn_nameptr, un->un_path, cnp->cn_namelen); un->un_path[cnp->cn_namelen] = '\0'; } else { un->un_path = 0; un->un_dirvp = NULL; } if (docache) { LIST_INSERT_HEAD(&unhead[hash], un, un_cache); un->un_flags |= UN_CACHED; } out: if (docache) union_list_unlock(hash); return (error); } int union_freevp(vp) struct vnode *vp; { struct union_node *un = VTOUNION(vp); if (un->un_flags & UN_CACHED) { un->un_flags &= ~UN_CACHED; LIST_REMOVE(un, un_cache); } if (un->un_pvp != NULLVP) { vrele(un->un_pvp); un->un_pvp = NULL; } if (un->un_uppervp != NULLVP) { vrele(un->un_uppervp); un->un_uppervp = NULL; } if (un->un_lowervp != NULLVP) { vrele(un->un_lowervp); un->un_lowervp = NULL; } if (un->un_dirvp != NULLVP) { vrele(un->un_dirvp); un->un_dirvp = NULL; } if (un->un_path) { free(un->un_path, M_TEMP); un->un_path = NULL; } lockdestroy(&un->un_lock); FREE(vp->v_data, M_TEMP); vp->v_data = 0; return (0); } /* * copyfile. 
copy the vnode (fvp) to the vnode (tvp) * using a sequence of reads and writes. both (fvp) * and (tvp) are locked on entry and exit. * * fvp and tvp are both exclusive locked on call, but their refcount's * haven't been bumped at all. */ static int union_copyfile(fvp, tvp, cred, td) struct vnode *fvp; struct vnode *tvp; struct ucred *cred; struct thread *td; { char *buf; struct uio uio; struct iovec iov; int error = 0; /* * strategy: * allocate a buffer of size MAXBSIZE. * loop doing reads and writes, keeping track * of the current uio offset. * give up at the first sign of trouble. */ bzero(&uio, sizeof(uio)); uio.uio_td = td; uio.uio_segflg = UIO_SYSSPACE; uio.uio_offset = 0; VOP_LEASE(fvp, td, cred, LEASE_READ); VOP_LEASE(tvp, td, cred, LEASE_WRITE); buf = malloc(MAXBSIZE, M_TEMP, M_WAITOK); /* ugly loop follows... */ do { off_t offset = uio.uio_offset; int count; int bufoffset; /* * Setup for big read */ uio.uio_iov = &iov; uio.uio_iovcnt = 1; iov.iov_base = buf; iov.iov_len = MAXBSIZE; uio.uio_resid = iov.iov_len; uio.uio_rw = UIO_READ; if ((error = VOP_READ(fvp, &uio, 0, cred)) != 0) break; /* * Get bytes read, handle read eof case and setup for * write loop */ if ((count = MAXBSIZE - uio.uio_resid) == 0) break; bufoffset = 0; /* * Write until an error occurs or our buffer has been * exhausted, then update the offset for the next read. */ while (bufoffset < count) { uio.uio_iov = &iov; uio.uio_iovcnt = 1; iov.iov_base = buf + bufoffset; iov.iov_len = count - bufoffset; uio.uio_offset = offset + bufoffset; uio.uio_rw = UIO_WRITE; uio.uio_resid = iov.iov_len; if ((error = VOP_WRITE(tvp, &uio, 0, cred)) != 0) break; bufoffset += (count - bufoffset) - uio.uio_resid; } uio.uio_offset = offset + bufoffset; } while (error == 0); free(buf, M_TEMP); return (error); } /* * * un's vnode is assumed to be locked on entry and remains locked on exit. 
*/ int union_copyup(un, docopy, cred, td) struct union_node *un; int docopy; struct ucred *cred; struct thread *td; { int error; struct mount *mp; struct vnode *lvp, *uvp; /* * If the user does not have read permission, the vnode should not * be copied to upper layer. */ vn_lock(un->un_lowervp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_ACCESS(un->un_lowervp, VREAD, cred, td); VOP_UNLOCK(un->un_lowervp, 0, td); if (error) return (error); if ((error = vn_start_write(un->un_dirvp, &mp, V_WAIT | PCATCH)) != 0) return (error); if ((error = union_vn_create(&uvp, un, td)) != 0) { vn_finished_write(mp); return (error); } lvp = un->un_lowervp; KASSERT(uvp->v_usecount > 0, ("copy: uvp refcount 0: %d", uvp->v_usecount)); if (docopy) { /* * XX - should not ignore errors * from VOP_CLOSE */ vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_OPEN(lvp, FREAD, cred, td); if (error == 0 && vn_canvmio(lvp) == TRUE) error = vfs_object_create(lvp, td, cred); if (error == 0) { error = union_copyfile(lvp, uvp, cred, td); VOP_UNLOCK(lvp, 0, td); (void) VOP_CLOSE(lvp, FREAD, cred, td); } if (error == 0) UDEBUG(("union: copied up %s\n", un->un_path)); } VOP_UNLOCK(uvp, 0, td); vn_finished_write(mp); union_newupper(un, uvp); KASSERT(uvp->v_usecount > 0, ("copy: uvp refcount 0: %d", uvp->v_usecount)); union_vn_close(uvp, FWRITE, cred, td); KASSERT(uvp->v_usecount > 0, ("copy: uvp refcount 0: %d", uvp->v_usecount)); /* * Subsequent IOs will go to the top layer, so * call close on the lower vnode and open on the * upper vnode to ensure that the filesystem keeps * its references counts right. This doesn't do * the right thing with (cred) and (FREAD) though. * Ignoring error returns is not right, either. 
*/ if (error == 0) { int i; for (i = 0; i < un->un_openl; i++) { (void) VOP_CLOSE(lvp, FREAD, cred, td); (void) VOP_OPEN(uvp, FREAD, cred, td); } if (un->un_openl) { if (vn_canvmio(uvp) == TRUE) error = vfs_object_create(uvp, td, cred); } un->un_openl = 0; } return (error); } /* * union_relookup: * * dvp should be locked on entry and will be locked on return. No * net change in the ref count will occur. * * If an error is returned, *vpp will be invalid, otherwise it * will hold a locked, referenced vnode. If *vpp == dvp then * remember that only one exclusive lock is held. */ static int union_relookup(um, dvp, vpp, cnp, cn, path, pathlen) struct union_mount *um; struct vnode *dvp; struct vnode **vpp; struct componentname *cnp; struct componentname *cn; char *path; int pathlen; { int error; /* * A new componentname structure must be faked up because * there is no way to know where the upper level cnp came * from or what it is being used for. This must duplicate * some of the work done by NDINIT, some of the work done * by namei, some of the work done by lookup and some of * the work done by VOP_LOOKUP when given a CREATE flag. * Conclusion: Horrible. */ cn->cn_namelen = pathlen; cn->cn_pnbuf = zalloc(namei_zone); bcopy(path, cn->cn_pnbuf, cn->cn_namelen); cn->cn_pnbuf[cn->cn_namelen] = '\0'; cn->cn_nameiop = CREATE; cn->cn_flags = (LOCKPARENT|LOCKLEAF|HASBUF|SAVENAME|ISLASTCN); cn->cn_thread = cnp->cn_thread; if (um->um_op == UNMNT_ABOVE) cn->cn_cred = cnp->cn_cred; else cn->cn_cred = um->um_cred; cn->cn_nameptr = cn->cn_pnbuf; cn->cn_consume = cnp->cn_consume; VREF(dvp); VOP_UNLOCK(dvp, 0, cnp->cn_thread); /* * Pass dvp unlocked and referenced on call to relookup(). * * If an error occurs, dvp will be returned unlocked and dereferenced. 
*/ if ((error = relookup(dvp, vpp, cn)) != 0) { vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, cnp->cn_thread); return(error); } /* * If no error occurs, dvp will be returned locked with the reference * left as before, and vpp will be returned referenced and locked. * * We want to return with dvp as it was passed to us, so we get * rid of our reference. */ vrele(dvp); return (0); } /* * Create a shadow directory in the upper layer. * The new vnode is returned locked. * * (um) points to the union mount structure for access to the * the mounting process's credentials. * (dvp) is the directory in which to create the shadow directory, * it is locked (but not ref'd) on entry and return. * (cnp) is the componentname to be created. * (vpp) is the returned newly created shadow directory, which * is returned locked and ref'd */ int union_mkshadow(um, dvp, cnp, vpp) struct union_mount *um; struct vnode *dvp; struct componentname *cnp; struct vnode **vpp; { int error; struct vattr va; struct thread *td = cnp->cn_thread; struct componentname cn; struct mount *mp; if ((error = vn_start_write(dvp, &mp, V_WAIT | PCATCH)) != 0) return (error); if ((error = union_relookup(um, dvp, vpp, cnp, &cn, cnp->cn_nameptr, cnp->cn_namelen)) != 0) { vn_finished_write(mp); return (error); } if (*vpp) { if (cn.cn_flags & HASBUF) { zfree(namei_zone, cn.cn_pnbuf); cn.cn_flags &= ~HASBUF; } if (dvp == *vpp) vrele(*vpp); else vput(*vpp); vn_finished_write(mp); *vpp = NULLVP; return (EEXIST); } /* * policy: when creating the shadow directory in the * upper layer, create it owned by the user who did * the mount, group from parent directory, and mode * 777 modified by umask (ie mostly identical to the * mkdir syscall). 
(jsp, kb) */ VATTR_NULL(&va); va.va_type = VDIR; va.va_mode = um->um_cmode; /* VOP_LEASE: dvp is locked */ VOP_LEASE(dvp, td, cn.cn_cred, LEASE_WRITE); error = VOP_MKDIR(dvp, vpp, &cn, &va); if (cn.cn_flags & HASBUF) { zfree(namei_zone, cn.cn_pnbuf); cn.cn_flags &= ~HASBUF; } /*vput(dvp);*/ vn_finished_write(mp); return (error); } /* * Create a whiteout entry in the upper layer. * * (um) points to the union mount structure for access to the * the mounting process's credentials. * (dvp) is the directory in which to create the whiteout. * it is locked on entry and return. * (cnp) is the componentname to be created. */ int union_mkwhiteout(um, dvp, cnp, path) struct union_mount *um; struct vnode *dvp; struct componentname *cnp; char *path; { int error; struct thread *td = cnp->cn_thread; struct vnode *wvp; struct componentname cn; struct mount *mp; if ((error = vn_start_write(dvp, &mp, V_WAIT | PCATCH)) != 0) return (error); error = union_relookup(um, dvp, &wvp, cnp, &cn, path, strlen(path)); if (error) { vn_finished_write(mp); return (error); } if (wvp) { if (cn.cn_flags & HASBUF) { zfree(namei_zone, cn.cn_pnbuf); cn.cn_flags &= ~HASBUF; } if (wvp == dvp) vrele(wvp); else vput(wvp); vn_finished_write(mp); return (EEXIST); } /* VOP_LEASE: dvp is locked */ VOP_LEASE(dvp, td, td->td_proc->p_ucred, LEASE_WRITE); error = VOP_WHITEOUT(dvp, &cn, CREATE); if (cn.cn_flags & HASBUF) { zfree(namei_zone, cn.cn_pnbuf); cn.cn_flags &= ~HASBUF; } vn_finished_write(mp); return (error); } /* * union_vn_create: creates and opens a new shadow file * on the upper union layer. this function is similar * in spirit to calling vn_open but it avoids calling namei(). * the problem with calling namei is that a) it locks too many * things, and b) it doesn't start at the "right" directory, * whereas relookup is told where to start. * * On entry, the vnode associated with un is locked. It remains locked * on return. * * If no error occurs, *vpp contains a locked referenced vnode for your * use. 
If an error occurs *vpp iis undefined. */ static int union_vn_create(vpp, un, td) struct vnode **vpp; struct union_node *un; struct thread *td; { struct vnode *vp; struct ucred *cred = td->td_proc->p_ucred; struct vattr vat; struct vattr *vap = &vat; int fmode = FFLAGS(O_WRONLY|O_CREAT|O_TRUNC|O_EXCL); int error; - int cmode = UN_FILEMODE & ~td->td_proc->p_fd->fd_cmask; + int cmode; struct componentname cn; *vpp = NULLVP; + FILEDESC_LOCK(td->td_proc->p_fd); + cmode = UN_FILEMODE & ~td->td_proc->p_fd->fd_cmask; + FILEDESC_UNLOCK(td->td_proc->p_fd); /* * Build a new componentname structure (for the same * reasons outlines in union_mkshadow). * The difference here is that the file is owned by * the current user, rather than by the person who * did the mount, since the current user needs to be * able to write the file (that's why it is being * copied in the first place). */ cn.cn_namelen = strlen(un->un_path); cn.cn_pnbuf = zalloc(namei_zone); bcopy(un->un_path, cn.cn_pnbuf, cn.cn_namelen+1); cn.cn_nameiop = CREATE; cn.cn_flags = (LOCKPARENT|LOCKLEAF|HASBUF|SAVENAME|ISLASTCN); cn.cn_thread = td; cn.cn_cred = td->td_proc->p_ucred; cn.cn_nameptr = cn.cn_pnbuf; cn.cn_consume = 0; /* * Pass dvp unlocked and referenced on call to relookup(). * * If an error occurs, dvp will be returned unlocked and dereferenced. */ VREF(un->un_dirvp); error = relookup(un->un_dirvp, &vp, &cn); if (error) return (error); /* * If no error occurs, dvp will be returned locked with the reference * left as before, and vpp will be returned referenced and locked. */ if (vp) { vput(un->un_dirvp); if (cn.cn_flags & HASBUF) { zfree(namei_zone, cn.cn_pnbuf); cn.cn_flags &= ~HASBUF; } if (vp == un->un_dirvp) vrele(vp); else vput(vp); return (EEXIST); } /* * Good - there was no race to create the file * so go ahead and create it. The permissions * on the file will be 0666 modified by the * current user's umask. Access to the file, while * it is unioned, will require access to the top *and* * bottom files. 
Access when not unioned will simply * require access to the top-level file. * TODO: confirm choice of access permissions. */ VATTR_NULL(vap); vap->va_type = VREG; vap->va_mode = cmode; VOP_LEASE(un->un_dirvp, td, cred, LEASE_WRITE); error = VOP_CREATE(un->un_dirvp, &vp, &cn, vap); if (cn.cn_flags & HASBUF) { zfree(namei_zone, cn.cn_pnbuf); cn.cn_flags &= ~HASBUF; } vput(un->un_dirvp); if (error) return (error); error = VOP_OPEN(vp, fmode, cred, td); if (error == 0 && vn_canvmio(vp) == TRUE) error = vfs_object_create(vp, td, cred); if (error) { vput(vp); return (error); } vp->v_writecount++; *vpp = vp; return (0); } static int union_vn_close(vp, fmode, cred, td) struct vnode *vp; int fmode; struct ucred *cred; struct thread *td; { if (fmode & FWRITE) --vp->v_writecount; return (VOP_CLOSE(vp, fmode, cred, td)); } #if 0 /* * union_removed_upper: * * called with union_node unlocked. XXX */ void union_removed_upper(un) struct union_node *un; { struct thread *td = curthread; /* XXX */ struct vnode **vpp; /* * Do not set the uppervp to NULLVP. If lowervp is NULLVP, * union node will have neither uppervp nor lowervp. We remove * the union node from cache, so that it will not be referenced. */ union_newupper(un, NULLVP); if (un->un_dircache != 0) { for (vpp = un->un_dircache; *vpp != NULLVP; vpp++) vrele(*vpp); free(un->un_dircache, M_TEMP); un->un_dircache = 0; } if (un->un_flags & UN_CACHED) { un->un_flags &= ~UN_CACHED; LIST_REMOVE(un, un_cache); } } #endif /* * determine whether a whiteout is needed * during a remove/rmdir operation. 
*/ int union_dowhiteout(un, cred, td) struct union_node *un; struct ucred *cred; struct thread *td; { struct vattr va; if (un->un_lowervp != NULLVP) return (1); if (VOP_GETATTR(un->un_uppervp, &va, cred, td) == 0 && (va.va_flags & OPAQUE)) return (1); return (0); } static void union_dircache_r(vp, vppp, cntp) struct vnode *vp; struct vnode ***vppp; int *cntp; { struct union_node *un; if (vp->v_op != union_vnodeop_p) { if (vppp) { VREF(vp); *(*vppp)++ = vp; if (--(*cntp) == 0) panic("union: dircache table too small"); } else { (*cntp)++; } return; } un = VTOUNION(vp); if (un->un_uppervp != NULLVP) union_dircache_r(un->un_uppervp, vppp, cntp); if (un->un_lowervp != NULLVP) union_dircache_r(un->un_lowervp, vppp, cntp); } struct vnode * union_dircache(vp, td) struct vnode *vp; struct thread *td; { int cnt; struct vnode *nvp; struct vnode **vpp; struct vnode **dircache; struct union_node *un; int error; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); dircache = VTOUNION(vp)->un_dircache; nvp = NULLVP; if (dircache == NULL) { cnt = 0; union_dircache_r(vp, 0, &cnt); cnt++; dircache = malloc(cnt * sizeof(struct vnode *), M_TEMP, M_WAITOK); vpp = dircache; union_dircache_r(vp, &vpp, &cnt); *vpp = NULLVP; vpp = dircache + 1; } else { vpp = dircache; do { if (*vpp++ == VTOUNION(vp)->un_uppervp) break; } while (*vpp != NULLVP); } if (*vpp == NULLVP) goto out; /*vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);*/ UDEBUG(("ALLOCVP-3 %p ref %d\n", *vpp, (*vpp ? (*vpp)->v_usecount : -99))); VREF(*vpp); error = union_allocvp(&nvp, vp->v_mount, NULLVP, NULLVP, NULL, *vpp, NULLVP, 0); UDEBUG(("ALLOCVP-3B %p ref %d\n", nvp, (*vpp ? 
(*vpp)->v_usecount : -99))); if (error) goto out; VTOUNION(vp)->un_dircache = 0; un = VTOUNION(nvp); un->un_dircache = dircache; out: VOP_UNLOCK(vp, 0, td); return (nvp); } /* * Module glue to remove #ifdef UNION from vfs_syscalls.c */ static int union_dircheck(struct thread *td, struct vnode **vp, struct file *fp) { int error = 0; if ((*vp)->v_op == union_vnodeop_p) { struct vnode *lvp; lvp = union_dircache(*vp, td); if (lvp != NULLVP) { struct vattr va; /* * If the directory is opaque, * then don't show lower entries */ error = VOP_GETATTR(*vp, &va, fp->f_cred, td); if (va.va_flags & OPAQUE) { vput(lvp); lvp = NULL; } } if (lvp != NULLVP) { error = VOP_OPEN(lvp, FREAD, fp->f_cred, td); if (error == 0 && vn_canvmio(lvp) == TRUE) error = vfs_object_create(lvp, td, fp->f_cred); if (error) { vput(lvp); return (error); } VOP_UNLOCK(lvp, 0, td); + FILE_LOCK(fp); fp->f_data = (caddr_t) lvp; fp->f_offset = 0; + FILE_UNLOCK(fp); error = vn_close(*vp, FREAD, fp->f_cred, td); if (error) return (error); *vp = lvp; return -1; /* goto unionread */ } } return error; } static int union_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: union_dircheckp = union_dircheck; break; case MOD_UNLOAD: union_dircheckp = NULL; break; default: break; } return 0; } static moduledata_t union_mod = { "union_dircheck", union_modevent, NULL }; DECLARE_MODULE(union_dircheck, union_mod, SI_SUB_VFS, SI_ORDER_ANY); Index: head/sys/fs/unionfs/union_vfsops.c =================================================================== --- head/sys/fs/unionfs/union_vfsops.c (revision 89305) +++ head/sys/fs/unionfs/union_vfsops.c (revision 89306) @@ -1,488 +1,490 @@ /* * Copyright (c) 1994, 1995 The Regents of the University of California. * Copyright (c) 1994, 1995 Jan-Simon Pendry. * All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)union_vfsops.c 8.20 (Berkeley) 5/20/95 * $FreeBSD$ */ /* * Union Layer */ #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_UNIONFSMNT, "UNION mount", "UNION mount structure"); extern int union_init __P((struct vfsconf *)); static int union_mount __P((struct mount *mp, char *path, caddr_t data, struct nameidata *ndp, struct thread *td)); static int union_root __P((struct mount *mp, struct vnode **vpp)); static int union_statfs __P((struct mount *mp, struct statfs *sbp, struct thread *td)); static int union_unmount __P((struct mount *mp, int mntflags, struct thread *td)); /* * Mount union filesystem */ static int union_mount(mp, path, data, ndp, td) struct mount *mp; char *path; caddr_t data; struct nameidata *ndp; struct thread *td; { int error = 0; struct union_args args; struct vnode *lowerrootvp = NULLVP; struct vnode *upperrootvp = NULLVP; struct union_mount *um = 0; struct ucred *cred = 0; char *cp = 0; int len; u_int size; UDEBUG(("union_mount(mp = %p)\n", (void *)mp)); /* * Disable clustered write, otherwise system becomes unstable. */ mp->mnt_flag |= MNT_NOCLUSTERW; /* * Update is a no-op */ if (mp->mnt_flag & MNT_UPDATE) { /* * Need to provide. * 1. a way to convert between rdonly and rdwr mounts. * 2. support for nfs exports. */ error = EOPNOTSUPP; goto bad; } /* * Get argument */ error = copyin(data, (caddr_t)&args, sizeof(struct union_args)); if (error) goto bad; /* * Obtain lower vnode. Vnode is stored in mp->mnt_vnodecovered. * We need to reference it but not lock it. */ lowerrootvp = mp->mnt_vnodecovered; VREF(lowerrootvp); #if 0 /* * Unlock lower node to avoid deadlock. */ if (lowerrootvp->v_op == union_vnodeop_p) VOP_UNLOCK(lowerrootvp, 0, td); #endif /* * Obtain upper vnode by calling namei() on the path. The * upperrootvp will be turned referenced but not locked. 
*/ NDINIT(ndp, LOOKUP, FOLLOW|WANTPARENT, UIO_USERSPACE, args.target, td); error = namei(ndp); #if 0 if (lowerrootvp->v_op == union_vnodeop_p) vn_lock(lowerrootvp, LK_EXCLUSIVE | LK_RETRY, td); #endif if (error) goto bad; NDFREE(ndp, NDF_ONLY_PNBUF); upperrootvp = ndp->ni_vp; vrele(ndp->ni_dvp); ndp->ni_dvp = NULL; UDEBUG(("mount_root UPPERVP %p locked = %d\n", upperrootvp, VOP_ISLOCKED(upperrootvp, NULL))); /* * Check multi union mount to avoid `lock myself again' panic. * Also require that it be a directory. */ if (upperrootvp == VTOUNION(lowerrootvp)->un_uppervp) { #ifdef DIAGNOSTIC printf("union_mount: multi union mount?\n"); #endif error = EDEADLK; goto bad; } if (upperrootvp->v_type != VDIR) { error = EINVAL; goto bad; } /* * Allocate our union_mount structure and populate the fields. * The vnode references are stored in the union_mount as held, * unlocked references. Depending on the _BELOW flag, the * filesystems are viewed in a different order. In effect this * is the same as providing a mount-under option to the mount * syscall. 
*/ um = (struct union_mount *) malloc(sizeof(struct union_mount), M_UNIONFSMNT, M_WAITOK | M_ZERO); um->um_op = args.mntflags & UNMNT_OPMASK; switch (um->um_op) { case UNMNT_ABOVE: um->um_lowervp = lowerrootvp; um->um_uppervp = upperrootvp; upperrootvp = NULL; lowerrootvp = NULL; break; case UNMNT_BELOW: um->um_lowervp = upperrootvp; um->um_uppervp = lowerrootvp; upperrootvp = NULL; lowerrootvp = NULL; break; case UNMNT_REPLACE: vrele(lowerrootvp); lowerrootvp = NULL; um->um_uppervp = upperrootvp; um->um_lowervp = lowerrootvp; upperrootvp = NULL; break; default: error = EINVAL; goto bad; } /* * Unless the mount is readonly, ensure that the top layer * supports whiteout operations */ if ((mp->mnt_flag & MNT_RDONLY) == 0) { error = VOP_WHITEOUT(um->um_uppervp, NULL, LOOKUP); if (error) goto bad; } um->um_cred = crhold(td->td_proc->p_ucred); + FILEDESC_LOCK(td->td_proc->p_fd); um->um_cmode = UN_DIRMODE &~ td->td_proc->p_fd->fd_cmask; + FILEDESC_UNLOCK(td->td_proc->p_fd); /* * Depending on what you think the MNT_LOCAL flag might mean, * you may want the && to be || on the conditional below. * At the moment it has been defined that the filesystem is * only local if it is all local, ie the MNT_LOCAL flag implies * that the entire namespace is local. If you think the MNT_LOCAL * flag implies that some of the files might be stored locally * then you will want to change the conditional. */ if (um->um_op == UNMNT_ABOVE) { if (((um->um_lowervp == NULLVP) || (um->um_lowervp->v_mount->mnt_flag & MNT_LOCAL)) && (um->um_uppervp->v_mount->mnt_flag & MNT_LOCAL)) mp->mnt_flag |= MNT_LOCAL; } /* * Copy in the upper layer's RDONLY flag. This is for the benefit * of lookup() which explicitly checks the flag, rather than asking * the filesystem for its own opinion. This means, that an update * mount of the underlying filesystem to go from rdonly to rdwr * will leave the unioned view as read-only. 
*/ mp->mnt_flag |= (um->um_uppervp->v_mount->mnt_flag & MNT_RDONLY); mp->mnt_data = (qaddr_t) um; vfs_getnewfsid(mp); switch (um->um_op) { case UNMNT_ABOVE: cp = ":"; break; case UNMNT_BELOW: cp = ":"; break; case UNMNT_REPLACE: cp = ""; break; } len = strlen(cp); bcopy(cp, mp->mnt_stat.f_mntfromname, len); cp = mp->mnt_stat.f_mntfromname + len; len = MNAMELEN - len; (void) copyinstr(args.target, cp, len - 1, &size); bzero(cp + size, len - size); (void)union_statfs(mp, &mp->mnt_stat, td); UDEBUG(("union_mount: from %s, on %s\n", mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname)); return (0); bad: if (um) { if (um->um_uppervp) vrele(um->um_uppervp); if (um->um_lowervp) vrele(um->um_lowervp); /* XXX other fields */ free(um, M_UNIONFSMNT); } if (cred) crfree(cred); if (upperrootvp) vrele(upperrootvp); if (lowerrootvp) vrele(lowerrootvp); return (error); } /* * Free reference to union layer */ static int union_unmount(mp, mntflags, td) struct mount *mp; int mntflags; struct thread *td; { struct union_mount *um = MOUNTTOUNIONMOUNT(mp); int error; int freeing; int flags = 0; UDEBUG(("union_unmount(mp = %p)\n", (void *)mp)); if (mntflags & MNT_FORCE) flags |= FORCECLOSE; /* * Keep flushing vnodes from the mount list. * This is needed because of the un_pvp held * reference to the parent vnode. * If more vnodes have been freed on a given pass, * then try again. The loop will iterate at most * (d) times, where (d) is the maximum tree depth * in the filesystem. */ for (freeing = 0; (error = vflush(mp, 0, flags)) != 0;) { struct vnode *vp; int n; /* count #vnodes held on mount list */ mtx_lock(&mntvnode_mtx); n = 0; TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) n++; mtx_unlock(&mntvnode_mtx); /* if this is unchanged then stop */ if (n == freeing) break; /* otherwise try one more time */ freeing = n; } /* If the most recent vflush failed, the filesystem is still busy. */ if (error) return (error); /* * Discard references to upper and lower target vnodes. 
*/ if (um->um_lowervp) vrele(um->um_lowervp); vrele(um->um_uppervp); crfree(um->um_cred); /* * Finally, throw away the union_mount structure */ free(mp->mnt_data, M_UNIONFSMNT); /* XXX */ mp->mnt_data = 0; return (0); } static int union_root(mp, vpp) struct mount *mp; struct vnode **vpp; { struct union_mount *um = MOUNTTOUNIONMOUNT(mp); int error; /* * Supply an unlocked reference to um_uppervp and to um_lowervp. It * is possible for um_uppervp to be locked without the associated * root union_node being locked. We let union_allocvp() deal with * it. */ UDEBUG(("union_root UPPERVP %p locked = %d\n", um->um_uppervp, VOP_ISLOCKED(um->um_uppervp, NULL))); VREF(um->um_uppervp); if (um->um_lowervp) VREF(um->um_lowervp); error = union_allocvp(vpp, mp, NULLVP, NULLVP, NULL, um->um_uppervp, um->um_lowervp, 1); UDEBUG(("error %d\n", error)); UDEBUG(("union_root2 UPPERVP %p locked = %d\n", um->um_uppervp, VOP_ISLOCKED(um->um_uppervp, NULL))); return (error); } static int union_statfs(mp, sbp, td) struct mount *mp; struct statfs *sbp; struct thread *td; { int error; struct union_mount *um = MOUNTTOUNIONMOUNT(mp); struct statfs mstat; int lbsize; UDEBUG(("union_statfs(mp = %p, lvp = %p, uvp = %p)\n", (void *)mp, (void *)um->um_lowervp, (void *)um->um_uppervp)); bzero(&mstat, sizeof(mstat)); if (um->um_lowervp) { error = VFS_STATFS(um->um_lowervp->v_mount, &mstat, td); if (error) return (error); } /* now copy across the "interesting" information and fake the rest */ #if 0 sbp->f_type = mstat.f_type; sbp->f_flags = mstat.f_flags; sbp->f_bsize = mstat.f_bsize; sbp->f_iosize = mstat.f_iosize; #endif lbsize = mstat.f_bsize; sbp->f_blocks = mstat.f_blocks; sbp->f_bfree = mstat.f_bfree; sbp->f_bavail = mstat.f_bavail; sbp->f_files = mstat.f_files; sbp->f_ffree = mstat.f_ffree; error = VFS_STATFS(um->um_uppervp->v_mount, &mstat, td); if (error) return (error); sbp->f_flags = mstat.f_flags; sbp->f_bsize = mstat.f_bsize; sbp->f_iosize = mstat.f_iosize; /* * if the lower and upper 
blocksizes differ, then frig the * block counts so that the sizes reported by df make some * kind of sense. none of this makes sense though. */ if (mstat.f_bsize != lbsize) sbp->f_blocks = ((off_t) sbp->f_blocks * lbsize) / mstat.f_bsize; /* * The "total" fields count total resources in all layers, * the "free" fields count only those resources which are * free in the upper layer (since only the upper layer * is writeable). */ sbp->f_blocks += mstat.f_blocks; sbp->f_bfree = mstat.f_bfree; sbp->f_bavail = mstat.f_bavail; sbp->f_files += mstat.f_files; sbp->f_ffree = mstat.f_ffree; if (sbp != &mp->mnt_stat) { sbp->f_type = mp->mnt_vfc->vfc_typenum; bcopy(&mp->mnt_stat.f_fsid, &sbp->f_fsid, sizeof(sbp->f_fsid)); bcopy(mp->mnt_stat.f_mntonname, sbp->f_mntonname, MNAMELEN); bcopy(mp->mnt_stat.f_mntfromname, sbp->f_mntfromname, MNAMELEN); } return (0); } static struct vfsops union_vfsops = { union_mount, vfs_stdstart, /* underlying start already done */ union_unmount, union_root, vfs_stdquotactl, union_statfs, vfs_stdsync, /* XXX assumes no cached data on union level */ vfs_stdvget, vfs_stdfhtovp, vfs_stdcheckexp, vfs_stdvptofh, union_init, vfs_stduninit, vfs_stdextattrctl, }; VFS_SET(union_vfsops, unionfs, VFCF_LOOPBACK); Index: head/sys/i386/ibcs2/ibcs2_fcntl.c =================================================================== --- head/sys/i386/ibcs2/ibcs2_fcntl.c (revision 89305) +++ head/sys/i386/ibcs2/ibcs2_fcntl.c (revision 89306) @@ -1,331 +1,335 @@ /* * Copyright (c) 1995 Scott Bartram * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_spx_hack.h" #include #include #include #include #include #include #include #include #include #include #include #include #include static void cvt_iflock2flock __P((struct ibcs2_flock *, struct flock *)); static void cvt_flock2iflock __P((struct flock *, struct ibcs2_flock *)); static int cvt_o_flags __P((int)); static int oflags2ioflags __P((int)); static int ioflags2oflags __P((int)); static int cvt_o_flags(flags) int flags; { int r = 0; /* convert mode into NetBSD mode */ if (flags & IBCS2_O_WRONLY) r |= O_WRONLY; if (flags & IBCS2_O_RDWR) r |= O_RDWR; if (flags & (IBCS2_O_NDELAY | IBCS2_O_NONBLOCK)) r |= O_NONBLOCK; if (flags & IBCS2_O_APPEND) r |= O_APPEND; if (flags & IBCS2_O_SYNC) r |= O_FSYNC; if (flags & IBCS2_O_CREAT) r |= O_CREAT; if (flags & IBCS2_O_TRUNC) r |= O_TRUNC /* | O_CREAT ??? 
*/; if (flags & IBCS2_O_EXCL) r |= O_EXCL; if (flags & IBCS2_O_RDONLY) r |= O_RDONLY; if (flags & IBCS2_O_PRIV) r |= O_EXLOCK; if (flags & IBCS2_O_NOCTTY) r |= O_NOCTTY; return r; } static void cvt_flock2iflock(flp, iflp) struct flock *flp; struct ibcs2_flock *iflp; { switch (flp->l_type) { case F_RDLCK: iflp->l_type = IBCS2_F_RDLCK; break; case F_WRLCK: iflp->l_type = IBCS2_F_WRLCK; break; case F_UNLCK: iflp->l_type = IBCS2_F_UNLCK; break; } iflp->l_whence = (short)flp->l_whence; iflp->l_start = (ibcs2_off_t)flp->l_start; iflp->l_len = (ibcs2_off_t)flp->l_len; iflp->l_sysid = 0; iflp->l_pid = (ibcs2_pid_t)flp->l_pid; } #ifdef DEBUG_IBCS2 static void print_flock(struct flock *flp) { printf("flock: start=%x len=%x pid=%d type=%d whence=%d\n", (int)flp->l_start, (int)flp->l_len, (int)flp->l_pid, flp->l_type, flp->l_whence); } #endif static void cvt_iflock2flock(iflp, flp) struct ibcs2_flock *iflp; struct flock *flp; { flp->l_start = (off_t)iflp->l_start; flp->l_len = (off_t)iflp->l_len; flp->l_pid = (pid_t)iflp->l_pid; switch (iflp->l_type) { case IBCS2_F_RDLCK: flp->l_type = F_RDLCK; break; case IBCS2_F_WRLCK: flp->l_type = F_WRLCK; break; case IBCS2_F_UNLCK: flp->l_type = F_UNLCK; break; } flp->l_whence = iflp->l_whence; } /* convert iBCS2 mode into NetBSD mode */ static int ioflags2oflags(flags) int flags; { int r = 0; if (flags & IBCS2_O_RDONLY) r |= O_RDONLY; if (flags & IBCS2_O_WRONLY) r |= O_WRONLY; if (flags & IBCS2_O_RDWR) r |= O_RDWR; if (flags & IBCS2_O_NDELAY) r |= O_NONBLOCK; if (flags & IBCS2_O_APPEND) r |= O_APPEND; if (flags & IBCS2_O_SYNC) r |= O_FSYNC; if (flags & IBCS2_O_NONBLOCK) r |= O_NONBLOCK; if (flags & IBCS2_O_CREAT) r |= O_CREAT; if (flags & IBCS2_O_TRUNC) r |= O_TRUNC; if (flags & IBCS2_O_EXCL) r |= O_EXCL; if (flags & IBCS2_O_NOCTTY) r |= O_NOCTTY; return r; } /* convert NetBSD mode into iBCS2 mode */ static int oflags2ioflags(flags) int flags; { int r = 0; if (flags & O_RDONLY) r |= IBCS2_O_RDONLY; if (flags & O_WRONLY) r |= 
IBCS2_O_WRONLY; if (flags & O_RDWR) r |= IBCS2_O_RDWR; if (flags & O_NDELAY) r |= IBCS2_O_NONBLOCK; if (flags & O_APPEND) r |= IBCS2_O_APPEND; if (flags & O_FSYNC) r |= IBCS2_O_SYNC; if (flags & O_NONBLOCK) r |= IBCS2_O_NONBLOCK; if (flags & O_CREAT) r |= IBCS2_O_CREAT; if (flags & O_TRUNC) r |= IBCS2_O_TRUNC; if (flags & O_EXCL) r |= IBCS2_O_EXCL; if (flags & O_NOCTTY) r |= IBCS2_O_NOCTTY; return r; } int ibcs2_open(td, uap) struct thread *td; struct ibcs2_open_args *uap; { struct proc *p = td->td_proc; int noctty = SCARG(uap, flags) & IBCS2_O_NOCTTY; int ret; caddr_t sg = stackgap_init(); SCARG(uap, flags) = cvt_o_flags(SCARG(uap, flags)); if (SCARG(uap, flags) & O_CREAT) CHECKALTCREAT(td, &sg, SCARG(uap, path)); else CHECKALTEXIST(td, &sg, SCARG(uap, path)); ret = open(td, (struct open_args *)uap); #ifdef SPX_HACK if (ret == ENXIO) { if (!strcmp(SCARG(uap, path), "/compat/ibcs2/dev/spx")) ret = spx_open(td, uap); } else #endif /* SPX_HACK */ PROC_LOCK(p); if (!ret && !noctty && SESS_LEADER(p) && !(p->p_flag & P_CONTROLT)) { - struct filedesc *fdp = p->p_fd; - struct file *fp = fdp->fd_ofiles[td->td_retval[0]]; + struct file *fp; + fp = ffind_hold(td, td->td_retval[0]); PROC_UNLOCK(p); + if (fp == NULL) + return (EBADF); + /* ignore any error, just give it a try */ if (fp->f_type == DTYPE_VNODE) fo_ioctl(fp, TIOCSCTTY, (caddr_t) 0, td); + fdrop(fp, td); } else PROC_UNLOCK(p); return ret; } int ibcs2_creat(td, uap) struct thread *td; struct ibcs2_creat_args *uap; { struct open_args cup; caddr_t sg = stackgap_init(); CHECKALTCREAT(td, &sg, SCARG(uap, path)); SCARG(&cup, path) = SCARG(uap, path); SCARG(&cup, mode) = SCARG(uap, mode); SCARG(&cup, flags) = O_WRONLY | O_CREAT | O_TRUNC; return open(td, &cup); } int ibcs2_access(td, uap) struct thread *td; struct ibcs2_access_args *uap; { struct access_args cup; caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, path)); SCARG(&cup, path) = SCARG(uap, path); SCARG(&cup, flags) = SCARG(uap, flags); return 
access(td, &cup); } int ibcs2_fcntl(td, uap) struct thread *td; struct ibcs2_fcntl_args *uap; { int error; struct fcntl_args fa; struct flock *flp; struct ibcs2_flock ifl; switch(SCARG(uap, cmd)) { case IBCS2_F_DUPFD: SCARG(&fa, fd) = SCARG(uap, fd); SCARG(&fa, cmd) = F_DUPFD; SCARG(&fa, arg) = (/* XXX */ int)SCARG(uap, arg); return fcntl(td, &fa); case IBCS2_F_GETFD: SCARG(&fa, fd) = SCARG(uap, fd); SCARG(&fa, cmd) = F_GETFD; SCARG(&fa, arg) = (/* XXX */ int)SCARG(uap, arg); return fcntl(td, &fa); case IBCS2_F_SETFD: SCARG(&fa, fd) = SCARG(uap, fd); SCARG(&fa, cmd) = F_SETFD; SCARG(&fa, arg) = (/* XXX */ int)SCARG(uap, arg); return fcntl(td, &fa); case IBCS2_F_GETFL: SCARG(&fa, fd) = SCARG(uap, fd); SCARG(&fa, cmd) = F_GETFL; SCARG(&fa, arg) = (/* XXX */ int)SCARG(uap, arg); error = fcntl(td, &fa); if (error) return error; td->td_retval[0] = oflags2ioflags(td->td_retval[0]); return error; case IBCS2_F_SETFL: SCARG(&fa, fd) = SCARG(uap, fd); SCARG(&fa, cmd) = F_SETFL; SCARG(&fa, arg) = (/* XXX */ int) ioflags2oflags((int)SCARG(uap, arg)); return fcntl(td, &fa); case IBCS2_F_GETLK: { caddr_t sg = stackgap_init(); flp = stackgap_alloc(&sg, sizeof(*flp)); error = copyin((caddr_t)SCARG(uap, arg), (caddr_t)&ifl, ibcs2_flock_len); if (error) return error; cvt_iflock2flock(&ifl, flp); SCARG(&fa, fd) = SCARG(uap, fd); SCARG(&fa, cmd) = F_GETLK; SCARG(&fa, arg) = (/* XXX */ int)flp; error = fcntl(td, &fa); if (error) return error; cvt_flock2iflock(flp, &ifl); return copyout((caddr_t)&ifl, (caddr_t)SCARG(uap, arg), ibcs2_flock_len); } case IBCS2_F_SETLK: { caddr_t sg = stackgap_init(); flp = stackgap_alloc(&sg, sizeof(*flp)); error = copyin((caddr_t)SCARG(uap, arg), (caddr_t)&ifl, ibcs2_flock_len); if (error) return error; cvt_iflock2flock(&ifl, flp); SCARG(&fa, fd) = SCARG(uap, fd); SCARG(&fa, cmd) = F_SETLK; SCARG(&fa, arg) = (/* XXX */ int)flp; return fcntl(td, &fa); } case IBCS2_F_SETLKW: { caddr_t sg = stackgap_init(); flp = stackgap_alloc(&sg, sizeof(*flp)); error = 
copyin((caddr_t)SCARG(uap, arg), (caddr_t)&ifl, ibcs2_flock_len); if (error) return error; cvt_iflock2flock(&ifl, flp); SCARG(&fa, fd) = SCARG(uap, fd); SCARG(&fa, cmd) = F_SETLKW; SCARG(&fa, arg) = (/* XXX */ int)flp; return fcntl(td, &fa); } } return ENOSYS; } Index: head/sys/i386/ibcs2/ibcs2_ioctl.c =================================================================== --- head/sys/i386/ibcs2/ibcs2_ioctl.c (revision 89305) +++ head/sys/i386/ibcs2/ibcs2_ioctl.c (revision 89306) @@ -1,646 +1,693 @@ /* $NetBSD: ibcs2_ioctl.c,v 1.6 1995/03/14 15:12:28 scottb Exp $ */ /* * Copyright (c) 1994, 1995 Scott Bartram * All rights reserved. * * based on compat/sunos/sun_ioctl.c * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void stios2btios __P((struct ibcs2_termios *, struct termios *)); static void btios2stios __P((struct termios *, struct ibcs2_termios *)); static void stios2stio __P((struct ibcs2_termios *, struct ibcs2_termio *)); static void stio2stios __P((struct ibcs2_termio *, struct ibcs2_termios *)); +static int ibcs2_do_ioctl __P((struct proc *, struct ibcs2_ioctl_args *, + struct file *)); int ibcs2_gtty(struct thread *td, struct ibcs2_gtty_args *args) { struct ioctl_args ioctl_arg; ioctl_arg.fd = args->fd; ioctl_arg.com = TIOCGETC; ioctl_arg.data = (caddr_t)args->buf; return ioctl(td, &ioctl_arg); } int ibcs2_stty(struct thread *td, struct ibcs2_stty_args *args) { struct ioctl_args ioctl_arg; ioctl_arg.fd = args->fd; ioctl_arg.com = TIOCSETC; ioctl_arg.data = (caddr_t)args->buf; return ioctl(td, &ioctl_arg); } /* * iBCS2 ioctl calls. 
*/ static struct speedtab sptab[] = { { 0, 0 }, { 50, 1 }, { 75, 2 }, { 110, 3 }, { 134, 4 }, { 135, 4 }, { 150, 5 }, { 200, 6 }, { 300, 7 }, { 600, 8 }, { 1200, 9 }, { 1800, 10 }, { 2400, 11 }, { 4800, 12 }, { 9600, 13 }, { 19200, 14 }, { 38400, 15 }, { -1, -1 } }; static u_long s2btab[] = { 0, 50, 75, 110, 134, 150, 200, 300, 600, 1200, 1800, 2400, 4800, 9600, 19200, 38400, }; static void stios2btios(st, bt) struct ibcs2_termios *st; struct termios *bt; { register u_long l, r; l = st->c_iflag; r = 0; if (l & IBCS2_IGNBRK) r |= IGNBRK; if (l & IBCS2_BRKINT) r |= BRKINT; if (l & IBCS2_IGNPAR) r |= IGNPAR; if (l & IBCS2_PARMRK) r |= PARMRK; if (l & IBCS2_INPCK) r |= INPCK; if (l & IBCS2_ISTRIP) r |= ISTRIP; if (l & IBCS2_INLCR) r |= INLCR; if (l & IBCS2_IGNCR) r |= IGNCR; if (l & IBCS2_ICRNL) r |= ICRNL; if (l & IBCS2_IXON) r |= IXON; if (l & IBCS2_IXANY) r |= IXANY; if (l & IBCS2_IXOFF) r |= IXOFF; if (l & IBCS2_IMAXBEL) r |= IMAXBEL; bt->c_iflag = r; l = st->c_oflag; r = 0; if (l & IBCS2_OPOST) r |= OPOST; if (l & IBCS2_ONLCR) r |= ONLCR; if (l & IBCS2_TAB3) r |= OXTABS; bt->c_oflag = r; l = st->c_cflag; r = 0; switch (l & IBCS2_CSIZE) { case IBCS2_CS5: r |= CS5; break; case IBCS2_CS6: r |= CS6; break; case IBCS2_CS7: r |= CS7; break; case IBCS2_CS8: r |= CS8; break; } if (l & IBCS2_CSTOPB) r |= CSTOPB; if (l & IBCS2_CREAD) r |= CREAD; if (l & IBCS2_PARENB) r |= PARENB; if (l & IBCS2_PARODD) r |= PARODD; if (l & IBCS2_HUPCL) r |= HUPCL; if (l & IBCS2_CLOCAL) r |= CLOCAL; bt->c_cflag = r; bt->c_ispeed = bt->c_ospeed = s2btab[l & 0x0000000f]; l = st->c_lflag; r = 0; if (l & IBCS2_ISIG) r |= ISIG; if (l & IBCS2_ICANON) r |= ICANON; if (l & IBCS2_ECHO) r |= ECHO; if (l & IBCS2_ECHOE) r |= ECHOE; if (l & IBCS2_ECHOK) r |= ECHOK; if (l & IBCS2_ECHONL) r |= ECHONL; if (l & IBCS2_NOFLSH) r |= NOFLSH; if (l & IBCS2_TOSTOP) r |= TOSTOP; bt->c_lflag = r; bt->c_cc[VINTR] = st->c_cc[IBCS2_VINTR] ? 
st->c_cc[IBCS2_VINTR] : _POSIX_VDISABLE; bt->c_cc[VQUIT] = st->c_cc[IBCS2_VQUIT] ? st->c_cc[IBCS2_VQUIT] : _POSIX_VDISABLE; bt->c_cc[VERASE] = st->c_cc[IBCS2_VERASE] ? st->c_cc[IBCS2_VERASE] : _POSIX_VDISABLE; bt->c_cc[VKILL] = st->c_cc[IBCS2_VKILL] ? st->c_cc[IBCS2_VKILL] : _POSIX_VDISABLE; if (bt->c_lflag & ICANON) { bt->c_cc[VEOF] = st->c_cc[IBCS2_VEOF] ? st->c_cc[IBCS2_VEOF] : _POSIX_VDISABLE; bt->c_cc[VEOL] = st->c_cc[IBCS2_VEOL] ? st->c_cc[IBCS2_VEOL] : _POSIX_VDISABLE; } else { bt->c_cc[VMIN] = st->c_cc[IBCS2_VMIN]; bt->c_cc[VTIME] = st->c_cc[IBCS2_VTIME]; } bt->c_cc[VEOL2] = st->c_cc[IBCS2_VEOL2] ? st->c_cc[IBCS2_VEOL2] : _POSIX_VDISABLE; #if 0 bt->c_cc[VSWTCH] = st->c_cc[IBCS2_VSWTCH] ? st->c_cc[IBCS2_VSWTCH] : _POSIX_VDISABLE; #endif bt->c_cc[VSTART] = st->c_cc[IBCS2_VSTART] ? st->c_cc[IBCS2_VSTART] : _POSIX_VDISABLE; bt->c_cc[VSTOP] = st->c_cc[IBCS2_VSTOP] ? st->c_cc[IBCS2_VSTOP] : _POSIX_VDISABLE; bt->c_cc[VSUSP] = st->c_cc[IBCS2_VSUSP] ? st->c_cc[IBCS2_VSUSP] : _POSIX_VDISABLE; bt->c_cc[VDSUSP] = _POSIX_VDISABLE; bt->c_cc[VREPRINT] = _POSIX_VDISABLE; bt->c_cc[VDISCARD] = _POSIX_VDISABLE; bt->c_cc[VWERASE] = _POSIX_VDISABLE; bt->c_cc[VLNEXT] = _POSIX_VDISABLE; bt->c_cc[VSTATUS] = _POSIX_VDISABLE; } static void btios2stios(bt, st) struct termios *bt; struct ibcs2_termios *st; { register u_long l, r; l = bt->c_iflag; r = 0; if (l & IGNBRK) r |= IBCS2_IGNBRK; if (l & BRKINT) r |= IBCS2_BRKINT; if (l & IGNPAR) r |= IBCS2_IGNPAR; if (l & PARMRK) r |= IBCS2_PARMRK; if (l & INPCK) r |= IBCS2_INPCK; if (l & ISTRIP) r |= IBCS2_ISTRIP; if (l & INLCR) r |= IBCS2_INLCR; if (l & IGNCR) r |= IBCS2_IGNCR; if (l & ICRNL) r |= IBCS2_ICRNL; if (l & IXON) r |= IBCS2_IXON; if (l & IXANY) r |= IBCS2_IXANY; if (l & IXOFF) r |= IBCS2_IXOFF; if (l & IMAXBEL) r |= IBCS2_IMAXBEL; st->c_iflag = r; l = bt->c_oflag; r = 0; if (l & OPOST) r |= IBCS2_OPOST; if (l & ONLCR) r |= IBCS2_ONLCR; if (l & OXTABS) r |= IBCS2_TAB3; st->c_oflag = r; l = bt->c_cflag; r = 0; switch (l & CSIZE) { 
case CS5: r |= IBCS2_CS5; break; case CS6: r |= IBCS2_CS6; break; case CS7: r |= IBCS2_CS7; break; case CS8: r |= IBCS2_CS8; break; } if (l & CSTOPB) r |= IBCS2_CSTOPB; if (l & CREAD) r |= IBCS2_CREAD; if (l & PARENB) r |= IBCS2_PARENB; if (l & PARODD) r |= IBCS2_PARODD; if (l & HUPCL) r |= IBCS2_HUPCL; if (l & CLOCAL) r |= IBCS2_CLOCAL; st->c_cflag = r; l = bt->c_lflag; r = 0; if (l & ISIG) r |= IBCS2_ISIG; if (l & ICANON) r |= IBCS2_ICANON; if (l & ECHO) r |= IBCS2_ECHO; if (l & ECHOE) r |= IBCS2_ECHOE; if (l & ECHOK) r |= IBCS2_ECHOK; if (l & ECHONL) r |= IBCS2_ECHONL; if (l & NOFLSH) r |= IBCS2_NOFLSH; if (l & TOSTOP) r |= IBCS2_TOSTOP; st->c_lflag = r; l = ttspeedtab(bt->c_ospeed, sptab); if ((int)l >= 0) st->c_cflag |= l; st->c_cc[IBCS2_VINTR] = bt->c_cc[VINTR] != _POSIX_VDISABLE ? bt->c_cc[VINTR] : 0; st->c_cc[IBCS2_VQUIT] = bt->c_cc[VQUIT] != _POSIX_VDISABLE ? bt->c_cc[VQUIT] : 0; st->c_cc[IBCS2_VERASE] = bt->c_cc[VERASE] != _POSIX_VDISABLE ? bt->c_cc[VERASE] : 0; st->c_cc[IBCS2_VKILL] = bt->c_cc[VKILL] != _POSIX_VDISABLE ? bt->c_cc[VKILL] : 0; if (bt->c_lflag & ICANON) { st->c_cc[IBCS2_VEOF] = bt->c_cc[VEOF] != _POSIX_VDISABLE ? bt->c_cc[VEOF] : 0; st->c_cc[IBCS2_VEOL] = bt->c_cc[VEOL] != _POSIX_VDISABLE ? bt->c_cc[VEOL] : 0; } else { st->c_cc[IBCS2_VMIN] = bt->c_cc[VMIN]; st->c_cc[IBCS2_VTIME] = bt->c_cc[VTIME]; } st->c_cc[IBCS2_VEOL2] = bt->c_cc[VEOL2] != _POSIX_VDISABLE ? bt->c_cc[VEOL2] : 0; st->c_cc[IBCS2_VSWTCH] = 0; st->c_cc[IBCS2_VSUSP] = bt->c_cc[VSUSP] != _POSIX_VDISABLE ? bt->c_cc[VSUSP] : 0; st->c_cc[IBCS2_VSTART] = bt->c_cc[VSTART] != _POSIX_VDISABLE ? bt->c_cc[VSTART] : 0; st->c_cc[IBCS2_VSTOP] = bt->c_cc[VSTOP] != _POSIX_VDISABLE ? 
bt->c_cc[VSTOP] : 0; st->c_line = 0; } static void stios2stio(ts, t) struct ibcs2_termios *ts; struct ibcs2_termio *t; { t->c_iflag = ts->c_iflag; t->c_oflag = ts->c_oflag; t->c_cflag = ts->c_cflag; t->c_lflag = ts->c_lflag; t->c_line = ts->c_line; bcopy(ts->c_cc, t->c_cc, IBCS2_NCC); } static void stio2stios(t, ts) struct ibcs2_termio *t; struct ibcs2_termios *ts; { ts->c_iflag = t->c_iflag; ts->c_oflag = t->c_oflag; ts->c_cflag = t->c_cflag; ts->c_lflag = t->c_lflag; ts->c_line = t->c_line; bcopy(t->c_cc, ts->c_cc, IBCS2_NCC); } int ibcs2_ioctl(td, uap) struct thread *td; struct ibcs2_ioctl_args *uap; { struct proc *p = td->td_proc; - struct filedesc *fdp = p->p_fd; struct file *fp; int error; - if (SCARG(uap, fd) < 0 || SCARG(uap, fd) >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL) { + fp = ffind_hold(td, uap->fd); + if (fp == NULL) { DPRINTF(("ibcs2_ioctl(%d): bad fd %d ", p->p_pid, SCARG(uap, fd))); return EBADF; } if ((fp->f_flag & (FREAD|FWRITE)) == 0) { + fdrop(fp, td); DPRINTF(("ibcs2_ioctl(%d): bad fp flag ", p->p_pid)); return EBADF; } switch (SCARG(uap, cmd)) { case IBCS2_TCGETA: case IBCS2_XCGETA: case IBCS2_OXCGETA: { struct termios bts; struct ibcs2_termios sts; struct ibcs2_termio st; if ((error = fo_ioctl(fp, TIOCGETA, (caddr_t)&bts, td)) != 0) - return error; + break; btios2stios (&bts, &sts); if (SCARG(uap, cmd) == IBCS2_TCGETA) { stios2stio (&sts, &st); error = copyout((caddr_t)&st, SCARG(uap, data), sizeof (st)); #ifdef DEBUG_IBCS2 if (error) DPRINTF(("ibcs2_ioctl(%d): copyout failed ", p->p_pid)); #endif - return error; - } else - return copyout((caddr_t)&sts, SCARG(uap, data), + break; + } else { + error = copyout((caddr_t)&sts, SCARG(uap, data), sizeof (sts)); + break; + } /*NOTREACHED*/ } case IBCS2_TCSETA: case IBCS2_TCSETAW: case IBCS2_TCSETAF: { struct termios bts; struct ibcs2_termios sts; struct ibcs2_termio st; if ((error = copyin(SCARG(uap, data), (caddr_t)&st, sizeof(st))) != 0) { DPRINTF(("ibcs2_ioctl(%d): 
TCSET copyin failed ", p->p_pid)); - return error; + break; } /* get full BSD termios so we don't lose information */ if ((error = fo_ioctl(fp, TIOCGETA, (caddr_t)&bts, td)) != 0) { DPRINTF(("ibcs2_ioctl(%d): TCSET ctl failed fd %d ", p->p_pid, SCARG(uap, fd))); - return error; + break; } /* * convert to iBCS2 termios, copy in information from * termio, and convert back, then set new values. */ btios2stios(&bts, &sts); stio2stios(&st, &sts); stios2btios(&sts, &bts); - return fo_ioctl(fp, SCARG(uap, cmd) - IBCS2_TCSETA + TIOCSETA, + error = fo_ioctl(fp, SCARG(uap, cmd) - IBCS2_TCSETA + TIOCSETA, (caddr_t)&bts, td); + break; } case IBCS2_XCSETA: case IBCS2_XCSETAW: case IBCS2_XCSETAF: { struct termios bts; struct ibcs2_termios sts; if ((error = copyin(SCARG(uap, data), (caddr_t)&sts, - sizeof (sts))) != 0) { - return error; - } + sizeof (sts))) != 0) + break; stios2btios (&sts, &bts); - return fo_ioctl(fp, SCARG(uap, cmd) - IBCS2_XCSETA + TIOCSETA, + error = fo_ioctl(fp, SCARG(uap, cmd) - IBCS2_XCSETA + TIOCSETA, (caddr_t)&bts, td); + break; } case IBCS2_OXCSETA: case IBCS2_OXCSETAW: case IBCS2_OXCSETAF: { struct termios bts; struct ibcs2_termios sts; if ((error = copyin(SCARG(uap, data), (caddr_t)&sts, - sizeof (sts))) != 0) { - return error; - } + sizeof (sts))) != 0) + break; stios2btios (&sts, &bts); - return fo_ioctl(fp, SCARG(uap, cmd) - IBCS2_OXCSETA + TIOCSETA, + error = fo_ioctl(fp, SCARG(uap, cmd) - IBCS2_OXCSETA + TIOCSETA, (caddr_t)&bts, td); + break; } case IBCS2_TCSBRK: DPRINTF(("ibcs2_ioctl(%d): TCSBRK ", p->p_pid)); - return ENOSYS; + error = ENOSYS; + break; case IBCS2_TCXONC: { switch ((int)SCARG(uap, data)) { case 0: case 1: DPRINTF(("ibcs2_ioctl(%d): TCXONC ", p->p_pid)); - return ENOSYS; + error = ENOSYS; + break; case 2: - return fo_ioctl(fp, TIOCSTOP, (caddr_t)0, td); + error = fo_ioctl(fp, TIOCSTOP, (caddr_t)0, td); + break; case 3: - return fo_ioctl(fp, TIOCSTART, (caddr_t)1, td); + error = fo_ioctl(fp, TIOCSTART, (caddr_t)1, td); + break; 
default: - return EINVAL; + error = EINVAL; + break; } + break; } case IBCS2_TCFLSH: { int arg; switch ((int)SCARG(uap, data)) { case 0: arg = FREAD; break; case 1: arg = FWRITE; break; case 2: arg = FREAD | FWRITE; break; default: + fdrop(fp, td); return EINVAL; } - return fo_ioctl(fp, TIOCFLUSH, (caddr_t)&arg, td); + error = fo_ioctl(fp, TIOCFLUSH, (caddr_t)&arg, td); + break; } case IBCS2_TIOCGWINSZ: SCARG(uap, cmd) = TIOCGWINSZ; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_TIOCSWINSZ: SCARG(uap, cmd) = TIOCSWINSZ; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_TIOCGPGRP: + { + pid_t pg_id; + PROC_LOCK(p); - error = copyout((caddr_t)&p->p_pgrp->pg_id, SCARG(uap, data), - sizeof(p->p_pgrp->pg_id)); + pg_id = p->p_pgrp->pg_id; PROC_UNLOCK(p); - return error; + error = copyout((caddr_t)&pg_id, SCARG(uap, data), + sizeof(pg_id)); + break; + } case IBCS2_TIOCSPGRP: /* XXX - is uap->data a pointer to pgid? 
*/ { struct setpgid_args sa; SCARG(&sa, pid) = 0; SCARG(&sa, pgid) = (int)SCARG(uap, data); - if ((error = setpgid(td, &sa)) != 0) - return error; - return 0; + error = setpgid(td, &sa); + break; } case IBCS2_TCGETSC: /* SCO console - get scancode flags */ - return EINTR; /* ENOSYS; */ + error = EINTR; /* ENOSYS; */ + break; case IBCS2_TCSETSC: /* SCO console - set scancode flags */ - return 0; /* ENOSYS; */ + error = 0; /* ENOSYS; */ + break; case IBCS2_JWINSIZE: /* Unix to Jerq I/O control */ { struct ibcs2_jwinsize { char bytex, bytey; short bitx, bity; } ibcs2_jwinsize; PROC_LOCK(p); ibcs2_jwinsize.bytex = 80; /* p->p_session->s_ttyp->t_winsize.ws_col; XXX */ ibcs2_jwinsize.bytey = 25; /* p->p_session->s_ttyp->t_winsize.ws_row; XXX */ ibcs2_jwinsize.bitx = p->p_session->s_ttyp->t_winsize.ws_xpixel; ibcs2_jwinsize.bity = p->p_session->s_ttyp->t_winsize.ws_ypixel; PROC_UNLOCK(p); - return copyout((caddr_t)&ibcs2_jwinsize, SCARG(uap, data), + error = copyout((caddr_t)&ibcs2_jwinsize, SCARG(uap, data), sizeof(ibcs2_jwinsize)); + break; } /* keyboard and display ioctl's -- type 'K' */ case IBCS2_KDGKBMODE: /* get keyboard translation mode */ SCARG(uap, cmd) = KDGKBMODE; /* printf("ioctl KDGKBMODE = %x\n", SCARG(uap, cmd));*/ - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_KDSKBMODE: /* set keyboard translation mode */ SCARG(uap, cmd) = KDSKBMODE; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_KDMKTONE: /* sound tone */ SCARG(uap, cmd) = KDMKTONE; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_KDGETMODE: /* get text/graphics mode */ SCARG(uap, cmd) = KDGETMODE; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_KDSETMODE: /* set text/graphics mode */ SCARG(uap, cmd) = KDSETMODE; - return ioctl(td, (struct 
ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_KDSBORDER: /* set ega color border */ SCARG(uap, cmd) = KDSBORDER; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_KDGKBSTATE: SCARG(uap, cmd) = KDGKBSTATE; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_KDSETRAD: SCARG(uap, cmd) = KDSETRAD; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_KDENABIO: /* enable direct I/O to ports */ SCARG(uap, cmd) = KDENABIO; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_KDDISABIO: /* disable direct I/O to ports */ SCARG(uap, cmd) = KDDISABIO; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_KIOCSOUND: /* start sound generation */ SCARG(uap, cmd) = KIOCSOUND; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_KDGKBTYPE: /* get keyboard type */ SCARG(uap, cmd) = KDGKBTYPE; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_KDGETLED: /* get keyboard LED status */ SCARG(uap, cmd) = KDGETLED; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_KDSETLED: /* set keyboard LED status */ SCARG(uap, cmd) = KDSETLED; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; /* Xenix keyboard and display ioctl's from sys/kd.h -- type 'k' */ case IBCS2_GETFKEY: /* Get function key */ SCARG(uap, cmd) = GETFKEY; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_SETFKEY: /* Set function key */ SCARG(uap, cmd) = SETFKEY; - return ioctl(td, (struct ioctl_args 
*)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_GIO_SCRNMAP: /* Get screen output map table */ SCARG(uap, cmd) = GIO_SCRNMAP; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_PIO_SCRNMAP: /* Set screen output map table */ SCARG(uap, cmd) = PIO_SCRNMAP; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_GIO_KEYMAP: /* Get keyboard map table */ SCARG(uap, cmd) = GIO_KEYMAP; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_PIO_KEYMAP: /* Set keyboard map table */ SCARG(uap, cmd) = PIO_KEYMAP; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; /* socksys */ case IBCS2_SIOCSOCKSYS: - return ibcs2_socksys(td, (struct ibcs2_socksys_args *)uap); + error = ibcs2_socksys(td, (struct ibcs2_socksys_args *)uap); + break; case IBCS2_FIONREAD: case IBCS2_I_NREAD: /* STREAMS */ SCARG(uap, cmd) = FIONREAD; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; default: DPRINTF(("ibcs2_ioctl(%d): unknown cmd 0x%lx ", td->proc->p_pid, SCARG(uap, cmd))); - return ENOSYS; + error = ENOSYS; + break; } - return ENOSYS; + + fdrop(fp, td); + return error; } Index: head/sys/i386/ibcs2/ibcs2_misc.c =================================================================== --- head/sys/i386/ibcs2/ibcs2_misc.c (revision 89305) +++ head/sys/i386/ibcs2/ibcs2_misc.c (revision 89306) @@ -1,1184 +1,1197 @@ /* * Copyright (c) 1995 Steven Wallace * Copyright (c) 1994, 1995 Scott Bartram * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This software was developed by the Computer Systems Engineering group * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and * contributed to Berkeley. 
* * All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Lawrence Berkeley Laboratory. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: Header: sun_misc.c,v 1.16 93/04/07 02:46:27 torek Exp * * @(#)sun_misc.c 8.1 (Berkeley) 6/18/93 * * $FreeBSD$ */ /* * IBCS2 compatibility module. * * IBCS2 system calls that are implemented differently in BSD are * handled here. */ #include #include #include #include #include #include #include #include #include /* Must come after sys/malloc.h */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int ibcs2_ulimit(td, uap) struct thread *td; struct ibcs2_ulimit_args *uap; { #ifdef notyet int error; struct rlimit rl; struct setrlimit_args { int resource; struct rlimit *rlp; } sra; #endif #define IBCS2_GETFSIZE 1 #define IBCS2_SETFSIZE 2 #define IBCS2_GETPSIZE 3 #define IBCS2_GETDTABLESIZE 4 switch (SCARG(uap, cmd)) { case IBCS2_GETFSIZE: td->td_retval[0] = td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur; if (td->td_retval[0] == -1) td->td_retval[0] = 0x7fffffff; return 0; case IBCS2_SETFSIZE: /* XXX - fix this */ #ifdef notyet rl.rlim_cur = SCARG(uap, newlimit); sra.resource = RLIMIT_FSIZE; sra.rlp = &rl; error = setrlimit(td, &sra); if (!error) td->td_retval[0] = td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur; else DPRINTF(("failed ")); return error; #else td->td_retval[0] = SCARG(uap, newlimit); return 0; #endif case IBCS2_GETPSIZE: mtx_assert(&Giant, MA_OWNED); td->td_retval[0] = td->td_proc->p_rlimit[RLIMIT_RSS].rlim_cur; /* XXX */ return 0; case IBCS2_GETDTABLESIZE: uap->cmd = IBCS2_SC_OPEN_MAX; return ibcs2_sysconf(td, (struct ibcs2_sysconf_args *)uap); default: return ENOSYS; } } #define IBCS2_WSTOPPED 0177 #define IBCS2_STOPCODE(sig) ((sig) << 8 | IBCS2_WSTOPPED) int ibcs2_wait(td, uap) struct thread *td; struct ibcs2_wait_args *uap; { int error, status; struct wait_args w4; struct trapframe *tf = td->td_frame; SCARG(&w4, rusage) = NULL; if ((tf->tf_eflags & (PSL_Z|PSL_PF|PSL_N|PSL_V)) == (PSL_Z|PSL_PF|PSL_N|PSL_V)) { /* waitpid */ SCARG(&w4, 
pid) = SCARG(uap, a1); SCARG(&w4, status) = (int *)SCARG(uap, a2); SCARG(&w4, options) = SCARG(uap, a3); } else { /* wait */ SCARG(&w4, pid) = WAIT_ANY; SCARG(&w4, status) = (int *)SCARG(uap, a1); SCARG(&w4, options) = 0; } if ((error = wait4(td, &w4)) != 0) return error; if (SCARG(&w4, status)) { /* this is real iBCS brain-damage */ error = copyin((caddr_t)SCARG(&w4, status), (caddr_t)&status, sizeof(SCARG(&w4, status))); if(error) return error; /* convert status/signal result */ if(WIFSTOPPED(status)) status = IBCS2_STOPCODE(bsd_to_ibcs2_sig[_SIG_IDX(WSTOPSIG(status))]); else if(WIFSIGNALED(status)) status = bsd_to_ibcs2_sig[_SIG_IDX(WTERMSIG(status))]; /* else exit status -- identical */ /* record result/status */ td->td_retval[1] = status; return copyout((caddr_t)&status, (caddr_t)SCARG(&w4, status), sizeof(SCARG(&w4, status))); } return 0; } int ibcs2_execv(td, uap) struct thread *td; struct ibcs2_execv_args *uap; { struct execve_args ea; caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, path)); SCARG(&ea, fname) = SCARG(uap, path); SCARG(&ea, argv) = SCARG(uap, argp); SCARG(&ea, envv) = NULL; return execve(td, &ea); } int ibcs2_execve(td, uap) struct thread *td; struct ibcs2_execve_args *uap; { caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, path)); return execve(td, (struct execve_args *)uap); } int ibcs2_umount(td, uap) struct thread *td; struct ibcs2_umount_args *uap; { struct unmount_args um; SCARG(&um, path) = SCARG(uap, name); SCARG(&um, flags) = 0; return unmount(td, &um); } int ibcs2_mount(td, uap) struct thread *td; struct ibcs2_mount_args *uap; { #ifdef notyet int oflags = SCARG(uap, flags), nflags, error; char fsname[MFSNAMELEN]; if (oflags & (IBCS2_MS_NOSUB | IBCS2_MS_SYS5)) return (EINVAL); if ((oflags & IBCS2_MS_NEWTYPE) == 0) return (EINVAL); nflags = 0; if (oflags & IBCS2_MS_RDONLY) nflags |= MNT_RDONLY; if (oflags & IBCS2_MS_NOSUID) nflags |= MNT_NOSUID; if (oflags & IBCS2_MS_REMOUNT) nflags |= MNT_UPDATE; 
SCARG(uap, flags) = nflags; if (error = copyinstr((caddr_t)SCARG(uap, type), fsname, sizeof fsname, (u_int *)0)) return (error); if (strcmp(fsname, "4.2") == 0) { SCARG(uap, type) = (caddr_t)STACK_ALLOC(); if (error = copyout("ufs", SCARG(uap, type), sizeof("ufs"))) return (error); } else if (strcmp(fsname, "nfs") == 0) { struct ibcs2_nfs_args sna; struct sockaddr_in sain; struct nfs_args na; struct sockaddr sa; if (error = copyin(SCARG(uap, data), &sna, sizeof sna)) return (error); if (error = copyin(sna.addr, &sain, sizeof sain)) return (error); bcopy(&sain, &sa, sizeof sa); sa.sa_len = sizeof(sain); SCARG(uap, data) = (caddr_t)STACK_ALLOC(); na.addr = (struct sockaddr *)((int)SCARG(uap, data) + sizeof na); na.sotype = SOCK_DGRAM; na.proto = IPPROTO_UDP; na.fh = (nfsv2fh_t *)sna.fh; na.flags = sna.flags; na.wsize = sna.wsize; na.rsize = sna.rsize; na.timeo = sna.timeo; na.retrans = sna.retrans; na.hostname = sna.hostname; if (error = copyout(&sa, na.addr, sizeof sa)) return (error); if (error = copyout(&na, SCARG(uap, data), sizeof na)) return (error); } return (mount(td, uap)); #else return EINVAL; #endif } /* * Read iBCS2-style directory entries. We suck them into kernel space so * that they can be massaged before being copied out to user code. Like * SunOS, we squish out `empty' entries. * * This is quite ugly, but what do you expect from compatibility code? 
*/ int ibcs2_getdents(td, uap) struct thread *td; register struct ibcs2_getdents_args *uap; { register struct vnode *vp; register caddr_t inp, buf; /* BSD-format */ register int len, reclen; /* BSD-format */ register caddr_t outp; /* iBCS2-format */ register int resid; /* iBCS2-format */ struct file *fp; struct uio auio; struct iovec aiov; struct ibcs2_dirent idb; off_t off; /* true file offset */ int buflen, error, eofflag; u_long *cookies = NULL, *cookiep; int ncookies; #define BSD_DIRENT(cp) ((struct dirent *)(cp)) #define IBCS2_RECLEN(reclen) (reclen + sizeof(u_short)) if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); - if ((fp->f_flag & FREAD) == 0) + if ((fp->f_flag & FREAD) == 0) { + fdrop(fp, td); return (EBADF); + } vp = (struct vnode *)fp->f_data; - if (vp->v_type != VDIR) /* XXX vnode readdir op should do this */ + if (vp->v_type != VDIR) { /* XXX vnode readdir op should do this */ + fdrop(fp, td); return (EINVAL); + } off = fp->f_offset; #define DIRBLKSIZ 512 /* XXX we used to use ufs's DIRBLKSIZ */ buflen = max(DIRBLKSIZ, SCARG(uap, nbytes)); buflen = min(buflen, MAXBSIZE); buf = malloc(buflen, M_TEMP, M_WAITOK); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); again: aiov.iov_base = buf; aiov.iov_len = buflen; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_resid = buflen; auio.uio_offset = off; if (cookies) { free(cookies, M_TEMP); cookies = NULL; } /* * First we read into the malloc'ed buffer, then * we massage it into user space, one record at a time. 
*/ if ((error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, &ncookies, &cookies)) != 0) goto out; inp = buf; outp = SCARG(uap, buf); resid = SCARG(uap, nbytes); if ((len = buflen - auio.uio_resid) <= 0) goto eof; cookiep = cookies; if (cookies) { /* * When using cookies, the vfs has the option of reading from * a different offset than that supplied (UFS truncates the * offset to a block boundary to make sure that it never reads * partway through a directory entry, even if the directory * has been compacted). */ while (len > 0 && ncookies > 0 && *cookiep <= off) { len -= BSD_DIRENT(inp)->d_reclen; inp += BSD_DIRENT(inp)->d_reclen; cookiep++; ncookies--; } } for (; len > 0; len -= reclen) { if (cookiep && ncookies == 0) break; reclen = BSD_DIRENT(inp)->d_reclen; if (reclen & 3) { printf("ibcs2_getdents: reclen=%d\n", reclen); error = EFAULT; goto out; } if (BSD_DIRENT(inp)->d_fileno == 0) { inp += reclen; /* it is a hole; squish it out */ if (cookiep) { off = *cookiep++; ncookies--; } else off += reclen; continue; } if (reclen > len || resid < IBCS2_RECLEN(reclen)) { /* entry too big for buffer, so just stop */ outp++; break; } /* * Massage in place to make a iBCS2-shaped dirent (otherwise * we have to worry about touching user memory outside of * the copyout() call). 
*/ idb.d_ino = (ibcs2_ino_t)BSD_DIRENT(inp)->d_fileno; idb.d_off = (ibcs2_off_t)off; idb.d_reclen = (u_short)IBCS2_RECLEN(reclen); if ((error = copyout((caddr_t)&idb, outp, 10)) != 0 || (error = copyout(BSD_DIRENT(inp)->d_name, outp + 10, BSD_DIRENT(inp)->d_namlen + 1)) != 0) goto out; /* advance past this real entry */ if (cookiep) { off = *cookiep++; ncookies--; } else off += reclen; inp += reclen; /* advance output past iBCS2-shaped entry */ outp += IBCS2_RECLEN(reclen); resid -= IBCS2_RECLEN(reclen); } /* if we squished out the whole block, try again */ if (outp == SCARG(uap, buf)) goto again; fp->f_offset = off; /* update the vnode offset */ eof: td->td_retval[0] = SCARG(uap, nbytes) - resid; out: + VOP_UNLOCK(vp, 0, td); + fdrop(fp, td); if (cookies) free(cookies, M_TEMP); - VOP_UNLOCK(vp, 0, td); free(buf, M_TEMP); return (error); } int ibcs2_read(td, uap) struct thread *td; struct ibcs2_read_args *uap; { register struct vnode *vp; register caddr_t inp, buf; /* BSD-format */ register int len, reclen; /* BSD-format */ register caddr_t outp; /* iBCS2-format */ register int resid; /* iBCS2-format */ struct file *fp; struct uio auio; struct iovec aiov; struct ibcs2_direct { ibcs2_ino_t ino; char name[14]; } idb; off_t off; /* true file offset */ int buflen, error, eofflag, size; u_long *cookies = NULL, *cookiep; int ncookies; if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) { if (error == EINVAL) return read(td, (struct read_args *)uap); else return error; } - if ((fp->f_flag & FREAD) == 0) + if ((fp->f_flag & FREAD) == 0) { + fdrop(fp, td); return (EBADF); + } vp = (struct vnode *)fp->f_data; + if (vp->v_type != VDIR) { + fdrop(fp, td); + return read(td, (struct read_args *)uap); + } + + off = fp->f_offset; if (vp->v_type != VDIR) return read(td, (struct read_args *)uap); DPRINTF(("ibcs2_read: read directory\n")); - off = fp->f_offset; buflen = max(DIRBLKSIZ, SCARG(uap, nbytes)); buflen = min(buflen, MAXBSIZE); buf = malloc(buflen, M_TEMP, 
M_WAITOK); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); again: aiov.iov_base = buf; aiov.iov_len = buflen; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_resid = buflen; auio.uio_offset = off; if (cookies) { free(cookies, M_TEMP); cookies = NULL; } /* * First we read into the malloc'ed buffer, then * we massage it into user space, one record at a time. */ if ((error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, &ncookies, &cookies)) != 0) { DPRINTF(("VOP_READDIR failed: %d\n", error)); goto out; } inp = buf; outp = SCARG(uap, buf); resid = SCARG(uap, nbytes); if ((len = buflen - auio.uio_resid) <= 0) goto eof; cookiep = cookies; if (cookies) { /* * When using cookies, the vfs has the option of reading from * a different offset than that supplied (UFS truncates the * offset to a block boundary to make sure that it never reads * partway through a directory entry, even if the directory * has been compacted). */ while (len > 0 && ncookies > 0 && *cookiep <= off) { len -= BSD_DIRENT(inp)->d_reclen; inp += BSD_DIRENT(inp)->d_reclen; cookiep++; ncookies--; } } for (; len > 0 && resid > 0; len -= reclen) { if (cookiep && ncookies == 0) break; reclen = BSD_DIRENT(inp)->d_reclen; if (reclen & 3) { printf("ibcs2_read: reclen=%d\n", reclen); error = EFAULT; goto out; } if (BSD_DIRENT(inp)->d_fileno == 0) { inp += reclen; /* it is a hole; squish it out */ if (cookiep) { off = *cookiep++; ncookies--; } else off += reclen; continue; } if (reclen > len || resid < sizeof(struct ibcs2_direct)) { /* entry too big for buffer, so just stop */ outp++; break; } /* * Massage in place to make a iBCS2-shaped dirent (otherwise * we have to worry about touching user memory outside of * the copyout() call). * * TODO: if length(filename) > 14, then break filename into * multiple entries and set inode = 0xffff except last */ idb.ino = (BSD_DIRENT(inp)->d_fileno > 0xfffe) ? 
0xfffe : BSD_DIRENT(inp)->d_fileno; (void)copystr(BSD_DIRENT(inp)->d_name, idb.name, 14, &size); bzero(idb.name + size, 14 - size); if ((error = copyout(&idb, outp, sizeof(struct ibcs2_direct))) != 0) goto out; /* advance past this real entry */ if (cookiep) { off = *cookiep++; ncookies--; } else off += reclen; inp += reclen; /* advance output past iBCS2-shaped entry */ outp += sizeof(struct ibcs2_direct); resid -= sizeof(struct ibcs2_direct); } /* if we squished out the whole block, try again */ if (outp == SCARG(uap, buf)) goto again; fp->f_offset = off; /* update the vnode offset */ eof: td->td_retval[0] = SCARG(uap, nbytes) - resid; out: + VOP_UNLOCK(vp, 0, td); + fdrop(fp, td); if (cookies) free(cookies, M_TEMP); - VOP_UNLOCK(vp, 0, td); free(buf, M_TEMP); return (error); } int ibcs2_mknod(td, uap) struct thread *td; struct ibcs2_mknod_args *uap; { caddr_t sg = stackgap_init(); CHECKALTCREAT(td, &sg, SCARG(uap, path)); if (S_ISFIFO(SCARG(uap, mode))) { struct mkfifo_args ap; SCARG(&ap, path) = SCARG(uap, path); SCARG(&ap, mode) = SCARG(uap, mode); return mkfifo(td, &ap); } else { struct mknod_args ap; SCARG(&ap, path) = SCARG(uap, path); SCARG(&ap, mode) = SCARG(uap, mode); SCARG(&ap, dev) = SCARG(uap, dev); return mknod(td, &ap); } } int ibcs2_getgroups(td, uap) struct thread *td; struct ibcs2_getgroups_args *uap; { int error, i; ibcs2_gid_t *iset = NULL; struct getgroups_args sa; gid_t *gp; caddr_t sg = stackgap_init(); SCARG(&sa, gidsetsize) = SCARG(uap, gidsetsize); if (SCARG(uap, gidsetsize)) { SCARG(&sa, gidset) = stackgap_alloc(&sg, NGROUPS_MAX * sizeof(gid_t *)); iset = stackgap_alloc(&sg, SCARG(uap, gidsetsize) * sizeof(ibcs2_gid_t)); } if ((error = getgroups(td, &sa)) != 0) return error; if (SCARG(uap, gidsetsize) == 0) return 0; for (i = 0, gp = SCARG(&sa, gidset); i < td->td_retval[0]; i++) iset[i] = (ibcs2_gid_t)*gp++; if (td->td_retval[0] && (error = copyout((caddr_t)iset, (caddr_t)SCARG(uap, gidset), sizeof(ibcs2_gid_t) * td->td_retval[0]))) 
return error; return 0; } int ibcs2_setgroups(td, uap) struct thread *td; struct ibcs2_setgroups_args *uap; { int error, i; ibcs2_gid_t *iset; struct setgroups_args sa; gid_t *gp; caddr_t sg = stackgap_init(); SCARG(&sa, gidsetsize) = SCARG(uap, gidsetsize); SCARG(&sa, gidset) = stackgap_alloc(&sg, SCARG(&sa, gidsetsize) * sizeof(gid_t *)); iset = stackgap_alloc(&sg, SCARG(&sa, gidsetsize) * sizeof(ibcs2_gid_t *)); if (SCARG(&sa, gidsetsize)) { if ((error = copyin((caddr_t)SCARG(uap, gidset), (caddr_t)iset, sizeof(ibcs2_gid_t *) * SCARG(uap, gidsetsize))) != 0) return error; } for (i = 0, gp = SCARG(&sa, gidset); i < SCARG(&sa, gidsetsize); i++) *gp++ = (gid_t)iset[i]; return setgroups(td, &sa); } int ibcs2_setuid(td, uap) struct thread *td; struct ibcs2_setuid_args *uap; { struct setuid_args sa; SCARG(&sa, uid) = (uid_t)SCARG(uap, uid); return setuid(td, &sa); } int ibcs2_setgid(td, uap) struct thread *td; struct ibcs2_setgid_args *uap; { struct setgid_args sa; SCARG(&sa, gid) = (gid_t)SCARG(uap, gid); return setgid(td, &sa); } int ibcs2_time(td, uap) struct thread *td; struct ibcs2_time_args *uap; { struct timeval tv; microtime(&tv); td->td_retval[0] = tv.tv_sec; if (SCARG(uap, tp)) return copyout((caddr_t)&tv.tv_sec, (caddr_t)SCARG(uap, tp), sizeof(ibcs2_time_t)); else return 0; } int ibcs2_pathconf(td, uap) struct thread *td; struct ibcs2_pathconf_args *uap; { SCARG(uap, name)++; /* iBCS2 _PC_* defines are offset by one */ return pathconf(td, (struct pathconf_args *)uap); } int ibcs2_fpathconf(td, uap) struct thread *td; struct ibcs2_fpathconf_args *uap; { SCARG(uap, name)++; /* iBCS2 _PC_* defines are offset by one */ return fpathconf(td, (struct fpathconf_args *)uap); } int ibcs2_sysconf(td, uap) struct thread *td; struct ibcs2_sysconf_args *uap; { int mib[2], value, len, error; struct sysctl_args sa; struct __getrlimit_args ga; switch(SCARG(uap, name)) { case IBCS2_SC_ARG_MAX: mib[1] = KERN_ARGMAX; break; case IBCS2_SC_CHILD_MAX: { caddr_t sg = 
stackgap_init(); SCARG(&ga, which) = RLIMIT_NPROC; SCARG(&ga, rlp) = stackgap_alloc(&sg, sizeof(struct rlimit *)); if ((error = getrlimit(td, &ga)) != 0) return error; td->td_retval[0] = SCARG(&ga, rlp)->rlim_cur; return 0; } case IBCS2_SC_CLK_TCK: td->td_retval[0] = hz; return 0; case IBCS2_SC_NGROUPS_MAX: mib[1] = KERN_NGROUPS; break; case IBCS2_SC_OPEN_MAX: { caddr_t sg = stackgap_init(); SCARG(&ga, which) = RLIMIT_NOFILE; SCARG(&ga, rlp) = stackgap_alloc(&sg, sizeof(struct rlimit *)); if ((error = getrlimit(td, &ga)) != 0) return error; td->td_retval[0] = SCARG(&ga, rlp)->rlim_cur; return 0; } case IBCS2_SC_JOB_CONTROL: mib[1] = KERN_JOB_CONTROL; break; case IBCS2_SC_SAVED_IDS: mib[1] = KERN_SAVED_IDS; break; case IBCS2_SC_VERSION: mib[1] = KERN_POSIX1; break; case IBCS2_SC_PASS_MAX: td->td_retval[0] = 128; /* XXX - should we create PASS_MAX ? */ return 0; case IBCS2_SC_XOPEN_VERSION: td->td_retval[0] = 2; /* XXX: What should that be? */ return 0; default: return EINVAL; } mib[0] = CTL_KERN; len = sizeof(value); SCARG(&sa, name) = mib; SCARG(&sa, namelen) = 2; SCARG(&sa, old) = &value; SCARG(&sa, oldlenp) = &len; SCARG(&sa, new) = NULL; SCARG(&sa, newlen) = 0; if ((error = __sysctl(td, &sa)) != 0) return error; td->td_retval[0] = value; return 0; } int ibcs2_alarm(td, uap) struct thread *td; struct ibcs2_alarm_args *uap; { int error; struct itimerval *itp, *oitp; struct setitimer_args sa; caddr_t sg = stackgap_init(); itp = stackgap_alloc(&sg, sizeof(*itp)); oitp = stackgap_alloc(&sg, sizeof(*oitp)); timevalclear(&itp->it_interval); itp->it_value.tv_sec = SCARG(uap, sec); itp->it_value.tv_usec = 0; SCARG(&sa, which) = ITIMER_REAL; SCARG(&sa, itv) = itp; SCARG(&sa, oitv) = oitp; error = setitimer(td, &sa); if (error) return error; if (oitp->it_value.tv_usec) oitp->it_value.tv_sec++; td->td_retval[0] = oitp->it_value.tv_sec; return 0; } int ibcs2_times(td, uap) struct thread *td; struct ibcs2_times_args *uap; { int error; struct getrusage_args ga; struct tms tms; 
struct timeval t; caddr_t sg = stackgap_init(); struct rusage *ru = stackgap_alloc(&sg, sizeof(*ru)); #define CONVTCK(r) (r.tv_sec * hz + r.tv_usec / (1000000 / hz)) SCARG(&ga, who) = RUSAGE_SELF; SCARG(&ga, rusage) = ru; error = getrusage(td, &ga); if (error) return error; tms.tms_utime = CONVTCK(ru->ru_utime); tms.tms_stime = CONVTCK(ru->ru_stime); SCARG(&ga, who) = RUSAGE_CHILDREN; error = getrusage(td, &ga); if (error) return error; tms.tms_cutime = CONVTCK(ru->ru_utime); tms.tms_cstime = CONVTCK(ru->ru_stime); microtime(&t); td->td_retval[0] = CONVTCK(t); return copyout((caddr_t)&tms, (caddr_t)SCARG(uap, tp), sizeof(struct tms)); } int ibcs2_stime(td, uap) struct thread *td; struct ibcs2_stime_args *uap; { int error; struct settimeofday_args sa; caddr_t sg = stackgap_init(); SCARG(&sa, tv) = stackgap_alloc(&sg, sizeof(*SCARG(&sa, tv))); SCARG(&sa, tzp) = NULL; if ((error = copyin((caddr_t)SCARG(uap, timep), &(SCARG(&sa, tv)->tv_sec), sizeof(long))) != 0) return error; SCARG(&sa, tv)->tv_usec = 0; if ((error = settimeofday(td, &sa)) != 0) return EPERM; return 0; } int ibcs2_utime(td, uap) struct thread *td; struct ibcs2_utime_args *uap; { int error; struct utimes_args sa; struct timeval *tp; caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, path)); SCARG(&sa, path) = SCARG(uap, path); if (SCARG(uap, buf)) { struct ibcs2_utimbuf ubuf; if ((error = copyin((caddr_t)SCARG(uap, buf), (caddr_t)&ubuf, sizeof(ubuf))) != 0) return error; SCARG(&sa, tptr) = stackgap_alloc(&sg, 2 * sizeof(struct timeval *)); tp = (struct timeval *)SCARG(&sa, tptr); tp->tv_sec = ubuf.actime; tp->tv_usec = 0; tp++; tp->tv_sec = ubuf.modtime; tp->tv_usec = 0; } else SCARG(&sa, tptr) = NULL; return utimes(td, &sa); } int ibcs2_nice(td, uap) struct thread *td; struct ibcs2_nice_args *uap; { int error; struct setpriority_args sa; SCARG(&sa, which) = PRIO_PROCESS; SCARG(&sa, who) = 0; SCARG(&sa, prio) = td->td_ksegrp->kg_nice + SCARG(uap, incr); if ((error = setpriority(td, &sa)) 
!= 0) return EPERM; td->td_retval[0] = td->td_ksegrp->kg_nice; return 0; } /* * iBCS2 getpgrp, setpgrp, setsid, and setpgid */ int ibcs2_pgrpsys(td, uap) struct thread *td; struct ibcs2_pgrpsys_args *uap; { struct proc *p = td->td_proc; switch (SCARG(uap, type)) { case 0: /* getpgrp */ PROC_LOCK(p); td->td_retval[0] = p->p_pgrp->pg_id; PROC_UNLOCK(p); return 0; case 1: /* setpgrp */ { struct setpgid_args sa; SCARG(&sa, pid) = 0; SCARG(&sa, pgid) = 0; setpgid(td, &sa); PROC_LOCK(p); td->td_retval[0] = p->p_pgrp->pg_id; PROC_UNLOCK(p); return 0; } case 2: /* setpgid */ { struct setpgid_args sa; SCARG(&sa, pid) = SCARG(uap, pid); SCARG(&sa, pgid) = SCARG(uap, pgid); return setpgid(td, &sa); } case 3: /* setsid */ return setsid(td, NULL); default: return EINVAL; } } /* * XXX - need to check for nested calls */ int ibcs2_plock(td, uap) struct thread *td; struct ibcs2_plock_args *uap; { int error; #define IBCS2_UNLOCK 0 #define IBCS2_PROCLOCK 1 #define IBCS2_TEXTLOCK 2 #define IBCS2_DATALOCK 4 if ((error = suser_td(td)) != 0) return EPERM; switch(SCARG(uap, cmd)) { case IBCS2_UNLOCK: case IBCS2_PROCLOCK: case IBCS2_TEXTLOCK: case IBCS2_DATALOCK: return 0; /* XXX - TODO */ } return EINVAL; } int ibcs2_uadmin(td, uap) struct thread *td; struct ibcs2_uadmin_args *uap; { #define SCO_A_REBOOT 1 #define SCO_A_SHUTDOWN 2 #define SCO_A_REMOUNT 4 #define SCO_A_CLOCK 8 #define SCO_A_SETCONFIG 128 #define SCO_A_GETDEV 130 #define SCO_AD_HALT 0 #define SCO_AD_BOOT 1 #define SCO_AD_IBOOT 2 #define SCO_AD_PWRDOWN 3 #define SCO_AD_PWRNAP 4 #define SCO_AD_PANICBOOT 1 #define SCO_AD_GETBMAJ 0 #define SCO_AD_GETCMAJ 1 if (suser_td(td)) return EPERM; switch(SCARG(uap, cmd)) { case SCO_A_REBOOT: case SCO_A_SHUTDOWN: switch(SCARG(uap, func)) { struct reboot_args r; case SCO_AD_HALT: case SCO_AD_PWRDOWN: case SCO_AD_PWRNAP: r.opt = RB_HALT; reboot(td, &r); case SCO_AD_BOOT: case SCO_AD_IBOOT: r.opt = RB_AUTOBOOT; reboot(td, &r); } return EINVAL; case SCO_A_REMOUNT: case SCO_A_CLOCK: case 
SCO_A_SETCONFIG: return 0; case SCO_A_GETDEV: return EINVAL; /* XXX - TODO */ } return EINVAL; } int ibcs2_sysfs(td, uap) struct thread *td; struct ibcs2_sysfs_args *uap; { #define IBCS2_GETFSIND 1 #define IBCS2_GETFSTYP 2 #define IBCS2_GETNFSTYP 3 switch(SCARG(uap, cmd)) { case IBCS2_GETFSIND: case IBCS2_GETFSTYP: case IBCS2_GETNFSTYP: break; } return EINVAL; /* XXX - TODO */ } int ibcs2_unlink(td, uap) struct thread *td; struct ibcs2_unlink_args *uap; { caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, path)); return unlink(td, (struct unlink_args *)uap); } int ibcs2_chdir(td, uap) struct thread *td; struct ibcs2_chdir_args *uap; { caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, path)); return chdir(td, (struct chdir_args *)uap); } int ibcs2_chmod(td, uap) struct thread *td; struct ibcs2_chmod_args *uap; { caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, path)); return chmod(td, (struct chmod_args *)uap); } int ibcs2_chown(td, uap) struct thread *td; struct ibcs2_chown_args *uap; { caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, path)); return chown(td, (struct chown_args *)uap); } int ibcs2_rmdir(td, uap) struct thread *td; struct ibcs2_rmdir_args *uap; { caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, path)); return rmdir(td, (struct rmdir_args *)uap); } int ibcs2_mkdir(td, uap) struct thread *td; struct ibcs2_mkdir_args *uap; { caddr_t sg = stackgap_init(); CHECKALTCREAT(td, &sg, SCARG(uap, path)); return mkdir(td, (struct mkdir_args *)uap); } int ibcs2_symlink(td, uap) struct thread *td; struct ibcs2_symlink_args *uap; { caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, path)); CHECKALTCREAT(td, &sg, SCARG(uap, link)); return symlink(td, (struct symlink_args *)uap); } int ibcs2_rename(td, uap) struct thread *td; struct ibcs2_rename_args *uap; { caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, from)); CHECKALTCREAT(td, &sg, SCARG(uap, to)); return 
rename(td, (struct rename_args *)uap); } int ibcs2_readlink(td, uap) struct thread *td; struct ibcs2_readlink_args *uap; { caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, path)); return readlink(td, (struct readlink_args *) uap); } Index: head/sys/i386/ibcs2/ibcs2_stat.c =================================================================== --- head/sys/i386/ibcs2/ibcs2_stat.c (revision 89305) +++ head/sys/i386/ibcs2/ibcs2_stat.c (revision 89306) @@ -1,257 +1,259 @@ /* * Copyright (c) 1995 Scott Bartram * Copyright (c) 1995 Steven Wallace * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void bsd_stat2ibcs_stat __P((struct stat *, struct ibcs2_stat *)); static int cvt_statfs __P((struct statfs *, caddr_t, int)); static void bsd_stat2ibcs_stat(st, st4) struct stat *st; struct ibcs2_stat *st4; { bzero(st4, sizeof(*st4)); st4->st_dev = (ibcs2_dev_t)st->st_dev; st4->st_ino = (ibcs2_ino_t)st->st_ino; st4->st_mode = (ibcs2_mode_t)st->st_mode; st4->st_nlink= (ibcs2_nlink_t)st->st_nlink; st4->st_uid = (ibcs2_uid_t)st->st_uid; st4->st_gid = (ibcs2_gid_t)st->st_gid; st4->st_rdev = (ibcs2_dev_t)st->st_rdev; if (st->st_size < (quad_t)1 << 32) st4->st_size = (ibcs2_off_t)st->st_size; else st4->st_size = -2; st4->st_atim = (ibcs2_time_t)st->st_atime; st4->st_mtim = (ibcs2_time_t)st->st_mtime; st4->st_ctim = (ibcs2_time_t)st->st_ctime; } static int cvt_statfs(sp, buf, len) struct statfs *sp; caddr_t buf; int len; { struct ibcs2_statfs ssfs; bzero(&ssfs, sizeof ssfs); ssfs.f_fstyp = 0; ssfs.f_bsize = sp->f_bsize; ssfs.f_frsize = 0; ssfs.f_blocks = sp->f_blocks; ssfs.f_bfree = sp->f_bfree; ssfs.f_files = sp->f_files; ssfs.f_ffree = sp->f_ffree; ssfs.f_fname[0] = 0; ssfs.f_fpack[0] = 0; return copyout((caddr_t)&ssfs, buf, len); } int ibcs2_statfs(td, uap) struct thread *td; struct ibcs2_statfs_args *uap; { register struct mount *mp; register struct statfs *sp; int error; struct nameidata nd; caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, path)); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); mp = nd.ni_vp->v_mount; sp = &mp->mnt_stat; vrele(nd.ni_vp); if ((error = VFS_STATFS(mp, sp, td)) != 0) return (error); sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; return cvt_statfs(sp, (caddr_t)SCARG(uap, buf), SCARG(uap, len)); } int ibcs2_fstatfs(td, uap) struct thread *td; struct 
ibcs2_fstatfs_args *uap; { struct file *fp; struct mount *mp; register struct statfs *sp; int error; if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); mp = ((struct vnode *)fp->f_data)->v_mount; sp = &mp->mnt_stat; - if ((error = VFS_STATFS(mp, sp, td)) != 0) + error = VFS_STATFS(mp, sp, td); + fdrop(fp, td); + if (error != 0) return (error); sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; return cvt_statfs(sp, (caddr_t)SCARG(uap, buf), SCARG(uap, len)); } int ibcs2_stat(td, uap) struct thread *td; struct ibcs2_stat_args *uap; { struct stat st; struct ibcs2_stat ibcs2_st; struct stat_args cup; int error; caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, path)); SCARG(&cup, path) = SCARG(uap, path); SCARG(&cup, ub) = stackgap_alloc(&sg, sizeof(st)); if ((error = stat(td, &cup)) != 0) return error; if ((error = copyin(SCARG(&cup, ub), &st, sizeof(st))) != 0) return error; bsd_stat2ibcs_stat(&st, &ibcs2_st); return copyout((caddr_t)&ibcs2_st, (caddr_t)SCARG(uap, st), ibcs2_stat_len); } int ibcs2_lstat(td, uap) struct thread *td; struct ibcs2_lstat_args *uap; { struct stat st; struct ibcs2_stat ibcs2_st; struct lstat_args cup; int error; caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, path)); SCARG(&cup, path) = SCARG(uap, path); SCARG(&cup, ub) = stackgap_alloc(&sg, sizeof(st)); if ((error = lstat(td, &cup)) != 0) return error; if ((error = copyin(SCARG(&cup, ub), &st, sizeof(st))) != 0) return error; bsd_stat2ibcs_stat(&st, &ibcs2_st); return copyout((caddr_t)&ibcs2_st, (caddr_t)SCARG(uap, st), ibcs2_stat_len); } int ibcs2_fstat(td, uap) struct thread *td; struct ibcs2_fstat_args *uap; { struct stat st; struct ibcs2_stat ibcs2_st; struct fstat_args cup; int error; caddr_t sg = stackgap_init(); SCARG(&cup, fd) = SCARG(uap, fd); SCARG(&cup, sb) = stackgap_alloc(&sg, sizeof(st)); if ((error = fstat(td, &cup)) != 0) return error; if ((error = copyin(SCARG(&cup, sb), &st, sizeof(st))) != 0) return error; 
bsd_stat2ibcs_stat(&st, &ibcs2_st); return copyout((caddr_t)&ibcs2_st, (caddr_t)SCARG(uap, st), ibcs2_stat_len); } int ibcs2_utssys(td, uap) struct thread *td; struct ibcs2_utssys_args *uap; { switch (SCARG(uap, flag)) { case 0: /* uname(2) */ { char machine_name[9], *p; struct ibcs2_utsname sut; bzero(&sut, ibcs2_utsname_len); strncpy(sut.sysname, IBCS2_UNAME_SYSNAME, sizeof(sut.sysname) - 1); strncpy(sut.release, IBCS2_UNAME_RELEASE, sizeof(sut.release) - 1); strncpy(sut.version, IBCS2_UNAME_VERSION, sizeof(sut.version) - 1); strncpy(machine_name, hostname, sizeof(machine_name) - 1); machine_name[sizeof(machine_name) - 1] = 0; p = index(machine_name, '.'); if ( p ) *p = '\0'; strncpy(sut.nodename, machine_name, sizeof(sut.nodename) - 1); strncpy(sut.machine, machine, sizeof(sut.machine) - 1); DPRINTF(("IBCS2 uname: sys=%s rel=%s ver=%s node=%s mach=%s\n", sut.sysname, sut.release, sut.version, sut.nodename, sut.machine)); return copyout((caddr_t)&sut, (caddr_t)SCARG(uap, a1), ibcs2_utsname_len); } case 2: /* ustat(2) */ { return ENOSYS; /* XXX - TODO */ } default: return ENOSYS; } } Index: head/sys/kern/init_main.c =================================================================== --- head/sys/kern/init_main.c (revision 89305) +++ head/sys/kern/init_main.c (revision 89306) @@ -1,649 +1,652 @@ /* * Copyright (c) 1995 Terrence R. Lambert * All rights reserved. * * Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)init_main.c 8.9 (Berkeley) 1/21/94 * $FreeBSD$ */ #include "opt_init_path.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include void mi_startup(void); /* Should be elsewhere */ /* Components of the first process -- never freed. 
*/ static struct session session0; static struct pgrp pgrp0; struct proc proc0; struct thread *thread0; static struct procsig procsig0; static struct filedesc0 filedesc0; static struct plimit limit0; static struct vmspace vmspace0; struct proc *initproc; int cmask = CMASK; extern int fallback_elf_brand; struct vnode *rootvp; int boothowto = 0; /* initialized so that it can be patched */ SYSCTL_INT(_debug, OID_AUTO, boothowto, CTLFLAG_RD, &boothowto, 0, ""); int bootverbose; SYSCTL_INT(_debug, OID_AUTO, bootverbose, CTLFLAG_RW, &bootverbose, 0, ""); /* * This ensures that there is at least one entry so that the sysinit_set * symbol is not undefined. A sybsystem ID of SI_SUB_DUMMY is never * executed. */ SYSINIT(placeholder, SI_SUB_DUMMY, SI_ORDER_ANY, NULL, NULL) /* * The sysinit table itself. Items are checked off as the are run. * If we want to register new sysinit types, add them to newsysinit. */ SET_DECLARE(sysinit_set, struct sysinit); struct sysinit **sysinit, **sysinit_end; struct sysinit **newsysinit, **newsysinit_end; /* * Merge a new sysinit set into the current set, reallocating it if * necessary. This can only be called after malloc is running. 
*/ void sysinit_add(struct sysinit **set, struct sysinit **set_end) { struct sysinit **newset; struct sysinit **sipp; struct sysinit **xipp; int count; count = set_end - set; if (newsysinit) count += newsysinit_end - newsysinit; else count += sysinit_end - sysinit; newset = malloc(count * sizeof(*sipp), M_TEMP, M_NOWAIT); if (newset == NULL) panic("cannot malloc for sysinit"); xipp = newset; if (newsysinit) for (sipp = newsysinit; sipp < newsysinit_end; sipp++) *xipp++ = *sipp; else for (sipp = sysinit; sipp < sysinit_end; sipp++) *xipp++ = *sipp; for (sipp = set; sipp < set_end; sipp++) *xipp++ = *sipp; if (newsysinit) free(newsysinit, M_TEMP); newsysinit = newset; newsysinit_end = newset + count; } /* * System startup; initialize the world, create process 0, mount root * filesystem, and fork to create init and pagedaemon. Most of the * hard work is done in the lower-level initialization routines including * startup(), which does memory initialization and autoconfiguration. * * This allows simple addition of new kernel subsystems that require * boot time initialization. It also allows substitution of subsystem * (for instance, a scheduler, kernel profiler, or VM system) by object * module. Finally, it allows for optional "kernel threads". */ void mi_startup(void) { register struct sysinit **sipp; /* system initialization*/ register struct sysinit **xipp; /* interior loop of sort*/ register struct sysinit *save; /* bubble*/ if (sysinit == NULL) { sysinit = SET_BEGIN(sysinit_set); sysinit_end = SET_LIMIT(sysinit_set); } restart: /* * Perform a bubble sort of the system initialization objects by * their subsystem (primary key) and order (secondary key). 
*/ for (sipp = sysinit; sipp < sysinit_end; sipp++) { for (xipp = sipp + 1; xipp < sysinit_end; xipp++) { if ((*sipp)->subsystem < (*xipp)->subsystem || ((*sipp)->subsystem == (*xipp)->subsystem && (*sipp)->order <= (*xipp)->order)) continue; /* skip*/ save = *sipp; *sipp = *xipp; *xipp = save; } } /* * Traverse the (now) ordered list of system initialization tasks. * Perform each task, and continue on to the next task. * * The last item on the list is expected to be the scheduler, * which will not return. */ for (sipp = sysinit; sipp < sysinit_end; sipp++) { if ((*sipp)->subsystem == SI_SUB_DUMMY) continue; /* skip dummy task(s)*/ if ((*sipp)->subsystem == SI_SUB_DONE) continue; /* Call function */ (*((*sipp)->func))((*sipp)->udata); /* Check off the one we're just done */ (*sipp)->subsystem = SI_SUB_DONE; /* Check if we've installed more sysinit items via KLD */ if (newsysinit != NULL) { if (sysinit != SET_BEGIN(sysinit_set)) free(sysinit, M_TEMP); sysinit = newsysinit; sysinit_end = newsysinit_end; newsysinit = NULL; newsysinit_end = NULL; goto restart; } } panic("Shouldn't get here!"); /* NOTREACHED*/ } /* *************************************************************************** **** **** The following SYSINIT's belong elsewhere, but have not yet **** been moved. **** *************************************************************************** */ static void print_caddr_t(void *data __unused) { printf("%s", (char *)data); } SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t, copyright) SYSINIT(version, SI_SUB_COPYRIGHT, SI_ORDER_SECOND, print_caddr_t, version) static void set_boot_verbose(void *data __unused) { if (boothowto & RB_VERBOSE) bootverbose++; } SYSINIT(boot_verbose, SI_SUB_TUNABLES, SI_ORDER_ANY, set_boot_verbose, NULL) /* *************************************************************************** **** **** The two following SYSINT's are proc0 specific glue code. 
I am not **** convinced that they can not be safely combined, but their order of **** operation has been maintained as the same as the original init_main.c **** for right now. **** **** These probably belong in init_proc.c or kern_proc.c, since they **** deal with proc0 (the fork template process). **** *************************************************************************** */ /* ARGSUSED*/ static void proc0_init(void *dummy __unused) { register struct proc *p; register struct filedesc0 *fdp; register unsigned i; struct thread *td; GIANT_REQUIRED; /* * This assumes the proc0 struct has already been linked * using proc_linkup() in the machine specific initialisation * e.g. i386_init() */ p = &proc0; td = thread0; /* * Initialize magic number. */ p->p_magic = P_MAGIC; /* * Initialize process and pgrp structures. */ procinit(); /* * Initialize sleep queue hash table */ sleepinit(); /* * additional VM structures */ vm_init2(); /* * Create process 0 (the swapper). */ LIST_INSERT_HEAD(&allproc, p, p_list); LIST_INSERT_HEAD(PIDHASH(0), p, p_hash); p->p_pgrp = &pgrp0; LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash); LIST_INIT(&pgrp0.pg_members); LIST_INSERT_HEAD(&pgrp0.pg_members, p, p_pglist); pgrp0.pg_session = &session0; session0.s_count = 1; session0.s_leader = p; #ifdef __ELF__ p->p_sysent = &elf_freebsd_sysvec; #else p->p_sysent = &aout_sysvec; #endif p->p_flag = P_SYSTEM; p->p_sflag = PS_INMEM; p->p_stat = SRUN; p->p_ksegrp.kg_nice = NZERO; p->p_ksegrp.kg_pri.pri_class = PRI_TIMESHARE; p->p_ksegrp.kg_pri.pri_level = PVM; p->p_ksegrp.kg_pri.pri_native = PUSER; p->p_ksegrp.kg_pri.pri_user = PUSER; p->p_peers = 0; p->p_leader = p; bcopy("swapper", p->p_comm, sizeof ("swapper")); callout_init(&p->p_itcallout, 0); callout_init(&td->td_slpcallout, 1); /* Create credentials. */ p->p_ucred = crget(); p->p_ucred->cr_ngroups = 1; /* group 0 */ p->p_ucred->cr_uidinfo = uifind(0); p->p_ucred->cr_ruidinfo = uifind(0); p->p_ucred->cr_prison = NULL; /* Don't jail it. 
*/ td->td_ucred = crhold(p->p_ucred); /* Create procsig. */ p->p_procsig = &procsig0; p->p_procsig->ps_refcnt = 1; /* Initialize signal state for process 0. */ siginit(&proc0); /* Create the file descriptor table. */ fdp = &filedesc0; p->p_fd = &fdp->fd_fd; + mtx_init(&fdp->fd_fd.fd_mtx, "struct filedesc", MTX_DEF); fdp->fd_fd.fd_refcnt = 1; fdp->fd_fd.fd_cmask = cmask; fdp->fd_fd.fd_ofiles = fdp->fd_dfiles; fdp->fd_fd.fd_ofileflags = fdp->fd_dfileflags; fdp->fd_fd.fd_nfiles = NDFILE; /* Create the limits structures. */ p->p_limit = &limit0; for (i = 0; i < sizeof(p->p_rlimit)/sizeof(p->p_rlimit[0]); i++) limit0.pl_rlimit[i].rlim_cur = limit0.pl_rlimit[i].rlim_max = RLIM_INFINITY; limit0.pl_rlimit[RLIMIT_NOFILE].rlim_cur = limit0.pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles; limit0.pl_rlimit[RLIMIT_NPROC].rlim_cur = limit0.pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc; i = ptoa(cnt.v_free_count); limit0.pl_rlimit[RLIMIT_RSS].rlim_max = i; limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_max = i; limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = i / 3; limit0.p_cpulimit = RLIM_INFINITY; limit0.p_refcnt = 1; /* Allocate a prototype map so we have something to fork. */ pmap_pinit0(vmspace_pmap(&vmspace0)); p->p_vmspace = &vmspace0; vmspace0.vm_refcnt = 1; vm_map_init(&vmspace0.vm_map, round_page(VM_MIN_ADDRESS), trunc_page(VM_MAXUSER_ADDRESS)); vmspace0.vm_map.pmap = vmspace_pmap(&vmspace0); /* * We continue to place resource usage info and signal * actions in the user struct so they're pageable. */ p->p_stats = &p->p_uarea->u_stats; p->p_sigacts = &p->p_uarea->u_sigacts; /* * Charge root for one process. */ (void)chgproccnt(p->p_ucred->cr_ruidinfo, 1, 0); } SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL) /* ARGSUSED*/ static void proc0_post(void *dummy __unused) { struct timespec ts; struct proc *p; /* * Now we can look at the time, having had a chance to verify the * time from the file system. Pretend that proc0 started now. 
*/ sx_slock(&allproc_lock); LIST_FOREACH(p, &allproc, p_list) { microtime(&p->p_stats->p_start); p->p_runtime = 0; } sx_sunlock(&allproc_lock); microuptime(PCPU_PTR(switchtime)); PCPU_SET(switchticks, ticks); /* * Give the ``random'' number generator a thump. */ nanotime(&ts); srandom(ts.tv_sec ^ ts.tv_nsec); } SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL) /* *************************************************************************** **** **** The following SYSINIT's and glue code should be moved to the **** respective files on a per subsystem basis. **** *************************************************************************** */ /* *************************************************************************** **** **** The following code probably belongs in another file, like **** kern/init_init.c. **** *************************************************************************** */ /* * List of paths to try when searching for "init". */ static char init_path[MAXPATHLEN] = #ifdef INIT_PATH __XSTRING(INIT_PATH); #else "/sbin/init:/sbin/oinit:/sbin/init.bak:/stand/sysinstall"; #endif SYSCTL_STRING(_kern, OID_AUTO, init_path, CTLFLAG_RD, init_path, 0, "Path used to search the init process"); /* * Start the initial user process; try exec'ing each pathname in init_path. * The program is invoked with one argument containing the boot flags. */ static void start_init(void *dummy) { vm_offset_t addr; struct execve_args args; int options, error; char *var, *path, *next, *s; char *ucp, **uap, *arg0, *arg1; struct thread *td; struct proc *p; int init_does_devfs = 0; mtx_lock(&Giant); GIANT_REQUIRED; td = curthread; p = td->td_proc; /* Get the vnode for '/'. Set p->p_fd->fd_cdir to reference it. 
*/ if (VFS_ROOT(TAILQ_FIRST(&mountlist), &rootvnode)) panic("cannot find root vnode"); + FILEDESC_LOCK(p->p_fd); p->p_fd->fd_cdir = rootvnode; VREF(p->p_fd->fd_cdir); p->p_fd->fd_rdir = rootvnode; VREF(p->p_fd->fd_rdir); + FILEDESC_UNLOCK(p->p_fd); VOP_UNLOCK(rootvnode, 0, td); if (devfs_present) { /* * For disk based systems, we probably cannot do this yet * since the fs will be read-only. But a NFS root * might be ok. It is worth a shot. */ error = vn_mkdir("/dev", 0700, UIO_SYSSPACE, td); if (error == EEXIST) error = 0; if (error == 0) error = vfs_mount(td, "devfs", "/dev", 0, 0); if (error != 0) init_does_devfs = 1; } /* * Need just enough stack to hold the faked-up "execve()" arguments. */ addr = trunc_page(USRSTACK - PAGE_SIZE); if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0) panic("init: couldn't allocate argument space"); p->p_vmspace->vm_maxsaddr = (caddr_t)addr; p->p_vmspace->vm_ssize = 1; if ((var = getenv("init_path")) != NULL) { strncpy(init_path, var, sizeof init_path); init_path[sizeof init_path - 1] = 0; } if ((var = getenv("kern.fallback_elf_brand")) != NULL) fallback_elf_brand = strtol(var, NULL, 0); for (path = init_path; *path != '\0'; path = next) { while (*path == ':') path++; if (*path == '\0') break; for (next = path; *next != '\0' && *next != ':'; next++) /* nothing */ ; if (bootverbose) printf("start_init: trying %.*s\n", (int)(next - path), path); /* * Move out the boot flag argument. 
*/ options = 0; ucp = (char *)USRSTACK; (void)subyte(--ucp, 0); /* trailing zero */ if (boothowto & RB_SINGLE) { (void)subyte(--ucp, 's'); options = 1; } #ifdef notyet if (boothowto & RB_FASTBOOT) { (void)subyte(--ucp, 'f'); options = 1; } #endif #ifdef BOOTCDROM (void)subyte(--ucp, 'C'); options = 1; #endif if (init_does_devfs) { (void)subyte(--ucp, 'd'); options = 1; } if (options == 0) (void)subyte(--ucp, '-'); (void)subyte(--ucp, '-'); /* leading hyphen */ arg1 = ucp; /* * Move out the file name (also arg 0). */ (void)subyte(--ucp, 0); for (s = next - 1; s >= path; s--) (void)subyte(--ucp, *s); arg0 = ucp; /* * Move out the arg pointers. */ uap = (char **)((intptr_t)ucp & ~(sizeof(intptr_t)-1)); (void)suword((caddr_t)--uap, (long)0); /* terminator */ (void)suword((caddr_t)--uap, (long)(intptr_t)arg1); (void)suword((caddr_t)--uap, (long)(intptr_t)arg0); /* * Point at the arguments. */ args.fname = arg0; args.argv = uap; args.envv = NULL; /* * Now try to exec the program. If can't for any reason * other than it doesn't exist, complain. * * Otherwise, return via fork_trampoline() all the way * to user mode as init! */ if ((error = execve(td, &args)) == 0) { mtx_unlock(&Giant); return; } if (error != ENOENT) printf("exec %.*s: error %d\n", (int)(next - path), path, error); } printf("init: not found in path %s\n", init_path); panic("no init"); } /* * Like kthread_create(), but runs in it's own address space. * We do this early to reserve pid 1. * * Note special case - do not make it runnable yet. Other work * in progress will change this more. 
*/ static void create_init(const void *udata __unused) { int error; error = fork1(thread0, RFFDG | RFPROC | RFSTOPPED, &initproc); if (error) panic("cannot fork init: %d\n", error); PROC_LOCK(initproc); initproc->p_flag |= P_SYSTEM; PROC_UNLOCK(initproc); mtx_lock_spin(&sched_lock); initproc->p_sflag |= PS_INMEM; mtx_unlock_spin(&sched_lock); cpu_set_fork_handler(&initproc->p_thread, start_init, NULL); } SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL) /* * Make it runnable now. */ static void kick_init(const void *udata __unused) { mtx_lock_spin(&sched_lock); initproc->p_stat = SRUN; setrunqueue(&initproc->p_thread); /* XXXKSE */ mtx_unlock_spin(&sched_lock); } SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kick_init, NULL) Index: head/sys/kern/kern_acl.c =================================================================== --- head/sys/kern/kern_acl.c (revision 89305) +++ head/sys/kern/kern_acl.c (revision 89306) @@ -1,817 +1,821 @@ /*- * Copyright (c) 1999-2001 Robert N. M. Watson * All rights reserved. * * This software was developed by Robert Watson for the TrustedBSD Project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Developed by the TrustedBSD Project. * Support for POSIX.1e access control lists. */ #include "opt_cap.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_ACL, "acl", "access control list"); static int vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp); static int vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp); static int vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp); /* * Implement a version of vaccess() that understands POSIX.1e ACL semantics. * Return 0 on success, else an errno value. Should be merged into * vaccess() eventually. */ int vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid, struct acl *acl, mode_t acc_mode, struct ucred *cred, int *privused) { struct acl_entry *acl_other, *acl_mask; mode_t dac_granted; mode_t cap_granted; mode_t acl_mask_granted; int group_matched, i; /* * Look for a normal, non-privileged way to access the file/directory * as requested. If it exists, go with that. Otherwise, attempt * to use privileges granted via cap_granted. In some cases, * which privileges to use may be ambiguous due to "best match", * in which case fall back on first match for the time being. 
*/ if (privused != NULL) *privused = 0; /* * Determine privileges now, but don't apply until we've found * a DAC entry that matches but has failed to allow access. */ #ifndef CAPABILITIES if (suser_xxx(cred, NULL, PRISON_ROOT) == 0) cap_granted = (VEXEC | VREAD | VWRITE | VADMIN); else cap_granted = 0; #else cap_granted = 0; if (type == VDIR) { if ((acc_mode & VEXEC) && !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT)) cap_granted |= VEXEC; } else { if ((acc_mode & VEXEC) && !cap_check(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT)) cap_granted |= VEXEC; } if ((acc_mode & VREAD) && !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT)) cap_granted |= VREAD; if ((acc_mode & VWRITE) && !cap_check(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT)) cap_granted |= VWRITE; if ((acc_mode & VADMIN) && !cap_check(cred, NULL, CAP_FOWNER, PRISON_ROOT)) cap_granted |= VADMIN; #endif /* CAPABILITIES */ /* * The owner matches if the effective uid associated with the * credential matches that of the ACL_USER_OBJ entry. While we're * doing the first scan, also cache the location of the ACL_MASK * and ACL_OTHER entries, preventing some future iterations. */ acl_mask = acl_other = NULL; for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_USER_OBJ: if (file_uid != cred->cr_uid) break; dac_granted = 0; dac_granted |= VADMIN; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; if ((acc_mode & dac_granted) == acc_mode) return (0); if ((acc_mode & (dac_granted | cap_granted)) == acc_mode) { if (privused != NULL) *privused = 1; return (0); } goto error; case ACL_MASK: acl_mask = &acl->acl_entry[i]; break; case ACL_OTHER: acl_other = &acl->acl_entry[i]; break; default: } } /* * An ACL_OTHER entry should always exist in a valid access * ACL. If it doesn't, then generate a serious failure. 
For now, * this means a debugging message and EPERM, but in the future * should probably be a panic. */ if (acl_other == NULL) { /* * XXX This should never happen */ printf("vaccess_acl_posix1e: ACL_OTHER missing\n"); return (EPERM); } /* * Checks against ACL_USER, ACL_GROUP_OBJ, and ACL_GROUP fields * are masked by an ACL_MASK entry, if any. As such, first identify * the ACL_MASK field, then iterate through identifying potential * user matches, then group matches. If there is no ACL_MASK, * assume that the mask allows all requests to succeed. */ if (acl_mask != NULL) { acl_mask_granted = 0; if (acl_mask->ae_perm & ACL_EXECUTE) acl_mask_granted |= VEXEC; if (acl_mask->ae_perm & ACL_READ) acl_mask_granted |= VREAD; if (acl_mask->ae_perm & ACL_WRITE) acl_mask_granted |= VWRITE; } else acl_mask_granted = VEXEC | VREAD | VWRITE; /* * Iterate through user ACL entries. Do checks twice, first * without privilege, and then if a match is found but failed, * a second time with privilege. */ /* * Check ACL_USER ACL entries. */ for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_USER: if (acl->acl_entry[i].ae_id != cred->cr_uid) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; dac_granted &= acl_mask_granted; if ((acc_mode & dac_granted) == acc_mode) return (0); if ((acc_mode & (dac_granted | cap_granted)) != acc_mode) goto error; if (privused != NULL) *privused = 1; return (0); } } /* * Group match is best-match, not first-match, so find a * "best" match. Iterate across, testing each potential group * match. Make sure we keep track of whether we found a match * or not, so that we know if we should try again with any * available privilege, or if we should move on to ACL_OTHER. 
*/ group_matched = 0; for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_GROUP_OBJ: if (!groupmember(file_gid, cred)) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; dac_granted &= acl_mask_granted; if ((acc_mode & dac_granted) == acc_mode) return (0); group_matched = 1; break; case ACL_GROUP: if (!groupmember(acl->acl_entry[i].ae_id, cred)) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; dac_granted &= acl_mask_granted; if ((acc_mode & dac_granted) == acc_mode) return (0); group_matched = 1; break; default: } } if (group_matched == 1) { /* * There was a match, but it did not grant rights via * pure DAC. Try again, this time with privilege. 
*/ for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_GROUP_OBJ: if (!groupmember(file_gid, cred)) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; dac_granted &= acl_mask_granted; if ((acc_mode & (dac_granted | cap_granted)) != acc_mode) break; if (privused != NULL) *privused = 1; return (0); case ACL_GROUP: if (!groupmember(acl->acl_entry[i].ae_id, cred)) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; dac_granted &= acl_mask_granted; if ((acc_mode & (dac_granted | cap_granted)) != acc_mode) break; if (privused != NULL) *privused = 1; return (0); default: } } /* * Even with privilege, group membership was not sufficient. * Return failure. */ goto error; } /* * Fall back on ACL_OTHER. ACL_MASK is not applied to ACL_OTHER. */ dac_granted = 0; if (acl_other->ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl_other->ae_perm & ACL_READ) dac_granted |= VREAD; if (acl_other->ae_perm & ACL_WRITE) dac_granted |= VWRITE; if ((acc_mode & dac_granted) == acc_mode) return (0); if ((acc_mode & (dac_granted | cap_granted)) == acc_mode) { if (privused != NULL) *privused = 1; return (0); } error: return ((acc_mode & VADMIN) ? EPERM : EACCES); } /* * For the purposes of file systems maintaining the _OBJ entries in an * inode with a mode_t field, this routine converts a mode_t entry * to an acl_perm_t. 
*/ acl_perm_t acl_posix1e_mode_to_perm(acl_tag_t tag, mode_t mode) { acl_perm_t perm = 0; switch(tag) { case ACL_USER_OBJ: if (mode & S_IXUSR) perm |= ACL_EXECUTE; if (mode & S_IRUSR) perm |= ACL_READ; if (mode & S_IWUSR) perm |= ACL_WRITE; return (perm); case ACL_GROUP_OBJ: if (mode & S_IXGRP) perm |= ACL_EXECUTE; if (mode & S_IRGRP) perm |= ACL_READ; if (mode & S_IWGRP) perm |= ACL_WRITE; return (perm); case ACL_OTHER: if (mode & S_IXOTH) perm |= ACL_EXECUTE; if (mode & S_IROTH) perm |= ACL_READ; if (mode & S_IWOTH) perm |= ACL_WRITE; return (perm); default: printf("acl_posix1e_mode_to_perm: invalid tag (%d)\n", tag); return (0); } } /* * Given inode information (uid, gid, mode), return an acl entry of the * appropriate type. */ struct acl_entry acl_posix1e_mode_to_entry(acl_tag_t tag, uid_t uid, gid_t gid, mode_t mode) { struct acl_entry acl_entry; acl_entry.ae_tag = tag; acl_entry.ae_perm = acl_posix1e_mode_to_perm(tag, mode); switch(tag) { case ACL_USER_OBJ: acl_entry.ae_id = uid; break; case ACL_GROUP_OBJ: acl_entry.ae_id = gid; break; case ACL_OTHER: acl_entry.ae_id = ACL_UNDEFINED_ID; break; default: acl_entry.ae_id = ACL_UNDEFINED_ID; printf("acl_posix1e_mode_to_entry: invalid tag (%d)\n", tag); } return (acl_entry); } /* * Utility function to generate a file mode given appropriate ACL entries. 
*/ mode_t acl_posix1e_perms_to_mode(struct acl_entry *acl_user_obj_entry, struct acl_entry *acl_group_obj_entry, struct acl_entry *acl_other_entry) { mode_t mode; mode = 0; if (acl_user_obj_entry->ae_perm & ACL_EXECUTE) mode |= S_IXUSR; if (acl_user_obj_entry->ae_perm & ACL_READ) mode |= S_IRUSR; if (acl_user_obj_entry->ae_perm & ACL_WRITE) mode |= S_IWUSR; if (acl_group_obj_entry->ae_perm & ACL_EXECUTE) mode |= S_IXGRP; if (acl_group_obj_entry->ae_perm & ACL_READ) mode |= S_IRGRP; if (acl_group_obj_entry->ae_perm & ACL_WRITE) mode |= S_IWGRP; if (acl_other_entry->ae_perm & ACL_EXECUTE) mode |= S_IXOTH; if (acl_other_entry->ae_perm & ACL_READ) mode |= S_IROTH; if (acl_other_entry->ae_perm & ACL_WRITE) mode |= S_IWOTH; return (mode); } /* * Perform a syntactic check of the ACL, sufficient to allow an * implementing file system to determine if it should accept this and * rely on the POSIX.1e ACL properties. */ int acl_posix1e_check(struct acl *acl) { int num_acl_user_obj, num_acl_user, num_acl_group_obj, num_acl_group; int num_acl_mask, num_acl_other, i; /* * Verify that the number of entries does not exceed the maximum * defined for acl_t. * Verify that the correct number of various sorts of ae_tags are * present: * Exactly one ACL_USER_OBJ * Exactly one ACL_GROUP_OBJ * Exactly one ACL_OTHER * If any ACL_USER or ACL_GROUP entries appear, then exactly one * ACL_MASK entry must also appear. * Verify that all ae_perm entries are in ACL_PERM_BITS. * Verify all ae_tag entries are understood by this implementation. * Note: Does not check for uniqueness of qualifier (ae_id) field. */ num_acl_user_obj = num_acl_user = num_acl_group_obj = num_acl_group = num_acl_mask = num_acl_other = 0; if (acl->acl_cnt > ACL_MAX_ENTRIES || acl->acl_cnt < 0) return (EINVAL); for (i = 0; i < acl->acl_cnt; i++) { /* * Check for a valid tag. 
*/ switch(acl->acl_entry[i].ae_tag) { case ACL_USER_OBJ: acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) return (EINVAL); num_acl_user_obj++; break; case ACL_GROUP_OBJ: acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) return (EINVAL); num_acl_group_obj++; break; case ACL_USER: if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID) return (EINVAL); num_acl_user++; break; case ACL_GROUP: if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID) return (EINVAL); num_acl_group++; break; case ACL_OTHER: acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) return (EINVAL); num_acl_other++; break; case ACL_MASK: acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) return (EINVAL); num_acl_mask++; break; default: return (EINVAL); } /* * Check for valid perm entries. */ if ((acl->acl_entry[i].ae_perm | ACL_PERM_BITS) != ACL_PERM_BITS) return (EINVAL); } if ((num_acl_user_obj != 1) || (num_acl_group_obj != 1) || (num_acl_other != 1) || (num_acl_mask != 0 && num_acl_mask != 1)) return (EINVAL); if (((num_acl_group != 0) || (num_acl_user != 0)) && (num_acl_mask != 1)) return (EINVAL); return (0); } /* * These calls wrap the real vnode operations, and are called by the * syscall code once the syscall has converted the path or file * descriptor to a vnode (unlocked). The aclp pointer is assumed * still to point to userland, so this should not be consumed within * the kernel except by syscall code. Other code should directly * invoke VOP_{SET,GET}ACL. */ /* * Given a vnode, set its ACL. 
*/ static int vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernacl; int error; error = copyin(aclp, &inkernacl, sizeof(struct acl)); if (error) return(error); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_SETACL(vp, type, &inkernacl, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); return(error); } /* * Given a vnode, get its ACL. */ static int vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernelacl; int error; VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_GETACL(vp, type, &inkernelacl, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); if (error == 0) error = copyout(&inkernelacl, aclp, sizeof(struct acl)); return (error); } /* * Given a vnode, delete its ACL. */ static int vacl_delete(struct thread *td, struct vnode *vp, acl_type_t type) { int error; VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_SETACL(vp, ACL_TYPE_DEFAULT, 0, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); return (error); } /* * Given a vnode, check whether an ACL is appropriate for it */ static int vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernelacl; int error; error = copyin(aclp, &inkernelacl, sizeof(struct acl)); if (error) return(error); error = VOP_ACLCHECK(vp, type, &inkernelacl, td->td_proc->p_ucred, td); return (error); } /* * syscalls -- convert the path/fd to a vnode, and call vacl_whatever. * Don't need to lock, as the vacl_ code will get/release any locks * required. 
*/ /* * Given a file path, get an ACL for it * * MPSAFE */ int __acl_get_file(struct thread *td, struct __acl_get_file_args *uap) { struct nameidata nd; int error; mtx_lock(&Giant); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); error = namei(&nd); if (error == 0) { error = vacl_get_acl(td, nd.ni_vp, SCARG(uap, type), SCARG(uap, aclp)); NDFREE(&nd, 0); } mtx_unlock(&Giant); return (error); } /* * Given a file path, set an ACL for it * * MPSAFE */ int __acl_set_file(struct thread *td, struct __acl_set_file_args *uap) { struct nameidata nd; int error; mtx_lock(&Giant); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); error = namei(&nd); if (error == 0) { error = vacl_set_acl(td, nd.ni_vp, SCARG(uap, type), SCARG(uap, aclp)); NDFREE(&nd, 0); } mtx_unlock(&Giant); return (error); } /* * Given a file descriptor, get an ACL for it * * MPSAFE */ int __acl_get_fd(struct thread *td, struct __acl_get_fd_args *uap) { struct file *fp; int error; mtx_lock(&Giant); error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); if (error == 0) { error = vacl_get_acl(td, (struct vnode *)fp->f_data, SCARG(uap, type), SCARG(uap, aclp)); + fdrop(fp, td); } mtx_unlock(&Giant); return (error); } /* * Given a file descriptor, set an ACL for it * * MPSAFE */ int __acl_set_fd(struct thread *td, struct __acl_set_fd_args *uap) { struct file *fp; int error; mtx_lock(&Giant); error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); if (error == 0) { error = vacl_set_acl(td, (struct vnode *)fp->f_data, SCARG(uap, type), SCARG(uap, aclp)); + fdrop(fp, td); } mtx_unlock(&Giant); return (error); } /* * Given a file path, delete an ACL from it. 
* * MPSAFE */ int __acl_delete_file(struct thread *td, struct __acl_delete_file_args *uap) { struct nameidata nd; int error; mtx_lock(&Giant); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); error = namei(&nd); if (error == 0) { error = vacl_delete(td, nd.ni_vp, SCARG(uap, type)); NDFREE(&nd, 0); } mtx_unlock(&Giant); return (error); } /* * Given a file path, delete an ACL from it. * * MPSAFE */ int __acl_delete_fd(struct thread *td, struct __acl_delete_fd_args *uap) { struct file *fp; int error; mtx_lock(&Giant); error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); if (error == 0) { error = vacl_delete(td, (struct vnode *)fp->f_data, SCARG(uap, type)); + fdrop(fp, td); } mtx_unlock(&Giant); return (error); } /* * Given a file path, check an ACL for it * * MPSAFE */ int __acl_aclcheck_file(struct thread *td, struct __acl_aclcheck_file_args *uap) { struct nameidata nd; int error; mtx_lock(&Giant); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); error = namei(&nd); if (error == 0) { error = vacl_aclcheck(td, nd.ni_vp, SCARG(uap, type), SCARG(uap, aclp)); NDFREE(&nd, 0); } mtx_unlock(&Giant); return (error); } /* * Given a file descriptor, check an ACL for it * * MPSAFE */ int __acl_aclcheck_fd(struct thread *td, struct __acl_aclcheck_fd_args *uap) { struct file *fp; int error; mtx_lock(&Giant); error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); if (error == 0) { error = vacl_aclcheck(td, (struct vnode *)fp->f_data, SCARG(uap, type), SCARG(uap, aclp)); + fdrop(fp, td); } mtx_unlock(&Giant); return (error); } Index: head/sys/kern/kern_descrip.c =================================================================== --- head/sys/kern/kern_descrip.c (revision 89305) +++ head/sys/kern/kern_descrip.c (revision 89306) @@ -1,1844 +1,2087 @@ /* * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. 
* All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94 * $FreeBSD$ */ #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_FILEDESC, "file desc", "Open file descriptor table"); MALLOC_DEFINE(M_FILE, "file", "Open file structure"); static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures"); static d_open_t fdopen; #define NUMFDESC 64 #define CDEV_MAJOR 22 static struct cdevsw fildesc_cdevsw = { /* open */ fdopen, /* close */ noclose, /* read */ noread, /* write */ nowrite, /* ioctl */ noioctl, /* poll */ nopoll, /* mmap */ nommap, /* strategy */ nostrategy, /* name */ "FD", /* maj */ CDEV_MAJOR, /* dump */ nodump, /* psize */ nopsize, /* flags */ 0, }; static int do_dup __P((struct filedesc *fdp, int old, int new, register_t *retval, struct thread *td)); static int badfo_readwrite __P((struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td)); static int badfo_ioctl __P((struct file *fp, u_long com, caddr_t data, struct thread *td)); static int badfo_poll __P((struct file *fp, int events, struct ucred *cred, struct thread *td)); static int badfo_kqfilter __P((struct file *fp, struct knote *kn)); static int badfo_stat __P((struct file *fp, struct stat *sb, struct thread *td)); static int badfo_close __P((struct file *fp, struct thread *td)); /* * Descriptor management. */ struct filelist filehead; /* head of list of open files */ int nfiles; /* actual number of open files */ extern int cmask; +struct sx filelist_lock; /* sx to protect filelist */ /* * System calls on descriptors. 
*/ #ifndef _SYS_SYSPROTO_H_ struct getdtablesize_args { int dummy; }; #endif /* * MPSAFE */ /* ARGSUSED */ int getdtablesize(td, uap) struct thread *td; struct getdtablesize_args *uap; { struct proc *p = td->td_proc; mtx_lock(&Giant); td->td_retval[0] = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc); mtx_unlock(&Giant); return (0); } /* * Duplicate a file descriptor to a particular value. * * note: keep in mind that a potential race condition exists when closing * descriptors from a shared descriptor table (via rfork). */ #ifndef _SYS_SYSPROTO_H_ struct dup2_args { u_int from; u_int to; }; #endif /* * MPSAFE */ /* ARGSUSED */ int dup2(td, uap) struct thread *td; struct dup2_args *uap; { struct proc *p = td->td_proc; register struct filedesc *fdp = td->td_proc->p_fd; register u_int old = uap->from, new = uap->to; int i, error; mtx_lock(&Giant); + FILEDESC_LOCK(fdp); retry: if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL || new >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur || new >= maxfilesperproc) { + FILEDESC_UNLOCK(fdp); error = EBADF; goto done2; } if (old == new) { td->td_retval[0] = new; + FILEDESC_UNLOCK(fdp); error = 0; goto done2; } if (new >= fdp->fd_nfiles) { - if ((error = fdalloc(td, new, &i))) + if ((error = fdalloc(td, new, &i))) { + FILEDESC_UNLOCK(fdp); goto done2; + } if (new != i) panic("dup2: fdalloc"); /* * fdalloc() may block, retest everything. */ goto retry; } error = do_dup(fdp, (int)old, (int)new, td->td_retval, td); done2: mtx_unlock(&Giant); return(error); } /* * Duplicate a file descriptor. 
*/ #ifndef _SYS_SYSPROTO_H_ struct dup_args { u_int fd; }; #endif /* * MPSAFE */ /* ARGSUSED */ int dup(td, uap) struct thread *td; struct dup_args *uap; { register struct filedesc *fdp; u_int old; int new, error; mtx_lock(&Giant); old = uap->fd; fdp = td->td_proc->p_fd; + FILEDESC_LOCK(fdp); if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) { + FILEDESC_UNLOCK(fdp); error = EBADF; goto done2; } - if ((error = fdalloc(td, 0, &new))) + if ((error = fdalloc(td, 0, &new))) { + FILEDESC_UNLOCK(fdp); goto done2; + } error = do_dup(fdp, (int)old, new, td->td_retval, td); done2: mtx_unlock(&Giant); return (error); } /* * The file control system call. */ #ifndef _SYS_SYSPROTO_H_ struct fcntl_args { int fd; int cmd; long arg; }; #endif /* * MPSAFE */ /* ARGSUSED */ int fcntl(td, uap) struct thread *td; register struct fcntl_args *uap; { register struct proc *p = td->td_proc; register struct filedesc *fdp; register struct file *fp; register char *pop; struct vnode *vp; int i, tmp, error = 0, flg = F_POSIX; struct flock fl; u_int newmin; + struct proc *leaderp; mtx_lock(&Giant); fdp = p->p_fd; + FILEDESC_LOCK(fdp); if ((unsigned)uap->fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[uap->fd]) == NULL) { + FILEDESC_UNLOCK(fdp); error = EBADF; goto done2; } pop = &fdp->fd_ofileflags[uap->fd]; switch (uap->cmd) { case F_DUPFD: newmin = uap->arg; if (newmin >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur || newmin >= maxfilesperproc) { + FILEDESC_UNLOCK(fdp); error = EINVAL; break; } - if ((error = fdalloc(td, newmin, &i))) + if ((error = fdalloc(td, newmin, &i))) { + FILEDESC_UNLOCK(fdp); break; + } error = do_dup(fdp, uap->fd, i, td->td_retval, td); break; case F_GETFD: td->td_retval[0] = *pop & 1; + FILEDESC_UNLOCK(fdp); break; case F_SETFD: *pop = (*pop &~ 1) | (uap->arg & 1); + FILEDESC_UNLOCK(fdp); break; case F_GETFL: + FILE_LOCK(fp); + FILEDESC_UNLOCK(fdp); td->td_retval[0] = OFLAGS(fp->f_flag); + FILE_UNLOCK(fp); break; case F_SETFL: fhold(fp); + FILEDESC_UNLOCK(fdp); 
fp->f_flag &= ~FCNTLFLAGS; fp->f_flag |= FFLAGS(uap->arg & ~O_ACCMODE) & FCNTLFLAGS; tmp = fp->f_flag & FNONBLOCK; error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td); if (error) { fdrop(fp, td); break; } tmp = fp->f_flag & FASYNC; error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, td); if (!error) { fdrop(fp, td); break; } fp->f_flag &= ~FNONBLOCK; tmp = 0; (void)fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td); fdrop(fp, td); break; case F_GETOWN: fhold(fp); + FILEDESC_UNLOCK(fdp); error = fo_ioctl(fp, FIOGETOWN, (caddr_t)td->td_retval, td); fdrop(fp, td); break; case F_SETOWN: fhold(fp); + FILEDESC_UNLOCK(fdp); error = fo_ioctl(fp, FIOSETOWN, (caddr_t)&uap->arg, td); fdrop(fp, td); break; case F_SETLKW: flg |= F_WAIT; /* Fall into F_SETLK */ case F_SETLK: if (fp->f_type != DTYPE_VNODE) { + FILEDESC_UNLOCK(fdp); error = EBADF; break; } vp = (struct vnode *)fp->f_data; - /* * copyin/lockop may block */ fhold(fp); + FILEDESC_UNLOCK(fdp); + vp = (struct vnode *)fp->f_data; + /* Copy in the lock structure */ error = copyin((caddr_t)(intptr_t)uap->arg, (caddr_t)&fl, sizeof(fl)); if (error) { fdrop(fp, td); break; } if (fl.l_whence == SEEK_CUR) { if (fp->f_offset < 0 || (fl.l_start > 0 && fp->f_offset > OFF_MAX - fl.l_start)) { fdrop(fp, td); error = EOVERFLOW; break; } fl.l_start += fp->f_offset; } switch (fl.l_type) { case F_RDLCK: if ((fp->f_flag & FREAD) == 0) { error = EBADF; break; } + PROC_LOCK(p); p->p_flag |= P_ADVLOCK; - error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, + leaderp = p->p_leader; + PROC_UNLOCK(p); + error = VOP_ADVLOCK(vp, (caddr_t)leaderp, F_SETLK, &fl, flg); break; case F_WRLCK: if ((fp->f_flag & FWRITE) == 0) { error = EBADF; break; } + PROC_LOCK(p); p->p_flag |= P_ADVLOCK; - error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, + leaderp = p->p_leader; + PROC_UNLOCK(p); + error = VOP_ADVLOCK(vp, (caddr_t)leaderp, F_SETLK, &fl, flg); break; case F_UNLCK: - error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, + PROC_LOCK(p); + leaderp = p->p_leader; 
+ PROC_UNLOCK(p); + error = VOP_ADVLOCK(vp, (caddr_t)leaderp, F_UNLCK, &fl, F_POSIX); break; default: error = EINVAL; break; } fdrop(fp, td); break; case F_GETLK: if (fp->f_type != DTYPE_VNODE) { + FILEDESC_UNLOCK(fdp); error = EBADF; break; } vp = (struct vnode *)fp->f_data; /* * copyin/lockop may block */ fhold(fp); + FILEDESC_UNLOCK(fdp); + vp = (struct vnode *)fp->f_data; + /* Copy in the lock structure */ error = copyin((caddr_t)(intptr_t)uap->arg, (caddr_t)&fl, sizeof(fl)); if (error) { fdrop(fp, td); break; } if (fl.l_type != F_RDLCK && fl.l_type != F_WRLCK && fl.l_type != F_UNLCK) { fdrop(fp, td); error = EINVAL; break; } if (fl.l_whence == SEEK_CUR) { if ((fl.l_start > 0 && fp->f_offset > OFF_MAX - fl.l_start) || (fl.l_start < 0 && fp->f_offset < OFF_MIN - fl.l_start)) { fdrop(fp, td); error = EOVERFLOW; break; } fl.l_start += fp->f_offset; } error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, &fl, F_POSIX); fdrop(fp, td); if (error == 0) { error = copyout((caddr_t)&fl, (caddr_t)(intptr_t)uap->arg, sizeof(fl)); } break; default: + FILEDESC_UNLOCK(fdp); error = EINVAL; break; } done2: mtx_unlock(&Giant); return (error); } /* * Common code for dup, dup2, and fcntl(F_DUPFD). + * filedesc must be locked, but will be unlocked as a side effect. */ static int do_dup(fdp, old, new, retval, td) register struct filedesc *fdp; register int old, new; register_t *retval; struct thread *td; { struct file *fp; struct file *delfp; + FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); + /* * Save info on the descriptor being overwritten. We have * to do the unmap now, but we cannot close it without * introducing an ownership race for the slot. 
*/ delfp = fdp->fd_ofiles[new]; #if 0 if (delfp && (fdp->fd_ofileflags[new] & UF_MAPPED)) (void) munmapfd(td, new); #endif /* * Duplicate the source descriptor, update lastfile */ fp = fdp->fd_ofiles[old]; fdp->fd_ofiles[new] = fp; fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE; fhold(fp); if (new > fdp->fd_lastfile) fdp->fd_lastfile = new; *retval = new; + FILEDESC_UNLOCK(fdp); + /* * If we dup'd over a valid file, we now own the reference to it * and must dispose of it using closef() semantics (as if a * close() were performed on it). */ if (delfp) (void) closef(delfp, td); return (0); } /* * If sigio is on the list associated with a process or process group, * disable signalling from the device, remove sigio from the list and * free sigio. */ void funsetown(sigio) struct sigio *sigio; { int s; if (sigio == NULL) return; s = splhigh(); *(sigio->sio_myref) = NULL; splx(s); if (sigio->sio_pgid < 0) { SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio, sigio, sio_pgsigio); } else /* if ((*sigiop)->sio_pgid > 0) */ { SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio, sigio, sio_pgsigio); } crfree(sigio->sio_ucred); FREE(sigio, M_SIGIO); } /* Free a list of sigio structures. */ void funsetownlst(sigiolst) struct sigiolst *sigiolst; { struct sigio *sigio; while ((sigio = SLIST_FIRST(sigiolst)) != NULL) funsetown(sigio); } /* * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg). * * After permission checking, add a sigio structure to the sigio list for * the process or process group. */ int fsetown(pgid, sigiop) pid_t pgid; struct sigio **sigiop; { struct proc *proc; struct pgrp *pgrp; struct sigio *sigio; int s; if (pgid == 0) { funsetown(*sigiop); return (0); } if (pgid > 0) { proc = pfind(pgid); if (proc == NULL) return (ESRCH); /* * Policy - Don't allow a process to FSETOWN a process * in another session. 
* * Remove this test to allow maximum flexibility or * restrict FSETOWN to the current process or process * group for maximum safety. */ if (proc->p_session != curthread->td_proc->p_session) { PROC_UNLOCK(proc); return (EPERM); } PROC_UNLOCK(proc); pgrp = NULL; } else /* if (pgid < 0) */ { pgrp = pgfind(-pgid); if (pgrp == NULL) return (ESRCH); /* * Policy - Don't allow a process to FSETOWN a process * in another session. * * Remove this test to allow maximum flexibility or * restrict FSETOWN to the current process or process * group for maximum safety. */ if (pgrp->pg_session != curthread->td_proc->p_session) return (EPERM); proc = NULL; } funsetown(*sigiop); MALLOC(sigio, struct sigio *, sizeof(struct sigio), M_SIGIO, M_WAITOK); if (pgid > 0) { SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio); sigio->sio_proc = proc; } else { SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio); sigio->sio_pgrp = pgrp; } sigio->sio_pgid = pgid; sigio->sio_ucred = crhold(curthread->td_proc->p_ucred); sigio->sio_myref = sigiop; s = splhigh(); *sigiop = sigio; splx(s); return (0); } /* * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg). */ pid_t fgetown(sigio) struct sigio *sigio; { return (sigio != NULL ? sigio->sio_pgid : 0); } /* * Close a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct close_args { int fd; }; #endif /* * MPSAFE */ /* ARGSUSED */ int close(td, uap) struct thread *td; struct close_args *uap; { register struct filedesc *fdp; register struct file *fp; register int fd = uap->fd; int error = 0; mtx_lock(&Giant); fdp = td->td_proc->p_fd; + FILEDESC_LOCK(fdp); if ((unsigned)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) { + FILEDESC_UNLOCK(fdp); error = EBADF; goto done2; } #if 0 if (fdp->fd_ofileflags[fd] & UF_MAPPED) (void) munmapfd(td, fd); #endif fdp->fd_ofiles[fd] = NULL; fdp->fd_ofileflags[fd] = 0; /* * we now hold the fp reference that used to be owned by the descriptor * array. 
*/ while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL) fdp->fd_lastfile--; if (fd < fdp->fd_freefile) fdp->fd_freefile = fd; - if (fd < fdp->fd_knlistsize) + if (fd < fdp->fd_knlistsize) { + FILEDESC_UNLOCK(fdp); knote_fdclose(td, fd); + } else + FILEDESC_UNLOCK(fdp); + error = closef(fp, td); done2: mtx_unlock(&Giant); return(error); } #if defined(COMPAT_43) || defined(COMPAT_SUNOS) /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct ofstat_args { int fd; struct ostat *sb; }; #endif /* * MPSAFE */ /* ARGSUSED */ int ofstat(td, uap) struct thread *td; register struct ofstat_args *uap; { struct file *fp; struct stat ub; struct ostat oub; int error; mtx_lock(&Giant); if ((error = fget(td, uap->fd, &fp)) != 0) goto done2; error = fo_stat(fp, &ub, td); if (error == 0) { cvtstat(&ub, &oub); error = copyout((caddr_t)&oub, (caddr_t)uap->sb, sizeof (oub)); } fdrop(fp, td); done2: mtx_unlock(&Giant); return (error); } #endif /* COMPAT_43 || COMPAT_SUNOS */ /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fstat_args { int fd; struct stat *sb; }; #endif /* * MPSAFE */ /* ARGSUSED */ int fstat(td, uap) struct thread *td; struct fstat_args *uap; { struct file *fp; struct stat ub; int error; mtx_lock(&Giant); if ((error = fget(td, uap->fd, &fp)) != 0) goto done2; error = fo_stat(fp, &ub, td); if (error == 0) error = copyout((caddr_t)&ub, (caddr_t)uap->sb, sizeof (ub)); fdrop(fp, td); done2: mtx_unlock(&Giant); return (error); } /* * Return status information about a file descriptor. 
*/ #ifndef _SYS_SYSPROTO_H_ struct nfstat_args { int fd; struct nstat *sb; }; #endif /* * MPSAFE */ /* ARGSUSED */ int nfstat(td, uap) struct thread *td; register struct nfstat_args *uap; { struct file *fp; struct stat ub; struct nstat nub; int error; - mtx_lock(&Giant); if ((error = fget(td, uap->fd, &fp)) != 0) goto done2; error = fo_stat(fp, &ub, td); if (error == 0) { cvtnstat(&ub, &nub); error = copyout((caddr_t)&nub, (caddr_t)uap->sb, sizeof (nub)); } fdrop(fp, td); done2: mtx_unlock(&Giant); return (error); } /* * Return pathconf information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fpathconf_args { int fd; int name; }; #endif /* * MPSAFE */ /* ARGSUSED */ int fpathconf(td, uap) struct thread *td; register struct fpathconf_args *uap; { struct file *fp; struct vnode *vp; int error; + fp = ffind_hold(td, uap->fd); + if (fp == NULL) + return (EBADF); mtx_lock(&Giant); if ((error = fget(td, uap->fd, &fp)) != 0) goto done2; switch (fp->f_type) { case DTYPE_PIPE: case DTYPE_SOCKET: if (uap->name != _PC_PIPE_BUF) { + fdrop(fp, td); error = EINVAL; goto done2; } td->td_retval[0] = PIPE_BUF; error = 0; break; case DTYPE_FIFO: case DTYPE_VNODE: vp = (struct vnode *)fp->f_data; error = VOP_PATHCONF(vp, uap->name, td->td_retval); break; default: error = EOPNOTSUPP; break; } fdrop(fp, td); done2: mtx_unlock(&Giant); return(error); } /* * Allocate a file descriptor for the process. */ static int fdexpand; SYSCTL_INT(_debug, OID_AUTO, fdexpand, CTLFLAG_RD, &fdexpand, 0, ""); int fdalloc(td, want, result) struct thread *td; int want; int *result; { struct proc *p = td->td_proc; register struct filedesc *fdp = td->td_proc->p_fd; register int i; int lim, last, nfiles; - struct file **newofile; + struct file **newofile, **oldofile; char *newofileflags; + FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); + /* * Search for a free descriptor starting at the higher * of want or fd_freefile. If that fails, consider * expanding the ofile array. 
*/ lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc); for (;;) { last = min(fdp->fd_nfiles, lim); if ((i = want) < fdp->fd_freefile) i = fdp->fd_freefile; for (; i < last; i++) { if (fdp->fd_ofiles[i] == NULL) { fdp->fd_ofileflags[i] = 0; if (i > fdp->fd_lastfile) fdp->fd_lastfile = i; if (want <= fdp->fd_freefile) fdp->fd_freefile = i; *result = i; return (0); } } /* * No space in current array. Expand? */ if (fdp->fd_nfiles >= lim) return (EMFILE); if (fdp->fd_nfiles < NDEXTENT) nfiles = NDEXTENT; else nfiles = 2 * fdp->fd_nfiles; + FILEDESC_UNLOCK(fdp); MALLOC(newofile, struct file **, nfiles * OFILESIZE, M_FILEDESC, M_WAITOK); + FILEDESC_LOCK(fdp); /* * deal with file-table extend race that might have occured * when malloc was blocked. */ if (fdp->fd_nfiles >= nfiles) { + FILEDESC_UNLOCK(fdp); FREE(newofile, M_FILEDESC); + FILEDESC_LOCK(fdp); continue; } newofileflags = (char *) &newofile[nfiles]; /* * Copy the existing ofile and ofileflags arrays * and zero the new portion of each array. */ bcopy(fdp->fd_ofiles, newofile, (i = sizeof(struct file *) * fdp->fd_nfiles)); bzero((char *)newofile + i, nfiles * sizeof(struct file *) - i); bcopy(fdp->fd_ofileflags, newofileflags, (i = sizeof(char) * fdp->fd_nfiles)); bzero(newofileflags + i, nfiles * sizeof(char) - i); if (fdp->fd_nfiles > NDFILE) - FREE(fdp->fd_ofiles, M_FILEDESC); + oldofile = fdp->fd_ofiles; + else + oldofile = NULL; fdp->fd_ofiles = newofile; fdp->fd_ofileflags = newofileflags; fdp->fd_nfiles = nfiles; fdexpand++; + if (oldofile != NULL) + FREE(oldofile, M_FILEDESC); } return (0); } /* * Check to see whether n user file descriptors * are available to the process p. 
*/ int fdavail(td, n) struct thread *td; register int n; { struct proc *p = td->td_proc; register struct filedesc *fdp = td->td_proc->p_fd; register struct file **fpp; register int i, lim, last; + FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); + lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc); if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0) return (1); last = min(fdp->fd_nfiles, lim); fpp = &fdp->fd_ofiles[fdp->fd_freefile]; for (i = last - fdp->fd_freefile; --i >= 0; fpp++) { if (*fpp == NULL && --n <= 0) return (1); } return (0); } /* * Create a new open file structure and allocate * a file decriptor for the process that refers to it. */ int falloc(td, resultfp, resultfd) register struct thread *td; struct file **resultfp; int *resultfd; { struct proc *p = td->td_proc; register struct file *fp, *fq; int error, i; + sx_xlock(&filelist_lock); if (nfiles >= maxfiles) { + sx_xunlock(&filelist_lock); tablefull("file"); return (ENFILE); } + nfiles++; + sx_xunlock(&filelist_lock); /* * Allocate a new file descriptor. * If the process has file descriptor zero open, add to the list * of open files at that point, otherwise put it at the front of * the list of open files. */ - nfiles++; MALLOC(fp, struct file *, sizeof(struct file), M_FILE, M_WAITOK | M_ZERO); /* * wait until after malloc (which may have blocked) returns before * allocating the slot, else a race might have shrunk it if we had * allocated it before the malloc. 
*/ + FILEDESC_LOCK(p->p_fd); if ((error = fdalloc(td, 0, &i))) { + FILEDESC_UNLOCK(p->p_fd); + sx_xlock(&filelist_lock); nfiles--; + sx_xunlock(&filelist_lock); FREE(fp, M_FILE); return (error); } + mtx_init(&fp->f_mtx, "file structure", MTX_DEF); + fp->f_gcflag = 0; fp->f_count = 1; fp->f_cred = crhold(p->p_ucred); fp->f_ops = &badfileops; fp->f_seqcount = 1; + FILEDESC_UNLOCK(p->p_fd); + sx_xlock(&filelist_lock); + FILEDESC_LOCK(p->p_fd); if ((fq = p->p_fd->fd_ofiles[0])) { LIST_INSERT_AFTER(fq, fp, f_list); } else { LIST_INSERT_HEAD(&filehead, fp, f_list); } p->p_fd->fd_ofiles[i] = fp; + FILEDESC_UNLOCK(p->p_fd); + sx_xunlock(&filelist_lock); if (resultfp) *resultfp = fp; if (resultfd) *resultfd = i; return (0); } /* * Free a file descriptor. */ void ffree(fp) register struct file *fp; { + KASSERT((fp->f_count == 0), ("ffree: fp_fcount not 0!")); + sx_xlock(&filelist_lock); LIST_REMOVE(fp, f_list); - crfree(fp->f_cred); nfiles--; + sx_xunlock(&filelist_lock); + crfree(fp->f_cred); + mtx_destroy(&fp->f_mtx); FREE(fp, M_FILE); } /* * Build a new filedesc structure. */ struct filedesc * fdinit(td) struct thread *td; { register struct filedesc0 *newfdp; register struct filedesc *fdp = td->td_proc->p_fd; MALLOC(newfdp, struct filedesc0 *, sizeof(struct filedesc0), M_FILEDESC, M_WAITOK | M_ZERO); + mtx_init(&newfdp->fd_fd.fd_mtx, "filedesc structure", MTX_DEF); + FILEDESC_LOCK(&newfdp->fd_fd); newfdp->fd_fd.fd_cdir = fdp->fd_cdir; if (newfdp->fd_fd.fd_cdir) VREF(newfdp->fd_fd.fd_cdir); newfdp->fd_fd.fd_rdir = fdp->fd_rdir; if (newfdp->fd_fd.fd_rdir) VREF(newfdp->fd_fd.fd_rdir); newfdp->fd_fd.fd_jdir = fdp->fd_jdir; if (newfdp->fd_fd.fd_jdir) VREF(newfdp->fd_fd.fd_jdir); /* Create the file descriptor table. 
*/ newfdp->fd_fd.fd_refcnt = 1; newfdp->fd_fd.fd_cmask = cmask; newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles; newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags; newfdp->fd_fd.fd_nfiles = NDFILE; newfdp->fd_fd.fd_knlistsize = -1; + FILEDESC_UNLOCK(&newfdp->fd_fd); return (&newfdp->fd_fd); } /* * Share a filedesc structure. */ struct filedesc * fdshare(p) struct proc *p; { + FILEDESC_LOCK(p->p_fd); p->p_fd->fd_refcnt++; + FILEDESC_UNLOCK(p->p_fd); return (p->p_fd); } /* * Copy a filedesc structure. */ struct filedesc * fdcopy(td) struct thread *td; { register struct filedesc *newfdp, *fdp = td->td_proc->p_fd; register struct file **fpp; - register int i; + register int i, j; /* Certain daemons might not have file descriptors. */ if (fdp == NULL) return (NULL); + FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); + + FILEDESC_UNLOCK(fdp); MALLOC(newfdp, struct filedesc *, sizeof(struct filedesc0), M_FILEDESC, M_WAITOK); + FILEDESC_LOCK(fdp); bcopy(fdp, newfdp, sizeof(struct filedesc)); + FILEDESC_UNLOCK(fdp); + bzero(&newfdp->fd_mtx, sizeof(newfdp->fd_mtx)); + mtx_init(&newfdp->fd_mtx, "filedesc structure", MTX_DEF); if (newfdp->fd_cdir) VREF(newfdp->fd_cdir); if (newfdp->fd_rdir) VREF(newfdp->fd_rdir); if (newfdp->fd_jdir) VREF(newfdp->fd_jdir); newfdp->fd_refcnt = 1; /* * If the number of open files fits in the internal arrays * of the open file structure, use them, otherwise allocate * additional memory for the number of descriptors currently * in use. */ + FILEDESC_LOCK(fdp); + newfdp->fd_lastfile = fdp->fd_lastfile; + newfdp->fd_nfiles = fdp->fd_nfiles; if (newfdp->fd_lastfile < NDFILE) { newfdp->fd_ofiles = ((struct filedesc0 *) newfdp)->fd_dfiles; newfdp->fd_ofileflags = ((struct filedesc0 *) newfdp)->fd_dfileflags; i = NDFILE; } else { /* * Compute the smallest multiple of NDEXTENT needed * for the file descriptors currently in use, * allowing the table to shrink. 
*/ +retry: i = newfdp->fd_nfiles; while (i > 2 * NDEXTENT && i > newfdp->fd_lastfile * 2) i /= 2; + FILEDESC_UNLOCK(fdp); MALLOC(newfdp->fd_ofiles, struct file **, i * OFILESIZE, M_FILEDESC, M_WAITOK); + FILEDESC_LOCK(fdp); + newfdp->fd_lastfile = fdp->fd_lastfile; + newfdp->fd_nfiles = fdp->fd_nfiles; + j = newfdp->fd_nfiles; + while (j > 2 * NDEXTENT && j > newfdp->fd_lastfile * 2) + j /= 2; + if (i != j) { + /* + * The size of the original table has changed. + * Go over once again. + */ + FILEDESC_UNLOCK(fdp); + FREE(newfdp->fd_ofiles, M_FILEDESC); + FILEDESC_LOCK(fdp); + newfdp->fd_lastfile = fdp->fd_lastfile; + newfdp->fd_nfiles = fdp->fd_nfiles; + goto retry; + } newfdp->fd_ofileflags = (char *) &newfdp->fd_ofiles[i]; } newfdp->fd_nfiles = i; bcopy(fdp->fd_ofiles, newfdp->fd_ofiles, i * sizeof(struct file **)); bcopy(fdp->fd_ofileflags, newfdp->fd_ofileflags, i * sizeof(char)); /* * kq descriptors cannot be copied. */ if (newfdp->fd_knlistsize != -1) { fpp = &newfdp->fd_ofiles[newfdp->fd_lastfile]; for (i = newfdp->fd_lastfile; i >= 0; i--, fpp--) { if (*fpp != NULL && (*fpp)->f_type == DTYPE_KQUEUE) { *fpp = NULL; if (i < newfdp->fd_freefile) newfdp->fd_freefile = i; } if (*fpp == NULL && i == newfdp->fd_lastfile && i > 0) newfdp->fd_lastfile--; } newfdp->fd_knlist = NULL; newfdp->fd_knlistsize = -1; newfdp->fd_knhash = NULL; newfdp->fd_knhashmask = 0; } fpp = newfdp->fd_ofiles; for (i = newfdp->fd_lastfile; i-- >= 0; fpp++) { - if (*fpp != NULL) + if (*fpp != NULL) { fhold(*fpp); + } } return (newfdp); } /* * Release a filedesc structure. */ void fdfree(td) struct thread *td; { register struct filedesc *fdp = td->td_proc->p_fd; struct file **fpp; register int i; /* Certain daemons might not have file descriptors. 
*/ if (fdp == NULL) return; - if (--fdp->fd_refcnt > 0) + FILEDESC_LOCK(fdp); + if (--fdp->fd_refcnt > 0) { + FILEDESC_UNLOCK(fdp); return; + } /* * we are the last reference to the structure, we can * safely assume it will not change out from under us. */ + FILEDESC_UNLOCK(fdp); fpp = fdp->fd_ofiles; for (i = fdp->fd_lastfile; i-- >= 0; fpp++) { if (*fpp) (void) closef(*fpp, td); } if (fdp->fd_nfiles > NDFILE) FREE(fdp->fd_ofiles, M_FILEDESC); if (fdp->fd_cdir) vrele(fdp->fd_cdir); if (fdp->fd_rdir) vrele(fdp->fd_rdir); if (fdp->fd_jdir) vrele(fdp->fd_jdir); if (fdp->fd_knlist) FREE(fdp->fd_knlist, M_KQUEUE); if (fdp->fd_knhash) FREE(fdp->fd_knhash, M_KQUEUE); + mtx_destroy(&fdp->fd_mtx); FREE(fdp, M_FILEDESC); } /* * For setugid programs, we don't want to people to use that setugidness * to generate error messages which write to a file which otherwise would * otherwise be off-limits to the process. * * This is a gross hack to plug the hole. A better solution would involve * a special vop or other form of generalized access control mechanism. We * go ahead and just reject all procfs file systems accesses as dangerous. * * Since setugidsafety calls this only for fd 0, 1 and 2, this check is * sufficient. We also don't for check setugidness since we know we are. */ static int is_unsafe(struct file *fp) { if (fp->f_type == DTYPE_VNODE && ((struct vnode *)(fp->f_data))->v_tag == VT_PROCFS) return (1); return (0); } /* * Make this setguid thing safe, if at all possible. */ void setugidsafety(td) struct thread *td; { struct filedesc *fdp = td->td_proc->p_fd; register int i; /* Certain daemons might not have file descriptors. */ if (fdp == NULL) return; /* * note: fdp->fd_ofiles may be reallocated out from under us while * we are blocked in a close. Be careful! 
*/ + FILEDESC_LOCK(fdp); for (i = 0; i <= fdp->fd_lastfile; i++) { if (i > 2) break; if (fdp->fd_ofiles[i] && is_unsafe(fdp->fd_ofiles[i])) { struct file *fp; #if 0 if ((fdp->fd_ofileflags[i] & UF_MAPPED) != 0) (void) munmapfd(td, i); #endif - if (i < fdp->fd_knlistsize) + if (i < fdp->fd_knlistsize) { + FILEDESC_UNLOCK(fdp); knote_fdclose(td, i); + FILEDESC_LOCK(fdp); + } /* * NULL-out descriptor prior to close to avoid * a race while close blocks. */ fp = fdp->fd_ofiles[i]; fdp->fd_ofiles[i] = NULL; fdp->fd_ofileflags[i] = 0; if (i < fdp->fd_freefile) fdp->fd_freefile = i; + FILEDESC_UNLOCK(fdp); (void) closef(fp, td); + FILEDESC_LOCK(fdp); } } while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL) fdp->fd_lastfile--; + FILEDESC_UNLOCK(fdp); } /* * Close any files on exec? */ void fdcloseexec(td) struct thread *td; { struct filedesc *fdp = td->td_proc->p_fd; register int i; /* Certain daemons might not have file descriptors. */ if (fdp == NULL) return; + FILEDESC_LOCK(fdp); + /* * We cannot cache fd_ofiles or fd_ofileflags since operations * may block and rip them out from under us. */ for (i = 0; i <= fdp->fd_lastfile; i++) { if (fdp->fd_ofiles[i] != NULL && (fdp->fd_ofileflags[i] & UF_EXCLOSE)) { struct file *fp; #if 0 if (fdp->fd_ofileflags[i] & UF_MAPPED) (void) munmapfd(td, i); #endif - if (i < fdp->fd_knlistsize) + if (i < fdp->fd_knlistsize) { + FILEDESC_UNLOCK(fdp); knote_fdclose(td, i); + FILEDESC_LOCK(fdp); + } /* * NULL-out descriptor prior to close to avoid * a race while close blocks. */ fp = fdp->fd_ofiles[i]; fdp->fd_ofiles[i] = NULL; fdp->fd_ofileflags[i] = 0; if (i < fdp->fd_freefile) fdp->fd_freefile = i; + FILEDESC_UNLOCK(fdp); (void) closef(fp, td); + FILEDESC_LOCK(fdp); } } while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL) fdp->fd_lastfile--; + FILEDESC_UNLOCK(fdp); } /* * Internal form of close. * Decrement reference count on file structure. 
* Note: td may be NULL when closing a file * that was being passed in a message. */ int closef(fp, td) register struct file *fp; register struct thread *td; { struct vnode *vp; struct flock lf; if (fp == NULL) return (0); /* * POSIX record locking dictates that any close releases ALL * locks owned by this process. This is handled by setting * a flag in the unlock to free ONLY locks obeying POSIX * semantics, and not to free BSD-style file locks. * If the descriptor was in a message, POSIX-style locks * aren't passed with the descriptor. */ if (td && (td->td_proc->p_flag & P_ADVLOCK) && fp->f_type == DTYPE_VNODE) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; vp = (struct vnode *)fp->f_data; (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader, F_UNLCK, &lf, F_POSIX); } return (fdrop(fp, td)); } /* + * Find the struct file 'fd' in process 'p' and bump it's refcount + * struct file is not locked on return. + */ +struct file * +ffind_hold(td, fd) + struct thread *td; + int fd; +{ + struct filedesc *fdp; + struct file *fp; + + if (td == NULL || (fdp = td->td_proc->p_fd) == NULL) + return (NULL); + FILEDESC_LOCK(fdp); + if (fd < 0 || fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fd]) == NULL || + fp->f_ops == &badfileops) + fp = NULL; + else + fhold(fp); + FILEDESC_UNLOCK(fdp); + return (fp); +} + +/* + * Find the struct file 'fd' in process 'p' and bump it's refcount, + * struct file is locked on return. 
+ */ +struct file * +ffind_lock(td, fd) + struct thread *td; + int fd; +{ + struct filedesc *fdp; + struct file *fp; + + if (td == NULL || (fdp = td->td_proc->p_fd) == NULL) + return (NULL); + FILEDESC_LOCK(fdp); + if (fd < 0 || fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fd]) == NULL || + fp->f_ops == &badfileops) { + fp = NULL; + } else { + FILE_LOCK(fp); + fhold_locked(fp); + } + FILEDESC_UNLOCK(fdp); + return (fp); +} + +int +fdrop(fp, td) + struct file *fp; + struct thread *td; +{ + + FILE_LOCK(fp); + return (fdrop_locked(fp, td)); +} + +/* * Extract the file pointer associated with the specified descriptor for * the current user process. If no error occured 0 is returned, *fpp * will be set to the file pointer, and the file pointer's ref count * will be bumped. Use fdrop() to drop it. If an error occured the * non-zero error is returned and *fpp is set to NULL. * * This routine requires Giant for the moment. Once enough of the * system is converted over to this and other encapsulated APIs we * will be able to mutex it and call it without Giant. */ static __inline int _fget(struct thread *td, int fd, struct file **fpp, int flags) { struct filedesc *fdp; struct file *fp; GIANT_REQUIRED; fdp = td->td_proc->p_fd; *fpp = NULL; if ((u_int)fd >= fdp->fd_nfiles) return(EBADF); if ((fp = fdp->fd_ofiles[fd]) == NULL) return(EBADF); /* * Note: FREAD failures returns EBADF to maintain backwards * compatibility with what routines returned before. * * Only one flag, or 0, may be specified. 
*/ if (flags == FREAD && (fp->f_flag & FREAD) == 0) return(EBADF); if (flags == FWRITE && (fp->f_flag & FWRITE) == 0) return(EINVAL); ++fp->f_count; *fpp = fp; return(0); } int fget(struct thread *td, int fd, struct file **fpp) { return(_fget(td, fd, fpp, 0)); } int fget_read(struct thread *td, int fd, struct file **fpp) { return(_fget(td, fd, fpp, FREAD)); } int fget_write(struct thread *td, int fd, struct file **fpp) { return(_fget(td, fd, fpp, FWRITE)); } /* * Like fget() but loads the underlying vnode, or returns an error if * the descriptor does not represent a vnode. Note that pipes use vnodes * but never have VM objects (so VOP_GETVOBJECT() calls will return an * error). The returned vnode will be vref()d. */ static __inline int _fgetvp(struct thread *td, int fd, struct vnode **vpp, int flags) { struct filedesc *fdp; struct file *fp; GIANT_REQUIRED; fdp = td->td_proc->p_fd; *vpp = NULL; if ((u_int)fd >= fdp->fd_nfiles) return(EBADF); if ((fp = fdp->fd_ofiles[fd]) == NULL) return(EBADF); if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) return(EINVAL); if (fp->f_data == NULL) return(EINVAL); /* * Note: FREAD failures returns EBADF to maintain backwards * compatibility with what routines returned before. * * Only one flag, or 0, may be specified. */ if (flags == FREAD && (fp->f_flag & FREAD) == 0) return(EBADF); if (flags == FWRITE && (fp->f_flag & FWRITE) == 0) return(EINVAL); *vpp = (struct vnode *)fp->f_data; vref(*vpp); return(0); } int fgetvp(struct thread *td, int fd, struct vnode **vpp) { return(_fgetvp(td, fd, vpp, 0)); } int fgetvp_read(struct thread *td, int fd, struct vnode **vpp) { return(_fgetvp(td, fd, vpp, FREAD)); } int fgetvp_write(struct thread *td, int fd, struct vnode **vpp) { return(_fgetvp(td, fd, vpp, FWRITE)); } /* * Like fget() but loads the underlying socket, or returns an error if * the descriptor does not represent a socket. * * We bump the ref count on the returned socket. XXX Also obtain the SX lock in * the future. 
*/ int fgetsock(struct thread *td, int fd, struct socket **spp, u_int *fflagp) { struct filedesc *fdp; struct file *fp; struct socket *so; GIANT_REQUIRED; fdp = td->td_proc->p_fd; *spp = NULL; if (fflagp) *fflagp = 0; if ((u_int)fd >= fdp->fd_nfiles) return(EBADF); if ((fp = fdp->fd_ofiles[fd]) == NULL) return(EBADF); if (fp->f_type != DTYPE_SOCKET) return(ENOTSOCK); if (fp->f_data == NULL) return(EINVAL); so = (struct socket *)fp->f_data; if (fflagp) *fflagp = fp->f_flag; soref(so); *spp = so; return(0); } /* * Drop the reference count on the the socket and XXX release the SX lock in * the future. The last reference closes the socket. */ void fputsock(struct socket *so) { sorele(so); } int -fdrop(fp, td) +fdrop_locked(fp, td) struct file *fp; struct thread *td; { struct flock lf; struct vnode *vp; int error; - if (--fp->f_count > 0) + FILE_LOCK_ASSERT(fp, MA_OWNED); + + if (--fp->f_count > 0) { + FILE_UNLOCK(fp); return (0); + } if (fp->f_count < 0) panic("fdrop: count < 0"); if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; vp = (struct vnode *)fp->f_data; + FILE_UNLOCK(fp); (void) VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); - } + } else + FILE_UNLOCK(fp); if (fp->f_ops != &badfileops) error = fo_close(fp, td); else error = 0; ffree(fp); return (error); } /* * Apply an advisory lock on a file descriptor. * * Just attempt to get a record lock of the requested type on * the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0). 
*/ #ifndef _SYS_SYSPROTO_H_ struct flock_args { int fd; int how; }; #endif /* * MPSAFE */ /* ARGSUSED */ int flock(td, uap) struct thread *td; register struct flock_args *uap; { - register struct filedesc *fdp = td->td_proc->p_fd; register struct file *fp; struct vnode *vp; struct flock lf; int error; - mtx_lock(&Giant); - - if ((unsigned)uap->fd >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[uap->fd]) == NULL) { - error = EBADF; - goto done2; - } + fp = ffind_hold(td, uap->fd); + if (fp == NULL) + return (EBADF); if (fp->f_type != DTYPE_VNODE) { - error = EOPNOTSUPP; - goto done2; + fdrop(fp, td); + return (EOPNOTSUPP); } + + mtx_lock(&Giant); vp = (struct vnode *)fp->f_data; lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (uap->how & LOCK_UN) { lf.l_type = F_UNLCK; + FILE_LOCK(fp); fp->f_flag &= ~FHASLOCK; + FILE_UNLOCK(fp); error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); goto done2; } if (uap->how & LOCK_EX) lf.l_type = F_WRLCK; else if (uap->how & LOCK_SH) lf.l_type = F_RDLCK; else { error = EBADF; goto done2; } + FILE_LOCK(fp); fp->f_flag |= FHASLOCK; - if (uap->how & LOCK_NB) - error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, F_FLOCK); - else - error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, F_FLOCK|F_WAIT); + FILE_UNLOCK(fp); + error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, + (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT); done2: + fdrop(fp, td); mtx_unlock(&Giant); return (error); } /* * File Descriptor pseudo-device driver (/dev/fd/). * * Opening minor device N dup()s the file (if any) connected to file * descriptor N belonging to the calling process. Note that this driver * consists of only the ``open()'' routine, because all subsequent * references to this file will be direct to the other driver. */ /* ARGSUSED */ static int fdopen(dev, mode, type, td) dev_t dev; int mode, type; struct thread *td; { /* * XXX Kludge: set curthread->td_dupfd to contain the value of the * the file descriptor being sought for duplication. 
The error * return ensures that the vnode for this device will be released * by vn_open. Open will detect this special error and take the * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN * will simply report the error. */ td->td_dupfd = dev2unit(dev); return (ENODEV); } /* * Duplicate the specified descriptor to a free descriptor. */ int dupfdopen(td, fdp, indx, dfd, mode, error) struct thread *td; struct filedesc *fdp; int indx, dfd; int mode; int error; { register struct file *wfp; struct file *fp; /* * If the to-be-dup'd fd number is greater than the allowed number * of file descriptors, or the fd to be dup'd has already been * closed, then reject. */ + FILEDESC_LOCK(fdp); if ((u_int)dfd >= fdp->fd_nfiles || (wfp = fdp->fd_ofiles[dfd]) == NULL) { + FILEDESC_UNLOCK(fdp); return (EBADF); } /* * There are two cases of interest here. * * For ENODEV simply dup (dfd) to file descriptor * (indx) and return. * * For ENXIO steal away the file structure from (dfd) and * store it in (indx). (dfd) is effectively closed by * this operation. * * Any other error code is just returned. */ switch (error) { case ENODEV: /* * Check that the mode the file is being opened for is a * subset of the mode of the existing descriptor. */ - if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) + FILE_LOCK(wfp); + if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) { + FILE_UNLOCK(wfp); + FILEDESC_UNLOCK(fdp); return (EACCES); + } fp = fdp->fd_ofiles[indx]; #if 0 if (fp && fdp->fd_ofileflags[indx] & UF_MAPPED) (void) munmapfd(td, indx); #endif fdp->fd_ofiles[indx] = wfp; fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd]; - fhold(wfp); + fhold_locked(wfp); + FILE_UNLOCK(wfp); if (indx > fdp->fd_lastfile) fdp->fd_lastfile = indx; + if (fp != NULL) + FILE_LOCK(fp); + FILEDESC_UNLOCK(fdp); /* * we now own the reference to fp that the ofiles[] array * used to own. Release it. 
*/ - if (fp) - fdrop(fp, td); + if (fp != NULL) + fdrop_locked(fp, td); return (0); case ENXIO: /* * Steal away the file pointer from dfd, and stuff it into indx. */ fp = fdp->fd_ofiles[indx]; #if 0 if (fp && fdp->fd_ofileflags[indx] & UF_MAPPED) (void) munmapfd(td, indx); #endif fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd]; fdp->fd_ofiles[dfd] = NULL; fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd]; fdp->fd_ofileflags[dfd] = 0; /* - * we now own the reference to fp that the ofiles[] array - * used to own. Release it. - */ - if (fp) - fdrop(fp, td); - /* * Complete the clean up of the filedesc structure by * recomputing the various hints. */ if (indx > fdp->fd_lastfile) { fdp->fd_lastfile = indx; } else { while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL) { fdp->fd_lastfile--; } if (dfd < fdp->fd_freefile) fdp->fd_freefile = dfd; } + if (fp != NULL) + FILE_LOCK(fp); + FILEDESC_UNLOCK(fdp); + + /* + * we now own the reference to fp that the ofiles[] array + * used to own. Release it. + */ + if (fp != NULL) + fdrop_locked(fp, td); return (0); default: + FILEDESC_UNLOCK(fdp); return (error); } /* NOTREACHED */ } /* * Get file structures. 
*/ static int sysctl_kern_file(SYSCTL_HANDLER_ARGS) { int error; struct file *fp; + sx_slock(&filelist_lock); if (!req->oldptr) { /* * overestimate by 10 files */ - return (SYSCTL_OUT(req, 0, sizeof(filehead) + - (nfiles + 10) * sizeof(struct file))); + error = SYSCTL_OUT(req, 0, sizeof(filehead) + + (nfiles + 10) * sizeof(struct file)); + sx_sunlock(&filelist_lock); + return (error); } error = SYSCTL_OUT(req, (caddr_t)&filehead, sizeof(filehead)); - if (error) + if (error) { + sx_sunlock(&filelist_lock); return (error); + } /* * followed by an array of file structures */ LIST_FOREACH(fp, &filehead, f_list) { error = SYSCTL_OUT(req, (caddr_t)fp, sizeof (struct file)); - if (error) + if (error) { + sx_sunlock(&filelist_lock); return (error); + } } + sx_sunlock(&filelist_lock); return (0); } SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, 0, sysctl_kern_file, "S,file", "Entire file table"); SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW, &maxfilesperproc, 0, "Maximum files allowed open per process"); SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW, &maxfiles, 0, "Maximum number of files"); SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD, &nfiles, 0, "System-wide number of open files"); static void fildesc_drvinit(void *unused) { dev_t dev; dev = make_dev(&fildesc_cdevsw, 0, UID_BIN, GID_BIN, 0666, "fd/0"); make_dev_alias(dev, "stdin"); dev = make_dev(&fildesc_cdevsw, 1, UID_BIN, GID_BIN, 0666, "fd/1"); make_dev_alias(dev, "stdout"); dev = make_dev(&fildesc_cdevsw, 2, UID_BIN, GID_BIN, 0666, "fd/2"); make_dev_alias(dev, "stderr"); if (!devfs_present) { int fd; for (fd = 3; fd < NUMFDESC; fd++) make_dev(&fildesc_cdevsw, fd, UID_BIN, GID_BIN, 0666, "fd/%d", fd); } } struct fileops badfileops = { badfo_readwrite, badfo_readwrite, badfo_ioctl, badfo_poll, badfo_kqfilter, badfo_stat, badfo_close }; static int badfo_readwrite(fp, uio, cred, flags, td) struct file *fp; struct uio *uio; struct ucred *cred; struct thread *td; int 
flags; { return (EBADF); } static int badfo_ioctl(fp, com, data, td) struct file *fp; u_long com; caddr_t data; struct thread *td; { return (EBADF); } static int badfo_poll(fp, events, cred, td) struct file *fp; int events; struct ucred *cred; struct thread *td; { return (0); } static int badfo_kqfilter(fp, kn) struct file *fp; struct knote *kn; { return (0); } static int badfo_stat(fp, sb, td) struct file *fp; struct stat *sb; struct thread *td; { return (EBADF); } static int badfo_close(fp, td) struct file *fp; struct thread *td; { return (EBADF); } SYSINIT(fildescdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR, fildesc_drvinit,NULL) + +static void filelistinit __P((void *)); +SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL) + +/* ARGSUSED*/ +static void +filelistinit(dummy) + void *dummy; +{ + sx_init(&filelist_lock, "filelist lock"); +} Index: head/sys/kern/kern_event.c =================================================================== --- head/sys/kern/kern_event.c (revision 89305) +++ head/sys/kern/kern_event.c (revision 89306) @@ -1,1029 +1,1079 @@ /*- * Copyright (c) 1999,2000,2001 Jonathan Lemon * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system"); static int kqueue_scan(struct file *fp, int maxevents, struct kevent *ulistp, const struct timespec *timeout, struct thread *td); static int kqueue_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td); static int kqueue_write(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td); static int kqueue_ioctl(struct file *fp, u_long com, caddr_t data, struct thread *td); static int kqueue_poll(struct file *fp, int events, struct ucred *cred, struct thread *td); static int kqueue_kqfilter(struct file *fp, struct knote *kn); static int kqueue_stat(struct file *fp, struct stat *st, struct thread *td); static int kqueue_close(struct file *fp, struct thread *td); static void kqueue_wakeup(struct kqueue *kq); static struct fileops kqueueops = { kqueue_read, kqueue_write, kqueue_ioctl, kqueue_poll, kqueue_kqfilter, kqueue_stat, kqueue_close }; static void knote_attach(struct knote *kn, struct filedesc *fdp); static void knote_drop(struct knote *kn, struct thread *td); static void knote_enqueue(struct knote *kn); static void knote_dequeue(struct knote *kn); static void knote_init(void); static struct knote 
*knote_alloc(void); static void knote_free(struct knote *kn); static void filt_kqdetach(struct knote *kn); static int filt_kqueue(struct knote *kn, long hint); static int filt_procattach(struct knote *kn); static void filt_procdetach(struct knote *kn); static int filt_proc(struct knote *kn, long hint); static int filt_fileattach(struct knote *kn); static void filt_timerexpire(void *knx); static int filt_timerattach(struct knote *kn); static void filt_timerdetach(struct knote *kn); static int filt_timer(struct knote *kn, long hint); static struct filterops file_filtops = { 1, filt_fileattach, NULL, NULL }; static struct filterops kqread_filtops = { 1, NULL, filt_kqdetach, filt_kqueue }; static struct filterops proc_filtops = { 0, filt_procattach, filt_procdetach, filt_proc }; static struct filterops timer_filtops = { 0, filt_timerattach, filt_timerdetach, filt_timer }; static vm_zone_t knote_zone; static int kq_ncallouts = 0; static int kq_calloutmax = (4 * 1024); SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW, &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue"); #define KNOTE_ACTIVATE(kn) do { \ kn->kn_status |= KN_ACTIVE; \ if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) \ knote_enqueue(kn); \ } while(0) #define KN_HASHSIZE 64 /* XXX should be tunable */ #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask)) static int filt_nullattach(struct knote *kn) { return (ENXIO); }; struct filterops null_filtops = { 0, filt_nullattach, NULL, NULL }; extern struct filterops sig_filtops; /* * Table for for all system-defined filters. 
*/ static struct filterops *sysfilt_ops[] = { &file_filtops, /* EVFILT_READ */ &file_filtops, /* EVFILT_WRITE */ &null_filtops, /* EVFILT_AIO */ &file_filtops, /* EVFILT_VNODE */ &proc_filtops, /* EVFILT_PROC */ &sig_filtops, /* EVFILT_SIGNAL */ &timer_filtops, /* EVFILT_TIMER */ }; static int filt_fileattach(struct knote *kn) { return (fo_kqfilter(kn->kn_fp, kn)); } /*ARGSUSED*/ static int kqueue_kqfilter(struct file *fp, struct knote *kn) { struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data; if (kn->kn_filter != EVFILT_READ) return (1); kn->kn_fop = &kqread_filtops; SLIST_INSERT_HEAD(&kq->kq_sel.si_note, kn, kn_selnext); return (0); } static void filt_kqdetach(struct knote *kn) { struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data; SLIST_REMOVE(&kq->kq_sel.si_note, kn, knote, kn_selnext); } /*ARGSUSED*/ static int filt_kqueue(struct knote *kn, long hint) { struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data; kn->kn_data = kq->kq_count; return (kn->kn_data > 0); } static int filt_procattach(struct knote *kn) { struct proc *p; int error; p = pfind(kn->kn_id); if (p == NULL) return (ESRCH); if ((error = p_cansee(curproc, p))) { PROC_UNLOCK(p); return (error); } kn->kn_ptr.p_proc = p; kn->kn_flags |= EV_CLEAR; /* automatically set */ /* * internal flag indicating registration done by kernel */ if (kn->kn_flags & EV_FLAG1) { kn->kn_data = kn->kn_sdata; /* ppid */ kn->kn_fflags = NOTE_CHILD; kn->kn_flags &= ~EV_FLAG1; } SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext); PROC_UNLOCK(p); return (0); } /* * The knote may be attached to a different process, which may exit, * leaving nothing for the knote to be attached to. So when the process * exits, the knote is marked as DETACHED and also flagged as ONESHOT so * it will be deleted when read out. However, as part of the knote deletion, * this routine is called, so a check is needed to avoid actually performing * a detach, because the original process does not exist any more. 
*/ static void filt_procdetach(struct knote *kn) { struct proc *p = kn->kn_ptr.p_proc; if (kn->kn_status & KN_DETACHED) return; PROC_LOCK(p); SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext); PROC_UNLOCK(p); } static int filt_proc(struct knote *kn, long hint) { u_int event; /* * mask off extra data */ event = (u_int)hint & NOTE_PCTRLMASK; /* * if the user is interested in this event, record it. */ if (kn->kn_sfflags & event) kn->kn_fflags |= event; /* * process is gone, so flag the event as finished. */ if (event == NOTE_EXIT) { kn->kn_status |= KN_DETACHED; kn->kn_flags |= (EV_EOF | EV_ONESHOT); return (1); } /* * process forked, and user wants to track the new process, * so attach a new knote to it, and immediately report an * event with the parent's pid. */ if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) { struct kevent kev; int error; /* * register knote with new process. */ kev.ident = hint & NOTE_PDATAMASK; /* pid */ kev.filter = kn->kn_filter; kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1; kev.fflags = kn->kn_sfflags; kev.data = kn->kn_id; /* parent */ kev.udata = kn->kn_kevent.udata; /* preserve udata */ error = kqueue_register(kn->kn_kq, &kev, NULL); if (error) kn->kn_fflags |= NOTE_TRACKERR; } return (kn->kn_fflags != 0); } static void filt_timerexpire(void *knx) { struct knote *kn = knx; struct callout *calloutp; struct timeval tv; int tticks; kn->kn_data++; KNOTE_ACTIVATE(kn); if ((kn->kn_flags & EV_ONESHOT) == 0) { tv.tv_sec = kn->kn_sdata / 1000; tv.tv_usec = (kn->kn_sdata % 1000) * 1000; tticks = tvtohz(&tv); calloutp = (struct callout *)kn->kn_hook; callout_reset(calloutp, tticks, filt_timerexpire, kn); } } /* * data contains amount of time to sleep, in milliseconds */ static int filt_timerattach(struct knote *kn) { struct callout *calloutp; struct timeval tv; int tticks; if (kq_ncallouts >= kq_calloutmax) return (ENOMEM); kq_ncallouts++; tv.tv_sec = kn->kn_sdata / 1000; tv.tv_usec = (kn->kn_sdata % 1000) * 1000; tticks = 
tvtohz(&tv); kn->kn_flags |= EV_CLEAR; /* automatically set */ MALLOC(calloutp, struct callout *, sizeof(*calloutp), M_KQUEUE, M_WAITOK); callout_init(calloutp, 0); callout_reset(calloutp, tticks, filt_timerexpire, kn); kn->kn_hook = (caddr_t)calloutp; return (0); } static void filt_timerdetach(struct knote *kn) { struct callout *calloutp; calloutp = (struct callout *)kn->kn_hook; callout_stop(calloutp); FREE(calloutp, M_KQUEUE); kq_ncallouts--; } static int filt_timer(struct knote *kn, long hint) { return (kn->kn_data != 0); } /* * MPSAFE */ int kqueue(struct thread *td, struct kqueue_args *uap) { struct filedesc *fdp; struct kqueue *kq; struct file *fp; int fd, error; mtx_lock(&Giant); fdp = td->td_proc->p_fd; error = falloc(td, &fp, &fd); if (error) goto done2; + kq = malloc(sizeof(struct kqueue), M_KQUEUE, M_WAITOK | M_ZERO); + TAILQ_INIT(&kq->kq_head); + FILE_LOCK(fp); fp->f_flag = FREAD | FWRITE; fp->f_type = DTYPE_KQUEUE; fp->f_ops = &kqueueops; - kq = malloc(sizeof(struct kqueue), M_KQUEUE, M_WAITOK | M_ZERO); TAILQ_INIT(&kq->kq_head); fp->f_data = (caddr_t)kq; + FILE_UNLOCK(fp); + FILEDESC_LOCK(fdp); td->td_retval[0] = fd; if (fdp->fd_knlistsize < 0) fdp->fd_knlistsize = 0; /* this process has a kq */ + FILEDESC_UNLOCK(fdp); kq->kq_fdp = fdp; done2: mtx_unlock(&Giant); return (error); } #ifndef _SYS_SYSPROTO_H_ struct kevent_args { int fd; const struct kevent *changelist; int nchanges; struct kevent *eventlist; int nevents; const struct timespec *timeout; }; #endif /* * MPSAFE */ int kevent(struct thread *td, struct kevent_args *uap) { struct kevent *kevp; struct kqueue *kq; struct file *fp; struct timespec ts; int i, n, nerrors, error; - mtx_lock(&Giant); - if ((error = fget(td, uap->fd, &fp)) != 0) - goto done; - if (fp->f_type != DTYPE_KQUEUE) { - error = EBADF; - goto done; + fp = ffind_hold(td, uap->fd); + if (fp == NULL || fp->f_type != DTYPE_KQUEUE) { + if (fp != NULL) + fdrop(fp, td); + return (EBADF); } if (uap->timeout != NULL) { error = 
copyin(uap->timeout, &ts, sizeof(ts)); if (error) - goto done; + goto done_nogiant; uap->timeout = &ts; } + mtx_lock(&Giant); kq = (struct kqueue *)fp->f_data; nerrors = 0; while (uap->nchanges > 0) { n = uap->nchanges > KQ_NEVENTS ? KQ_NEVENTS : uap->nchanges; error = copyin(uap->changelist, kq->kq_kev, n * sizeof(struct kevent)); if (error) goto done; for (i = 0; i < n; i++) { kevp = &kq->kq_kev[i]; kevp->flags &= ~EV_SYSFLAGS; error = kqueue_register(kq, kevp, td); if (error) { if (uap->nevents != 0) { kevp->flags = EV_ERROR; kevp->data = error; (void) copyout((caddr_t)kevp, (caddr_t)uap->eventlist, sizeof(*kevp)); uap->eventlist++; uap->nevents--; nerrors++; } else { goto done; } } } uap->nchanges -= n; uap->changelist += n; } if (nerrors) { td->td_retval[0] = nerrors; error = 0; goto done; } error = kqueue_scan(fp, uap->nevents, uap->eventlist, uap->timeout, td); done: + mtx_unlock(&Giant); +done_nogiant: if (fp != NULL) fdrop(fp, td); - mtx_unlock(&Giant); return (error); } int kqueue_add_filteropts(int filt, struct filterops *filtops) { if (filt > 0) panic("filt(%d) > 0", filt); if (filt + EVFILT_SYSCOUNT < 0) panic("filt(%d) + EVFILT_SYSCOUNT(%d) == %d < 0", filt, EVFILT_SYSCOUNT, filt + EVFILT_SYSCOUNT); if (sysfilt_ops[~filt] != &null_filtops) panic("sysfilt_ops[~filt(%d)] != &null_filtops", filt); sysfilt_ops[~filt] = filtops; return (0); } int kqueue_del_filteropts(int filt) { if (filt > 0) panic("filt(%d) > 0", filt); if (filt + EVFILT_SYSCOUNT < 0) panic("filt(%d) + EVFILT_SYSCOUNT(%d) == %d < 0", filt, EVFILT_SYSCOUNT, filt + EVFILT_SYSCOUNT); if (sysfilt_ops[~filt] == &null_filtops) panic("sysfilt_ops[~filt(%d)] != &null_filtops", filt); sysfilt_ops[~filt] = &null_filtops; return (0); } int kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td) { struct filedesc *fdp = kq->kq_fdp; struct filterops *fops; struct file *fp = NULL; struct knote *kn = NULL; int s, error = 0; if (kev->filter < 0) { if (kev->filter + EVFILT_SYSCOUNT < 0) 
return (EINVAL); fops = sysfilt_ops[~kev->filter]; /* to 0-base index */ } else { /* * XXX * filter attach routine is responsible for insuring that * the identifier can be attached to it. */ printf("unknown filter: %d\n", kev->filter); return (EINVAL); } + FILEDESC_LOCK(fdp); if (fops->f_isfd) { /* validate descriptor */ if ((u_int)kev->ident >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[kev->ident]) == NULL) + (fp = fdp->fd_ofiles[kev->ident]) == NULL) { + FILEDESC_UNLOCK(fdp); return (EBADF); + } fhold(fp); if (kev->ident < fdp->fd_knlistsize) { SLIST_FOREACH(kn, &fdp->fd_knlist[kev->ident], kn_link) if (kq == kn->kn_kq && kev->filter == kn->kn_filter) break; } } else { if (fdp->fd_knhashmask != 0) { struct klist *list; list = &fdp->fd_knhash[ KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)]; SLIST_FOREACH(kn, list, kn_link) if (kev->ident == kn->kn_id && kq == kn->kn_kq && kev->filter == kn->kn_filter) break; } } + FILEDESC_UNLOCK(fdp); if (kn == NULL && ((kev->flags & EV_ADD) == 0)) { error = ENOENT; goto done; } /* * kn now contains the matching knote, or NULL if no match */ if (kev->flags & EV_ADD) { if (kn == NULL) { kn = knote_alloc(); if (kn == NULL) { error = ENOMEM; goto done; } kn->kn_fp = fp; kn->kn_kq = kq; kn->kn_fop = fops; /* * apply reference count to knote structure, and * do not release it at the end of this routine. */ fp = NULL; kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; kev->fflags = 0; kev->data = 0; kn->kn_kevent = *kev; knote_attach(kn, fdp); if ((error = fops->f_attach(kn)) != 0) { knote_drop(kn, td); goto done; } } else { /* * The user may change some filter values after the * initial EV_ADD, but doing so will not reset any * filter which have already been triggered. 
*/ kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; kn->kn_kevent.udata = kev->udata; } s = splhigh(); if (kn->kn_fop->f_event(kn, 0)) KNOTE_ACTIVATE(kn); splx(s); } else if (kev->flags & EV_DELETE) { kn->kn_fop->f_detach(kn); knote_drop(kn, td); goto done; } if ((kev->flags & EV_DISABLE) && ((kn->kn_status & KN_DISABLED) == 0)) { s = splhigh(); kn->kn_status |= KN_DISABLED; splx(s); } if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) { s = splhigh(); kn->kn_status &= ~KN_DISABLED; if ((kn->kn_status & KN_ACTIVE) && ((kn->kn_status & KN_QUEUED) == 0)) knote_enqueue(kn); splx(s); } done: if (fp != NULL) fdrop(fp, td); return (error); } static int kqueue_scan(struct file *fp, int maxevents, struct kevent *ulistp, const struct timespec *tsp, struct thread *td) { - struct kqueue *kq = (struct kqueue *)fp->f_data; + struct kqueue *kq; struct kevent *kevp; struct timeval atv, rtv, ttv; struct knote *kn, marker; int s, count, timeout, nkev = 0, error = 0; + FILE_LOCK_ASSERT(fp, MA_NOTOWNED); + + kq = (struct kqueue *)fp->f_data; count = maxevents; if (count == 0) goto done; if (tsp != NULL) { TIMESPEC_TO_TIMEVAL(&atv, tsp); if (itimerfix(&atv)) { error = EINVAL; goto done; } if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) timeout = -1; else timeout = atv.tv_sec > 24 * 60 * 60 ? 24 * 60 * 60 * hz : tvtohz(&atv); getmicrouptime(&rtv); timevaladd(&atv, &rtv); } else { atv.tv_sec = 0; atv.tv_usec = 0; timeout = 0; } goto start; retry: if (atv.tv_sec || atv.tv_usec) { getmicrouptime(&rtv); if (timevalcmp(&rtv, &atv, >=)) goto done; ttv = atv; timevalsub(&ttv, &rtv); timeout = ttv.tv_sec > 24 * 60 * 60 ? 24 * 60 * 60 * hz : tvtohz(&ttv); } start: kevp = kq->kq_kev; s = splhigh(); if (kq->kq_count == 0) { if (timeout < 0) { error = EWOULDBLOCK; } else { kq->kq_state |= KQ_SLEEP; error = tsleep(kq, PSOCK | PCATCH, "kqread", timeout); } splx(s); if (error == 0) goto retry; /* don't restart after signals... 
*/ if (error == ERESTART) error = EINTR; else if (error == EWOULDBLOCK) error = 0; goto done; } TAILQ_INSERT_TAIL(&kq->kq_head, &marker, kn_tqe); while (count) { kn = TAILQ_FIRST(&kq->kq_head); TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); if (kn == &marker) { splx(s); if (count == maxevents) goto retry; goto done; } if (kn->kn_status & KN_DISABLED) { kn->kn_status &= ~KN_QUEUED; kq->kq_count--; continue; } if ((kn->kn_flags & EV_ONESHOT) == 0 && kn->kn_fop->f_event(kn, 0) == 0) { kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE); kq->kq_count--; continue; } *kevp = kn->kn_kevent; kevp++; nkev++; if (kn->kn_flags & EV_ONESHOT) { kn->kn_status &= ~KN_QUEUED; kq->kq_count--; splx(s); kn->kn_fop->f_detach(kn); knote_drop(kn, td); s = splhigh(); } else if (kn->kn_flags & EV_CLEAR) { kn->kn_data = 0; kn->kn_fflags = 0; kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE); kq->kq_count--; } else { TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); } count--; if (nkev == KQ_NEVENTS) { splx(s); error = copyout((caddr_t)&kq->kq_kev, (caddr_t)ulistp, sizeof(struct kevent) * nkev); ulistp += nkev; nkev = 0; kevp = kq->kq_kev; s = splhigh(); if (error) break; } } TAILQ_REMOVE(&kq->kq_head, &marker, kn_tqe); splx(s); done: if (nkev != 0) error = copyout((caddr_t)&kq->kq_kev, (caddr_t)ulistp, sizeof(struct kevent) * nkev); td->td_retval[0] = maxevents - count; return (error); } /* * XXX * This could be expanded to call kqueue_scan, if desired. 
*/ /*ARGSUSED*/ static int kqueue_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td) { return (ENXIO); } /*ARGSUSED*/ static int kqueue_write(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td) { return (ENXIO); } /*ARGSUSED*/ static int kqueue_ioctl(struct file *fp, u_long com, caddr_t data, struct thread *td) { return (ENOTTY); } /*ARGSUSED*/ static int kqueue_poll(struct file *fp, int events, struct ucred *cred, struct thread *td) { - struct kqueue *kq = (struct kqueue *)fp->f_data; + struct kqueue *kq; int revents = 0; int s = splnet(); + kq = (struct kqueue *)fp->f_data; if (events & (POLLIN | POLLRDNORM)) { if (kq->kq_count) { revents |= events & (POLLIN | POLLRDNORM); } else { selrecord(td, &kq->kq_sel); kq->kq_state |= KQ_SEL; } } splx(s); return (revents); } /*ARGSUSED*/ static int kqueue_stat(struct file *fp, struct stat *st, struct thread *td) { - struct kqueue *kq = (struct kqueue *)fp->f_data; + struct kqueue *kq; + kq = (struct kqueue *)fp->f_data; bzero((void *)st, sizeof(*st)); st->st_size = kq->kq_count; st->st_blksize = sizeof(struct kevent); st->st_mode = S_IFIFO; return (0); } /*ARGSUSED*/ static int kqueue_close(struct file *fp, struct thread *td) { struct kqueue *kq = (struct kqueue *)fp->f_data; struct filedesc *fdp = td->td_proc->p_fd; struct knote **knp, *kn, *kn0; int i; + FILEDESC_LOCK(fdp); for (i = 0; i < fdp->fd_knlistsize; i++) { knp = &SLIST_FIRST(&fdp->fd_knlist[i]); kn = *knp; while (kn != NULL) { kn0 = SLIST_NEXT(kn, kn_link); if (kq == kn->kn_kq) { kn->kn_fop->f_detach(kn); - fdrop(kn->kn_fp, td); - knote_free(kn); *knp = kn0; + FILE_LOCK(kn->kn_fp); + FILEDESC_UNLOCK(fdp); + fdrop_locked(kn->kn_fp, td); + knote_free(kn); + FILEDESC_LOCK(fdp); } else { knp = &SLIST_NEXT(kn, kn_link); } kn = kn0; } } if (fdp->fd_knhashmask != 0) { for (i = 0; i < fdp->fd_knhashmask + 1; i++) { knp = &SLIST_FIRST(&fdp->fd_knhash[i]); kn = *knp; while (kn != NULL) { kn0 = 
SLIST_NEXT(kn, kn_link); if (kq == kn->kn_kq) { kn->kn_fop->f_detach(kn); + *knp = kn0; /* XXX non-fd release of kn->kn_ptr */ + FILEDESC_UNLOCK(fdp); knote_free(kn); - *knp = kn0; + FILEDESC_LOCK(fdp); } else { knp = &SLIST_NEXT(kn, kn_link); } kn = kn0; } } } + FILEDESC_UNLOCK(fdp); free(kq, M_KQUEUE); fp->f_data = NULL; return (0); } static void kqueue_wakeup(struct kqueue *kq) { if (kq->kq_state & KQ_SLEEP) { kq->kq_state &= ~KQ_SLEEP; wakeup(kq); } if (kq->kq_state & KQ_SEL) { kq->kq_state &= ~KQ_SEL; selwakeup(&kq->kq_sel); } KNOTE(&kq->kq_sel.si_note, 0); } /* * walk down a list of knotes, activating them if their event has triggered. */ void knote(struct klist *list, long hint) { struct knote *kn; SLIST_FOREACH(kn, list, kn_selnext) if (kn->kn_fop->f_event(kn, hint)) KNOTE_ACTIVATE(kn); } /* * remove all knotes from a specified klist */ void knote_remove(struct thread *td, struct klist *list) { struct knote *kn; while ((kn = SLIST_FIRST(list)) != NULL) { kn->kn_fop->f_detach(kn); knote_drop(kn, td); } } /* * remove all knotes referencing a specified fd */ void knote_fdclose(struct thread *td, int fd) { struct filedesc *fdp = td->td_proc->p_fd; - struct klist *list = &fdp->fd_knlist[fd]; + struct klist *list; + FILEDESC_LOCK(fdp); + list = &fdp->fd_knlist[fd]; + FILEDESC_UNLOCK(fdp); knote_remove(td, list); } static void knote_attach(struct knote *kn, struct filedesc *fdp) { - struct klist *list; - int size; + struct klist *list, *oldlist; + int size, newsize; + FILEDESC_LOCK(fdp); + if (! 
kn->kn_fop->f_isfd) { if (fdp->fd_knhashmask == 0) fdp->fd_knhash = hashinit(KN_HASHSIZE, M_KQUEUE, &fdp->fd_knhashmask); list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)]; goto done; } if (fdp->fd_knlistsize <= kn->kn_id) { +retry: size = fdp->fd_knlistsize; while (size <= kn->kn_id) size += KQEXTENT; + FILEDESC_UNLOCK(fdp); MALLOC(list, struct klist *, size * sizeof(struct klist *), M_KQUEUE, M_WAITOK); + FILEDESC_LOCK(fdp); + newsize = fdp->fd_knlistsize; + while (newsize <= kn->kn_id) + newsize += KQEXTENT; + if (newsize != size) { + FILEDESC_UNLOCK(fdp); + free(list, M_TEMP); + FILEDESC_LOCK(fdp); + goto retry; + } bcopy((caddr_t)fdp->fd_knlist, (caddr_t)list, fdp->fd_knlistsize * sizeof(struct klist *)); bzero((caddr_t)list + fdp->fd_knlistsize * sizeof(struct klist *), (size - fdp->fd_knlistsize) * sizeof(struct klist *)); if (fdp->fd_knlist != NULL) - FREE(fdp->fd_knlist, M_KQUEUE); + oldlist = fdp->fd_knlist; + else + oldlist = NULL; fdp->fd_knlistsize = size; fdp->fd_knlist = list; + FILEDESC_UNLOCK(fdp); + if (oldlist != NULL) + FREE(oldlist, M_KQUEUE); + FILEDESC_LOCK(fdp); } list = &fdp->fd_knlist[kn->kn_id]; done: + FILEDESC_UNLOCK(fdp); SLIST_INSERT_HEAD(list, kn, kn_link); kn->kn_status = 0; } /* * should be called at spl == 0, since we don't want to hold spl * while calling fdrop and free. 
*/ static void knote_drop(struct knote *kn, struct thread *td) { struct filedesc *fdp = td->td_proc->p_fd; struct klist *list; + FILEDESC_LOCK(fdp); if (kn->kn_fop->f_isfd) list = &fdp->fd_knlist[kn->kn_id]; else list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)]; + if (kn->kn_fop->f_isfd) + FILE_LOCK(kn->kn_fp); + FILEDESC_UNLOCK(fdp); SLIST_REMOVE(list, kn, knote, kn_link); if (kn->kn_status & KN_QUEUED) knote_dequeue(kn); if (kn->kn_fop->f_isfd) - fdrop(kn->kn_fp, td); + fdrop_locked(kn->kn_fp, td); knote_free(kn); } static void knote_enqueue(struct knote *kn) { struct kqueue *kq = kn->kn_kq; int s = splhigh(); KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued")); TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); kn->kn_status |= KN_QUEUED; kq->kq_count++; splx(s); kqueue_wakeup(kq); } static void knote_dequeue(struct knote *kn) { struct kqueue *kq = kn->kn_kq; int s = splhigh(); KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued")); TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); kn->kn_status &= ~KN_QUEUED; kq->kq_count--; splx(s); } static void knote_init(void) { knote_zone = zinit("KNOTE", sizeof(struct knote), 0, 0, 1); } SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL) static struct knote * knote_alloc(void) { return ((struct knote *)zalloc(knote_zone)); } static void knote_free(struct knote *kn) { zfree(knote_zone, kn); } Index: head/sys/kern/kern_exec.c =================================================================== --- head/sys/kern/kern_exec.c (revision 89305) +++ head/sys/kern/kern_exec.c (revision 89306) @@ -1,986 +1,989 @@ /* * Copyright (c) 1993, David Greenman * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments"); static MALLOC_DEFINE(M_ATEXEC, "atexec", "atexec callback"); /* * callout list for things to do at exec time */ struct execlist { execlist_fn function; TAILQ_ENTRY(execlist) next; }; TAILQ_HEAD(exec_list_head, execlist); static struct exec_list_head exec_list = TAILQ_HEAD_INITIALIZER(exec_list); static register_t *exec_copyout_strings __P((struct image_params *)); /* XXX This should be vm_size_t. */ static u_long ps_strings = PS_STRINGS; SYSCTL_ULONG(_kern, KERN_PS_STRINGS, ps_strings, CTLFLAG_RD, &ps_strings, 0, ""); /* XXX This should be vm_size_t. 
*/ static u_long usrstack = USRSTACK; SYSCTL_ULONG(_kern, KERN_USRSTACK, usrstack, CTLFLAG_RD, &usrstack, 0, ""); u_long ps_arg_cache_limit = PAGE_SIZE / 16; SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW, &ps_arg_cache_limit, 0, ""); int ps_argsopen = 1; SYSCTL_INT(_kern, OID_AUTO, ps_argsopen, CTLFLAG_RW, &ps_argsopen, 0, ""); /* * Each of the items is a pointer to a `const struct execsw', hence the * double pointer here. */ static const struct execsw **execsw; #ifndef _SYS_SYSPROTO_H_ struct execve_args { char *fname; char **argv; char **envv; }; #endif /* * execve() system call. * * MPSAFE */ int execve(td, uap) struct thread *td; register struct execve_args *uap; { struct proc *p = td->td_proc; struct nameidata nd, *ndp; struct ucred *newcred, *oldcred; register_t *stack_base; int error, len, i; struct image_params image_params, *imgp; struct vattr attr; int (*img_first) __P((struct image_params *)); struct pargs *pa; struct execlist *ep; imgp = &image_params; /* * Lock the process and set the P_INEXEC flag to indicate that * it should be left alone until we're done here. This is * necessary to avoid race conditions - e.g. in ptrace() - * that might allow a local user to illicitly obtain elevated * privileges. */ mtx_lock(&Giant); PROC_LOCK(p); KASSERT((p->p_flag & P_INEXEC) == 0, ("%s(): process already has P_INEXEC flag", __func__)); p->p_flag |= P_INEXEC; PROC_UNLOCK(p); /* XXXKSE */ /* !!!!!!!! we need abort all the other threads of this process before we */ /* proceed beyond his point! 
*/ /* * Initialize part of the common data */ imgp->proc = p; imgp->uap = uap; imgp->attr = &attr; imgp->argc = imgp->envc = 0; imgp->argv0 = NULL; imgp->entry_addr = 0; imgp->vmspace_destroyed = 0; imgp->interpreted = 0; imgp->interpreter_name[0] = '\0'; imgp->auxargs = NULL; imgp->vp = NULL; imgp->firstpage = NULL; imgp->ps_strings = 0; imgp->auxarg_size = 0; /* * Allocate temporary demand zeroed space for argument and * environment strings */ imgp->stringbase = (char *)kmem_alloc_wait(exec_map, ARG_MAX + PAGE_SIZE); if (imgp->stringbase == NULL) { error = ENOMEM; goto exec_fail; } imgp->stringp = imgp->stringbase; imgp->stringspace = ARG_MAX; imgp->image_header = imgp->stringbase + ARG_MAX; /* * Translate the file name. namei() returns a vnode pointer * in ni_vp amoung other things. */ ndp = &nd; NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, UIO_USERSPACE, uap->fname, td); interpret: error = namei(ndp); if (error) { kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, ARG_MAX + PAGE_SIZE); goto exec_fail; } imgp->vp = ndp->ni_vp; imgp->fname = uap->fname; /* * Check file permissions (also 'opens' file) */ error = exec_check_permissions(imgp); if (error) { VOP_UNLOCK(imgp->vp, 0, td); goto exec_fail_dealloc; } error = exec_map_first_page(imgp); VOP_UNLOCK(imgp->vp, 0, td); if (error) goto exec_fail_dealloc; /* * If the current process has a special image activator it * wants to try first, call it. For example, emulating shell * scripts differently. */ error = -1; if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL) error = img_first(imgp); /* * Loop through the list of image activators, calling each one. * An activator returns -1 if there is no match, 0 on success, * and an error otherwise. 
*/ for (i = 0; error == -1 && execsw[i]; ++i) { if (execsw[i]->ex_imgact == NULL || execsw[i]->ex_imgact == img_first) { continue; } error = (*execsw[i]->ex_imgact)(imgp); } if (error) { if (error == -1) error = ENOEXEC; goto exec_fail_dealloc; } /* * Special interpreter operation, cleanup and loop up to try to * activate the interpreter. */ if (imgp->interpreted) { exec_unmap_first_page(imgp); /* free name buffer and old vnode */ NDFREE(ndp, NDF_ONLY_PNBUF); vrele(ndp->ni_vp); /* set new name to that of the interpreter */ NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, UIO_SYSSPACE, imgp->interpreter_name, td); goto interpret; } TAILQ_FOREACH(ep, &exec_list, next) (*ep->function)(p); /* * Copy out strings (args and env) and initialize stack base */ stack_base = exec_copyout_strings(imgp); p->p_vmspace->vm_minsaddr = (char *)stack_base; /* * If custom stack fixup routine present for this process * let it do the stack setup. * Else stuff argument count as first item on stack */ if (p->p_sysent->sv_fixup) (*p->p_sysent->sv_fixup)(&stack_base, imgp); else suword(--stack_base, imgp->argc); /* * For security and other reasons, the file descriptor table cannot * be shared after an exec. */ + FILEDESC_LOCK(p->p_fd); if (p->p_fd->fd_refcnt > 1) { struct filedesc *tmp; tmp = fdcopy(td); + FILEDESC_UNLOCK(p->p_fd); fdfree(td); p->p_fd = tmp; - } + } else + FILEDESC_UNLOCK(p->p_fd); /* * For security and other reasons, signal handlers cannot * be shared after an exec. The new process gets a copy of the old * handlers. In execsigs(), the new process will have its signals * reset. 
*/ if (p->p_procsig->ps_refcnt > 1) { struct procsig *newprocsig; MALLOC(newprocsig, struct procsig *, sizeof(struct procsig), M_SUBPROC, M_WAITOK); bcopy(p->p_procsig, newprocsig, sizeof(*newprocsig)); p->p_procsig->ps_refcnt--; p->p_procsig = newprocsig; p->p_procsig->ps_refcnt = 1; if (p->p_sigacts == &p->p_uarea->u_sigacts) panic("shared procsig but private sigacts?"); p->p_uarea->u_sigacts = *p->p_sigacts; p->p_sigacts = &p->p_uarea->u_sigacts; } /* Stop profiling */ stopprofclock(p); /* close files on exec */ fdcloseexec(td); /* reset caught signals */ execsigs(p); /* name this process - nameiexec(p, ndp) */ len = min(ndp->ni_cnd.cn_namelen,MAXCOMLEN); bcopy(ndp->ni_cnd.cn_nameptr, p->p_comm, len); p->p_comm[len] = 0; /* * mark as execed, wakeup the process that vforked (if any) and tell * it that it now has its own resources back */ PROC_LOCK(p); p->p_flag |= P_EXEC; if (p->p_pptr && (p->p_flag & P_PPWAIT)) { p->p_flag &= ~P_PPWAIT; wakeup((caddr_t)p->p_pptr); } /* * Implement image setuid/setgid. * * Don't honor setuid/setgid if the filesystem prohibits it or if * the process is being traced. */ oldcred = p->p_ucred; newcred = NULL; if ((((attr.va_mode & VSUID) && oldcred->cr_uid != attr.va_uid) || ((attr.va_mode & VSGID) && oldcred->cr_gid != attr.va_gid)) && (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 && (p->p_flag & P_TRACED) == 0) { PROC_UNLOCK(p); /* * Turn off syscall tracing for set-id programs, except for * root. Record any set-id flags first to make sure that * we do not regain any tracing during a possible block. */ setsugid(p); if (p->p_tracep && suser_xxx(oldcred, NULL, PRISON_ROOT)) { struct vnode *vtmp; if ((vtmp = p->p_tracep) != NULL) { p->p_tracep = NULL; p->p_traceflag = 0; vrele(vtmp); } } /* * Set the new credentials. 
*/ newcred = crdup(oldcred); if (attr.va_mode & VSUID) change_euid(newcred, attr.va_uid); if (attr.va_mode & VSGID) change_egid(newcred, attr.va_gid); setugidsafety(td); } else { if (oldcred->cr_uid == oldcred->cr_ruid && oldcred->cr_gid == oldcred->cr_rgid) p->p_flag &= ~P_SUGID; PROC_UNLOCK(p); } /* * Implement correct POSIX saved-id behavior. * * XXX: It's not clear that the existing behavior is * POSIX-compliant. A number of sources indicate that the saved * uid/gid should only be updated if the new ruid is not equal to * the old ruid, or the new euid is not equal to the old euid and * the new euid is not equal to the old ruid. The FreeBSD code * always updates the saved uid/gid. Also, this code uses the new * (replaced) euid and egid as the source, which may or may not be * the right ones to use. */ if (newcred == NULL) { if (oldcred->cr_svuid != oldcred->cr_uid || oldcred->cr_svgid != oldcred->cr_gid) { newcred = crdup(oldcred); change_svuid(newcred, newcred->cr_uid); change_svgid(newcred, newcred->cr_gid); } } else { change_svuid(newcred, newcred->cr_uid); change_svgid(newcred, newcred->cr_gid); } if (newcred != NULL) { PROC_LOCK(p); p->p_ucred = newcred; PROC_UNLOCK(p); crfree(oldcred); } /* * Store the vp for use in procfs */ if (p->p_textvp) /* release old reference */ vrele(p->p_textvp); VREF(ndp->ni_vp); p->p_textvp = ndp->ni_vp; /* * Notify others that we exec'd, and clear the P_INEXEC flag * as we're now a bona fide freshly-execed process. */ PROC_LOCK(p); KNOTE(&p->p_klist, NOTE_EXEC); p->p_flag &= ~P_INEXEC; /* * If tracing the process, trap to debugger so breakpoints * can be set before the program executes. 
*/ _STOPEVENT(p, S_EXEC, 0); if (p->p_flag & P_TRACED) psignal(p, SIGTRAP); /* clear "fork but no exec" flag, as we _are_ execing */ p->p_acflag &= ~AFORK; /* Free any previous argument cache */ pa = p->p_args; p->p_args = NULL; PROC_UNLOCK(p); if (pa != NULL && --pa->ar_ref == 0) FREE(pa, M_PARGS); /* Set values passed into the program in registers. */ setregs(td, imgp->entry_addr, (u_long)(uintptr_t)stack_base, imgp->ps_strings); /* Cache arguments if they fit inside our allowance */ i = imgp->endargs - imgp->stringbase; if (ps_arg_cache_limit >= i + sizeof(struct pargs)) { MALLOC(pa, struct pargs *, sizeof(struct pargs) + i, M_PARGS, M_WAITOK); pa->ar_ref = 1; pa->ar_length = i; bcopy(imgp->stringbase, pa->ar_args, i); PROC_LOCK(p); p->p_args = pa; PROC_UNLOCK(p); } exec_fail_dealloc: /* * free various allocated resources */ if (imgp->firstpage) exec_unmap_first_page(imgp); if (imgp->stringbase != NULL) kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, ARG_MAX + PAGE_SIZE); if (imgp->vp) { NDFREE(ndp, NDF_ONLY_PNBUF); vrele(imgp->vp); } if (error == 0) goto done2; exec_fail: /* we're done here, clear P_INEXEC */ PROC_LOCK(p); p->p_flag &= ~P_INEXEC; PROC_UNLOCK(p); if (imgp->vmspace_destroyed) { /* sorry, no more process anymore. 
exit gracefully */ exit1(td, W_EXITCODE(0, SIGABRT)); /* NOT REACHED */ error = 0; } done2: mtx_unlock(&Giant); return (error); } int exec_map_first_page(imgp) struct image_params *imgp; { int rv, i; int initial_pagein; vm_page_t ma[VM_INITIAL_PAGEIN]; vm_object_t object; GIANT_REQUIRED; if (imgp->firstpage) { exec_unmap_first_page(imgp); } VOP_GETVOBJECT(imgp->vp, &object); ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); if ((ma[0]->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) { initial_pagein = VM_INITIAL_PAGEIN; if (initial_pagein > object->size) initial_pagein = object->size; for (i = 1; i < initial_pagein; i++) { if ((ma[i] = vm_page_lookup(object, i)) != NULL) { if ((ma[i]->flags & PG_BUSY) || ma[i]->busy) break; if (ma[i]->valid) break; vm_page_busy(ma[i]); } else { ma[i] = vm_page_alloc(object, i, VM_ALLOC_NORMAL); if (ma[i] == NULL) break; } } initial_pagein = i; rv = vm_pager_get_pages(object, ma, initial_pagein, 0); ma[0] = vm_page_lookup(object, 0); if ((rv != VM_PAGER_OK) || (ma[0] == NULL) || (ma[0]->valid == 0)) { if (ma[0]) { vm_page_protect(ma[0], VM_PROT_NONE); vm_page_free(ma[0]); } return EIO; } } vm_page_wire(ma[0]); vm_page_wakeup(ma[0]); pmap_kenter((vm_offset_t) imgp->image_header, VM_PAGE_TO_PHYS(ma[0])); imgp->firstpage = ma[0]; return 0; } void exec_unmap_first_page(imgp) struct image_params *imgp; { GIANT_REQUIRED; if (imgp->firstpage) { pmap_kremove((vm_offset_t) imgp->image_header); vm_page_unwire(imgp->firstpage, 1); imgp->firstpage = NULL; } } /* * Destroy old address space, and allocate a new stack * The new stack is only SGROWSIZ large because it is grown * automatically in trap.c. 
 */
/*
 * exec_new_vmspace:
 *	Tear down the old process address space and set up a fresh stack
 *	for the image being exec'd.  Returns 0 on success or an errno
 *	from vm_map_stack().  Sets imgp->vmspace_destroyed so callers
 *	know there is no old image to fall back to on later failure.
 */
int
exec_new_vmspace(imgp)
	struct image_params *imgp;
{
	int error;
	struct vmspace *vmspace = imgp->proc->p_vmspace;
	/* Stack occupies the maxssiz bytes just below USRSTACK. */
	vm_offset_t stack_addr = USRSTACK - maxssiz;
	vm_map_t map = &vmspace->vm_map;

	GIANT_REQUIRED;

	/* Point of no return: the old address space is gone after this. */
	imgp->vmspace_destroyed = 1;

	/*
	 * Blow away entire process VM, if address space not shared,
	 * otherwise, create a new VM space so that other threads are
	 * not disrupted
	 */
	if (vmspace->vm_refcnt == 1) {
		if (vmspace->vm_shm)
			shmexit(imgp->proc);
		pmap_remove_pages(vmspace_pmap(vmspace), 0,
		    VM_MAXUSER_ADDRESS);
		vm_map_remove(map, 0, VM_MAXUSER_ADDRESS);
	} else {
		/* Shared: give this process its own copy instead. */
		vmspace_exec(imgp->proc);
		vmspace = imgp->proc->p_vmspace;
		map = &vmspace->vm_map;
	}

	/* Allocate a new stack */
	error = vm_map_stack(&vmspace->vm_map, stack_addr, (vm_size_t)maxssiz,
	    VM_PROT_ALL, VM_PROT_ALL, 0);
	if (error)
		return (error);

#ifdef __ia64__
	{
		/*
		 * Allocate backing store.  We really need something
		 * similar to vm_map_stack which can allow the backing
		 * store to grow upwards.  This will do for now.
		 */
		vm_offset_t bsaddr;
		bsaddr = USRSTACK - 2*maxssiz;
		error = vm_map_find(&vmspace->vm_map, 0, 0, &bsaddr,
		    4*PAGE_SIZE, 0, VM_PROT_ALL, VM_PROT_ALL, 0);
		imgp->proc->p_thread.td_md.md_bspstore = bsaddr;
	}
#endif

	/* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the
	 * VM_STACK case, but they are still used to monitor the size of the
	 * process stack so we can check the stack rlimit.
	 */
	vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT;
	vmspace->vm_maxsaddr = (char *)USRSTACK - maxssiz;

	return(0);
}

/*
 * Copy out argument and environment strings from the old process
 * address space into the temporary string buffer.
 */
/*
 * exec_extract_strings:
 *	Copy the user-space argv[] and envv[] string vectors into imgp's
 *	kernel string buffer, counting strings into imgp->argc/imgp->envc
 *	and tracking the remaining room in imgp->stringspace.  Returns 0,
 *	EFAULT on a bad user pointer, E2BIG if a string does not fit, or
 *	another copyinstr() error.
 */
int
exec_extract_strings(imgp)
	struct image_params *imgp;
{
	char **argv, **envv;
	char *argp, *envp;
	int error;
	size_t length;

	/*
	 * extract arguments first
	 */
	argv = imgp->uap->argv;
	if (argv) {
		argp = (caddr_t) (intptr_t) fuword(argv);
		if (argp == (caddr_t) -1)
			return (EFAULT);
		if (argp)
			argv++;
		/* An explicit argv0 (e.g. from an interpreter) overrides. */
		if (imgp->argv0)
			argp = imgp->argv0;
		if (argp) {
			do {
				if (argp == (caddr_t) -1)
					return (EFAULT);
				if ((error = copyinstr(argp, imgp->stringp,
				    imgp->stringspace, &length))) {
					if (error == ENAMETOOLONG)
						return(E2BIG);
					return (error);
				}
				imgp->stringspace -= length;
				imgp->stringp += length;
				imgp->argc++;
			} while ((argp = (caddr_t) (intptr_t)
			    fuword(argv++)));
		}
	}

	/* Mark where the arguments end and the environment begins. */
	imgp->endargs = imgp->stringp;

	/*
	 * extract environment strings
	 */
	envv = imgp->uap->envv;
	if (envv) {
		while ((envp = (caddr_t) (intptr_t) fuword(envv++))) {
			if (envp == (caddr_t) -1)
				return (EFAULT);
			if ((error = copyinstr(envp, imgp->stringp,
			    imgp->stringspace, &length))) {
				if (error == ENAMETOOLONG)
					return(E2BIG);
				return (error);
			}
			imgp->stringspace -= length;
			imgp->stringp += length;
			imgp->envc++;
		}
	}

	return (0);
}

/*
 * Copy strings out to the new process address space, constructing
 * new arg and env vector tables. Return a pointer to the base
 * so that it can be used as the initial stack pointer.
 */
register_t *
exec_copyout_strings(imgp)
	struct image_params *imgp;
{
	int argc, envc;
	char **vectp;
	char *stringp, *destp;
	register_t *stack_base;
	struct ps_strings *arginfo;
	int szsigcode;

	/*
	 * Calculate string base and vector table pointers.
	 * Also deal with signal trampoline code for this exec type.
	 */
	arginfo = (struct ps_strings *)PS_STRINGS;
	szsigcode = *(imgp->proc->p_sysent->sv_szsigcode);
	destp = (caddr_t)arginfo - szsigcode - SPARE_USRSPACE -
	    roundup((ARG_MAX - imgp->stringspace), sizeof(char *));

	/*
	 * install sigcode
	 */
	if (szsigcode)
		copyout(imgp->proc->p_sysent->sv_sigcode,
		    ((caddr_t)arginfo - szsigcode), szsigcode);

	/*
	 * If we have a valid auxargs ptr, prepare some room
	 * on the stack.
	 */
	if (imgp->auxargs) {
		/*
		 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
		 * lower compatibility.
		 */
		imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size
		    : (AT_COUNT * 2);
		/*
		 * The '+ 2' is for the null pointers at the end of each of
		 * the arg and env vector sets,and imgp->auxarg_size is room
		 * for argument of Runtime loader.
		 */
		vectp = (char **) (destp - (imgp->argc + imgp->envc + 2 +
		    imgp->auxarg_size) * sizeof(char *));
	} else
		/*
		 * The '+ 2' is for the null pointers at the end of each of
		 * the arg and env vector sets
		 */
		vectp = (char **) (destp - (imgp->argc + imgp->envc + 2) *
		    sizeof(char *));

	/*
	 * vectp also becomes our initial stack base
	 */
	stack_base = (register_t *)vectp;

	stringp = imgp->stringbase;
	argc = imgp->argc;
	envc = imgp->envc;

	/*
	 * Copy out strings - arguments and environment.
	 */
	copyout(stringp, destp, ARG_MAX - imgp->stringspace);

	/*
	 * Fill in "ps_strings" struct for ps, w, etc.
	 */
	suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp);
	suword(&arginfo->ps_nargvstr, argc);

	/*
	 * Fill in argument portion of vector table.
	 */
	for (; argc > 0; --argc) {
		suword(vectp++, (long)(intptr_t)destp);
		/* Advance destp past this NUL-terminated string. */
		while (*stringp++ != 0)
			destp++;
		destp++;
	}

	/* a null vector table pointer separates the argp's from the envp's */
	suword(vectp++, 0);

	suword(&arginfo->ps_envstr, (long)(intptr_t)vectp);
	suword(&arginfo->ps_nenvstr, envc);

	/*
	 * Fill in environment portion of vector table.
	 */
	for (; envc > 0; --envc) {
		suword(vectp++, (long)(intptr_t)destp);
		while (*stringp++ != 0)
			destp++;
		destp++;
	}

	/* end of vector table is a null pointer */
	suword(vectp, 0);

	return (stack_base);
}

/*
 * Check permissions of file to execute.
 * Called with imgp->vp locked.
 * Return 0 for success or error code on failure.
 */
int
exec_check_permissions(imgp)
	struct image_params *imgp;
{
	struct proc *p = imgp->proc;
	struct vnode *vp = imgp->vp;
	struct vattr *attr = imgp->attr;
	int error;

	/* Get file attributes */
	error = VOP_GETATTR(vp, attr, p->p_ucred, curthread); /* XXXKSE */
	if (error)
		return (error);

	/*
	 * 1) Check if file execution is disabled for the filesystem that this
	 *	file resides on.
	 * 2) Insure that at least one execute bit is on - otherwise root
	 *	will always succeed, and we don't want to happen unless the
	 *	file really is executable.
	 * 3) Insure that the file is a regular file.
	 */
	if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
	    ((attr->va_mode & 0111) == 0) ||
	    (attr->va_type != VREG)) {
		return (EACCES);
	}

	/*
	 * Zero length files can't be exec'd
	 */
	if (attr->va_size == 0)
		return (ENOEXEC);

	/*
	 * Check for execute permission to file based on current credentials.
	 */
	error = VOP_ACCESS(vp, VEXEC, p->p_ucred, curthread); /* XXXKSE */
	if (error)
		return (error);

	/*
	 * Check number of open-for-writes on the file and deny execution
	 * if there are any.
	 */
	if (vp->v_writecount)
		return (ETXTBSY);

	/*
	 * Call filesystem specific open routine (which does nothing in the
	 * general case).
	 */
	error = VOP_OPEN(vp, FREAD, p->p_ucred, curthread); /* XXXKSE */
	if (error)
		return (error);

	return (0);
}

/*
 * Exec handler registration
 */
/*
 * exec_register:
 *	Append execsw_arg to the global execsw handler table by building
 *	a table one slot larger and swapping it in.  Returns 0 or ENOMEM.
 */
int
exec_register(execsw_arg)
	const struct execsw *execsw_arg;
{
	const struct execsw **es, **xs, **newexecsw;
	int count = 2;	/* New slot and trailing NULL */

	if (execsw)
		for (es = execsw; *es; es++)
			count++;
	/*
	 * NOTE(review): with M_WAITOK, malloc() should not return NULL,
	 * so the ENOMEM checks below look vestigial -- confirm.
	 */
	newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
	if (newexecsw == NULL)
		return ENOMEM;
	xs = newexecsw;
	if (execsw)
		for (es = execsw; *es; es++)
			*xs++ = *es;
	*xs++ = execsw_arg;
	*xs = NULL;	/* table is NULL-terminated */
	if (execsw)
		free(execsw, M_TEMP);
	execsw = newexecsw;
	return 0;
}

/*
 * exec_unregister:
 *	Remove execsw_arg from the global execsw table, rebuilding the
 *	table without it.  Returns 0 on success or ENOENT if absent.
 */
int
exec_unregister(execsw_arg)
	const struct execsw *execsw_arg;
{
	const struct execsw **es, **xs, **newexecsw;
	int count = 1;

	if (execsw == NULL)
		panic("unregister with no handlers left?\n");

	/* Verify the handler is actually registered. */
	for (es = execsw; *es; es++) {
		if (*es == execsw_arg)
			break;
	}
	if (*es == NULL)
		return ENOENT;
	for (es = execsw; *es; es++)
		if (*es != execsw_arg)
			count++;
	newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
	if (newexecsw == NULL)
		return ENOMEM;
	xs = newexecsw;
	for (es = execsw; *es; es++)
		if (*es != execsw_arg)
			*xs++ = *es;
	*xs = NULL;
	if (execsw)
		free(execsw, M_TEMP);
	execsw = newexecsw;
	return 0;
}

/*
 * at_exec:
 *	Register a callout to be invoked at exec time.  Returns 0 on
 *	success or ENOMEM (allocation is M_NOWAIT and may fail).
 */
int
at_exec(function)
	execlist_fn function;
{
	struct execlist *ep;

#ifdef INVARIANTS
	/* Be noisy if the programmer has lost track of things */
	if (rm_at_exec(function))
		printf("WARNING: exec callout entry (%p) already present\n",
		    function);
#endif
	ep = malloc(sizeof(*ep), M_ATEXEC, M_NOWAIT);
	if (ep == NULL)
		return (ENOMEM);
	ep->function = function;
	TAILQ_INSERT_TAIL(&exec_list, ep, next);
	return (0);
}

/*
 * Scan the exec callout list for the given item and remove it.
 * Returns the number of items removed (0 or 1)
 */
int
rm_at_exec(function)
	execlist_fn function;
{
	struct execlist *ep;

	TAILQ_FOREACH(ep, &exec_list, next) {
		if (ep->function == function) {
			TAILQ_REMOVE(&exec_list, ep, next);
			free(ep, M_ATEXEC);
			return(1);
		}
	}
	return (0);
}
Index: head/sys/kern/kern_fork.c
===================================================================
--- head/sys/kern/kern_fork.c	(revision 89305)
+++ head/sys/kern/kern_fork.c	(revision 89306)
@@ -1,820 +1,826 @@
/*
 * Copyright (c) 1982, 1986, 1989, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_fork.c	8.6 (Berkeley) 4/8/94
 * $FreeBSD$
 */

#include "opt_ktrace.h"

/*
 * NOTE(review): the header names on the following #include lines were
 * lost when this text was extracted (angle-bracket contents stripped);
 * restore them from the repository copy of kern_fork.c.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

static MALLOC_DEFINE(M_ATFORK, "atfork", "atfork callback");

/*
 * These are the structures used to create a callout list for things to do
 * when forking a process
 */
struct forklist {
	forklist_fn function;
	TAILQ_ENTRY(forklist) next;
};

/* Protects fork_list below. */
static struct sx fork_list_lock;

TAILQ_HEAD(forklist_head, forklist);
static struct forklist_head fork_list = TAILQ_HEAD_INITIALIZER(fork_list);

#ifndef _SYS_SYSPROTO_H_
struct fork_args {
	int	dummy;
};
#endif

/* Initialize the sx lock protecting the fork callout list at boot. */
static void
init_fork_list(void *data __unused)
{

	sx_init(&fork_list_lock, "fork list");
}
SYSINIT(fork_list, SI_SUB_INTRINSIC, SI_ORDER_ANY, init_fork_list, NULL);

/*
 * MPSAFE
 */
/* ARGSUSED */
int
fork(td, uap)
	struct thread *td;
	struct fork_args *uap;
{
	int error;
	struct proc *p2;

	mtx_lock(&Giant);
	error = fork1(td, RFFDG | RFPROC, &p2);
	if (error == 0) {
		/* Parent's return value is the child's pid. */
		td->td_retval[0] = p2->p_pid;
		td->td_retval[1] = 0;
	}
	mtx_unlock(&Giant);
	return error;
}

/*
 * MPSAFE
 */
/* ARGSUSED */
int
vfork(td, uap)
	struct thread *td;
	struct vfork_args *uap;
{
	int error;
	struct proc *p2;

	mtx_lock(&Giant);
	/*
	 * vfork adds RFPPWAIT | RFMEM to the plain fork flags; fork1()
	 * sleeps on the child's P_PPWAIT flag before returning.
	 */
	error = fork1(td, RFFDG | RFPROC | RFPPWAIT | RFMEM, &p2);
	if (error == 0) {
		td->td_retval[0] = p2->p_pid;
		td->td_retval[1] = 0;
	}
	mtx_unlock(&Giant);
	return error;
}

/*
 * MPSAFE
 */
int
rfork(td, uap)
	struct thread *td;
	struct rfork_args *uap;
{
	int error;
	struct proc *p2;

	/* Don't allow kernel only flags. */
	if ((uap->flags & RFKERNELONLY) != 0)
		return (EINVAL);
	mtx_lock(&Giant);
	error = fork1(td, uap->flags, &p2);
	if (error == 0) {
		/* fork1() may return no child when RFPROC is clear. */
		td->td_retval[0] = p2 ? p2->p_pid : 0;
		td->td_retval[1] = 0;
	}
	mtx_unlock(&Giant);
	return error;
}

int	nprocs = 1;		/* process 0 */
int	lastpid = 0;
SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0,
    "Last used PID");

/*
 * Random component to lastpid generation.  We mix in a random factor to make
 * it a little harder to predict.  We sanity check the modulus value to avoid
 * doing it in critical paths.  Don't let it be too small or we pointlessly
 * waste randomness entropy, and don't let it be impossibly large.  Using a
 * modulus that is too big causes a LOT more process table scans and slows
 * down fork processing as the pidchecked caching is defeated.
*/ static int randompid = 0; static int sysctl_kern_randompid(SYSCTL_HANDLER_ARGS) { int error, pid; pid = randompid; error = sysctl_handle_int(oidp, &pid, 0, req); if (error || !req->newptr) return (error); if (pid < 0 || pid > PID_MAX - 100) /* out of range */ pid = PID_MAX - 100; else if (pid < 2) /* NOP */ pid = 0; else if (pid < 100) /* Make it reasonable */ pid = 100; randompid = pid; return (error); } SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_kern_randompid, "I", "Random PID modulus"); #if 0 void kse_init(struct kse *kse1, struct kse *kse2) { } void thread_init(struct thread *thread1, struct thread *thread2) { } void ksegrp_init(struct ksegrp *ksegrp1, struct ksegrp *ksegrp2) { } #endif int fork1(td, flags, procp) struct thread *td; /* parent proc */ int flags; struct proc **procp; /* child proc */ { struct proc *p2, *pptr; uid_t uid; struct proc *newproc; int trypid; int ok; static int pidchecked = 0; struct forklist *ep; struct filedesc *fd; struct proc *p1 = td->td_proc; GIANT_REQUIRED; /* Can't copy and clear */ if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG)) return (EINVAL); /* * Here we don't create a new process, but we divorce * certain parts of a process from itself. */ if ((flags & RFPROC) == 0) { vm_forkproc(td, 0, flags); /* * Close all file descriptors. */ if (flags & RFCFDG) { struct filedesc *fdtmp; fdtmp = fdinit(td); /* XXXKSE */ PROC_LOCK(p1); fdfree(td); /* XXXKSE */ p1->p_fd = fdtmp; PROC_UNLOCK(p1); } /* * Unshare file descriptors (from parent.) */ if (flags & RFFDG) { + FILEDESC_LOCK(p1->p_fd); if (p1->p_fd->fd_refcnt > 1) { struct filedesc *newfd; + newfd = fdcopy(td); + FILEDESC_UNLOCK(p1->p_fd); PROC_LOCK(p1); fdfree(td); p1->p_fd = newfd; PROC_UNLOCK(p1); - } + } else + FILEDESC_UNLOCK(p1->p_fd); } *procp = NULL; return (0); } /* * Although process entries are dynamically created, we still keep * a global limit on the maximum number we will create. 
Don't allow * a nonprivileged user to use the last process; don't let root * exceed the limit. The variable nprocs is the current number of * processes, maxproc is the limit. */ uid = p1->p_ucred->cr_ruid; if ((nprocs >= maxproc - 1 && uid != 0) || nprocs >= maxproc) { tablefull("proc"); return (EAGAIN); } /* * Increment the nprocs resource before blocking can occur. There * are hard-limits as to the number of processes that can run. */ nprocs++; /* * Increment the count of procs running with this uid. Don't allow * a nonprivileged user to exceed their current limit. */ ok = chgproccnt(p1->p_ucred->cr_ruidinfo, 1, (uid != 0) ? p1->p_rlimit[RLIMIT_NPROC].rlim_cur : 0); if (!ok) { /* * Back out the process count */ nprocs--; return (EAGAIN); } /* Allocate new proc. */ newproc = zalloc(proc_zone); /* * Setup linkage for kernel based threading */ if((flags & RFTHREAD) != 0) { newproc->p_peers = p1->p_peers; p1->p_peers = newproc; newproc->p_leader = p1->p_leader; } else { newproc->p_peers = NULL; newproc->p_leader = newproc; } newproc->p_vmspace = NULL; /* * Find an unused process ID. We remember a range of unused IDs * ready to use (from lastpid+1 through pidchecked-1). * * If RFHIGHPID is set (used during system boot), do not allocate * low-numbered pids. */ sx_xlock(&allproc_lock); trypid = lastpid + 1; if (flags & RFHIGHPID) { if (trypid < 10) { trypid = 10; } } else { if (randompid) trypid += arc4random() % randompid; } retry: /* * If the process ID prototype has wrapped around, * restart somewhat above 0, as the low-numbered procs * tend to include daemons that don't exit. */ if (trypid >= PID_MAX) { trypid = trypid % PID_MAX; if (trypid < 100) trypid += 100; pidchecked = 0; } if (trypid >= pidchecked) { int doingzomb = 0; pidchecked = PID_MAX; /* * Scan the active and zombie procs to check whether this pid * is in use. Remember the lowest pid that's greater * than trypid, so we can avoid checking for a while. 
*/ p2 = LIST_FIRST(&allproc); again: for (; p2 != NULL; p2 = LIST_NEXT(p2, p_list)) { while (p2->p_pid == trypid || p2->p_pgrp->pg_id == trypid || p2->p_session->s_sid == trypid) { trypid++; if (trypid >= pidchecked) goto retry; } if (p2->p_pid > trypid && pidchecked > p2->p_pid) pidchecked = p2->p_pid; if (p2->p_pgrp->pg_id > trypid && pidchecked > p2->p_pgrp->pg_id) pidchecked = p2->p_pgrp->pg_id; if (p2->p_session->s_sid > trypid && pidchecked > p2->p_session->s_sid) pidchecked = p2->p_session->s_sid; } if (!doingzomb) { doingzomb = 1; p2 = LIST_FIRST(&zombproc); goto again; } } /* * RFHIGHPID does not mess with the lastpid counter during boot. */ if (flags & RFHIGHPID) pidchecked = 0; else lastpid = trypid; p2 = newproc; p2->p_stat = SIDL; /* protect against others */ p2->p_pid = trypid; LIST_INSERT_HEAD(&allproc, p2, p_list); LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash); sx_xunlock(&allproc_lock); /* * Make a proc table entry for the new process. * Start by zeroing the section of proc that is zero-initialized, * then copy the section that is copied directly from the parent. 
*/ bzero(&p2->p_startzero, (unsigned) ((caddr_t)&p2->p_endzero - (caddr_t)&p2->p_startzero)); bzero(&p2->p_kse.ke_startzero, (unsigned) ((caddr_t)&p2->p_kse.ke_endzero - (caddr_t)&p2->p_kse.ke_startzero)); bzero(&p2->p_thread.td_startzero, (unsigned) ((caddr_t)&p2->p_thread.td_endzero - (caddr_t)&p2->p_thread.td_startzero)); bzero(&p2->p_ksegrp.kg_startzero, (unsigned) ((caddr_t)&p2->p_ksegrp.kg_endzero - (caddr_t)&p2->p_ksegrp.kg_startzero)); PROC_LOCK(p1); bcopy(&p1->p_startcopy, &p2->p_startcopy, (unsigned) ((caddr_t)&p2->p_endcopy - (caddr_t)&p2->p_startcopy)); bcopy(&p1->p_kse.ke_startcopy, &p2->p_kse.ke_startcopy, (unsigned) ((caddr_t)&p2->p_kse.ke_endcopy - (caddr_t)&p2->p_kse.ke_startcopy)); bcopy(&p1->p_thread.td_startcopy, &p2->p_thread.td_startcopy, (unsigned) ((caddr_t)&p2->p_thread.td_endcopy - (caddr_t)&p2->p_thread.td_startcopy)); bcopy(&p1->p_ksegrp.kg_startcopy, &p2->p_ksegrp.kg_startcopy, (unsigned) ((caddr_t)&p2->p_ksegrp.kg_endcopy - (caddr_t)&p2->p_ksegrp.kg_startcopy)); PROC_UNLOCK(p1); /* * XXXKSE Theoretically only the running thread would get copied * Others in the kernel would be 'aborted' in the child. * i.e return E*something* */ proc_linkup(p2); mtx_init(&p2->p_mtx, "process lock", MTX_DEF); PROC_LOCK(p2); /* note.. XXXKSE no pcb or u-area yet */ /* * Duplicate sub-structures as needed. * Increase reference counts on shared objects. * The p_stats and p_sigacts substructs are set in vm_forkproc. 
*/ p2->p_flag = 0; mtx_lock_spin(&sched_lock); p2->p_sflag = PS_INMEM; if (p1->p_sflag & PS_PROFIL) startprofclock(p2); mtx_unlock_spin(&sched_lock); PROC_LOCK(p1); p2->p_ucred = crhold(p1->p_ucred); p2->p_thread.td_ucred = crhold(p2->p_ucred); /* XXXKSE */ if (p2->p_args) p2->p_args->ar_ref++; if (flags & RFSIGSHARE) { p2->p_procsig = p1->p_procsig; p2->p_procsig->ps_refcnt++; if (p1->p_sigacts == &p1->p_uarea->u_sigacts) { struct sigacts *newsigacts; PROC_UNLOCK(p1); PROC_UNLOCK(p2); /* Create the shared sigacts structure */ MALLOC(newsigacts, struct sigacts *, sizeof(struct sigacts), M_SUBPROC, M_WAITOK); PROC_LOCK(p2); PROC_LOCK(p1); /* * Set p_sigacts to the new shared structure. * Note that this is updating p1->p_sigacts at the * same time, since p_sigacts is just a pointer to * the shared p_procsig->ps_sigacts. */ p2->p_sigacts = newsigacts; *p2->p_sigacts = p1->p_uarea->u_sigacts; } } else { PROC_UNLOCK(p1); PROC_UNLOCK(p2); MALLOC(p2->p_procsig, struct procsig *, sizeof(struct procsig), M_SUBPROC, M_WAITOK); PROC_LOCK(p2); PROC_LOCK(p1); bcopy(p1->p_procsig, p2->p_procsig, sizeof(*p2->p_procsig)); p2->p_procsig->ps_refcnt = 1; p2->p_sigacts = NULL; /* finished in vm_forkproc() */ } if (flags & RFLINUXTHPN) p2->p_sigparent = SIGUSR1; else p2->p_sigparent = SIGCHLD; /* bump references to the text vnode (for procfs) */ p2->p_textvp = p1->p_textvp; PROC_UNLOCK(p1); PROC_UNLOCK(p2); if (p2->p_textvp) VREF(p2->p_textvp); if (flags & RFCFDG) fd = fdinit(td); - else if (flags & RFFDG) + else if (flags & RFFDG) { + FILEDESC_LOCK(p1->p_fd); fd = fdcopy(td); - else + FILEDESC_UNLOCK(p1->p_fd); + } else fd = fdshare(p1); PROC_LOCK(p2); p2->p_fd = fd; /* * If p_limit is still copy-on-write, bump refcnt, * otherwise get a copy that won't be modified. * (If PL_SHAREMOD is clear, the structure is shared * copy-on-write.) 
*/ PROC_LOCK(p1); if (p1->p_limit->p_lflags & PL_SHAREMOD) p2->p_limit = limcopy(p1->p_limit); else { p2->p_limit = p1->p_limit; p2->p_limit->p_refcnt++; } /* * Preserve some more flags in subprocess. PS_PROFIL has already * been preserved. */ p2->p_flag |= p1->p_flag & (P_SUGID | P_ALTSTACK); if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT) p2->p_flag |= P_CONTROLT; if (flags & RFPPWAIT) p2->p_flag |= P_PPWAIT; LIST_INSERT_AFTER(p1, p2, p_pglist); PROC_UNLOCK(p1); PROC_UNLOCK(p2); /* * Attach the new process to its parent. * * If RFNOWAIT is set, the newly created process becomes a child * of init. This effectively disassociates the child from the * parent. */ if (flags & RFNOWAIT) pptr = initproc; else pptr = p1; sx_xlock(&proctree_lock); PROC_LOCK(p2); p2->p_pptr = pptr; PROC_UNLOCK(p2); LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling); sx_xunlock(&proctree_lock); PROC_LOCK(p2); LIST_INIT(&p2->p_children); LIST_INIT(&p2->p_thread.td_contested); /* XXXKSE only 1 thread? */ callout_init(&p2->p_itcallout, 0); callout_init(&p2->p_thread.td_slpcallout, 1); /* XXXKSE */ PROC_LOCK(p1); #ifdef KTRACE /* * Copy traceflag and tracefile if enabled. If not inherited, * these were zeroed above but we still could have a trace race * so make sure p2's p_tracep is NULL. */ if ((p1->p_traceflag & KTRFAC_INHERIT) && p2->p_tracep == NULL) { p2->p_traceflag = p1->p_traceflag; if ((p2->p_tracep = p1->p_tracep) != NULL) { PROC_UNLOCK(p1); PROC_UNLOCK(p2); VREF(p2->p_tracep); PROC_LOCK(p2); PROC_LOCK(p1); } } #endif /* * set priority of child to be that of parent * XXXKSE hey! copying the estcpu seems dodgy.. should split it.. */ mtx_lock_spin(&sched_lock); p2->p_ksegrp.kg_estcpu = p1->p_ksegrp.kg_estcpu; mtx_unlock_spin(&sched_lock); /* * This begins the section where we must prevent the parent * from being swapped. */ _PHOLD(p1); PROC_UNLOCK(p1); PROC_UNLOCK(p2); /* * Finish creating the child process. It will return via a different * execution path later. 
(ie: directly into user mode) */ vm_forkproc(td, p2, flags); if (flags == (RFFDG | RFPROC)) { cnt.v_forks++; cnt.v_forkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize; } else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) { cnt.v_vforks++; cnt.v_vforkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize; } else if (p1 == &proc0) { cnt.v_kthreads++; cnt.v_kthreadpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize; } else { cnt.v_rforks++; cnt.v_rforkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize; } /* * Both processes are set up, now check if any loadable modules want * to adjust anything. * What if they have an error? XXX */ sx_slock(&fork_list_lock); TAILQ_FOREACH(ep, &fork_list, next) { (*ep->function)(p1, p2, flags); } sx_sunlock(&fork_list_lock); /* * If RFSTOPPED not requested, make child runnable and add to * run queue. */ microtime(&(p2->p_stats->p_start)); p2->p_acflag = AFORK; if ((flags & RFSTOPPED) == 0) { mtx_lock_spin(&sched_lock); p2->p_stat = SRUN; setrunqueue(&p2->p_thread); mtx_unlock_spin(&sched_lock); } /* * Now can be swapped. */ PROC_LOCK(p1); _PRELE(p1); /* * tell any interested parties about the new process */ KNOTE(&p1->p_klist, NOTE_FORK | p2->p_pid); PROC_UNLOCK(p1); /* * Preserve synchronization semantics of vfork. If waiting for * child to exec or exit, set P_PPWAIT on child, and sleep on our * proc (in case of exit). */ PROC_LOCK(p2); while (p2->p_flag & P_PPWAIT) msleep(p1, &p2->p_mtx, PWAIT, "ppwait", 0); PROC_UNLOCK(p2); /* * Return child proc pointer to parent. */ *procp = p2; return (0); } /* * The next two functionms are general routines to handle adding/deleting * items on the fork callout list. * * at_fork(): * Take the arguments given and put them onto the fork callout list, * However first make sure that it's not already there. * Returns 0 on success or a standard error number. 
 */
/*
 * at_fork:
 *	Register a callout to be invoked by fork1() for each new process.
 *	Returns 0 on success or ENOMEM (allocation is M_NOWAIT).
 */
int
at_fork(function)
	forklist_fn function;
{
	struct forklist *ep;

#ifdef INVARIANTS
	/* let the programmer know if he's been stupid */
	if (rm_at_fork(function))
		printf("WARNING: fork callout entry (%p) already present\n",
		    function);
#endif
	ep = malloc(sizeof(*ep), M_ATFORK, M_NOWAIT);
	if (ep == NULL)
		return (ENOMEM);
	ep->function = function;
	sx_xlock(&fork_list_lock);
	TAILQ_INSERT_TAIL(&fork_list, ep, next);
	sx_xunlock(&fork_list_lock);
	return (0);
}

/*
 * Scan the exit callout list for the given item and remove it..
 * Returns the number of items removed (0 or 1)
 */
int
rm_at_fork(function)
	forklist_fn function;
{
	struct forklist *ep;

	sx_xlock(&fork_list_lock);
	TAILQ_FOREACH(ep, &fork_list, next) {
		if (ep->function == function) {
			TAILQ_REMOVE(&fork_list, ep, next);
			/* Drop the lock before freeing; entry is unlinked. */
			sx_xunlock(&fork_list_lock);
			free(ep, M_ATFORK);
			return(1);
		}
	}
	sx_xunlock(&fork_list_lock);
	return (0);
}

/*
 * Handle the return of a child process from fork1().  This function
 * is called from the MD fork_trampoline() entry point.
 */
void
fork_exit(callout, arg, frame)
	void (*callout)(void *, struct trapframe *);
	void *arg;
	struct trapframe *frame;
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;

	td->td_kse->ke_oncpu = PCPU_GET(cpuid);
	/*
	 * Setup the sched_lock state so that we can release it.
	 */
	sched_lock.mtx_lock = (uintptr_t)td;
	sched_lock.mtx_recurse = 0;
	td->td_critnest = 1;
	td->td_savecrit = CRITICAL_FORK;
	CTR3(KTR_PROC, "fork_exit: new proc %p (pid %d, %s)", p, p->p_pid,
	    p->p_comm);
	/* Record switch time/ticks for the new thread's accounting. */
	if (PCPU_GET(switchtime.tv_sec) == 0)
		microuptime(PCPU_PTR(switchtime));
	PCPU_SET(switchticks, ticks);
	mtx_unlock_spin(&sched_lock);

	/*
	 * cpu_set_fork_handler intercepts this function call to
	 * have this call a non-return function to stay in kernel mode.
	 * initproc has its own fork handler, but it does return.
	 */
	KASSERT(callout != NULL, ("NULL callout in fork_exit"));
	callout(arg, frame);

	/*
	 * Check if a kernel thread misbehaved and returned from its main
	 * function.
	 */
	PROC_LOCK(p);
	if (p->p_flag & P_KTHREAD) {
		PROC_UNLOCK(p);
		mtx_lock(&Giant);
		printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n",
		    p->p_comm, p->p_pid);
		/* Does not return. */
		kthread_exit(0);
	}
	PROC_UNLOCK(p);
	mtx_lock(&Giant);
	crfree(td->td_ucred);
	mtx_unlock(&Giant);
	td->td_ucred = NULL;
	mtx_assert(&Giant, MA_NOTOWNED);
}

/*
 * Simplified back end of syscall(), used when returning from fork()
 * directly into user mode.  Giant is not held on entry, and must not
 * be held on return.  This function is passed in to fork_exit() as the
 * first parameter and is called when returning to a new userland process.
 */
void
fork_return(td, frame)
	struct thread *td;
	struct trapframe *frame;
{

	userret(td, frame, 0);
#ifdef KTRACE
	if (KTRPOINT(td->td_proc, KTR_SYSRET)) {
		ktrsysret(td->td_proc->p_tracep, SYS_fork, 0, 0);
	}
#endif
	mtx_assert(&Giant, MA_NOTOWNED);
}
Index: head/sys/kern/subr_acl_posix1e.c
===================================================================
--- head/sys/kern/subr_acl_posix1e.c	(revision 89305)
+++ head/sys/kern/subr_acl_posix1e.c	(revision 89306)
@@ -1,817 +1,821 @@
/*-
 * Copyright (c) 1999-2001 Robert N. M. Watson
 * All rights reserved.
 *
 * This software was developed by Robert Watson for the TrustedBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Developed by the TrustedBSD Project. * Support for POSIX.1e access control lists. */ #include "opt_cap.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_ACL, "acl", "access control list"); static int vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp); static int vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp); static int vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp); /* * Implement a version of vaccess() that understands POSIX.1e ACL semantics. * Return 0 on success, else an errno value. Should be merged into * vaccess() eventually. */ int vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid, struct acl *acl, mode_t acc_mode, struct ucred *cred, int *privused) { struct acl_entry *acl_other, *acl_mask; mode_t dac_granted; mode_t cap_granted; mode_t acl_mask_granted; int group_matched, i; /* * Look for a normal, non-privileged way to access the file/directory * as requested. If it exists, go with that. Otherwise, attempt * to use privileges granted via cap_granted. In some cases, * which privileges to use may be ambiguous due to "best match", * in which case fall back on first match for the time being. 
*/ if (privused != NULL) *privused = 0; /* * Determine privileges now, but don't apply until we've found * a DAC entry that matches but has failed to allow access. */ #ifndef CAPABILITIES if (suser_xxx(cred, NULL, PRISON_ROOT) == 0) cap_granted = (VEXEC | VREAD | VWRITE | VADMIN); else cap_granted = 0; #else cap_granted = 0; if (type == VDIR) { if ((acc_mode & VEXEC) && !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT)) cap_granted |= VEXEC; } else { if ((acc_mode & VEXEC) && !cap_check(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT)) cap_granted |= VEXEC; } if ((acc_mode & VREAD) && !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT)) cap_granted |= VREAD; if ((acc_mode & VWRITE) && !cap_check(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT)) cap_granted |= VWRITE; if ((acc_mode & VADMIN) && !cap_check(cred, NULL, CAP_FOWNER, PRISON_ROOT)) cap_granted |= VADMIN; #endif /* CAPABILITIES */ /* * The owner matches if the effective uid associated with the * credential matches that of the ACL_USER_OBJ entry. While we're * doing the first scan, also cache the location of the ACL_MASK * and ACL_OTHER entries, preventing some future iterations. */ acl_mask = acl_other = NULL; for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_USER_OBJ: if (file_uid != cred->cr_uid) break; dac_granted = 0; dac_granted |= VADMIN; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; if ((acc_mode & dac_granted) == acc_mode) return (0); if ((acc_mode & (dac_granted | cap_granted)) == acc_mode) { if (privused != NULL) *privused = 1; return (0); } goto error; case ACL_MASK: acl_mask = &acl->acl_entry[i]; break; case ACL_OTHER: acl_other = &acl->acl_entry[i]; break; default: } } /* * An ACL_OTHER entry should always exist in a valid access * ACL. If it doesn't, then generate a serious failure. 
For now, * this means a debugging message and EPERM, but in the future * should probably be a panic. */ if (acl_other == NULL) { /* * XXX This should never happen */ printf("vaccess_acl_posix1e: ACL_OTHER missing\n"); return (EPERM); } /* * Checks against ACL_USER, ACL_GROUP_OBJ, and ACL_GROUP fields * are masked by an ACL_MASK entry, if any. As such, first identify * the ACL_MASK field, then iterate through identifying potential * user matches, then group matches. If there is no ACL_MASK, * assume that the mask allows all requests to succeed. */ if (acl_mask != NULL) { acl_mask_granted = 0; if (acl_mask->ae_perm & ACL_EXECUTE) acl_mask_granted |= VEXEC; if (acl_mask->ae_perm & ACL_READ) acl_mask_granted |= VREAD; if (acl_mask->ae_perm & ACL_WRITE) acl_mask_granted |= VWRITE; } else acl_mask_granted = VEXEC | VREAD | VWRITE; /* * Iterate through user ACL entries. Do checks twice, first * without privilege, and then if a match is found but failed, * a second time with privilege. */ /* * Check ACL_USER ACL entries. */ for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_USER: if (acl->acl_entry[i].ae_id != cred->cr_uid) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; dac_granted &= acl_mask_granted; if ((acc_mode & dac_granted) == acc_mode) return (0); if ((acc_mode & (dac_granted | cap_granted)) != acc_mode) goto error; if (privused != NULL) *privused = 1; return (0); } } /* * Group match is best-match, not first-match, so find a * "best" match. Iterate across, testing each potential group * match. Make sure we keep track of whether we found a match * or not, so that we know if we should try again with any * available privilege, or if we should move on to ACL_OTHER. 
*/ group_matched = 0; for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_GROUP_OBJ: if (!groupmember(file_gid, cred)) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; dac_granted &= acl_mask_granted; if ((acc_mode & dac_granted) == acc_mode) return (0); group_matched = 1; break; case ACL_GROUP: if (!groupmember(acl->acl_entry[i].ae_id, cred)) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; dac_granted &= acl_mask_granted; if ((acc_mode & dac_granted) == acc_mode) return (0); group_matched = 1; break; default: } } if (group_matched == 1) { /* * There was a match, but it did not grant rights via * pure DAC. Try again, this time with privilege. 
*/ for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_GROUP_OBJ: if (!groupmember(file_gid, cred)) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; dac_granted &= acl_mask_granted; if ((acc_mode & (dac_granted | cap_granted)) != acc_mode) break; if (privused != NULL) *privused = 1; return (0); case ACL_GROUP: if (!groupmember(acl->acl_entry[i].ae_id, cred)) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; dac_granted &= acl_mask_granted; if ((acc_mode & (dac_granted | cap_granted)) != acc_mode) break; if (privused != NULL) *privused = 1; return (0); default: } } /* * Even with privilege, group membership was not sufficient. * Return failure. */ goto error; } /* * Fall back on ACL_OTHER. ACL_MASK is not applied to ACL_OTHER. */ dac_granted = 0; if (acl_other->ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl_other->ae_perm & ACL_READ) dac_granted |= VREAD; if (acl_other->ae_perm & ACL_WRITE) dac_granted |= VWRITE; if ((acc_mode & dac_granted) == acc_mode) return (0); if ((acc_mode & (dac_granted | cap_granted)) == acc_mode) { if (privused != NULL) *privused = 1; return (0); } error: return ((acc_mode & VADMIN) ? EPERM : EACCES); } /* * For the purposes of file systems maintaining the _OBJ entries in an * inode with a mode_t field, this routine converts a mode_t entry * to an acl_perm_t. 
*/ acl_perm_t acl_posix1e_mode_to_perm(acl_tag_t tag, mode_t mode) { acl_perm_t perm = 0; switch(tag) { case ACL_USER_OBJ: if (mode & S_IXUSR) perm |= ACL_EXECUTE; if (mode & S_IRUSR) perm |= ACL_READ; if (mode & S_IWUSR) perm |= ACL_WRITE; return (perm); case ACL_GROUP_OBJ: if (mode & S_IXGRP) perm |= ACL_EXECUTE; if (mode & S_IRGRP) perm |= ACL_READ; if (mode & S_IWGRP) perm |= ACL_WRITE; return (perm); case ACL_OTHER: if (mode & S_IXOTH) perm |= ACL_EXECUTE; if (mode & S_IROTH) perm |= ACL_READ; if (mode & S_IWOTH) perm |= ACL_WRITE; return (perm); default: printf("acl_posix1e_mode_to_perm: invalid tag (%d)\n", tag); return (0); } } /* * Given inode information (uid, gid, mode), return an acl entry of the * appropriate type. */ struct acl_entry acl_posix1e_mode_to_entry(acl_tag_t tag, uid_t uid, gid_t gid, mode_t mode) { struct acl_entry acl_entry; acl_entry.ae_tag = tag; acl_entry.ae_perm = acl_posix1e_mode_to_perm(tag, mode); switch(tag) { case ACL_USER_OBJ: acl_entry.ae_id = uid; break; case ACL_GROUP_OBJ: acl_entry.ae_id = gid; break; case ACL_OTHER: acl_entry.ae_id = ACL_UNDEFINED_ID; break; default: acl_entry.ae_id = ACL_UNDEFINED_ID; printf("acl_posix1e_mode_to_entry: invalid tag (%d)\n", tag); } return (acl_entry); } /* * Utility function to generate a file mode given appropriate ACL entries. 
*/ mode_t acl_posix1e_perms_to_mode(struct acl_entry *acl_user_obj_entry, struct acl_entry *acl_group_obj_entry, struct acl_entry *acl_other_entry) { mode_t mode; mode = 0; if (acl_user_obj_entry->ae_perm & ACL_EXECUTE) mode |= S_IXUSR; if (acl_user_obj_entry->ae_perm & ACL_READ) mode |= S_IRUSR; if (acl_user_obj_entry->ae_perm & ACL_WRITE) mode |= S_IWUSR; if (acl_group_obj_entry->ae_perm & ACL_EXECUTE) mode |= S_IXGRP; if (acl_group_obj_entry->ae_perm & ACL_READ) mode |= S_IRGRP; if (acl_group_obj_entry->ae_perm & ACL_WRITE) mode |= S_IWGRP; if (acl_other_entry->ae_perm & ACL_EXECUTE) mode |= S_IXOTH; if (acl_other_entry->ae_perm & ACL_READ) mode |= S_IROTH; if (acl_other_entry->ae_perm & ACL_WRITE) mode |= S_IWOTH; return (mode); } /* * Perform a syntactic check of the ACL, sufficient to allow an * implementing file system to determine if it should accept this and * rely on the POSIX.1e ACL properties. */ int acl_posix1e_check(struct acl *acl) { int num_acl_user_obj, num_acl_user, num_acl_group_obj, num_acl_group; int num_acl_mask, num_acl_other, i; /* * Verify that the number of entries does not exceed the maximum * defined for acl_t. * Verify that the correct number of various sorts of ae_tags are * present: * Exactly one ACL_USER_OBJ * Exactly one ACL_GROUP_OBJ * Exactly one ACL_OTHER * If any ACL_USER or ACL_GROUP entries appear, then exactly one * ACL_MASK entry must also appear. * Verify that all ae_perm entries are in ACL_PERM_BITS. * Verify all ae_tag entries are understood by this implementation. * Note: Does not check for uniqueness of qualifier (ae_id) field. */ num_acl_user_obj = num_acl_user = num_acl_group_obj = num_acl_group = num_acl_mask = num_acl_other = 0; if (acl->acl_cnt > ACL_MAX_ENTRIES || acl->acl_cnt < 0) return (EINVAL); for (i = 0; i < acl->acl_cnt; i++) { /* * Check for a valid tag. 
*/ switch(acl->acl_entry[i].ae_tag) { case ACL_USER_OBJ: acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) return (EINVAL); num_acl_user_obj++; break; case ACL_GROUP_OBJ: acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) return (EINVAL); num_acl_group_obj++; break; case ACL_USER: if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID) return (EINVAL); num_acl_user++; break; case ACL_GROUP: if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID) return (EINVAL); num_acl_group++; break; case ACL_OTHER: acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) return (EINVAL); num_acl_other++; break; case ACL_MASK: acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) return (EINVAL); num_acl_mask++; break; default: return (EINVAL); } /* * Check for valid perm entries. */ if ((acl->acl_entry[i].ae_perm | ACL_PERM_BITS) != ACL_PERM_BITS) return (EINVAL); } if ((num_acl_user_obj != 1) || (num_acl_group_obj != 1) || (num_acl_other != 1) || (num_acl_mask != 0 && num_acl_mask != 1)) return (EINVAL); if (((num_acl_group != 0) || (num_acl_user != 0)) && (num_acl_mask != 1)) return (EINVAL); return (0); } /* * These calls wrap the real vnode operations, and are called by the * syscall code once the syscall has converted the path or file * descriptor to a vnode (unlocked). The aclp pointer is assumed * still to point to userland, so this should not be consumed within * the kernel except by syscall code. Other code should directly * invoke VOP_{SET,GET}ACL. */ /* * Given a vnode, set its ACL. 
*/ static int vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernacl; int error; error = copyin(aclp, &inkernacl, sizeof(struct acl)); if (error) return(error); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_SETACL(vp, type, &inkernacl, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); return(error); } /* * Given a vnode, get its ACL. */ static int vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernelacl; int error; VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_GETACL(vp, type, &inkernelacl, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); if (error == 0) error = copyout(&inkernelacl, aclp, sizeof(struct acl)); return (error); } /* * Given a vnode, delete its ACL. */ static int vacl_delete(struct thread *td, struct vnode *vp, acl_type_t type) { int error; VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_SETACL(vp, ACL_TYPE_DEFAULT, 0, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); return (error); } /* * Given a vnode, check whether an ACL is appropriate for it */ static int vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernelacl; int error; error = copyin(aclp, &inkernelacl, sizeof(struct acl)); if (error) return(error); error = VOP_ACLCHECK(vp, type, &inkernelacl, td->td_proc->p_ucred, td); return (error); } /* * syscalls -- convert the path/fd to a vnode, and call vacl_whatever. * Don't need to lock, as the vacl_ code will get/release any locks * required. 
*/ /* * Given a file path, get an ACL for it * * MPSAFE */ int __acl_get_file(struct thread *td, struct __acl_get_file_args *uap) { struct nameidata nd; int error; mtx_lock(&Giant); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); error = namei(&nd); if (error == 0) { error = vacl_get_acl(td, nd.ni_vp, SCARG(uap, type), SCARG(uap, aclp)); NDFREE(&nd, 0); } mtx_unlock(&Giant); return (error); } /* * Given a file path, set an ACL for it * * MPSAFE */ int __acl_set_file(struct thread *td, struct __acl_set_file_args *uap) { struct nameidata nd; int error; mtx_lock(&Giant); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); error = namei(&nd); if (error == 0) { error = vacl_set_acl(td, nd.ni_vp, SCARG(uap, type), SCARG(uap, aclp)); NDFREE(&nd, 0); } mtx_unlock(&Giant); return (error); } /* * Given a file descriptor, get an ACL for it * * MPSAFE */ int __acl_get_fd(struct thread *td, struct __acl_get_fd_args *uap) { struct file *fp; int error; mtx_lock(&Giant); error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); if (error == 0) { error = vacl_get_acl(td, (struct vnode *)fp->f_data, SCARG(uap, type), SCARG(uap, aclp)); + fdrop(fp, td); } mtx_unlock(&Giant); return (error); } /* * Given a file descriptor, set an ACL for it * * MPSAFE */ int __acl_set_fd(struct thread *td, struct __acl_set_fd_args *uap) { struct file *fp; int error; mtx_lock(&Giant); error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); if (error == 0) { error = vacl_set_acl(td, (struct vnode *)fp->f_data, SCARG(uap, type), SCARG(uap, aclp)); + fdrop(fp, td); } mtx_unlock(&Giant); return (error); } /* * Given a file path, delete an ACL from it. 
* * MPSAFE */ int __acl_delete_file(struct thread *td, struct __acl_delete_file_args *uap) { struct nameidata nd; int error; mtx_lock(&Giant); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); error = namei(&nd); if (error == 0) { error = vacl_delete(td, nd.ni_vp, SCARG(uap, type)); NDFREE(&nd, 0); } mtx_unlock(&Giant); return (error); } /* * Given a file path, delete an ACL from it. * * MPSAFE */ int __acl_delete_fd(struct thread *td, struct __acl_delete_fd_args *uap) { struct file *fp; int error; mtx_lock(&Giant); error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); if (error == 0) { error = vacl_delete(td, (struct vnode *)fp->f_data, SCARG(uap, type)); + fdrop(fp, td); } mtx_unlock(&Giant); return (error); } /* * Given a file path, check an ACL for it * * MPSAFE */ int __acl_aclcheck_file(struct thread *td, struct __acl_aclcheck_file_args *uap) { struct nameidata nd; int error; mtx_lock(&Giant); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); error = namei(&nd); if (error == 0) { error = vacl_aclcheck(td, nd.ni_vp, SCARG(uap, type), SCARG(uap, aclp)); NDFREE(&nd, 0); } mtx_unlock(&Giant); return (error); } /* * Given a file descriptor, check an ACL for it * * MPSAFE */ int __acl_aclcheck_fd(struct thread *td, struct __acl_aclcheck_fd_args *uap) { struct file *fp; int error; mtx_lock(&Giant); error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); if (error == 0) { error = vacl_aclcheck(td, (struct vnode *)fp->f_data, SCARG(uap, type), SCARG(uap, aclp)); + fdrop(fp, td); } mtx_unlock(&Giant); return (error); } Index: head/sys/kern/sys_generic.c =================================================================== --- head/sys/kern/sys_generic.c (revision 89305) +++ head/sys/kern/sys_generic.c (revision 89306) @@ -1,1298 +1,1357 @@ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. 
* All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 * $FreeBSD$ */ #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); MALLOC_DEFINE(M_IOV, "iov", "large iov's"); static int pollscan __P((struct thread *, struct pollfd *, u_int)); static int pollholddrop __P((struct thread *, struct pollfd *, u_int, int)); static int selscan __P((struct thread *, fd_mask **, fd_mask **, int)); static int selholddrop __P((struct thread *, fd_mask *, fd_mask *, int, int)); static int dofileread __P((struct thread *, struct file *, int, void *, size_t, off_t, int)); static int dofilewrite __P((struct thread *, struct file *, int, const void *, size_t, off_t, int)); +struct file* +holdfp(fdp, fd, flag) + struct filedesc* fdp; + int fd, flag; +{ + struct file* fp; + + FILEDESC_LOCK(fdp); + if (((u_int)fd) >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fd]) == NULL) { + FILEDESC_UNLOCK(fdp); + return (NULL); + } + FILE_LOCK(fp); + FILEDESC_UNLOCK(fdp); + if ((fp->f_flag & flag) == 0) { + FILE_UNLOCK(fp); + return (NULL); + } + fp->f_count++; + FILE_UNLOCK(fp); + return (fp); +} + /* * Read system call. 
*/ #ifndef _SYS_SYSPROTO_H_ struct read_args { int fd; void *buf; size_t nbyte; }; #endif /* * MPSAFE */ int read(td, uap) struct thread *td; struct read_args *uap; { struct file *fp; int error; mtx_lock(&Giant); if ((error = fget_read(td, uap->fd, &fp)) == 0) { error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte, (off_t)-1, 0); fdrop(fp, td); } mtx_unlock(&Giant); return(error); } /* * Pread system call */ #ifndef _SYS_SYSPROTO_H_ struct pread_args { int fd; void *buf; size_t nbyte; int pad; off_t offset; }; #endif /* * MPSAFE */ int pread(td, uap) struct thread *td; struct pread_args *uap; { struct file *fp; int error; - mtx_lock(&Giant); - if ((error = fget_read(td, uap->fd, &fp)) == 0) { - if (fp->f_type == DTYPE_VNODE) { - error = dofileread(td, fp, uap->fd, uap->buf, - uap->nbyte, uap->offset, FOF_OFFSET); - } else { - error = ESPIPE; - } - fdrop(fp, td); + fp = holdfp(td->td_proc->p_fd, uap->fd, FREAD); + if (fp == NULL) + return (EBADF); + if (fp->f_type != DTYPE_VNODE) { + error = ESPIPE; + } else { + mtx_lock(&Giant); + error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte, + uap->offset, FOF_OFFSET); + mtx_unlock(&Giant); } - mtx_unlock(&Giant); + fdrop(fp, td); return(error); } /* * Code common for read and pread */ int dofileread(td, fp, fd, buf, nbyte, offset, flags) struct thread *td; struct file *fp; int fd, flags; void *buf; size_t nbyte; off_t offset; { struct uio auio; struct iovec aiov; long cnt, error = 0; #ifdef KTRACE struct iovec ktriov; struct uio ktruio; int didktr = 0; #endif aiov.iov_base = (caddr_t)buf; aiov.iov_len = nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = offset; if (nbyte > INT_MAX) return (EINVAL); auio.uio_resid = nbyte; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; #ifdef KTRACE /* * if tracing, save a copy of iovec */ if (KTRPOINT(td->td_proc, KTR_GENIO)) { ktriov = aiov; ktruio = auio; didktr = 1; } #endif cnt = nbyte; if ((error = fo_read(fp, &auio, fp->f_cred, 
flags, td))) { if (auio.uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } cnt -= auio.uio_resid; #ifdef KTRACE if (didktr && error == 0) { ktruio.uio_iov = &ktriov; ktruio.uio_resid = cnt; ktrgenio(td->td_proc->p_tracep, fd, UIO_READ, &ktruio, error); } #endif td->td_retval[0] = cnt; return (error); } /* * Scatter read system call. */ #ifndef _SYS_SYSPROTO_H_ struct readv_args { int fd; struct iovec *iovp; u_int iovcnt; }; #endif /* * MPSAFE */ int readv(td, uap) struct thread *td; struct readv_args *uap; { struct file *fp; struct uio auio; struct iovec *iov; struct iovec *needfree; struct iovec aiov[UIO_SMALLIOV]; long i, cnt, error = 0; u_int iovlen; #ifdef KTRACE struct iovec *ktriov = NULL; struct uio ktruio; #endif mtx_lock(&Giant); if ((error = fget_read(td, uap->fd, &fp)) != 0) goto done2; /* note: can't use iovlen until iovcnt is validated */ iovlen = uap->iovcnt * sizeof (struct iovec); if (uap->iovcnt > UIO_SMALLIOV) { if (uap->iovcnt > UIO_MAXIOV) { error = EINVAL; goto done2; } MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); needfree = iov; } else { iov = aiov; needfree = NULL; } auio.uio_iov = iov; auio.uio_iovcnt = uap->iovcnt; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auio.uio_offset = -1; if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen))) goto done; auio.uio_resid = 0; for (i = 0; i < uap->iovcnt; i++) { if (iov->iov_len > INT_MAX - auio.uio_resid) { error = EINVAL; goto done; } auio.uio_resid += iov->iov_len; iov++; } #ifdef KTRACE /* * if tracing, save a copy of iovec */ if (KTRPOINT(td->td_proc, KTR_GENIO)) { MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); ktruio = auio; } #endif cnt = auio.uio_resid; if ((error = fo_read(fp, &auio, fp->f_cred, 0, td))) { if (auio.uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } cnt -= auio.uio_resid; 
#ifdef KTRACE if (ktriov != NULL) { if (error == 0) { ktruio.uio_iov = ktriov; ktruio.uio_resid = cnt; ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_READ, &ktruio, error); } FREE(ktriov, M_TEMP); } #endif td->td_retval[0] = cnt; done: fdrop(fp, td); if (needfree) FREE(needfree, M_IOV); done2: mtx_unlock(&Giant); return (error); } /* * Write system call */ #ifndef _SYS_SYSPROTO_H_ struct write_args { int fd; const void *buf; size_t nbyte; }; #endif /* * MPSAFE */ int write(td, uap) struct thread *td; struct write_args *uap; { struct file *fp; int error; mtx_lock(&Giant); if ((error = fget_write(td, uap->fd, &fp)) == 0) { error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte, (off_t)-1, 0); fdrop(fp, td); } else { error = EBADF; /* XXX this can't be right */ } mtx_unlock(&Giant); return(error); } /* * Pwrite system call */ #ifndef _SYS_SYSPROTO_H_ struct pwrite_args { int fd; const void *buf; size_t nbyte; int pad; off_t offset; }; #endif /* * MPSAFE */ int pwrite(td, uap) struct thread *td; struct pwrite_args *uap; { struct file *fp; int error; mtx_lock(&Giant); if ((error = fget_write(td, uap->fd, &fp)) == 0) { if (fp->f_type == DTYPE_VNODE) { error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte, uap->offset, FOF_OFFSET); } else { error = ESPIPE; } fdrop(fp, td); } else { error = EBADF; /* this can't be right */ } - mtx_unlock(&Giant); return(error); } static int dofilewrite(td, fp, fd, buf, nbyte, offset, flags) struct thread *td; struct file *fp; int fd, flags; const void *buf; size_t nbyte; off_t offset; { struct uio auio; struct iovec aiov; long cnt, error = 0; #ifdef KTRACE struct iovec ktriov; struct uio ktruio; int didktr = 0; #endif aiov.iov_base = (void *)(uintptr_t)buf; aiov.iov_len = nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = offset; if (nbyte > INT_MAX) return (EINVAL); auio.uio_resid = nbyte; auio.uio_rw = UIO_WRITE; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; #ifdef KTRACE /* * if tracing, save a copy of iovec 
and uio */ if (KTRPOINT(td->td_proc, KTR_GENIO)) { ktriov = aiov; ktruio = auio; didktr = 1; } #endif cnt = nbyte; if (fp->f_type == DTYPE_VNODE) bwillwrite(); if ((error = fo_write(fp, &auio, fp->f_cred, flags, td))) { if (auio.uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; if (error == EPIPE) { PROC_LOCK(td->td_proc); psignal(td->td_proc, SIGPIPE); PROC_UNLOCK(td->td_proc); } } cnt -= auio.uio_resid; #ifdef KTRACE if (didktr && error == 0) { ktruio.uio_iov = &ktriov; ktruio.uio_resid = cnt; ktrgenio(td->td_proc->p_tracep, fd, UIO_WRITE, &ktruio, error); } #endif td->td_retval[0] = cnt; return (error); } /* * Gather write system call */ #ifndef _SYS_SYSPROTO_H_ struct writev_args { int fd; struct iovec *iovp; u_int iovcnt; }; #endif /* * MPSAFE */ int writev(td, uap) struct thread *td; register struct writev_args *uap; { struct file *fp; struct uio auio; register struct iovec *iov; struct iovec *needfree; struct iovec aiov[UIO_SMALLIOV]; long i, cnt, error = 0; u_int iovlen; #ifdef KTRACE struct iovec *ktriov = NULL; struct uio ktruio; #endif mtx_lock(&Giant); if ((error = fget_write(td, uap->fd, &fp)) != 0) { error = EBADF; goto done2; } /* note: can't use iovlen until iovcnt is validated */ iovlen = uap->iovcnt * sizeof (struct iovec); if (uap->iovcnt > UIO_SMALLIOV) { if (uap->iovcnt > UIO_MAXIOV) { needfree = NULL; error = EINVAL; goto done; } MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); needfree = iov; } else { iov = aiov; needfree = NULL; } auio.uio_iov = iov; auio.uio_iovcnt = uap->iovcnt; auio.uio_rw = UIO_WRITE; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auio.uio_offset = -1; if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen))) goto done; auio.uio_resid = 0; for (i = 0; i < uap->iovcnt; i++) { if (iov->iov_len > INT_MAX - auio.uio_resid) { error = EINVAL; goto done; } auio.uio_resid += iov->iov_len; iov++; } #ifdef KTRACE /* * if tracing, save a copy of iovec and uio */ if 
(KTRPOINT(td->td_proc, KTR_GENIO)) { MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); ktruio = auio; } #endif cnt = auio.uio_resid; if (fp->f_type == DTYPE_VNODE) bwillwrite(); if ((error = fo_write(fp, &auio, fp->f_cred, 0, td))) { if (auio.uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; if (error == EPIPE) { PROC_LOCK(td->td_proc); psignal(td->td_proc, SIGPIPE); PROC_UNLOCK(td->td_proc); } } cnt -= auio.uio_resid; #ifdef KTRACE if (ktriov != NULL) { if (error == 0) { ktruio.uio_iov = ktriov; ktruio.uio_resid = cnt; ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_WRITE, &ktruio, error); } FREE(ktriov, M_TEMP); } #endif td->td_retval[0] = cnt; done: fdrop(fp, td); if (needfree) FREE(needfree, M_IOV); done2: mtx_unlock(&Giant); return (error); } /* * Ioctl system call */ #ifndef _SYS_SYSPROTO_H_ struct ioctl_args { int fd; u_long com; caddr_t data; }; #endif /* * MPSAFE */ /* ARGSUSED */ int ioctl(td, uap) struct thread *td; register struct ioctl_args *uap; { register struct file *fp; register struct filedesc *fdp; register u_long com; int error = 0; register u_int size; caddr_t data, memp; int tmp; #define STK_PARAMS 128 union { char stkbuf[STK_PARAMS]; long align; } ubuf; - mtx_lock(&Giant); - fdp = td->td_proc->p_fd; - if ((u_int)uap->fd >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[uap->fd]) == NULL) { - error = EBADF; - goto done2; - } - + fp = ffind_hold(td, uap->fd); + if (fp == NULL) + return (EBADF); if ((fp->f_flag & (FREAD | FWRITE)) == 0) { - error = EBADF; - goto done2; + fdrop(fp, td); + return (EBADF); } - + fdp = td->td_proc->p_fd; switch (com = uap->com) { case FIONCLEX: + FILEDESC_LOCK(fdp); fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE; - goto done2; + FILEDESC_UNLOCK(fdp); + fdrop(fp, td); + return (0); case FIOCLEX: + FILEDESC_LOCK(fdp); fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE; - goto done2; + FILEDESC_UNLOCK(fdp); + fdrop(fp, td); + return 
(0); } /* * Interpret high order word to find amount of data to be * copied to/from the user's address space. */ size = IOCPARM_LEN(com); if (size > IOCPARM_MAX) { - error = ENOTTY; - goto done2; + fdrop(fp, td); + return (ENOTTY); } - fhold(fp); - + mtx_lock(&Giant); memp = NULL; if (size > sizeof (ubuf.stkbuf)) { memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK); data = memp; } else { data = ubuf.stkbuf; } if (com&IOC_IN) { if (size) { error = copyin(uap->data, data, (u_int)size); if (error) { if (memp) free(memp, M_IOCTLOPS); fdrop(fp, td); - goto done2; + goto done; } } else { *(caddr_t *)data = uap->data; } } else if ((com&IOC_OUT) && size) { /* * Zero the buffer so the user always * gets back something deterministic. */ bzero(data, size); } else if (com&IOC_VOID) { *(caddr_t *)data = uap->data; } switch (com) { case FIONBIO: + FILE_LOCK(fp); if ((tmp = *(int *)data)) fp->f_flag |= FNONBLOCK; else fp->f_flag &= ~FNONBLOCK; + FILE_UNLOCK(fp); error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td); break; case FIOASYNC: + FILE_LOCK(fp); if ((tmp = *(int *)data)) fp->f_flag |= FASYNC; else fp->f_flag &= ~FASYNC; + FILE_UNLOCK(fp); error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, td); break; default: error = fo_ioctl(fp, com, data, td); /* * Copy any data to user, size was * already set and checked above. */ if (error == 0 && (com&IOC_OUT) && size) error = copyout(data, uap->data, (u_int)size); break; } if (memp) free(memp, M_IOCTLOPS); fdrop(fp, td); -done2: +done: mtx_unlock(&Giant); return (error); } static int nselcoll; /* Select collisions since boot */ struct cv selwait; SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, ""); /* * Select system call. 
*/ #ifndef _SYS_SYSPROTO_H_ struct select_args { int nd; fd_set *in, *ou, *ex; struct timeval *tv; }; #endif /* * MPSAFE */ int select(td, uap) register struct thread *td; register struct select_args *uap; { + struct filedesc *fdp; /* * The magic 2048 here is chosen to be just enough for FD_SETSIZE * infds with the new FD_SETSIZE of 1024, and more than enough for * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE * of 256. */ fd_mask s_selbits[howmany(2048, NFDBITS)]; fd_mask s_heldbits[howmany(2048, NFDBITS)]; fd_mask *ibits[3], *obits[3], *selbits, *sbp, *heldbits, *hibits, *hobits; struct timeval atv, rtv, ttv; int ncoll, error, timo, i; u_int nbufbytes, ncpbytes, nfdbits; if (uap->nd < 0) return (EINVAL); - + fdp = td->td_proc->p_fd; mtx_lock(&Giant); + FILEDESC_LOCK(fdp); if (uap->nd > td->td_proc->p_fd->fd_nfiles) uap->nd = td->td_proc->p_fd->fd_nfiles; /* forgiving; slightly wrong */ + FILEDESC_UNLOCK(fdp); /* * Allocate just enough bits for the non-null fd_sets. Use the * preallocated auto buffer if possible. */ nfdbits = roundup(uap->nd, NFDBITS); ncpbytes = nfdbits / NBBY; nbufbytes = 0; if (uap->in != NULL) nbufbytes += 2 * ncpbytes; if (uap->ou != NULL) nbufbytes += 2 * ncpbytes; if (uap->ex != NULL) nbufbytes += 2 * ncpbytes; if (nbufbytes <= sizeof s_selbits) selbits = &s_selbits[0]; else selbits = malloc(nbufbytes, M_SELECT, M_WAITOK); if (2 * ncpbytes <= sizeof s_heldbits) { bzero(s_heldbits, sizeof(s_heldbits)); heldbits = &s_heldbits[0]; } else heldbits = malloc(2 * ncpbytes, M_SELECT, M_WAITOK | M_ZERO); /* * Assign pointers into the bit buffers and fetch the input bits. * Put the output buffers together so that they can be bzeroed * together. 
*/ sbp = selbits; hibits = heldbits + ncpbytes / sizeof *heldbits; hobits = heldbits; #define getbits(name, x) \ do { \ if (uap->name == NULL) \ ibits[x] = NULL; \ else { \ ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \ obits[x] = sbp; \ sbp += ncpbytes / sizeof *sbp; \ error = copyin(uap->name, ibits[x], ncpbytes); \ if (error != 0) \ goto done_noproclock; \ for (i = 0; \ i < ncpbytes / sizeof ibits[i][0]; \ i++) \ hibits[i] |= ibits[x][i]; \ } \ } while (0) getbits(in, 0); getbits(ou, 1); getbits(ex, 2); #undef getbits if (nbufbytes != 0) bzero(selbits, nbufbytes / 2); if (uap->tv) { error = copyin((caddr_t)uap->tv, (caddr_t)&atv, sizeof (atv)); if (error) goto done_noproclock; if (itimerfix(&atv)) { error = EINVAL; goto done_noproclock; } getmicrouptime(&rtv); timevaladd(&atv, &rtv); } else { atv.tv_sec = 0; atv.tv_usec = 0; } selholddrop(td, hibits, hobits, uap->nd, 1); timo = 0; PROC_LOCK(td->td_proc); retry: ncoll = nselcoll; mtx_lock_spin(&sched_lock); td->td_flags |= TDF_SELECT; mtx_unlock_spin(&sched_lock); PROC_UNLOCK(td->td_proc); error = selscan(td, ibits, obits, uap->nd); PROC_LOCK(td->td_proc); if (error || td->td_retval[0]) goto done; if (atv.tv_sec || atv.tv_usec) { getmicrouptime(&rtv); if (timevalcmp(&rtv, &atv, >=)) { /* * An event of our interest may occur during locking a process. * In order to avoid missing the event that occured during locking * the process, test TDF_SELECT and rescan file descriptors if * necessary. */ mtx_lock_spin(&sched_lock); if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { ncoll = nselcoll; td->td_flags |= TDF_SELECT; mtx_unlock_spin(&sched_lock); PROC_UNLOCK(td->td_proc); error = selscan(td, ibits, obits, uap->nd); PROC_LOCK(td->td_proc); } else mtx_unlock_spin(&sched_lock); goto done; } ttv = atv; timevalsub(&ttv, &rtv); timo = ttv.tv_sec > 24 * 60 * 60 ? 
24 * 60 * 60 * hz : tvtohz(&ttv); } mtx_lock_spin(&sched_lock); td->td_flags &= ~TDF_SELECT; mtx_unlock_spin(&sched_lock); if (timo > 0) error = cv_timedwait_sig(&selwait, &td->td_proc->p_mtx, timo); else error = cv_wait_sig(&selwait, &td->td_proc->p_mtx); if (error == 0) goto retry; done: mtx_lock_spin(&sched_lock); td->td_flags &= ~TDF_SELECT; mtx_unlock_spin(&sched_lock); PROC_UNLOCK(td->td_proc); selholddrop(td, hibits, hobits, uap->nd, 0); done_noproclock: /* select is not restarted after signals... */ if (error == ERESTART) error = EINTR; if (error == EWOULDBLOCK) error = 0; #define putbits(name, x) \ if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \ error = error2; if (error == 0) { int error2; putbits(in, 0); putbits(ou, 1); putbits(ex, 2); #undef putbits } if (selbits != &s_selbits[0]) free(selbits, M_SELECT); if (heldbits != &s_heldbits[0]) free(heldbits, M_SELECT); mtx_unlock(&Giant); return (error); } +/* + * Used to hold then release a group of fds for select(2). + * Hold (hold == 1) or release (hold == 0) a group of filedescriptors. + * if holding then use ibits setting the bits in obits, otherwise use obits. + */ static int selholddrop(td, ibits, obits, nfd, hold) struct thread *td; fd_mask *ibits, *obits; int nfd, hold; { struct filedesc *fdp = td->td_proc->p_fd; int i, fd; fd_mask bits; struct file *fp; + FILEDESC_LOCK(fdp); for (i = 0; i < nfd; i += NFDBITS) { if (hold) bits = ibits[i/NFDBITS]; else bits = obits[i/NFDBITS]; /* ffs(int mask) not portable, fd_mask is long */ for (fd = i; bits && fd < nfd; fd++, bits >>= 1) { if (!(bits & 1)) continue; fp = fdp->fd_ofiles[fd]; - if (fp == NULL) + if (fp == NULL) { + FILEDESC_UNLOCK(fdp); return (EBADF); + } if (hold) { fhold(fp); obits[(fd)/NFDBITS] |= ((fd_mask)1 << ((fd) % NFDBITS)); - } else + } else { + /* XXX: optimize by making a special + * version of fdrop that only unlocks + * the filedesc if needed? 
This would + * reduce the number of lock/unlock + * pairs by quite a bit. + */ + FILEDESC_UNLOCK(fdp); fdrop(fp, td); + FILEDESC_LOCK(fdp); + } } } + FILEDESC_UNLOCK(fdp); return (0); } static int selscan(td, ibits, obits, nfd) struct thread *td; fd_mask **ibits, **obits; int nfd; { - struct filedesc *fdp = td->td_proc->p_fd; int msk, i, fd; fd_mask bits; struct file *fp; int n = 0; /* Note: backend also returns POLLHUP/POLLERR if appropriate. */ static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND }; for (msk = 0; msk < 3; msk++) { if (ibits[msk] == NULL) continue; for (i = 0; i < nfd; i += NFDBITS) { bits = ibits[msk][i/NFDBITS]; /* ffs(int mask) not portable, fd_mask is long */ for (fd = i; bits && fd < nfd; fd++, bits >>= 1) { if (!(bits & 1)) continue; - fp = fdp->fd_ofiles[fd]; + fp = ffind_hold(td, fd); if (fp == NULL) return (EBADF); if (fo_poll(fp, flag[msk], fp->f_cred, td)) { obits[msk][(fd)/NFDBITS] |= ((fd_mask)1 << ((fd) % NFDBITS)); n++; } + fdrop(fp, td); } } } td->td_retval[0] = n; return (0); } /* * Poll system call. */ #ifndef _SYS_SYSPROTO_H_ struct poll_args { struct pollfd *fds; u_int nfds; int timeout; }; #endif /* * MPSAFE */ int poll(td, uap) struct thread *td; struct poll_args *uap; { caddr_t bits; char smallbits[32 * sizeof(struct pollfd)]; struct timeval atv, rtv, ttv; int ncoll, error = 0, timo; u_int nfds; size_t ni; struct pollfd p_heldbits[32]; struct pollfd *heldbits; nfds = SCARG(uap, nfds); mtx_lock(&Giant); /* * This is kinda bogus. We have fd limits, but that is not * really related to the size of the pollfd array. Make sure * we let the process use at least FD_SETSIZE entries and at * least enough for the current limits. We want to be reasonably * safe, but not overly restrictive. 
*/ if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) && (nfds > FD_SETSIZE)) { error = EINVAL; goto done2; } ni = nfds * sizeof(struct pollfd); if (ni > sizeof(smallbits)) bits = malloc(ni, M_TEMP, M_WAITOK); else bits = smallbits; if (ni > sizeof(p_heldbits)) heldbits = malloc(ni, M_TEMP, M_WAITOK); else { bzero(p_heldbits, sizeof(p_heldbits)); heldbits = p_heldbits; } error = copyin(SCARG(uap, fds), bits, ni); if (error) goto done_noproclock; bcopy(bits, heldbits, ni); if (SCARG(uap, timeout) != INFTIM) { atv.tv_sec = SCARG(uap, timeout) / 1000; atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000; if (itimerfix(&atv)) { error = EINVAL; goto done_noproclock; } getmicrouptime(&rtv); timevaladd(&atv, &rtv); } else { atv.tv_sec = 0; atv.tv_usec = 0; } pollholddrop(td, heldbits, nfds, 1); timo = 0; PROC_LOCK(td->td_proc); retry: ncoll = nselcoll; mtx_lock_spin(&sched_lock); td->td_flags |= TDF_SELECT; mtx_unlock_spin(&sched_lock); PROC_UNLOCK(td->td_proc); error = pollscan(td, (struct pollfd *)bits, nfds); PROC_LOCK(td->td_proc); if (error || td->td_retval[0]) goto done; if (atv.tv_sec || atv.tv_usec) { getmicrouptime(&rtv); if (timevalcmp(&rtv, &atv, >=)) { /* * An event of our interest may occur during locking a process. * In order to avoid missing the event that occured during locking * the process, test TDF_SELECT and rescan file descriptors if * necessary. */ mtx_lock_spin(&sched_lock); if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { ncoll = nselcoll; td->td_flags |= TDF_SELECT; mtx_unlock_spin(&sched_lock); PROC_UNLOCK(td->td_proc); error = pollscan(td, (struct pollfd *)bits, nfds); PROC_LOCK(td->td_proc); } else mtx_unlock_spin(&sched_lock); goto done; } ttv = atv; timevalsub(&ttv, &rtv); timo = ttv.tv_sec > 24 * 60 * 60 ? 
24 * 60 * 60 * hz : tvtohz(&ttv); } mtx_lock_spin(&sched_lock); td->td_flags &= ~TDF_SELECT; mtx_unlock_spin(&sched_lock); if (timo > 0) error = cv_timedwait_sig(&selwait, &td->td_proc->p_mtx, timo); else error = cv_wait_sig(&selwait, &td->td_proc->p_mtx); if (error == 0) goto retry; done: mtx_lock_spin(&sched_lock); td->td_flags &= ~TDF_SELECT; mtx_unlock_spin(&sched_lock); PROC_UNLOCK(td->td_proc); pollholddrop(td, heldbits, nfds, 0); done_noproclock: /* poll is not restarted after signals... */ if (error == ERESTART) error = EINTR; if (error == EWOULDBLOCK) error = 0; if (error == 0) { error = copyout(bits, SCARG(uap, fds), ni); if (error) goto out; } out: if (ni > sizeof(smallbits)) free(bits, M_TEMP); if (ni > sizeof(p_heldbits)) free(heldbits, M_TEMP); done2: mtx_unlock(&Giant); return (error); } static int pollholddrop(td, fds, nfd, hold) struct thread *td; struct pollfd *fds; u_int nfd; int hold; { register struct filedesc *fdp = td->td_proc->p_fd; int i; struct file *fp; + FILEDESC_LOCK(fdp); for (i = 0; i < nfd; i++, fds++) { if (0 <= fds->fd && fds->fd < fdp->fd_nfiles) { fp = fdp->fd_ofiles[fds->fd]; if (hold) { if (fp != NULL) { fhold(fp); fds->revents = 1; } else fds->revents = 0; - } else if(fp != NULL && fds->revents) - fdrop(fp, td); + } else if(fp != NULL && fds->revents) { + FILE_LOCK(fp); + FILEDESC_UNLOCK(fdp); + fdrop_locked(fp, td); + FILEDESC_LOCK(fdp); + } } } + FILEDESC_UNLOCK(fdp); return (0); } static int pollscan(td, fds, nfd) struct thread *td; struct pollfd *fds; u_int nfd; { register struct filedesc *fdp = td->td_proc->p_fd; int i; struct file *fp; int n = 0; for (i = 0; i < nfd; i++, fds++) { + FILEDESC_LOCK(fdp); if (fds->fd >= fdp->fd_nfiles) { fds->revents = POLLNVAL; n++; + FILEDESC_UNLOCK(fdp); } else if (fds->fd < 0) { fds->revents = 0; + FILEDESC_UNLOCK(fdp); } else { fp = fdp->fd_ofiles[fds->fd]; + FILEDESC_UNLOCK(fdp); if (fp == NULL) { fds->revents = POLLNVAL; n++; } else { /* * Note: backend also returns POLLHUP and * 
POLLERR if appropriate. */ fds->revents = fo_poll(fp, fds->events, fp->f_cred, td); if (fds->revents != 0) n++; } } } td->td_retval[0] = n; return (0); } /* * OpenBSD poll system call. * XXX this isn't quite a true representation.. OpenBSD uses select ops. */ #ifndef _SYS_SYSPROTO_H_ struct openbsd_poll_args { struct pollfd *fds; u_int nfds; int timeout; }; #endif /* * MPSAFE */ int openbsd_poll(td, uap) register struct thread *td; register struct openbsd_poll_args *uap; { return (poll(td, (struct poll_args *)uap)); } /*ARGSUSED*/ int seltrue(dev, events, td) dev_t dev; int events; struct thread *td; { return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); } static int find_thread_in_proc(struct proc *p, struct thread *td) { struct thread *td2; FOREACH_THREAD_IN_PROC(p, td2) { if (td2 == td) { return (1); } } return (0); } /* * Record a select request. */ void selrecord(selector, sip) struct thread *selector; struct selinfo *sip; { struct proc *p; pid_t mypid; mypid = selector->td_proc->p_pid; if ((sip->si_pid == mypid) && (sip->si_thread == selector)) { /* XXXKSE should be an ID? */ return; } if (sip->si_pid && (p = pfind(sip->si_pid)) && (find_thread_in_proc(p, sip->si_thread))) { mtx_lock_spin(&sched_lock); if (sip->si_thread->td_wchan == (caddr_t)&selwait) { mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); sip->si_flags |= SI_COLL; return; } mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); } sip->si_pid = mypid; sip->si_thread = selector; } /* * Do a wakeup when a selectable event occurs. 
*/ void selwakeup(sip) register struct selinfo *sip; { struct thread *td; register struct proc *p; if (sip->si_pid == 0) return; if (sip->si_flags & SI_COLL) { nselcoll++; sip->si_flags &= ~SI_COLL; cv_broadcast(&selwait); } p = pfind(sip->si_pid); sip->si_pid = 0; td = sip->si_thread; if (p != NULL) { if (!find_thread_in_proc(p, td)) { PROC_UNLOCK(p); /* lock is in pfind() */; return; } mtx_lock_spin(&sched_lock); if (td->td_wchan == (caddr_t)&selwait) { if (td->td_proc->p_stat == SSLEEP) setrunnable(td); else cv_waitq_remove(td); } else td->td_flags &= ~TDF_SELECT; mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); /* Lock is in pfind() */ } } static void selectinit __P((void *)); SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL) /* ARGSUSED*/ static void selectinit(dummy) void *dummy; { cv_init(&selwait, "select"); } Index: head/sys/kern/sys_pipe.c =================================================================== --- head/sys/kern/sys_pipe.c (revision 89305) +++ head/sys/kern/sys_pipe.c (revision 89306) @@ -1,1302 +1,1318 @@ /* * Copyright (c) 1996 John S. Dyson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * 4. Modifications may be freely made to this file if the above conditions * are met. * * $FreeBSD$ */ /* * This file contains a high-performance replacement for the socket-based * pipes scheme originally used in FreeBSD/4.4Lite. 
It does not support * all features of sockets, but does do everything that pipes normally * do. */ /* * This code has two modes of operation, a small write mode and a large * write mode. The small write mode acts like conventional pipes with * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and * the receiving process can copy it directly from the pages in the sending * process. * * If the sending process receives a signal, it is possible that it will * go away, and certainly its address space can change, because control * is returned back to the user-mode side. In that case, the pipe code * arranges to copy the buffer supplied by the user process, to a pageable * kernel buffer, and the receiving process will grab the data from the * pageable kernel buffer. Since signals don't happen all that often, * the copy operation is normally eliminated. * * The constant PIPE_MINDIRECT is chosen to make sure that buffering will * happen for small transfers so that the system will not spend all of * its time context switching. PIPE_SIZE is constrained by the * amount of kernel virtual memory. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Use this define if you want to disable *fancy* VM things. Expect an * approx 30% decrease in transfer rate. This could be useful for * NetBSD or OpenBSD. 
*/ /* #define PIPE_NODIRECT */ /* * interfaces to the outside world */ static int pipe_read __P((struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td)); static int pipe_write __P((struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td)); static int pipe_close __P((struct file *fp, struct thread *td)); static int pipe_poll __P((struct file *fp, int events, struct ucred *cred, struct thread *td)); static int pipe_kqfilter __P((struct file *fp, struct knote *kn)); static int pipe_stat __P((struct file *fp, struct stat *sb, struct thread *td)); static int pipe_ioctl __P((struct file *fp, u_long cmd, caddr_t data, struct thread *td)); static struct fileops pipeops = { pipe_read, pipe_write, pipe_ioctl, pipe_poll, pipe_kqfilter, pipe_stat, pipe_close }; static void filt_pipedetach(struct knote *kn); static int filt_piperead(struct knote *kn, long hint); static int filt_pipewrite(struct knote *kn, long hint); static struct filterops pipe_rfiltops = { 1, NULL, filt_pipedetach, filt_piperead }; static struct filterops pipe_wfiltops = { 1, NULL, filt_pipedetach, filt_pipewrite }; /* * Default pipe buffer size(s), this can be kind-of large now because pipe * space is pageable. The pipe code will try to maintain locality of * reference for performance reasons, so small amounts of outstanding I/O * will not wipe the cache. */ #define MINPIPESIZE (PIPE_SIZE/3) #define MAXPIPESIZE (2*PIPE_SIZE/3) /* * Maximum amount of kva for pipes -- this is kind-of a soft limit, but * is there so that on large systems, we don't exhaust it. */ #define MAXPIPEKVA (8*1024*1024) /* * Limit for direct transfers, we cannot, of course limit * the amount of kva for pipes in general though. 
*/ #define LIMITPIPEKVA (16*1024*1024) /* * Limit the number of "big" pipes */ #define LIMITBIGPIPES 32 static int nbigpipe; static int amountpipekva; static void pipeclose __P((struct pipe *cpipe)); static void pipe_free_kmem __P((struct pipe *cpipe)); static int pipe_create __P((struct pipe **cpipep)); static __inline int pipelock __P((struct pipe *cpipe, int catch)); static __inline void pipeunlock __P((struct pipe *cpipe)); static __inline void pipeselwakeup __P((struct pipe *cpipe)); #ifndef PIPE_NODIRECT static int pipe_build_write_buffer __P((struct pipe *wpipe, struct uio *uio)); static void pipe_destroy_write_buffer __P((struct pipe *wpipe)); static int pipe_direct_write __P((struct pipe *wpipe, struct uio *uio)); static void pipe_clone_write_buffer __P((struct pipe *wpipe)); #endif static int pipespace __P((struct pipe *cpipe, int size)); static vm_zone_t pipe_zone; /* * The pipe system call for the DTYPE_PIPE type of pipes */ /* ARGSUSED */ int pipe(td, uap) struct thread *td; struct pipe_args /* { int dummy; } */ *uap; { struct filedesc *fdp = td->td_proc->p_fd; struct file *rf, *wf; struct pipe *rpipe, *wpipe; int fd, error; if (pipe_zone == NULL) pipe_zone = zinit("PIPE", sizeof(struct pipe), 0, 0, 4); rpipe = wpipe = NULL; if (pipe_create(&rpipe) || pipe_create(&wpipe)) { pipeclose(rpipe); pipeclose(wpipe); return (ENFILE); } rpipe->pipe_state |= PIPE_DIRECTOK; wpipe->pipe_state |= PIPE_DIRECTOK; error = falloc(td, &rf, &fd); if (error) { pipeclose(rpipe); pipeclose(wpipe); return (error); } fhold(rf); td->td_retval[0] = fd; /* * Warning: once we've gotten past allocation of the fd for the * read-side, we can only drop the read side via fdrop() in order * to avoid races against processes which manage to dup() the read * side while we are blocked trying to allocate the write side. 
*/ + FILE_LOCK(rf); rf->f_flag = FREAD | FWRITE; rf->f_type = DTYPE_PIPE; rf->f_data = (caddr_t)rpipe; rf->f_ops = &pipeops; + FILE_UNLOCK(rf); error = falloc(td, &wf, &fd); if (error) { + FILEDESC_LOCK(fdp); if (fdp->fd_ofiles[td->td_retval[0]] == rf) { fdp->fd_ofiles[td->td_retval[0]] = NULL; + FILEDESC_UNLOCK(fdp); fdrop(rf, td); - } + } else + FILEDESC_UNLOCK(fdp); fdrop(rf, td); /* rpipe has been closed by fdrop(). */ pipeclose(wpipe); return (error); } + FILE_LOCK(wf); wf->f_flag = FREAD | FWRITE; wf->f_type = DTYPE_PIPE; wf->f_data = (caddr_t)wpipe; wf->f_ops = &pipeops; + FILE_UNLOCK(wf); td->td_retval[1] = fd; - rpipe->pipe_peer = wpipe; wpipe->pipe_peer = rpipe; fdrop(rf, td); return (0); } /* * Allocate kva for pipe circular buffer, the space is pageable * This routine will 'realloc' the size of a pipe safely, if it fails * it will retain the old buffer. * If it fails it will return ENOMEM. */ static int pipespace(cpipe, size) struct pipe *cpipe; int size; { struct vm_object *object; caddr_t buffer; int npages, error; GIANT_REQUIRED; npages = round_page(size)/PAGE_SIZE; /* * Create an object, I don't like the idea of paging to/from * kernel_object. * XXX -- minor change needed here for NetBSD/OpenBSD VM systems. */ object = vm_object_allocate(OBJT_DEFAULT, npages); buffer = (caddr_t) vm_map_min(kernel_map); /* * Insert the object into the kernel map, and allocate kva for it. * The map entry is, by default, pageable. * XXX -- minor change needed here for NetBSD/OpenBSD VM systems. 
*/ error = vm_map_find(kernel_map, object, 0, (vm_offset_t *) &buffer, size, 1, VM_PROT_ALL, VM_PROT_ALL, 0); if (error != KERN_SUCCESS) { vm_object_deallocate(object); return (ENOMEM); } /* free old resources if we're resizing */ pipe_free_kmem(cpipe); cpipe->pipe_buffer.object = object; cpipe->pipe_buffer.buffer = buffer; cpipe->pipe_buffer.size = size; cpipe->pipe_buffer.in = 0; cpipe->pipe_buffer.out = 0; cpipe->pipe_buffer.cnt = 0; amountpipekva += cpipe->pipe_buffer.size; return (0); } /* * initialize and allocate VM and memory for pipe */ static int pipe_create(cpipep) struct pipe **cpipep; { struct pipe *cpipe; int error; *cpipep = zalloc(pipe_zone); if (*cpipep == NULL) return (ENOMEM); cpipe = *cpipep; /* so pipespace()->pipe_free_kmem() doesn't follow junk pointer */ cpipe->pipe_buffer.object = NULL; #ifndef PIPE_NODIRECT cpipe->pipe_map.kva = NULL; #endif /* * protect so pipeclose() doesn't follow a junk pointer * if pipespace() fails. */ bzero(&cpipe->pipe_sel, sizeof(cpipe->pipe_sel)); cpipe->pipe_state = 0; cpipe->pipe_peer = NULL; cpipe->pipe_busy = 0; #ifndef PIPE_NODIRECT /* * pipe data structure initializations to support direct pipe I/O */ cpipe->pipe_map.cnt = 0; cpipe->pipe_map.kva = 0; cpipe->pipe_map.pos = 0; cpipe->pipe_map.npages = 0; /* cpipe->pipe_map.ms[] = invalid */ #endif error = pipespace(cpipe, PIPE_SIZE); if (error) return (error); vfs_timestamp(&cpipe->pipe_ctime); cpipe->pipe_atime = cpipe->pipe_ctime; cpipe->pipe_mtime = cpipe->pipe_ctime; return (0); } /* * lock a pipe for I/O, blocking other access */ static __inline int pipelock(cpipe, catch) struct pipe *cpipe; int catch; { int error; while (cpipe->pipe_state & PIPE_LOCK) { cpipe->pipe_state |= PIPE_LWANT; error = tsleep(cpipe, catch ? 
(PRIBIO | PCATCH) : PRIBIO, "pipelk", 0); if (error != 0) return (error); } cpipe->pipe_state |= PIPE_LOCK; return (0); } /* * unlock a pipe I/O lock */ static __inline void pipeunlock(cpipe) struct pipe *cpipe; { cpipe->pipe_state &= ~PIPE_LOCK; if (cpipe->pipe_state & PIPE_LWANT) { cpipe->pipe_state &= ~PIPE_LWANT; wakeup(cpipe); } } static __inline void pipeselwakeup(cpipe) struct pipe *cpipe; { if (cpipe->pipe_state & PIPE_SEL) { cpipe->pipe_state &= ~PIPE_SEL; selwakeup(&cpipe->pipe_sel); } if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio) pgsigio(cpipe->pipe_sigio, SIGIO, 0); KNOTE(&cpipe->pipe_sel.si_note, 0); } /* ARGSUSED */ static int pipe_read(fp, uio, cred, flags, td) struct file *fp; struct uio *uio; struct ucred *cred; struct thread *td; int flags; { struct pipe *rpipe = (struct pipe *) fp->f_data; int error; int nread = 0; u_int size; ++rpipe->pipe_busy; error = pipelock(rpipe, 1); if (error) goto unlocked_error; while (uio->uio_resid) { /* * normal pipe buffer receive */ if (rpipe->pipe_buffer.cnt > 0) { size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out; if (size > rpipe->pipe_buffer.cnt) size = rpipe->pipe_buffer.cnt; if (size > (u_int) uio->uio_resid) size = (u_int) uio->uio_resid; error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out], size, uio); if (error) break; rpipe->pipe_buffer.out += size; if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size) rpipe->pipe_buffer.out = 0; rpipe->pipe_buffer.cnt -= size; /* * If there is no more to read in the pipe, reset * its pointers to the beginning. This improves * cache hit stats. */ if (rpipe->pipe_buffer.cnt == 0) { rpipe->pipe_buffer.in = 0; rpipe->pipe_buffer.out = 0; } nread += size; #ifndef PIPE_NODIRECT /* * Direct copy, bypassing a kernel buffer. 
*/ } else if ((size = rpipe->pipe_map.cnt) && (rpipe->pipe_state & PIPE_DIRECTW)) { caddr_t va; if (size > (u_int) uio->uio_resid) size = (u_int) uio->uio_resid; va = (caddr_t) rpipe->pipe_map.kva + rpipe->pipe_map.pos; error = uiomove(va, size, uio); if (error) break; nread += size; rpipe->pipe_map.pos += size; rpipe->pipe_map.cnt -= size; if (rpipe->pipe_map.cnt == 0) { rpipe->pipe_state &= ~PIPE_DIRECTW; wakeup(rpipe); } #endif } else { /* * detect EOF condition * read returns 0 on EOF, no need to set error */ if (rpipe->pipe_state & PIPE_EOF) break; /* * If the "write-side" has been blocked, wake it up now. */ if (rpipe->pipe_state & PIPE_WANTW) { rpipe->pipe_state &= ~PIPE_WANTW; wakeup(rpipe); } /* * Break if some data was read. */ if (nread > 0) break; /* * Unlock the pipe buffer for our remaining processing. We * will either break out with an error or we will sleep and * relock to loop. */ pipeunlock(rpipe); /* * Handle non-blocking mode operation or * wait for more data. */ + FILE_LOCK(fp); if (fp->f_flag & FNONBLOCK) { + FILE_UNLOCK(fp); error = EAGAIN; } else { + FILE_UNLOCK(fp); rpipe->pipe_state |= PIPE_WANTR; if ((error = tsleep(rpipe, PRIBIO | PCATCH, "piperd", 0)) == 0) error = pipelock(rpipe, 1); } if (error) goto unlocked_error; } } pipeunlock(rpipe); if (error == 0) vfs_timestamp(&rpipe->pipe_atime); unlocked_error: --rpipe->pipe_busy; /* * PIPE_WANT processing only makes sense if pipe_busy is 0. */ if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) { rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW); wakeup(rpipe); } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) { /* * Handle write blocking hysteresis. */ if (rpipe->pipe_state & PIPE_WANTW) { rpipe->pipe_state &= ~PIPE_WANTW; wakeup(rpipe); } } if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF) pipeselwakeup(rpipe); return (error); } #ifndef PIPE_NODIRECT /* * Map the sending processes' buffer into kernel space and wire it. 
* This is similar to a physical write operation. */ static int pipe_build_write_buffer(wpipe, uio) struct pipe *wpipe; struct uio *uio; { u_int size; int i; vm_offset_t addr, endaddr, paddr; GIANT_REQUIRED; size = (u_int) uio->uio_iov->iov_len; if (size > wpipe->pipe_buffer.size) size = wpipe->pipe_buffer.size; endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size); addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base); for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) { vm_page_t m; if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0 || (paddr = pmap_kextract(addr)) == 0) { int j; for (j = 0; j < i; j++) vm_page_unwire(wpipe->pipe_map.ms[j], 1); return (EFAULT); } m = PHYS_TO_VM_PAGE(paddr); vm_page_wire(m); wpipe->pipe_map.ms[i] = m; } /* * set up the control block */ wpipe->pipe_map.npages = i; wpipe->pipe_map.pos = ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK; wpipe->pipe_map.cnt = size; /* * and map the buffer */ if (wpipe->pipe_map.kva == 0) { /* * We need to allocate space for an extra page because the * address range might (will) span pages at times. 
*/ wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map, wpipe->pipe_buffer.size + PAGE_SIZE); amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE; } pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms, wpipe->pipe_map.npages); /* * and update the uio data */ uio->uio_iov->iov_len -= size; uio->uio_iov->iov_base += size; if (uio->uio_iov->iov_len == 0) uio->uio_iov++; uio->uio_resid -= size; uio->uio_offset += size; return (0); } /* * unmap and unwire the process buffer */ static void pipe_destroy_write_buffer(wpipe) struct pipe *wpipe; { int i; GIANT_REQUIRED; if (wpipe->pipe_map.kva) { pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages); if (amountpipekva > MAXPIPEKVA) { vm_offset_t kva = wpipe->pipe_map.kva; wpipe->pipe_map.kva = 0; kmem_free(kernel_map, kva, wpipe->pipe_buffer.size + PAGE_SIZE); amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE; } } for (i = 0; i < wpipe->pipe_map.npages; i++) vm_page_unwire(wpipe->pipe_map.ms[i], 1); } /* * In the case of a signal, the writing process might go away. This * code copies the data into the circular buffer so that the source * pages can be freed without loss of data. */ static void pipe_clone_write_buffer(wpipe) struct pipe *wpipe; { int size; int pos; size = wpipe->pipe_map.cnt; pos = wpipe->pipe_map.pos; bcopy((caddr_t) wpipe->pipe_map.kva + pos, (caddr_t) wpipe->pipe_buffer.buffer, size); wpipe->pipe_buffer.in = size; wpipe->pipe_buffer.out = 0; wpipe->pipe_buffer.cnt = size; wpipe->pipe_state &= ~PIPE_DIRECTW; pipe_destroy_write_buffer(wpipe); } /* * This implements the pipe buffer write mechanism. Note that only * a direct write OR a normal pipe write can be pending at any given time. * If there are any characters in the pipe buffer, the direct write will * be deferred until the receiving process grabs all of the bytes from * the pipe buffer. Then the direct mapping write is set-up. 
*/ static int pipe_direct_write(wpipe, uio) struct pipe *wpipe; struct uio *uio; { int error; retry: while (wpipe->pipe_state & PIPE_DIRECTW) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } wpipe->pipe_state |= PIPE_WANTW; error = tsleep(wpipe, PRIBIO | PCATCH, "pipdww", 0); if (error) goto error1; if (wpipe->pipe_state & PIPE_EOF) { error = EPIPE; goto error1; } } wpipe->pipe_map.cnt = 0; /* transfer not ready yet */ if (wpipe->pipe_buffer.cnt > 0) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } wpipe->pipe_state |= PIPE_WANTW; error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwc", 0); if (error) goto error1; if (wpipe->pipe_state & PIPE_EOF) { error = EPIPE; goto error1; } goto retry; } wpipe->pipe_state |= PIPE_DIRECTW; error = pipe_build_write_buffer(wpipe, uio); if (error) { wpipe->pipe_state &= ~PIPE_DIRECTW; goto error1; } error = 0; while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) { if (wpipe->pipe_state & PIPE_EOF) { pipelock(wpipe, 0); pipe_destroy_write_buffer(wpipe); pipeunlock(wpipe); pipeselwakeup(wpipe); error = EPIPE; goto error1; } if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwt", 0); } pipelock(wpipe,0); if (wpipe->pipe_state & PIPE_DIRECTW) { /* * this bit of trickery substitutes a kernel buffer for * the process that might be going away. */ pipe_clone_write_buffer(wpipe); } else { pipe_destroy_write_buffer(wpipe); } pipeunlock(wpipe); return (error); error1: wakeup(wpipe); return (error); } #endif static int pipe_write(fp, uio, cred, flags, td) struct file *fp; struct uio *uio; struct ucred *cred; struct thread *td; int flags; { int error = 0; int orig_resid; struct pipe *wpipe, *rpipe; rpipe = (struct pipe *) fp->f_data; wpipe = rpipe->pipe_peer; /* * detect loss of pipe read side, issue SIGPIPE if lost. 
*/ if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) { return (EPIPE); } ++wpipe->pipe_busy; /* * If it is advantageous to resize the pipe buffer, do * so. */ if ((uio->uio_resid > PIPE_SIZE) && (nbigpipe < LIMITBIGPIPES) && (wpipe->pipe_state & PIPE_DIRECTW) == 0 && (wpipe->pipe_buffer.size <= PIPE_SIZE) && (wpipe->pipe_buffer.cnt == 0)) { if ((error = pipelock(wpipe,1)) == 0) { if (pipespace(wpipe, BIG_PIPE_SIZE) == 0) nbigpipe++; pipeunlock(wpipe); } } /* * If an early error occured unbusy and return, waking up any pending * readers. */ if (error) { --wpipe->pipe_busy; if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) { wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR); wakeup(wpipe); } return(error); } KASSERT(wpipe->pipe_buffer.buffer != NULL, ("pipe buffer gone")); orig_resid = uio->uio_resid; while (uio->uio_resid) { int space; #ifndef PIPE_NODIRECT /* * If the transfer is large, we can gain performance if * we do process-to-process copies directly. * If the write is non-blocking, we don't use the * direct write mechanism. * * The direct write mechanism will detect the reader going * away on us. */ + FILE_LOCK(fp); if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) && (fp->f_flag & FNONBLOCK) == 0 && (wpipe->pipe_map.kva || (amountpipekva < LIMITPIPEKVA)) && (uio->uio_iov->iov_len >= PIPE_MINDIRECT)) { + FILE_UNLOCK(fp); error = pipe_direct_write( wpipe, uio); if (error) break; continue; - } + } else + FILE_UNLOCK(fp); #endif /* * Pipe buffered writes cannot be coincidental with * direct writes. We wait until the currently executing * direct write is completed before we start filling the * pipe buffer. We break out if a signal occurs or the * reader goes away. 
*/ retrywrite: while (wpipe->pipe_state & PIPE_DIRECTW) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } error = tsleep(wpipe, PRIBIO | PCATCH, "pipbww", 0); if (wpipe->pipe_state & PIPE_EOF) break; if (error) break; } if (wpipe->pipe_state & PIPE_EOF) { error = EPIPE; break; } space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; /* Writes of size <= PIPE_BUF must be atomic. */ if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF)) space = 0; if (space > 0 && (wpipe->pipe_buffer.cnt < PIPE_SIZE)) { if ((error = pipelock(wpipe,1)) == 0) { int size; /* Transfer size */ int segsize; /* first segment to transfer */ /* * It is possible for a direct write to * slip in on us... handle it here... */ if (wpipe->pipe_state & PIPE_DIRECTW) { pipeunlock(wpipe); goto retrywrite; } /* * If a process blocked in uiomove, our * value for space might be bad. * * XXX will we be ok if the reader has gone * away here? */ if (space > wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) { pipeunlock(wpipe); goto retrywrite; } /* * Transfer size is minimum of uio transfer * and free space in pipe buffer. */ if (space > uio->uio_resid) size = uio->uio_resid; else size = space; /* * First segment to transfer is minimum of * transfer size and contiguous space in * pipe buffer. If first segment to transfer * is less than the transfer size, we've got * a wraparound in the buffer. */ segsize = wpipe->pipe_buffer.size - wpipe->pipe_buffer.in; if (segsize > size) segsize = size; /* Transfer first segment */ error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in], segsize, uio); if (error == 0 && segsize < size) { /* * Transfer remaining part now, to * support atomic writes. Wraparound * happened. 
*/ if (wpipe->pipe_buffer.in + segsize != wpipe->pipe_buffer.size) panic("Expected pipe buffer wraparound disappeared"); error = uiomove(&wpipe->pipe_buffer.buffer[0], size - segsize, uio); } if (error == 0) { wpipe->pipe_buffer.in += size; if (wpipe->pipe_buffer.in >= wpipe->pipe_buffer.size) { if (wpipe->pipe_buffer.in != size - segsize + wpipe->pipe_buffer.size) panic("Expected wraparound bad"); wpipe->pipe_buffer.in = size - segsize; } wpipe->pipe_buffer.cnt += size; if (wpipe->pipe_buffer.cnt > wpipe->pipe_buffer.size) panic("Pipe buffer overflow"); } pipeunlock(wpipe); } if (error) break; } else { /* * If the "read-side" has been blocked, wake it up now. */ if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } /* * don't block on non-blocking I/O */ + FILE_LOCK(fp); if (fp->f_flag & FNONBLOCK) { + FILE_UNLOCK(fp); error = EAGAIN; break; } + FILE_UNLOCK(fp); /* * We have no more space and have something to offer, * wake up select/poll. */ pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; error = tsleep(wpipe, PRIBIO | PCATCH, "pipewr", 0); if (error != 0) break; /* * If read side wants to go away, we just issue a signal * to ourselves. */ if (wpipe->pipe_state & PIPE_EOF) { error = EPIPE; break; } } } --wpipe->pipe_busy; if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) { wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR); wakeup(wpipe); } else if (wpipe->pipe_buffer.cnt > 0) { /* * If we have put any characters in the buffer, we wake up * the reader. */ if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } } /* * Don't return EPIPE if I/O was successful */ if ((wpipe->pipe_buffer.cnt == 0) && (uio->uio_resid == 0) && (error == EPIPE)) { error = 0; } if (error == 0) vfs_timestamp(&wpipe->pipe_mtime); /* * We have something to offer, * wake up select/poll. 
*/
	/* Tail of pipe_write(): data was buffered, notify select/poll waiters. */
	if (wpipe->pipe_buffer.cnt)
		pipeselwakeup(wpipe);

	return (error);
}

/*
 * we implement a very minimal set of ioctls for compatibility with sockets.
 */
int
pipe_ioctl(fp, cmd, data, td)
	struct file *fp;
	u_long cmd;
	caddr_t data;
	struct thread *td;
{
	/* The pipe endpoint behind this file descriptor. */
	struct pipe *mpipe = (struct pipe *)fp->f_data;

	switch (cmd) {

	case FIONBIO:
		/* Non-blocking mode lives in fp->f_flag; nothing to do here. */
		return (0);

	case FIOASYNC:
		if (*(int *)data) {
			mpipe->pipe_state |= PIPE_ASYNC;
		} else {
			mpipe->pipe_state &= ~PIPE_ASYNC;
		}
		return (0);

	case FIONREAD:
		/*
		 * Bytes ready to read: the direct-write map count while a
		 * direct write is pending, otherwise the in-kernel buffer.
		 */
		if (mpipe->pipe_state & PIPE_DIRECTW)
			*(int *)data = mpipe->pipe_map.cnt;
		else
			*(int *)data = mpipe->pipe_buffer.cnt;
		return (0);

	case FIOSETOWN:
		return (fsetown(*(int *)data, &mpipe->pipe_sigio));

	case FIOGETOWN:
		*(int *)data = fgetown(mpipe->pipe_sigio);
		return (0);

	/* This is deprecated, FIOSETOWN should be used instead. */
	case TIOCSPGRP:
		return (fsetown(-(*(int *)data), &mpipe->pipe_sigio));

	/* This is deprecated, FIOGETOWN should be used instead. */
	case TIOCGPGRP:
		*(int *)data = -fgetown(mpipe->pipe_sigio);
		return (0);

	}
	return (ENOTTY);
}

/*
 * poll(2)/select(2) backend for pipes: compute readiness from the buffer
 * state and the EOF flags of both endpoints; record the thread for a
 * later wakeup when nothing is ready yet.
 */
int
pipe_poll(fp, events, cred, td)
	struct file *fp;
	int events;
	struct ucred *cred;
	struct thread *td;
{
	struct pipe *rpipe = (struct pipe *)fp->f_data;
	struct pipe *wpipe;
	int revents = 0;

	wpipe = rpipe->pipe_peer;
	/* Readable: pending direct write, buffered bytes, or EOF. */
	if (events & (POLLIN | POLLRDNORM))
		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
		    (rpipe->pipe_buffer.cnt > 0) ||
		    (rpipe->pipe_state & PIPE_EOF))
			revents |= events & (POLLIN | POLLRDNORM);

	/*
	 * Writable: peer gone or at EOF (a write would fail immediately),
	 * or no direct write in progress and at least PIPE_BUF bytes free
	 * so an atomic write can succeed.
	 */
	if (events & (POLLOUT | POLLWRNORM))
		if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) ||
		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
			revents |= events & (POLLOUT | POLLWRNORM);

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (wpipe == NULL) ||
	    (wpipe->pipe_state & PIPE_EOF))
		revents |= POLLHUP;

	if (revents == 0) {
		/* Nothing ready: register with the selinfo for wakeup. */
		if (events & (POLLIN | POLLRDNORM)) {
			selrecord(td, &rpipe->pipe_sel);
			rpipe->pipe_state |= PIPE_SEL;
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(td, &wpipe->pipe_sel);
wpipe->pipe_state |= PIPE_SEL;
		}
	}

	return (revents);
}

/*
 * fstat(2) backend for pipes: synthesize a FIFO-flavored struct stat
 * from the pipe buffer state and the opening credential.
 */
static int
pipe_stat(fp, ub, td)
	struct file *fp;
	struct stat *ub;
	struct thread *td;
{
	struct pipe *pipe = (struct pipe *)fp->f_data;

	bzero((caddr_t)ub, sizeof(*ub));
	ub->st_mode = S_IFIFO;
	ub->st_blksize = pipe->pipe_buffer.size;
	ub->st_size = pipe->pipe_buffer.cnt;
	/* Round the byte count up to whole "blocks" of st_blksize. */
	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
	ub->st_atimespec = pipe->pipe_atime;
	ub->st_mtimespec = pipe->pipe_mtime;
	ub->st_ctimespec = pipe->pipe_ctime;
	ub->st_uid = fp->f_cred->cr_uid;
	ub->st_gid = fp->f_cred->cr_gid;
	/*
	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
	 * XXX (st_dev, st_ino) should be unique.
	 */
	return (0);
}

/*
 * close(2) backend: detach the pipe from the file and tear it down.
 */
/* ARGSUSED */
static int
pipe_close(fp, td)
	struct file *fp;
	struct thread *td;
{
	struct pipe *cpipe = (struct pipe *)fp->f_data;

	/* Neutralize the file first so no further ops reach the pipe. */
	fp->f_ops = &badfileops;
	fp->f_data = NULL;
	funsetown(cpipe->pipe_sigio);
	pipeclose(cpipe);
	return (0);
}

/*
 * Release the kernel VM backing one pipe endpoint (ring buffer and,
 * unless PIPE_NODIRECT, the direct-write KVA window), keeping the
 * global nbigpipe/amountpipekva accounting in step.
 */
static void
pipe_free_kmem(cpipe)
	struct pipe *cpipe;
{

	GIANT_REQUIRED;

	if (cpipe->pipe_buffer.buffer != NULL) {
		if (cpipe->pipe_buffer.size > PIPE_SIZE)
			--nbigpipe;
		amountpipekva -= cpipe->pipe_buffer.size;
		kmem_free(kernel_map,
			(vm_offset_t)cpipe->pipe_buffer.buffer,
			cpipe->pipe_buffer.size);
		cpipe->pipe_buffer.buffer = NULL;
	}
#ifndef PIPE_NODIRECT
	if (cpipe->pipe_map.kva != NULL) {
		/* The direct-write window is buffer-size plus one guard page. */
		amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE;
		kmem_free(kernel_map,
			cpipe->pipe_map.kva,
			cpipe->pipe_buffer.size + PAGE_SIZE);
		cpipe->pipe_map.cnt = 0;
		cpipe->pipe_map.kva = 0;
		cpipe->pipe_map.pos = 0;
		cpipe->pipe_map.npages = 0;
	}
#endif
}

/*
 * shutdown the pipe
 */
static void
pipeclose(cpipe)
	struct pipe *cpipe;
{
	struct pipe *ppipe;

	if (cpipe) {

		pipeselwakeup(cpipe);

		/*
		 * If the other side is blocked, wake it up saying that
		 * we want to close it down.
*/
		/* Drain pipe_busy holders before freeing (tail of pipeclose()). */
		while (cpipe->pipe_busy) {
			wakeup(cpipe);
			cpipe->pipe_state |= PIPE_WANT | PIPE_EOF;
			tsleep(cpipe, PRIBIO, "pipecl", 0);
		}

		/*
		 * Disconnect from peer
		 */
		if ((ppipe = cpipe->pipe_peer) != NULL) {
			pipeselwakeup(ppipe);

			ppipe->pipe_state |= PIPE_EOF;
			wakeup(ppipe);
			KNOTE(&ppipe->pipe_sel.si_note, 0);
			ppipe->pipe_peer = NULL;
		}
		/*
		 * free resources
		 */
		pipe_free_kmem(cpipe);
		zfree(pipe_zone, cpipe);
	}
}

/*
 * kqueue(2) filter attach for pipes.  EVFILT_WRITE notes hang off the
 * peer (write) endpoint's selinfo list.
 */
/*ARGSUSED*/
static int
pipe_kqfilter(struct file *fp, struct knote *kn)
{
	/*
	 * NOTE(review): the '-'/'+' prefixed lines below are unified-diff
	 * hunk markers from the r89305 -> r89306 patch this file records,
	 * not C code; they are preserved verbatim.
	 */
-	struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;
+	struct pipe *cpipe;

+	cpipe = (struct pipe *)kn->kn_fp->f_data;
	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &pipe_rfiltops;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &pipe_wfiltops;
		cpipe = cpipe->pipe_peer;
		break;
	default:
		return (1);
	}

	kn->kn_hook = (caddr_t)cpipe;
	SLIST_INSERT_HEAD(&cpipe->pipe_sel.si_note, kn, kn_selnext);
	return (0);
}

/* Detach: unhook the knote from the selinfo list it was attached to. */
static void
filt_pipedetach(struct knote *kn)
{
	struct pipe *cpipe = (struct pipe *)kn->kn_hook;

	SLIST_REMOVE(&cpipe->pipe_sel.si_note, kn, knote, kn_selnext);
}

/*
 * EVFILT_READ event: report readable byte count; raise EV_EOF when
 * either end has hung up.
 */
/*ARGSUSED*/
static int
filt_piperead(struct knote *kn, long hint)
{
	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
	struct pipe *wpipe = rpipe->pipe_peer;

	kn->kn_data = rpipe->pipe_buffer.cnt;
	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
		kn->kn_data = rpipe->pipe_map.cnt;

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_flags |= EV_EOF;
		return (1);
	}
	return (kn->kn_data > 0);
}

/*
 * EVFILT_WRITE event: report free space; writable once PIPE_BUF bytes
 * fit (atomic-write guarantee), zero while a direct write is pending.
 */
/*ARGSUSED*/
static int
filt_pipewrite(struct knote *kn, long hint)
{
	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
	struct pipe *wpipe = rpipe->pipe_peer;

	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_data = 0;
		kn->kn_flags |= EV_EOF;
		return (1);
	}
	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
	if (wpipe->pipe_state & PIPE_DIRECTW)
		kn->kn_data = 0;

	return (kn->kn_data >= PIPE_BUF);
}
Index: head/sys/kern/sys_socket.c
=================================================================== --- head/sys/kern/sys_socket.c (revision 89305) +++ head/sys/kern/sys_socket.c (revision 89306) @@ -1,206 +1,207 @@ /* * Copyright (c) 1982, 1986, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)sys_socket.c 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #include #include #include #include #include #include #include /* XXX */ #include #include #include #include #include #include #include struct fileops socketops = { soo_read, soo_write, soo_ioctl, soo_poll, sokqfilter, soo_stat, soo_close }; /* ARGSUSED */ int soo_read(fp, uio, cred, flags, td) struct file *fp; struct uio *uio; struct ucred *cred; struct thread *td; int flags; { struct socket *so = (struct socket *)fp->f_data; return so->so_proto->pr_usrreqs->pru_soreceive(so, 0, uio, 0, 0, 0); } /* ARGSUSED */ int soo_write(fp, uio, cred, flags, td) struct file *fp; struct uio *uio; struct ucred *cred; struct thread *td; int flags; { struct socket *so = (struct socket *)fp->f_data; return so->so_proto->pr_usrreqs->pru_sosend(so, 0, uio, 0, 0, 0, uio->uio_td); } int soo_ioctl(fp, cmd, data, td) struct file *fp; u_long cmd; register caddr_t data; struct thread *td; { register struct socket *so = (struct socket *)fp->f_data; switch (cmd) { case FIONBIO: if (*(int *)data) so->so_state |= SS_NBIO; else so->so_state &= ~SS_NBIO; return (0); case FIOASYNC: if (*(int *)data) { so->so_state |= SS_ASYNC; so->so_rcv.sb_flags |= SB_ASYNC; so->so_snd.sb_flags |= SB_ASYNC; } else { so->so_state &= ~SS_ASYNC; so->so_rcv.sb_flags &= ~SB_ASYNC; so->so_snd.sb_flags &= ~SB_ASYNC; } return (0); case FIONREAD: *(int *)data = so->so_rcv.sb_cc; return (0); case FIOSETOWN: return (fsetown(*(int *)data, &so->so_sigio)); case FIOGETOWN: *(int *)data = fgetown(so->so_sigio); return (0); case SIOCSPGRP: return (fsetown(-(*(int *)data), &so->so_sigio)); case SIOCGPGRP: *(int *)data = -fgetown(so->so_sigio); return (0); case SIOCATMARK: *(int *)data = (so->so_state&SS_RCVATMARK) != 0; return (0); } /* * Interface/routing/protocol specific ioctls: * interface and routing ioctls should have a * different entry since a socket's unnecessary */ if (IOCGROUP(cmd) == 'i') return (ifioctl(so, cmd, data, td)); if (IOCGROUP(cmd) == 'r') return 
(rtioctl(cmd, data));
	/* Tail of soo_ioctl(): everything else goes to the protocol. */
	return ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data, 0, td));
}

/* poll(2) backend for sockets: defer entirely to the protocol. */
int
soo_poll(fp, events, cred, td)
	struct file *fp;
	int events;
	struct ucred *cred;
	struct thread *td;
{
	struct socket *so = (struct socket *)fp->f_data;
	return so->so_proto->pr_usrreqs->pru_sopoll(so, events, cred, td);
}

/*
 * fstat(2) backend for sockets: synthesize a socket-flavored stat with
 * read/write permission bits mirroring the CANTRCVMORE/CANTSENDMORE
 * state, then let the protocol's pru_sense fill in the rest.
 */
int
soo_stat(fp, ub, td)
	struct file *fp;
	struct stat *ub;
	struct thread *td;
{
	struct socket *so = (struct socket *)fp->f_data;

	bzero((caddr_t)ub, sizeof (*ub));
	ub->st_mode = S_IFSOCK;
	/*
	 * If SS_CANTRCVMORE is set, but there's still data left in the
	 * receive buffer, the socket is still readable.
	 */
	if ((so->so_state & SS_CANTRCVMORE) == 0 ||
	    so->so_rcv.sb_cc != 0)
		ub->st_mode |= S_IRUSR | S_IRGRP | S_IROTH;
	if ((so->so_state & SS_CANTSENDMORE) == 0)
		ub->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH;
	ub->st_size = so->so_rcv.sb_cc;
	ub->st_uid = so->so_cred->cr_uid;
	ub->st_gid = so->so_cred->cr_gid;
	return ((*so->so_proto->pr_usrreqs->pru_sense)(so, ub));
}

/*
 * API socket close on file pointer.  We call soclose() to close the
 * socket (including initiating closing protocols).  soclose() will
 * sorele() the file reference but the actual socket will not go away
 * until the socket's ref count hits 0.
 */
/* ARGSUSED */
int
soo_close(fp, td)
	struct file *fp;
	struct thread *td;
{
	int error = 0;
	struct socket *so;

	/*
	 * NOTE(review): the '-'/'+' prefixed lines below are unified-diff
	 * hunk markers from the r89305 -> r89306 patch, preserved verbatim:
	 * the patch hoists the f_data load above the badfileops swap.
	 */
+	so = (struct socket *)fp->f_data;
	fp->f_ops = &badfileops;
-	if ((so = (struct socket *)fp->f_data) != NULL) {
-		fp->f_data = NULL;
+	fp->f_data = 0;
+
+	if (so)
		error = soclose(so);
-	}
	return (error);
}
Index: head/sys/kern/uipc_syscalls.c
===================================================================
--- head/sys/kern/uipc_syscalls.c	(revision 89305)
+++ head/sys/kern/uipc_syscalls.c	(revision 89306)
@@ -1,1935 +1,1957 @@
/*
 * Copyright (c) 1982, 1986, 1989, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * sendfile(2) and related extensions:
 * Copyright (c) 1998, David Greenman.  All rights reserved.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 * $FreeBSD$ */ #include "opt_compat.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include static void sf_buf_init(void *arg); SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL) static struct sf_buf *sf_buf_alloc(void); static void sf_buf_free(caddr_t addr, void *args); static int sendit __P((struct thread *td, int s, struct msghdr *mp, int flags)); static int recvit __P((struct thread *td, int s, struct msghdr *mp, caddr_t namelenp)); static int accept1 __P((struct thread *td, struct accept_args *uap, int compat)); static int getsockname1 __P((struct thread *td, struct getsockname_args *uap, int compat)); static int getpeername1 __P((struct thread *td, struct getpeername_args *uap, int compat)); /* * Expanded sf_freelist head. Really an SLIST_HEAD() in disguise, with the * sf_freelist head with the sf_lock mutex. */ static struct { SLIST_HEAD(, sf_buf) sf_head; struct mtx sf_lock; } sf_freelist; static vm_offset_t sf_base; static struct sf_buf *sf_bufs; static u_int sf_buf_alloc_want; /* * System call interface to the socket abstraction. 
*/ #if defined(COMPAT_43) || defined(COMPAT_SUNOS) #define COMPAT_OLDSOCK #endif extern struct fileops socketops; /* * MPSAFE */ int socket(td, uap) struct thread *td; register struct socket_args /* { int domain; int type; int protocol; } */ *uap; { struct filedesc *fdp; struct socket *so; struct file *fp; int fd, error; mtx_lock(&Giant); fdp = td->td_proc->p_fd; error = falloc(td, &fp, &fd); if (error) goto done2; fhold(fp); error = socreate(uap->domain, &so, uap->type, uap->protocol, td->td_proc->p_ucred, td); + FILEDESC_LOCK(fdp); if (error) { if (fdp->fd_ofiles[fd] == fp) { fdp->fd_ofiles[fd] = NULL; + FILEDESC_UNLOCK(fdp); fdrop(fp, td); - } + } else + FILEDESC_UNLOCK(fdp); } else { fp->f_data = (caddr_t)so; /* already has ref count */ fp->f_flag = FREAD|FWRITE; fp->f_ops = &socketops; fp->f_type = DTYPE_SOCKET; + FILEDESC_UNLOCK(fdp); td->td_retval[0] = fd; } fdrop(fp, td); done2: mtx_unlock(&Giant); return (error); } /* * MPSAFE */ /* ARGSUSED */ int bind(td, uap) struct thread *td; register struct bind_args /* { int s; caddr_t name; int namelen; } */ *uap; { struct socket *so; struct sockaddr *sa; int error; mtx_lock(&Giant); if ((error = fgetsock(td, uap->s, &so, NULL)) != 0) goto done2; if ((error = getsockaddr(&sa, uap->name, uap->namelen)) != 0) goto done1; error = sobind(so, sa, td); FREE(sa, M_SONAME); done1: fputsock(so); done2: mtx_unlock(&Giant); return (error); } /* * MPSAFE */ /* ARGSUSED */ int listen(td, uap) struct thread *td; register struct listen_args /* { int s; int backlog; } */ *uap; { struct socket *so; int error; mtx_lock(&Giant); if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) { error = solisten(so, uap->backlog, td); fputsock(so); } mtx_unlock(&Giant); return(error); } /* * accept1() * MPSAFE */ static int accept1(td, uap, compat) struct thread *td; register struct accept_args /* { int s; caddr_t name; int *anamelen; } */ *uap; int compat; { struct filedesc *fdp; struct file *nfp = NULL; struct sockaddr *sa; int namelen, error, 
s; struct socket *head, *so; int fd; u_int fflag; mtx_lock(&Giant); fdp = td->td_proc->p_fd; if (uap->name) { error = copyin((caddr_t)uap->anamelen, (caddr_t)&namelen, sizeof (namelen)); if(error) goto done2; } error = fgetsock(td, uap->s, &head, &fflag); if (error) goto done2; s = splnet(); if ((head->so_options & SO_ACCEPTCONN) == 0) { splx(s); error = EINVAL; goto done; } if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) { splx(s); error = EWOULDBLOCK; goto done; } while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) { if (head->so_state & SS_CANTRCVMORE) { head->so_error = ECONNABORTED; break; } error = tsleep((caddr_t)&head->so_timeo, PSOCK | PCATCH, "accept", 0); if (error) { splx(s); goto done; } } if (head->so_error) { error = head->so_error; head->so_error = 0; splx(s); goto done; } /* * At this point we know that there is at least one connection * ready to be accepted. Remove it from the queue prior to * allocating the file descriptor for it since falloc() may * block allowing another process to accept the connection * instead. */ so = TAILQ_FIRST(&head->so_comp); TAILQ_REMOVE(&head->so_comp, so, so_list); head->so_qlen--; error = falloc(td, &nfp, &fd); if (error) { /* * Probably ran out of file descriptors. Put the * unaccepted connection back onto the queue and * do another wakeup so some other process might * have a chance at it. 
*/ TAILQ_INSERT_HEAD(&head->so_comp, so, so_list); head->so_qlen++; wakeup_one(&head->so_timeo); splx(s); goto done; } fhold(nfp); td->td_retval[0] = fd; /* connection has been removed from the listen queue */ KNOTE(&head->so_rcv.sb_sel.si_note, 0); so->so_state &= ~SS_COMP; so->so_head = NULL; if (head->so_sigio != NULL) fsetown(fgetown(head->so_sigio), &so->so_sigio); + FILE_LOCK(nfp); soref(so); /* file descriptor reference */ nfp->f_data = (caddr_t)so; /* nfp has ref count from falloc */ nfp->f_flag = fflag; nfp->f_ops = &socketops; nfp->f_type = DTYPE_SOCKET; + FILE_UNLOCK(nfp); sa = 0; error = soaccept(so, &sa); if (error) { /* * return a namelen of zero for older code which might * ignore the return value from accept. */ if (uap->name != NULL) { namelen = 0; (void) copyout((caddr_t)&namelen, (caddr_t)uap->anamelen, sizeof(*uap->anamelen)); } goto noconnection; } if (sa == NULL) { namelen = 0; if (uap->name) goto gotnoname; splx(s); error = 0; goto done; } if (uap->name) { /* check sa_len before it is destroyed */ if (namelen > sa->sa_len) namelen = sa->sa_len; #ifdef COMPAT_OLDSOCK if (compat) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif error = copyout(sa, (caddr_t)uap->name, (u_int)namelen); if (!error) gotnoname: error = copyout((caddr_t)&namelen, (caddr_t)uap->anamelen, sizeof (*uap->anamelen)); } noconnection: if (sa) FREE(sa, M_SONAME); /* * close the new descriptor, assuming someone hasn't ripped it * out from under us. */ if (error) { + FILEDESC_LOCK(fdp); if (fdp->fd_ofiles[fd] == nfp) { fdp->fd_ofiles[fd] = NULL; + FILEDESC_UNLOCK(fdp); fdrop(nfp, td); + } else { + FILEDESC_UNLOCK(fdp); } } splx(s); /* * Release explicitly held references before returning. 
*/ done: if (nfp != NULL) fdrop(nfp, td); fputsock(head); done2: mtx_unlock(&Giant); return (error); } /* * MPSAFE (accept1() is MPSAFE) */ int accept(td, uap) struct thread *td; struct accept_args *uap; { return (accept1(td, uap, 0)); } #ifdef COMPAT_OLDSOCK /* * MPSAFE (accept1() is MPSAFE) */ int oaccept(td, uap) struct thread *td; struct accept_args *uap; { return (accept1(td, uap, 1)); } #endif /* COMPAT_OLDSOCK */ /* * MPSAFE */ /* ARGSUSED */ int connect(td, uap) struct thread *td; register struct connect_args /* { int s; caddr_t name; int namelen; } */ *uap; { struct socket *so; struct sockaddr *sa; int error, s; mtx_lock(&Giant); if ((error = fgetsock(td, uap->s, &so, NULL)) != 0) goto done2; if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { error = EALREADY; goto done1; } error = getsockaddr(&sa, uap->name, uap->namelen); if (error) goto done1; error = soconnect(so, sa, td); if (error) goto bad; if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { FREE(sa, M_SONAME); error = EINPROGRESS; goto done1; } s = splnet(); while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { error = tsleep((caddr_t)&so->so_timeo, PSOCK | PCATCH, "connec", 0); if (error) break; } if (error == 0) { error = so->so_error; so->so_error = 0; } splx(s); bad: so->so_state &= ~SS_ISCONNECTING; FREE(sa, M_SONAME); if (error == ERESTART) error = EINTR; done1: fputsock(so); done2: mtx_unlock(&Giant); return (error); } /* * MPSAFE */ int socketpair(td, uap) struct thread *td; register struct socketpair_args /* { int domain; int type; int protocol; int *rsv; } */ *uap; { register struct filedesc *fdp = td->td_proc->p_fd; struct file *fp1, *fp2; struct socket *so1, *so2; int fd, error, sv[2]; mtx_lock(&Giant); error = socreate(uap->domain, &so1, uap->type, uap->protocol, td->td_proc->p_ucred, td); if (error) goto done2; error = socreate(uap->domain, &so2, uap->type, uap->protocol, td->td_proc->p_ucred, td); if (error) goto free1; error = falloc(td, 
&fp1, &fd); if (error) goto free2; fhold(fp1); sv[0] = fd; fp1->f_data = (caddr_t)so1; /* so1 already has ref count */ error = falloc(td, &fp2, &fd); if (error) goto free3; fhold(fp2); fp2->f_data = (caddr_t)so2; /* so2 already has ref count */ sv[1] = fd; error = soconnect2(so1, so2); if (error) goto free4; if (uap->type == SOCK_DGRAM) { /* * Datagram socket connection is asymmetric. */ error = soconnect2(so2, so1); if (error) goto free4; } - fp1->f_flag = fp2->f_flag = FREAD|FWRITE; - fp1->f_ops = fp2->f_ops = &socketops; - fp1->f_type = fp2->f_type = DTYPE_SOCKET; + FILE_LOCK(fp1); + fp1->f_flag = FREAD|FWRITE; + fp1->f_ops = &socketops; + fp1->f_type = DTYPE_SOCKET; + FILE_UNLOCK(fp1); + FILE_LOCK(fp2); + fp2->f_flag = FREAD|FWRITE; + fp2->f_ops = &socketops; + fp2->f_type = DTYPE_SOCKET; + FILE_UNLOCK(fp2); error = copyout((caddr_t)sv, (caddr_t)uap->rsv, 2 * sizeof (int)); fdrop(fp1, td); fdrop(fp2, td); goto done2; free4: + FILEDESC_LOCK(fdp); if (fdp->fd_ofiles[sv[1]] == fp2) { fdp->fd_ofiles[sv[1]] = NULL; + FILEDESC_UNLOCK(fdp); fdrop(fp2, td); - } + } else + FILEDESC_UNLOCK(fdp); fdrop(fp2, td); free3: + FILEDESC_LOCK(fdp); if (fdp->fd_ofiles[sv[0]] == fp1) { fdp->fd_ofiles[sv[0]] = NULL; + FILEDESC_UNLOCK(fdp); fdrop(fp1, td); - } + } else + FILEDESC_UNLOCK(fdp); fdrop(fp1, td); free2: (void)soclose(so2); free1: (void)soclose(so1); done2: mtx_unlock(&Giant); return (error); } static int sendit(td, s, mp, flags) register struct thread *td; int s; register struct msghdr *mp; int flags; { struct uio auio; register struct iovec *iov; register int i; struct mbuf *control; struct sockaddr *to = NULL; int len, error; struct socket *so; #ifdef KTRACE struct iovec *ktriov = NULL; struct uio ktruio; #endif if ((error = fgetsock(td, s, &so, NULL)) != 0) return (error); auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; iov = 
mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { if ((auio.uio_resid += iov->iov_len) < 0) { error = EINVAL; goto bad; } } if (mp->msg_name) { error = getsockaddr(&to, mp->msg_name, mp->msg_namelen); if (error) goto bad; } if (mp->msg_control) { if (mp->msg_controllen < sizeof(struct cmsghdr) #ifdef COMPAT_OLDSOCK && mp->msg_flags != MSG_COMPAT #endif ) { error = EINVAL; goto bad; } error = sockargs(&control, mp->msg_control, mp->msg_controllen, MT_CONTROL); if (error) goto bad; #ifdef COMPAT_OLDSOCK if (mp->msg_flags == MSG_COMPAT) { register struct cmsghdr *cm; M_PREPEND(control, sizeof(*cm), M_TRYWAIT); if (control == 0) { error = ENOBUFS; goto bad; } else { cm = mtod(control, struct cmsghdr *); cm->cmsg_len = control->m_len; cm->cmsg_level = SOL_SOCKET; cm->cmsg_type = SCM_RIGHTS; } } #endif } else { control = 0; } #ifdef KTRACE if (KTRPOINT(td->td_proc, KTR_GENIO)) { int iovlen = auio.uio_iovcnt * sizeof (struct iovec); MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); ktruio = auio; } #endif len = auio.uio_resid; error = so->so_proto->pr_usrreqs->pru_sosend(so, to, &auio, 0, control, flags, td); if (error) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; if (error == EPIPE) { PROC_LOCK(td->td_proc); psignal(td->td_proc, SIGPIPE); PROC_UNLOCK(td->td_proc); } } if (error == 0) td->td_retval[0] = len - auio.uio_resid; #ifdef KTRACE if (ktriov != NULL) { if (error == 0) { ktruio.uio_iov = ktriov; ktruio.uio_resid = td->td_retval[0]; ktrgenio(td->td_proc->p_tracep, s, UIO_WRITE, &ktruio, error); } FREE(ktriov, M_TEMP); } #endif bad: fputsock(so); if (to) FREE(to, M_SONAME); return (error); } /* * MPSAFE */ int sendto(td, uap) struct thread *td; register struct sendto_args /* { int s; caddr_t buf; size_t len; int flags; caddr_t to; int tolen; } */ *uap; { struct msghdr msg; struct iovec aiov; int error; msg.msg_name = uap->to; 
msg.msg_namelen = uap->tolen; msg.msg_iov = &aiov; msg.msg_iovlen = 1; msg.msg_control = 0; #ifdef COMPAT_OLDSOCK msg.msg_flags = 0; #endif aiov.iov_base = uap->buf; aiov.iov_len = uap->len; mtx_lock(&Giant); error = sendit(td, uap->s, &msg, uap->flags); mtx_unlock(&Giant); return (error); } #ifdef COMPAT_OLDSOCK /* * MPSAFE */ int osend(td, uap) struct thread *td; register struct osend_args /* { int s; caddr_t buf; int len; int flags; } */ *uap; { struct msghdr msg; struct iovec aiov; int error; msg.msg_name = 0; msg.msg_namelen = 0; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = 0; mtx_lock(&Giant); error = sendit(td, uap->s, &msg, uap->flags); mtx_unlock(&Giant); return (error); } /* * MPSAFE */ int osendmsg(td, uap) struct thread *td; register struct osendmsg_args /* { int s; caddr_t msg; int flags; } */ *uap; { struct msghdr msg; struct iovec aiov[UIO_SMALLIOV], *iov; int error; mtx_lock(&Giant); error = copyin(uap->msg, (caddr_t)&msg, sizeof (struct omsghdr)); if (error) goto done2; if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) { error = EMSGSIZE; goto done2; } MALLOC(iov, struct iovec *, sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, M_WAITOK); } else { iov = aiov; } error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov, (unsigned)(msg.msg_iovlen * sizeof (struct iovec))); if (error) goto done; msg.msg_flags = MSG_COMPAT; msg.msg_iov = iov; error = sendit(td, uap->s, &msg, uap->flags); done: if (iov != aiov) FREE(iov, M_IOV); done2: mtx_unlock(&Giant); return (error); } #endif /* * MPSAFE */ int sendmsg(td, uap) struct thread *td; register struct sendmsg_args /* { int s; caddr_t msg; int flags; } */ *uap; { struct msghdr msg; struct iovec aiov[UIO_SMALLIOV], *iov; int error; mtx_lock(&Giant); error = copyin(uap->msg, (caddr_t)&msg, sizeof (msg)); if (error) goto done2; if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { if 
((u_int)msg.msg_iovlen >= UIO_MAXIOV) { error = EMSGSIZE; goto done2; } MALLOC(iov, struct iovec *, sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, M_WAITOK); } else { iov = aiov; } if (msg.msg_iovlen && (error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov, (unsigned)(msg.msg_iovlen * sizeof (struct iovec))))) goto done; msg.msg_iov = iov; #ifdef COMPAT_OLDSOCK msg.msg_flags = 0; #endif error = sendit(td, uap->s, &msg, uap->flags); done: if (iov != aiov) FREE(iov, M_IOV); done2: mtx_unlock(&Giant); return (error); } static int recvit(td, s, mp, namelenp) register struct thread *td; int s; register struct msghdr *mp; caddr_t namelenp; { struct uio auio; register struct iovec *iov; register int i; int len, error; struct mbuf *m, *control = 0; caddr_t ctlbuf; struct socket *so; struct sockaddr *fromsa = 0; #ifdef KTRACE struct iovec *ktriov = NULL; struct uio ktruio; #endif if ((error = fgetsock(td, s, &so, NULL)) != 0) return (error); auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { if ((auio.uio_resid += iov->iov_len) < 0) { fputsock(so); return (EINVAL); } } #ifdef KTRACE if (KTRPOINT(td->td_proc, KTR_GENIO)) { int iovlen = auio.uio_iovcnt * sizeof (struct iovec); MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); ktruio = auio; } #endif len = auio.uio_resid; error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, &auio, (struct mbuf **)0, mp->msg_control ? 
&control : (struct mbuf **)0, &mp->msg_flags); if (error) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } #ifdef KTRACE if (ktriov != NULL) { if (error == 0) { ktruio.uio_iov = ktriov; ktruio.uio_resid = len - auio.uio_resid; ktrgenio(td->td_proc->p_tracep, s, UIO_READ, &ktruio, error); } FREE(ktriov, M_TEMP); } #endif if (error) goto out; td->td_retval[0] = len - auio.uio_resid; if (mp->msg_name) { len = mp->msg_namelen; if (len <= 0 || fromsa == 0) len = 0; else { #ifndef MIN #define MIN(a,b) ((a)>(b)?(b):(a)) #endif /* save sa_len before it is destroyed by MSG_COMPAT */ len = MIN(len, fromsa->sa_len); #ifdef COMPAT_OLDSOCK if (mp->msg_flags & MSG_COMPAT) ((struct osockaddr *)fromsa)->sa_family = fromsa->sa_family; #endif error = copyout(fromsa, (caddr_t)mp->msg_name, (unsigned)len); if (error) goto out; } mp->msg_namelen = len; if (namelenp && (error = copyout((caddr_t)&len, namelenp, sizeof (int)))) { #ifdef COMPAT_OLDSOCK if (mp->msg_flags & MSG_COMPAT) error = 0; /* old recvfrom didn't check */ else #endif goto out; } } if (mp->msg_control) { #ifdef COMPAT_OLDSOCK /* * We assume that old recvmsg calls won't receive access * rights and other control info, esp. as control info * is always optional and those options didn't exist in 4.3. * If we receive rights, trim the cmsghdr; anything else * is tossed. 
*/ if (control && mp->msg_flags & MSG_COMPAT) { if (mtod(control, struct cmsghdr *)->cmsg_level != SOL_SOCKET || mtod(control, struct cmsghdr *)->cmsg_type != SCM_RIGHTS) { mp->msg_controllen = 0; goto out; } control->m_len -= sizeof (struct cmsghdr); control->m_data += sizeof (struct cmsghdr); } #endif len = mp->msg_controllen; m = control; mp->msg_controllen = 0; ctlbuf = (caddr_t) mp->msg_control; while (m && len > 0) { unsigned int tocopy; if (len >= m->m_len) tocopy = m->m_len; else { mp->msg_flags |= MSG_CTRUNC; tocopy = len; } if ((error = copyout((caddr_t)mtod(m, caddr_t), ctlbuf, tocopy)) != 0) goto out; ctlbuf += tocopy; len -= tocopy; m = m->m_next; } mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control; } out: fputsock(so); if (fromsa) FREE(fromsa, M_SONAME); if (control) m_freem(control); return (error); } /* * MPSAFE */ int recvfrom(td, uap) struct thread *td; register struct recvfrom_args /* { int s; caddr_t buf; size_t len; int flags; caddr_t from; int *fromlenaddr; } */ *uap; { struct msghdr msg; struct iovec aiov; int error; mtx_lock(&Giant); if (uap->fromlenaddr) { error = copyin((caddr_t)uap->fromlenaddr, (caddr_t)&msg.msg_namelen, sizeof (msg.msg_namelen)); if (error) goto done2; } else { msg.msg_namelen = 0; } msg.msg_name = uap->from; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = uap->flags; error = recvit(td, uap->s, &msg, (caddr_t)uap->fromlenaddr); done2: mtx_unlock(&Giant); return(error); } #ifdef COMPAT_OLDSOCK /* * MPSAFE */ int orecvfrom(td, uap) struct thread *td; struct recvfrom_args *uap; { uap->flags |= MSG_COMPAT; return (recvfrom(td, uap)); } #endif #ifdef COMPAT_OLDSOCK /* * MPSAFE */ int orecv(td, uap) struct thread *td; register struct orecv_args /* { int s; caddr_t buf; int len; int flags; } */ *uap; { struct msghdr msg; struct iovec aiov; int error; mtx_lock(&Giant); msg.msg_name = 0; msg.msg_namelen = 0; msg.msg_iov = &aiov; msg.msg_iovlen 
= 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = uap->flags; error = recvit(td, uap->s, &msg, (caddr_t)0); mtx_unlock(&Giant); return (error); } /* * Old recvmsg. This code takes advantage of the fact that the old msghdr * overlays the new one, missing only the flags, and with the (old) access * rights where the control fields are now. * * MPSAFE */ int orecvmsg(td, uap) struct thread *td; register struct orecvmsg_args /* { int s; struct omsghdr *msg; int flags; } */ *uap; { struct msghdr msg; struct iovec aiov[UIO_SMALLIOV], *iov; int error; error = copyin((caddr_t)uap->msg, (caddr_t)&msg, sizeof (struct omsghdr)); if (error) return (error); mtx_lock(&Giant); if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) { error = EMSGSIZE; goto done2; } MALLOC(iov, struct iovec *, sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, M_WAITOK); } else { iov = aiov; } msg.msg_flags = uap->flags | MSG_COMPAT; error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov, (unsigned)(msg.msg_iovlen * sizeof (struct iovec))); if (error) goto done; msg.msg_iov = iov; error = recvit(td, uap->s, &msg, (caddr_t)&uap->msg->msg_namelen); if (msg.msg_controllen && error == 0) error = copyout((caddr_t)&msg.msg_controllen, (caddr_t)&uap->msg->msg_accrightslen, sizeof (int)); done: if (iov != aiov) FREE(iov, M_IOV); done2: mtx_unlock(&Giant); return (error); } #endif /* * MPSAFE */ int recvmsg(td, uap) struct thread *td; register struct recvmsg_args /* { int s; struct msghdr *msg; int flags; } */ *uap; { struct msghdr msg; struct iovec aiov[UIO_SMALLIOV], *uiov, *iov; register int error; mtx_lock(&Giant); error = copyin((caddr_t)uap->msg, (caddr_t)&msg, sizeof (msg)); if (error) goto done2; if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) { error = EMSGSIZE; goto done2; } MALLOC(iov, struct iovec *, sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, M_WAITOK); } else { iov = aiov; } 
#ifdef COMPAT_OLDSOCK
	/* Strip the old-socket compat flag; recvmsg(2) is the new ABI. */
	msg.msg_flags = uap->flags &~ MSG_COMPAT;
#else
	msg.msg_flags = uap->flags;
#endif
	/*
	 * Remember the user's iovec pointer, then substitute the kernel
	 * copy for recvit(); the user pointer is restored before the
	 * updated msghdr is copied back out.
	 */
	uiov = msg.msg_iov;
	msg.msg_iov = iov;
	error = copyin((caddr_t)uiov, (caddr_t)iov,
	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
	if (error)
		goto done;
	error = recvit(td, uap->s, &msg, (caddr_t)0);
	if (!error) {
		msg.msg_iov = uiov;
		error = copyout((caddr_t)&msg, (caddr_t)uap->msg, sizeof(msg));
	}
done:
	if (iov != aiov)
		FREE(iov, M_IOV);
done2:
	mtx_unlock(&Giant);
	return (error);
}

/*
 * shutdown(2): disable further sends and/or receives on a socket.
 *
 * MPSAFE
 */
/* ARGSUSED */
int
shutdown(td, uap)
	struct thread *td;
	register struct shutdown_args /* {
		int	s;
		int	how;
	} */ *uap;
{
	struct socket *so;
	int error;

	mtx_lock(&Giant);
	if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) {
		error = soshutdown(so, uap->how);
		fputsock(so);
	}
	mtx_unlock(&Giant);
	return(error);
}

/*
 * setsockopt(2): set a socket option.
 *
 * MPSAFE
 */
/* ARGSUSED */
int
setsockopt(td, uap)
	struct thread *td;
	register struct setsockopt_args /* {
		int	s;
		int	level;
		int	name;
		caddr_t	val;
		int	valsize;
	} */ *uap;
{
	struct socket *so;
	struct sockopt sopt;
	int error;

	/* A NULL value buffer is only legal when no bytes are supplied. */
	if (uap->val == 0 && uap->valsize != 0)
		return (EFAULT);
	if (uap->valsize < 0)
		return (EINVAL);

	mtx_lock(&Giant);
	if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) {
		sopt.sopt_dir = SOPT_SET;
		sopt.sopt_level = uap->level;
		sopt.sopt_name = uap->name;
		sopt.sopt_val = uap->val;
		sopt.sopt_valsize = uap->valsize;
		sopt.sopt_td = td;
		error = sosetopt(so, &sopt);
		fputsock(so);
	}
	mtx_unlock(&Giant);
	return(error);
}

/*
 * getsockopt(2): fetch a socket option; *avalsize is value/result.
 *
 * MPSAFE
 */
/* ARGSUSED */
int
getsockopt(td, uap)
	struct thread *td;
	register struct getsockopt_args /* {
		int	s;
		int	level;
		int	name;
		caddr_t	val;
		int	*avalsize;
	} */ *uap;
{
	int	valsize, error;
	struct	socket *so;
	struct	sockopt sopt;

	mtx_lock(&Giant);
	if ((error = fgetsock(td, uap->s, &so, NULL)) != 0)
		goto done2;
	if (uap->val) {
		error = copyin((caddr_t)uap->avalsize, (caddr_t)&valsize,
		    sizeof (valsize));
		if (error)
			goto done1;
		if (valsize < 0) {
			error = EINVAL;
			goto done1;
		}
	} else {
		/* Caller only probes; report how much space is needed. */
		valsize = 0;
	}
	sopt.sopt_dir = SOPT_GET;
sopt.sopt_level = uap->level; sopt.sopt_name = uap->name; sopt.sopt_val = uap->val; sopt.sopt_valsize = (size_t)valsize; /* checked non-negative above */ sopt.sopt_td = td; error = sogetopt(so, &sopt); if (error == 0) { valsize = sopt.sopt_valsize; error = copyout((caddr_t)&valsize, (caddr_t)uap->avalsize, sizeof (valsize)); } done1: fputsock(so); done2: mtx_unlock(&Giant); return (error); } /* * getsockname1() - Get socket name. * * MPSAFE */ /* ARGSUSED */ static int getsockname1(td, uap, compat) struct thread *td; register struct getsockname_args /* { int fdes; caddr_t asa; int *alen; } */ *uap; int compat; { struct socket *so; struct sockaddr *sa; int len, error; mtx_lock(&Giant); if ((error = fgetsock(td, uap->fdes, &so, NULL)) != 0) goto done2; error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len)); if (error) goto done1; sa = 0; error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa); if (error) goto bad; if (sa == 0) { len = 0; goto gotnothing; } len = MIN(len, sa->sa_len); #ifdef COMPAT_OLDSOCK if (compat) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif error = copyout(sa, (caddr_t)uap->asa, (u_int)len); if (error == 0) gotnothing: error = copyout((caddr_t)&len, (caddr_t)uap->alen, sizeof (len)); bad: if (sa) FREE(sa, M_SONAME); done1: fputsock(so); done2: mtx_unlock(&Giant); return (error); } /* * MPSAFE */ int getsockname(td, uap) struct thread *td; struct getsockname_args *uap; { return (getsockname1(td, uap, 0)); } #ifdef COMPAT_OLDSOCK /* * MPSAFE */ int ogetsockname(td, uap) struct thread *td; struct getsockname_args *uap; { return (getsockname1(td, uap, 1)); } #endif /* COMPAT_OLDSOCK */ /* * getpeername1() - Get name of peer for connected socket. 
* * MPSAFE */ /* ARGSUSED */ static int getpeername1(td, uap, compat) struct thread *td; register struct getpeername_args /* { int fdes; caddr_t asa; int *alen; } */ *uap; int compat; { struct socket *so; struct sockaddr *sa; int len, error; mtx_lock(&Giant); if ((error = fgetsock(td, uap->fdes, &so, NULL)) != 0) goto done2; if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) { error = ENOTCONN; goto done1; } error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len)); if (error) goto done1; sa = 0; error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &sa); if (error) goto bad; if (sa == 0) { len = 0; goto gotnothing; } len = MIN(len, sa->sa_len); #ifdef COMPAT_OLDSOCK if (compat) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif error = copyout(sa, (caddr_t)uap->asa, (u_int)len); if (error) goto bad; gotnothing: error = copyout((caddr_t)&len, (caddr_t)uap->alen, sizeof (len)); bad: if (sa) FREE(sa, M_SONAME); done1: fputsock(so); done2: mtx_unlock(&Giant); return (error); } /* * MPSAFE */ int getpeername(td, uap) struct thread *td; struct getpeername_args *uap; { return (getpeername1(td, uap, 0)); } #ifdef COMPAT_OLDSOCK /* * MPSAFE */ int ogetpeername(td, uap) struct thread *td; struct ogetpeername_args *uap; { /* XXX uap should have type `getpeername_args *' to begin with. */ return (getpeername1(td, (struct getpeername_args *)uap, 1)); } #endif /* COMPAT_OLDSOCK */ int sockargs(mp, buf, buflen, type) struct mbuf **mp; caddr_t buf; int buflen, type; { register struct sockaddr *sa; register struct mbuf *m; int error; if ((u_int)buflen > MLEN) { #ifdef COMPAT_OLDSOCK if (type == MT_SONAME && (u_int)buflen <= 112) buflen = MLEN; /* unix domain compat. 
hack */ else #endif return (EINVAL); } m = m_get(M_TRYWAIT, type); if (m == NULL) return (ENOBUFS); m->m_len = buflen; error = copyin(buf, mtod(m, caddr_t), (u_int)buflen); if (error) (void) m_free(m); else { *mp = m; if (type == MT_SONAME) { sa = mtod(m, struct sockaddr *); #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN if (sa->sa_family == 0 && sa->sa_len < AF_MAX) sa->sa_family = sa->sa_len; #endif sa->sa_len = buflen; } } return (error); } int getsockaddr(namp, uaddr, len) struct sockaddr **namp; caddr_t uaddr; size_t len; { struct sockaddr *sa; int error; if (len > SOCK_MAXADDRLEN) return ENAMETOOLONG; MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK); error = copyin(uaddr, sa, len); if (error) { FREE(sa, M_SONAME); } else { #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN if (sa->sa_family == 0 && sa->sa_len < AF_MAX) sa->sa_family = sa->sa_len; #endif sa->sa_len = len; *namp = sa; } return error; } /* * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-)) * XXX - The sf_buf functions are currently private to sendfile(2), so have * been made static, but may be useful in the future for doing zero-copy in * other parts of the networking code. */ static void sf_buf_init(void *arg) { int i; mtx_init(&sf_freelist.sf_lock, "sf_bufs list lock", MTX_DEF); mtx_lock(&sf_freelist.sf_lock); SLIST_INIT(&sf_freelist.sf_head); sf_base = kmem_alloc_pageable(kernel_map, nsfbufs * PAGE_SIZE); sf_bufs = malloc(nsfbufs * sizeof(struct sf_buf), M_TEMP, M_NOWAIT | M_ZERO); for (i = 0; i < nsfbufs; i++) { sf_bufs[i].kva = sf_base + i * PAGE_SIZE; SLIST_INSERT_HEAD(&sf_freelist.sf_head, &sf_bufs[i], free_list); } sf_buf_alloc_want = 0; mtx_unlock(&sf_freelist.sf_lock); } /* * Get an sf_buf from the freelist. Will block if none are available. 
*/
static struct sf_buf *
sf_buf_alloc()
{
	struct sf_buf *sf;
	int error;

	mtx_lock(&sf_freelist.sf_lock);
	while ((sf = SLIST_FIRST(&sf_freelist.sf_head)) == NULL) {
		sf_buf_alloc_want++;
		error = msleep(&sf_freelist, &sf_freelist.sf_lock, PVM|PCATCH,
		    "sfbufa", 0);
		sf_buf_alloc_want--;

		/*
		 * If we got a signal, don't risk going back to sleep.
		 */
		if (error)
			break;
	}
	/* sf is still NULL here only if msleep() was interrupted. */
	if (sf != NULL)
		SLIST_REMOVE_HEAD(&sf_freelist.sf_head, free_list);
	mtx_unlock(&sf_freelist.sf_lock);
	return (sf);
}

/* Map a kva inside the sf_base region back to its sf_buf descriptor. */
#define dtosf(x)	(&sf_bufs[((uintptr_t)(x) - (uintptr_t)sf_base) >> PAGE_SHIFT])

/*
 * Detach the mapped page and release resources back to the system.
 * Runs as the mbuf external-storage free routine (see MEXTADD in
 * sendfile()), so `addr' is the sf_buf's kva and `args' is unused.
 */
static void
sf_buf_free(caddr_t addr, void *args)
{
	struct sf_buf *sf;
	struct vm_page *m;

	GIANT_REQUIRED;

	sf = dtosf(addr);
	pmap_qremove((vm_offset_t)addr, 1);
	m = sf->m;
	vm_page_unwire(m, 0);
	/*
	 * Check for the object going away on us. This can
	 * happen since we don't hold a reference to it.
	 * If so, we're responsible for freeing the page.
	 */
	if (m->wire_count == 0 && m->object == NULL)
		vm_page_free(m);
	sf->m = NULL;
	mtx_lock(&sf_freelist.sf_lock);
	SLIST_INSERT_HEAD(&sf_freelist.sf_head, sf, free_list);
	/* Wake one sleeper in sf_buf_alloc(), if anyone is waiting. */
	if (sf_buf_alloc_want > 0)
		wakeup_one(&sf_freelist);
	mtx_unlock(&sf_freelist.sf_lock);
}

/*
 * sendfile(2)
 *
 * MPSAFE
 *
 * int sendfile(int fd, int s, off_t offset, size_t nbytes,
 *		struct sf_hdtr *hdtr, off_t *sbytes, int flags)
 *
 * Send a file specified by 'fd' and starting at 'offset' to a socket
 * specified by 's'.  Send only 'nbytes' of the file or until EOF if
 * nbytes == 0.  Optionally add a header and/or trailer to the socket
 * output.  If specified, write the total number of bytes sent into *sbytes.
* */ int sendfile(struct thread *td, struct sendfile_args *uap) { struct vnode *vp; struct vm_object *obj; struct socket *so = NULL; struct mbuf *m; struct sf_buf *sf; struct vm_page *pg; struct writev_args nuap; struct sf_hdtr hdtr; off_t off, xfsize, sbytes = 0; int error, s; mtx_lock(&Giant); /* * The descriptor must be a regular file and have a backing VM object. */ if ((error = fgetvp_read(td, uap->fd, &vp)) != 0) goto done; if (vp->v_type != VREG || VOP_GETVOBJECT(vp, &obj) != 0) { error = EINVAL; goto done; } if ((error = fgetsock(td, uap->s, &so, NULL)) != 0) goto done; if (so->so_type != SOCK_STREAM) { error = EINVAL; goto done; } if ((so->so_state & SS_ISCONNECTED) == 0) { error = ENOTCONN; goto done; } if (uap->offset < 0) { error = EINVAL; goto done; } /* * If specified, get the pointer to the sf_hdtr struct for * any headers/trailers. */ if (uap->hdtr != NULL) { error = copyin(uap->hdtr, &hdtr, sizeof(hdtr)); if (error) goto done; /* * Send any headers. Wimp out and use writev(2). */ if (hdtr.headers != NULL) { nuap.fd = uap->s; nuap.iovp = hdtr.headers; nuap.iovcnt = hdtr.hdr_cnt; error = writev(td, &nuap); if (error) goto done; sbytes += td->td_retval[0]; } } /* * Protect against multiple writers to the socket. */ (void) sblock(&so->so_snd, M_WAITOK); /* * Loop through the pages in the file, starting with the requested * offset. Get a file page (do I/O if necessary), map the file page * into an sf_buf, attach an mbuf header to the sf_buf, and queue * it on the socket. */ for (off = uap->offset; ; off += xfsize, sbytes += xfsize) { vm_pindex_t pindex; vm_offset_t pgoff; pindex = OFF_TO_IDX(off); retry_lookup: /* * Calculate the amount to transfer. Not to exceed a page, * the EOF, or the passed in nbytes. 
*/ xfsize = obj->un_pager.vnp.vnp_size - off; if (xfsize > PAGE_SIZE) xfsize = PAGE_SIZE; pgoff = (vm_offset_t)(off & PAGE_MASK); if (PAGE_SIZE - pgoff < xfsize) xfsize = PAGE_SIZE - pgoff; if (uap->nbytes && xfsize > (uap->nbytes - sbytes)) xfsize = uap->nbytes - sbytes; if (xfsize <= 0) break; /* * Optimize the non-blocking case by looking at the socket space * before going to the extra work of constituting the sf_buf. */ if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) { if (so->so_state & SS_CANTSENDMORE) error = EPIPE; else error = EAGAIN; sbunlock(&so->so_snd); goto done; } /* * Attempt to look up the page. * * Allocate if not found * * Wait and loop if busy. */ pg = vm_page_lookup(obj, pindex); if (pg == NULL) { pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL); if (pg == NULL) { VM_WAIT; goto retry_lookup; } vm_page_wakeup(pg); } else if (vm_page_sleep_busy(pg, TRUE, "sfpbsy")) { goto retry_lookup; } /* * Wire the page so it does not get ripped out from under * us. */ vm_page_wire(pg); /* * If page is not valid for what we need, initiate I/O */ if (!pg->valid || !vm_page_is_valid(pg, pgoff, xfsize)) { struct uio auio; struct iovec aiov; int bsize; /* * Ensure that our page is still around when the I/O * completes. */ vm_page_io_start(pg); /* * Get the page from backing store. */ bsize = vp->v_mount->mnt_stat.f_iosize; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = 0; aiov.iov_len = MAXBSIZE; auio.uio_resid = MAXBSIZE; auio.uio_offset = trunc_page(off); auio.uio_segflg = UIO_NOCOPY; auio.uio_rw = UIO_READ; auio.uio_td = td; vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, td); error = VOP_READ(vp, &auio, IO_VMIO | ((MAXBSIZE / bsize) << 16), td->td_proc->p_ucred); VOP_UNLOCK(vp, 0, td); vm_page_flag_clear(pg, PG_ZERO); vm_page_io_finish(pg); if (error) { vm_page_unwire(pg, 0); /* * See if anyone else might know about this page. * If not and it is not valid, then free it. 
*/ if (pg->wire_count == 0 && pg->valid == 0 && pg->busy == 0 && !(pg->flags & PG_BUSY) && pg->hold_count == 0) { vm_page_busy(pg); vm_page_free(pg); } sbunlock(&so->so_snd); goto done; } } /* * Get a sendfile buf. We usually wait as long as necessary, * but this wait can be interrupted. */ if ((sf = sf_buf_alloc()) == NULL) { vm_page_unwire(pg, 0); if (pg->wire_count == 0 && pg->object == NULL) vm_page_free(pg); sbunlock(&so->so_snd); error = EINTR; goto done; } /* * Allocate a kernel virtual page and insert the physical page * into it. */ sf->m = pg; pmap_qenter(sf->kva, &pg, 1); /* * Get an mbuf header and set it up as having external storage. */ MGETHDR(m, M_TRYWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; sf_buf_free((void *)sf->kva, NULL); sbunlock(&so->so_snd); goto done; } /* * Setup external storage for mbuf. */ MEXTADD(m, sf->kva, PAGE_SIZE, sf_buf_free, NULL, M_RDONLY, EXT_SFBUF); m->m_data = (char *) sf->kva + pgoff; m->m_pkthdr.len = m->m_len = xfsize; /* * Add the buffer to the socket buffer chain. */ s = splnet(); retry_space: /* * Make sure that the socket is still able to take more data. * CANTSENDMORE being true usually means that the connection * was closed. so_error is true when an error was sensed after * a previous send. * The state is checked after the page mapping and buffer * allocation above since those operations may block and make * any socket checks stale. From this point forward, nothing * blocks before the pru_send (or more accurately, any blocking * results in a loop back to here to re-check). */ if ((so->so_state & SS_CANTSENDMORE) || so->so_error) { if (so->so_state & SS_CANTSENDMORE) { error = EPIPE; } else { error = so->so_error; so->so_error = 0; } m_freem(m); sbunlock(&so->so_snd); splx(s); goto done; } /* * Wait for socket space to become available. We do this just * after checking the connection state above in order to avoid * a race condition with sbwait(). 
*/ if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) { if (so->so_state & SS_NBIO) { m_freem(m); sbunlock(&so->so_snd); splx(s); error = EAGAIN; goto done; } error = sbwait(&so->so_snd); /* * An error from sbwait usually indicates that we've * been interrupted by a signal. If we've sent anything * then return bytes sent, otherwise return the error. */ if (error) { m_freem(m); sbunlock(&so->so_snd); splx(s); goto done; } goto retry_space; } error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, td); splx(s); if (error) { sbunlock(&so->so_snd); goto done; } } sbunlock(&so->so_snd); /* * Send trailers. Wimp out and use writev(2). */ if (uap->hdtr != NULL && hdtr.trailers != NULL) { nuap.fd = uap->s; nuap.iovp = hdtr.trailers; nuap.iovcnt = hdtr.trl_cnt; error = writev(td, &nuap); if (error) goto done; sbytes += td->td_retval[0]; } done: /* * If there was no error we have to clear td->td_retval[0] * because it may have been set by writev. */ if (error == 0) { td->td_retval[0] = 0; } if (uap->sbytes != NULL) { copyout(&sbytes, uap->sbytes, sizeof(off_t)); } if (vp) vrele(vp); if (so) fputsock(so); mtx_unlock(&Giant); return (error); } - Index: head/sys/kern/uipc_usrreq.c =================================================================== --- head/sys/kern/uipc_usrreq.c (revision 89305) +++ head/sys/kern/uipc_usrreq.c (revision 89306) @@ -1,1478 +1,1518 @@ /* * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: @(#)uipc_usrreq.c 8.3 (Berkeley) 1/4/94 * $FreeBSD$ */ #include #include #include #include #include #include #include #include /* XXX must be before */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include static struct vm_zone *unp_zone; static unp_gen_t unp_gencnt; static u_int unp_count; static struct unp_head unp_shead, unp_dhead; /* * Unix communications domain. 
* * TODO: * SEQPACKET, RDM * rethink name space problems * need a proper out-of-band * lock pushdown */ static struct sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL }; static ino_t unp_ino; /* prototype for fake inode numbers */ static int unp_attach __P((struct socket *)); static void unp_detach __P((struct unpcb *)); static int unp_bind __P((struct unpcb *,struct sockaddr *, struct thread *)); static int unp_connect __P((struct socket *,struct sockaddr *, struct thread *)); static void unp_disconnect __P((struct unpcb *)); static void unp_shutdown __P((struct unpcb *)); static void unp_drop __P((struct unpcb *, int)); static void unp_gc __P((void)); static void unp_scan __P((struct mbuf *, void (*)(struct file *))); static void unp_mark __P((struct file *)); static void unp_discard __P((struct file *)); static void unp_freerights __P((struct file **, int)); static int unp_internalize __P((struct mbuf **, struct thread *)); static int unp_listen __P((struct unpcb *, struct proc *)); static int uipc_abort(struct socket *so) { struct unpcb *unp = sotounpcb(so); if (unp == 0) return EINVAL; unp_drop(unp, ECONNABORTED); return 0; } static int uipc_accept(struct socket *so, struct sockaddr **nam) { struct unpcb *unp = sotounpcb(so); if (unp == 0) return EINVAL; /* * Pass back name of connected socket, * if it was bound and we are still connected * (our peer may have closed already!). 
*/ if (unp->unp_conn && unp->unp_conn->unp_addr) { *nam = dup_sockaddr((struct sockaddr *)unp->unp_conn->unp_addr, 1); } else { *nam = dup_sockaddr((struct sockaddr *)&sun_noname, 1); } return 0; } static int uipc_attach(struct socket *so, int proto, struct thread *td) { struct unpcb *unp = sotounpcb(so); if (unp != 0) return EISCONN; return unp_attach(so); } static int uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct unpcb *unp = sotounpcb(so); if (unp == 0) return EINVAL; return unp_bind(unp, nam, td); } static int uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct unpcb *unp = sotounpcb(so); if (unp == 0) return EINVAL; return unp_connect(so, nam, curthread); } static int uipc_connect2(struct socket *so1, struct socket *so2) { struct unpcb *unp = sotounpcb(so1); if (unp == 0) return EINVAL; return unp_connect2(so1, so2); } /* control is EOPNOTSUPP */ static int uipc_detach(struct socket *so) { struct unpcb *unp = sotounpcb(so); if (unp == 0) return EINVAL; unp_detach(unp); return 0; } static int uipc_disconnect(struct socket *so) { struct unpcb *unp = sotounpcb(so); if (unp == 0) return EINVAL; unp_disconnect(unp); return 0; } static int uipc_listen(struct socket *so, struct thread *td) { struct unpcb *unp = sotounpcb(so); if (unp == 0 || unp->unp_vnode == 0) return EINVAL; return unp_listen(unp, td->td_proc); } static int uipc_peeraddr(struct socket *so, struct sockaddr **nam) { struct unpcb *unp = sotounpcb(so); if (unp == 0) return EINVAL; if (unp->unp_conn && unp->unp_conn->unp_addr) *nam = dup_sockaddr((struct sockaddr *)unp->unp_conn->unp_addr, 1); return 0; } static int uipc_rcvd(struct socket *so, int flags) { struct unpcb *unp = sotounpcb(so); struct socket *so2; u_long newhiwat; if (unp == 0) return EINVAL; switch (so->so_type) { case SOCK_DGRAM: panic("uipc_rcvd DGRAM?"); /*NOTREACHED*/ case SOCK_STREAM: if (unp->unp_conn == 0) break; so2 = unp->unp_conn->unp_socket; /* * Adjust backpressure 
on sender * and wakeup any waiting to write. */ so2->so_snd.sb_mbmax += unp->unp_mbcnt - so->so_rcv.sb_mbcnt; unp->unp_mbcnt = so->so_rcv.sb_mbcnt; newhiwat = so2->so_snd.sb_hiwat + unp->unp_cc - so->so_rcv.sb_cc; (void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat, newhiwat, RLIM_INFINITY); unp->unp_cc = so->so_rcv.sb_cc; sowwakeup(so2); break; default: panic("uipc_rcvd unknown socktype"); } return 0; } /* pru_rcvoob is EOPNOTSUPP */ static int uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { int error = 0; struct unpcb *unp = sotounpcb(so); struct socket *so2; u_long newhiwat; if (unp == 0) { error = EINVAL; goto release; } if (flags & PRUS_OOB) { error = EOPNOTSUPP; goto release; } if (control && (error = unp_internalize(&control, td))) goto release; switch (so->so_type) { case SOCK_DGRAM: { struct sockaddr *from; if (nam) { if (unp->unp_conn) { error = EISCONN; break; } error = unp_connect(so, nam, td); if (error) break; } else { if (unp->unp_conn == 0) { error = ENOTCONN; break; } } so2 = unp->unp_conn->unp_socket; if (unp->unp_addr) from = (struct sockaddr *)unp->unp_addr; else from = &sun_noname; if (sbappendaddr(&so2->so_rcv, from, m, control)) { sorwakeup(so2); m = 0; control = 0; } else error = ENOBUFS; if (nam) unp_disconnect(unp); break; } case SOCK_STREAM: /* Connect if not connected yet. */ /* * Note: A better implementation would complain * if not equal to the peer's address. */ if ((so->so_state & SS_ISCONNECTED) == 0) { if (nam) { error = unp_connect(so, nam, td); if (error) break; /* XXX */ } else { error = ENOTCONN; break; } } if (so->so_state & SS_CANTSENDMORE) { error = EPIPE; break; } if (unp->unp_conn == 0) panic("uipc_send connected but no connection?"); so2 = unp->unp_conn->unp_socket; /* * Send to paired receive port, and then reduce * send buffer hiwater marks to maintain backpressure. * Wake up readers. 
*/
		if (control) {
			if (sbappendcontrol(&so2->so_rcv, m, control))
				control = 0;
		} else
			sbappend(&so2->so_rcv, m);
		/* Charge this send's buffer growth back to our side. */
		so->so_snd.sb_mbmax -=
			so2->so_rcv.sb_mbcnt - unp->unp_conn->unp_mbcnt;
		unp->unp_conn->unp_mbcnt = so2->so_rcv.sb_mbcnt;
		newhiwat = so->so_snd.sb_hiwat -
		    (so2->so_rcv.sb_cc - unp->unp_conn->unp_cc);
		(void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat,
		    newhiwat, RLIM_INFINITY);
		unp->unp_conn->unp_cc = so2->so_rcv.sb_cc;
		sorwakeup(so2);
		m = 0;
		break;

	default:
		panic("uipc_send unknown socktype");
	}

	/*
	 * SEND_EOF is equivalent to a SEND followed by
	 * a SHUTDOWN.
	 */
	if (flags & PRUS_EOF) {
		socantsendmore(so);
		unp_shutdown(unp);
	}

	/* On failure, discard any rights still embedded in the control. */
	if (control && error != 0)
		unp_dispose(control);

release:
	if (control)
		m_freem(control);
	if (m)
		m_freem(m);
	return error;
}

/*
 * pru_sense: fill in the stat(2) fields for fstat() on a local socket.
 */
static int
uipc_sense(struct socket *so, struct stat *sb)
{
	struct unpcb *unp = sotounpcb(so);
	struct socket *so2;

	if (unp == 0)
		return EINVAL;
	sb->st_blksize = so->so_snd.sb_hiwat;
	if (so->so_type == SOCK_STREAM && unp->unp_conn != 0) {
		so2 = unp->unp_conn->unp_socket;
		sb->st_blksize += so2->so_rcv.sb_cc;
	}
	sb->st_dev = NOUDEV;
	/*
	 * Hand out a fake inode number the first time we are stat'ed.
	 * NOTE(review): unp_ino starts at 0, so the first socket gets
	 * ino 0 and re-enters this branch on every call — looks like
	 * this should be a pre-increment; confirm intent.
	 */
	if (unp->unp_ino == 0)
		unp->unp_ino = unp_ino++;
	sb->st_ino = unp->unp_ino;
	return (0);
}

/* pru_shutdown: disallow further sends and notify the peer. */
static int
uipc_shutdown(struct socket *so)
{
	struct unpcb *unp = sotounpcb(so);

	if (unp == 0)
		return EINVAL;
	socantsendmore(so);
	unp_shutdown(unp);
	return 0;
}

/* pru_sockaddr: return our own bound name, or the unnamed sockaddr. */
static int
uipc_sockaddr(struct socket *so, struct sockaddr **nam)
{
	struct unpcb *unp = sotounpcb(so);

	if (unp == 0)
		return EINVAL;
	if (unp->unp_addr)
		*nam = dup_sockaddr((struct sockaddr *)unp->unp_addr, 1);
	else
		*nam = dup_sockaddr((struct sockaddr *)&sun_noname, 1);
	return 0;
}

/* Protocol user-request switch for the local (UNIX) domain. */
struct pr_usrreqs uipc_usrreqs = {
	uipc_abort, uipc_accept, uipc_attach, uipc_bind, uipc_connect,
	uipc_connect2, pru_control_notsupp, uipc_detach, uipc_disconnect,
	uipc_listen, uipc_peeraddr, uipc_rcvd, pru_rcvoob_notsupp,
	uipc_send, uipc_sense, uipc_shutdown, uipc_sockaddr,
	sosend, soreceive, sopoll
};

int
uipc_ctloutput(so, sopt)
	struct socket *so;
struct sockopt *sopt; { struct unpcb *unp = sotounpcb(so); int error; switch (sopt->sopt_dir) { case SOPT_GET: switch (sopt->sopt_name) { case LOCAL_PEERCRED: if (unp->unp_flags & UNP_HAVEPC) error = sooptcopyout(sopt, &unp->unp_peercred, sizeof(unp->unp_peercred)); else { if (so->so_type == SOCK_STREAM) error = ENOTCONN; else error = EINVAL; } break; default: error = EOPNOTSUPP; break; } break; case SOPT_SET: default: error = EOPNOTSUPP; break; } return (error); } /* * Both send and receive buffers are allocated PIPSIZ bytes of buffering * for stream sockets, although the total for sender and receiver is * actually only PIPSIZ. * Datagram sockets really use the sendspace as the maximum datagram size, * and don't really want to reserve the sendspace. Their recvspace should * be large enough for at least one max-size datagram plus address. */ #ifndef PIPSIZ #define PIPSIZ 8192 #endif static u_long unpst_sendspace = PIPSIZ; static u_long unpst_recvspace = PIPSIZ; static u_long unpdg_sendspace = 2*1024; /* really max datagram size */ static u_long unpdg_recvspace = 4*1024; static int unp_rights; /* file descriptors in flight */ SYSCTL_DECL(_net_local_stream); SYSCTL_INT(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW, &unpst_sendspace, 0, ""); SYSCTL_INT(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW, &unpst_recvspace, 0, ""); SYSCTL_DECL(_net_local_dgram); SYSCTL_INT(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW, &unpdg_sendspace, 0, ""); SYSCTL_INT(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW, &unpdg_recvspace, 0, ""); SYSCTL_DECL(_net_local); SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, ""); static int unp_attach(so) struct socket *so; { register struct unpcb *unp; int error; if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { switch (so->so_type) { case SOCK_STREAM: error = soreserve(so, unpst_sendspace, unpst_recvspace); break; case SOCK_DGRAM: error = soreserve(so, unpdg_sendspace, unpdg_recvspace); break; 
default: panic("unp_attach"); } if (error) return (error); } unp = zalloc(unp_zone); if (unp == NULL) return (ENOBUFS); bzero(unp, sizeof *unp); unp->unp_gencnt = ++unp_gencnt; unp_count++; LIST_INIT(&unp->unp_refs); unp->unp_socket = so; + FILEDESC_LOCK(curproc->p_fd); unp->unp_rvnode = curthread->td_proc->p_fd->fd_rdir; + FILEDESC_UNLOCK(curproc->p_fd); LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? &unp_dhead : &unp_shead, unp, unp_link); so->so_pcb = (caddr_t)unp; return (0); } static void unp_detach(unp) register struct unpcb *unp; { LIST_REMOVE(unp, unp_link); unp->unp_gencnt = ++unp_gencnt; --unp_count; if (unp->unp_vnode) { unp->unp_vnode->v_socket = 0; vrele(unp->unp_vnode); unp->unp_vnode = 0; } if (unp->unp_conn) unp_disconnect(unp); while (!LIST_EMPTY(&unp->unp_refs)) unp_drop(LIST_FIRST(&unp->unp_refs), ECONNRESET); soisdisconnected(unp->unp_socket); unp->unp_socket->so_pcb = 0; if (unp_rights) { /* * Normally the receive buffer is flushed later, * in sofree, but if our receive buffer holds references * to descriptors that are now garbage, we will dispose * of those descriptor references after the garbage collector * gets them (resulting in a "panic: closef: count < 0"). 
*/
		sorflush(unp->unp_socket);
		unp_gc();
	}
	if (unp->unp_addr)
		FREE(unp->unp_addr, M_SONAME);
	zfree(unp_zone, unp);
}

/*
 * Bind a local socket to a pathname, creating a VSOCK vnode for it.
 * Fails with EADDRINUSE if the path already exists, and with EINVAL if
 * the socket is already bound or the supplied name is empty.
 */
static int
unp_bind(unp, nam, td)
	struct unpcb *unp;
	struct sockaddr *nam;
	struct thread *td;
{
	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
	struct vnode *vp;
	struct mount *mp;
	struct vattr vattr;
	int error, namelen;
	struct nameidata nd;
	char *buf;

	if (unp->unp_vnode != NULL)
		return (EINVAL);
	/* sun_path need not be NUL-terminated; derive its length. */
	namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path);
	if (namelen <= 0)
		return EINVAL;
	buf = malloc(SOCK_MAXADDRLEN, M_TEMP, M_WAITOK);
	strncpy(buf, soun->sun_path, namelen);
	buf[namelen] = 0;	/* null-terminate the string */
restart:
	NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT, UIO_SYSSPACE,
	    buf, td);
/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
	error = namei(&nd);
	if (error) {
		free(buf, M_TEMP);
		return (error);
	}
	vp = nd.ni_vp;
	if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		if (nd.ni_dvp == vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		if (vp != NULL) {
			/* The name already exists: report EADDRINUSE. */
			vrele(vp);
			free(buf, M_TEMP);
			return (EADDRINUSE);
		}
		/* Wait for the suspended filesystem, then retry lookup. */
		error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
		if (error) {
			free(buf, M_TEMP);
			return (error);
		}
		goto restart;
	}
	VATTR_NULL(&vattr);
	vattr.va_type = VSOCK;
	/* The filedesc lock protects fd_cmask against concurrent umask(2). */
	FILEDESC_LOCK(td->td_proc->p_fd);
	vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask);
	FILEDESC_UNLOCK(td->td_proc->p_fd);
	VOP_LEASE(nd.ni_dvp, td, td->td_proc->p_ucred, LEASE_WRITE);
	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vput(nd.ni_dvp);
	if (error) {
		free(buf, M_TEMP);
		return (error);
	}
	vp = nd.ni_vp;
	/* Cross-link the vnode and the socket; unp_detach undoes this. */
	vp->v_socket = unp->unp_socket;
	unp->unp_vnode = vp;
	unp->unp_addr = (struct sockaddr_un *)dup_sockaddr(nam, 1);
	VOP_UNLOCK(vp, 0, td);
	vn_finished_write(mp);
	free(buf, M_TEMP);
	return (0);
}

/*
 * Connect a socket to the local socket bound at the pathname in `nam'.
 */
static int
unp_connect(so, nam, td)
	struct socket *so;
	struct sockaddr *nam;
	struct thread *td;
{
	register struct sockaddr_un *soun = (struct sockaddr_un *)nam;
register struct vnode *vp; register struct socket *so2, *so3; struct unpcb *unp, *unp2, *unp3; int error, len; struct nameidata nd; char buf[SOCK_MAXADDRLEN]; len = nam->sa_len - offsetof(struct sockaddr_un, sun_path); if (len <= 0) return EINVAL; strncpy(buf, soun->sun_path, len); buf[len] = 0; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, buf, td); error = namei(&nd); if (error) return (error); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (vp->v_type != VSOCK) { error = ENOTSOCK; goto bad; } error = VOP_ACCESS(vp, VWRITE, td->td_proc->p_ucred, td); if (error) goto bad; so2 = vp->v_socket; if (so2 == 0) { error = ECONNREFUSED; goto bad; } if (so->so_type != so2->so_type) { error = EPROTOTYPE; goto bad; } if (so->so_proto->pr_flags & PR_CONNREQUIRED) { if ((so2->so_options & SO_ACCEPTCONN) == 0 || (so3 = sonewconn(so2, 0)) == 0) { error = ECONNREFUSED; goto bad; } unp = sotounpcb(so); unp2 = sotounpcb(so2); unp3 = sotounpcb(so3); if (unp2->unp_addr) unp3->unp_addr = (struct sockaddr_un *) dup_sockaddr((struct sockaddr *) unp2->unp_addr, 1); /* * unp_peercred management: * * The connecter's (client's) credentials are copied * from its process structure at the time of connect() * (which is now). */ memset(&unp3->unp_peercred, '\0', sizeof(unp3->unp_peercred)); unp3->unp_peercred.cr_uid = td->td_proc->p_ucred->cr_uid; unp3->unp_peercred.cr_ngroups = td->td_proc->p_ucred->cr_ngroups; memcpy(unp3->unp_peercred.cr_groups, td->td_proc->p_ucred->cr_groups, sizeof(unp3->unp_peercred.cr_groups)); unp3->unp_flags |= UNP_HAVEPC; /* * The receiver's (server's) credentials are copied * from the unp_peercred member of socket on which the * former called listen(); unp_listen() cached that * process's credentials at that time so we can use * them now. 
*/ KASSERT(unp2->unp_flags & UNP_HAVEPCCACHED, ("unp_connect: listener without cached peercred")); memcpy(&unp->unp_peercred, &unp2->unp_peercred, sizeof(unp->unp_peercred)); unp->unp_flags |= UNP_HAVEPC; so2 = so3; } error = unp_connect2(so, so2); bad: vput(vp); return (error); } int unp_connect2(so, so2) register struct socket *so; register struct socket *so2; { register struct unpcb *unp = sotounpcb(so); register struct unpcb *unp2; if (so2->so_type != so->so_type) return (EPROTOTYPE); unp2 = sotounpcb(so2); unp->unp_conn = unp2; switch (so->so_type) { case SOCK_DGRAM: LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink); soisconnected(so); break; case SOCK_STREAM: unp2->unp_conn = unp; soisconnected(so); soisconnected(so2); break; default: panic("unp_connect2"); } return (0); } static void unp_disconnect(unp) struct unpcb *unp; { register struct unpcb *unp2 = unp->unp_conn; if (unp2 == 0) return; unp->unp_conn = 0; switch (unp->unp_socket->so_type) { case SOCK_DGRAM: LIST_REMOVE(unp, unp_reflink); unp->unp_socket->so_state &= ~SS_ISCONNECTED; break; case SOCK_STREAM: soisdisconnected(unp->unp_socket); unp2->unp_conn = 0; soisdisconnected(unp2->unp_socket); break; } } #ifdef notdef void unp_abort(unp) struct unpcb *unp; { unp_detach(unp); } #endif static int unp_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n; struct unpcb *unp, **unp_list; unp_gen_t gencnt; struct xunpgen *xug; struct unp_head *head; struct xunpcb *xu; head = ((intptr_t)arg1 == SOCK_DGRAM ? &unp_dhead : &unp_shead); /* * The process of preparing the PCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = unp_count; req->oldidx = 2 * (sizeof *xug) + (n + n/8) * sizeof(struct xunpcb); return 0; } if (req->newptr != 0) return EPERM; /* * OK, now we're committed to doing something. 
*/ xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK); gencnt = unp_gencnt; n = unp_count; xug->xug_len = sizeof *xug; xug->xug_count = n; xug->xug_gen = gencnt; xug->xug_sogen = so_gencnt; error = SYSCTL_OUT(req, xug, sizeof *xug); if (error) { free(xug, M_TEMP); return error; } unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK); for (unp = LIST_FIRST(head), i = 0; unp && i < n; unp = LIST_NEXT(unp, unp_link)) { if (unp->unp_gencnt <= gencnt) { if (cr_cansee(req->td->td_proc->p_ucred, unp->unp_socket->so_cred)) continue; unp_list[i++] = unp; } } n = i; /* in case we lost some during malloc */ error = 0; xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK); for (i = 0; i < n; i++) { unp = unp_list[i]; if (unp->unp_gencnt <= gencnt) { xu->xu_len = sizeof *xu; xu->xu_unpp = unp; /* * XXX - need more locking here to protect against * connect/disconnect races for SMP. */ if (unp->unp_addr) bcopy(unp->unp_addr, &xu->xu_addr, unp->unp_addr->sun_len); if (unp->unp_conn && unp->unp_conn->unp_addr) bcopy(unp->unp_conn->unp_addr, &xu->xu_caddr, unp->unp_conn->unp_addr->sun_len); bcopy(unp, &xu->xu_unp, sizeof *unp); sotoxsocket(unp->unp_socket, &xu->xu_socket); error = SYSCTL_OUT(req, xu, sizeof *xu); } } free(xu, M_TEMP); if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. 
*/ xug->xug_gen = unp_gencnt; xug->xug_sogen = so_gencnt; xug->xug_count = unp_count; error = SYSCTL_OUT(req, xug, sizeof *xug); } free(unp_list, M_TEMP); free(xug, M_TEMP); return error; } SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLFLAG_RD, (caddr_t)(long)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb", "List of active local datagram sockets"); SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLFLAG_RD, (caddr_t)(long)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb", "List of active local stream sockets"); static void unp_shutdown(unp) struct unpcb *unp; { struct socket *so; if (unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn && (so = unp->unp_conn->unp_socket)) socantrcvmore(so); } static void unp_drop(unp, errno) struct unpcb *unp; int errno; { struct socket *so = unp->unp_socket; so->so_error = errno; unp_disconnect(unp); if (so->so_head) { LIST_REMOVE(unp, unp_link); unp->unp_gencnt = ++unp_gencnt; unp_count--; so->so_pcb = (caddr_t) 0; if (unp->unp_addr) FREE(unp->unp_addr, M_SONAME); zfree(unp_zone, unp); sotryfree(so); } } #ifdef notdef void unp_drain() { } #endif static void unp_freerights(rp, fdcount) struct file **rp; int fdcount; { int i; struct file *fp; for (i = 0; i < fdcount; i++) { fp = *rp; /* * zero the pointer before calling * unp_discard since it may end up * in unp_gc().. 
*/ *rp++ = 0; unp_discard(fp); } } int unp_externalize(control, controlp) struct mbuf *control, **controlp; { struct thread *td = curthread; /* XXX */ struct cmsghdr *cm = mtod(control, struct cmsghdr *); int i; int *fdp; struct file **rp; struct file *fp; void *data; socklen_t clen = control->m_len, datalen; int error, newfds; int f; u_int newlen; error = 0; if (controlp != NULL) /* controlp == NULL => free control messages */ *controlp = NULL; while (cm != NULL) { if (sizeof(*cm) > clen || cm->cmsg_len > clen) { error = EINVAL; break; } data = CMSG_DATA(cm); datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) { newfds = datalen / sizeof(struct file *); rp = data; /* If we're not outputting the discriptors free them. */ if (error || controlp == NULL) { unp_freerights(rp, newfds); goto next; } + FILEDESC_LOCK(td->td_proc->p_fd); /* if the new FD's will not fit free them. */ if (!fdavail(td, newfds)) { + FILEDESC_UNLOCK(td->td_proc->p_fd); error = EMSGSIZE; unp_freerights(rp, newfds); goto next; } /* * now change each pointer to an fd in the global * table to an integer that is the index to the * local fd table entry that we set up to point * to the global one we are transferring. 
*/ newlen = newfds * sizeof(int); *controlp = sbcreatecontrol(NULL, newlen, SCM_RIGHTS, SOL_SOCKET); if (*controlp == NULL) { + FILEDESC_UNLOCK(td->td_proc->p_fd); error = E2BIG; unp_freerights(rp, newfds); goto next; } fdp = (int *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); for (i = 0; i < newfds; i++) { if (fdalloc(td, 0, &f)) panic("unp_externalize fdalloc failed"); fp = *rp++; td->td_proc->p_fd->fd_ofiles[f] = fp; + FILE_LOCK(fp); fp->f_msgcount--; + FILE_UNLOCK(fp); unp_rights--; *fdp++ = f; } + FILEDESC_UNLOCK(td->td_proc->p_fd); } else { /* We can just copy anything else across */ if (error || controlp == NULL) goto next; *controlp = sbcreatecontrol(NULL, datalen, cm->cmsg_type, cm->cmsg_level); if (*controlp == NULL) { error = ENOBUFS; goto next; } bcopy(data, CMSG_DATA(mtod(*controlp, struct cmsghdr *)), datalen); } controlp = &(*controlp)->m_next; next: if (CMSG_SPACE(datalen) < clen) { clen -= CMSG_SPACE(datalen); cm = (struct cmsghdr *) ((caddr_t)cm + CMSG_SPACE(datalen)); } else { clen = 0; cm = NULL; } } + FILEDESC_UNLOCK(td->td_proc->p_fd); m_freem(control); return (error); } void unp_init(void) { unp_zone = zinit("unpcb", sizeof(struct unpcb), nmbclusters, 0, 0); if (unp_zone == 0) panic("unp_init"); LIST_INIT(&unp_dhead); LIST_INIT(&unp_shead); } #ifndef MIN #define MIN(a,b) (((a)<(b))?(a):(b)) #endif static int unp_internalize(controlp, td) struct mbuf **controlp; struct thread *td; { struct mbuf *control = *controlp; struct proc *p = td->td_proc; struct filedesc *fdescp = p->p_fd; struct cmsghdr *cm = mtod(control, struct cmsghdr *); struct cmsgcred *cmcred; struct file **rp; struct file *fp; struct timeval *tv; int i, fd, *fdp; void *data; socklen_t clen = control->m_len, datalen; int error, oldfds; u_int newlen; error = 0; *controlp = NULL; while (cm != NULL) { if (sizeof(*cm) > clen || cm->cmsg_level != SOL_SOCKET || cm->cmsg_len > clen) { error = EINVAL; goto out; } data = CMSG_DATA(cm); datalen = (caddr_t)cm + cm->cmsg_len - 
(caddr_t)data; switch (cm->cmsg_type) { /* * Fill in credential information. */ case SCM_CREDS: *controlp = sbcreatecontrol(NULL, sizeof(*cmcred), SCM_CREDS, SOL_SOCKET); if (*controlp == NULL) { error = ENOBUFS; goto out; } cmcred = (struct cmsgcred *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); cmcred->cmcred_pid = p->p_pid; cmcred->cmcred_uid = p->p_ucred->cr_ruid; cmcred->cmcred_gid = p->p_ucred->cr_rgid; cmcred->cmcred_euid = p->p_ucred->cr_uid; cmcred->cmcred_ngroups = MIN(p->p_ucred->cr_ngroups, CMGROUP_MAX); for (i = 0; i < cmcred->cmcred_ngroups; i++) cmcred->cmcred_groups[i] = p->p_ucred->cr_groups[i]; break; case SCM_RIGHTS: oldfds = datalen / sizeof (int); /* * check that all the FDs passed in refer to legal files * If not, reject the entire operation. */ fdp = data; + FILEDESC_LOCK(fdescp); for (i = 0; i < oldfds; i++) { fd = *fdp++; if ((unsigned)fd >= fdescp->fd_nfiles || fdescp->fd_ofiles[fd] == NULL) { + FILEDESC_UNLOCK(fdescp); error = EBADF; goto out; } } /* * Now replace the integer FDs with pointers to * the associated global file table entry.. 
*/ newlen = oldfds * sizeof(struct file *); *controlp = sbcreatecontrol(NULL, newlen, SCM_RIGHTS, SOL_SOCKET); if (*controlp == NULL) { + FILEDESC_UNLOCK(fdescp); error = E2BIG; goto out; } fdp = data; rp = (struct file **) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); for (i = 0; i < oldfds; i++) { fp = fdescp->fd_ofiles[*fdp++]; *rp++ = fp; + FILE_LOCK(fp); fp->f_count++; fp->f_msgcount++; + FILE_UNLOCK(fp); unp_rights++; } + FILEDESC_UNLOCK(fdescp); break; case SCM_TIMESTAMP: *controlp = sbcreatecontrol(NULL, sizeof(*tv), SCM_TIMESTAMP, SOL_SOCKET); if (*controlp == NULL) { error = ENOBUFS; goto out; } tv = (struct timeval *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); microtime(tv); break; default: error = EINVAL; goto out; } controlp = &(*controlp)->m_next; if (CMSG_SPACE(datalen) < clen) { clen -= CMSG_SPACE(datalen); cm = (struct cmsghdr *) ((caddr_t)cm + CMSG_SPACE(datalen)); } else { clen = 0; cm = NULL; } } out: m_freem(control); return (error); } static int unp_defer, unp_gcing; static void unp_gc() { register struct file *fp, *nextfp; register struct socket *so; struct file **extra_ref, **fpp; int nunref, i; if (unp_gcing) return; unp_gcing = 1; unp_defer = 0; /* * before going through all this, set all FDs to * be NOT defered and NOT externally accessible */ + sx_slock(&filelist_lock); LIST_FOREACH(fp, &filehead, f_list) - fp->f_flag &= ~(FMARK|FDEFER); + fp->f_gcflag &= ~(FMARK|FDEFER); do { LIST_FOREACH(fp, &filehead, f_list) { + FILE_LOCK(fp); /* * If the file is not open, skip it */ - if (fp->f_count == 0) + if (fp->f_count == 0) { + FILE_UNLOCK(fp); continue; + } /* * If we already marked it as 'defer' in a * previous pass, then try process it this time * and un-mark it */ - if (fp->f_flag & FDEFER) { - fp->f_flag &= ~FDEFER; + if (fp->f_gcflag & FDEFER) { + fp->f_gcflag &= ~FDEFER; unp_defer--; } else { /* * if it's not defered, then check if it's * already marked.. 
if so skip it */ - if (fp->f_flag & FMARK) + if (fp->f_gcflag & FMARK) { + FILE_UNLOCK(fp); continue; + } /* * If all references are from messages * in transit, then skip it. it's not * externally accessible. */ - if (fp->f_count == fp->f_msgcount) + if (fp->f_count == fp->f_msgcount) { + FILE_UNLOCK(fp); continue; + } /* * If it got this far then it must be * externally accessible. */ - fp->f_flag |= FMARK; + fp->f_gcflag |= FMARK; } /* * either it was defered, or it is externally * accessible and not already marked so. * Now check if it is possibly one of OUR sockets. */ if (fp->f_type != DTYPE_SOCKET || - (so = (struct socket *)fp->f_data) == 0) + (so = (struct socket *)fp->f_data) == 0) { + FILE_UNLOCK(fp); continue; + } + FILE_UNLOCK(fp); if (so->so_proto->pr_domain != &localdomain || (so->so_proto->pr_flags&PR_RIGHTS) == 0) continue; #ifdef notdef if (so->so_rcv.sb_flags & SB_LOCK) { /* * This is problematical; it's not clear * we need to wait for the sockbuf to be * unlocked (on a uniprocessor, at least), * and it's also not clear what to do * if sbwait returns an error due to receipt * of a signal. If sbwait does return * an error, we'll go into an infinite * loop. Delete all of this for now. */ (void) sbwait(&so->so_rcv); goto restart; } #endif /* * So, Ok, it's one of our sockets and it IS externally * accessible (or was defered). Now we look * to see if we hold any file descriptors in its * message buffers. Follow those links and mark them * as accessible too. */ unp_scan(so->so_rcv.sb_mb, unp_mark); } } while (unp_defer); + sx_sunlock(&filelist_lock); /* * We grab an extra reference to each of the file table entries * that are not otherwise accessible and then free the rights * that are stored in messages on them. * * The bug in the orginal code is a little tricky, so I'll describe * what's wrong with it here. 
* * It is incorrect to simply unp_discard each entry for f_msgcount * times -- consider the case of sockets A and B that contain * references to each other. On a last close of some other socket, * we trigger a gc since the number of outstanding rights (unp_rights) * is non-zero. If during the sweep phase the gc code un_discards, * we end up doing a (full) closef on the descriptor. A closef on A * results in the following chain. Closef calls soo_close, which * calls soclose. Soclose calls first (through the switch * uipc_usrreq) unp_detach, which re-invokes unp_gc. Unp_gc simply * returns because the previous instance had set unp_gcing, and * we return all the way back to soclose, which marks the socket * with SS_NOFDREF, and then calls sofree. Sofree calls sorflush * to free up the rights that are queued in messages on the socket A, * i.e., the reference on B. The sorflush calls via the dom_dispose * switch unp_dispose, which unp_scans with unp_discard. This second * instance of unp_discard just calls closef on B. * * Well, a similar chain occurs on B, resulting in a sorflush on B, * which results in another closef on A. Unfortunately, A is already * being closed, and the descriptor has already been marked with * SS_NOFDREF, and soclose panics at this point. * * Here, we first take an extra reference to each inaccessible * descriptor. Then, we call sorflush ourself, since we know * it is a Unix domain socket anyhow. After we destroy all the * rights carried in messages, we do a last closef to get rid * of our extra reference. This is the last close, and the * unp_detach etc will shut down the socket. 
* * 91/09/19, bsy@cs.cmu.edu */ extra_ref = malloc(nfiles * sizeof(struct file *), M_FILE, M_WAITOK); + sx_slock(&filelist_lock); for (nunref = 0, fp = LIST_FIRST(&filehead), fpp = extra_ref; fp != 0; fp = nextfp) { nextfp = LIST_NEXT(fp, f_list); + FILE_LOCK(fp); /* * If it's not open, skip it */ - if (fp->f_count == 0) + if (fp->f_count == 0) { + FILE_UNLOCK(fp); continue; + } /* * If all refs are from msgs, and it's not marked accessible * then it must be referenced from some unreachable cycle * of (shut-down) FDs, so include it in our * list of FDs to remove */ - if (fp->f_count == fp->f_msgcount && !(fp->f_flag & FMARK)) { + if (fp->f_count == fp->f_msgcount && !(fp->f_gcflag & FMARK)) { *fpp++ = fp; nunref++; fp->f_count++; } + FILE_UNLOCK(fp); } + sx_sunlock(&filelist_lock); /* * for each FD on our hit list, do the following two things */ for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) { struct file *tfp = *fpp; - if (tfp->f_type == DTYPE_SOCKET && tfp->f_data != NULL) + FILE_LOCK(tfp); + if (tfp->f_type == DTYPE_SOCKET && tfp->f_data != NULL) { + FILE_UNLOCK(tfp); sorflush((struct socket *)(tfp->f_data)); + } else + FILE_UNLOCK(tfp); } for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) closef(*fpp, (struct thread *) NULL); free((caddr_t)extra_ref, M_FILE); unp_gcing = 0; } void unp_dispose(m) struct mbuf *m; { if (m) unp_scan(m, unp_discard); } static int unp_listen(unp, p) struct unpcb *unp; struct proc *p; { bzero(&unp->unp_peercred, sizeof(unp->unp_peercred)); unp->unp_peercred.cr_uid = p->p_ucred->cr_uid; unp->unp_peercred.cr_ngroups = p->p_ucred->cr_ngroups; bcopy(p->p_ucred->cr_groups, unp->unp_peercred.cr_groups, sizeof(unp->unp_peercred.cr_groups)); unp->unp_flags |= UNP_HAVEPCCACHED; return (0); } static void unp_scan(m0, op) register struct mbuf *m0; void (*op) __P((struct file *)); { struct mbuf *m; struct file **rp; struct cmsghdr *cm; void *data; int i; socklen_t clen, datalen; int qfds; while (m0) { for (m = m0; m; m = m->m_next) { if 
(m->m_type != MT_CONTROL) continue; cm = mtod(m, struct cmsghdr *); clen = m->m_len; while (cm != NULL) { if (sizeof(*cm) > clen || cm->cmsg_len > clen) break; data = CMSG_DATA(cm); datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) { qfds = datalen / sizeof (struct file *); rp = data; for (i = 0; i < qfds; i++) (*op)(*rp++); } if (CMSG_SPACE(datalen) < clen) { clen -= CMSG_SPACE(datalen); cm = (struct cmsghdr *) ((caddr_t)cm + CMSG_SPACE(datalen)); } else { clen = 0; cm = NULL; } } } m0 = m0->m_act; } } static void unp_mark(fp) struct file *fp; { - - if (fp->f_flag & FMARK) + if (fp->f_gcflag & FMARK) return; unp_defer++; - fp->f_flag |= (FMARK|FDEFER); + fp->f_gcflag |= (FMARK|FDEFER); } static void unp_discard(fp) struct file *fp; { - + FILE_LOCK(fp); fp->f_msgcount--; unp_rights--; + FILE_UNLOCK(fp); (void) closef(fp, (struct thread *)NULL); } Index: head/sys/kern/vfs_acl.c =================================================================== --- head/sys/kern/vfs_acl.c (revision 89305) +++ head/sys/kern/vfs_acl.c (revision 89306) @@ -1,817 +1,821 @@ /*- * Copyright (c) 1999-2001 Robert N. M. Watson * All rights reserved. * * This software was developed by Robert Watson for the TrustedBSD Project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Developed by the TrustedBSD Project. * Support for POSIX.1e access control lists. */ #include "opt_cap.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_ACL, "acl", "access control list"); static int vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp); static int vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp); static int vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp); /* * Implement a version of vaccess() that understands POSIX.1e ACL semantics. * Return 0 on success, else an errno value. Should be merged into * vaccess() eventually. */ int vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid, struct acl *acl, mode_t acc_mode, struct ucred *cred, int *privused) { struct acl_entry *acl_other, *acl_mask; mode_t dac_granted; mode_t cap_granted; mode_t acl_mask_granted; int group_matched, i; /* * Look for a normal, non-privileged way to access the file/directory * as requested. If it exists, go with that. Otherwise, attempt * to use privileges granted via cap_granted. 
In some cases, * which privileges to use may be ambiguous due to "best match", * in which case fall back on first match for the time being. */ if (privused != NULL) *privused = 0; /* * Determine privileges now, but don't apply until we've found * a DAC entry that matches but has failed to allow access. */ #ifndef CAPABILITIES if (suser_xxx(cred, NULL, PRISON_ROOT) == 0) cap_granted = (VEXEC | VREAD | VWRITE | VADMIN); else cap_granted = 0; #else cap_granted = 0; if (type == VDIR) { if ((acc_mode & VEXEC) && !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT)) cap_granted |= VEXEC; } else { if ((acc_mode & VEXEC) && !cap_check(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT)) cap_granted |= VEXEC; } if ((acc_mode & VREAD) && !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT)) cap_granted |= VREAD; if ((acc_mode & VWRITE) && !cap_check(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT)) cap_granted |= VWRITE; if ((acc_mode & VADMIN) && !cap_check(cred, NULL, CAP_FOWNER, PRISON_ROOT)) cap_granted |= VADMIN; #endif /* CAPABILITIES */ /* * The owner matches if the effective uid associated with the * credential matches that of the ACL_USER_OBJ entry. While we're * doing the first scan, also cache the location of the ACL_MASK * and ACL_OTHER entries, preventing some future iterations. 
*/ acl_mask = acl_other = NULL; for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_USER_OBJ: if (file_uid != cred->cr_uid) break; dac_granted = 0; dac_granted |= VADMIN; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; if ((acc_mode & dac_granted) == acc_mode) return (0); if ((acc_mode & (dac_granted | cap_granted)) == acc_mode) { if (privused != NULL) *privused = 1; return (0); } goto error; case ACL_MASK: acl_mask = &acl->acl_entry[i]; break; case ACL_OTHER: acl_other = &acl->acl_entry[i]; break; default: } } /* * An ACL_OTHER entry should always exist in a valid access * ACL. If it doesn't, then generate a serious failure. For now, * this means a debugging message and EPERM, but in the future * should probably be a panic. */ if (acl_other == NULL) { /* * XXX This should never happen */ printf("vaccess_acl_posix1e: ACL_OTHER missing\n"); return (EPERM); } /* * Checks against ACL_USER, ACL_GROUP_OBJ, and ACL_GROUP fields * are masked by an ACL_MASK entry, if any. As such, first identify * the ACL_MASK field, then iterate through identifying potential * user matches, then group matches. If there is no ACL_MASK, * assume that the mask allows all requests to succeed. */ if (acl_mask != NULL) { acl_mask_granted = 0; if (acl_mask->ae_perm & ACL_EXECUTE) acl_mask_granted |= VEXEC; if (acl_mask->ae_perm & ACL_READ) acl_mask_granted |= VREAD; if (acl_mask->ae_perm & ACL_WRITE) acl_mask_granted |= VWRITE; } else acl_mask_granted = VEXEC | VREAD | VWRITE; /* * Iterate through user ACL entries. Do checks twice, first * without privilege, and then if a match is found but failed, * a second time with privilege. */ /* * Check ACL_USER ACL entries. 
*/ for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_USER: if (acl->acl_entry[i].ae_id != cred->cr_uid) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; dac_granted &= acl_mask_granted; if ((acc_mode & dac_granted) == acc_mode) return (0); if ((acc_mode & (dac_granted | cap_granted)) != acc_mode) goto error; if (privused != NULL) *privused = 1; return (0); } } /* * Group match is best-match, not first-match, so find a * "best" match. Iterate across, testing each potential group * match. Make sure we keep track of whether we found a match * or not, so that we know if we should try again with any * available privilege, or if we should move on to ACL_OTHER. */ group_matched = 0; for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_GROUP_OBJ: if (!groupmember(file_gid, cred)) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; dac_granted &= acl_mask_granted; if ((acc_mode & dac_granted) == acc_mode) return (0); group_matched = 1; break; case ACL_GROUP: if (!groupmember(acl->acl_entry[i].ae_id, cred)) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; dac_granted &= acl_mask_granted; if ((acc_mode & dac_granted) == acc_mode) return (0); group_matched = 1; break; default: } } if (group_matched == 1) { /* * There was a match, but it did not grant rights via * pure DAC. Try again, this time with privilege. 
*/ for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_GROUP_OBJ: if (!groupmember(file_gid, cred)) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; dac_granted &= acl_mask_granted; if ((acc_mode & (dac_granted | cap_granted)) != acc_mode) break; if (privused != NULL) *privused = 1; return (0); case ACL_GROUP: if (!groupmember(acl->acl_entry[i].ae_id, cred)) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; dac_granted &= acl_mask_granted; if ((acc_mode & (dac_granted | cap_granted)) != acc_mode) break; if (privused != NULL) *privused = 1; return (0); default: } } /* * Even with privilege, group membership was not sufficient. * Return failure. */ goto error; } /* * Fall back on ACL_OTHER. ACL_MASK is not applied to ACL_OTHER. */ dac_granted = 0; if (acl_other->ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl_other->ae_perm & ACL_READ) dac_granted |= VREAD; if (acl_other->ae_perm & ACL_WRITE) dac_granted |= VWRITE; if ((acc_mode & dac_granted) == acc_mode) return (0); if ((acc_mode & (dac_granted | cap_granted)) == acc_mode) { if (privused != NULL) *privused = 1; return (0); } error: return ((acc_mode & VADMIN) ? EPERM : EACCES); } /* * For the purposes of file systems maintaining the _OBJ entries in an * inode with a mode_t field, this routine converts a mode_t entry * to an acl_perm_t. 
*/ acl_perm_t acl_posix1e_mode_to_perm(acl_tag_t tag, mode_t mode) { acl_perm_t perm = 0; switch(tag) { case ACL_USER_OBJ: if (mode & S_IXUSR) perm |= ACL_EXECUTE; if (mode & S_IRUSR) perm |= ACL_READ; if (mode & S_IWUSR) perm |= ACL_WRITE; return (perm); case ACL_GROUP_OBJ: if (mode & S_IXGRP) perm |= ACL_EXECUTE; if (mode & S_IRGRP) perm |= ACL_READ; if (mode & S_IWGRP) perm |= ACL_WRITE; return (perm); case ACL_OTHER: if (mode & S_IXOTH) perm |= ACL_EXECUTE; if (mode & S_IROTH) perm |= ACL_READ; if (mode & S_IWOTH) perm |= ACL_WRITE; return (perm); default: printf("acl_posix1e_mode_to_perm: invalid tag (%d)\n", tag); return (0); } } /* * Given inode information (uid, gid, mode), return an acl entry of the * appropriate type. */ struct acl_entry acl_posix1e_mode_to_entry(acl_tag_t tag, uid_t uid, gid_t gid, mode_t mode) { struct acl_entry acl_entry; acl_entry.ae_tag = tag; acl_entry.ae_perm = acl_posix1e_mode_to_perm(tag, mode); switch(tag) { case ACL_USER_OBJ: acl_entry.ae_id = uid; break; case ACL_GROUP_OBJ: acl_entry.ae_id = gid; break; case ACL_OTHER: acl_entry.ae_id = ACL_UNDEFINED_ID; break; default: acl_entry.ae_id = ACL_UNDEFINED_ID; printf("acl_posix1e_mode_to_entry: invalid tag (%d)\n", tag); } return (acl_entry); } /* * Utility function to generate a file mode given appropriate ACL entries. 
*/ mode_t acl_posix1e_perms_to_mode(struct acl_entry *acl_user_obj_entry, struct acl_entry *acl_group_obj_entry, struct acl_entry *acl_other_entry) { mode_t mode; mode = 0; if (acl_user_obj_entry->ae_perm & ACL_EXECUTE) mode |= S_IXUSR; if (acl_user_obj_entry->ae_perm & ACL_READ) mode |= S_IRUSR; if (acl_user_obj_entry->ae_perm & ACL_WRITE) mode |= S_IWUSR; if (acl_group_obj_entry->ae_perm & ACL_EXECUTE) mode |= S_IXGRP; if (acl_group_obj_entry->ae_perm & ACL_READ) mode |= S_IRGRP; if (acl_group_obj_entry->ae_perm & ACL_WRITE) mode |= S_IWGRP; if (acl_other_entry->ae_perm & ACL_EXECUTE) mode |= S_IXOTH; if (acl_other_entry->ae_perm & ACL_READ) mode |= S_IROTH; if (acl_other_entry->ae_perm & ACL_WRITE) mode |= S_IWOTH; return (mode); } /* * Perform a syntactic check of the ACL, sufficient to allow an * implementing file system to determine if it should accept this and * rely on the POSIX.1e ACL properties. */ int acl_posix1e_check(struct acl *acl) { int num_acl_user_obj, num_acl_user, num_acl_group_obj, num_acl_group; int num_acl_mask, num_acl_other, i; /* * Verify that the number of entries does not exceed the maximum * defined for acl_t. * Verify that the correct number of various sorts of ae_tags are * present: * Exactly one ACL_USER_OBJ * Exactly one ACL_GROUP_OBJ * Exactly one ACL_OTHER * If any ACL_USER or ACL_GROUP entries appear, then exactly one * ACL_MASK entry must also appear. * Verify that all ae_perm entries are in ACL_PERM_BITS. * Verify all ae_tag entries are understood by this implementation. * Note: Does not check for uniqueness of qualifier (ae_id) field. */ num_acl_user_obj = num_acl_user = num_acl_group_obj = num_acl_group = num_acl_mask = num_acl_other = 0; if (acl->acl_cnt > ACL_MAX_ENTRIES || acl->acl_cnt < 0) return (EINVAL); for (i = 0; i < acl->acl_cnt; i++) { /* * Check for a valid tag. 
*/ switch(acl->acl_entry[i].ae_tag) { case ACL_USER_OBJ: acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) return (EINVAL); num_acl_user_obj++; break; case ACL_GROUP_OBJ: acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) return (EINVAL); num_acl_group_obj++; break; case ACL_USER: if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID) return (EINVAL); num_acl_user++; break; case ACL_GROUP: if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID) return (EINVAL); num_acl_group++; break; case ACL_OTHER: acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) return (EINVAL); num_acl_other++; break; case ACL_MASK: acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) return (EINVAL); num_acl_mask++; break; default: return (EINVAL); } /* * Check for valid perm entries. */ if ((acl->acl_entry[i].ae_perm | ACL_PERM_BITS) != ACL_PERM_BITS) return (EINVAL); } if ((num_acl_user_obj != 1) || (num_acl_group_obj != 1) || (num_acl_other != 1) || (num_acl_mask != 0 && num_acl_mask != 1)) return (EINVAL); if (((num_acl_group != 0) || (num_acl_user != 0)) && (num_acl_mask != 1)) return (EINVAL); return (0); } /* * These calls wrap the real vnode operations, and are called by the * syscall code once the syscall has converted the path or file * descriptor to a vnode (unlocked). The aclp pointer is assumed * still to point to userland, so this should not be consumed within * the kernel except by syscall code. Other code should directly * invoke VOP_{SET,GET}ACL. */ /* * Given a vnode, set its ACL. 
*/ static int vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernacl; int error; error = copyin(aclp, &inkernacl, sizeof(struct acl)); if (error) return(error); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_SETACL(vp, type, &inkernacl, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); return(error); } /* * Given a vnode, get its ACL. */ static int vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernelacl; int error; VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_GETACL(vp, type, &inkernelacl, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); if (error == 0) error = copyout(&inkernelacl, aclp, sizeof(struct acl)); return (error); } /* * Given a vnode, delete its ACL. */ static int vacl_delete(struct thread *td, struct vnode *vp, acl_type_t type) { int error; VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_SETACL(vp, ACL_TYPE_DEFAULT, 0, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); return (error); } /* * Given a vnode, check whether an ACL is appropriate for it */ static int vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernelacl; int error; error = copyin(aclp, &inkernelacl, sizeof(struct acl)); if (error) return(error); error = VOP_ACLCHECK(vp, type, &inkernelacl, td->td_proc->p_ucred, td); return (error); } /* * syscalls -- convert the path/fd to a vnode, and call vacl_whatever. * Don't need to lock, as the vacl_ code will get/release any locks * required. 
 */

/*
 * Given a file path, get an ACL for it
 *
 * MPSAFE
 */
int
__acl_get_file(struct thread *td, struct __acl_get_file_args *uap)
{
	struct nameidata nd;
	int error;

	mtx_lock(&Giant);
	/* Resolve the user path (following symlinks) to a vnode. */
	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
	error = namei(&nd);
	if (error == 0) {
		/* aclp is still a userland pointer; vacl_get_acl copies out. */
		error = vacl_get_acl(td, nd.ni_vp, SCARG(uap, type),
		    SCARG(uap, aclp));
		NDFREE(&nd, 0);
	}
	mtx_unlock(&Giant);
	return (error);
}

/*
 * Given a file path, set an ACL for it
 *
 * MPSAFE
 */
int
__acl_set_file(struct thread *td, struct __acl_set_file_args *uap)
{
	struct nameidata nd;
	int error;

	mtx_lock(&Giant);
	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
	error = namei(&nd);
	if (error == 0) {
		error = vacl_set_acl(td, nd.ni_vp, SCARG(uap, type),
		    SCARG(uap, aclp));
		NDFREE(&nd, 0);
	}
	mtx_unlock(&Giant);
	return (error);
}

/*
 * Given a file descriptor, get an ACL for it
 *
 * MPSAFE
 */
int
__acl_get_fd(struct thread *td, struct __acl_get_fd_args *uap)
{
	struct file *fp;
	int error;

	mtx_lock(&Giant);
	/* getvnode() verifies the fd refers to a vnode and takes a reference. */
	error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp);
	if (error == 0) {
		error = vacl_get_acl(td, (struct vnode *)fp->f_data,
		    SCARG(uap, type), SCARG(uap, aclp));
		/* Release the file reference taken by getvnode(). */
		fdrop(fp, td);
	}
	mtx_unlock(&Giant);
	return (error);
}

/*
 * Given a file descriptor, set an ACL for it
 *
 * MPSAFE
 */
int
__acl_set_fd(struct thread *td, struct __acl_set_fd_args *uap)
{
	struct file *fp;
	int error;

	mtx_lock(&Giant);
	error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp);
	if (error == 0) {
		error = vacl_set_acl(td, (struct vnode *)fp->f_data,
		    SCARG(uap, type), SCARG(uap, aclp));
		/* Release the file reference taken by getvnode(). */
		fdrop(fp, td);
	}
	mtx_unlock(&Giant);
	return (error);
}

/*
 * Given a file path, delete an ACL from it.
 *
 * MPSAFE
 */
int
__acl_delete_file(struct thread *td, struct __acl_delete_file_args *uap)
{
	struct nameidata nd;
	int error;

	mtx_lock(&Giant);
	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
	error = namei(&nd);
	if (error == 0) {
		error = vacl_delete(td, nd.ni_vp, SCARG(uap, type));
		NDFREE(&nd, 0);
	}
	mtx_unlock(&Giant);
	return (error);
}

/*
 * Given a file descriptor, delete an ACL from the underlying vnode.
 *
 * MPSAFE
 */
int
__acl_delete_fd(struct thread *td, struct __acl_delete_fd_args *uap)
{
	struct file *fp;
	int error;

	mtx_lock(&Giant);
	/* getvnode() verifies the fd refers to a vnode and takes a reference. */
	error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp);
	if (error == 0) {
		error = vacl_delete(td, (struct vnode *)fp->f_data,
		    SCARG(uap, type));
		/* Release the file reference taken by getvnode(). */
		fdrop(fp, td);
	}
	mtx_unlock(&Giant);
	return (error);
}

/*
 * Given a file path, check an ACL for it
 *
 * MPSAFE
 */
int
__acl_aclcheck_file(struct thread *td, struct __acl_aclcheck_file_args *uap)
{
	struct nameidata nd;
	int error;

	mtx_lock(&Giant);
	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
	error = namei(&nd);
	if (error == 0) {
		error = vacl_aclcheck(td, nd.ni_vp, SCARG(uap, type),
		    SCARG(uap, aclp));
		NDFREE(&nd, 0);
	}
	mtx_unlock(&Giant);
	return (error);
}

/*
 * Given a file descriptor, check an ACL for it
 *
 * MPSAFE
 */
int
__acl_aclcheck_fd(struct thread *td, struct __acl_aclcheck_fd_args *uap)
{
	struct file *fp;
	int error;

	mtx_lock(&Giant);
	error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp);
	if (error == 0) {
		error = vacl_aclcheck(td, (struct vnode *)fp->f_data,
		    SCARG(uap, type), SCARG(uap, aclp));
		/* Release the file reference taken by getvnode(). */
		fdrop(fp, td);
	}
	mtx_unlock(&Giant);
	return (error);
}
Index: head/sys/kern/vfs_cache.c
===================================================================
--- head/sys/kern/vfs_cache.c	(revision 89305)
+++ head/sys/kern/vfs_cache.c	(revision 89306)
@@ -1,868 +1,885 @@
/*
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
* * This code is derived from software contributed to Berkeley by * Poul-Henning Kamp of the FreeBSD Project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include /* * This structure describes the elements in the cache of recent * names looked up by namei. */ struct namecache { LIST_ENTRY(namecache) nc_hash; /* hash chain */ LIST_ENTRY(namecache) nc_src; /* source vnode list */ TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ struct vnode *nc_dvp; /* vnode of parent of name */ struct vnode *nc_vp; /* vnode the name refers to */ u_char nc_flag; /* flag bits */ u_char nc_nlen; /* length of name */ char nc_name[0]; /* segment name */ }; /* * Name caching works as follows: * * Names found by directory scans are retained in a cache * for future reference. It is managed LRU, so frequently * used names will hang around. Cache is indexed by hash value * obtained from (vp, name) where vp refers to the directory * containing name. * * If it is a "negative" entry, (i.e. for a name that is known NOT to * exist) the vnode pointer will be NULL. * * Upon reaching the last segment of a path, if the reference * is for DELETE, or NOCACHE is set (rewrite), and the * name is located in the cache, it will be dropped. */ /* * Structures associated with name cacheing. 
*/ #define NCHHASH(hash) \ (&nchashtbl[(hash) & nchash]) static LIST_HEAD(nchashhead, namecache) *nchashtbl; /* Hash Table */ static TAILQ_HEAD(, namecache) ncneg; /* Hash Table */ static u_long nchash; /* size of hash table */ SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, ""); static u_long ncnegfactor = 16; /* ratio of negative entries */ SYSCTL_ULONG(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, ""); static u_long numneg; /* number of cache entries allocated */ SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, ""); static u_long numcache; /* number of cache entries allocated */ SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, ""); static u_long numcachehv; /* number of cache entries with vnodes held */ SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0, ""); #if 0 static u_long numcachepl; /* number of cache purge for leaf entries */ SYSCTL_ULONG(_debug, OID_AUTO, numcachepl, CTLFLAG_RD, &numcachepl, 0, ""); #endif struct nchstats nchstats; /* cache effectiveness statistics */ static int doingcache = 1; /* 1 => enable the cache */ SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, ""); SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode), ""); SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache), ""); /* * The new name cache statistics */ SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics"); #define STATNODE(mode, name, var) \ SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, mode, var, 0, ""); STATNODE(CTLFLAG_RD, numneg, &numneg); STATNODE(CTLFLAG_RD, numcache, &numcache); static u_long numcalls; STATNODE(CTLFLAG_RD, numcalls, &numcalls); static u_long dothits; STATNODE(CTLFLAG_RD, dothits, &dothits); static u_long dotdothits; STATNODE(CTLFLAG_RD, dotdothits, &dotdothits); static u_long numchecks; STATNODE(CTLFLAG_RD, numchecks, &numchecks); static u_long nummiss; STATNODE(CTLFLAG_RD, nummiss, &nummiss); 
static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap);
static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps);
static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits);
static u_long numnegzaps; STATNODE(CTLFLAG_RD, numnegzaps, &numnegzaps);
static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits);

SYSCTL_OPAQUE(_vfs_cache, OID_AUTO, nchstats, CTLFLAG_RD, &nchstats,
    sizeof(nchstats), "LU", "VFS cache effectiveness statistics");

static void cache_zap __P((struct namecache *ncp));

static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");

/*
 * Flags in namecache.nc_flag
 */
#define NCF_WHITE	1

/*
 * Grab an atomic snapshot of the name cache hash chain lengths
 */
SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL, "hash table stats");

/*
 * Report the length of every nchash chain as an array of ints, one
 * per bucket.
 */
static int
sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int n_nchash;
	int count;

	n_nchash = nchash + 1;	/* nchash is max index, not count */
	/* Size probe: report how much space the caller should allocate. */
	if (!req->oldptr)
		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));

	/* Scan hash tables for applicable entries */
	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
		count = 0;
		LIST_FOREACH(ncp, ncpp, nc_hash) {
			count++;
		}
		error = SYSCTL_OUT(req, (caddr_t)&count, sizeof(count));
		if (error)
			return (error);
	}
	return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD,
    0, 0, sysctl_debug_hashstat_rawnchash, "S,int", "nchash chain lengths");

/*
 * Report summary statistics about the hash table: bucket count, number
 * of buckets in use, longest chain, and percent utilization (x100).
 */
static int
sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int n_nchash;
	int count, maxlength, used, pct;

	/* Size probe: four ints are returned. */
	if (!req->oldptr)
		return SYSCTL_OUT(req, 0, 4 * sizeof(int));

	n_nchash = nchash + 1;	/* nchash is max index, not count */
	used = 0;
	maxlength = 0;

	/* Scan hash tables for applicable entries */
	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
		count = 0;
		LIST_FOREACH(ncp, ncpp, nc_hash) {
			count++;
		}
		if (count)
			used++;
		if (maxlength < count)
			maxlength = count;
	}
	n_nchash = nchash + 1;	/* restore: consumed as loop counter above */
	/* Utilization in hundredths of a percent. */
	pct = (used * 100 * 100) / n_nchash;
	error = SYSCTL_OUT(req, (caddr_t)&n_nchash, sizeof(n_nchash));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, (caddr_t)&used, sizeof(used));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, (caddr_t)&maxlength, sizeof(maxlength));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, (caddr_t)&pct, sizeof(pct));
	if (error)
		return (error);
	return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD,
    0, 0, sysctl_debug_hashstat_nchash, "I", "nchash chain lengths");

/*
 * Delete an entry from its hash list and move it to the front
 * of the LRU list for immediate reuse.
 */
static void
cache_zap(ncp)
	struct namecache *ncp;
{
	LIST_REMOVE(ncp, nc_hash);
	LIST_REMOVE(ncp, nc_src);
	/* Drop the hold on the parent when its last child entry goes away. */
	if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
		vdrop(ncp->nc_dvp);
		numcachehv--;
	}
	/* Negative entries (nc_vp == NULL) live on the ncneg LRU instead. */
	if (ncp->nc_vp) {
		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
	} else {
		TAILQ_REMOVE(&ncneg, ncp, nc_dst);
		numneg--;
	}
	numcache--;
	free(ncp, M_VFSCACHE);
}

/*
 * cache_leaf_test()
 *
 *	Test whether this (directory) vnode's namei cache entry contains
 *	subdirectories or not.  Used to determine whether the directory is
 *	a leaf in the namei cache or not.  Note: the directory may still
 *	contain files in the namei cache.
 *
 *	Returns 0 if the directory is a leaf, -1 if it isn't.
 */
int
cache_leaf_test(struct vnode *vp)
{
	struct namecache *ncpc;

	for (ncpc = LIST_FIRST(&vp->v_cache_src);
	     ncpc != NULL;
	     ncpc = LIST_NEXT(ncpc, nc_src)
	) {
		if (ncpc->nc_vp != NULL && ncpc->nc_vp->v_type == VDIR)
			return(-1);
	}
	return(0);
}

/*
 * Lookup an entry in the cache
 *
 * We don't do this if the segment name is long, simply so the cache
 * can avoid holding long names (which would either waste space, or
 * add greatly to the complexity).
 *
 * Lookup is called with dvp pointing to the directory to search,
 * cnp pointing to the name of the entry being sought.
 * If the lookup
 * succeeds, the vnode is returned in *vpp, and a status of -1 is
 * returned.  If the lookup determines that the name does not exist
 * (negative cacheing), a status of ENOENT is returned.  If the lookup
 * fails, a status of zero is returned.
 */
int
cache_lookup(dvp, vpp, cnp)
	struct vnode *dvp;
	struct vnode **vpp;
	struct componentname *cnp;
{
	struct namecache *ncp;
	u_int32_t hash;

	if (!doingcache) {
		cnp->cn_flags &= ~MAKEENTRY;
		return (0);
	}

	numcalls++;

	/* "." and ".." are handled specially, without a hash lookup. */
	if (cnp->cn_nameptr[0] == '.') {
		if (cnp->cn_namelen == 1) {
			*vpp = dvp;
			dothits++;
			return (-1);
		}
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
			dotdothits++;
			/* The v_dd link is stale if the v_id no longer matches. */
			if (dvp->v_dd->v_id != dvp->v_ddid ||
			    (cnp->cn_flags & MAKEENTRY) == 0) {
				dvp->v_ddid = 0;
				return (0);
			}
			*vpp = dvp->v_dd;
			return (-1);
		}
	}

	/* Hash over both the component name and the parent's capability id. */
	hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT);
	hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash);
	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		numchecks++;
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	/* We failed to find an entry */
	if (ncp == 0) {
		if ((cnp->cn_flags & MAKEENTRY) == 0) {
			nummisszap++;
		} else {
			nummiss++;
		}
		nchstats.ncs_miss++;
		return (0);
	}

	/* We don't want to have an entry, so dump it */
	if ((cnp->cn_flags & MAKEENTRY) == 0) {
		numposzaps++;
		nchstats.ncs_badhits++;
		cache_zap(ncp);
		return (0);
	}

	/* We found a "positive" match, return the vnode */
	if (ncp->nc_vp) {
		numposhits++;
		nchstats.ncs_goodhits++;
		*vpp = ncp->nc_vp;
		return (-1);
	}

	/* We found a negative match, and want to create it, so purge */
	if (cnp->cn_nameiop == CREATE) {
		numnegzaps++;
		nchstats.ncs_badhits++;
		cache_zap(ncp);
		return (0);
	}

	numneghits++;
	/*
	 * We found a "negative" match, ENOENT notifies client of this match.
	 * The nc_vpid field records whether this is a whiteout.
	 * Move the entry to the tail of the negative LRU so it is retained.
	 */
	TAILQ_REMOVE(&ncneg, ncp, nc_dst);
	TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
	nchstats.ncs_neghits++;
	if (ncp->nc_flag & NCF_WHITE)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);
}

/*
 * Add an entry to the cache.
 */
void
cache_enter(dvp, vp, cnp)
	struct vnode *dvp;
	struct vnode *vp;
	struct componentname *cnp;
{
	struct namecache *ncp;
	struct nchashhead *ncpp;
	u_int32_t hash;
	int len;

	if (!doingcache)
		return;

	/* "." is never cached; ".." is recorded in v_dd/v_ddid instead. */
	if (cnp->cn_nameptr[0] == '.') {
		if (cnp->cn_namelen == 1) {
			return;
		}
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
			if (vp) {
				dvp->v_dd = vp;
				dvp->v_ddid = vp->v_id;
			} else {
				dvp->v_dd = dvp;
				dvp->v_ddid = 0;
			}
			return;
		}
	}

	/* The name is stored inline after the struct (nc_name[0]). */
	ncp = (struct namecache *)
		malloc(sizeof *ncp + cnp->cn_namelen, M_VFSCACHE, M_WAITOK);
	bzero((char *)ncp, sizeof *ncp);
	numcache++;
	if (!vp) {
		numneg++;
		ncp->nc_flag = cnp->cn_flags & ISWHITEOUT ? NCF_WHITE : 0;
	} else if (vp->v_type == VDIR) {
		/* Record the ".." back-link in the child directory. */
		vp->v_dd = dvp;
		vp->v_ddid = dvp->v_id;
	}

	/*
	 * Fill in cache info, if vp is NULL this is a "negative" cache entry.
	 * For negative entries, we have to record whether it is a whiteout.
	 * the whiteout flag is stored in the nc_vpid field which is
	 * otherwise unused.
	 */
	ncp->nc_vp = vp;
	ncp->nc_dvp = dvp;
	len = ncp->nc_nlen = cnp->cn_namelen;
	hash = fnv_32_buf(cnp->cn_nameptr, len, FNV1_32_INIT);
	bcopy(cnp->cn_nameptr, ncp->nc_name, len);
	hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash);
	ncpp = NCHHASH(hash);
	LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
	/* First child entry for this parent: hold the parent vnode. */
	if (LIST_EMPTY(&dvp->v_cache_src)) {
		vhold(dvp);
		numcachehv++;
	}
	LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
	if (vp) {
		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
	} else {
		TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
	}
	/* Keep negative entries bounded to 1/ncnegfactor of the cache. */
	if (numneg * ncnegfactor > numcache) {
		ncp = TAILQ_FIRST(&ncneg);
		cache_zap(ncp);
	}
}

/*
 * Name cache initialization, from vfs_init() when we are booting
 */
static void
nchinit(void *dummy __unused)
{

	TAILQ_INIT(&ncneg);
	nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL)


/*
 * Invalidate all entries to a particular vnode.
 *
 * Remove all entries in the namecache relating to this vnode and
 * change the v_id.  We take the v_id from a global counter, since
 * it becomes a handy sequence number in crash-dumps that way.
 * No valid vnode will ever have (v_id == 0).
 *
 * XXX: Only time and the size of v_id prevents this from failing:
 * XXX: In theory we should hunt down all (struct vnode*, v_id)
 * XXX: soft references and nuke them, at least on the global
 * XXX: v_id wraparound.  The period of resistance can be extended
 * XXX: by incrementing each vnodes v_id individually instead of
 * XXX: using the global v_id.
 */
void
cache_purge(vp)
	struct vnode *vp;
{
	static u_long nextid;

	while (!LIST_EMPTY(&vp->v_cache_src))
		cache_zap(LIST_FIRST(&vp->v_cache_src));
	while (!TAILQ_EMPTY(&vp->v_cache_dst))
		cache_zap(TAILQ_FIRST(&vp->v_cache_dst));

	/* Pick a new capability number, skipping 0 and the current value. */
	do
		nextid++;
	while (nextid == vp->v_id || !nextid);
	vp->v_id = nextid;
	vp->v_dd = vp;
	vp->v_ddid = 0;
}

/*
 * Flush all entries referencing a particular filesystem.
 *
 * Since we need to check it anyway, we will flush all the invalid
 * entries at the same time.
 */
void
cache_purgevfs(mp)
	struct mount *mp;
{
	struct nchashhead *ncpp;
	struct namecache *ncp, *nnp;

	/* Scan hash tables for applicable entries */
	for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
		/* nnp is fetched first because cache_zap() frees ncp. */
		for (ncp = LIST_FIRST(ncpp); ncp != 0; ncp = nnp) {
			nnp = LIST_NEXT(ncp, nc_hash);
			if (ncp->nc_dvp->v_mount == mp) {
				cache_zap(ncp);
			}
		}
	}
}

#if 0
/*
 * Flush all directory entries with no child directories held in
 * the cache.
 *
 * Since we need to check it anyway, we will flush all the invalid
 * entries at the same time.
 */
void
cache_purgeleafdirs(ndir)
	int ndir;
{
	struct nchashhead *ncpp;
	struct namecache *ncp, *nnp, *ncpc, *nnpc;
	struct vnode *dvp;

	/* Scan hash tables for applicable entries */
	for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl && ndir > 0;
	    ncpp--) {
		for (ncp = LIST_FIRST(ncpp); ncp != 0 && ndir > 0; ncp = nnp) {
			nnp = LIST_NEXT(ncp, nc_hash);
			if (ncp->nc_dvp != 0) {
				/*
				 * Skip over if nc_dvp of this cache holds
				 * a child directory, or the hold count of
				 * nc_dvp is greater than 1 (in which case
				 * nc_dvp is likely to be the working
				 * directory of a process).
				 */
				if (ncp->nc_dvp->v_holdcnt > 1)
					continue;
				for (ncpc = LIST_FIRST(&ncp->nc_dvp->v_cache_src);
				    ncpc != 0; ncpc = nnpc) {
					nnpc = LIST_NEXT(ncpc, nc_src);
					if (ncpc->nc_vp != 0 &&
					    ncpc->nc_vp->v_type == VDIR)
						break;
				}
				if (ncpc == 0) {
					/*
					 * Zap all of this directory's children,
					 * held in ncp->nc_dvp->v_cache_src.
					 */
					dvp = ncp->nc_dvp;
					while (!LIST_EMPTY(&dvp->v_cache_src))
						cache_zap(LIST_FIRST(&dvp->v_cache_src));

					ndir--;

					/* Restart in case where nnp is reclaimed. */
					nnp = LIST_FIRST(ncpp);
					continue;
				}
			}
		}
	}
	numcachepl++;
}
#endif

/*
 * Perform canonical checks and cache lookup and pass on to filesystem
 * through the vop_cachedlookup only if needed.
 */

int
vfs_cache_lookup(ap)
	struct vop_lookup_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
	} */ *ap;
{
	struct vnode *dvp, *vp;
	int lockparent;
	int error;
	struct vnode **vpp = ap->a_vpp;
	struct componentname *cnp = ap->a_cnp;
	struct ucred *cred = cnp->cn_cred;
	int flags = cnp->cn_flags;
	struct thread *td = cnp->cn_thread;
	u_long vpid;	/* capability number of vnode */

	*vpp = NULL;
	dvp = ap->a_dvp;
	lockparent = flags & LOCKPARENT;

	if (dvp->v_type != VDIR)
		return (ENOTDIR);

	/* Refuse modifying lookups on read-only filesystems up front. */
	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
		return (EROFS);

	error = VOP_ACCESS(dvp, VEXEC, cred, td);

	if (error)
		return (error);

	error = cache_lookup(dvp, vpp, cnp);

	/* cache_lookup: 0 = miss, ENOENT = negative hit, -1 = positive hit. */
	if (!error)
		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));

	if (error == ENOENT)
		return (error);

	vp = *vpp;
	vpid = vp->v_id;
	cnp->cn_flags &= ~PDIRUNLOCK;
	if (dvp == vp) {   /* lookup on "." */
		VREF(vp);
		error = 0;
	} else if (flags & ISDOTDOT) {
		/* Unlock parent first to observe the locking order for "..". */
		VOP_UNLOCK(dvp, 0, td);
		cnp->cn_flags |= PDIRUNLOCK;
		error = vget(vp, LK_EXCLUSIVE, td);
		if (!error && lockparent && (flags & ISLASTCN)) {
			if ((error = vn_lock(dvp, LK_EXCLUSIVE, td)) == 0)
				cnp->cn_flags &= ~PDIRUNLOCK;
		}
	} else {
		error = vget(vp, LK_EXCLUSIVE, td);
		if (!lockparent || error || !(flags & ISLASTCN)) {
			VOP_UNLOCK(dvp, 0, td);
			cnp->cn_flags |= PDIRUNLOCK;
		}
	}
	/*
	 * Check that the capability number did not change
	 * while we were waiting for the lock.
	 */
	if (!error) {
		if (vpid == vp->v_id)
			return (0);
		/* Stale entry: drop the vnode and fall through to re-lookup. */
		vput(vp);
		if (lockparent && dvp != vp && (flags & ISLASTCN)) {
			VOP_UNLOCK(dvp, 0, td);
			cnp->cn_flags |= PDIRUNLOCK;
		}
	}
	if (cnp->cn_flags & PDIRUNLOCK) {
		error = vn_lock(dvp, LK_EXCLUSIVE, td);
		if (error)
			return (error);
		cnp->cn_flags &= ~PDIRUNLOCK;
	}
	return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
}


#ifndef _SYS_SYSPROTO_H_
struct  __getcwd_args {
	u_char	*buf;
	u_int	buflen;
};
#endif

static int disablecwd;
SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, "");

static u_long numcwdcalls; STATNODE(CTLFLAG_RD, numcwdcalls, &numcwdcalls);
static u_long numcwdfail1; STATNODE(CTLFLAG_RD, numcwdfail1, &numcwdfail1);
static u_long numcwdfail2; STATNODE(CTLFLAG_RD, numcwdfail2, &numcwdfail2);
static u_long numcwdfail3; STATNODE(CTLFLAG_RD, numcwdfail3, &numcwdfail3);
static u_long numcwdfail4; STATNODE(CTLFLAG_RD, numcwdfail4, &numcwdfail4);
static u_long numcwdfound; STATNODE(CTLFLAG_RD, numcwdfound, &numcwdfound);

/*
 * Reconstruct the current working directory pathname from the name
 * cache, walking v_dd links from fd_cdir up to fd_rdir/rootvnode and
 * building the path backwards in a temporary buffer.
 */
int
__getcwd(td, uap)
	struct thread *td;
	struct __getcwd_args *uap;
{
	char *bp, *buf;
	int error, i, slash_prefixed;
	struct filedesc *fdp;
	struct namecache *ncp;
	struct vnode *vp;

	numcwdcalls++;
	if (disablecwd)
		return (ENODEV);
	if (uap->buflen < 2)
		return (EINVAL);
	if (uap->buflen > MAXPATHLEN)
		uap->buflen = MAXPATHLEN;
	buf = bp = malloc(uap->buflen, M_TEMP, M_WAITOK);
	bp += uap->buflen - 1;
	*bp = '\0';
	fdp = td->td_proc->p_fd;
	slash_prefixed = 0;
	/* Hold the filedesc lock across the walk so fd_cdir/fd_rdir are stable. */
	FILEDESC_LOCK(fdp);
	for (vp = fdp->fd_cdir; vp != fdp->fd_rdir && vp != rootvnode;) {
		if (vp->v_flag & VROOT) {
			if (vp->v_mount == NULL) {	/* forced unmount */
				FILEDESC_UNLOCK(fdp);
				free(buf, M_TEMP);
				return (EBADF);
			}
			/* Cross the mount point downwards. */
			vp = vp->v_mount->mnt_vnodecovered;
			continue;
		}
		if (vp->v_dd->v_id != vp->v_ddid) {
			FILEDESC_UNLOCK(fdp);
			numcwdfail1++;
			free(buf, M_TEMP);
			return (ENOTDIR);
		}
		ncp = TAILQ_FIRST(&vp->v_cache_dst);
		if (!ncp) {
			FILEDESC_UNLOCK(fdp);
			numcwdfail2++;
			free(buf, M_TEMP);
			return (ENOENT);
		}
		if (ncp->nc_dvp != vp->v_dd) {
			FILEDESC_UNLOCK(fdp);
			numcwdfail3++;
			free(buf, M_TEMP);
			return (EBADF);
		}
		/* Prepend this component, then a '/'. */
		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
			if (bp == buf) {
				FILEDESC_UNLOCK(fdp);
				numcwdfail4++;
				free(buf, M_TEMP);
				return (ENOMEM);
			}
			*--bp = ncp->nc_name[i];
		}
		if (bp == buf) {
			FILEDESC_UNLOCK(fdp);
			numcwdfail4++;
			free(buf, M_TEMP);
			return (ENOMEM);
		}
		*--bp = '/';
		slash_prefixed = 1;
		vp = vp->v_dd;
	}
	FILEDESC_UNLOCK(fdp);
	if (!slash_prefixed) {
		if (bp == buf) {
			numcwdfail4++;
			free(buf, M_TEMP);
			return (ENOMEM);
		}
		*--bp = '/';
	}
	numcwdfound++;
	error = copyout(bp, uap->buf, strlen(bp) + 1);
	free(buf, M_TEMP);
	return (error);
}

/*
 * Thus begins the fullpath magic.
 */

#undef STATNODE
#define STATNODE(name)							\
	static u_int name;						\
	SYSCTL_UINT(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, "")

static int disablefullpath;
SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW,
    &disablefullpath, 0, "");

STATNODE(numfullpathcalls);
STATNODE(numfullpathfail1);
STATNODE(numfullpathfail2);
STATNODE(numfullpathfail3);
STATNODE(numfullpathfail4);
STATNODE(numfullpathfound);

/*
 * Resolve an arbitrary vnode to a fully qualified path using the name
 * cache.  On success *retbuf points into *freebuf, which the caller
 * must free(M_TEMP).
 */
int
vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
{
	char *bp, *buf;
	int i, slash_prefixed;
	struct filedesc *fdp;
	struct namecache *ncp;
	struct vnode *vp;

	numfullpathcalls++;
	if (disablefullpath)
		return (ENODEV);
	if (vn == NULL)
		return (EINVAL);
	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
	bp = buf + MAXPATHLEN - 1;
	*bp = '\0';
	fdp = td->td_proc->p_fd;
	slash_prefixed = 0;
	/* Hold the filedesc lock so fd_rdir is stable during the walk. */
	FILEDESC_LOCK(fdp);
	for (vp = vn; vp != fdp->fd_rdir && vp != rootvnode;) {
		if (vp->v_flag & VROOT) {
			if (vp->v_mount == NULL) {	/* forced unmount */
				FILEDESC_UNLOCK(fdp);
				free(buf, M_TEMP);
				return (EBADF);
			}
			vp = vp->v_mount->mnt_vnodecovered;
			continue;
		}
		/* The starting vnode itself need not pass the v_dd check. */
		if (vp != vn && vp->v_dd->v_id != vp->v_ddid) {
			FILEDESC_UNLOCK(fdp);
			numfullpathfail1++;
			free(buf, M_TEMP);
			return (ENOTDIR);
		}
		ncp = TAILQ_FIRST(&vp->v_cache_dst);
		if (!ncp) {
			FILEDESC_UNLOCK(fdp);
			numfullpathfail2++;
			free(buf, M_TEMP);
			return (ENOENT);
		}
		if (vp != vn && ncp->nc_dvp != vp->v_dd) {
			FILEDESC_UNLOCK(fdp);
			numfullpathfail3++;
			free(buf, M_TEMP);
			return (EBADF);
		}
		/* Prepend this component, then a '/'. */
		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
			if (bp == buf) {
				FILEDESC_UNLOCK(fdp);
				numfullpathfail4++;
				free(buf, M_TEMP);
				return (ENOMEM);
			}
			*--bp = ncp->nc_name[i];
		}
		if (bp == buf) {
			FILEDESC_UNLOCK(fdp);
			numfullpathfail4++;
			free(buf, M_TEMP);
			return (ENOMEM);
		}
		*--bp = '/';
		slash_prefixed = 1;
		vp = ncp->nc_dvp;
	}
	if (!slash_prefixed) {
		if (bp == buf) {
			FILEDESC_UNLOCK(fdp);
			numfullpathfail4++;
			free(buf, M_TEMP);
			return (ENOMEM);
		}
		*--bp = '/';
	}
	FILEDESC_UNLOCK(fdp);
	numfullpathfound++;
	*retbuf = bp;
	*freebuf = buf;
	return (0);
}
Index: head/sys/kern/vfs_extattr.c
===================================================================
--- head/sys/kern/vfs_extattr.c	(revision 89305)
+++ head/sys/kern/vfs_extattr.c	(revision 89306)
@@ -1,4191 +1,4304 @@
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3.
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94 * $FreeBSD$ */ /* For 4.3 integer FS ID compatibility */ #include "opt_compat.h" #include "opt_ffs.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int change_dir __P((struct nameidata *ndp, struct thread *td)); static void checkdirs __P((struct vnode *olddp, struct vnode *newdp)); static int chroot_refuse_vdir_fds __P((struct filedesc *fdp)); static int getutimes __P((const struct timeval *, struct timespec *)); static int setfown __P((struct thread *td, struct vnode *, uid_t, gid_t)); static int setfmode __P((struct thread *td, struct vnode *, int)); static int setfflags __P((struct thread *td, struct vnode *, int)); static int setutimes __P((struct thread *td, struct vnode *, const struct timespec *, int)); static int vn_access __P((struct vnode *vp, int user_flags, struct ucred *cred, struct thread *td)); static int usermount = 0; /* if 1, non-root can mount fs. */ int (*union_dircheckp) __P((struct thread *td, struct vnode **, struct file *)); SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0, ""); /* * Virtual File System System Calls */ #ifndef _SYS_SYSPROTO_H_ struct nmount_args { struct iovec *iovp; unsigned int iovcnt; int flags; }; #endif /* ARGSUSED */ int nmount(td, uap) struct thread *td; struct nmount_args /* { syscallarg(struct iovec *) iovp; syscallarg(unsigned int) iovcnt; syscallarg(int) flags; } */ *uap; { return(EOPNOTSUPP); } /* * Mount a file system. 
*/ #ifndef _SYS_SYSPROTO_H_ struct mount_args { char *type; char *path; int flags; caddr_t data; }; #endif /* ARGSUSED */ int mount(td, uap) struct thread *td; struct mount_args /* { syscallarg(char *) type; syscallarg(char *) path; syscallarg(int) flags; syscallarg(caddr_t) data; } */ *uap; { char *fstype; char *fspath; int error; fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK | M_ZERO); fspath = malloc(MNAMELEN, M_TEMP, M_WAITOK | M_ZERO); /* * vfs_mount() actually takes a kernel string for `type' and * `path' now, so extract them. */ error = copyinstr(SCARG(uap, type), fstype, MFSNAMELEN, NULL); if (error) goto finish; error = copyinstr(SCARG(uap, path), fspath, MNAMELEN, NULL); if (error) goto finish; error = vfs_mount(td, fstype, fspath, SCARG(uap, flags), SCARG(uap, data)); finish: free(fstype, M_TEMP); free(fspath, M_TEMP); return (error); } /* * vfs_mount(): actually attempt a filesystem mount. * * This routine is designed to be a "generic" entry point for routines * that wish to mount a filesystem. All parameters except `fsdata' are * pointers into kernel space. `fsdata' is currently still a pointer * into userspace. */ int vfs_mount(td, fstype, fspath, fsflags, fsdata) struct thread *td; const char *fstype; char *fspath; int fsflags; void *fsdata; { struct vnode *vp; struct mount *mp; struct vfsconf *vfsp; int error, flag = 0, flag2 = 0; struct vattr va; struct nameidata nd; /* * Be ultra-paranoid about making sure the type and fspath * variables will fit in our mp buffers, including the * terminating NUL. */ if ((strlen(fstype) >= MFSNAMELEN - 1) || (strlen(fspath) >= MNAMELEN - 1)) return (ENAMETOOLONG); if (usermount == 0) { error = suser_td(td); if (error) return (error); } /* * Do not allow NFS export by non-root users. 
*/ if (fsflags & MNT_EXPORTED) { error = suser_td(td); if (error) return (error); } /* * Silently enforce MNT_NOSUID and MNT_NODEV for non-root users */ if (suser_xxx(td->td_proc->p_ucred, 0, 0)) fsflags |= MNT_NOSUID | MNT_NODEV; /* * Get vnode to be covered */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspath, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (fsflags & MNT_UPDATE) { if ((vp->v_flag & VROOT) == 0) { vput(vp); return (EINVAL); } mp = vp->v_mount; flag = mp->mnt_flag; flag2 = mp->mnt_kern_flag; /* * We only allow the filesystem to be reloaded if it * is currently mounted read-only. */ if ((fsflags & MNT_RELOAD) && ((mp->mnt_flag & MNT_RDONLY) == 0)) { vput(vp); return (EOPNOTSUPP); /* Needs translation */ } /* * Only root, or the user that did the original mount is * permitted to update it. */ if (mp->mnt_stat.f_owner != td->td_proc->p_ucred->cr_uid) { error = suser_td(td); if (error) { vput(vp); return (error); } } if (vfs_busy(mp, LK_NOWAIT, 0, td)) { vput(vp); return (EBUSY); } mtx_lock(&vp->v_interlock); if ((vp->v_flag & VMOUNT) != 0 || vp->v_mountedhere != NULL) { mtx_unlock(&vp->v_interlock); vfs_unbusy(mp, td); vput(vp); return (EBUSY); } vp->v_flag |= VMOUNT; mtx_unlock(&vp->v_interlock); mp->mnt_flag |= fsflags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | MNT_SNAPSHOT); VOP_UNLOCK(vp, 0, td); goto update; } /* * If the user is not root, ensure that they own the directory * onto which we are attempting to mount. 
*/ error = VOP_GETATTR(vp, &va, td->td_proc->p_ucred, td); if (error) { vput(vp); return (error); } if (va.va_uid != td->td_proc->p_ucred->cr_uid) { error = suser_td(td); if (error) { vput(vp); return (error); } } if ((error = vinvalbuf(vp, V_SAVE, td->td_proc->p_ucred, td, 0, 0)) != 0) { vput(vp); return (error); } if (vp->v_type != VDIR) { vput(vp); return (ENOTDIR); } for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) if (!strcmp(vfsp->vfc_name, fstype)) break; if (vfsp == NULL) { linker_file_t lf; /* Only load modules for root (very important!) */ error = suser_td(td); if (error) { vput(vp); return error; } error = linker_load_file(fstype, &lf); if (error || lf == NULL) { vput(vp); if (lf == NULL) error = ENODEV; return error; } lf->userrefs++; /* lookup again, see if the VFS was loaded */ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) if (!strcmp(vfsp->vfc_name, fstype)) break; if (vfsp == NULL) { lf->userrefs--; linker_file_unload(lf); vput(vp); return (ENODEV); } } mtx_lock(&vp->v_interlock); if ((vp->v_flag & VMOUNT) != 0 || vp->v_mountedhere != NULL) { mtx_unlock(&vp->v_interlock); vput(vp); return (EBUSY); } vp->v_flag |= VMOUNT; mtx_unlock(&vp->v_interlock); /* * Allocate and initialize the filesystem. 
*/ mp = malloc(sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO); TAILQ_INIT(&mp->mnt_nvnodelist); TAILQ_INIT(&mp->mnt_reservedvnlist); lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE); (void)vfs_busy(mp, LK_NOWAIT, 0, td); mp->mnt_op = vfsp->vfc_vfsops; mp->mnt_vfc = vfsp; vfsp->vfc_refcount++; mp->mnt_stat.f_type = vfsp->vfc_typenum; mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; strncpy(mp->mnt_stat.f_fstypename, fstype, MFSNAMELEN); mp->mnt_stat.f_fstypename[MFSNAMELEN - 1] = '\0'; mp->mnt_vnodecovered = vp; mp->mnt_stat.f_owner = td->td_proc->p_ucred->cr_uid; strncpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN); mp->mnt_stat.f_mntonname[MNAMELEN - 1] = '\0'; mp->mnt_iosize_max = DFLTPHYS; VOP_UNLOCK(vp, 0, td); update: /* * Set the mount level flags. */ if (fsflags & MNT_RDONLY) mp->mnt_flag |= MNT_RDONLY; else if (mp->mnt_flag & MNT_RDONLY) mp->mnt_kern_flag |= MNTK_WANTRDWR; mp->mnt_flag &=~ MNT_UPDATEMASK; mp->mnt_flag |= fsflags & (MNT_UPDATEMASK | MNT_FORCE); /* * Mount the filesystem. * XXX The final recipients of VFS_MOUNT just overwrite the ndp they * get. No freeing of cn_pnbuf. */ error = VFS_MOUNT(mp, fspath, fsdata, &nd, td); if (mp->mnt_flag & MNT_UPDATE) { if (mp->mnt_kern_flag & MNTK_WANTRDWR) mp->mnt_flag &= ~MNT_RDONLY; mp->mnt_flag &=~ (MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_SNAPSHOT); mp->mnt_kern_flag &=~ MNTK_WANTRDWR; if (error) { mp->mnt_flag = flag; mp->mnt_kern_flag = flag2; } if ((mp->mnt_flag & MNT_RDONLY) == 0) { if (mp->mnt_syncer == NULL) error = vfs_allocate_syncvnode(mp); } else { if (mp->mnt_syncer != NULL) vrele(mp->mnt_syncer); mp->mnt_syncer = NULL; } vfs_unbusy(mp, td); mtx_lock(&vp->v_interlock); vp->v_flag &= ~VMOUNT; mtx_unlock(&vp->v_interlock); vrele(vp); return (error); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); /* * Put the new filesystem on the mount list after root. 
*/ cache_purge(vp); if (!error) { struct vnode *newdp; mtx_lock(&vp->v_interlock); vp->v_flag &= ~VMOUNT; vp->v_mountedhere = mp; mtx_unlock(&vp->v_interlock); mtx_lock(&mountlist_mtx); TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); if (VFS_ROOT(mp, &newdp)) panic("mount: lost mount"); checkdirs(vp, newdp); vput(newdp); VOP_UNLOCK(vp, 0, td); if ((mp->mnt_flag & MNT_RDONLY) == 0) error = vfs_allocate_syncvnode(mp); vfs_unbusy(mp, td); if ((error = VFS_START(mp, 0, td)) != 0) vrele(vp); } else { mtx_lock(&vp->v_interlock); vp->v_flag &= ~VMOUNT; mtx_unlock(&vp->v_interlock); mp->mnt_vfc->vfc_refcount--; vfs_unbusy(mp, td); free((caddr_t)mp, M_MOUNT); vput(vp); } return (error); } /* * Scan all active processes to see if any of them have a current * or root directory of `olddp'. If so, replace them with the new * mount point. */ static void checkdirs(olddp, newdp) struct vnode *olddp, *newdp; { struct filedesc *fdp; struct proc *p; if (olddp->v_usecount == 1) return; sx_slock(&allproc_lock); LIST_FOREACH(p, &allproc, p_list) { fdp = p->p_fd; if (fdp == NULL) continue; + FILEDESC_LOCK(fdp); if (fdp->fd_cdir == olddp) { - vrele(fdp->fd_cdir); VREF(newdp); fdp->fd_cdir = newdp; + FILEDESC_UNLOCK(fdp); + vrele(olddp); + FILEDESC_LOCK(fdp); } if (fdp->fd_rdir == olddp) { - vrele(fdp->fd_rdir); VREF(newdp); fdp->fd_rdir = newdp; - } + FILEDESC_UNLOCK(fdp); + vrele(olddp); + } else + FILEDESC_UNLOCK(fdp); } sx_sunlock(&allproc_lock); if (rootvnode == olddp) { vrele(rootvnode); VREF(newdp); rootvnode = newdp; } } /* * Unmount a file system. * * Note: unmount takes a path to the vnode mounted on as argument, * not special file (as before). 
*/ #ifndef _SYS_SYSPROTO_H_ struct unmount_args { char *path; int flags; }; #endif /* ARGSUSED */ int unmount(td, uap) struct thread *td; register struct unmount_args /* { syscallarg(char *) path; syscallarg(int) flags; } */ *uap; { register struct vnode *vp; struct mount *mp; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); mp = vp->v_mount; /* * Only root, or the user that did the original mount is * permitted to unmount this filesystem. */ if (mp->mnt_stat.f_owner != td->td_proc->p_ucred->cr_uid) { error = suser_td(td); if (error) { vput(vp); return (error); } } /* * Don't allow unmounting the root file system. */ if (mp->mnt_flag & MNT_ROOTFS) { vput(vp); return (EINVAL); } /* * Must be the root of the filesystem */ if ((vp->v_flag & VROOT) == 0) { vput(vp); return (EINVAL); } vput(vp); return (dounmount(mp, SCARG(uap, flags), td)); } /* * Do the actual file system unmount. */ int dounmount(mp, flags, td) struct mount *mp; int flags; struct thread *td; { struct vnode *coveredvp, *fsrootvp; int error; int async_flag; mtx_lock(&mountlist_mtx); mp->mnt_kern_flag |= MNTK_UNMOUNT; error = lockmgr(&mp->mnt_lock, LK_DRAIN | LK_INTERLOCK | ((flags & MNT_FORCE) ? 0 : LK_NOWAIT), &mountlist_mtx, td); if (error) { mp->mnt_kern_flag &= ~MNTK_UNMOUNT; if (mp->mnt_kern_flag & MNTK_MWAIT) wakeup((caddr_t)mp); return (error); } vn_start_write(NULL, &mp, V_WAIT); if (mp->mnt_flag & MNT_EXPUBLIC) vfs_setpublicfs(NULL, NULL, NULL); vfs_msync(mp, MNT_WAIT); async_flag = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &=~ MNT_ASYNC; cache_purgevfs(mp); /* remove cache entries for this file sys */ if (mp->mnt_syncer != NULL) vrele(mp->mnt_syncer); /* Move process cdir/rdir refs on fs root to underlying vnode. 
*/ if (VFS_ROOT(mp, &fsrootvp) == 0) { if (mp->mnt_vnodecovered != NULL) checkdirs(fsrootvp, mp->mnt_vnodecovered); if (fsrootvp == rootvnode) { vrele(rootvnode); rootvnode = NULL; } vput(fsrootvp); } if (((mp->mnt_flag & MNT_RDONLY) || (error = VFS_SYNC(mp, MNT_WAIT, td->td_proc->p_ucred, td)) == 0) || (flags & MNT_FORCE)) { error = VFS_UNMOUNT(mp, flags, td); } vn_finished_write(mp); if (error) { /* Undo cdir/rdir and rootvnode changes made above. */ if (VFS_ROOT(mp, &fsrootvp) == 0) { if (mp->mnt_vnodecovered != NULL) checkdirs(mp->mnt_vnodecovered, fsrootvp); if (rootvnode == NULL) { rootvnode = fsrootvp; vref(rootvnode); } vput(fsrootvp); } if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL) (void) vfs_allocate_syncvnode(mp); mtx_lock(&mountlist_mtx); mp->mnt_kern_flag &= ~MNTK_UNMOUNT; mp->mnt_flag |= async_flag; lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK, &mountlist_mtx, td); if (mp->mnt_kern_flag & MNTK_MWAIT) wakeup((caddr_t)mp); return (error); } mtx_lock(&mountlist_mtx); TAILQ_REMOVE(&mountlist, mp, mnt_list); if ((coveredvp = mp->mnt_vnodecovered) != NULL) coveredvp->v_mountedhere = NULL; mp->mnt_vfc->vfc_refcount--; if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) panic("unmount: dangling vnode"); lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK, &mountlist_mtx, td); lockdestroy(&mp->mnt_lock); if (coveredvp != NULL) vrele(coveredvp); if (mp->mnt_kern_flag & MNTK_MWAIT) wakeup((caddr_t)mp); free((caddr_t)mp, M_MOUNT); return (0); } /* * Sync each mounted filesystem. 
*/ #ifndef _SYS_SYSPROTO_H_ struct sync_args { int dummy; }; #endif #ifdef DEBUG static int syncprt = 0; SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, ""); #endif /* ARGSUSED */ int sync(td, uap) struct thread *td; struct sync_args *uap; { struct mount *mp, *nmp; int asyncflag; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } if ((mp->mnt_flag & MNT_RDONLY) == 0 && vn_start_write(NULL, &mp, V_NOWAIT) == 0) { asyncflag = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &= ~MNT_ASYNC; vfs_msync(mp, MNT_NOWAIT); VFS_SYNC(mp, MNT_NOWAIT, ((td != NULL) ? td->td_proc->p_ucred : NOCRED), td); mp->mnt_flag |= asyncflag; vn_finished_write(mp); } mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, td); } mtx_unlock(&mountlist_mtx); #if 0 /* * XXX don't call vfs_bufstats() yet because that routine * was not imported in the Lite2 merge. */ #ifdef DIAGNOSTIC if (syncprt) vfs_bufstats(); #endif /* DIAGNOSTIC */ #endif return (0); } /* XXX PRISON: could be per prison flag */ static int prison_quotas; #if 0 SYSCTL_INT(_kern_prison, OID_AUTO, quotas, CTLFLAG_RW, &prison_quotas, 0, ""); #endif /* * Change filesystem quotas. 
*/ #ifndef _SYS_SYSPROTO_H_ struct quotactl_args { char *path; int cmd; int uid; caddr_t arg; }; #endif /* ARGSUSED */ int quotactl(td, uap) struct thread *td; register struct quotactl_args /* { syscallarg(char *) path; syscallarg(int) cmd; syscallarg(int) uid; syscallarg(caddr_t) arg; } */ *uap; { struct mount *mp; int error; struct nameidata nd; if (jailed(td->td_proc->p_ucred) && !prison_quotas) return (EPERM); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH); vrele(nd.ni_vp); if (error) return (error); error = VFS_QUOTACTL(mp, SCARG(uap, cmd), SCARG(uap, uid), SCARG(uap, arg), td); vn_finished_write(mp); return (error); } /* * Get filesystem statistics. */ #ifndef _SYS_SYSPROTO_H_ struct statfs_args { char *path; struct statfs *buf; }; #endif /* ARGSUSED */ int statfs(td, uap) struct thread *td; register struct statfs_args /* { syscallarg(char *) path; syscallarg(struct statfs *) buf; } */ *uap; { register struct mount *mp; register struct statfs *sp; int error; struct nameidata nd; struct statfs sb; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); mp = nd.ni_vp->v_mount; sp = &mp->mnt_stat; NDFREE(&nd, NDF_ONLY_PNBUF); vrele(nd.ni_vp); error = VFS_STATFS(mp, sp, td); if (error) return (error); sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; if (suser_xxx(td->td_proc->p_ucred, 0, 0)) { bcopy((caddr_t)sp, (caddr_t)&sb, sizeof(sb)); sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; sp = &sb; } return (copyout((caddr_t)sp, (caddr_t)SCARG(uap, buf), sizeof(*sp))); } /* * Get filesystem statistics. 
*/ #ifndef _SYS_SYSPROTO_H_ struct fstatfs_args { int fd; struct statfs *buf; }; #endif /* ARGSUSED */ int fstatfs(td, uap) struct thread *td; register struct fstatfs_args /* { syscallarg(int) fd; syscallarg(struct statfs *) buf; } */ *uap; { struct file *fp; struct mount *mp; register struct statfs *sp; int error; struct statfs sb; if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); mp = ((struct vnode *)fp->f_data)->v_mount; + fdrop(fp, td); if (mp == NULL) return (EBADF); sp = &mp->mnt_stat; error = VFS_STATFS(mp, sp, td); if (error) return (error); sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; if (suser_xxx(td->td_proc->p_ucred, 0, 0)) { bcopy((caddr_t)sp, (caddr_t)&sb, sizeof(sb)); sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; sp = &sb; } return (copyout((caddr_t)sp, (caddr_t)SCARG(uap, buf), sizeof(*sp))); } /* * Get statistics on all filesystems. */ #ifndef _SYS_SYSPROTO_H_ struct getfsstat_args { struct statfs *buf; long bufsize; int flags; }; #endif int getfsstat(td, uap) struct thread *td; register struct getfsstat_args /* { syscallarg(struct statfs *) buf; syscallarg(long) bufsize; syscallarg(int) flags; } */ *uap; { register struct mount *mp, *nmp; register struct statfs *sp; caddr_t sfsp; long count, maxcount, error; maxcount = SCARG(uap, bufsize) / sizeof(struct statfs); sfsp = (caddr_t)SCARG(uap, buf); count = 0; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } if (sfsp && count < maxcount) { sp = &mp->mnt_stat; /* * If MNT_NOWAIT or MNT_LAZY is specified, do not * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY * overrides MNT_WAIT. 
*/ if (((SCARG(uap, flags) & (MNT_LAZY|MNT_NOWAIT)) == 0 || (SCARG(uap, flags) & MNT_WAIT)) && (error = VFS_STATFS(mp, sp, td))) { mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, td); continue; } sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; error = copyout((caddr_t)sp, sfsp, sizeof(*sp)); if (error) { vfs_unbusy(mp, td); return (error); } sfsp += sizeof(*sp); } count++; mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, td); } mtx_unlock(&mountlist_mtx); if (sfsp && count > maxcount) td->td_retval[0] = maxcount; else td->td_retval[0] = count; return (0); } /* * Change current working directory to a given file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchdir_args { int fd; }; #endif /* ARGSUSED */ int fchdir(td, uap) struct thread *td; struct fchdir_args /* { syscallarg(int) fd; } */ *uap; { register struct filedesc *fdp = td->td_proc->p_fd; - struct vnode *vp, *tdp; + struct vnode *vp, *tdp, *vpold; struct mount *mp; struct file *fp; int error; if ((error = getvnode(fdp, SCARG(uap, fd), &fp)) != 0) return (error); vp = (struct vnode *)fp->f_data; VREF(vp); + fdrop(fp, td); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (vp->v_type != VDIR) error = ENOTDIR; else error = VOP_ACCESS(vp, VEXEC, td->td_proc->p_ucred, td); while (!error && (mp = vp->v_mountedhere) != NULL) { if (vfs_busy(mp, 0, 0, td)) continue; error = VFS_ROOT(mp, &tdp); vfs_unbusy(mp, td); if (error) break; vput(vp); vp = tdp; } if (error) { vput(vp); return (error); } VOP_UNLOCK(vp, 0, td); - vrele(fdp->fd_cdir); + FILEDESC_LOCK(fdp); + vpold = fdp->fd_cdir; fdp->fd_cdir = vp; + FILEDESC_UNLOCK(fdp); + vrele(vpold); return (0); } /* * Change current working directory (``.''). 
*/ #ifndef _SYS_SYSPROTO_H_ struct chdir_args { char *path; }; #endif /* ARGSUSED */ int chdir(td, uap) struct thread *td; struct chdir_args /* { syscallarg(char *) path; } */ *uap; { register struct filedesc *fdp = td->td_proc->p_fd; int error; struct nameidata nd; + struct vnode *vp; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, SCARG(uap, path), td); if ((error = change_dir(&nd, td)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); - vrele(fdp->fd_cdir); + FILEDESC_LOCK(fdp); + vp = fdp->fd_cdir; fdp->fd_cdir = nd.ni_vp; + FILEDESC_UNLOCK(fdp); + vrele(vp); return (0); } /* * Helper function for raised chroot(2) security function: Refuse if * any filedescriptors are open directories. */ static int chroot_refuse_vdir_fds(fdp) struct filedesc *fdp; { struct vnode *vp; struct file *fp; + struct thread *td = curthread; int error; int fd; + FILEDESC_LOCK(fdp); for (fd = 0; fd < fdp->fd_nfiles ; fd++) { error = getvnode(fdp, fd, &fp); if (error) continue; vp = (struct vnode *)fp->f_data; + fdrop(fp, td); if (vp->v_type != VDIR) continue; + FILEDESC_UNLOCK(fdp); return(EPERM); } + FILEDESC_UNLOCK(fdp); return (0); } /* * This sysctl determines if we will allow a process to chroot(2) if it * has a directory open: * 0: disallowed for all processes. * 1: allowed for processes that were not already chroot(2)'ed. * 2: allowed for all processes. */ static int chroot_allow_open_directories = 1; SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW, &chroot_allow_open_directories, 0, ""); /* * Change notion of root (``/'') directory. 
*/ #ifndef _SYS_SYSPROTO_H_ struct chroot_args { char *path; }; #endif /* ARGSUSED */ int chroot(td, uap) struct thread *td; struct chroot_args /* { syscallarg(char *) path; } */ *uap; { register struct filedesc *fdp = td->td_proc->p_fd; int error; struct nameidata nd; + struct vnode *vp; error = suser_xxx(0, td->td_proc, PRISON_ROOT); if (error) return (error); + FILEDESC_LOCK(fdp); if (chroot_allow_open_directories == 0 || - (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) + (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) { + FILEDESC_UNLOCK(fdp); error = chroot_refuse_vdir_fds(fdp); + } else + FILEDESC_UNLOCK(fdp); if (error) return (error); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, SCARG(uap, path), td); if ((error = change_dir(&nd, td)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); - vrele(fdp->fd_rdir); + FILEDESC_LOCK(fdp); + vp = fdp->fd_rdir; fdp->fd_rdir = nd.ni_vp; if (!fdp->fd_jdir) { fdp->fd_jdir = nd.ni_vp; VREF(fdp->fd_jdir); } + FILEDESC_UNLOCK(fdp); + vrele(vp); return (0); } /* * Common routine for chroot and chdir. */ static int change_dir(ndp, td) register struct nameidata *ndp; struct thread *td; { struct vnode *vp; int error; error = namei(ndp); if (error) return (error); vp = ndp->ni_vp; if (vp->v_type != VDIR) error = ENOTDIR; else error = VOP_ACCESS(vp, VEXEC, td->td_proc->p_ucred, td); if (error) vput(vp); else VOP_UNLOCK(vp, 0, td); return (error); } /* * Check permissions, allocate an open file structure, * and call the device open routine if any. 
*/ #ifndef _SYS_SYSPROTO_H_ struct open_args { char *path; int flags; int mode; }; #endif int open(td, uap) struct thread *td; register struct open_args /* { syscallarg(char *) path; syscallarg(int) flags; syscallarg(int) mode; } */ *uap; { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; struct file *fp; struct vnode *vp; struct vattr vat; struct mount *mp; int cmode, flags, oflags; struct file *nfp; int type, indx, error; struct flock lf; struct nameidata nd; oflags = SCARG(uap, flags); if ((oflags & O_ACCMODE) == O_ACCMODE) return (EINVAL); flags = FFLAGS(oflags); error = falloc(td, &nfp, &indx); if (error) return (error); fp = nfp; + FILEDESC_LOCK(fdp); cmode = ((SCARG(uap, mode) &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT; + FILEDESC_UNLOCK(fdp); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); td->td_dupfd = -indx - 1; /* XXX check for fdopen */ /* * Bump the ref count to prevent another process from closing * the descriptor while we are blocked in vn_open() */ fhold(fp); error = vn_open(&nd, &flags, cmode); if (error) { /* * release our own reference */ fdrop(fp, td); /* * handle special fdopen() case. bleh. dupfdopen() is * responsible for dropping the old contents of ofiles[indx] * if it succeeds. */ if ((error == ENODEV || error == ENXIO) && td->td_dupfd >= 0 && /* XXX from fdopen */ (error = dupfdopen(td, fdp, indx, td->td_dupfd, flags, error)) == 0) { td->td_retval[0] = indx; return (0); } /* * Clean up the descriptor, but only if another thread hadn't * replaced or closed it. */ + FILEDESC_LOCK(fdp); if (fdp->fd_ofiles[indx] == fp) { fdp->fd_ofiles[indx] = NULL; + FILEDESC_UNLOCK(fdp); fdrop(fp, td); - } + } else + FILEDESC_UNLOCK(fdp); if (error == ERESTART) error = EINTR; return (error); } td->td_dupfd = 0; NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; /* * There should be 2 references on the file, one from the descriptor * table, and one for us. 
* * Handle the case where someone closed the file (via its file * descriptor) while we were blocked. The end result should look * like opening the file succeeded but it was immediately closed. */ + FILEDESC_LOCK(fdp); + FILE_LOCK(fp); if (fp->f_count == 1) { KASSERT(fdp->fd_ofiles[indx] != fp, ("Open file descriptor lost all refs")); + FILEDESC_UNLOCK(fdp); + FILE_UNLOCK(fp); VOP_UNLOCK(vp, 0, td); vn_close(vp, flags & FMASK, fp->f_cred, td); fdrop(fp, td); td->td_retval[0] = indx; return 0; } fp->f_data = (caddr_t)vp; fp->f_flag = flags & FMASK; fp->f_ops = &vnops; fp->f_type = (vp->v_type == VFIFO ? DTYPE_FIFO : DTYPE_VNODE); + FILEDESC_UNLOCK(fdp); + FILE_UNLOCK(fp); VOP_UNLOCK(vp, 0, td); if (flags & (O_EXLOCK | O_SHLOCK)) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (flags & O_EXLOCK) lf.l_type = F_WRLCK; else lf.l_type = F_RDLCK; type = F_FLOCK; if ((flags & FNONBLOCK) == 0) type |= F_WAIT; if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) goto bad; fp->f_flag |= FHASLOCK; } if (flags & O_TRUNC) { if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto bad; VOP_LEASE(vp, td, p->p_ucred, LEASE_WRITE); VATTR_NULL(&vat); vat.va_size = 0; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_SETATTR(vp, &vat, p->p_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); if (error) goto bad; } /* assert that vn_open created a backing object if one is needed */ KASSERT(!vn_canvmio(vp) || VOP_GETVOBJECT(vp, NULL) == 0, ("open: vmio vnode has no backing object after vn_open")); /* * Release our private reference, leaving the one associated with * the descriptor table intact. */ fdrop(fp, td); td->td_retval[0] = indx; return (0); bad: + FILEDESC_LOCK(fdp); if (fdp->fd_ofiles[indx] == fp) { fdp->fd_ofiles[indx] = NULL; + FILEDESC_UNLOCK(fdp); fdrop(fp, td); - } - fdrop(fp, td); + } else + FILEDESC_UNLOCK(fdp); return (error); } #ifdef COMPAT_43 /* * Create a file. 
*/ #ifndef _SYS_SYSPROTO_H_ struct ocreat_args { char *path; int mode; }; #endif int ocreat(td, uap) struct thread *td; register struct ocreat_args /* { syscallarg(char *) path; syscallarg(int) mode; } */ *uap; { struct open_args /* { syscallarg(char *) path; syscallarg(int) flags; syscallarg(int) mode; } */ nuap; SCARG(&nuap, path) = SCARG(uap, path); SCARG(&nuap, mode) = SCARG(uap, mode); SCARG(&nuap, flags) = O_WRONLY | O_CREAT | O_TRUNC; return (open(td, &nuap)); } #endif /* COMPAT_43 */ /* * Create a special file. */ #ifndef _SYS_SYSPROTO_H_ struct mknod_args { char *path; int mode; int dev; }; #endif /* ARGSUSED */ int mknod(td, uap) struct thread *td; register struct mknod_args /* { syscallarg(char *) path; syscallarg(int) mode; syscallarg(int) dev; } */ *uap; { struct vnode *vp; struct mount *mp; struct vattr vattr; int error; int whiteout = 0; struct nameidata nd; switch (SCARG(uap, mode) & S_IFMT) { case S_IFCHR: case S_IFBLK: error = suser_td(td); break; default: error = suser_xxx(0, td->td_proc, PRISON_ROOT); break; } if (error) return (error); restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if (vp != NULL) { vrele(vp); error = EEXIST; } else { VATTR_NULL(&vattr); + FILEDESC_LOCK(td->td_proc->p_fd); vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ td->td_proc->p_fd->fd_cmask; + FILEDESC_UNLOCK(td->td_proc->p_fd); vattr.va_rdev = SCARG(uap, dev); whiteout = 0; switch (SCARG(uap, mode) & S_IFMT) { case S_IFMT: /* used by badsect to flag bad sectors */ vattr.va_type = VBAD; break; case S_IFCHR: vattr.va_type = VCHR; break; case S_IFBLK: vattr.va_type = VBLK; break; case S_IFWHT: whiteout = 1; break; default: error = EINVAL; break; } } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } if (!error) { 
VOP_LEASE(nd.ni_dvp, td, td->td_proc->p_ucred, LEASE_WRITE); if (whiteout) error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE); else { error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); if (error == 0) vput(nd.ni_vp); } } NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mknod"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "mknod"); return (error); } /* * Create a named pipe. */ #ifndef _SYS_SYSPROTO_H_ struct mkfifo_args { char *path; int mode; }; #endif /* ARGSUSED */ int mkfifo(td, uap) struct thread *td; register struct mkfifo_args /* { syscallarg(char *) path; syscallarg(int) mode; } */ *uap; { struct mount *mp; struct vattr vattr; int error; struct nameidata nd; restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); if (nd.ni_vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); vrele(nd.ni_vp); vput(nd.ni_dvp); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); vattr.va_type = VFIFO; + FILEDESC_LOCK(td->td_proc->p_fd); vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ td->td_proc->p_fd->fd_cmask; + FILEDESC_UNLOCK(td->td_proc->p_fd); VOP_LEASE(nd.ni_dvp, td, td->td_proc->p_ucred, LEASE_WRITE); error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); if (error == 0) vput(nd.ni_vp); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); return (error); } /* * Make a hard file link. 
*/ #ifndef _SYS_SYSPROTO_H_ struct link_args { char *path; char *link; }; #endif /* ARGSUSED */ int link(td, uap) struct thread *td; register struct link_args /* { syscallarg(char *) path; syscallarg(char *) link; } */ *uap; { struct vnode *vp; struct mount *mp; struct nameidata nd; int error; bwillwrite(); NDINIT(&nd, LOOKUP, FOLLOW|NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (vp->v_type == VDIR) { vrele(vp); return (EPERM); /* POSIX */ } if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { vrele(vp); return (error); } NDINIT(&nd, CREATE, LOCKPARENT|NOOBJ, UIO_USERSPACE, SCARG(uap, link), td); if ((error = namei(&nd)) == 0) { if (nd.ni_vp != NULL) { vrele(nd.ni_vp); error = EEXIST; } else { VOP_LEASE(nd.ni_dvp, td, td->td_proc->p_ucred, LEASE_WRITE); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd); } NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); } vrele(vp); vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "link"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "link"); return (error); } /* * Make a symbolic link. 
*/ #ifndef _SYS_SYSPROTO_H_ struct symlink_args { char *path; char *link; }; #endif /* ARGSUSED */ int symlink(td, uap) struct thread *td; register struct symlink_args /* { syscallarg(char *) path; syscallarg(char *) link; } */ *uap; { struct mount *mp; struct vattr vattr; char *path; int error; struct nameidata nd; path = zalloc(namei_zone); if ((error = copyinstr(SCARG(uap, path), path, MAXPATHLEN, NULL)) != 0) goto out; restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT|NOOBJ, UIO_USERSPACE, SCARG(uap, link), td); if ((error = namei(&nd)) != 0) goto out; if (nd.ni_vp) { NDFREE(&nd, NDF_ONLY_PNBUF); vrele(nd.ni_vp); vput(nd.ni_dvp); error = EEXIST; goto out; } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); + FILEDESC_LOCK(td->td_proc->p_fd); vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask; + FILEDESC_UNLOCK(td->td_proc->p_fd); VOP_LEASE(nd.ni_dvp, td, td->td_proc->p_ucred, LEASE_WRITE); error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path); NDFREE(&nd, NDF_ONLY_PNBUF); if (error == 0) vput(nd.ni_vp); vput(nd.ni_dvp); vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "symlink"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "symlink"); out: zfree(namei_zone, path); return (error); } /* * Delete a whiteout from the filesystem. 
*/ /* ARGSUSED */ int undelete(td, uap) struct thread *td; register struct undelete_args /* { syscallarg(char *) path; } */ *uap; { int error; struct mount *mp; struct nameidata nd; restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT|DOWHITEOUT, UIO_USERSPACE, SCARG(uap, path), td); error = namei(&nd); if (error) return (error); if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp) vrele(nd.ni_vp); vput(nd.ni_dvp); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VOP_LEASE(nd.ni_dvp, td, td->td_proc->p_ucred, LEASE_WRITE); error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "undelete"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "undelete"); return (error); } /* * Delete a name from the filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct unlink_args { char *path; }; #endif /* ARGSUSED */ int unlink(td, uap) struct thread *td; struct unlink_args /* { syscallarg(char *) path; } */ *uap; { struct mount *mp; struct vnode *vp; int error; struct nameidata nd; restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if (vp->v_type == VDIR) error = EPERM; /* POSIX */ else { /* * The root of a mounted filesystem cannot be deleted. * * XXX: can this only be a VDIR case? 
*/ if (vp->v_flag & VROOT) error = EBUSY; } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vrele(vp); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (!error) { VOP_LEASE(nd.ni_dvp, td, td->td_proc->p_ucred, LEASE_WRITE); error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); } NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vput(vp); vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "unlink"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "unlink"); return (error); } /* * Reposition read/write file offset. */ #ifndef _SYS_SYSPROTO_H_ struct lseek_args { int fd; int pad; off_t offset; int whence; }; #endif int lseek(td, uap) struct thread *td; register struct lseek_args /* { syscallarg(int) fd; syscallarg(int) pad; syscallarg(off_t) offset; syscallarg(int) whence; } */ *uap; { struct ucred *cred = td->td_proc->p_ucred; - register struct filedesc *fdp = td->td_proc->p_fd; register struct file *fp; - struct vattr vattr; struct vnode *vp; + struct vattr vattr; off_t offset; int error, noneg; - if ((u_int)SCARG(uap, fd) >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL) + fp = ffind_hold(td, uap->fd); + if (fp == NULL) return (EBADF); - if (fp->f_type != DTYPE_VNODE) + if (fp->f_type != DTYPE_VNODE) { + fdrop(fp, td); return (ESPIPE); + } vp = (struct vnode *)fp->f_data; noneg = (vp->v_type != VCHR); offset = SCARG(uap, offset); switch (SCARG(uap, whence)) { case L_INCR: if (noneg && (fp->f_offset < 0 || (offset > 0 && fp->f_offset > OFF_MAX - offset))) return (EOVERFLOW); offset += fp->f_offset; break; case L_XTND: error = VOP_GETATTR(vp, &vattr, cred, td); if (error) return (error); if (noneg && (vattr.va_size > OFF_MAX || (offset > 0 && vattr.va_size > OFF_MAX - offset))) return (EOVERFLOW); offset += vattr.va_size; break; case L_SET: break; default: + fdrop(fp, td); 
return (EINVAL); } if (noneg && offset < 0) return (EINVAL); fp->f_offset = offset; *(off_t *)(td->td_retval) = fp->f_offset; + fdrop(fp, td); return (0); } #if defined(COMPAT_43) || defined(COMPAT_SUNOS) /* * Reposition read/write file offset. */ #ifndef _SYS_SYSPROTO_H_ struct olseek_args { int fd; long offset; int whence; }; #endif int olseek(td, uap) struct thread *td; register struct olseek_args /* { syscallarg(int) fd; syscallarg(long) offset; syscallarg(int) whence; } */ *uap; { struct lseek_args /* { syscallarg(int) fd; syscallarg(int) pad; syscallarg(off_t) offset; syscallarg(int) whence; } */ nuap; int error; SCARG(&nuap, fd) = SCARG(uap, fd); SCARG(&nuap, offset) = SCARG(uap, offset); SCARG(&nuap, whence) = SCARG(uap, whence); error = lseek(td, &nuap); return (error); } #endif /* COMPAT_43 */ /* * Check access permissions using passed credentials. */ static int vn_access(vp, user_flags, cred, td) struct vnode *vp; int user_flags; struct ucred *cred; struct thread *td; { int error, flags; /* Flags == 0 means only check for existence. */ error = 0; if (user_flags) { flags = 0; if (user_flags & R_OK) flags |= VREAD; if (user_flags & W_OK) flags |= VWRITE; if (user_flags & X_OK) flags |= VEXEC; if ((flags & VWRITE) == 0 || (error = vn_writechk(vp)) == 0) error = VOP_ACCESS(vp, flags, cred, td); } return (error); } /* * Check access permissions using "real" credentials. */ #ifndef _SYS_SYSPROTO_H_ struct access_args { char *path; int flags; }; #endif int access(td, uap) struct thread *td; register struct access_args /* { syscallarg(char *) path; syscallarg(int) flags; } */ *uap; { struct ucred *cred, *tmpcred; register struct vnode *vp; int error; struct nameidata nd; cred = td->td_proc->p_ucred; /* * Create and modify a temporary credential instead of one that * is potentially shared. This could also mess up socket * buffer accounting which can run in an interrupt context. 
* * XXX - Depending on how "threads" are finally implemented, it * may be better to explicitly pass the credential to namei() * rather than to modify the potentially shared process structure. */ tmpcred = crdup(cred); tmpcred->cr_uid = cred->cr_ruid; tmpcred->cr_groups[0] = cred->cr_rgid; td->td_proc->p_ucred = tmpcred; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) goto out1; vp = nd.ni_vp; error = vn_access(vp, SCARG(uap, flags), tmpcred, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); out1: td->td_proc->p_ucred = cred; crfree(tmpcred); return (error); } /* * Check access permissions using "effective" credentials. */ #ifndef _SYS_SYSPROTO_H_ struct eaccess_args { char *path; int flags; }; #endif int eaccess(td, uap) struct thread *td; register struct eaccess_args /* { syscallarg(char *) path; syscallarg(int) flags; } */ *uap; { struct nameidata nd; struct vnode *vp; int error; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; error = vn_access(vp, SCARG(uap, flags), td->td_proc->p_ucred, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); return (error); } #if defined(COMPAT_43) || defined(COMPAT_SUNOS) /* * Get file status; this version follows links. 
*/ #ifndef _SYS_SYSPROTO_H_ struct ostat_args { char *path; struct ostat *ub; }; #endif /* ARGSUSED */ int ostat(td, uap) struct thread *td; register struct ostat_args /* { syscallarg(char *) path; syscallarg(struct ostat *) ub; } */ *uap; { struct stat sb; struct ostat osb; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_stat(nd.ni_vp, &sb, td); vput(nd.ni_vp); if (error) return (error); cvtstat(&sb, &osb); error = copyout((caddr_t)&osb, (caddr_t)SCARG(uap, ub), sizeof (osb)); return (error); } /* * Get file status; this version does not follow links. */ #ifndef _SYS_SYSPROTO_H_ struct olstat_args { char *path; struct ostat *ub; }; #endif /* ARGSUSED */ int olstat(td, uap) struct thread *td; register struct olstat_args /* { syscallarg(char *) path; syscallarg(struct ostat *) ub; } */ *uap; { struct vnode *vp; struct stat sb; struct ostat osb; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; error = vn_stat(vp, &sb, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); if (error) return (error); cvtstat(&sb, &osb); error = copyout((caddr_t)&osb, (caddr_t)SCARG(uap, ub), sizeof (osb)); return (error); } /* * Convert from an old to a new stat structure. 
*/ void cvtstat(st, ost) struct stat *st; struct ostat *ost; { ost->st_dev = st->st_dev; ost->st_ino = st->st_ino; ost->st_mode = st->st_mode; ost->st_nlink = st->st_nlink; ost->st_uid = st->st_uid; ost->st_gid = st->st_gid; ost->st_rdev = st->st_rdev; if (st->st_size < (quad_t)1 << 32) ost->st_size = st->st_size; else ost->st_size = -2; ost->st_atime = st->st_atime; ost->st_mtime = st->st_mtime; ost->st_ctime = st->st_ctime; ost->st_blksize = st->st_blksize; ost->st_blocks = st->st_blocks; ost->st_flags = st->st_flags; ost->st_gen = st->st_gen; } #endif /* COMPAT_43 || COMPAT_SUNOS */ /* * Get file status; this version follows links. */ #ifndef _SYS_SYSPROTO_H_ struct stat_args { char *path; struct stat *ub; }; #endif /* ARGSUSED */ int stat(td, uap) struct thread *td; register struct stat_args /* { syscallarg(char *) path; syscallarg(struct stat *) ub; } */ *uap; { struct stat sb; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); error = vn_stat(nd.ni_vp, &sb, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_vp); if (error) return (error); error = copyout((caddr_t)&sb, (caddr_t)SCARG(uap, ub), sizeof (sb)); return (error); } /* * Get file status; this version does not follow links. 
*/ #ifndef _SYS_SYSPROTO_H_ struct lstat_args { char *path; struct stat *ub; }; #endif /* ARGSUSED */ int lstat(td, uap) struct thread *td; register struct lstat_args /* { syscallarg(char *) path; syscallarg(struct stat *) ub; } */ *uap; { int error; struct vnode *vp; struct stat sb; struct nameidata nd; NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; error = vn_stat(vp, &sb, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); if (error) return (error); error = copyout((caddr_t)&sb, (caddr_t)SCARG(uap, ub), sizeof (sb)); return (error); } /* * Implementation of the NetBSD stat() function. * XXX This should probably be collapsed with the FreeBSD version, * as the differences are only due to vn_stat() clearing spares at * the end of the structures. vn_stat could be split to avoid this, * and thus collapse the following to close to zero code. */ void cvtnstat(sb, nsb) struct stat *sb; struct nstat *nsb; { nsb->st_dev = sb->st_dev; nsb->st_ino = sb->st_ino; nsb->st_mode = sb->st_mode; nsb->st_nlink = sb->st_nlink; nsb->st_uid = sb->st_uid; nsb->st_gid = sb->st_gid; nsb->st_rdev = sb->st_rdev; nsb->st_atimespec = sb->st_atimespec; nsb->st_mtimespec = sb->st_mtimespec; nsb->st_ctimespec = sb->st_ctimespec; nsb->st_size = sb->st_size; nsb->st_blocks = sb->st_blocks; nsb->st_blksize = sb->st_blksize; nsb->st_flags = sb->st_flags; nsb->st_gen = sb->st_gen; nsb->st_qspare[0] = sb->st_qspare[0]; nsb->st_qspare[1] = sb->st_qspare[1]; } #ifndef _SYS_SYSPROTO_H_ struct nstat_args { char *path; struct nstat *ub; }; #endif /* ARGSUSED */ int nstat(td, uap) struct thread *td; register struct nstat_args /* { syscallarg(char *) path; syscallarg(struct nstat *) ub; } */ *uap; { struct stat sb; struct nstat nsb; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, 
NDF_ONLY_PNBUF); error = vn_stat(nd.ni_vp, &sb, td); vput(nd.ni_vp); if (error) return (error); cvtnstat(&sb, &nsb); error = copyout((caddr_t)&nsb, (caddr_t)SCARG(uap, ub), sizeof (nsb)); return (error); } /* * NetBSD lstat. Get file status; this version does not follow links. */ #ifndef _SYS_SYSPROTO_H_ struct lstat_args { char *path; struct stat *ub; }; #endif /* ARGSUSED */ int nlstat(td, uap) struct thread *td; register struct nlstat_args /* { syscallarg(char *) path; syscallarg(struct nstat *) ub; } */ *uap; { int error; struct vnode *vp; struct stat sb; struct nstat nsb; struct nameidata nd; NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_stat(vp, &sb, td); vput(vp); if (error) return (error); cvtnstat(&sb, &nsb); error = copyout((caddr_t)&nsb, (caddr_t)SCARG(uap, ub), sizeof (nsb)); return (error); } /* * Get configurable pathname variables. */ #ifndef _SYS_SYSPROTO_H_ struct pathconf_args { char *path; int name; }; #endif /* ARGSUSED */ int pathconf(td, uap) struct thread *td; register struct pathconf_args /* { syscallarg(char *) path; syscallarg(int) name; } */ *uap; { int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = VOP_PATHCONF(nd.ni_vp, SCARG(uap, name), td->td_retval); vput(nd.ni_vp); return (error); } /* * Return target name of a symbolic link. 
*/ #ifndef _SYS_SYSPROTO_H_ struct readlink_args { char *path; char *buf; int count; }; #endif /* ARGSUSED */ int readlink(td, uap) struct thread *td; register struct readlink_args /* { syscallarg(char *) path; syscallarg(char *) buf; syscallarg(int) count; } */ *uap; { register struct vnode *vp; struct iovec aiov; struct uio auio; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (vp->v_type != VLNK) error = EINVAL; else { aiov.iov_base = SCARG(uap, buf); aiov.iov_len = SCARG(uap, count); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auio.uio_resid = SCARG(uap, count); error = VOP_READLINK(vp, &auio, td->td_proc->p_ucred); } vput(vp); td->td_retval[0] = SCARG(uap, count) - auio.uio_resid; return (error); } /* * Common implementation code for chflags() and fchflags(). */ static int setfflags(td, vp, flags) struct thread *td; struct vnode *vp; int flags; { int error; struct mount *mp; struct vattr vattr; /* * Prevent non-root users from setting flags on devices. When * a device is reused, users can retain ownership of the device * if they are allowed to set flags and programs assume that * chown can't fail when done as root. */ if (vp->v_type == VCHR || vp->v_type == VBLK) { error = suser_xxx(td->td_proc->p_ucred, td->td_proc, PRISON_ROOT); if (error) return (error); } if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); VATTR_NULL(&vattr); vattr.va_flags = flags; error = VOP_SETATTR(vp, &vattr, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return (error); } /* * Change flags of a file given a path name. 
*/ #ifndef _SYS_SYSPROTO_H_ struct chflags_args { char *path; int flags; }; #endif /* ARGSUSED */ int chflags(td, uap) struct thread *td; register struct chflags_args /* { syscallarg(char *) path; syscallarg(int) flags; } */ *uap; { int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfflags(td, nd.ni_vp, SCARG(uap, flags)); vrele(nd.ni_vp); return error; } /* * Change flags of a file given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchflags_args { int fd; int flags; }; #endif /* ARGSUSED */ int fchflags(td, uap) struct thread *td; register struct fchflags_args /* { syscallarg(int) fd; syscallarg(int) flags; } */ *uap; { struct file *fp; int error; if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); - return setfflags(td, (struct vnode *) fp->f_data, SCARG(uap, flags)); + error = setfflags(td, (struct vnode *) fp->f_data, SCARG(uap, flags)); + fdrop(fp, td); + return (error); } /* * Common implementation code for chmod(), lchmod() and fchmod(). */ static int setfmode(td, vp, mode) struct thread *td; struct vnode *vp; int mode; { int error; struct mount *mp; struct vattr vattr; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); VATTR_NULL(&vattr); vattr.va_mode = mode & ALLPERMS; error = VOP_SETATTR(vp, &vattr, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return error; } /* * Change mode of a file given path name. 
*/ #ifndef _SYS_SYSPROTO_H_ struct chmod_args { char *path; int mode; }; #endif /* ARGSUSED */ int chmod(td, uap) struct thread *td; register struct chmod_args /* { syscallarg(char *) path; syscallarg(int) mode; } */ *uap; { int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfmode(td, nd.ni_vp, SCARG(uap, mode)); vrele(nd.ni_vp); return error; } /* * Change mode of a file given path name (don't follow links.) */ #ifndef _SYS_SYSPROTO_H_ struct lchmod_args { char *path; int mode; }; #endif /* ARGSUSED */ int lchmod(td, uap) struct thread *td; register struct lchmod_args /* { syscallarg(char *) path; syscallarg(int) mode; } */ *uap; { int error; struct nameidata nd; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfmode(td, nd.ni_vp, SCARG(uap, mode)); vrele(nd.ni_vp); return error; } /* * Change mode of a file given a file descriptor. 
*/ #ifndef _SYS_SYSPROTO_H_ struct fchmod_args { int fd; int mode; }; #endif /* ARGSUSED */ int fchmod(td, uap) struct thread *td; register struct fchmod_args /* { syscallarg(int) fd; syscallarg(int) mode; } */ *uap; { struct file *fp; + struct vnode *vp; int error; if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); - return setfmode(td, (struct vnode *)fp->f_data, SCARG(uap, mode)); + vp = (struct vnode *)fp->f_data; + error = setfmode(td, (struct vnode *)fp->f_data, SCARG(uap, mode)); + fdrop(fp, td); + return (error); } /* * Common implementation for chown(), lchown(), and fchown() */ static int setfown(td, vp, uid, gid) struct thread *td; struct vnode *vp; uid_t uid; gid_t gid; { int error; struct mount *mp; struct vattr vattr; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); VATTR_NULL(&vattr); vattr.va_uid = uid; vattr.va_gid = gid; error = VOP_SETATTR(vp, &vattr, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return error; } /* * Set ownership given a path name. */ #ifndef _SYS_SYSPROTO_H_ struct chown_args { char *path; int uid; int gid; }; #endif /* ARGSUSED */ int chown(td, uap) struct thread *td; register struct chown_args /* { syscallarg(char *) path; syscallarg(int) uid; syscallarg(int) gid; } */ *uap; { int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfown(td, nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid)); vrele(nd.ni_vp); return (error); } /* * Set ownership given a path name, do not cross symlinks. 
*/ #ifndef _SYS_SYSPROTO_H_ struct lchown_args { char *path; int uid; int gid; }; #endif /* ARGSUSED */ int lchown(td, uap) struct thread *td; register struct lchown_args /* { syscallarg(char *) path; syscallarg(int) uid; syscallarg(int) gid; } */ *uap; { int error; struct nameidata nd; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfown(td, nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid)); vrele(nd.ni_vp); return (error); } /* * Set ownership given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchown_args { int fd; int uid; int gid; }; #endif /* ARGSUSED */ int fchown(td, uap) struct thread *td; register struct fchown_args /* { syscallarg(int) fd; syscallarg(int) uid; syscallarg(int) gid; } */ *uap; { struct file *fp; + struct vnode *vp; int error; if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); - return setfown(td, (struct vnode *)fp->f_data, + vp = (struct vnode *)fp->f_data; + error = setfown(td, (struct vnode *)fp->f_data, SCARG(uap, uid), SCARG(uap, gid)); + fdrop(fp, td); + return (error); } /* * Common implementation code for utimes(), lutimes(), and futimes(). */ static int getutimes(usrtvp, tsp) const struct timeval *usrtvp; struct timespec *tsp; { struct timeval tv[2]; int error; if (usrtvp == NULL) { microtime(&tv[0]); TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]); tsp[1] = tsp[0]; } else { if ((error = copyin(usrtvp, tv, sizeof (tv))) != 0) return (error); TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]); TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]); } return 0; } /* * Common implementation code for utimes(), lutimes(), and futimes(). 
*/ static int setutimes(td, vp, ts, nullflag) struct thread *td; struct vnode *vp; const struct timespec *ts; int nullflag; { int error; struct mount *mp; struct vattr vattr; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); VATTR_NULL(&vattr); vattr.va_atime = ts[0]; vattr.va_mtime = ts[1]; if (nullflag) vattr.va_vaflags |= VA_UTIMES_NULL; error = VOP_SETATTR(vp, &vattr, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return error; } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct utimes_args { char *path; struct timeval *tptr; }; #endif /* ARGSUSED */ int utimes(td, uap) struct thread *td; register struct utimes_args /* { syscallarg(char *) path; syscallarg(struct timeval *) tptr; } */ *uap; { struct timespec ts[2]; struct timeval *usrtvp; int error; struct nameidata nd; usrtvp = SCARG(uap, tptr); if ((error = getutimes(usrtvp, ts)) != 0) return (error); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setutimes(td, nd.ni_vp, ts, usrtvp == NULL); vrele(nd.ni_vp); return (error); } /* * Set the access and modification times of a file. 
*/ #ifndef _SYS_SYSPROTO_H_ struct lutimes_args { char *path; struct timeval *tptr; }; #endif /* ARGSUSED */ int lutimes(td, uap) struct thread *td; register struct lutimes_args /* { syscallarg(char *) path; syscallarg(struct timeval *) tptr; } */ *uap; { struct timespec ts[2]; struct timeval *usrtvp; int error; struct nameidata nd; usrtvp = SCARG(uap, tptr); if ((error = getutimes(usrtvp, ts)) != 0) return (error); NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setutimes(td, nd.ni_vp, ts, usrtvp == NULL); vrele(nd.ni_vp); return (error); } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct futimes_args { int fd; struct timeval *tptr; }; #endif /* ARGSUSED */ int futimes(td, uap) struct thread *td; register struct futimes_args /* { syscallarg(int ) fd; syscallarg(struct timeval *) tptr; } */ *uap; { struct timespec ts[2]; struct file *fp; struct timeval *usrtvp; int error; usrtvp = SCARG(uap, tptr); if ((error = getutimes(usrtvp, ts)) != 0) return (error); if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); - return setutimes(td, (struct vnode *)fp->f_data, ts, usrtvp == NULL); + error = setutimes(td, (struct vnode *)fp->f_data, ts, usrtvp == NULL); + fdrop(fp, td); + return (error); } /* * Truncate a file given its path name. 
*/ #ifndef _SYS_SYSPROTO_H_ struct truncate_args { char *path; int pad; off_t length; }; #endif /* ARGSUSED */ int truncate(td, uap) struct thread *td; register struct truncate_args /* { syscallarg(char *) path; syscallarg(int) pad; syscallarg(off_t) length; } */ *uap; { struct mount *mp; struct vnode *vp; struct vattr vattr; int error; struct nameidata nd; if (uap->length < 0) return(EINVAL); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { vrele(vp); return (error); } NDFREE(&nd, NDF_ONLY_PNBUF); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (vp->v_type == VDIR) error = EISDIR; else if ((error = vn_writechk(vp)) == 0 && (error = VOP_ACCESS(vp, VWRITE, td->td_proc->p_ucred, td)) == 0) { VATTR_NULL(&vattr); vattr.va_size = SCARG(uap, length); error = VOP_SETATTR(vp, &vattr, td->td_proc->p_ucred, td); } vput(vp); vn_finished_write(mp); return (error); } /* * Truncate a file given a file descriptor. 
*/ #ifndef _SYS_SYSPROTO_H_ struct ftruncate_args { int fd; int pad; off_t length; }; #endif /* ARGSUSED */ int ftruncate(td, uap) struct thread *td; register struct ftruncate_args /* { syscallarg(int) fd; syscallarg(int) pad; syscallarg(off_t) length; } */ *uap; { struct mount *mp; struct vattr vattr; struct vnode *vp; struct file *fp; int error; if (uap->length < 0) return(EINVAL); if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); - if ((fp->f_flag & FWRITE) == 0) + if ((fp->f_flag & FWRITE) == 0) { + fdrop(fp, td); return (EINVAL); + } vp = (struct vnode *)fp->f_data; - if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { + fdrop(fp, td); return (error); + } VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (vp->v_type == VDIR) error = EISDIR; else if ((error = vn_writechk(vp)) == 0) { VATTR_NULL(&vattr); vattr.va_size = SCARG(uap, length); error = VOP_SETATTR(vp, &vattr, fp->f_cred, td); } VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); + fdrop(fp, td); return (error); } #if defined(COMPAT_43) || defined(COMPAT_SUNOS) /* * Truncate a file given its path name. */ #ifndef _SYS_SYSPROTO_H_ struct otruncate_args { char *path; long length; }; #endif /* ARGSUSED */ int otruncate(td, uap) struct thread *td; register struct otruncate_args /* { syscallarg(char *) path; syscallarg(long) length; } */ *uap; { struct truncate_args /* { syscallarg(char *) path; syscallarg(int) pad; syscallarg(off_t) length; } */ nuap; SCARG(&nuap, path) = SCARG(uap, path); SCARG(&nuap, length) = SCARG(uap, length); return (truncate(td, &nuap)); } /* * Truncate a file given a file descriptor. 
*/ #ifndef _SYS_SYSPROTO_H_ struct oftruncate_args { int fd; long length; }; #endif /* ARGSUSED */ int oftruncate(td, uap) struct thread *td; register struct oftruncate_args /* { syscallarg(int) fd; syscallarg(long) length; } */ *uap; { struct ftruncate_args /* { syscallarg(int) fd; syscallarg(int) pad; syscallarg(off_t) length; } */ nuap; SCARG(&nuap, fd) = SCARG(uap, fd); SCARG(&nuap, length) = SCARG(uap, length); return (ftruncate(td, &nuap)); } #endif /* COMPAT_43 || COMPAT_SUNOS */ /* * Sync an open file. */ #ifndef _SYS_SYSPROTO_H_ struct fsync_args { int fd; }; #endif /* ARGSUSED */ int fsync(td, uap) struct thread *td; struct fsync_args /* { syscallarg(int) fd; } */ *uap; { struct vnode *vp; struct mount *mp; struct file *fp; vm_object_t obj; int error; GIANT_REQUIRED; if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); vp = (struct vnode *)fp->f_data; - if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { + fdrop(fp, td); return (error); + } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (VOP_GETVOBJECT(vp, &obj) == 0) { vm_object_page_clean(obj, 0, 0, 0); } error = VOP_FSYNC(vp, fp->f_cred, MNT_WAIT, td); #ifdef SOFTUPDATES if (error == 0 && vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP)) error = softdep_fsync(vp); #endif VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); + fdrop(fp, td); return (error); } /* * Rename files. Source and destination must either both be directories, * or both not be directories. If target is a directory, it must be empty. 
*/ #ifndef _SYS_SYSPROTO_H_ struct rename_args { char *from; char *to; }; #endif /* ARGSUSED */ int rename(td, uap) struct thread *td; register struct rename_args /* { syscallarg(char *) from; syscallarg(char *) to; } */ *uap; { struct mount *mp; struct vnode *tvp, *fvp, *tdvp; struct nameidata fromnd, tond; int error; bwillwrite(); NDINIT(&fromnd, DELETE, WANTPARENT | SAVESTART, UIO_USERSPACE, SCARG(uap, from), td); if ((error = namei(&fromnd)) != 0) return (error); fvp = fromnd.ni_vp; if ((error = vn_start_write(fvp, &mp, V_WAIT | PCATCH)) != 0) { NDFREE(&fromnd, NDF_ONLY_PNBUF); vrele(fromnd.ni_dvp); vrele(fvp); goto out1; } NDINIT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | NOOBJ, UIO_USERSPACE, SCARG(uap, to), td); if (fromnd.ni_vp->v_type == VDIR) tond.ni_cnd.cn_flags |= WILLBEDIR; if ((error = namei(&tond)) != 0) { /* Translate error code for rename("dir1", "dir2/."). */ if (error == EISDIR && fvp->v_type == VDIR) error = EINVAL; NDFREE(&fromnd, NDF_ONLY_PNBUF); vrele(fromnd.ni_dvp); vrele(fvp); goto out1; } tdvp = tond.ni_dvp; tvp = tond.ni_vp; if (tvp != NULL) { if (fvp->v_type == VDIR && tvp->v_type != VDIR) { error = ENOTDIR; goto out; } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { error = EISDIR; goto out; } } if (fvp == tdvp) error = EINVAL; /* * If source is the same as the destination (that is the * same inode number with the same name in the same directory), * then there is nothing to do. 
*/ if (fvp == tvp && fromnd.ni_dvp == tdvp && fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen && !bcmp(fromnd.ni_cnd.cn_nameptr, tond.ni_cnd.cn_nameptr, fromnd.ni_cnd.cn_namelen)) error = -1; out: if (!error) { VOP_LEASE(tdvp, td, td->td_proc->p_ucred, LEASE_WRITE); if (fromnd.ni_dvp != tdvp) { VOP_LEASE(fromnd.ni_dvp, td, td->td_proc->p_ucred, LEASE_WRITE); } if (tvp) { VOP_LEASE(tvp, td, td->td_proc->p_ucred, LEASE_WRITE); } error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, tond.ni_dvp, tond.ni_vp, &tond.ni_cnd); NDFREE(&fromnd, NDF_ONLY_PNBUF); NDFREE(&tond, NDF_ONLY_PNBUF); } else { NDFREE(&fromnd, NDF_ONLY_PNBUF); NDFREE(&tond, NDF_ONLY_PNBUF); if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); vrele(fromnd.ni_dvp); vrele(fvp); } vrele(tond.ni_startdir); vn_finished_write(mp); ASSERT_VOP_UNLOCKED(fromnd.ni_dvp, "rename"); ASSERT_VOP_UNLOCKED(fromnd.ni_vp, "rename"); ASSERT_VOP_UNLOCKED(tond.ni_dvp, "rename"); ASSERT_VOP_UNLOCKED(tond.ni_vp, "rename"); out1: if (fromnd.ni_startdir) vrele(fromnd.ni_startdir); if (error == -1) return (0); return (error); } /* * Make a directory file. 
*/ #ifndef _SYS_SYSPROTO_H_ struct mkdir_args { char *path; int mode; }; #endif /* ARGSUSED */ int mkdir(td, uap) struct thread *td; register struct mkdir_args /* { syscallarg(char *) path; syscallarg(int) mode; } */ *uap; { return vn_mkdir(uap->path, uap->mode, UIO_USERSPACE, td); } int vn_mkdir(path, mode, segflg, td) char *path; int mode; enum uio_seg segflg; struct thread *td; { struct mount *mp; struct vnode *vp; struct vattr vattr; int error; struct nameidata nd; restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT, segflg, path, td); nd.ni_cnd.cn_flags |= WILLBEDIR; if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if (vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); vrele(vp); vput(nd.ni_dvp); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); vattr.va_type = VDIR; + FILEDESC_LOCK(td->td_proc->p_fd); vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask; + FILEDESC_UNLOCK(td->td_proc->p_fd); VOP_LEASE(nd.ni_dvp, td, td->td_proc->p_ucred, LEASE_WRITE); error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (!error) vput(nd.ni_vp); vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mkdir"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "mkdir"); return (error); } /* * Remove a directory file. */ #ifndef _SYS_SYSPROTO_H_ struct rmdir_args { char *path; }; #endif /* ARGSUSED */ int rmdir(td, uap) struct thread *td; struct rmdir_args /* { syscallarg(char *) path; } */ *uap; { struct mount *mp; struct vnode *vp; int error; struct nameidata nd; restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if (vp->v_type != VDIR) { error = ENOTDIR; goto out; } /* * No rmdir "." please. 
*/ if (nd.ni_dvp == vp) { error = EINVAL; goto out; } /* * The root of a mounted filesystem cannot be deleted. */ if (vp->v_flag & VROOT) { error = EBUSY; goto out; } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vput(vp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VOP_LEASE(nd.ni_dvp, td, td->td_proc->p_ucred, LEASE_WRITE); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); vn_finished_write(mp); out: NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vput(vp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "rmdir"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "rmdir"); return (error); } #ifdef COMPAT_43 /* * Read a block of directory entries in a file system independent format. */ #ifndef _SYS_SYSPROTO_H_ struct ogetdirentries_args { int fd; char *buf; u_int count; long *basep; }; #endif int ogetdirentries(td, uap) struct thread *td; register struct ogetdirentries_args /* { syscallarg(int) fd; syscallarg(char *) buf; syscallarg(u_int) count; syscallarg(long *) basep; } */ *uap; { struct vnode *vp; struct file *fp; struct uio auio, kuio; struct iovec aiov, kiov; struct dirent *dp, *edp; caddr_t dirbuf; int error, eofflag, readcnt; long loff; /* XXX arbitrary sanity limit on `count'. 
*/ if (SCARG(uap, count) > 64 * 1024) return (EINVAL); if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); - if ((fp->f_flag & FREAD) == 0) + if ((fp->f_flag & FREAD) == 0) { + fdrop(fp, td); return (EBADF); + } vp = (struct vnode *)fp->f_data; unionread: - if (vp->v_type != VDIR) + if (vp->v_type != VDIR) { + fdrop(fp, td); return (EINVAL); + } aiov.iov_base = SCARG(uap, buf); aiov.iov_len = SCARG(uap, count); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auio.uio_resid = SCARG(uap, count); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); loff = auio.uio_offset = fp->f_offset; # if (BYTE_ORDER != LITTLE_ENDIAN) if (vp->v_mount->mnt_maxsymlinklen <= 0) { error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL); fp->f_offset = auio.uio_offset; } else # endif { kuio = auio; kuio.uio_iov = &kiov; kuio.uio_segflg = UIO_SYSSPACE; kiov.iov_len = SCARG(uap, count); MALLOC(dirbuf, caddr_t, SCARG(uap, count), M_TEMP, M_WAITOK); kiov.iov_base = dirbuf; error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag, NULL, NULL); fp->f_offset = kuio.uio_offset; if (error == 0) { readcnt = SCARG(uap, count) - kuio.uio_resid; edp = (struct dirent *)&dirbuf[readcnt]; for (dp = (struct dirent *)dirbuf; dp < edp; ) { # if (BYTE_ORDER == LITTLE_ENDIAN) /* * The expected low byte of * dp->d_namlen is our dp->d_type. * The high MBZ byte of dp->d_namlen * is our dp->d_namlen. */ dp->d_type = dp->d_namlen; dp->d_namlen = 0; # else /* * The dp->d_type is the high byte * of the expected dp->d_namlen, * so must be zero'ed. 
*/ dp->d_type = 0; # endif if (dp->d_reclen > 0) { dp = (struct dirent *) ((char *)dp + dp->d_reclen); } else { error = EIO; break; } } if (dp >= edp) error = uiomove(dirbuf, readcnt, &auio); } FREE(dirbuf, M_TEMP); } VOP_UNLOCK(vp, 0, td); - if (error) + if (error) { + fdrop(fp, td); return (error); + } if (SCARG(uap, count) == auio.uio_resid) { if (union_dircheckp) { error = union_dircheckp(td, &vp, fp); if (error == -1) goto unionread; - if (error) + if (error) { + fdrop(fp, td); return (error); + } } if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_UNION)) { struct vnode *tvp = vp; vp = vp->v_mount->mnt_vnodecovered; VREF(vp); fp->f_data = (caddr_t) vp; fp->f_offset = 0; vrele(tvp); goto unionread; } } error = copyout((caddr_t)&loff, (caddr_t)SCARG(uap, basep), sizeof(long)); + fdrop(fp, td); td->td_retval[0] = SCARG(uap, count) - auio.uio_resid; return (error); } #endif /* COMPAT_43 */ /* * Read a block of directory entries in a file system independent format. */ #ifndef _SYS_SYSPROTO_H_ struct getdirentries_args { int fd; char *buf; u_int count; long *basep; }; #endif int getdirentries(td, uap) struct thread *td; register struct getdirentries_args /* { syscallarg(int) fd; syscallarg(char *) buf; syscallarg(u_int) count; syscallarg(long *) basep; } */ *uap; { struct vnode *vp; struct file *fp; struct uio auio; struct iovec aiov; long loff; int error, eofflag; if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); - if ((fp->f_flag & FREAD) == 0) + if ((fp->f_flag & FREAD) == 0) { + fdrop(fp, td); return (EBADF); + } vp = (struct vnode *)fp->f_data; unionread: - if (vp->v_type != VDIR) + if (vp->v_type != VDIR) { + fdrop(fp, td); return (EINVAL); + } aiov.iov_base = SCARG(uap, buf); aiov.iov_len = SCARG(uap, count); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auio.uio_resid = SCARG(uap, count); /* vn_lock(vp, LK_SHARED | LK_RETRY, td); */ vn_lock(vp, 
LK_EXCLUSIVE | LK_RETRY, td); loff = auio.uio_offset = fp->f_offset; error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL); fp->f_offset = auio.uio_offset; VOP_UNLOCK(vp, 0, td); - if (error) + if (error) { + fdrop(fp, td); return (error); + } if (SCARG(uap, count) == auio.uio_resid) { if (union_dircheckp) { error = union_dircheckp(td, &vp, fp); if (error == -1) goto unionread; - if (error) + if (error) { + fdrop(fp, td); return (error); + } } if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_UNION)) { struct vnode *tvp = vp; vp = vp->v_mount->mnt_vnodecovered; VREF(vp); fp->f_data = (caddr_t) vp; fp->f_offset = 0; vrele(tvp); goto unionread; } } if (SCARG(uap, basep) != NULL) { error = copyout((caddr_t)&loff, (caddr_t)SCARG(uap, basep), sizeof(long)); } td->td_retval[0] = SCARG(uap, count) - auio.uio_resid; + fdrop(fp, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct getdents_args { int fd; char *buf; size_t count; }; #endif int getdents(td, uap) struct thread *td; register struct getdents_args /* { syscallarg(int) fd; syscallarg(char *) buf; syscallarg(u_int) count; } */ *uap; { struct getdirentries_args ap; ap.fd = uap->fd; ap.buf = uap->buf; ap.count = uap->count; ap.basep = NULL; return getdirentries(td, &ap); } /* * Set the mode mask for creation of filesystem nodes. * * MP SAFE */ #ifndef _SYS_SYSPROTO_H_ struct umask_args { int newmask; }; #endif int umask(td, uap) struct thread *td; struct umask_args /* { syscallarg(int) newmask; } */ *uap; { register struct filedesc *fdp; + FILEDESC_LOCK(td->td_proc->p_fd); fdp = td->td_proc->p_fd; td->td_retval[0] = fdp->fd_cmask; fdp->fd_cmask = SCARG(uap, newmask) & ALLPERMS; + FILEDESC_UNLOCK(td->td_proc->p_fd); return (0); } /* * Void all references to file by ripping underlying filesystem * away from vnode. 
*/ #ifndef _SYS_SYSPROTO_H_ struct revoke_args { char *path; }; #endif /* ARGSUSED */ int revoke(td, uap) struct thread *td; register struct revoke_args /* { syscallarg(char *) path; } */ *uap; { struct mount *mp; struct vnode *vp; struct vattr vattr; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (vp->v_type != VCHR) { error = EINVAL; goto out; } error = VOP_GETATTR(vp, &vattr, td->td_proc->p_ucred, td); if (error) goto out; if (td->td_proc->p_ucred->cr_uid != vattr.va_uid) { error = suser_xxx(0, td->td_proc, PRISON_ROOT); if (error) goto out; } if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto out; if (vcount(vp) > 1) VOP_REVOKE(vp, REVOKEALL); vn_finished_write(mp); out: vrele(vp); return (error); } /* * Convert a user file descriptor to a kernel file entry. + * The file entry is locked upon returning. */ int getvnode(fdp, fd, fpp) struct filedesc *fdp; int fd; struct file **fpp; { + int error; struct file *fp; - if ((u_int)fd >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[fd]) == NULL) - return (EBADF); - if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) - return (EINVAL); + fp = NULL; + if (fdp == NULL) + error = EBADF; + else { + FILEDESC_LOCK(fdp); + if ((u_int)fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fd]) == NULL) + error = EBADF; + else if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) { + fp = NULL; + error = EINVAL; + } else { + fhold(fp); + error = 0; + } + FILEDESC_UNLOCK(fdp); + } *fpp = fp; - return (0); + return (error); } /* * Get (NFS) file handle */ #ifndef _SYS_SYSPROTO_H_ struct getfh_args { char *fname; fhandle_t *fhp; }; #endif int getfh(td, uap) struct thread *td; register struct getfh_args *uap; { struct nameidata nd; fhandle_t fh; register struct vnode *vp; int error; /* * Must be super user */ error = suser_td(td); if (error) return (error); NDINIT(&nd, 
LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->fname, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; bzero(&fh, sizeof(fh)); fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid; error = VFS_VPTOFH(vp, &fh.fh_fid); vput(vp); if (error) return (error); error = copyout(&fh, uap->fhp, sizeof (fh)); return (error); } /* * syscall for the rpc.lockd to use to translate a NFS file handle into * an open descriptor. * * warning: do not remove the suser() call or this becomes one giant * security hole. */ #ifndef _SYS_SYSPROTO_H_ struct fhopen_args { const struct fhandle *u_fhp; int flags; }; #endif int fhopen(td, uap) struct thread *td; struct fhopen_args /* { syscallarg(const struct fhandle *) u_fhp; syscallarg(int) flags; } */ *uap; { struct proc *p = td->td_proc; struct mount *mp; struct vnode *vp; struct fhandle fhp; struct vattr vat; struct vattr *vap = &vat; struct flock lf; struct file *fp; register struct filedesc *fdp = p->p_fd; int fmode, mode, error, type; struct file *nfp; int indx; /* * Must be super user */ error = suser_td(td); if (error) return (error); fmode = FFLAGS(SCARG(uap, flags)); /* why not allow a non-read/write open for our lockd? */ if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT)) return (EINVAL); error = copyin(SCARG(uap,u_fhp), &fhp, sizeof(fhp)); if (error) return(error); /* find the mount point */ mp = vfs_getvfs(&fhp.fh_fsid); if (mp == NULL) return (ESTALE); /* now give me my vnode, it gets returned to me locked */ error = VFS_FHTOVP(mp, &fhp.fh_fid, &vp); if (error) return (error); /* * from now on we have to make sure not * to forget about the vnode * any error that causes an abort must vput(vp) * just set error = err and 'goto bad;'. 
*/ /* * from vn_open */ if (vp->v_type == VLNK) { error = EMLINK; goto bad; } if (vp->v_type == VSOCK) { error = EOPNOTSUPP; goto bad; } mode = 0; if (fmode & (FWRITE | O_TRUNC)) { if (vp->v_type == VDIR) { error = EISDIR; goto bad; } error = vn_writechk(vp); if (error) goto bad; mode |= VWRITE; } if (fmode & FREAD) mode |= VREAD; if (mode) { error = VOP_ACCESS(vp, mode, p->p_ucred, td); if (error) goto bad; } if (fmode & O_TRUNC) { VOP_UNLOCK(vp, 0, td); /* XXX */ if ((error = vn_start_write(NULL, &mp, V_WAIT | PCATCH)) != 0) { vrele(vp); return (error); } VOP_LEASE(vp, td, p->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); /* XXX */ VATTR_NULL(vap); vap->va_size = 0; error = VOP_SETATTR(vp, vap, p->p_ucred, td); vn_finished_write(mp); if (error) goto bad; } error = VOP_OPEN(vp, fmode, p->p_ucred, td); if (error) goto bad; /* * Make sure that a VM object is created for VMIO support. */ if (vn_canvmio(vp) == TRUE) { if ((error = vfs_object_create(vp, td, p->p_ucred)) != 0) goto bad; } if (fmode & FWRITE) vp->v_writecount++; /* * end of vn_open code */ if ((error = falloc(td, &nfp, &indx)) != 0) { if (fmode & FWRITE) vp->v_writecount--; goto bad; } fp = nfp; /* * Hold an extra reference to avoid having fp ripped out * from under us while we block in the lock op */ fhold(fp); nfp->f_data = (caddr_t)vp; nfp->f_flag = fmode & FMASK; nfp->f_ops = &vnops; nfp->f_type = DTYPE_VNODE; if (fmode & (O_EXLOCK | O_SHLOCK)) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (fmode & O_EXLOCK) lf.l_type = F_WRLCK; else lf.l_type = F_RDLCK; type = F_FLOCK; if ((fmode & FNONBLOCK) == 0) type |= F_WAIT; VOP_UNLOCK(vp, 0, td); if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) { /* * The lock request failed. Normally close the * descriptor but handle the case where someone might * have dup()d or close()d it when we weren't looking. 
*/ + FILEDESC_LOCK(fdp); if (fdp->fd_ofiles[indx] == fp) { fdp->fd_ofiles[indx] = NULL; + FILEDESC_UNLOCK(fdp); fdrop(fp, td); - } + } else + FILEDESC_UNLOCK(fdp); /* * release our private reference */ fdrop(fp, td); return(error); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); fp->f_flag |= FHASLOCK; } if ((vp->v_type == VREG) && (VOP_GETVOBJECT(vp, NULL) != 0)) vfs_object_create(vp, td, p->p_ucred); VOP_UNLOCK(vp, 0, td); fdrop(fp, td); td->td_retval[0] = indx; return (0); bad: vput(vp); return (error); } /* * Stat an (NFS) file handle. */ #ifndef _SYS_SYSPROTO_H_ struct fhstat_args { struct fhandle *u_fhp; struct stat *sb; }; #endif int fhstat(td, uap) struct thread *td; register struct fhstat_args /* { syscallarg(struct fhandle *) u_fhp; syscallarg(struct stat *) sb; } */ *uap; { struct stat sb; fhandle_t fh; struct mount *mp; struct vnode *vp; int error; /* * Must be super user */ error = suser_td(td); if (error) return (error); error = copyin(SCARG(uap, u_fhp), &fh, sizeof(fhandle_t)); if (error) return (error); if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) return (ESTALE); if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp))) return (error); error = vn_stat(vp, &sb, td); vput(vp); if (error) return (error); error = copyout(&sb, SCARG(uap, sb), sizeof(sb)); return (error); } /* * Implement fstatfs() for (NFS) file handles. 
*/ #ifndef _SYS_SYSPROTO_H_ struct fhstatfs_args { struct fhandle *u_fhp; struct statfs *buf; }; #endif int fhstatfs(td, uap) struct thread *td; struct fhstatfs_args /* { syscallarg(struct fhandle) *u_fhp; syscallarg(struct statfs) *buf; } */ *uap; { struct statfs *sp; struct mount *mp; struct vnode *vp; struct statfs sb; fhandle_t fh; int error; /* * Must be super user */ error = suser_td(td); if (error) return (error); if ((error = copyin(SCARG(uap, u_fhp), &fh, sizeof(fhandle_t))) != 0) return (error); if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) return (ESTALE); if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp))) return (error); mp = vp->v_mount; sp = &mp->mnt_stat; vput(vp); if ((error = VFS_STATFS(mp, sp, td)) != 0) return (error); sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; if (suser_xxx(td->td_proc->p_ucred, 0, 0)) { bcopy((caddr_t)sp, (caddr_t)&sb, sizeof(sb)); sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; sp = &sb; } return (copyout(sp, SCARG(uap, buf), sizeof(*sp))); } /* * Syscall to push extended attribute configuration information into the * VFS. Accepts a path, which it converts to a mountpoint, as well as * a command (int cmd), and attribute name and misc data. For now, the * attribute name is left in userspace for consumption by the VFS_op. * It will probably be changed to be copied into sysspace by the * syscall in the future, once issues with various consumers of the * attribute code have raised their hands. * * Currently this is used only by UFS Extended Attributes. */ int extattrctl(td, uap) struct thread *td; struct extattrctl_args *uap; { struct vnode *filename_vp; struct nameidata nd; struct mount *mp; char attrname[EXTATTR_MAXNAMELEN]; int error; /* * SCARG(uap, attrname) not always defined. We check again later * when we invoke the VFS call so as to pass in NULL there if needed. 
*/ if (SCARG(uap, attrname) != NULL) { error = copyinstr(SCARG(uap, attrname), attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); } /* * SCARG(uap, filename) not always defined. If it is, grab * a vnode lock, which VFS_EXTATTRCTL() will later release. */ filename_vp = NULL; if (SCARG(uap, filename) != NULL) { NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, SCARG(uap, filename), td); if ((error = namei(&nd)) != 0) return (error); filename_vp = nd.ni_vp; NDFREE(&nd, NDF_NO_VP_RELE | NDF_NO_VP_UNLOCK); } /* SCARG(uap, path) always defined. */ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH); NDFREE(&nd, 0); if (error) { if (filename_vp) vrele(filename_vp); return (error); } if (SCARG(uap, attrname) != NULL) { error = VFS_EXTATTRCTL(mp, SCARG(uap, cmd), filename_vp, SCARG(uap, attrnamespace), attrname, td); } else { error = VFS_EXTATTRCTL(mp, SCARG(uap, cmd), filename_vp, SCARG(uap, attrnamespace), NULL, td); } vn_finished_write(mp); /* * VFS_EXTATTRCTL will have unlocked, but not de-ref'd, * filename_vp, so vrele it if it is defined. 
*/ if (filename_vp != NULL) vrele(filename_vp); return (error); } /* * extattr_set_vp(): Set a named extended attribute on a file or directory * * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", * kernelspace string pointer "attrname", * userspace iovec array pointer "iovp", unsigned int iovcnt * proc "p" * Returns: 0 on success, an error number otherwise * Locks: none * References: vp must be a valid reference for the duration of the call */ static int extattr_set_vp(struct vnode *vp, int attrnamespace, const char *attrname, struct iovec *iovp, unsigned iovcnt, struct thread *td) { struct mount *mp; struct uio auio; struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV]; u_int iovlen, cnt; int error, i; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); iovlen = iovcnt * sizeof(struct iovec); if (iovcnt > UIO_SMALLIOV) { if (iovcnt > UIO_MAXIOV) { error = EINVAL; goto done; } MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); needfree = iov; } else iov = aiov; auio.uio_iov = iov; auio.uio_iovcnt = iovcnt; auio.uio_rw = UIO_WRITE; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auio.uio_offset = 0; if ((error = copyin((caddr_t)iovp, (caddr_t)iov, iovlen))) goto done; auio.uio_resid = 0; for (i = 0; i < iovcnt; i++) { if (iov->iov_len > INT_MAX - auio.uio_resid) { error = EINVAL; goto done; } auio.uio_resid += iov->iov_len; iov++; } cnt = auio.uio_resid; error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, td->td_proc->p_ucred, td); cnt -= auio.uio_resid; td->td_retval[0] = cnt; done: if (needfree) FREE(needfree, M_IOV); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return (error); } int extattr_set_file(td, uap) struct thread *td; struct extattr_set_file_args *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(SCARG(uap, attrname), attrname, EXTATTR_MAXNAMELEN, NULL); if 
(error) return (error); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_set_vp(nd.ni_vp, SCARG(uap, attrnamespace), attrname, SCARG(uap, iovp), SCARG(uap, iovcnt), td); vrele(nd.ni_vp); return (error); } int extattr_set_fd(td, uap) struct thread *td; struct extattr_set_fd_args *uap; { struct file *fp; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(SCARG(uap, attrname), attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); error = extattr_set_vp((struct vnode *)fp->f_data, SCARG(uap, attrnamespace), attrname, SCARG(uap, iovp), SCARG(uap, iovcnt), td); + fdrop(fp, td); return (error); } /* * extattr_get_vp(): Get a named extended attribute on a file or directory * * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", * kernelspace string pointer "attrname", * userspace iovec array pointer "iovp", unsigned int iovcnt, * proc "p" * Returns: 0 on success, an error number otherwise * Locks: none * References: vp must be a valid reference for the duration of the call */ static int extattr_get_vp(struct vnode *vp, int attrnamespace, const char *attrname, struct iovec *iovp, unsigned iovcnt, struct thread *td) { struct uio auio; struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV]; u_int iovlen, cnt; int error, i; VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_READ); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); iovlen = iovcnt * sizeof (struct iovec); if (iovcnt > UIO_SMALLIOV) { if (iovcnt > UIO_MAXIOV) { error = EINVAL; goto done; } MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); needfree = iov; } else iov = aiov; auio.uio_iov = iov; auio.uio_iovcnt = iovcnt; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auio.uio_offset = 0; if ((error = copyin((caddr_t)iovp, (caddr_t)iov, iovlen))) goto done; 
auio.uio_resid = 0; for (i = 0; i < iovcnt; i++) { if (iov->iov_len > INT_MAX - auio.uio_resid) { error = EINVAL; goto done; } auio.uio_resid += iov->iov_len; iov++; } cnt = auio.uio_resid; error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, td->td_proc->p_ucred, td); cnt -= auio.uio_resid; td->td_retval[0] = cnt; done: if (needfree) FREE(needfree, M_IOV); VOP_UNLOCK(vp, 0, td); return (error); } int extattr_get_file(td, uap) struct thread *td; struct extattr_get_file_args *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(SCARG(uap, attrname), attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_get_vp(nd.ni_vp, SCARG(uap, attrnamespace), attrname, SCARG(uap, iovp), SCARG(uap, iovcnt), td); vrele(nd.ni_vp); return (error); } int extattr_get_fd(td, uap) struct thread *td; struct extattr_get_fd_args *uap; { struct file *fp; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(SCARG(uap, attrname), attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); error = extattr_get_vp((struct vnode *)fp->f_data, SCARG(uap, attrnamespace), attrname, SCARG(uap, iovp), SCARG(uap, iovcnt), td); + fdrop(fp, td); return (error); } /* * extattr_delete_vp(): Delete a named extended attribute on a file or * directory * * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", * kernelspace string pointer "attrname", proc "p" * Returns: 0 on success, an error number otherwise * Locks: none * References: vp must be a valid reference for the duration of the call */ static int extattr_delete_vp(struct vnode *vp, int attrnamespace, const char *attrname, struct thread *td) { struct mount *mp; int error; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 
0) return (error); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return (error); } int extattr_delete_file(td, uap) struct thread *td; struct extattr_delete_file_args *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(SCARG(uap, attrname), attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return(error); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return(error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_delete_vp(nd.ni_vp, SCARG(uap, attrnamespace), attrname, td); vrele(nd.ni_vp); return(error); } int extattr_delete_fd(td, uap) struct thread *td; struct extattr_delete_fd_args *uap; { struct file *fp; + struct vnode *vp; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(SCARG(uap, attrname), attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); + vp = (struct vnode *)fp->f_data; error = extattr_delete_vp((struct vnode *)fp->f_data, SCARG(uap, attrnamespace), attrname, td); + fdrop(fp, td); return (error); } Index: head/sys/kern/vfs_lookup.c =================================================================== --- head/sys/kern/vfs_lookup.c (revision 89305) +++ head/sys/kern/vfs_lookup.c (revision 89306) @@ -1,747 +1,749 @@ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)vfs_lookup.c 8.4 (Berkeley) 2/16/94 * $FreeBSD$ */ #include "opt_ktrace.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/lock.h> #include <sys/mutex.h> #include <sys/namei.h> #include <sys/vnode.h> #include <sys/proc.h> #include <sys/filedesc.h> #ifdef KTRACE #include <sys/ktrace.h> #endif #include <vm/vm_zone.h> /* * Allocation zone for namei */ struct vm_zone *namei_zone; static void nameiinit(void *dummy __unused) { namei_zone = zinit("NAMEI", MAXPATHLEN, 0, 0, 2); } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL) /* * Convert a pathname into a pointer to a locked inode. * * The FOLLOW flag is set when symbolic links are to be followed * when they occur at the end of the name translation process. * Symbolic links are always followed for all other pathname * components other than the last. * * The segflg defines whether the name is to be copied from user * space or kernel space. * * Overall outline of namei: * * copy in name * get starting directory * while (!done && !error) { * call lookup to search path. * if symbolic link, massage name in buffer and continue * } */ int namei(ndp) register struct nameidata *ndp; { register struct filedesc *fdp; /* pointer to file descriptor state */ register char *cp; /* pointer into pathname argument */ register struct vnode *dp; /* the directory we are searching */ struct iovec aiov; /* uio for reading symbolic links */ struct uio auio; int error, linklen; struct componentname *cnp = &ndp->ni_cnd; struct thread *td = cnp->cn_thread; struct proc *p = td->td_proc; ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_thread->td_proc->p_ucred; KASSERT(cnp->cn_cred && p, ("namei: bad cred/proc")); KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0, ("namei: nameiop contaminated with flags")); KASSERT((cnp->cn_flags & OPMASK) == 0, ("namei: flags contaminated with nameiops")); fdp = p->p_fd; /* * Get a buffer for the name to be translated, and copy the * name into the buffer.
*/ if ((cnp->cn_flags & HASBUF) == 0) cnp->cn_pnbuf = zalloc(namei_zone); if (ndp->ni_segflg == UIO_SYSSPACE) error = copystr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN, (size_t *)&ndp->ni_pathlen); else error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN, (size_t *)&ndp->ni_pathlen); /* * Don't allow empty pathnames. */ if (!error && *cnp->cn_pnbuf == '\0') error = ENOENT; if (error) { zfree(namei_zone, cnp->cn_pnbuf); ndp->ni_vp = NULL; return (error); } ndp->ni_loopcnt = 0; #ifdef KTRACE if (KTRPOINT(p, KTR_NAMEI)) ktrnamei(p->p_tracep, cnp->cn_pnbuf); #endif /* * Get starting point for the translation. */ + FILEDESC_LOCK(fdp); ndp->ni_rootdir = fdp->fd_rdir; ndp->ni_topdir = fdp->fd_jdir; dp = fdp->fd_cdir; VREF(dp); + FILEDESC_UNLOCK(fdp); for (;;) { /* * Check if root directory should replace current directory. * Done at start of translation and after symbolic link. */ cnp->cn_nameptr = cnp->cn_pnbuf; if (*(cnp->cn_nameptr) == '/') { vrele(dp); while (*(cnp->cn_nameptr) == '/') { cnp->cn_nameptr++; ndp->ni_pathlen--; } dp = ndp->ni_rootdir; VREF(dp); } ndp->ni_startdir = dp; error = lookup(ndp); if (error) { zfree(namei_zone, cnp->cn_pnbuf); return (error); } /* * Check for symbolic link */ if ((cnp->cn_flags & ISSYMLINK) == 0) { if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0) zfree(namei_zone, cnp->cn_pnbuf); else cnp->cn_flags |= HASBUF; if (vn_canvmio(ndp->ni_vp) == TRUE && (cnp->cn_nameiop != DELETE) && ((cnp->cn_flags & (NOOBJ|LOCKLEAF)) == LOCKLEAF)) vfs_object_create(ndp->ni_vp, td, ndp->ni_cnd.cn_cred); return (0); } if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1) VOP_UNLOCK(ndp->ni_dvp, 0, td); if (ndp->ni_loopcnt++ >= MAXSYMLINKS) { error = ELOOP; break; } if (ndp->ni_pathlen > 1) cp = zalloc(namei_zone); else cp = cnp->cn_pnbuf; aiov.iov_base = cp; aiov.iov_len = MAXPATHLEN; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = (struct thread *)0; 
auio.uio_resid = MAXPATHLEN; error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred); if (error) { if (ndp->ni_pathlen > 1) zfree(namei_zone, cp); break; } linklen = MAXPATHLEN - auio.uio_resid; if (linklen == 0) { if (ndp->ni_pathlen > 1) zfree(namei_zone, cp); error = ENOENT; break; } if (linklen + ndp->ni_pathlen >= MAXPATHLEN) { if (ndp->ni_pathlen > 1) zfree(namei_zone, cp); error = ENAMETOOLONG; break; } if (ndp->ni_pathlen > 1) { bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen); zfree(namei_zone, cnp->cn_pnbuf); cnp->cn_pnbuf = cp; } else cnp->cn_pnbuf[linklen] = '\0'; ndp->ni_pathlen += linklen; vput(ndp->ni_vp); dp = ndp->ni_dvp; } zfree(namei_zone, cnp->cn_pnbuf); vrele(ndp->ni_dvp); vput(ndp->ni_vp); ndp->ni_vp = NULL; return (error); } /* * Search a pathname. * This is a very central and rather complicated routine. * * The pathname is pointed to by ni_ptr and is of length ni_pathlen. * The starting directory is taken from ni_startdir. The pathname is * descended until done, or a symbolic link is encountered. The variable * ni_more is clear if the path is completed; it is set to one if a * symbolic link needing interpretation is encountered. * * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on * whether the name is to be looked up, created, renamed, or deleted. * When CREATE, RENAME, or DELETE is specified, information usable in * creating, renaming, or deleting a directory entry may be calculated. * If flag has LOCKPARENT or'ed into it, the parent directory is returned * locked. If flag has WANTPARENT or'ed into it, the parent directory is * returned unlocked. Otherwise the parent directory is not returned. If * the target of the pathname exists and LOCKLEAF is or'ed into the flag * the target is returned locked, otherwise it is returned unlocked. * When creating or renaming and LOCKPARENT is specified, the target may not * be ".". When deleting and LOCKPARENT is specified, the target may be ".". 
* * Overall outline of lookup: * * dirloop: * identify next component of name at ndp->ni_ptr * handle degenerate case where name is null string * if .. and crossing mount points and on mounted filesys, find parent * call VOP_LOOKUP routine for next component name * directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set * component vnode returned in ni_vp (if it exists), locked. * if result vnode is mounted on and crossing mount points, * find mounted on vnode * if more components of name, do next level at dirloop * return the answer in ni_vp, locked if LOCKLEAF set * if LOCKPARENT set, return locked parent in ni_dvp * if WANTPARENT set, return unlocked parent in ni_dvp */ int lookup(ndp) register struct nameidata *ndp; { register char *cp; /* pointer into pathname argument */ register struct vnode *dp = 0; /* the directory we are searching */ struct vnode *tdp; /* saved dp */ struct mount *mp; /* mount table entry */ int docache; /* == 0 do not cache last component */ int wantparent; /* 1 => wantparent or lockparent flag */ int rdonly; /* lookup read-only flag bit */ int trailing_slash; int error = 0; int dpunlocked = 0; /* dp has already been unlocked */ struct componentname *cnp = &ndp->ni_cnd; struct thread *td = cnp->cn_thread; /* * Setup: break out flag bits into variables. */ wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT); docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; if (cnp->cn_nameiop == DELETE || (wantparent && cnp->cn_nameiop != CREATE && cnp->cn_nameiop != LOOKUP)) docache = 0; rdonly = cnp->cn_flags & RDONLY; ndp->ni_dvp = NULL; cnp->cn_flags &= ~ISSYMLINK; dp = ndp->ni_startdir; ndp->ni_startdir = NULLVP; vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, td); dirloop: /* * Search a new directory. * * The last component of the filename is left accessible via * cnp->cn_nameptr for callers that need the name. Callers needing * the name set the SAVENAME flag. When done, they assume * responsibility for freeing the pathname buffer. 
*/ cnp->cn_consume = 0; for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) continue; cnp->cn_namelen = cp - cnp->cn_nameptr; if (cnp->cn_namelen > NAME_MAX) { error = ENAMETOOLONG; goto bad; } #ifdef NAMEI_DIAGNOSTIC { char c = *cp; *cp = '\0'; printf("{%s}: ", cnp->cn_nameptr); *cp = c; } #endif ndp->ni_pathlen -= cnp->cn_namelen; ndp->ni_next = cp; /* * Replace multiple slashes by a single slash and trailing slashes * by a null. This must be done before VOP_LOOKUP() because some * fs's don't know about trailing slashes. Remember if there were * trailing slashes to handle symlinks, existing non-directories * and non-existing files that won't be directories specially later. */ trailing_slash = 0; while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) { cp++; ndp->ni_pathlen--; if (*cp == '\0') { trailing_slash = 1; *ndp->ni_next = '\0'; /* XXX for direnter() ... */ } } ndp->ni_next = cp; cnp->cn_flags |= MAKEENTRY; if (*cp == '\0' && docache == 0) cnp->cn_flags &= ~MAKEENTRY; if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') cnp->cn_flags |= ISDOTDOT; else cnp->cn_flags &= ~ISDOTDOT; if (*ndp->ni_next == 0) cnp->cn_flags |= ISLASTCN; else cnp->cn_flags &= ~ISLASTCN; /* * Check for degenerate name (e.g. / or "") * which is a way of talking about a directory, * e.g. like "/." or ".". */ if (cnp->cn_nameptr[0] == '\0') { if (dp->v_type != VDIR) { error = ENOTDIR; goto bad; } if (cnp->cn_nameiop != LOOKUP) { error = EISDIR; goto bad; } if (wantparent) { ndp->ni_dvp = dp; VREF(dp); } ndp->ni_vp = dp; if (!(cnp->cn_flags & (LOCKPARENT | LOCKLEAF))) VOP_UNLOCK(dp, 0, td); /* XXX This should probably move to the top of function. */ if (cnp->cn_flags & SAVESTART) panic("lookup: SAVESTART"); return (0); } /* * Handle "..": two special cases. * 1. If at root directory (e.g. after chroot) * or at absolute root directory * then ignore it so can't get out. * 2. 
If this vnode is the root of a mounted * filesystem, then replace it with the * vnode which was mounted on so we take the * .. in the other file system. * 3. If the vnode is the top directory of * the jail or chroot, don't let them out. */ if (cnp->cn_flags & ISDOTDOT) { for (;;) { if (dp == ndp->ni_rootdir || dp == ndp->ni_topdir || dp == rootvnode) { ndp->ni_dvp = dp; ndp->ni_vp = dp; VREF(dp); goto nextname; } if ((dp->v_flag & VROOT) == 0 || (cnp->cn_flags & NOCROSSMOUNT)) break; if (dp->v_mount == NULL) { /* forced unmount */ error = EBADF; goto bad; } tdp = dp; dp = dp->v_mount->mnt_vnodecovered; vput(tdp); VREF(dp); vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, td); } } /* * We now have a segment name to search for, and a directory to search. */ unionlookup: ndp->ni_dvp = dp; ndp->ni_vp = NULL; cnp->cn_flags &= ~PDIRUNLOCK; ASSERT_VOP_LOCKED(dp, "lookup"); if ((error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp)) != 0) { KASSERT(ndp->ni_vp == NULL, ("leaf should be empty")); #ifdef NAMEI_DIAGNOSTIC printf("not found\n"); #endif if ((error == ENOENT) && (dp->v_flag & VROOT) && (dp->v_mount != NULL) && (dp->v_mount->mnt_flag & MNT_UNION)) { tdp = dp; dp = dp->v_mount->mnt_vnodecovered; if (cnp->cn_flags & PDIRUNLOCK) vrele(tdp); else vput(tdp); VREF(dp); vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, td); goto unionlookup; } if (error != EJUSTRETURN) goto bad; /* * If creating and at end of pathname, then can consider * allowing file to be created. */ if (rdonly) { error = EROFS; goto bad; } if (*cp == '\0' && trailing_slash && !(cnp->cn_flags & WILLBEDIR)) { error = ENOENT; goto bad; } /* * We return with ni_vp NULL to indicate that the entry * doesn't currently exist, leaving a pointer to the * (possibly locked) directory inode in ndp->ni_dvp. 
*/ if (cnp->cn_flags & SAVESTART) { ndp->ni_startdir = ndp->ni_dvp; VREF(ndp->ni_startdir); } return (0); } #ifdef NAMEI_DIAGNOSTIC printf("found\n"); #endif ASSERT_VOP_LOCKED(ndp->ni_vp, "lookup"); /* * Take into account any additional components consumed by * the underlying filesystem. */ if (cnp->cn_consume > 0) { cnp->cn_nameptr += cnp->cn_consume; ndp->ni_next += cnp->cn_consume; ndp->ni_pathlen -= cnp->cn_consume; cnp->cn_consume = 0; } dp = ndp->ni_vp; /* * Check to see if the vnode has been mounted on; * if so find the root of the mounted file system. */ while (dp->v_type == VDIR && (mp = dp->v_mountedhere) && (cnp->cn_flags & NOCROSSMOUNT) == 0) { if (vfs_busy(mp, 0, 0, td)) continue; VOP_UNLOCK(dp, 0, td); error = VFS_ROOT(mp, &tdp); vfs_unbusy(mp, td); if (error) { dpunlocked = 1; goto bad2; } vrele(dp); ndp->ni_vp = dp = tdp; } /* * Check for symbolic link */ if ((dp->v_type == VLNK) && ((cnp->cn_flags & FOLLOW) || trailing_slash || *ndp->ni_next == '/')) { cnp->cn_flags |= ISSYMLINK; if (dp->v_mount == NULL) { /* We can't know whether the directory was mounted with * NOSYMFOLLOW, so we can't follow safely. */ error = EBADF; goto bad2; } if (dp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) { error = EACCES; goto bad2; } return (0); } /* * Check for bogus trailing slashes. */ if (trailing_slash && dp->v_type != VDIR) { error = ENOTDIR; goto bad2; } nextname: /* * Not a symbolic link. If more pathname, * continue at next component, else return. */ if (*ndp->ni_next == '/') { cnp->cn_nameptr = ndp->ni_next; while (*cnp->cn_nameptr == '/') { cnp->cn_nameptr++; ndp->ni_pathlen--; } if (ndp->ni_dvp != ndp->ni_vp) ASSERT_VOP_UNLOCKED(ndp->ni_dvp, "lookup"); vrele(ndp->ni_dvp); goto dirloop; } /* * Disallow directory write attempts on read-only file systems. 
*/ if (rdonly && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { error = EROFS; goto bad2; } if (cnp->cn_flags & SAVESTART) { ndp->ni_startdir = ndp->ni_dvp; VREF(ndp->ni_startdir); } if (!wantparent) vrele(ndp->ni_dvp); if ((cnp->cn_flags & LOCKLEAF) == 0) VOP_UNLOCK(dp, 0, td); return (0); bad2: if ((cnp->cn_flags & (LOCKPARENT | PDIRUNLOCK)) == LOCKPARENT && *ndp->ni_next == '\0') VOP_UNLOCK(ndp->ni_dvp, 0, td); vrele(ndp->ni_dvp); bad: if (dpunlocked) vrele(dp); else vput(dp); ndp->ni_vp = NULL; return (error); } /* * relookup - lookup a path name component * Used by lookup to re-aquire things. */ int relookup(dvp, vpp, cnp) struct vnode *dvp, **vpp; struct componentname *cnp; { struct thread *td = cnp->cn_thread; struct vnode *dp = 0; /* the directory we are searching */ int docache; /* == 0 do not cache last component */ int wantparent; /* 1 => wantparent or lockparent flag */ int rdonly; /* lookup read-only flag bit */ int error = 0; #ifdef NAMEI_DIAGNOSTIC int newhash; /* DEBUG: check name hash */ char *cp; /* DEBUG: check name ptr/len */ #endif /* * Setup: break out flag bits into variables. */ wantparent = cnp->cn_flags & (LOCKPARENT|WANTPARENT); docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; if (cnp->cn_nameiop == DELETE || (wantparent && cnp->cn_nameiop != CREATE)) docache = 0; rdonly = cnp->cn_flags & RDONLY; cnp->cn_flags &= ~ISSYMLINK; dp = dvp; vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, td); /* dirloop: */ /* * Search a new directory. * * The last component of the filename is left accessible via * cnp->cn_nameptr for callers that need the name. Callers needing * the name set the SAVENAME flag. When done, they assume * responsibility for freeing the pathname buffer. */ #ifdef NAMEI_DIAGNOSTIC if (cnp->cn_namelen != cp - cnp->cn_nameptr) panic ("relookup: bad len"); if (*cp != 0) panic("relookup: not last component"); printf("{%s}: ", cnp->cn_nameptr); #endif /* * Check for degenerate name (e.g. 
/ or "") * which is a way of talking about a directory, * e.g. like "/." or ".". */ if (cnp->cn_nameptr[0] == '\0') { if (cnp->cn_nameiop != LOOKUP || wantparent) { error = EISDIR; goto bad; } if (dp->v_type != VDIR) { error = ENOTDIR; goto bad; } if (!(cnp->cn_flags & LOCKLEAF)) VOP_UNLOCK(dp, 0, td); *vpp = dp; /* XXX This should probably move to the top of function. */ if (cnp->cn_flags & SAVESTART) panic("lookup: SAVESTART"); return (0); } if (cnp->cn_flags & ISDOTDOT) panic ("relookup: lookup on dot-dot"); /* * We now have a segment name to search for, and a directory to search. */ if ((error = VOP_LOOKUP(dp, vpp, cnp)) != 0) { KASSERT(*vpp == NULL, ("leaf should be empty")); if (error != EJUSTRETURN) goto bad; /* * If creating and at end of pathname, then can consider * allowing file to be created. */ if (rdonly) { error = EROFS; goto bad; } /* ASSERT(dvp == ndp->ni_startdir) */ if (cnp->cn_flags & SAVESTART) VREF(dvp); /* * We return with ni_vp NULL to indicate that the entry * doesn't currently exist, leaving a pointer to the * (possibly locked) directory inode in ndp->ni_dvp. */ return (0); } dp = *vpp; /* * Check for symbolic link */ KASSERT(dp->v_type != VLNK || !(cnp->cn_flags & FOLLOW), ("relookup: symlink found.\n")); /* * Disallow directory write attempts on read-only file systems. 
*/ if (rdonly && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { error = EROFS; goto bad2; } /* ASSERT(dvp == ndp->ni_startdir) */ if (cnp->cn_flags & SAVESTART) VREF(dvp); if (!wantparent) vrele(dvp); if (vn_canvmio(dp) == TRUE && ((cnp->cn_flags & (NOOBJ|LOCKLEAF)) == LOCKLEAF)) vfs_object_create(dp, td, cnp->cn_cred); if ((cnp->cn_flags & LOCKLEAF) == 0) VOP_UNLOCK(dp, 0, td); return (0); bad2: if ((cnp->cn_flags & LOCKPARENT) && (cnp->cn_flags & ISLASTCN)) VOP_UNLOCK(dvp, 0, td); vrele(dvp); bad: vput(dp); *vpp = NULL; return (error); } Index: head/sys/kern/vfs_syscalls.c =================================================================== --- head/sys/kern/vfs_syscalls.c (revision 89305) +++ head/sys/kern/vfs_syscalls.c (revision 89306) @@ -1,4191 +1,4304 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94 * $FreeBSD$ */ /* For 4.3 integer FS ID compatibility */ #include "opt_compat.h" #include "opt_ffs.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int change_dir __P((struct nameidata *ndp, struct thread *td)); static void checkdirs __P((struct vnode *olddp, struct vnode *newdp)); static int chroot_refuse_vdir_fds __P((struct filedesc *fdp)); static int getutimes __P((const struct timeval *, struct timespec *)); static int setfown __P((struct thread *td, struct vnode *, uid_t, gid_t)); static int setfmode __P((struct thread *td, struct vnode *, int)); static int setfflags __P((struct thread *td, struct vnode *, int)); static int setutimes __P((struct thread *td, struct vnode *, const struct timespec *, int)); static int vn_access __P((struct vnode *vp, int user_flags, 
struct ucred *cred, struct thread *td));

static int usermount = 0;	/* if 1, non-root can mount fs. */

int (*union_dircheckp) __P((struct thread *td, struct vnode **, struct file *));

SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0, "");

/*
 * Virtual File System System Calls
 */

#ifndef _SYS_SYSPROTO_H_
struct nmount_args {
	struct iovec *iovp;
	unsigned int iovcnt;
	int flags;
};
#endif
/* ARGSUSED */
/*
 * nmount(2): new-style mount system call.
 * Not yet implemented in this revision; always fails with EOPNOTSUPP.
 */
int
nmount(td, uap)
	struct thread *td;
	struct nmount_args /* {
		syscallarg(struct iovec *) iovp;
		syscallarg(unsigned int) iovcnt;
		syscallarg(int) flags;
	} */ *uap;
{
	return(EOPNOTSUPP);
}

/*
 * Mount a file system.
 */
#ifndef _SYS_SYSPROTO_H_
struct mount_args {
	char *type;
	char *path;
	int flags;
	caddr_t data;
};
#endif
/* ARGSUSED */
/*
 * mount(2): thin user-space shim around vfs_mount().
 * Copies the `type' and `path' strings from user space into
 * kernel buffers, then hands everything to vfs_mount(); `data'
 * is passed through still as a user-space pointer.
 * Returns 0 on success or an errno from copyinstr()/vfs_mount().
 */
int
mount(td, uap)
	struct thread *td;
	struct mount_args /* {
		syscallarg(char *) type;
		syscallarg(char *) path;
		syscallarg(int) flags;
		syscallarg(caddr_t) data;
	} */ *uap;
{
	char *fstype;
	char *fspath;
	int error;

	/* Zeroed so the buffers are NUL-terminated even on short copies. */
	fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK | M_ZERO);
	fspath = malloc(MNAMELEN, M_TEMP, M_WAITOK | M_ZERO);

	/*
	 * vfs_mount() actually takes a kernel string for `type' and
	 * `path' now, so extract them.
	 */
	error = copyinstr(SCARG(uap, type), fstype, MFSNAMELEN, NULL);
	if (error)
		goto finish;
	error = copyinstr(SCARG(uap, path), fspath, MNAMELEN, NULL);
	if (error)
		goto finish;
	error = vfs_mount(td, fstype, fspath, SCARG(uap, flags),
	    SCARG(uap, data));
finish:
	/* Temporary kernel copies are always released, success or failure. */
	free(fstype, M_TEMP);
	free(fspath, M_TEMP);
	return (error);
}

/*
 * vfs_mount(): actually attempt a filesystem mount.
 *
 * This routine is designed to be a "generic" entry point for routines
 * that wish to mount a filesystem.  All parameters except `fsdata' are
 * pointers into kernel space.  `fsdata' is currently still a pointer
 * into userspace.
*/ int vfs_mount(td, fstype, fspath, fsflags, fsdata) struct thread *td; const char *fstype; char *fspath; int fsflags; void *fsdata; { struct vnode *vp; struct mount *mp; struct vfsconf *vfsp; int error, flag = 0, flag2 = 0; struct vattr va; struct nameidata nd; /* * Be ultra-paranoid about making sure the type and fspath * variables will fit in our mp buffers, including the * terminating NUL. */ if ((strlen(fstype) >= MFSNAMELEN - 1) || (strlen(fspath) >= MNAMELEN - 1)) return (ENAMETOOLONG); if (usermount == 0) { error = suser_td(td); if (error) return (error); } /* * Do not allow NFS export by non-root users. */ if (fsflags & MNT_EXPORTED) { error = suser_td(td); if (error) return (error); } /* * Silently enforce MNT_NOSUID and MNT_NODEV for non-root users */ if (suser_xxx(td->td_proc->p_ucred, 0, 0)) fsflags |= MNT_NOSUID | MNT_NODEV; /* * Get vnode to be covered */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspath, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (fsflags & MNT_UPDATE) { if ((vp->v_flag & VROOT) == 0) { vput(vp); return (EINVAL); } mp = vp->v_mount; flag = mp->mnt_flag; flag2 = mp->mnt_kern_flag; /* * We only allow the filesystem to be reloaded if it * is currently mounted read-only. */ if ((fsflags & MNT_RELOAD) && ((mp->mnt_flag & MNT_RDONLY) == 0)) { vput(vp); return (EOPNOTSUPP); /* Needs translation */ } /* * Only root, or the user that did the original mount is * permitted to update it. 
*/ if (mp->mnt_stat.f_owner != td->td_proc->p_ucred->cr_uid) { error = suser_td(td); if (error) { vput(vp); return (error); } } if (vfs_busy(mp, LK_NOWAIT, 0, td)) { vput(vp); return (EBUSY); } mtx_lock(&vp->v_interlock); if ((vp->v_flag & VMOUNT) != 0 || vp->v_mountedhere != NULL) { mtx_unlock(&vp->v_interlock); vfs_unbusy(mp, td); vput(vp); return (EBUSY); } vp->v_flag |= VMOUNT; mtx_unlock(&vp->v_interlock); mp->mnt_flag |= fsflags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | MNT_SNAPSHOT); VOP_UNLOCK(vp, 0, td); goto update; } /* * If the user is not root, ensure that they own the directory * onto which we are attempting to mount. */ error = VOP_GETATTR(vp, &va, td->td_proc->p_ucred, td); if (error) { vput(vp); return (error); } if (va.va_uid != td->td_proc->p_ucred->cr_uid) { error = suser_td(td); if (error) { vput(vp); return (error); } } if ((error = vinvalbuf(vp, V_SAVE, td->td_proc->p_ucred, td, 0, 0)) != 0) { vput(vp); return (error); } if (vp->v_type != VDIR) { vput(vp); return (ENOTDIR); } for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) if (!strcmp(vfsp->vfc_name, fstype)) break; if (vfsp == NULL) { linker_file_t lf; /* Only load modules for root (very important!) */ error = suser_td(td); if (error) { vput(vp); return error; } error = linker_load_file(fstype, &lf); if (error || lf == NULL) { vput(vp); if (lf == NULL) error = ENODEV; return error; } lf->userrefs++; /* lookup again, see if the VFS was loaded */ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) if (!strcmp(vfsp->vfc_name, fstype)) break; if (vfsp == NULL) { lf->userrefs--; linker_file_unload(lf); vput(vp); return (ENODEV); } } mtx_lock(&vp->v_interlock); if ((vp->v_flag & VMOUNT) != 0 || vp->v_mountedhere != NULL) { mtx_unlock(&vp->v_interlock); vput(vp); return (EBUSY); } vp->v_flag |= VMOUNT; mtx_unlock(&vp->v_interlock); /* * Allocate and initialize the filesystem. 
*/ mp = malloc(sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO); TAILQ_INIT(&mp->mnt_nvnodelist); TAILQ_INIT(&mp->mnt_reservedvnlist); lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE); (void)vfs_busy(mp, LK_NOWAIT, 0, td); mp->mnt_op = vfsp->vfc_vfsops; mp->mnt_vfc = vfsp; vfsp->vfc_refcount++; mp->mnt_stat.f_type = vfsp->vfc_typenum; mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; strncpy(mp->mnt_stat.f_fstypename, fstype, MFSNAMELEN); mp->mnt_stat.f_fstypename[MFSNAMELEN - 1] = '\0'; mp->mnt_vnodecovered = vp; mp->mnt_stat.f_owner = td->td_proc->p_ucred->cr_uid; strncpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN); mp->mnt_stat.f_mntonname[MNAMELEN - 1] = '\0'; mp->mnt_iosize_max = DFLTPHYS; VOP_UNLOCK(vp, 0, td); update: /* * Set the mount level flags. */ if (fsflags & MNT_RDONLY) mp->mnt_flag |= MNT_RDONLY; else if (mp->mnt_flag & MNT_RDONLY) mp->mnt_kern_flag |= MNTK_WANTRDWR; mp->mnt_flag &=~ MNT_UPDATEMASK; mp->mnt_flag |= fsflags & (MNT_UPDATEMASK | MNT_FORCE); /* * Mount the filesystem. * XXX The final recipients of VFS_MOUNT just overwrite the ndp they * get. No freeing of cn_pnbuf. */ error = VFS_MOUNT(mp, fspath, fsdata, &nd, td); if (mp->mnt_flag & MNT_UPDATE) { if (mp->mnt_kern_flag & MNTK_WANTRDWR) mp->mnt_flag &= ~MNT_RDONLY; mp->mnt_flag &=~ (MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_SNAPSHOT); mp->mnt_kern_flag &=~ MNTK_WANTRDWR; if (error) { mp->mnt_flag = flag; mp->mnt_kern_flag = flag2; } if ((mp->mnt_flag & MNT_RDONLY) == 0) { if (mp->mnt_syncer == NULL) error = vfs_allocate_syncvnode(mp); } else { if (mp->mnt_syncer != NULL) vrele(mp->mnt_syncer); mp->mnt_syncer = NULL; } vfs_unbusy(mp, td); mtx_lock(&vp->v_interlock); vp->v_flag &= ~VMOUNT; mtx_unlock(&vp->v_interlock); vrele(vp); return (error); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); /* * Put the new filesystem on the mount list after root. 
*/ cache_purge(vp); if (!error) { struct vnode *newdp; mtx_lock(&vp->v_interlock); vp->v_flag &= ~VMOUNT; vp->v_mountedhere = mp; mtx_unlock(&vp->v_interlock); mtx_lock(&mountlist_mtx); TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); if (VFS_ROOT(mp, &newdp)) panic("mount: lost mount"); checkdirs(vp, newdp); vput(newdp); VOP_UNLOCK(vp, 0, td); if ((mp->mnt_flag & MNT_RDONLY) == 0) error = vfs_allocate_syncvnode(mp); vfs_unbusy(mp, td); if ((error = VFS_START(mp, 0, td)) != 0) vrele(vp); } else { mtx_lock(&vp->v_interlock); vp->v_flag &= ~VMOUNT; mtx_unlock(&vp->v_interlock); mp->mnt_vfc->vfc_refcount--; vfs_unbusy(mp, td); free((caddr_t)mp, M_MOUNT); vput(vp); } return (error); } /* * Scan all active processes to see if any of them have a current * or root directory of `olddp'. If so, replace them with the new * mount point. */ static void checkdirs(olddp, newdp) struct vnode *olddp, *newdp; { struct filedesc *fdp; struct proc *p; if (olddp->v_usecount == 1) return; sx_slock(&allproc_lock); LIST_FOREACH(p, &allproc, p_list) { fdp = p->p_fd; if (fdp == NULL) continue; + FILEDESC_LOCK(fdp); if (fdp->fd_cdir == olddp) { - vrele(fdp->fd_cdir); VREF(newdp); fdp->fd_cdir = newdp; + FILEDESC_UNLOCK(fdp); + vrele(olddp); + FILEDESC_LOCK(fdp); } if (fdp->fd_rdir == olddp) { - vrele(fdp->fd_rdir); VREF(newdp); fdp->fd_rdir = newdp; - } + FILEDESC_UNLOCK(fdp); + vrele(olddp); + } else + FILEDESC_UNLOCK(fdp); } sx_sunlock(&allproc_lock); if (rootvnode == olddp) { vrele(rootvnode); VREF(newdp); rootvnode = newdp; } } /* * Unmount a file system. * * Note: unmount takes a path to the vnode mounted on as argument, * not special file (as before). 
*/ #ifndef _SYS_SYSPROTO_H_ struct unmount_args { char *path; int flags; }; #endif /* ARGSUSED */ int unmount(td, uap) struct thread *td; register struct unmount_args /* { syscallarg(char *) path; syscallarg(int) flags; } */ *uap; { register struct vnode *vp; struct mount *mp; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); mp = vp->v_mount; /* * Only root, or the user that did the original mount is * permitted to unmount this filesystem. */ if (mp->mnt_stat.f_owner != td->td_proc->p_ucred->cr_uid) { error = suser_td(td); if (error) { vput(vp); return (error); } } /* * Don't allow unmounting the root file system. */ if (mp->mnt_flag & MNT_ROOTFS) { vput(vp); return (EINVAL); } /* * Must be the root of the filesystem */ if ((vp->v_flag & VROOT) == 0) { vput(vp); return (EINVAL); } vput(vp); return (dounmount(mp, SCARG(uap, flags), td)); } /* * Do the actual file system unmount. */ int dounmount(mp, flags, td) struct mount *mp; int flags; struct thread *td; { struct vnode *coveredvp, *fsrootvp; int error; int async_flag; mtx_lock(&mountlist_mtx); mp->mnt_kern_flag |= MNTK_UNMOUNT; error = lockmgr(&mp->mnt_lock, LK_DRAIN | LK_INTERLOCK | ((flags & MNT_FORCE) ? 0 : LK_NOWAIT), &mountlist_mtx, td); if (error) { mp->mnt_kern_flag &= ~MNTK_UNMOUNT; if (mp->mnt_kern_flag & MNTK_MWAIT) wakeup((caddr_t)mp); return (error); } vn_start_write(NULL, &mp, V_WAIT); if (mp->mnt_flag & MNT_EXPUBLIC) vfs_setpublicfs(NULL, NULL, NULL); vfs_msync(mp, MNT_WAIT); async_flag = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &=~ MNT_ASYNC; cache_purgevfs(mp); /* remove cache entries for this file sys */ if (mp->mnt_syncer != NULL) vrele(mp->mnt_syncer); /* Move process cdir/rdir refs on fs root to underlying vnode. 
*/ if (VFS_ROOT(mp, &fsrootvp) == 0) { if (mp->mnt_vnodecovered != NULL) checkdirs(fsrootvp, mp->mnt_vnodecovered); if (fsrootvp == rootvnode) { vrele(rootvnode); rootvnode = NULL; } vput(fsrootvp); } if (((mp->mnt_flag & MNT_RDONLY) || (error = VFS_SYNC(mp, MNT_WAIT, td->td_proc->p_ucred, td)) == 0) || (flags & MNT_FORCE)) { error = VFS_UNMOUNT(mp, flags, td); } vn_finished_write(mp); if (error) { /* Undo cdir/rdir and rootvnode changes made above. */ if (VFS_ROOT(mp, &fsrootvp) == 0) { if (mp->mnt_vnodecovered != NULL) checkdirs(mp->mnt_vnodecovered, fsrootvp); if (rootvnode == NULL) { rootvnode = fsrootvp; vref(rootvnode); } vput(fsrootvp); } if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL) (void) vfs_allocate_syncvnode(mp); mtx_lock(&mountlist_mtx); mp->mnt_kern_flag &= ~MNTK_UNMOUNT; mp->mnt_flag |= async_flag; lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK, &mountlist_mtx, td); if (mp->mnt_kern_flag & MNTK_MWAIT) wakeup((caddr_t)mp); return (error); } mtx_lock(&mountlist_mtx); TAILQ_REMOVE(&mountlist, mp, mnt_list); if ((coveredvp = mp->mnt_vnodecovered) != NULL) coveredvp->v_mountedhere = NULL; mp->mnt_vfc->vfc_refcount--; if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) panic("unmount: dangling vnode"); lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK, &mountlist_mtx, td); lockdestroy(&mp->mnt_lock); if (coveredvp != NULL) vrele(coveredvp); if (mp->mnt_kern_flag & MNTK_MWAIT) wakeup((caddr_t)mp); free((caddr_t)mp, M_MOUNT); return (0); } /* * Sync each mounted filesystem. 
 */
#ifndef _SYS_SYSPROTO_H_
struct sync_args {
	int dummy;
};
#endif

#ifdef DEBUG
static int syncprt = 0;
SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
#endif

/* ARGSUSED */
/*
 * sync(2): flush dirty data on every mounted filesystem.
 *
 * Walks the global mountlist, and for each mount that can be busied
 * without sleeping, msyncs and VFS_SYNCs it with MNT_NOWAIT (i.e. this
 * is a best-effort, asynchronous flush).  Read-only mounts and mounts
 * with write access suspended are skipped.  Always returns 0.
 */
int
sync(td, uap)
	struct thread *td;
	struct sync_args *uap;
{
	struct mount *mp, *nmp;
	int asyncflag;

	mtx_lock(&mountlist_mtx);
	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
		/*
		 * vfs_busy() drops mountlist_mtx (passed as interlock);
		 * on failure just step to the next mount.
		 */
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
			nmp = TAILQ_NEXT(mp, mnt_list);
			continue;
		}
		if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
		    vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
			/*
			 * Temporarily clear MNT_ASYNC so the sync is not
			 * itself deferred, then restore the caller-visible
			 * flag afterwards.
			 */
			asyncflag = mp->mnt_flag & MNT_ASYNC;
			mp->mnt_flag &= ~MNT_ASYNC;
			vfs_msync(mp, MNT_NOWAIT);
			VFS_SYNC(mp, MNT_NOWAIT,
			    ((td != NULL) ? td->td_proc->p_ucred : NOCRED), td);
			mp->mnt_flag |= asyncflag;
			vn_finished_write(mp);
		}
		/* Re-take the list lock before advancing and unbusying. */
		mtx_lock(&mountlist_mtx);
		nmp = TAILQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp, td);
	}
	mtx_unlock(&mountlist_mtx);
#if 0
/*
 * XXX don't call vfs_bufstats() yet because that routine
 * was not imported in the Lite2 merge.
 */
#ifdef DIAGNOSTIC
	if (syncprt)
		vfs_bufstats();
#endif /* DIAGNOSTIC */
#endif
	return (0);
}

/* XXX PRISON: could be per prison flag */
static int prison_quotas;
#if 0
SYSCTL_INT(_kern_prison, OID_AUTO, quotas, CTLFLAG_RW, &prison_quotas, 0, "");
#endif

/*
 * Change filesystem quotas.
 */
#ifndef _SYS_SYSPROTO_H_
struct quotactl_args {
	char *path;
	int cmd;
	int uid;
	caddr_t arg;
};
#endif
/* ARGSUSED */
/*
 * quotactl(2): manipulate filesystem quotas.
 *
 * Looks up `path' to find the mount point, suspends writes on it,
 * and forwards cmd/uid/arg to the filesystem via VFS_QUOTACTL().
 * Jailed processes are refused (EPERM) unless prison_quotas is set.
 */
int
quotactl(td, uap)
	struct thread *td;
	register struct quotactl_args /* {
		syscallarg(char *) path;
		syscallarg(int) cmd;
		syscallarg(int) uid;
		syscallarg(caddr_t) arg;
	} */ *uap;
{
	struct mount *mp;
	int error;
	struct nameidata nd;

	if (jailed(td->td_proc->p_ucred) && !prison_quotas)
		return (EPERM);
	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
	if ((error = namei(&nd)) != 0)
		return (error);
	NDFREE(&nd, NDF_ONLY_PNBUF);
	/*
	 * vn_start_write() records the vnode's mount in `mp'; the vnode
	 * reference itself is no longer needed once that is done.
	 */
	error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH);
	vrele(nd.ni_vp);
	if (error)
		return (error);
	error = VFS_QUOTACTL(mp, SCARG(uap, cmd), SCARG(uap, uid),
	    SCARG(uap, arg), td);
	vn_finished_write(mp);
	return (error);
}

/*
 * Get filesystem statistics.
 */
#ifndef _SYS_SYSPROTO_H_
struct statfs_args {
	char *path;
	struct statfs *buf;
};
#endif
/* ARGSUSED */
/*
 * statfs(2): return statistics for the filesystem containing `path'.
 *
 * The statfs data is refreshed via VFS_STATFS() and copied out to the
 * user buffer.  For unprivileged callers the filesystem id is zeroed
 * in a local copy before copyout, so the real fsid is never leaked.
 */
int
statfs(td, uap)
	struct thread *td;
	register struct statfs_args /* {
		syscallarg(char *) path;
		syscallarg(struct statfs *) buf;
	} */ *uap;
{
	register struct mount *mp;
	register struct statfs *sp;
	int error;
	struct nameidata nd;
	struct statfs sb;

	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
	if ((error = namei(&nd)) != 0)
		return (error);
	mp = nd.ni_vp->v_mount;
	sp = &mp->mnt_stat;
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vrele(nd.ni_vp);
	error = VFS_STATFS(mp, sp, td);
	if (error)
		return (error);
	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
	/* Non-superuser: hide the fsid by copying out a sanitized copy. */
	if (suser_xxx(td->td_proc->p_ucred, 0, 0)) {
		bcopy((caddr_t)sp, (caddr_t)&sb, sizeof(sb));
		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
		sp = &sb;
	}
	return (copyout((caddr_t)sp, (caddr_t)SCARG(uap, buf), sizeof(*sp)));
}

/*
 * Get filesystem statistics.
*/ #ifndef _SYS_SYSPROTO_H_ struct fstatfs_args { int fd; struct statfs *buf; }; #endif /* ARGSUSED */ int fstatfs(td, uap) struct thread *td; register struct fstatfs_args /* { syscallarg(int) fd; syscallarg(struct statfs *) buf; } */ *uap; { struct file *fp; struct mount *mp; register struct statfs *sp; int error; struct statfs sb; if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); mp = ((struct vnode *)fp->f_data)->v_mount; + fdrop(fp, td); if (mp == NULL) return (EBADF); sp = &mp->mnt_stat; error = VFS_STATFS(mp, sp, td); if (error) return (error); sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; if (suser_xxx(td->td_proc->p_ucred, 0, 0)) { bcopy((caddr_t)sp, (caddr_t)&sb, sizeof(sb)); sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; sp = &sb; } return (copyout((caddr_t)sp, (caddr_t)SCARG(uap, buf), sizeof(*sp))); } /* * Get statistics on all filesystems. */ #ifndef _SYS_SYSPROTO_H_ struct getfsstat_args { struct statfs *buf; long bufsize; int flags; }; #endif int getfsstat(td, uap) struct thread *td; register struct getfsstat_args /* { syscallarg(struct statfs *) buf; syscallarg(long) bufsize; syscallarg(int) flags; } */ *uap; { register struct mount *mp, *nmp; register struct statfs *sp; caddr_t sfsp; long count, maxcount, error; maxcount = SCARG(uap, bufsize) / sizeof(struct statfs); sfsp = (caddr_t)SCARG(uap, buf); count = 0; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } if (sfsp && count < maxcount) { sp = &mp->mnt_stat; /* * If MNT_NOWAIT or MNT_LAZY is specified, do not * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY * overrides MNT_WAIT. 
*/ if (((SCARG(uap, flags) & (MNT_LAZY|MNT_NOWAIT)) == 0 || (SCARG(uap, flags) & MNT_WAIT)) && (error = VFS_STATFS(mp, sp, td))) { mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, td); continue; } sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; error = copyout((caddr_t)sp, sfsp, sizeof(*sp)); if (error) { vfs_unbusy(mp, td); return (error); } sfsp += sizeof(*sp); } count++; mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, td); } mtx_unlock(&mountlist_mtx); if (sfsp && count > maxcount) td->td_retval[0] = maxcount; else td->td_retval[0] = count; return (0); } /* * Change current working directory to a given file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchdir_args { int fd; }; #endif /* ARGSUSED */ int fchdir(td, uap) struct thread *td; struct fchdir_args /* { syscallarg(int) fd; } */ *uap; { register struct filedesc *fdp = td->td_proc->p_fd; - struct vnode *vp, *tdp; + struct vnode *vp, *tdp, *vpold; struct mount *mp; struct file *fp; int error; if ((error = getvnode(fdp, SCARG(uap, fd), &fp)) != 0) return (error); vp = (struct vnode *)fp->f_data; VREF(vp); + fdrop(fp, td); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (vp->v_type != VDIR) error = ENOTDIR; else error = VOP_ACCESS(vp, VEXEC, td->td_proc->p_ucred, td); while (!error && (mp = vp->v_mountedhere) != NULL) { if (vfs_busy(mp, 0, 0, td)) continue; error = VFS_ROOT(mp, &tdp); vfs_unbusy(mp, td); if (error) break; vput(vp); vp = tdp; } if (error) { vput(vp); return (error); } VOP_UNLOCK(vp, 0, td); - vrele(fdp->fd_cdir); + FILEDESC_LOCK(fdp); + vpold = fdp->fd_cdir; fdp->fd_cdir = vp; + FILEDESC_UNLOCK(fdp); + vrele(vpold); return (0); } /* * Change current working directory (``.''). 
*/ #ifndef _SYS_SYSPROTO_H_ struct chdir_args { char *path; }; #endif /* ARGSUSED */ int chdir(td, uap) struct thread *td; struct chdir_args /* { syscallarg(char *) path; } */ *uap; { register struct filedesc *fdp = td->td_proc->p_fd; int error; struct nameidata nd; + struct vnode *vp; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, SCARG(uap, path), td); if ((error = change_dir(&nd, td)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); - vrele(fdp->fd_cdir); + FILEDESC_LOCK(fdp); + vp = fdp->fd_cdir; fdp->fd_cdir = nd.ni_vp; + FILEDESC_UNLOCK(fdp); + vrele(vp); return (0); } /* * Helper function for raised chroot(2) security function: Refuse if * any filedescriptors are open directories. */ static int chroot_refuse_vdir_fds(fdp) struct filedesc *fdp; { struct vnode *vp; struct file *fp; + struct thread *td = curthread; int error; int fd; + FILEDESC_LOCK(fdp); for (fd = 0; fd < fdp->fd_nfiles ; fd++) { error = getvnode(fdp, fd, &fp); if (error) continue; vp = (struct vnode *)fp->f_data; + fdrop(fp, td); if (vp->v_type != VDIR) continue; + FILEDESC_UNLOCK(fdp); return(EPERM); } + FILEDESC_UNLOCK(fdp); return (0); } /* * This sysctl determines if we will allow a process to chroot(2) if it * has a directory open: * 0: disallowed for all processes. * 1: allowed for processes that were not already chroot(2)'ed. * 2: allowed for all processes. */ static int chroot_allow_open_directories = 1; SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW, &chroot_allow_open_directories, 0, ""); /* * Change notion of root (``/'') directory. 
*/ #ifndef _SYS_SYSPROTO_H_ struct chroot_args { char *path; }; #endif /* ARGSUSED */ int chroot(td, uap) struct thread *td; struct chroot_args /* { syscallarg(char *) path; } */ *uap; { register struct filedesc *fdp = td->td_proc->p_fd; int error; struct nameidata nd; + struct vnode *vp; error = suser_xxx(0, td->td_proc, PRISON_ROOT); if (error) return (error); + FILEDESC_LOCK(fdp); if (chroot_allow_open_directories == 0 || - (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) + (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) { + FILEDESC_UNLOCK(fdp); error = chroot_refuse_vdir_fds(fdp); + } else + FILEDESC_UNLOCK(fdp); if (error) return (error); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, SCARG(uap, path), td); if ((error = change_dir(&nd, td)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); - vrele(fdp->fd_rdir); + FILEDESC_LOCK(fdp); + vp = fdp->fd_rdir; fdp->fd_rdir = nd.ni_vp; if (!fdp->fd_jdir) { fdp->fd_jdir = nd.ni_vp; VREF(fdp->fd_jdir); } + FILEDESC_UNLOCK(fdp); + vrele(vp); return (0); } /* * Common routine for chroot and chdir. */ static int change_dir(ndp, td) register struct nameidata *ndp; struct thread *td; { struct vnode *vp; int error; error = namei(ndp); if (error) return (error); vp = ndp->ni_vp; if (vp->v_type != VDIR) error = ENOTDIR; else error = VOP_ACCESS(vp, VEXEC, td->td_proc->p_ucred, td); if (error) vput(vp); else VOP_UNLOCK(vp, 0, td); return (error); } /* * Check permissions, allocate an open file structure, * and call the device open routine if any. 
*/ #ifndef _SYS_SYSPROTO_H_ struct open_args { char *path; int flags; int mode; }; #endif int open(td, uap) struct thread *td; register struct open_args /* { syscallarg(char *) path; syscallarg(int) flags; syscallarg(int) mode; } */ *uap; { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; struct file *fp; struct vnode *vp; struct vattr vat; struct mount *mp; int cmode, flags, oflags; struct file *nfp; int type, indx, error; struct flock lf; struct nameidata nd; oflags = SCARG(uap, flags); if ((oflags & O_ACCMODE) == O_ACCMODE) return (EINVAL); flags = FFLAGS(oflags); error = falloc(td, &nfp, &indx); if (error) return (error); fp = nfp; + FILEDESC_LOCK(fdp); cmode = ((SCARG(uap, mode) &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT; + FILEDESC_UNLOCK(fdp); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); td->td_dupfd = -indx - 1; /* XXX check for fdopen */ /* * Bump the ref count to prevent another process from closing * the descriptor while we are blocked in vn_open() */ fhold(fp); error = vn_open(&nd, &flags, cmode); if (error) { /* * release our own reference */ fdrop(fp, td); /* * handle special fdopen() case. bleh. dupfdopen() is * responsible for dropping the old contents of ofiles[indx] * if it succeeds. */ if ((error == ENODEV || error == ENXIO) && td->td_dupfd >= 0 && /* XXX from fdopen */ (error = dupfdopen(td, fdp, indx, td->td_dupfd, flags, error)) == 0) { td->td_retval[0] = indx; return (0); } /* * Clean up the descriptor, but only if another thread hadn't * replaced or closed it. */ + FILEDESC_LOCK(fdp); if (fdp->fd_ofiles[indx] == fp) { fdp->fd_ofiles[indx] = NULL; + FILEDESC_UNLOCK(fdp); fdrop(fp, td); - } + } else + FILEDESC_UNLOCK(fdp); if (error == ERESTART) error = EINTR; return (error); } td->td_dupfd = 0; NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; /* * There should be 2 references on the file, one from the descriptor * table, and one for us. 
* * Handle the case where someone closed the file (via its file * descriptor) while we were blocked. The end result should look * like opening the file succeeded but it was immediately closed. */ + FILEDESC_LOCK(fdp); + FILE_LOCK(fp); if (fp->f_count == 1) { KASSERT(fdp->fd_ofiles[indx] != fp, ("Open file descriptor lost all refs")); + FILEDESC_UNLOCK(fdp); + FILE_UNLOCK(fp); VOP_UNLOCK(vp, 0, td); vn_close(vp, flags & FMASK, fp->f_cred, td); fdrop(fp, td); td->td_retval[0] = indx; return 0; } fp->f_data = (caddr_t)vp; fp->f_flag = flags & FMASK; fp->f_ops = &vnops; fp->f_type = (vp->v_type == VFIFO ? DTYPE_FIFO : DTYPE_VNODE); + FILEDESC_UNLOCK(fdp); + FILE_UNLOCK(fp); VOP_UNLOCK(vp, 0, td); if (flags & (O_EXLOCK | O_SHLOCK)) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (flags & O_EXLOCK) lf.l_type = F_WRLCK; else lf.l_type = F_RDLCK; type = F_FLOCK; if ((flags & FNONBLOCK) == 0) type |= F_WAIT; if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) goto bad; fp->f_flag |= FHASLOCK; } if (flags & O_TRUNC) { if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto bad; VOP_LEASE(vp, td, p->p_ucred, LEASE_WRITE); VATTR_NULL(&vat); vat.va_size = 0; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_SETATTR(vp, &vat, p->p_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); if (error) goto bad; } /* assert that vn_open created a backing object if one is needed */ KASSERT(!vn_canvmio(vp) || VOP_GETVOBJECT(vp, NULL) == 0, ("open: vmio vnode has no backing object after vn_open")); /* * Release our private reference, leaving the one associated with * the descriptor table intact. */ fdrop(fp, td); td->td_retval[0] = indx; return (0); bad: + FILEDESC_LOCK(fdp); if (fdp->fd_ofiles[indx] == fp) { fdp->fd_ofiles[indx] = NULL; + FILEDESC_UNLOCK(fdp); fdrop(fp, td); - } - fdrop(fp, td); + } else + FILEDESC_UNLOCK(fdp); return (error); } #ifdef COMPAT_43 /* * Create a file. 
*/ #ifndef _SYS_SYSPROTO_H_ struct ocreat_args { char *path; int mode; }; #endif int ocreat(td, uap) struct thread *td; register struct ocreat_args /* { syscallarg(char *) path; syscallarg(int) mode; } */ *uap; { struct open_args /* { syscallarg(char *) path; syscallarg(int) flags; syscallarg(int) mode; } */ nuap; SCARG(&nuap, path) = SCARG(uap, path); SCARG(&nuap, mode) = SCARG(uap, mode); SCARG(&nuap, flags) = O_WRONLY | O_CREAT | O_TRUNC; return (open(td, &nuap)); } #endif /* COMPAT_43 */ /* * Create a special file. */ #ifndef _SYS_SYSPROTO_H_ struct mknod_args { char *path; int mode; int dev; }; #endif /* ARGSUSED */ int mknod(td, uap) struct thread *td; register struct mknod_args /* { syscallarg(char *) path; syscallarg(int) mode; syscallarg(int) dev; } */ *uap; { struct vnode *vp; struct mount *mp; struct vattr vattr; int error; int whiteout = 0; struct nameidata nd; switch (SCARG(uap, mode) & S_IFMT) { case S_IFCHR: case S_IFBLK: error = suser_td(td); break; default: error = suser_xxx(0, td->td_proc, PRISON_ROOT); break; } if (error) return (error); restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if (vp != NULL) { vrele(vp); error = EEXIST; } else { VATTR_NULL(&vattr); + FILEDESC_LOCK(td->td_proc->p_fd); vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ td->td_proc->p_fd->fd_cmask; + FILEDESC_UNLOCK(td->td_proc->p_fd); vattr.va_rdev = SCARG(uap, dev); whiteout = 0; switch (SCARG(uap, mode) & S_IFMT) { case S_IFMT: /* used by badsect to flag bad sectors */ vattr.va_type = VBAD; break; case S_IFCHR: vattr.va_type = VCHR; break; case S_IFBLK: vattr.va_type = VBLK; break; case S_IFWHT: whiteout = 1; break; default: error = EINVAL; break; } } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } if (!error) { 
VOP_LEASE(nd.ni_dvp, td, td->td_proc->p_ucred, LEASE_WRITE); if (whiteout) error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE); else { error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); if (error == 0) vput(nd.ni_vp); } } NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mknod"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "mknod"); return (error); } /* * Create a named pipe. */ #ifndef _SYS_SYSPROTO_H_ struct mkfifo_args { char *path; int mode; }; #endif /* ARGSUSED */ int mkfifo(td, uap) struct thread *td; register struct mkfifo_args /* { syscallarg(char *) path; syscallarg(int) mode; } */ *uap; { struct mount *mp; struct vattr vattr; int error; struct nameidata nd; restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); if (nd.ni_vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); vrele(nd.ni_vp); vput(nd.ni_dvp); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); vattr.va_type = VFIFO; + FILEDESC_LOCK(td->td_proc->p_fd); vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ td->td_proc->p_fd->fd_cmask; + FILEDESC_UNLOCK(td->td_proc->p_fd); VOP_LEASE(nd.ni_dvp, td, td->td_proc->p_ucred, LEASE_WRITE); error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); if (error == 0) vput(nd.ni_vp); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); return (error); } /* * Make a hard file link. 
*/ #ifndef _SYS_SYSPROTO_H_ struct link_args { char *path; char *link; }; #endif /* ARGSUSED */ int link(td, uap) struct thread *td; register struct link_args /* { syscallarg(char *) path; syscallarg(char *) link; } */ *uap; { struct vnode *vp; struct mount *mp; struct nameidata nd; int error; bwillwrite(); NDINIT(&nd, LOOKUP, FOLLOW|NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (vp->v_type == VDIR) { vrele(vp); return (EPERM); /* POSIX */ } if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { vrele(vp); return (error); } NDINIT(&nd, CREATE, LOCKPARENT|NOOBJ, UIO_USERSPACE, SCARG(uap, link), td); if ((error = namei(&nd)) == 0) { if (nd.ni_vp != NULL) { vrele(nd.ni_vp); error = EEXIST; } else { VOP_LEASE(nd.ni_dvp, td, td->td_proc->p_ucred, LEASE_WRITE); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd); } NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); } vrele(vp); vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "link"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "link"); return (error); } /* * Make a symbolic link. 
*/ #ifndef _SYS_SYSPROTO_H_ struct symlink_args { char *path; char *link; }; #endif /* ARGSUSED */ int symlink(td, uap) struct thread *td; register struct symlink_args /* { syscallarg(char *) path; syscallarg(char *) link; } */ *uap; { struct mount *mp; struct vattr vattr; char *path; int error; struct nameidata nd; path = zalloc(namei_zone); if ((error = copyinstr(SCARG(uap, path), path, MAXPATHLEN, NULL)) != 0) goto out; restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT|NOOBJ, UIO_USERSPACE, SCARG(uap, link), td); if ((error = namei(&nd)) != 0) goto out; if (nd.ni_vp) { NDFREE(&nd, NDF_ONLY_PNBUF); vrele(nd.ni_vp); vput(nd.ni_dvp); error = EEXIST; goto out; } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); + FILEDESC_LOCK(td->td_proc->p_fd); vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask; + FILEDESC_UNLOCK(td->td_proc->p_fd); VOP_LEASE(nd.ni_dvp, td, td->td_proc->p_ucred, LEASE_WRITE); error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path); NDFREE(&nd, NDF_ONLY_PNBUF); if (error == 0) vput(nd.ni_vp); vput(nd.ni_dvp); vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "symlink"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "symlink"); out: zfree(namei_zone, path); return (error); } /* * Delete a whiteout from the filesystem. 
*/ /* ARGSUSED */ int undelete(td, uap) struct thread *td; register struct undelete_args /* { syscallarg(char *) path; } */ *uap; { int error; struct mount *mp; struct nameidata nd; restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT|DOWHITEOUT, UIO_USERSPACE, SCARG(uap, path), td); error = namei(&nd); if (error) return (error); if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp) vrele(nd.ni_vp); vput(nd.ni_dvp); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VOP_LEASE(nd.ni_dvp, td, td->td_proc->p_ucred, LEASE_WRITE); error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "undelete"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "undelete"); return (error); } /* * Delete a name from the filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct unlink_args { char *path; }; #endif /* ARGSUSED */ int unlink(td, uap) struct thread *td; struct unlink_args /* { syscallarg(char *) path; } */ *uap; { struct mount *mp; struct vnode *vp; int error; struct nameidata nd; restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if (vp->v_type == VDIR) error = EPERM; /* POSIX */ else { /* * The root of a mounted filesystem cannot be deleted. * * XXX: can this only be a VDIR case? 
*/ if (vp->v_flag & VROOT) error = EBUSY; } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vrele(vp); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (!error) { VOP_LEASE(nd.ni_dvp, td, td->td_proc->p_ucred, LEASE_WRITE); error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); } NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vput(vp); vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "unlink"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "unlink"); return (error); } /* * Reposition read/write file offset. */ #ifndef _SYS_SYSPROTO_H_ struct lseek_args { int fd; int pad; off_t offset; int whence; }; #endif int lseek(td, uap) struct thread *td; register struct lseek_args /* { syscallarg(int) fd; syscallarg(int) pad; syscallarg(off_t) offset; syscallarg(int) whence; } */ *uap; { struct ucred *cred = td->td_proc->p_ucred; - register struct filedesc *fdp = td->td_proc->p_fd; register struct file *fp; - struct vattr vattr; struct vnode *vp; + struct vattr vattr; off_t offset; int error, noneg; - if ((u_int)SCARG(uap, fd) >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL) + fp = ffind_hold(td, uap->fd); + if (fp == NULL) return (EBADF); - if (fp->f_type != DTYPE_VNODE) + if (fp->f_type != DTYPE_VNODE) { + fdrop(fp, td); return (ESPIPE); + } vp = (struct vnode *)fp->f_data; noneg = (vp->v_type != VCHR); offset = SCARG(uap, offset); switch (SCARG(uap, whence)) { case L_INCR: if (noneg && (fp->f_offset < 0 || (offset > 0 && fp->f_offset > OFF_MAX - offset))) return (EOVERFLOW); offset += fp->f_offset; break; case L_XTND: error = VOP_GETATTR(vp, &vattr, cred, td); if (error) return (error); if (noneg && (vattr.va_size > OFF_MAX || (offset > 0 && vattr.va_size > OFF_MAX - offset))) return (EOVERFLOW); offset += vattr.va_size; break; case L_SET: break; default: + fdrop(fp, td); 
return (EINVAL); } if (noneg && offset < 0) return (EINVAL); fp->f_offset = offset; *(off_t *)(td->td_retval) = fp->f_offset; + fdrop(fp, td); return (0); } #if defined(COMPAT_43) || defined(COMPAT_SUNOS) /* * Reposition read/write file offset. */ #ifndef _SYS_SYSPROTO_H_ struct olseek_args { int fd; long offset; int whence; }; #endif int olseek(td, uap) struct thread *td; register struct olseek_args /* { syscallarg(int) fd; syscallarg(long) offset; syscallarg(int) whence; } */ *uap; { struct lseek_args /* { syscallarg(int) fd; syscallarg(int) pad; syscallarg(off_t) offset; syscallarg(int) whence; } */ nuap; int error; SCARG(&nuap, fd) = SCARG(uap, fd); SCARG(&nuap, offset) = SCARG(uap, offset); SCARG(&nuap, whence) = SCARG(uap, whence); error = lseek(td, &nuap); return (error); } #endif /* COMPAT_43 */ /* * Check access permissions using passed credentials. */ static int vn_access(vp, user_flags, cred, td) struct vnode *vp; int user_flags; struct ucred *cred; struct thread *td; { int error, flags; /* Flags == 0 means only check for existence. */ error = 0; if (user_flags) { flags = 0; if (user_flags & R_OK) flags |= VREAD; if (user_flags & W_OK) flags |= VWRITE; if (user_flags & X_OK) flags |= VEXEC; if ((flags & VWRITE) == 0 || (error = vn_writechk(vp)) == 0) error = VOP_ACCESS(vp, flags, cred, td); } return (error); } /* * Check access permissions using "real" credentials. */ #ifndef _SYS_SYSPROTO_H_ struct access_args { char *path; int flags; }; #endif int access(td, uap) struct thread *td; register struct access_args /* { syscallarg(char *) path; syscallarg(int) flags; } */ *uap; { struct ucred *cred, *tmpcred; register struct vnode *vp; int error; struct nameidata nd; cred = td->td_proc->p_ucred; /* * Create and modify a temporary credential instead of one that * is potentially shared. This could also mess up socket * buffer accounting which can run in an interrupt context. 
* * XXX - Depending on how "threads" are finally implemented, it * may be better to explicitly pass the credential to namei() * rather than to modify the potentially shared process structure. */ tmpcred = crdup(cred); tmpcred->cr_uid = cred->cr_ruid; tmpcred->cr_groups[0] = cred->cr_rgid; td->td_proc->p_ucred = tmpcred; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) goto out1; vp = nd.ni_vp; error = vn_access(vp, SCARG(uap, flags), tmpcred, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); out1: td->td_proc->p_ucred = cred; crfree(tmpcred); return (error); } /* * Check access permissions using "effective" credentials. */ #ifndef _SYS_SYSPROTO_H_ struct eaccess_args { char *path; int flags; }; #endif int eaccess(td, uap) struct thread *td; register struct eaccess_args /* { syscallarg(char *) path; syscallarg(int) flags; } */ *uap; { struct nameidata nd; struct vnode *vp; int error; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; error = vn_access(vp, SCARG(uap, flags), td->td_proc->p_ucred, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); return (error); } #if defined(COMPAT_43) || defined(COMPAT_SUNOS) /* * Get file status; this version follows links. 
*/ #ifndef _SYS_SYSPROTO_H_ struct ostat_args { char *path; struct ostat *ub; }; #endif /* ARGSUSED */ int ostat(td, uap) struct thread *td; register struct ostat_args /* { syscallarg(char *) path; syscallarg(struct ostat *) ub; } */ *uap; { struct stat sb; struct ostat osb; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_stat(nd.ni_vp, &sb, td); vput(nd.ni_vp); if (error) return (error); cvtstat(&sb, &osb); error = copyout((caddr_t)&osb, (caddr_t)SCARG(uap, ub), sizeof (osb)); return (error); } /* * Get file status; this version does not follow links. */ #ifndef _SYS_SYSPROTO_H_ struct olstat_args { char *path; struct ostat *ub; }; #endif /* ARGSUSED */ int olstat(td, uap) struct thread *td; register struct olstat_args /* { syscallarg(char *) path; syscallarg(struct ostat *) ub; } */ *uap; { struct vnode *vp; struct stat sb; struct ostat osb; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; error = vn_stat(vp, &sb, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); if (error) return (error); cvtstat(&sb, &osb); error = copyout((caddr_t)&osb, (caddr_t)SCARG(uap, ub), sizeof (osb)); return (error); } /* * Convert from an old to a new stat structure. 
*/ void cvtstat(st, ost) struct stat *st; struct ostat *ost; { ost->st_dev = st->st_dev; ost->st_ino = st->st_ino; ost->st_mode = st->st_mode; ost->st_nlink = st->st_nlink; ost->st_uid = st->st_uid; ost->st_gid = st->st_gid; ost->st_rdev = st->st_rdev; if (st->st_size < (quad_t)1 << 32) ost->st_size = st->st_size; else ost->st_size = -2; ost->st_atime = st->st_atime; ost->st_mtime = st->st_mtime; ost->st_ctime = st->st_ctime; ost->st_blksize = st->st_blksize; ost->st_blocks = st->st_blocks; ost->st_flags = st->st_flags; ost->st_gen = st->st_gen; } #endif /* COMPAT_43 || COMPAT_SUNOS */ /* * Get file status; this version follows links. */ #ifndef _SYS_SYSPROTO_H_ struct stat_args { char *path; struct stat *ub; }; #endif /* ARGSUSED */ int stat(td, uap) struct thread *td; register struct stat_args /* { syscallarg(char *) path; syscallarg(struct stat *) ub; } */ *uap; { struct stat sb; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); error = vn_stat(nd.ni_vp, &sb, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_vp); if (error) return (error); error = copyout((caddr_t)&sb, (caddr_t)SCARG(uap, ub), sizeof (sb)); return (error); } /* * Get file status; this version does not follow links. 
*/ #ifndef _SYS_SYSPROTO_H_ struct lstat_args { char *path; struct stat *ub; }; #endif /* ARGSUSED */ int lstat(td, uap) struct thread *td; register struct lstat_args /* { syscallarg(char *) path; syscallarg(struct stat *) ub; } */ *uap; { int error; struct vnode *vp; struct stat sb; struct nameidata nd; NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; error = vn_stat(vp, &sb, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); if (error) return (error); error = copyout((caddr_t)&sb, (caddr_t)SCARG(uap, ub), sizeof (sb)); return (error); } /* * Implementation of the NetBSD stat() function. * XXX This should probably be collapsed with the FreeBSD version, * as the differences are only due to vn_stat() clearing spares at * the end of the structures. vn_stat could be split to avoid this, * and thus collapse the following to close to zero code. */ void cvtnstat(sb, nsb) struct stat *sb; struct nstat *nsb; { nsb->st_dev = sb->st_dev; nsb->st_ino = sb->st_ino; nsb->st_mode = sb->st_mode; nsb->st_nlink = sb->st_nlink; nsb->st_uid = sb->st_uid; nsb->st_gid = sb->st_gid; nsb->st_rdev = sb->st_rdev; nsb->st_atimespec = sb->st_atimespec; nsb->st_mtimespec = sb->st_mtimespec; nsb->st_ctimespec = sb->st_ctimespec; nsb->st_size = sb->st_size; nsb->st_blocks = sb->st_blocks; nsb->st_blksize = sb->st_blksize; nsb->st_flags = sb->st_flags; nsb->st_gen = sb->st_gen; nsb->st_qspare[0] = sb->st_qspare[0]; nsb->st_qspare[1] = sb->st_qspare[1]; } #ifndef _SYS_SYSPROTO_H_ struct nstat_args { char *path; struct nstat *ub; }; #endif /* ARGSUSED */ int nstat(td, uap) struct thread *td; register struct nstat_args /* { syscallarg(char *) path; syscallarg(struct nstat *) ub; } */ *uap; { struct stat sb; struct nstat nsb; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, 
NDF_ONLY_PNBUF); error = vn_stat(nd.ni_vp, &sb, td); vput(nd.ni_vp); if (error) return (error); cvtnstat(&sb, &nsb); error = copyout((caddr_t)&nsb, (caddr_t)SCARG(uap, ub), sizeof (nsb)); return (error); } /* * NetBSD lstat. Get file status; this version does not follow links. */ #ifndef _SYS_SYSPROTO_H_ struct lstat_args { char *path; struct stat *ub; }; #endif /* ARGSUSED */ int nlstat(td, uap) struct thread *td; register struct nlstat_args /* { syscallarg(char *) path; syscallarg(struct nstat *) ub; } */ *uap; { int error; struct vnode *vp; struct stat sb; struct nstat nsb; struct nameidata nd; NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_stat(vp, &sb, td); vput(vp); if (error) return (error); cvtnstat(&sb, &nsb); error = copyout((caddr_t)&nsb, (caddr_t)SCARG(uap, ub), sizeof (nsb)); return (error); } /* * Get configurable pathname variables. */ #ifndef _SYS_SYSPROTO_H_ struct pathconf_args { char *path; int name; }; #endif /* ARGSUSED */ int pathconf(td, uap) struct thread *td; register struct pathconf_args /* { syscallarg(char *) path; syscallarg(int) name; } */ *uap; { int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = VOP_PATHCONF(nd.ni_vp, SCARG(uap, name), td->td_retval); vput(nd.ni_vp); return (error); } /* * Return target name of a symbolic link. 
*/ #ifndef _SYS_SYSPROTO_H_ struct readlink_args { char *path; char *buf; int count; }; #endif /* ARGSUSED */ int readlink(td, uap) struct thread *td; register struct readlink_args /* { syscallarg(char *) path; syscallarg(char *) buf; syscallarg(int) count; } */ *uap; { register struct vnode *vp; struct iovec aiov; struct uio auio; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (vp->v_type != VLNK) error = EINVAL; else { aiov.iov_base = SCARG(uap, buf); aiov.iov_len = SCARG(uap, count); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auio.uio_resid = SCARG(uap, count); error = VOP_READLINK(vp, &auio, td->td_proc->p_ucred); } vput(vp); td->td_retval[0] = SCARG(uap, count) - auio.uio_resid; return (error); } /* * Common implementation code for chflags() and fchflags(). */ static int setfflags(td, vp, flags) struct thread *td; struct vnode *vp; int flags; { int error; struct mount *mp; struct vattr vattr; /* * Prevent non-root users from setting flags on devices. When * a device is reused, users can retain ownership of the device * if they are allowed to set flags and programs assume that * chown can't fail when done as root. */ if (vp->v_type == VCHR || vp->v_type == VBLK) { error = suser_xxx(td->td_proc->p_ucred, td->td_proc, PRISON_ROOT); if (error) return (error); } if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); VATTR_NULL(&vattr); vattr.va_flags = flags; error = VOP_SETATTR(vp, &vattr, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return (error); } /* * Change flags of a file given a path name. 
*/ #ifndef _SYS_SYSPROTO_H_ struct chflags_args { char *path; int flags; }; #endif /* ARGSUSED */ int chflags(td, uap) struct thread *td; register struct chflags_args /* { syscallarg(char *) path; syscallarg(int) flags; } */ *uap; { int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfflags(td, nd.ni_vp, SCARG(uap, flags)); vrele(nd.ni_vp); return error; } /* * Change flags of a file given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchflags_args { int fd; int flags; }; #endif /* ARGSUSED */ int fchflags(td, uap) struct thread *td; register struct fchflags_args /* { syscallarg(int) fd; syscallarg(int) flags; } */ *uap; { struct file *fp; int error; if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); - return setfflags(td, (struct vnode *) fp->f_data, SCARG(uap, flags)); + error = setfflags(td, (struct vnode *) fp->f_data, SCARG(uap, flags)); + fdrop(fp, td); + return (error); } /* * Common implementation code for chmod(), lchmod() and fchmod(). */ static int setfmode(td, vp, mode) struct thread *td; struct vnode *vp; int mode; { int error; struct mount *mp; struct vattr vattr; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); VATTR_NULL(&vattr); vattr.va_mode = mode & ALLPERMS; error = VOP_SETATTR(vp, &vattr, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return error; } /* * Change mode of a file given path name. 
*/ #ifndef _SYS_SYSPROTO_H_ struct chmod_args { char *path; int mode; }; #endif /* ARGSUSED */ int chmod(td, uap) struct thread *td; register struct chmod_args /* { syscallarg(char *) path; syscallarg(int) mode; } */ *uap; { int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfmode(td, nd.ni_vp, SCARG(uap, mode)); vrele(nd.ni_vp); return error; } /* * Change mode of a file given path name (don't follow links.) */ #ifndef _SYS_SYSPROTO_H_ struct lchmod_args { char *path; int mode; }; #endif /* ARGSUSED */ int lchmod(td, uap) struct thread *td; register struct lchmod_args /* { syscallarg(char *) path; syscallarg(int) mode; } */ *uap; { int error; struct nameidata nd; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfmode(td, nd.ni_vp, SCARG(uap, mode)); vrele(nd.ni_vp); return error; } /* * Change mode of a file given a file descriptor. 
*/ #ifndef _SYS_SYSPROTO_H_ struct fchmod_args { int fd; int mode; }; #endif /* ARGSUSED */ int fchmod(td, uap) struct thread *td; register struct fchmod_args /* { syscallarg(int) fd; syscallarg(int) mode; } */ *uap; { struct file *fp; + struct vnode *vp; int error; if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); - return setfmode(td, (struct vnode *)fp->f_data, SCARG(uap, mode)); + vp = (struct vnode *)fp->f_data; + error = setfmode(td, (struct vnode *)fp->f_data, SCARG(uap, mode)); + fdrop(fp, td); + return (error); } /* * Common implementation for chown(), lchown(), and fchown() */ static int setfown(td, vp, uid, gid) struct thread *td; struct vnode *vp; uid_t uid; gid_t gid; { int error; struct mount *mp; struct vattr vattr; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); VATTR_NULL(&vattr); vattr.va_uid = uid; vattr.va_gid = gid; error = VOP_SETATTR(vp, &vattr, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return error; } /* * Set ownership given a path name. */ #ifndef _SYS_SYSPROTO_H_ struct chown_args { char *path; int uid; int gid; }; #endif /* ARGSUSED */ int chown(td, uap) struct thread *td; register struct chown_args /* { syscallarg(char *) path; syscallarg(int) uid; syscallarg(int) gid; } */ *uap; { int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfown(td, nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid)); vrele(nd.ni_vp); return (error); } /* * Set ownership given a path name, do not cross symlinks. 
*/ #ifndef _SYS_SYSPROTO_H_ struct lchown_args { char *path; int uid; int gid; }; #endif /* ARGSUSED */ int lchown(td, uap) struct thread *td; register struct lchown_args /* { syscallarg(char *) path; syscallarg(int) uid; syscallarg(int) gid; } */ *uap; { int error; struct nameidata nd; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfown(td, nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid)); vrele(nd.ni_vp); return (error); } /* * Set ownership given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchown_args { int fd; int uid; int gid; }; #endif /* ARGSUSED */ int fchown(td, uap) struct thread *td; register struct fchown_args /* { syscallarg(int) fd; syscallarg(int) uid; syscallarg(int) gid; } */ *uap; { struct file *fp; + struct vnode *vp; int error; if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); - return setfown(td, (struct vnode *)fp->f_data, + vp = (struct vnode *)fp->f_data; + error = setfown(td, (struct vnode *)fp->f_data, SCARG(uap, uid), SCARG(uap, gid)); + fdrop(fp, td); + return (error); } /* * Common implementation code for utimes(), lutimes(), and futimes(). */ static int getutimes(usrtvp, tsp) const struct timeval *usrtvp; struct timespec *tsp; { struct timeval tv[2]; int error; if (usrtvp == NULL) { microtime(&tv[0]); TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]); tsp[1] = tsp[0]; } else { if ((error = copyin(usrtvp, tv, sizeof (tv))) != 0) return (error); TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]); TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]); } return 0; } /* * Common implementation code for utimes(), lutimes(), and futimes(). 
*/ static int setutimes(td, vp, ts, nullflag) struct thread *td; struct vnode *vp; const struct timespec *ts; int nullflag; { int error; struct mount *mp; struct vattr vattr; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); VATTR_NULL(&vattr); vattr.va_atime = ts[0]; vattr.va_mtime = ts[1]; if (nullflag) vattr.va_vaflags |= VA_UTIMES_NULL; error = VOP_SETATTR(vp, &vattr, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return error; } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct utimes_args { char *path; struct timeval *tptr; }; #endif /* ARGSUSED */ int utimes(td, uap) struct thread *td; register struct utimes_args /* { syscallarg(char *) path; syscallarg(struct timeval *) tptr; } */ *uap; { struct timespec ts[2]; struct timeval *usrtvp; int error; struct nameidata nd; usrtvp = SCARG(uap, tptr); if ((error = getutimes(usrtvp, ts)) != 0) return (error); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setutimes(td, nd.ni_vp, ts, usrtvp == NULL); vrele(nd.ni_vp); return (error); } /* * Set the access and modification times of a file. 
*/ #ifndef _SYS_SYSPROTO_H_ struct lutimes_args { char *path; struct timeval *tptr; }; #endif /* ARGSUSED */ int lutimes(td, uap) struct thread *td; register struct lutimes_args /* { syscallarg(char *) path; syscallarg(struct timeval *) tptr; } */ *uap; { struct timespec ts[2]; struct timeval *usrtvp; int error; struct nameidata nd; usrtvp = SCARG(uap, tptr); if ((error = getutimes(usrtvp, ts)) != 0) return (error); NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setutimes(td, nd.ni_vp, ts, usrtvp == NULL); vrele(nd.ni_vp); return (error); } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct futimes_args { int fd; struct timeval *tptr; }; #endif /* ARGSUSED */ int futimes(td, uap) struct thread *td; register struct futimes_args /* { syscallarg(int ) fd; syscallarg(struct timeval *) tptr; } */ *uap; { struct timespec ts[2]; struct file *fp; struct timeval *usrtvp; int error; usrtvp = SCARG(uap, tptr); if ((error = getutimes(usrtvp, ts)) != 0) return (error); if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); - return setutimes(td, (struct vnode *)fp->f_data, ts, usrtvp == NULL); + error = setutimes(td, (struct vnode *)fp->f_data, ts, usrtvp == NULL); + fdrop(fp, td); + return (error); } /* * Truncate a file given its path name. 
*/ #ifndef _SYS_SYSPROTO_H_ struct truncate_args { char *path; int pad; off_t length; }; #endif /* ARGSUSED */ int truncate(td, uap) struct thread *td; register struct truncate_args /* { syscallarg(char *) path; syscallarg(int) pad; syscallarg(off_t) length; } */ *uap; { struct mount *mp; struct vnode *vp; struct vattr vattr; int error; struct nameidata nd; if (uap->length < 0) return(EINVAL); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { vrele(vp); return (error); } NDFREE(&nd, NDF_ONLY_PNBUF); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (vp->v_type == VDIR) error = EISDIR; else if ((error = vn_writechk(vp)) == 0 && (error = VOP_ACCESS(vp, VWRITE, td->td_proc->p_ucred, td)) == 0) { VATTR_NULL(&vattr); vattr.va_size = SCARG(uap, length); error = VOP_SETATTR(vp, &vattr, td->td_proc->p_ucred, td); } vput(vp); vn_finished_write(mp); return (error); } /* * Truncate a file given a file descriptor. 
*/ #ifndef _SYS_SYSPROTO_H_ struct ftruncate_args { int fd; int pad; off_t length; }; #endif /* ARGSUSED */ int ftruncate(td, uap) struct thread *td; register struct ftruncate_args /* { syscallarg(int) fd; syscallarg(int) pad; syscallarg(off_t) length; } */ *uap; { struct mount *mp; struct vattr vattr; struct vnode *vp; struct file *fp; int error; if (uap->length < 0) return(EINVAL); if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); - if ((fp->f_flag & FWRITE) == 0) + if ((fp->f_flag & FWRITE) == 0) { + fdrop(fp, td); return (EINVAL); + } vp = (struct vnode *)fp->f_data; - if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { + fdrop(fp, td); return (error); + } VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (vp->v_type == VDIR) error = EISDIR; else if ((error = vn_writechk(vp)) == 0) { VATTR_NULL(&vattr); vattr.va_size = SCARG(uap, length); error = VOP_SETATTR(vp, &vattr, fp->f_cred, td); } VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); + fdrop(fp, td); return (error); } #if defined(COMPAT_43) || defined(COMPAT_SUNOS) /* * Truncate a file given its path name. */ #ifndef _SYS_SYSPROTO_H_ struct otruncate_args { char *path; long length; }; #endif /* ARGSUSED */ int otruncate(td, uap) struct thread *td; register struct otruncate_args /* { syscallarg(char *) path; syscallarg(long) length; } */ *uap; { struct truncate_args /* { syscallarg(char *) path; syscallarg(int) pad; syscallarg(off_t) length; } */ nuap; SCARG(&nuap, path) = SCARG(uap, path); SCARG(&nuap, length) = SCARG(uap, length); return (truncate(td, &nuap)); } /* * Truncate a file given a file descriptor. 
*/ #ifndef _SYS_SYSPROTO_H_ struct oftruncate_args { int fd; long length; }; #endif /* ARGSUSED */ int oftruncate(td, uap) struct thread *td; register struct oftruncate_args /* { syscallarg(int) fd; syscallarg(long) length; } */ *uap; { struct ftruncate_args /* { syscallarg(int) fd; syscallarg(int) pad; syscallarg(off_t) length; } */ nuap; SCARG(&nuap, fd) = SCARG(uap, fd); SCARG(&nuap, length) = SCARG(uap, length); return (ftruncate(td, &nuap)); } #endif /* COMPAT_43 || COMPAT_SUNOS */ /* * Sync an open file. */ #ifndef _SYS_SYSPROTO_H_ struct fsync_args { int fd; }; #endif /* ARGSUSED */ int fsync(td, uap) struct thread *td; struct fsync_args /* { syscallarg(int) fd; } */ *uap; { struct vnode *vp; struct mount *mp; struct file *fp; vm_object_t obj; int error; GIANT_REQUIRED; if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); vp = (struct vnode *)fp->f_data; - if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { + fdrop(fp, td); return (error); + } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (VOP_GETVOBJECT(vp, &obj) == 0) { vm_object_page_clean(obj, 0, 0, 0); } error = VOP_FSYNC(vp, fp->f_cred, MNT_WAIT, td); #ifdef SOFTUPDATES if (error == 0 && vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP)) error = softdep_fsync(vp); #endif VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); + fdrop(fp, td); return (error); } /* * Rename files. Source and destination must either both be directories, * or both not be directories. If target is a directory, it must be empty. 
*/ #ifndef _SYS_SYSPROTO_H_ struct rename_args { char *from; char *to; }; #endif /* ARGSUSED */ int rename(td, uap) struct thread *td; register struct rename_args /* { syscallarg(char *) from; syscallarg(char *) to; } */ *uap; { struct mount *mp; struct vnode *tvp, *fvp, *tdvp; struct nameidata fromnd, tond; int error; bwillwrite(); NDINIT(&fromnd, DELETE, WANTPARENT | SAVESTART, UIO_USERSPACE, SCARG(uap, from), td); if ((error = namei(&fromnd)) != 0) return (error); fvp = fromnd.ni_vp; if ((error = vn_start_write(fvp, &mp, V_WAIT | PCATCH)) != 0) { NDFREE(&fromnd, NDF_ONLY_PNBUF); vrele(fromnd.ni_dvp); vrele(fvp); goto out1; } NDINIT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | NOOBJ, UIO_USERSPACE, SCARG(uap, to), td); if (fromnd.ni_vp->v_type == VDIR) tond.ni_cnd.cn_flags |= WILLBEDIR; if ((error = namei(&tond)) != 0) { /* Translate error code for rename("dir1", "dir2/."). */ if (error == EISDIR && fvp->v_type == VDIR) error = EINVAL; NDFREE(&fromnd, NDF_ONLY_PNBUF); vrele(fromnd.ni_dvp); vrele(fvp); goto out1; } tdvp = tond.ni_dvp; tvp = tond.ni_vp; if (tvp != NULL) { if (fvp->v_type == VDIR && tvp->v_type != VDIR) { error = ENOTDIR; goto out; } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { error = EISDIR; goto out; } } if (fvp == tdvp) error = EINVAL; /* * If source is the same as the destination (that is the * same inode number with the same name in the same directory), * then there is nothing to do. 
*/ if (fvp == tvp && fromnd.ni_dvp == tdvp && fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen && !bcmp(fromnd.ni_cnd.cn_nameptr, tond.ni_cnd.cn_nameptr, fromnd.ni_cnd.cn_namelen)) error = -1; out: if (!error) { VOP_LEASE(tdvp, td, td->td_proc->p_ucred, LEASE_WRITE); if (fromnd.ni_dvp != tdvp) { VOP_LEASE(fromnd.ni_dvp, td, td->td_proc->p_ucred, LEASE_WRITE); } if (tvp) { VOP_LEASE(tvp, td, td->td_proc->p_ucred, LEASE_WRITE); } error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, tond.ni_dvp, tond.ni_vp, &tond.ni_cnd); NDFREE(&fromnd, NDF_ONLY_PNBUF); NDFREE(&tond, NDF_ONLY_PNBUF); } else { NDFREE(&fromnd, NDF_ONLY_PNBUF); NDFREE(&tond, NDF_ONLY_PNBUF); if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); vrele(fromnd.ni_dvp); vrele(fvp); } vrele(tond.ni_startdir); vn_finished_write(mp); ASSERT_VOP_UNLOCKED(fromnd.ni_dvp, "rename"); ASSERT_VOP_UNLOCKED(fromnd.ni_vp, "rename"); ASSERT_VOP_UNLOCKED(tond.ni_dvp, "rename"); ASSERT_VOP_UNLOCKED(tond.ni_vp, "rename"); out1: if (fromnd.ni_startdir) vrele(fromnd.ni_startdir); if (error == -1) return (0); return (error); } /* * Make a directory file. 
*/ #ifndef _SYS_SYSPROTO_H_ struct mkdir_args { char *path; int mode; }; #endif /* ARGSUSED */ int mkdir(td, uap) struct thread *td; register struct mkdir_args /* { syscallarg(char *) path; syscallarg(int) mode; } */ *uap; { return vn_mkdir(uap->path, uap->mode, UIO_USERSPACE, td); } int vn_mkdir(path, mode, segflg, td) char *path; int mode; enum uio_seg segflg; struct thread *td; { struct mount *mp; struct vnode *vp; struct vattr vattr; int error; struct nameidata nd; restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT, segflg, path, td); nd.ni_cnd.cn_flags |= WILLBEDIR; if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if (vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); vrele(vp); vput(nd.ni_dvp); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); vattr.va_type = VDIR; + FILEDESC_LOCK(td->td_proc->p_fd); vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask; + FILEDESC_UNLOCK(td->td_proc->p_fd); VOP_LEASE(nd.ni_dvp, td, td->td_proc->p_ucred, LEASE_WRITE); error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (!error) vput(nd.ni_vp); vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mkdir"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "mkdir"); return (error); } /* * Remove a directory file. */ #ifndef _SYS_SYSPROTO_H_ struct rmdir_args { char *path; }; #endif /* ARGSUSED */ int rmdir(td, uap) struct thread *td; struct rmdir_args /* { syscallarg(char *) path; } */ *uap; { struct mount *mp; struct vnode *vp; int error; struct nameidata nd; restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if (vp->v_type != VDIR) { error = ENOTDIR; goto out; } /* * No rmdir "." please. 
*/ if (nd.ni_dvp == vp) { error = EINVAL; goto out; } /* * The root of a mounted filesystem cannot be deleted. */ if (vp->v_flag & VROOT) { error = EBUSY; goto out; } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vput(vp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VOP_LEASE(nd.ni_dvp, td, td->td_proc->p_ucred, LEASE_WRITE); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); vn_finished_write(mp); out: NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vput(vp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "rmdir"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "rmdir"); return (error); } #ifdef COMPAT_43 /* * Read a block of directory entries in a file system independent format. */ #ifndef _SYS_SYSPROTO_H_ struct ogetdirentries_args { int fd; char *buf; u_int count; long *basep; }; #endif int ogetdirentries(td, uap) struct thread *td; register struct ogetdirentries_args /* { syscallarg(int) fd; syscallarg(char *) buf; syscallarg(u_int) count; syscallarg(long *) basep; } */ *uap; { struct vnode *vp; struct file *fp; struct uio auio, kuio; struct iovec aiov, kiov; struct dirent *dp, *edp; caddr_t dirbuf; int error, eofflag, readcnt; long loff; /* XXX arbitrary sanity limit on `count'. 
*/ if (SCARG(uap, count) > 64 * 1024) return (EINVAL); if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); - if ((fp->f_flag & FREAD) == 0) + if ((fp->f_flag & FREAD) == 0) { + fdrop(fp, td); return (EBADF); + } vp = (struct vnode *)fp->f_data; unionread: - if (vp->v_type != VDIR) + if (vp->v_type != VDIR) { + fdrop(fp, td); return (EINVAL); + } aiov.iov_base = SCARG(uap, buf); aiov.iov_len = SCARG(uap, count); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auio.uio_resid = SCARG(uap, count); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); loff = auio.uio_offset = fp->f_offset; # if (BYTE_ORDER != LITTLE_ENDIAN) if (vp->v_mount->mnt_maxsymlinklen <= 0) { error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL); fp->f_offset = auio.uio_offset; } else # endif { kuio = auio; kuio.uio_iov = &kiov; kuio.uio_segflg = UIO_SYSSPACE; kiov.iov_len = SCARG(uap, count); MALLOC(dirbuf, caddr_t, SCARG(uap, count), M_TEMP, M_WAITOK); kiov.iov_base = dirbuf; error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag, NULL, NULL); fp->f_offset = kuio.uio_offset; if (error == 0) { readcnt = SCARG(uap, count) - kuio.uio_resid; edp = (struct dirent *)&dirbuf[readcnt]; for (dp = (struct dirent *)dirbuf; dp < edp; ) { # if (BYTE_ORDER == LITTLE_ENDIAN) /* * The expected low byte of * dp->d_namlen is our dp->d_type. * The high MBZ byte of dp->d_namlen * is our dp->d_namlen. */ dp->d_type = dp->d_namlen; dp->d_namlen = 0; # else /* * The dp->d_type is the high byte * of the expected dp->d_namlen, * so must be zero'ed. 
*/ dp->d_type = 0; # endif if (dp->d_reclen > 0) { dp = (struct dirent *) ((char *)dp + dp->d_reclen); } else { error = EIO; break; } } if (dp >= edp) error = uiomove(dirbuf, readcnt, &auio); } FREE(dirbuf, M_TEMP); } VOP_UNLOCK(vp, 0, td); - if (error) + if (error) { + fdrop(fp, td); return (error); + } if (SCARG(uap, count) == auio.uio_resid) { if (union_dircheckp) { error = union_dircheckp(td, &vp, fp); if (error == -1) goto unionread; - if (error) + if (error) { + fdrop(fp, td); return (error); + } } if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_UNION)) { struct vnode *tvp = vp; vp = vp->v_mount->mnt_vnodecovered; VREF(vp); fp->f_data = (caddr_t) vp; fp->f_offset = 0; vrele(tvp); goto unionread; } } error = copyout((caddr_t)&loff, (caddr_t)SCARG(uap, basep), sizeof(long)); + fdrop(fp, td); td->td_retval[0] = SCARG(uap, count) - auio.uio_resid; return (error); } #endif /* COMPAT_43 */ /* * Read a block of directory entries in a file system independent format. */ #ifndef _SYS_SYSPROTO_H_ struct getdirentries_args { int fd; char *buf; u_int count; long *basep; }; #endif int getdirentries(td, uap) struct thread *td; register struct getdirentries_args /* { syscallarg(int) fd; syscallarg(char *) buf; syscallarg(u_int) count; syscallarg(long *) basep; } */ *uap; { struct vnode *vp; struct file *fp; struct uio auio; struct iovec aiov; long loff; int error, eofflag; if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); - if ((fp->f_flag & FREAD) == 0) + if ((fp->f_flag & FREAD) == 0) { + fdrop(fp, td); return (EBADF); + } vp = (struct vnode *)fp->f_data; unionread: - if (vp->v_type != VDIR) + if (vp->v_type != VDIR) { + fdrop(fp, td); return (EINVAL); + } aiov.iov_base = SCARG(uap, buf); aiov.iov_len = SCARG(uap, count); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auio.uio_resid = SCARG(uap, count); /* vn_lock(vp, LK_SHARED | LK_RETRY, td); */ vn_lock(vp, 
LK_EXCLUSIVE | LK_RETRY, td); loff = auio.uio_offset = fp->f_offset; error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL); fp->f_offset = auio.uio_offset; VOP_UNLOCK(vp, 0, td); - if (error) + if (error) { + fdrop(fp, td); return (error); + } if (SCARG(uap, count) == auio.uio_resid) { if (union_dircheckp) { error = union_dircheckp(td, &vp, fp); if (error == -1) goto unionread; - if (error) + if (error) { + fdrop(fp, td); return (error); + } } if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_UNION)) { struct vnode *tvp = vp; vp = vp->v_mount->mnt_vnodecovered; VREF(vp); fp->f_data = (caddr_t) vp; fp->f_offset = 0; vrele(tvp); goto unionread; } } if (SCARG(uap, basep) != NULL) { error = copyout((caddr_t)&loff, (caddr_t)SCARG(uap, basep), sizeof(long)); } td->td_retval[0] = SCARG(uap, count) - auio.uio_resid; + fdrop(fp, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct getdents_args { int fd; char *buf; size_t count; }; #endif int getdents(td, uap) struct thread *td; register struct getdents_args /* { syscallarg(int) fd; syscallarg(char *) buf; syscallarg(u_int) count; } */ *uap; { struct getdirentries_args ap; ap.fd = uap->fd; ap.buf = uap->buf; ap.count = uap->count; ap.basep = NULL; return getdirentries(td, &ap); } /* * Set the mode mask for creation of filesystem nodes. * * MP SAFE */ #ifndef _SYS_SYSPROTO_H_ struct umask_args { int newmask; }; #endif int umask(td, uap) struct thread *td; struct umask_args /* { syscallarg(int) newmask; } */ *uap; { register struct filedesc *fdp; + FILEDESC_LOCK(td->td_proc->p_fd); fdp = td->td_proc->p_fd; td->td_retval[0] = fdp->fd_cmask; fdp->fd_cmask = SCARG(uap, newmask) & ALLPERMS; + FILEDESC_UNLOCK(td->td_proc->p_fd); return (0); } /* * Void all references to file by ripping underlying filesystem * away from vnode. 
*/ #ifndef _SYS_SYSPROTO_H_ struct revoke_args { char *path; }; #endif /* ARGSUSED */ int revoke(td, uap) struct thread *td; register struct revoke_args /* { syscallarg(char *) path; } */ *uap; { struct mount *mp; struct vnode *vp; struct vattr vattr; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (vp->v_type != VCHR) { error = EINVAL; goto out; } error = VOP_GETATTR(vp, &vattr, td->td_proc->p_ucred, td); if (error) goto out; if (td->td_proc->p_ucred->cr_uid != vattr.va_uid) { error = suser_xxx(0, td->td_proc, PRISON_ROOT); if (error) goto out; } if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto out; if (vcount(vp) > 1) VOP_REVOKE(vp, REVOKEALL); vn_finished_write(mp); out: vrele(vp); return (error); } /* * Convert a user file descriptor to a kernel file entry. + * The file entry is locked upon returning. */ int getvnode(fdp, fd, fpp) struct filedesc *fdp; int fd; struct file **fpp; { + int error; struct file *fp; - if ((u_int)fd >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[fd]) == NULL) - return (EBADF); - if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) - return (EINVAL); + fp = NULL; + if (fdp == NULL) + error = EBADF; + else { + FILEDESC_LOCK(fdp); + if ((u_int)fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fd]) == NULL) + error = EBADF; + else if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) { + fp = NULL; + error = EINVAL; + } else { + fhold(fp); + error = 0; + } + FILEDESC_UNLOCK(fdp); + } *fpp = fp; - return (0); + return (error); } /* * Get (NFS) file handle */ #ifndef _SYS_SYSPROTO_H_ struct getfh_args { char *fname; fhandle_t *fhp; }; #endif int getfh(td, uap) struct thread *td; register struct getfh_args *uap; { struct nameidata nd; fhandle_t fh; register struct vnode *vp; int error; /* * Must be super user */ error = suser_td(td); if (error) return (error); NDINIT(&nd, 
LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->fname, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; bzero(&fh, sizeof(fh)); fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid; error = VFS_VPTOFH(vp, &fh.fh_fid); vput(vp); if (error) return (error); error = copyout(&fh, uap->fhp, sizeof (fh)); return (error); } /* * syscall for the rpc.lockd to use to translate a NFS file handle into * an open descriptor. * * warning: do not remove the suser() call or this becomes one giant * security hole. */ #ifndef _SYS_SYSPROTO_H_ struct fhopen_args { const struct fhandle *u_fhp; int flags; }; #endif int fhopen(td, uap) struct thread *td; struct fhopen_args /* { syscallarg(const struct fhandle *) u_fhp; syscallarg(int) flags; } */ *uap; { struct proc *p = td->td_proc; struct mount *mp; struct vnode *vp; struct fhandle fhp; struct vattr vat; struct vattr *vap = &vat; struct flock lf; struct file *fp; register struct filedesc *fdp = p->p_fd; int fmode, mode, error, type; struct file *nfp; int indx; /* * Must be super user */ error = suser_td(td); if (error) return (error); fmode = FFLAGS(SCARG(uap, flags)); /* why not allow a non-read/write open for our lockd? */ if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT)) return (EINVAL); error = copyin(SCARG(uap,u_fhp), &fhp, sizeof(fhp)); if (error) return(error); /* find the mount point */ mp = vfs_getvfs(&fhp.fh_fsid); if (mp == NULL) return (ESTALE); /* now give me my vnode, it gets returned to me locked */ error = VFS_FHTOVP(mp, &fhp.fh_fid, &vp); if (error) return (error); /* * from now on we have to make sure not * to forget about the vnode * any error that causes an abort must vput(vp) * just set error = err and 'goto bad;'. 
*/ /* * from vn_open */ if (vp->v_type == VLNK) { error = EMLINK; goto bad; } if (vp->v_type == VSOCK) { error = EOPNOTSUPP; goto bad; } mode = 0; if (fmode & (FWRITE | O_TRUNC)) { if (vp->v_type == VDIR) { error = EISDIR; goto bad; } error = vn_writechk(vp); if (error) goto bad; mode |= VWRITE; } if (fmode & FREAD) mode |= VREAD; if (mode) { error = VOP_ACCESS(vp, mode, p->p_ucred, td); if (error) goto bad; } if (fmode & O_TRUNC) { VOP_UNLOCK(vp, 0, td); /* XXX */ if ((error = vn_start_write(NULL, &mp, V_WAIT | PCATCH)) != 0) { vrele(vp); return (error); } VOP_LEASE(vp, td, p->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); /* XXX */ VATTR_NULL(vap); vap->va_size = 0; error = VOP_SETATTR(vp, vap, p->p_ucred, td); vn_finished_write(mp); if (error) goto bad; } error = VOP_OPEN(vp, fmode, p->p_ucred, td); if (error) goto bad; /* * Make sure that a VM object is created for VMIO support. */ if (vn_canvmio(vp) == TRUE) { if ((error = vfs_object_create(vp, td, p->p_ucred)) != 0) goto bad; } if (fmode & FWRITE) vp->v_writecount++; /* * end of vn_open code */ if ((error = falloc(td, &nfp, &indx)) != 0) { if (fmode & FWRITE) vp->v_writecount--; goto bad; } fp = nfp; /* * Hold an extra reference to avoid having fp ripped out * from under us while we block in the lock op */ fhold(fp); nfp->f_data = (caddr_t)vp; nfp->f_flag = fmode & FMASK; nfp->f_ops = &vnops; nfp->f_type = DTYPE_VNODE; if (fmode & (O_EXLOCK | O_SHLOCK)) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (fmode & O_EXLOCK) lf.l_type = F_WRLCK; else lf.l_type = F_RDLCK; type = F_FLOCK; if ((fmode & FNONBLOCK) == 0) type |= F_WAIT; VOP_UNLOCK(vp, 0, td); if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) { /* * The lock request failed. Normally close the * descriptor but handle the case where someone might * have dup()d or close()d it when we weren't looking. 
*/ + FILEDESC_LOCK(fdp); if (fdp->fd_ofiles[indx] == fp) { fdp->fd_ofiles[indx] = NULL; + FILEDESC_UNLOCK(fdp); fdrop(fp, td); - } + } else + FILEDESC_UNLOCK(fdp); /* * release our private reference */ fdrop(fp, td); return(error); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); fp->f_flag |= FHASLOCK; } if ((vp->v_type == VREG) && (VOP_GETVOBJECT(vp, NULL) != 0)) vfs_object_create(vp, td, p->p_ucred); VOP_UNLOCK(vp, 0, td); fdrop(fp, td); td->td_retval[0] = indx; return (0); bad: vput(vp); return (error); } /* * Stat an (NFS) file handle. */ #ifndef _SYS_SYSPROTO_H_ struct fhstat_args { struct fhandle *u_fhp; struct stat *sb; }; #endif int fhstat(td, uap) struct thread *td; register struct fhstat_args /* { syscallarg(struct fhandle *) u_fhp; syscallarg(struct stat *) sb; } */ *uap; { struct stat sb; fhandle_t fh; struct mount *mp; struct vnode *vp; int error; /* * Must be super user */ error = suser_td(td); if (error) return (error); error = copyin(SCARG(uap, u_fhp), &fh, sizeof(fhandle_t)); if (error) return (error); if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) return (ESTALE); if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp))) return (error); error = vn_stat(vp, &sb, td); vput(vp); if (error) return (error); error = copyout(&sb, SCARG(uap, sb), sizeof(sb)); return (error); } /* * Implement fstatfs() for (NFS) file handles. 
*/ #ifndef _SYS_SYSPROTO_H_ struct fhstatfs_args { struct fhandle *u_fhp; struct statfs *buf; }; #endif int fhstatfs(td, uap) struct thread *td; struct fhstatfs_args /* { syscallarg(struct fhandle) *u_fhp; syscallarg(struct statfs) *buf; } */ *uap; { struct statfs *sp; struct mount *mp; struct vnode *vp; struct statfs sb; fhandle_t fh; int error; /* * Must be super user */ error = suser_td(td); if (error) return (error); if ((error = copyin(SCARG(uap, u_fhp), &fh, sizeof(fhandle_t))) != 0) return (error); if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) return (ESTALE); if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp))) return (error); mp = vp->v_mount; sp = &mp->mnt_stat; vput(vp); if ((error = VFS_STATFS(mp, sp, td)) != 0) return (error); sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; if (suser_xxx(td->td_proc->p_ucred, 0, 0)) { bcopy((caddr_t)sp, (caddr_t)&sb, sizeof(sb)); sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; sp = &sb; } return (copyout(sp, SCARG(uap, buf), sizeof(*sp))); } /* * Syscall to push extended attribute configuration information into the * VFS. Accepts a path, which it converts to a mountpoint, as well as * a command (int cmd), and attribute name and misc data. For now, the * attribute name is left in userspace for consumption by the VFS_op. * It will probably be changed to be copied into sysspace by the * syscall in the future, once issues with various consumers of the * attribute code have raised their hands. * * Currently this is used only by UFS Extended Attributes. */ int extattrctl(td, uap) struct thread *td; struct extattrctl_args *uap; { struct vnode *filename_vp; struct nameidata nd; struct mount *mp; char attrname[EXTATTR_MAXNAMELEN]; int error; /* * SCARG(uap, attrname) not always defined. We check again later * when we invoke the VFS call so as to pass in NULL there if needed. 
*/ if (SCARG(uap, attrname) != NULL) { error = copyinstr(SCARG(uap, attrname), attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); } /* * SCARG(uap, filename) not always defined. If it is, grab * a vnode lock, which VFS_EXTATTRCTL() will later release. */ filename_vp = NULL; if (SCARG(uap, filename) != NULL) { NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, SCARG(uap, filename), td); if ((error = namei(&nd)) != 0) return (error); filename_vp = nd.ni_vp; NDFREE(&nd, NDF_NO_VP_RELE | NDF_NO_VP_UNLOCK); } /* SCARG(uap, path) always defined. */ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH); NDFREE(&nd, 0); if (error) { if (filename_vp) vrele(filename_vp); return (error); } if (SCARG(uap, attrname) != NULL) { error = VFS_EXTATTRCTL(mp, SCARG(uap, cmd), filename_vp, SCARG(uap, attrnamespace), attrname, td); } else { error = VFS_EXTATTRCTL(mp, SCARG(uap, cmd), filename_vp, SCARG(uap, attrnamespace), NULL, td); } vn_finished_write(mp); /* * VFS_EXTATTRCTL will have unlocked, but not de-ref'd, * filename_vp, so vrele it if it is defined. 
*/ if (filename_vp != NULL) vrele(filename_vp); return (error); } /* * extattr_set_vp(): Set a named extended attribute on a file or directory * * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", * kernelspace string pointer "attrname", * userspace iovec array pointer "iovp", unsigned int iovcnt * proc "p" * Returns: 0 on success, an error number otherwise * Locks: none * References: vp must be a valid reference for the duration of the call */ static int extattr_set_vp(struct vnode *vp, int attrnamespace, const char *attrname, struct iovec *iovp, unsigned iovcnt, struct thread *td) { struct mount *mp; struct uio auio; struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV]; u_int iovlen, cnt; int error, i; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); iovlen = iovcnt * sizeof(struct iovec); if (iovcnt > UIO_SMALLIOV) { if (iovcnt > UIO_MAXIOV) { error = EINVAL; goto done; } MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); needfree = iov; } else iov = aiov; auio.uio_iov = iov; auio.uio_iovcnt = iovcnt; auio.uio_rw = UIO_WRITE; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auio.uio_offset = 0; if ((error = copyin((caddr_t)iovp, (caddr_t)iov, iovlen))) goto done; auio.uio_resid = 0; for (i = 0; i < iovcnt; i++) { if (iov->iov_len > INT_MAX - auio.uio_resid) { error = EINVAL; goto done; } auio.uio_resid += iov->iov_len; iov++; } cnt = auio.uio_resid; error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, td->td_proc->p_ucred, td); cnt -= auio.uio_resid; td->td_retval[0] = cnt; done: if (needfree) FREE(needfree, M_IOV); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return (error); } int extattr_set_file(td, uap) struct thread *td; struct extattr_set_file_args *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(SCARG(uap, attrname), attrname, EXTATTR_MAXNAMELEN, NULL); if 
(error) return (error); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_set_vp(nd.ni_vp, SCARG(uap, attrnamespace), attrname, SCARG(uap, iovp), SCARG(uap, iovcnt), td); vrele(nd.ni_vp); return (error); } int extattr_set_fd(td, uap) struct thread *td; struct extattr_set_fd_args *uap; { struct file *fp; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(SCARG(uap, attrname), attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); error = extattr_set_vp((struct vnode *)fp->f_data, SCARG(uap, attrnamespace), attrname, SCARG(uap, iovp), SCARG(uap, iovcnt), td); + fdrop(fp, td); return (error); } /* * extattr_get_vp(): Get a named extended attribute on a file or directory * * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", * kernelspace string pointer "attrname", * userspace iovec array pointer "iovp", unsigned int iovcnt, * proc "p" * Returns: 0 on success, an error number otherwise * Locks: none * References: vp must be a valid reference for the duration of the call */ static int extattr_get_vp(struct vnode *vp, int attrnamespace, const char *attrname, struct iovec *iovp, unsigned iovcnt, struct thread *td) { struct uio auio; struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV]; u_int iovlen, cnt; int error, i; VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_READ); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); iovlen = iovcnt * sizeof (struct iovec); if (iovcnt > UIO_SMALLIOV) { if (iovcnt > UIO_MAXIOV) { error = EINVAL; goto done; } MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); needfree = iov; } else iov = aiov; auio.uio_iov = iov; auio.uio_iovcnt = iovcnt; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auio.uio_offset = 0; if ((error = copyin((caddr_t)iovp, (caddr_t)iov, iovlen))) goto done; 
auio.uio_resid = 0; for (i = 0; i < iovcnt; i++) { if (iov->iov_len > INT_MAX - auio.uio_resid) { error = EINVAL; goto done; } auio.uio_resid += iov->iov_len; iov++; } cnt = auio.uio_resid; error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, td->td_proc->p_ucred, td); cnt -= auio.uio_resid; td->td_retval[0] = cnt; done: if (needfree) FREE(needfree, M_IOV); VOP_UNLOCK(vp, 0, td); return (error); } int extattr_get_file(td, uap) struct thread *td; struct extattr_get_file_args *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(SCARG(uap, attrname), attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_get_vp(nd.ni_vp, SCARG(uap, attrnamespace), attrname, SCARG(uap, iovp), SCARG(uap, iovcnt), td); vrele(nd.ni_vp); return (error); } int extattr_get_fd(td, uap) struct thread *td; struct extattr_get_fd_args *uap; { struct file *fp; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(SCARG(uap, attrname), attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); error = extattr_get_vp((struct vnode *)fp->f_data, SCARG(uap, attrnamespace), attrname, SCARG(uap, iovp), SCARG(uap, iovcnt), td); + fdrop(fp, td); return (error); } /* * extattr_delete_vp(): Delete a named extended attribute on a file or * directory * * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", * kernelspace string pointer "attrname", proc "p" * Returns: 0 on success, an error number otherwise * Locks: none * References: vp must be a valid reference for the duration of the call */ static int extattr_delete_vp(struct vnode *vp, int attrnamespace, const char *attrname, struct thread *td) { struct mount *mp; int error; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 
0) return (error); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return (error); } int extattr_delete_file(td, uap) struct thread *td; struct extattr_delete_file_args *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(SCARG(uap, attrname), attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return(error); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return(error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_delete_vp(nd.ni_vp, SCARG(uap, attrnamespace), attrname, td); vrele(nd.ni_vp); return(error); } int extattr_delete_fd(td, uap) struct thread *td; struct extattr_delete_fd_args *uap; { struct file *fp; + struct vnode *vp; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(SCARG(uap, attrname), attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); + vp = (struct vnode *)fp->f_data; error = extattr_delete_vp((struct vnode *)fp->f_data, SCARG(uap, attrnamespace), attrname, td); + fdrop(fp, td); return (error); } Index: head/sys/kern/vfs_vnops.c =================================================================== --- head/sys/kern/vfs_vnops.c (revision 89305) +++ head/sys/kern/vfs_vnops.c (revision 89306) @@ -1,979 +1,979 @@ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94 * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int vn_closefile __P((struct file *fp, struct thread *td)); static int vn_ioctl __P((struct file *fp, u_long com, caddr_t data, struct thread *td)); static int vn_read __P((struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td)); static int vn_poll __P((struct file *fp, int events, struct ucred *cred, struct thread *td)); static int vn_kqfilter __P((struct file *fp, struct knote *kn)); static int vn_statfile __P((struct file *fp, struct stat *sb, struct thread *td)); static int vn_write __P((struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td)); struct fileops vnops = { vn_read, vn_write, vn_ioctl, vn_poll, vn_kqfilter, vn_statfile, vn_closefile }; int vn_open(ndp, flagp, cmode) register struct nameidata *ndp; int *flagp, cmode; { struct thread *td = ndp->ni_cnd.cn_thread; return (vn_open_cred(ndp, flagp, cmode, td->td_proc->p_ucred)); } /* * Common code for vnode open operations. * Check permissions, and call the VOP_OPEN or VOP_CREATE routine. * * Note that this does NOT free nameidata for the successful case, * due to the NDINIT being done elsewhere. 
*/ int vn_open_cred(ndp, flagp, cmode, cred) register struct nameidata *ndp; int *flagp, cmode; struct ucred *cred; { struct vnode *vp; struct mount *mp; struct thread *td = ndp->ni_cnd.cn_thread; struct vattr vat; struct vattr *vap = &vat; int mode, fmode, error; restart: fmode = *flagp; if (fmode & O_CREAT) { ndp->ni_cnd.cn_nameiop = CREATE; ndp->ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF; if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0) ndp->ni_cnd.cn_flags |= FOLLOW; bwillwrite(); if ((error = namei(ndp)) != 0) return (error); if (ndp->ni_vp == NULL) { VATTR_NULL(vap); vap->va_type = VREG; vap->va_mode = cmode; if (fmode & O_EXCL) vap->va_vaflags |= VA_EXCLUSIVE; if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(ndp, NDF_ONLY_PNBUF); vput(ndp->ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VOP_LEASE(ndp->ni_dvp, td, cred, LEASE_WRITE); error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd, vap); vput(ndp->ni_dvp); vn_finished_write(mp); if (error) { NDFREE(ndp, NDF_ONLY_PNBUF); return (error); } ASSERT_VOP_UNLOCKED(ndp->ni_dvp, "create"); ASSERT_VOP_LOCKED(ndp->ni_vp, "create"); fmode &= ~O_TRUNC; vp = ndp->ni_vp; } else { if (ndp->ni_dvp == ndp->ni_vp) vrele(ndp->ni_dvp); else vput(ndp->ni_dvp); ndp->ni_dvp = NULL; vp = ndp->ni_vp; if (fmode & O_EXCL) { error = EEXIST; goto bad; } fmode &= ~O_CREAT; } } else { ndp->ni_cnd.cn_nameiop = LOOKUP; ndp->ni_cnd.cn_flags = ((fmode & O_NOFOLLOW) ? 
NOFOLLOW : FOLLOW) | LOCKLEAF; if ((error = namei(ndp)) != 0) return (error); vp = ndp->ni_vp; } if (vp->v_type == VLNK) { error = EMLINK; goto bad; } if (vp->v_type == VSOCK) { error = EOPNOTSUPP; goto bad; } if ((fmode & O_CREAT) == 0) { mode = 0; if (fmode & (FWRITE | O_TRUNC)) { if (vp->v_type == VDIR) { error = EISDIR; goto bad; } error = vn_writechk(vp); if (error) goto bad; mode |= VWRITE; } if (fmode & FREAD) mode |= VREAD; if (mode) { error = VOP_ACCESS(vp, mode, cred, td); if (error) goto bad; } } if ((error = VOP_OPEN(vp, fmode, cred, td)) != 0) goto bad; /* * Make sure that a VM object is created for VMIO support. */ if (vn_canvmio(vp) == TRUE) { if ((error = vfs_object_create(vp, td, cred)) != 0) /* XXX: Should VOP_CLOSE() again here. */ goto bad; } if (fmode & FWRITE) vp->v_writecount++; *flagp = fmode; return (0); bad: NDFREE(ndp, NDF_ONLY_PNBUF); vput(vp); *flagp = fmode; return (error); } /* * Check for write permissions on the specified vnode. * Prototype text segments cannot be written. */ int vn_writechk(vp) register struct vnode *vp; { /* * If there's shared text associated with * the vnode, try to free it up once. If * we fail, we can't allow writing. */ if (vp->v_flag & VTEXT) return (ETXTBSY); return (0); } /* * Vnode close call */ int vn_close(vp, flags, cred, td) register struct vnode *vp; int flags; struct ucred *cred; struct thread *td; { int error; if (flags & FWRITE) vp->v_writecount--; error = VOP_CLOSE(vp, flags, cred, td); /* * XXX - In certain instances VOP_CLOSE has to do the vrele * itself. If the vrele has been done, it will return EAGAIN * to indicate that the vrele should not be done again. When * this happens, we just return success. The correct thing to * do would be to have all VOP_CLOSE instances do the vrele. 
*/ if (error == EAGAIN) return (0); vrele(vp); return (error); } static __inline int sequential_heuristic(struct uio *uio, struct file *fp) { + /* * Sequential heuristic - detect sequential operation */ if ((uio->uio_offset == 0 && fp->f_seqcount > 0) || uio->uio_offset == fp->f_nextoff) { /* * XXX we assume that the filesystem block size is * the default. Not true, but still gives us a pretty * good indicator of how sequential the read operations * are. */ fp->f_seqcount += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE; if (fp->f_seqcount >= 127) fp->f_seqcount = 127; return(fp->f_seqcount << 16); } /* * Not sequential, quick draw-down of seqcount */ if (fp->f_seqcount > 1) fp->f_seqcount = 1; else fp->f_seqcount = 0; return(0); } /* * Package up an I/O request on a vnode into a uio and do it. */ int vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, td) enum uio_rw rw; struct vnode *vp; caddr_t base; int len; off_t offset; enum uio_seg segflg; int ioflg; struct ucred *cred; int *aresid; struct thread *td; { struct uio auio; struct iovec aiov; struct mount *mp; int error; if ((ioflg & IO_NODELOCKED) == 0) { mp = NULL; if (rw == UIO_WRITE && vp->v_type != VCHR && (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); } auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = base; aiov.iov_len = len; auio.uio_resid = len; auio.uio_offset = offset; auio.uio_segflg = segflg; auio.uio_rw = rw; auio.uio_td = td; if (rw == UIO_READ) { error = VOP_READ(vp, &auio, ioflg, cred); } else { error = VOP_WRITE(vp, &auio, ioflg, cred); } if (aresid) *aresid = auio.uio_resid; else if (auio.uio_resid && error == 0) error = EIO; if ((ioflg & IO_NODELOCKED) == 0) { vn_finished_write(mp); VOP_UNLOCK(vp, 0, td); } return (error); } /* * Package up an I/O request on a vnode into a uio and do it. 
The I/O * request is split up into smaller chunks and we try to avoid saturating * the buffer cache while potentially holding a vnode locked, so we * check bwillwrite() before calling vn_rdwr(). We also call uio_yield() * to give other processes a chance to lock the vnode (either other processes * core'ing the same binary, or unrelated processes scanning the directory). */ int vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, td) enum uio_rw rw; struct vnode *vp; caddr_t base; int len; off_t offset; enum uio_seg segflg; int ioflg; struct ucred *cred; int *aresid; struct thread *td; { int error = 0; do { int chunk = (len > MAXBSIZE) ? MAXBSIZE : len; if (rw != UIO_READ && vp->v_type == VREG) bwillwrite(); error = vn_rdwr(rw, vp, base, chunk, offset, segflg, ioflg, cred, aresid, td); len -= chunk; /* aresid calc already includes length */ if (error) break; offset += chunk; base += chunk; uio_yield(); } while (len); if (aresid) *aresid += len; return (error); } /* * File table vnode read routine. */ static int vn_read(fp, uio, cred, flags, td) struct file *fp; struct uio *uio; struct ucred *cred; struct thread *td; int flags; { struct vnode *vp; int error, ioflag; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); vp = (struct vnode *)fp->f_data; ioflag = 0; if (fp->f_flag & FNONBLOCK) ioflag |= IO_NDELAY; if (fp->f_flag & O_DIRECT) ioflag |= IO_DIRECT; VOP_LEASE(vp, td, cred, LEASE_READ); vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, td); if ((flags & FOF_OFFSET) == 0) uio->uio_offset = fp->f_offset; ioflag |= sequential_heuristic(uio, fp); error = VOP_READ(vp, uio, ioflag, cred); if ((flags & FOF_OFFSET) == 0) fp->f_offset = uio->uio_offset; fp->f_nextoff = uio->uio_offset; VOP_UNLOCK(vp, 0, td); return (error); } /* * File table vnode write routine. 
*/
static int
vn_write(fp, uio, cred, flags, td)
	struct file *fp;
	struct uio *uio;	/* write request; uio_resid consumed on success */
	struct ucred *cred;	/* credential used for lease + VOP_WRITE */
	struct thread *td;
	int flags;		/* FOF_OFFSET => caller supplies uio_offset */
{
	struct vnode *vp;
	struct mount *mp;
	int error, ioflag;

	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
	    uio->uio_td, td));
	vp = (struct vnode *)fp->f_data;
	/*
	 * For regular files, throttle before locking anything so a heavy
	 * writer drains dirty buffers first (see bwillwrite() callers
	 * elsewhere in this file).
	 */
	if (vp->v_type == VREG)
		bwillwrite();
	/*
	 * NOTE(review): the leading '-' below is the svn diff's deletion
	 * marker — this duplicate assignment was removed in r89306.
	 */
-	vp = (struct vnode *)fp->f_data;	/* XXX needed? */
	/* Translate fp->f_flag into VOP_WRITE ioflag bits. */
	ioflag = IO_UNIT;
	if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
		ioflag |= IO_APPEND;
	if (fp->f_flag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fp->f_flag & O_DIRECT)
		ioflag |= IO_DIRECT;
	if ((fp->f_flag & O_FSYNC) ||
	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
		ioflag |= IO_SYNC;
	/*
	 * Suspension gate must be entered BEFORE taking the vnode lock;
	 * VCHR is exempt (no filesystem write accounting for devices).
	 */
	mp = NULL;
	if (vp->v_type != VCHR &&
	    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
		return (error);
	VOP_LEASE(vp, td, cred, LEASE_WRITE);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	/* Without FOF_OFFSET the implicit file offset is used and updated. */
	if ((flags & FOF_OFFSET) == 0)
		uio->uio_offset = fp->f_offset;
	ioflag |= sequential_heuristic(uio, fp);
	error = VOP_WRITE(vp, uio, ioflag, cred);
	if ((flags & FOF_OFFSET) == 0)
		fp->f_offset = uio->uio_offset;
	/* Remember the end offset for the sequential-access heuristic. */
	fp->f_nextoff = uio->uio_offset;
	/* Unlock before vn_finished_write(): mirror of the entry ordering. */
	VOP_UNLOCK(vp, 0, td);
	vn_finished_write(mp);
	return (error);
}

/*
 * File table vnode stat routine.
*/ static int vn_statfile(fp, sb, td) struct file *fp; struct stat *sb; struct thread *td; { struct vnode *vp = (struct vnode *)fp->f_data; return vn_stat(vp, sb, td); } int vn_stat(vp, sb, td) struct vnode *vp; register struct stat *sb; struct thread *td; { struct vattr vattr; register struct vattr *vap; int error; u_short mode; vap = &vattr; error = VOP_GETATTR(vp, vap, td->td_proc->p_ucred, td); if (error) return (error); /* * Zero the spare stat fields */ sb->st_lspare = 0; sb->st_qspare[0] = 0; sb->st_qspare[1] = 0; /* * Copy from vattr table */ if (vap->va_fsid != VNOVAL) sb->st_dev = vap->va_fsid; else sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0]; sb->st_ino = vap->va_fileid; mode = vap->va_mode; switch (vap->va_type) { case VREG: mode |= S_IFREG; break; case VDIR: mode |= S_IFDIR; break; case VBLK: mode |= S_IFBLK; break; case VCHR: mode |= S_IFCHR; break; case VLNK: mode |= S_IFLNK; /* This is a cosmetic change, symlinks do not have a mode. */ if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) sb->st_mode &= ~ACCESSPERMS; /* 0000 */ else sb->st_mode |= ACCESSPERMS; /* 0777 */ break; case VSOCK: mode |= S_IFSOCK; break; case VFIFO: mode |= S_IFIFO; break; default: return (EBADF); }; sb->st_mode = mode; sb->st_nlink = vap->va_nlink; sb->st_uid = vap->va_uid; sb->st_gid = vap->va_gid; sb->st_rdev = vap->va_rdev; if (vap->va_size > OFF_MAX) return (EOVERFLOW); sb->st_size = vap->va_size; sb->st_atimespec = vap->va_atime; sb->st_mtimespec = vap->va_mtime; sb->st_ctimespec = vap->va_ctime; /* * According to www.opengroup.org, the meaning of st_blksize is * "a filesystem-specific preferred I/O block size for this * object. In some filesystem types, this may vary from file * to file" * Default to zero to catch bogus uses of this field. 
*/ if (vap->va_type == VREG) { sb->st_blksize = vap->va_blocksize; } else if (vn_isdisk(vp, NULL)) { sb->st_blksize = vp->v_rdev->si_bsize_best; if (sb->st_blksize < vp->v_rdev->si_bsize_phys) sb->st_blksize = vp->v_rdev->si_bsize_phys; if (sb->st_blksize < BLKDEV_IOSIZE) sb->st_blksize = BLKDEV_IOSIZE; } else { sb->st_blksize = 0; } sb->st_flags = vap->va_flags; if (suser_xxx(td->td_proc->p_ucred, 0, 0)) sb->st_gen = 0; else sb->st_gen = vap->va_gen; #if (S_BLKSIZE == 512) /* Optimize this case */ sb->st_blocks = vap->va_bytes >> 9; #else sb->st_blocks = vap->va_bytes / S_BLKSIZE; #endif return (0); } /* * File table vnode ioctl routine. */ static int vn_ioctl(fp, com, data, td) struct file *fp; u_long com; caddr_t data; struct thread *td; { register struct vnode *vp = ((struct vnode *)fp->f_data); struct vattr vattr; int error; switch (vp->v_type) { case VREG: case VDIR: if (com == FIONREAD) { error = VOP_GETATTR(vp, &vattr, td->td_proc->p_ucred, td); if (error) return (error); *(int *)data = vattr.va_size - fp->f_offset; return (0); } if (com == FIONBIO || com == FIOASYNC) /* XXX */ return (0); /* XXX */ /* fall into ... */ default: #if 0 return (ENOTTY); #endif case VFIFO: case VCHR: case VBLK: if (com == FIODTYPE) { if (vp->v_type != VCHR && vp->v_type != VBLK) return (ENOTTY); *(int *)data = devsw(vp->v_rdev)->d_flags & D_TYPEMASK; return (0); } error = VOP_IOCTL(vp, com, data, fp->f_flag, td->td_proc->p_ucred, td); if (error == 0 && com == TIOCSCTTY) { /* Do nothing if reassigning same control tty */ if (td->td_proc->p_session->s_ttyvp == vp) return (0); /* Get rid of reference to old control tty */ if (td->td_proc->p_session->s_ttyvp) vrele(td->td_proc->p_session->s_ttyvp); td->td_proc->p_session->s_ttyvp = vp; VREF(vp); } return (error); } } /* * File table vnode poll routine. 
*/
static int
vn_poll(fp, events, cred, td)
	struct file *fp;
	int events;
	struct ucred *cred;
	struct thread *td;
{
	/* Thin f_ops wrapper: forward the poll request to the vnode. */
	return (VOP_POLL(((struct vnode *)fp->f_data), events, cred, td));
}

/*
 * Check that the vnode is still valid, and if so
 * acquire requested lock.
 */
int
#ifndef DEBUG_LOCKS
vn_lock(vp, flags, td)
#else
debug_vn_lock(vp, flags, td, filename, line)
#endif
	struct vnode *vp;
	int flags;		/* LK_* flags; LK_INTERLOCK if v_interlock held */
	struct thread *td;
#ifdef DEBUG_LOCKS
	const char *filename;
	int line;
#endif
{
	int error;

	do {
		/* Take the interlock unless the caller already holds it. */
		if ((flags & LK_INTERLOCK) == 0)
			mtx_lock(&vp->v_interlock);
		if ((vp->v_flag & VXLOCK) && vp->v_vxproc != curthread) {
			/*
			 * Vnode is being reclaimed by another thread: sleep
			 * until the reclaim is over (msleep with PDROP
			 * releases the interlock) and report ENOENT.
			 */
			vp->v_flag |= VXWANT;
			msleep(vp, &vp->v_interlock, PINOD | PDROP,
			    "vn_lock", 0);
			error = ENOENT;
		} else {
			if (vp->v_vxproc != NULL)
				log(LOG_INFO, "VXLOCK interlock avoided in vn_lock\n");
#ifdef DEBUG_LOCKS
			vp->filename = filename;
			vp->line = line;
#endif
			/* VOP_LOCK consumes the interlock (LK_INTERLOCK). */
			error = VOP_LOCK(vp,
			    flags | LK_NOPAUSE | LK_INTERLOCK, td);
			if (error == 0)
				return (error);
		}
		/* Interlock was dropped above; re-acquire on next pass. */
		flags &= ~LK_INTERLOCK;
	} while (flags & LK_RETRY);
	return (error);
}

/*
 * File table vnode close routine.
 */
static int
vn_closefile(fp, td)
	struct file *fp;
	struct thread *td;
{

	/* Neutralize further f_ops calls, then close the vnode. */
	fp->f_ops = &badfileops;
	return (vn_close(((struct vnode *)fp->f_data), fp->f_flag,
		fp->f_cred, td));
}

/*
 * Preparing to start a filesystem write operation. If the operation is
 * permitted, then we bump the count of operations in progress and
 * proceed. If a suspend request is in progress, we wait until the
 * suspension is over, and then proceed.
 */
int
vn_start_write(vp, mpp, flags)
	struct vnode *vp;	/* may be NULL if *mpp is supplied directly */
	struct mount **mpp;	/* out: mount point charged with the write */
	int flags;		/* V_WAIT/V_NOWAIT, V_XSLEEP, PCATCH */
{
	struct mount *mp;
	int error;

	/*
	 * If a vnode is provided, get and return the mount point that
	 * to which it will write.
	 */
	if (vp != NULL) {
		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
			*mpp = NULL;
			/* EOPNOTSUPP => fs does no write accounting: allow. */
			if (error != EOPNOTSUPP)
				return (error);
			return (0);
		}
	}
	if ((mp = *mpp) == NULL)
		return (0);
	/*
	 * Check on status of suspension.
	 */
	while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
		if (flags & V_NOWAIT)
			return (EWOULDBLOCK);
		error = tsleep(&mp->mnt_flag, (PUSER - 1) | (flags & PCATCH),
		    "suspfs", 0);
		if (error)
			return (error);
	}
	/* V_XSLEEP: caller only wanted to wait out a suspension. */
	if (flags & V_XSLEEP)
		return (0);
	mp->mnt_writeopcount++;
	return (0);
}

/*
 * Secondary suspension. Used by operations such as vop_inactive
 * routines that are needed by the higher level functions. These
 * are allowed to proceed until all the higher level functions have
 * completed (indicated by mnt_writeopcount dropping to zero). At that
 * time, these operations are halted until the suspension is over.
 */
int
vn_write_suspend_wait(vp, mp, flags)
	struct vnode *vp;	/* may be NULL if mp is supplied directly */
	struct mount *mp;
	int flags;		/* V_NOWAIT, PCATCH */
{
	int error;

	if (vp != NULL) {
		if ((error = VOP_GETWRITEMOUNT(vp, &mp)) != 0) {
			if (error != EOPNOTSUPP)
				return (error);
			return (0);
		}
	}
	/*
	 * If we are not suspended or have not yet reached suspended
	 * mode, then let the operation proceed.
	 */
	if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPENDED) == 0)
		return (0);
	if (flags & V_NOWAIT)
		return (EWOULDBLOCK);
	/*
	 * Wait for the suspension to finish.
	 */
	return (tsleep(&mp->mnt_flag, (PUSER - 1) | (flags & PCATCH),
	    "suspfs", 0));
}

/*
 * Filesystem write operation has completed. If we are suspending and this
 * operation is the last one, notify the suspender that the suspension is
 * now in effect.
 */
void
vn_finished_write(mp)
	struct mount *mp;	/* NULL is a no-op (see vn_start_write) */
{

	if (mp == NULL)
		return;
	mp->mnt_writeopcount--;
	if (mp->mnt_writeopcount < 0)
		panic("vn_finished_write: neg cnt");
	/* Last writer out wakes the suspender in vfs_write_suspend(). */
	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
	    mp->mnt_writeopcount <= 0)
		wakeup(&mp->mnt_writeopcount);
}

/*
 * Request a filesystem to suspend write operations.
*/ void vfs_write_suspend(mp) struct mount *mp; { struct thread *td = curthread; if (mp->mnt_kern_flag & MNTK_SUSPEND) return; mp->mnt_kern_flag |= MNTK_SUSPEND; if (mp->mnt_writeopcount > 0) (void) tsleep(&mp->mnt_writeopcount, PUSER - 1, "suspwt", 0); VFS_SYNC(mp, MNT_WAIT, td->td_proc->p_ucred, td); mp->mnt_kern_flag |= MNTK_SUSPENDED; } /* * Request a filesystem to resume write operations. */ void vfs_write_resume(mp) struct mount *mp; { if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) return; mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPENDED); wakeup(&mp->mnt_writeopcount); wakeup(&mp->mnt_flag); } static int vn_kqfilter(struct file *fp, struct knote *kn) { return (VOP_KQFILTER(((struct vnode *)fp->f_data), kn)); } /* * Simplified in-kernel wrapper calls for extended attribute access. * Both calls pass in a NULL credential, authorizing as "kernel" access. * Set IO_NODELOCKED in ioflg if the vnode is already locked. */ int vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int *buflen, char *buf, struct thread *td) { struct uio auio; struct iovec iov; int error; iov.iov_len = *buflen; iov.iov_base = buf; auio.uio_iov = &iov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_offset = 0; auio.uio_resid = *buflen; if ((ioflg & IO_NODELOCKED) == 0) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); /* authorize attribute retrieval as kernel */ error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td); if ((ioflg & IO_NODELOCKED) == 0) VOP_UNLOCK(vp, 0, td); if (error == 0) { *buflen = *buflen - auio.uio_resid; } return (error); } /* * XXX failure mode if partially written? 
*/ int vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int buflen, char *buf, struct thread *td) { struct uio auio; struct iovec iov; struct mount *mp; int error; iov.iov_len = buflen; iov.iov_base = buf; auio.uio_iov = &iov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_WRITE; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_offset = 0; auio.uio_resid = buflen; if ((ioflg & IO_NODELOCKED) == 0) { if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); } /* authorize attribute setting as kernel */ error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td); if ((ioflg & IO_NODELOCKED) == 0) { vn_finished_write(mp); VOP_UNLOCK(vp, 0, td); } return (error); } int vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, struct thread *td) { struct mount *mp; int error; if ((ioflg & IO_NODELOCKED) == 0) { if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); } /* authorize attribute removal as kernel */ error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, NULL, td); if ((ioflg & IO_NODELOCKED) == 0) { vn_finished_write(mp); VOP_UNLOCK(vp, 0, td); } return (error); } Index: head/sys/netgraph/ng_socket.c =================================================================== --- head/sys/netgraph/ng_socket.c (revision 89305) +++ head/sys/netgraph/ng_socket.c (revision 89306) @@ -1,1092 +1,1093 @@ /* * ng_socket.c * * Copyright (c) 1996-1999 Whistle Communications, Inc. * All rights reserved. * * Subject to the following obligations and disclaimer of warranty, use and * redistribution of this software, in source or object code forms, with or * without modifications are expressly permitted by Whistle Communications; * provided, however, that: * 1. 
Any and all reproductions of the source or object code must include the * copyright notice above and the following disclaimer of warranties; and * 2. No rights are granted, in any manner or form, to use Whistle * Communications, Inc. trademarks, including the mark "WHISTLE * COMMUNICATIONS" on advertising, endorsements, or otherwise except as * such appears in the above copyright notice or in the software. * * THIS SOFTWARE IS BEING PROVIDED BY WHISTLE COMMUNICATIONS "AS IS", AND * TO THE MAXIMUM EXTENT PERMITTED BY LAW, WHISTLE COMMUNICATIONS MAKES NO * REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, REGARDING THIS SOFTWARE, * INCLUDING WITHOUT LIMITATION, ANY AND ALL IMPLIED WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. * WHISTLE COMMUNICATIONS DOES NOT WARRANT, GUARANTEE, OR MAKE ANY * REPRESENTATIONS REGARDING THE USE OF, OR THE RESULTS OF THE USE OF THIS * SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, RELIABILITY OR OTHERWISE. * IN NO EVENT SHALL WHISTLE COMMUNICATIONS BE LIABLE FOR ANY DAMAGES * RESULTING FROM OR ARISING OUT OF ANY USE OF THIS SOFTWARE, INCLUDING * WITHOUT LIMITATION, ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, * PUNITIVE, OR CONSEQUENTIAL DAMAGES, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES, LOSS OF USE, DATA OR PROFITS, HOWEVER CAUSED AND UNDER ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF WHISTLE COMMUNICATIONS IS ADVISED OF THE POSSIBILITY * OF SUCH DAMAGE. * * Author: Julian Elischer * * $FreeBSD$ * $Whistle: ng_socket.c,v 1.28 1999/11/01 09:24:52 julian Exp $ */ /* * Netgraph socket nodes * * There are two types of netgraph sockets, control and data. * Control sockets have a netgraph node, but data sockets are * parasitic on control sockets, and have no node of their own. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef NOTYET #include #endif #include #include #include #include #ifdef NG_SEPARATE_MALLOC MALLOC_DEFINE(M_NETGRAPH_PATH, "netgraph_path", "netgraph path info "); MALLOC_DEFINE(M_NETGRAPH_SOCK, "netgraph_sock", "netgraph socket info "); #else #define M_NETGRAPH_PATH M_NETGRAPH #define M_NETGRAPH_SOCK M_NETGRAPH #endif /* * It's Ascii-art time! * +-------------+ +-------------+ * |socket (ctl)| |socket (data)| * +-------------+ +-------------+ * ^ ^ * | | * v v * +-----------+ +-----------+ * |pcb (ctl)| |pcb (data)| * +-----------+ +-----------+ * ^ ^ * | | * v v * +--------------------------+ * | Socket type private | * | data | * +--------------------------+ * ^ * | * v * +----------------+ * | struct ng_node | * +----------------+ */ /* Netgraph node methods */ static ng_constructor_t ngs_constructor; static ng_rcvmsg_t ngs_rcvmsg; static ng_shutdown_t ngs_shutdown; static ng_newhook_t ngs_newhook; static ng_connect_t ngs_connect; static ng_rcvdata_t ngs_rcvdata; static ng_disconnect_t ngs_disconnect; /* Internal methods */ static int ng_attach_data(struct socket *so); static int ng_attach_cntl(struct socket *so); static int ng_attach_common(struct socket *so, int type); static void ng_detach_common(struct ngpcb *pcbp, int type); /*static int ng_internalize(struct mbuf *m, struct thread *p); */ static int ng_connect_data(struct sockaddr *nam, struct ngpcb *pcbp); static int ng_bind(struct sockaddr *nam, struct ngpcb *pcbp); static int ngs_mod_event(module_t mod, int event, void *data); static int ship_msg(struct ngpcb *pcbp, struct ng_mesg *msg, struct sockaddr_ng *addr); /* Netgraph type descriptor */ static struct ng_type typestruct = { NG_ABI_VERSION, NG_SOCKET_NODE_TYPE, ngs_mod_event, ngs_constructor, ngs_rcvmsg, ngs_shutdown, ngs_newhook, NULL, ngs_connect, ngs_rcvdata, ngs_disconnect, NULL }; NETGRAPH_INIT(socket, &typestruct); /* 
Buffer space */ static u_long ngpdg_sendspace = 20 * 1024; /* really max datagram size */ static u_long ngpdg_recvspace = 20 * 1024; /* List of all sockets */ static LIST_HEAD(, ngpcb) ngsocklist; #define sotongpcb(so) ((struct ngpcb *)(so)->so_pcb) /* If getting unexplained errors returned, set this to "Debugger("X"); */ #ifndef TRAP_ERROR #define TRAP_ERROR #endif /*************************************************************** Control sockets ***************************************************************/ static int ngc_attach(struct socket *so, int proto, struct thread *td) { struct ngpcb *const pcbp = sotongpcb(so); if (suser_td(td)) return (EPERM); if (pcbp != NULL) return (EISCONN); return (ng_attach_cntl(so)); } static int ngc_detach(struct socket *so) { struct ngpcb *const pcbp = sotongpcb(so); if (pcbp == NULL) return (EINVAL); ng_detach_common(pcbp, NG_CONTROL); return (0); } static int ngc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { struct ngpcb *const pcbp = sotongpcb(so); struct sockaddr_ng *const sap = (struct sockaddr_ng *) addr; struct ng_mesg *msg; struct mbuf *m0; char *path = NULL; int len, error = 0; if (pcbp == NULL) { error = EINVAL; goto release; } #ifdef NOTYET if (control && (error = ng_internalize(control, td))) { if (pcbp->sockdata == NULL) { error = ENOTCONN; goto release; } } #else /* NOTYET */ if (control) { error = EINVAL; goto release; } #endif /* NOTYET */ /* Require destination as there may be >= 1 hooks on this node */ if (addr == NULL) { error = EDESTADDRREQ; goto release; } /* Allocate an expendable buffer for the path, chop off * the sockaddr header, and make sure it's NUL terminated */ len = sap->sg_len - 2; MALLOC(path, char *, len + 1, M_NETGRAPH_PATH, M_WAITOK); if (path == NULL) { error = ENOMEM; goto release; } bcopy(sap->sg_data, path, len); path[len] = '\0'; /* Move the actual message out of mbufs into a linear buffer. 
* Start by adding up the size of the data. (could use mh_len?) */ for (len = 0, m0 = m; m0 != NULL; m0 = m0->m_next) len += m0->m_len; /* Move the data into a linear buffer as well. Messages are not * delivered in mbufs. */ MALLOC(msg, struct ng_mesg *, len + 1, M_NETGRAPH_MSG, M_WAITOK); if (msg == NULL) { error = ENOMEM; goto release; } m_copydata(m, 0, len, (char *)msg); #ifdef TRACE_MESSAGES do { item_p item; if ((item = ng_package_msg(msg)) == NULL) { (msg) = NULL; (error) = ENOMEM; printf("err=%d\n",error); break; } if (((error) = ng_address_path((pcbp->sockdata->node), (item), (path), (NULL))) == 0) { printf("[%x]:<---------[socket]: c=<%d>cmd=%x(%s) f=%x #%d (%s)\n", item->el_dest->nd_ID, msg->header.typecookie, msg->header.cmd, msg->header.cmdstr, msg->header.flags, msg->header.token, item->el_dest->nd_type->name); SAVE_LINE(item); (error) = ng_snd_item((item), 0); } else { printf("errx=%d\n",error); } (msg) = NULL; } while (0); #else /* The callee will free the msg when done. The path is our business. */ NG_SEND_MSG_PATH(error, pcbp->sockdata->node, msg, path, NULL); #endif release: if (path != NULL) FREE(path, M_NETGRAPH_PATH); if (control != NULL) m_freem(control); if (m != NULL) m_freem(m); return (error); } static int ngc_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct ngpcb *const pcbp = sotongpcb(so); if (pcbp == 0) return (EINVAL); return (ng_bind(nam, pcbp)); } static int ngc_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { printf(" program tried to connect control socket to remote node\n "); /* * At this time refuse to do this.. it used to * do something but it was undocumented and not used. 
*/ return (EINVAL); } /*************************************************************** Data sockets ***************************************************************/ static int ngd_attach(struct socket *so, int proto, struct thread *td) { struct ngpcb *const pcbp = sotongpcb(so); if (pcbp != NULL) return (EISCONN); return (ng_attach_data(so)); } static int ngd_detach(struct socket *so) { struct ngpcb *const pcbp = sotongpcb(so); if (pcbp == NULL) return (EINVAL); ng_detach_common(pcbp, NG_DATA); return (0); } static int ngd_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { struct ngpcb *const pcbp = sotongpcb(so); struct sockaddr_ng *const sap = (struct sockaddr_ng *) addr; int len, error; hook_p hook = NULL; char hookname[NG_HOOKLEN + 1]; if ((pcbp == NULL) || (control != NULL)) { error = EINVAL; goto release; } if (pcbp->sockdata == NULL) { error = ENOTCONN; goto release; } /* * If the user used any of these ways to not specify an address * then handle specially. */ if ((sap == NULL) || ((len = sap->sg_len - 2) <= 0) || (*sap->sg_data == '\0')) { if (NG_NODE_NUMHOOKS(pcbp->sockdata->node) != 1) { error = EDESTADDRREQ; goto release; } /* * if exactly one hook exists, just use it. * Special case to allow write(2) to work on an ng_socket. 
*/ hook = LIST_FIRST(&pcbp->sockdata->node->nd_hooks); } else { if (len > NG_HOOKLEN) { error = EINVAL; goto release; } /* * chop off the sockaddr header, and make sure it's NUL * terminated */ bcopy(sap->sg_data, hookname, len); hookname[len] = '\0'; /* Find the correct hook from 'hookname' */ LIST_FOREACH(hook, &pcbp->sockdata->node->nd_hooks, hk_hooks) { if (strcmp(hookname, NG_HOOK_NAME(hook)) == 0) { break; } } if (hook == NULL) { error = EHOSTUNREACH; } } /* Send data (OK if hook is NULL) */ NG_SEND_DATA_ONLY(error, hook, m); /* makes m NULL */ release: if (control != NULL) m_freem(control); if (m != NULL) m_freem(m); return (error); } static int ngd_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct ngpcb *const pcbp = sotongpcb(so); if (pcbp == 0) return (EINVAL); return (ng_connect_data(nam, pcbp)); } /* * Used for both data and control sockets */ static int ng_setsockaddr(struct socket *so, struct sockaddr **addr) { struct ngpcb *pcbp; struct sockaddr_ng *sg; int sg_len, namelen, s; /* Why isn't sg_data a `char[1]' ? :-( */ sg_len = sizeof(struct sockaddr_ng) - sizeof(sg->sg_data) + 1; s = splnet(); pcbp = sotongpcb(so); if ((pcbp == NULL) || (pcbp->sockdata == NULL)) { splx(s); return (EINVAL); } namelen = 0; /* silence compiler ! */ if ( NG_NODE_HAS_NAME(pcbp->sockdata->node)) sg_len += namelen = strlen(NG_NODE_NAME(pcbp->sockdata->node)); MALLOC(sg, struct sockaddr_ng *, sg_len, M_SONAME, M_WAITOK | M_ZERO); if (NG_NODE_HAS_NAME(pcbp->sockdata->node)) bcopy(NG_NODE_NAME(pcbp->sockdata->node), sg->sg_data, namelen); splx(s); sg->sg_len = sg_len; sg->sg_family = AF_NETGRAPH; *addr = (struct sockaddr *)sg; return (0); } /* * Attach a socket to it's protocol specific partner. * For a control socket, actually create a netgraph node and attach * to it as well. 
 */
static int
ng_attach_cntl(struct socket *so)
{
	struct ngsock *privdata;
	struct ngpcb *pcbp;
	int error;

	/* Setup protocol control block */
	if ((error = ng_attach_common(so, NG_CONTROL)) != 0)
		return (error);
	pcbp = sotongpcb(so);

	/* Allocate node private info */
	MALLOC(privdata, struct ngsock *,
	    sizeof(*privdata), M_NETGRAPH_SOCK, M_WAITOK | M_ZERO);
	if (privdata == NULL) {
		/* undo the common attachment on failure */
		ng_detach_common(pcbp, NG_CONTROL);
		return (ENOMEM);
	}

	/* Make the generic node components */
	if ((error = ng_make_node_common(&typestruct, &privdata->node)) != 0) {
		FREE(privdata, M_NETGRAPH_SOCK);
		ng_detach_common(pcbp, NG_CONTROL);
		return (error);
	}
	NG_NODE_SET_PRIVATE(privdata->node, privdata);

	/* Link the pcb and the node private data */
	privdata->ctlsock = pcbp;
	pcbp->sockdata = privdata;
	privdata->refs++;
	return (0);
}

/*
 * Attach a data socket: only the shared pcb setup is needed, the
 * node association happens later at connect time.
 */
static int
ng_attach_data(struct socket *so)
{
	return(ng_attach_common(so, NG_DATA));
}

/*
 * Set up a socket protocol control block.
 * This code is shared between control and data sockets.
 */
static int
ng_attach_common(struct socket *so, int type)
{
	struct ngpcb *pcbp;
	int error;

	/* Standard socket setup stuff */
	error = soreserve(so, ngpdg_sendspace, ngpdg_recvspace);
	if (error)
		return (error);

	/* Allocate the pcb */
	MALLOC(pcbp, struct ngpcb *, sizeof(*pcbp), M_PCB, M_WAITOK | M_ZERO);
	if (pcbp == NULL)
		return (ENOMEM);
	pcbp->type = type;

	/* Link the pcb and the socket */
	so->so_pcb = (caddr_t) pcbp;
	pcbp->ng_socket = so;

	/* Add the socket to linked list */
	LIST_INSERT_HEAD(&ngsocklist, pcbp, socks);
	return (0);
}

/*
 * Disassociate the socket from it's protocol specific
 * partner. If it's attached to a node's private data structure,
 * then unlink from that too. If we were the last socket attached to it,
 * then shut down the entire node. Shared code for control and data sockets.
 */
static void
ng_detach_common(struct ngpcb *pcbp, int which)
{
	struct ngsock *priv;

	if (pcbp->sockdata) {
		priv = pcbp->sockdata;
		pcbp->sockdata = NULL;
		switch (which) {
		case NG_CONTROL:
			priv->ctlsock = NULL;
			break;
		case NG_DATA:
			priv->datasock = NULL;
			break;
		default:
			panic(__func__);
		}
		/* last socket gone: ask the node to remove itself */
		if ((--priv->refs == 0) && (priv->node != NULL))
			ng_rmnode_self(priv->node);
	}
	pcbp->ng_socket->so_pcb = NULL;
	pcbp->ng_socket = NULL;
	LIST_REMOVE(pcbp, socks);
	FREE(pcbp, M_PCB);
}

#ifdef NOTYET
/*
 * File descriptors can be passed into a AF_NETGRAPH socket.
 * Note, that file descriptors cannot be passed OUT.
 * Only character device descriptors are accepted.
 * Character devices are useful to connect a graph to a device,
 * which after all is the purpose of this whole system.
 *
 * NOTE(review): this region is a unified-diff hunk; the '-' lines are
 * the pre-89306 fdp->fd_ofiles lookup, the '+' lines the new
 * ffind_hold()/fdrop() based lookup.  Markers preserved verbatim.
 */
static int
ng_internalize(struct mbuf *control, struct thread *td)
{
-	struct filedesc *fdp = td->td_proc->p_fd;
	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
	struct file *fp;
	struct vnode *vn;
	int oldfds;
	int fd;

	if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET ||
	    cm->cmsg_len != control->m_len) {
		TRAP_ERROR;
		return (EINVAL);
	}

	/* Check there is only one FD. XXX what would more than one signify? */
	oldfds = ((caddr_t)cm + cm->cmsg_len - (caddr_t)data) / sizeof (int);
	if (oldfds != 1) {
		TRAP_ERROR;
		return (EINVAL);
	}

	/* Check that the FD given is legit. and change it to a pointer to a
	 * struct file. */
	fd = CMSG_DATA(cm);
-	if ((unsigned) fd >= fdp->fd_nfiles
-	    || (fp = fdp->fd_ofiles[fd]) == NULL) {
+	fp = ffind_hold(td, fd);
+	if (fp == NULL)
		return (EBADF);
-	}

	/* Depending on what kind of resource it is, act differently. For
	 * devices, we treat it as a file. For a AF_NETGRAPH socket,
	 * shortcut straight to the node. */
	switch (fp->f_type) {
	case DTYPE_VNODE:
		vn = (struct vnode *) fp->f_data;
		if (vn && (vn->v_type == VCHR)) {
			/* for a VCHR, actually reference the FILE */
			fp->f_count++;
			/* XXX then what :) */
			/* how to pass on to other modules? */
		} else {
+			fdrop(fp, td);
			TRAP_ERROR;
			return (EINVAL);
		}
		break;
	default:
+		fdrop(fp, td);
		TRAP_ERROR;
		return (EINVAL);
	}
+	fdrop(fp, td);
	return (0);
}
#endif /* NOTYET */

/*
 * Connect the data socket to a named control socket node.
 */
static int
ng_connect_data(struct sockaddr *nam, struct ngpcb *pcbp)
{
	struct sockaddr_ng *sap;
	node_p farnode;
	struct ngsock *priv;
	int error;
	item_p item;

	/* If we are already connected, don't do it again */
	if (pcbp->sockdata != NULL)
		return (EISCONN);

	/* Find the target (victim) and check it doesn't already have a data
	 * socket. Also check it is a 'socket' type node.
	 * Use ng_package_data() and address_path() to do this.
	 */
	sap = (struct sockaddr_ng *) nam;
	/* The item will hold the node reference */
	item = ng_package_data(NULL, NULL);
	if (item == NULL) {
		return (ENOMEM);
	}
	if ((error = ng_address_path(NULL, item, sap->sg_data, NULL)))
		return (error); /* item is freed on failure */

	/*
	 * Extract node from item and free item. Remember we now have
	 * a reference on the node. The item holds it for us.
	 * when we free the item we release the reference.
	 */
	farnode = item->el_dest; /* shortcut */
	if (strcmp(farnode->nd_type->name, NG_SOCKET_NODE_TYPE) != 0) {
		NG_FREE_ITEM(item); /* drop the reference to the node */
		return (EINVAL);
	}
	priv = NG_NODE_PRIVATE(farnode);
	if (priv->datasock != NULL) {
		NG_FREE_ITEM(item); /* drop the reference to the node */
		return (EADDRINUSE);
	}

	/*
	 * Link the PCB and the private data struct. and note the extra
	 * reference. Drop the extra reference on the node.
	 */
	priv->datasock = pcbp;
	pcbp->sockdata = priv;
	priv->refs++; /* XXX possible race if it's being freed */
	NG_FREE_ITEM(item); /* drop the reference to the node */
	return (0);
}

/*
 * Binding a socket means giving the corresponding node a name
 */
static int
ng_bind(struct sockaddr *nam, struct ngpcb *pcbp)
{
	struct ngsock *const priv = pcbp->sockdata;
	struct sockaddr_ng *const sap = (struct sockaddr_ng *) nam;

	if (priv == NULL) {
		TRAP_ERROR;
		return (EINVAL);
	}
	/* sanity-check the sockaddr: non-empty, NUL-terminated name */
	if ((sap->sg_len < 4)
	    || (sap->sg_len > (NG_NODELEN + 3))
	    || (sap->sg_data[0] == '\0')
	    || (sap->sg_data[sap->sg_len - 3] != '\0')) {
		TRAP_ERROR;
		return (EINVAL);
	}
	return (ng_name_node(priv->node, sap->sg_data));
}

/*
 * Take a message and pass it up to the control socket associated
 * with the node.
 */
static int
ship_msg(struct ngpcb *pcbp, struct ng_mesg *msg, struct sockaddr_ng *addr)
{
	struct socket *const so = pcbp->ng_socket;
	struct mbuf *mdata;
	int msglen;

	/* Copy the message itself into an mbuf chain */
	msglen = sizeof(struct ng_mesg) + msg->header.arglen;
	mdata = m_devget((caddr_t) msg, msglen, 0, NULL, NULL);

	/* Here we free the message, as we are the end of the line.
	 * We need to do that regardless of whether we got mbufs. */
	NG_FREE_MSG(msg);

	if (mdata == NULL) {
		TRAP_ERROR;
		return (ENOBUFS);
	}

	/* Send it up to the socket */
	if (sbappendaddr(&so->so_rcv,
	    (struct sockaddr *) addr, mdata, NULL) == 0) {
		TRAP_ERROR;
		m_freem(mdata);
		return (ENOBUFS);
	}
	sorwakeup(so);
	return (0);
}

/*
 * You can only create new nodes from the socket end of things.
 */
static int
ngs_constructor(node_p nodep)
{
	return (EINVAL);
}

/*
 * We allow any hook to be connected to the node.
 * There is no per-hook private information though.
 */
static int
ngs_newhook(node_p node, hook_p hook, const char *name)
{
	NG_HOOK_SET_PRIVATE(hook, NG_NODE_PRIVATE(node));
	return (0);
}

/*
 * if only one hook, allow read(2) and write(2) to work.
 */
static int
ngs_connect(hook_p hook)
{
	node_p node = NG_HOOK_NODE(hook);
	struct ngsock *priv = NG_NODE_PRIVATE(node);

	/* reflect single-hook status into the data socket's state */
	if ((priv->datasock)
	&&  (priv->datasock->ng_socket)) {
		if (NG_NODE_NUMHOOKS(node) == 1) {
			priv->datasock->ng_socket->so_state |= SS_ISCONNECTED;
		} else {
			priv->datasock->ng_socket->so_state &= ~SS_ISCONNECTED;
		}
	}
	return (0);
}

/*
 * Incoming messages get passed up to the control socket.
 * Unless they are for us specifically (socket_type)
 */
static int
ngs_rcvmsg(node_p node, item_p item, hook_p lasthook)
{
	struct ngsock *const priv = NG_NODE_PRIVATE(node);
	struct ngpcb *const pcbp = priv->ctlsock;
	struct sockaddr_ng *addr;
	int addrlen;
	int error = 0;
	struct ng_mesg *msg;
	ng_ID_t retaddr = NGI_RETADDR(item);
	char retabuf[32];

	NGI_GET_MSG(item, msg);
	NG_FREE_ITEM(item); /* we have all we need */

	/* Only allow mesgs to be passed if we have the control socket.
	 * Data sockets can only support the generic messages. */
	if (pcbp == NULL) {
		TRAP_ERROR;
		return (EINVAL);
	}
#ifdef TRACE_MESSAGES
	printf("[%x]:---------->[socket]: c=<%d>cmd=%x(%s) f=%x #%d\n",
		retaddr, msg->header.typecookie,
		msg->header.cmd, msg->header.cmdstr,
		msg->header.flags, msg->header.token);
#endif

	/* Messages aimed at the socket node itself: linger control */
	if (msg->header.typecookie == NGM_SOCKET_COOKIE) {
		switch (msg->header.cmd) {
		case NGM_SOCK_CMD_NOLINGER:
			priv->flags |= NGS_FLAG_NOLINGER;
			break;
		case NGM_SOCK_CMD_LINGER:
			priv->flags &= ~NGS_FLAG_NOLINGER;
			break;
		default:
			error = EINVAL;	/* unknown command */
		}
		/* Free the message and return */
		NG_FREE_MSG(msg);
		return(error);
	}
	/* Get the return address into a sockaddr */
	sprintf(retabuf,"[%x]:", retaddr);
	addrlen = strlen(retabuf);
	/*
	 * NOTE(review): if this M_NOWAIT allocation fails, msg (already
	 * taken from the item above) appears to be leaked — confirm.
	 */
	MALLOC(addr, struct sockaddr_ng *, addrlen + 4,
	    M_NETGRAPH_PATH, M_NOWAIT);
	if (addr == NULL) {
		TRAP_ERROR;
		return (ENOMEM);
	}
	addr->sg_len = addrlen + 3;
	addr->sg_family = AF_NETGRAPH;
	bcopy(retabuf, addr->sg_data, addrlen);
	addr->sg_data[addrlen] = '\0';

	/* Send it up */
	error = ship_msg(pcbp, msg, addr);
	FREE(addr, M_NETGRAPH_PATH);
	return (error);
}

/*
 * Receive data on a hook
 */
static int
ngs_rcvdata(hook_p hook, item_p item)
{
	struct ngsock *const priv = NG_NODE_PRIVATE(NG_HOOK_NODE(hook));
	struct ngpcb *const pcbp = priv->datasock;
	struct socket *so;
	struct sockaddr_ng *addr;
	/*
	 * NOTE(review): declared as an array of char POINTERS, not chars;
	 * presumably a typo that merely over-sizes the on-stack buffer.
	 */
	char *addrbuf[NG_HOOKLEN + 1 + 4];
	int addrlen;
	struct mbuf *m;

	NGI_GET_M(item, m);
	NG_FREE_ITEM(item);
	/* If there is no data socket, black-hole it */
	if (pcbp == NULL) {
		NG_FREE_M(m);
		return (0);
	}
	so = pcbp->ng_socket;

	/* Get the return address into a sockaddr. */
	addrlen = strlen(NG_HOOK_NAME(hook));	/* <= NG_HOOKLEN */
	addr = (struct sockaddr_ng *) addrbuf;
	addr->sg_len = addrlen + 3;
	addr->sg_family = AF_NETGRAPH;
	bcopy(NG_HOOK_NAME(hook), addr->sg_data, addrlen);
	addr->sg_data[addrlen] = '\0';

	/* Try to tell the socket which hook it came in on */
	if (sbappendaddr(&so->so_rcv, (struct sockaddr *) addr, m, NULL) == 0) {
		m_freem(m);
		TRAP_ERROR;
		return (ENOBUFS);
	}
	sorwakeup(so);
	return (0);
}

/*
 * Hook disconnection
 *
 * For this type, removal of the last link destroys the node
 * if the NOLINGER flag is set.
 */
static int
ngs_disconnect(hook_p hook)
{
	node_p node = NG_HOOK_NODE(hook);
	struct ngsock *const priv = NG_NODE_PRIVATE(node);

	/* keep the data socket's connected state in sync (as in connect) */
	if ((priv->datasock)
	&&  (priv->datasock->ng_socket)) {
		if (NG_NODE_NUMHOOKS(node) == 1) {
			priv->datasock->ng_socket->so_state |= SS_ISCONNECTED;
		} else {
			priv->datasock->ng_socket->so_state &= ~SS_ISCONNECTED;
		}
	}

	if ((priv->flags & NGS_FLAG_NOLINGER )
	&& (NG_NODE_NUMHOOKS(node) == 0)
	&& (NG_NODE_IS_VALID(node))) {
		ng_rmnode_self(node);
	}
	return (0);
}

/*
 * Do local shutdown processing.
 * In this case, that involves making sure the socket
 * knows we should be shutting down.
 */
static int
ngs_shutdown(node_p node)
{
	struct ngsock *const priv = NG_NODE_PRIVATE(node);
	struct ngpcb *const dpcbp = priv->datasock;
	struct ngpcb *const pcbp = priv->ctlsock;

	/* detach any remaining data and control sockets */
	if (dpcbp != NULL) {
		soisdisconnected(dpcbp->ng_socket);
		dpcbp->sockdata = NULL;
		priv->datasock = NULL;
		priv->refs--;
	}
	if (pcbp != NULL) {
		soisdisconnected(pcbp->ng_socket);
		pcbp->sockdata = NULL;
		priv->ctlsock = NULL;
		priv->refs--;
	}
	NG_NODE_SET_PRIVATE(node, NULL);
	NG_NODE_UNREF(node);
	FREE(priv, M_NETGRAPH_SOCK);
	return (0);
}

/* No-op disconnect handler shared by both socket types. */
static int
dummy_disconnect(struct socket *so)
{
	return (0);
}

/*
 * Control and data socket type descriptors
 */
static struct pr_usrreqs ngc_usrreqs = {
	NULL,			/* abort */
	pru_accept_notsupp,
	ngc_attach,
	ngc_bind,
	ngc_connect,
	pru_connect2_notsupp,
	pru_control_notsupp,
	ngc_detach,
	dummy_disconnect,	/* disconnect */
	pru_listen_notsupp,
	NULL,			/* setpeeraddr */
	pru_rcvd_notsupp,
	pru_rcvoob_notsupp,
	ngc_send,
	pru_sense_null,
	NULL,			/* shutdown */
	ng_setsockaddr,
	sosend,
	soreceive,
	sopoll
};

static struct pr_usrreqs ngd_usrreqs = {
	NULL,			/* abort */
	pru_accept_notsupp,
	ngd_attach,
	NULL,			/* bind */
	ngd_connect,
	pru_connect2_notsupp,
	pru_control_notsupp,
	ngd_detach,
	dummy_disconnect,	/* disconnect */
	pru_listen_notsupp,
	NULL,			/* setpeeraddr */
	pru_rcvd_notsupp,
	pru_rcvoob_notsupp,
	ngd_send,
	pru_sense_null,
	NULL,			/* shutdown */
	ng_setsockaddr,
	sosend,
	soreceive,
	sopoll
};

/*
 * Definitions of protocols supported in the NETGRAPH domain.
 */
extern struct domain ngdomain;	/* stop compiler warnings */

static struct protosw ngsw[] = {
	{
		SOCK_DGRAM,	/* protocol type */
		&ngdomain,	/* backpointer to domain */
		NG_CONTROL,
		PR_ATOMIC | PR_ADDR /* | PR_RIGHTS */,	/* flags */
		0, 0, 0, 0,	/* input, output, ctlinput, ctloutput */
		NULL,		/* ousrreq */
		0, 0, 0, 0,	/* init, fasttimeo, slowtimo, drain */
		&ngc_usrreqs,	/* usrreq table (above) */
		/*{NULL}*/	/* pffh (protocol filter head?) */
	},
	{
		SOCK_DGRAM,	/* protocol type */
		&ngdomain,	/* backpointer to domain */
		NG_DATA,
		PR_ATOMIC | PR_ADDR,	/* flags */
		0, 0, 0, 0,	/* input, output, ctlinput, ctloutput */
		NULL,		/* ousrreq() */
		0, 0, 0, 0,	/* init, fasttimeo, slowtimo, drain */
		&ngd_usrreqs,	/* usrreq table (above) */
		/*{NULL}*/	/* pffh (protocol filter head?) */
	}
};

struct domain ngdomain = {
	AF_NETGRAPH,
	"netgraph",
	NULL,					/* init() */
	NULL,					/* externalise() */
	NULL,					/* dispose() */
	ngsw,					/* protosw entry */
	&ngsw[sizeof(ngsw) / sizeof(ngsw[0])],	/* Number of protosw entries */
	NULL,					/* next domain in list */
	NULL,					/* rtattach() */
	0,					/* arg to rtattach in bits */
	0					/* maxrtkey */
};

/*
 * Handle loading and unloading for this node type
 * This is to handle auxiliary linkages (e.g protocol domain addition).
 */
static int
ngs_mod_event(module_t mod, int event, void *data)
{
	int error = 0;

	switch (event) {
	case MOD_LOAD:
		/* Register protocol domain */
		net_add_domain(&ngdomain);
		break;
	case MOD_UNLOAD:
		/* Insure there are no open netgraph sockets */
		if (!LIST_EMPTY(&ngsocklist)) {
			error = EBUSY;
			break;
		}
#ifdef NOTYET
		if ((LIST_EMPTY(&ngsocklist)) && (typestruct.refs == 0)) {
			/* Unregister protocol domain XXX can't do this yet.. */
			if ((error = net_rm_domain(&ngdomain)) != 0)
				break;
		} else
#endif
			error = EBUSY;
		break;
	default:
		error = EOPNOTSUPP;
		break;
	}
	return (error);
}

SYSCTL_INT(_net_graph, OID_AUTO, family, CTLFLAG_RD, 0, AF_NETGRAPH, "");
SYSCTL_NODE(_net_graph, OID_AUTO, data, CTLFLAG_RW, 0, "DATA");
SYSCTL_INT(_net_graph_data, OID_AUTO, proto, CTLFLAG_RD, 0, NG_DATA, "");
SYSCTL_NODE(_net_graph, OID_AUTO, control, CTLFLAG_RW, 0, "CONTROL");
SYSCTL_INT(_net_graph_control, OID_AUTO, proto, CTLFLAG_RD, 0, NG_CONTROL, "");

Index: head/sys/netsmb/smb_dev.c
===================================================================
--- head/sys/netsmb/smb_dev.c	(revision 89305)
+++ head/sys/netsmb/smb_dev.c	(revision 89306)
@@ -1,434 +1,445 @@
/*
 * Copyright (c) 2000-2001 Boris Popov
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *    This product includes software developed by Boris Popov.
 * 4. Neither the name of the author nor the names of any co-contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
   IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

/*
 * NOTE(review): the targets of the following #include directives were
 * lost when this file was extracted (bare "#include" tokens remain);
 * recover them from the original revision before compiling.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include	/* Must come after sys/malloc.h */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

/* fetch the per-device softc hung off the dev_t */
#define SMB_GETDEV(dev)		((struct smb_dev*)(dev)->si_drv1)
/* look up the softc into local 'sdp', failing with ENXIO if absent */
#define	SMB_CHECKMINOR(dev)	do { \
				    sdp = SMB_GETDEV(dev); \
				    if (sdp == NULL) return ENXIO; \
				} while(0)

static d_open_t	 nsmb_dev_open;
static d_close_t nsmb_dev_close;
static d_read_t	 nsmb_dev_read;
static d_write_t nsmb_dev_write;
static d_ioctl_t nsmb_dev_ioctl;
static d_poll_t	 nsmb_dev_poll;

MODULE_DEPEND(netsmb, libiconv, 1, 1, 1);
MODULE_VERSION(netsmb, NSMB_VERSION);

static int smb_version = NSMB_VERSION;

SYSCTL_DECL(_net_smb);
SYSCTL_INT(_net_smb, OID_AUTO, version, CTLFLAG_RD, &smb_version, 0, "");

static MALLOC_DEFINE(M_NSMBDEV, "NETSMBDEV", "NET/SMB device");

/*
int smb_dev_queue(struct smb_dev *ndp, struct smb_rq *rqp, int prio);
*/

static struct cdevsw nsmb_cdevsw = {
	/* open */	nsmb_dev_open,
	/* close */	nsmb_dev_close,
	/* read */	nsmb_dev_read,
	/* write */	nsmb_dev_write,
	/* ioctl */	nsmb_dev_ioctl,
	/* poll */	nsmb_dev_poll,
	/* mmap */	nommap,
	/* strategy */	nostrategy,
	/* name */	NSMB_NAME,
	/* maj */	NSMB_MAJOR,
	/* dump */	nodump,
	/* psize */	nopsize,
	/* flags */	0,
#ifndef FB_CURRENT
	/* bmaj */	-1
#endif
};

static eventhandler_tag	 nsmb_dev_tag;

/*
 * dev_clone hook: create /dev/nsmbN on first open of a cloned unit.
 */
static void
nsmb_dev_clone(void *arg, char *name, int namelen, dev_t *dev)
{
	int min;

	if (*dev != NODEV)
		return;
	if (dev_stdclone(name, NULL, NSMB_NAME, &min) != 1)
		return;
	*dev = make_dev(&nsmb_cdevsw, min, 0, 0, 0600, NSMB_NAME"%d", min);
}

static int
nsmb_dev_open(dev_t dev, int oflags, int devtype, struct thread *td)
{
	struct smb_dev *sdp;
	struct proc *p = td->td_proc;
	struct ucred *cred = p->p_ucred;
	int s;

	sdp = SMB_GETDEV(dev);
	if (sdp && (sdp->sd_flags & NSMBFL_OPEN))
		return EBUSY;
	if (sdp == NULL) {
		sdp = malloc(sizeof(*sdp), M_NSMBDEV, M_WAITOK);
		dev->si_drv1 = (void*)sdp;
	}
	/*
	 * XXX: this is just crazy - make a device for an already passed device...
	 * someone should take care of it.
	 */
	if ((dev->si_flags & SI_NAMED) == 0)
		make_dev(&nsmb_cdevsw, minor(dev), cred->cr_uid, cred->cr_gid, 0700,
		    NSMB_NAME"%d", dev2unit(dev));
	bzero(sdp, sizeof(*sdp));
/*
	STAILQ_INIT(&sdp->sd_rqlist);
	STAILQ_INIT(&sdp->sd_rplist);
	bzero(&sdp->sd_pollinfo, sizeof(struct selinfo));
*/
	s = splimp();
	sdp->sd_level = -1;
	sdp->sd_flags |= NSMBFL_OPEN;
	splx(s);
	return 0;
}

static int
nsmb_dev_close(dev_t dev, int flag, int fmt, struct thread *td)
{
	struct smb_dev *sdp;
	struct smb_vc *vcp;
	struct smb_share *ssp;
	struct smb_cred scred;
	int s;

	SMB_CHECKMINOR(dev);
	s = splimp();
	if ((sdp->sd_flags & NSMBFL_OPEN) == 0) {
		splx(s);
		return EBADF;
	}
	smb_makescred(&scred, td, NULL);
	/* release any share/VC still attached to this descriptor */
	ssp = sdp->sd_share;
	if (ssp != NULL)
		smb_share_rele(ssp, &scred);
	vcp = sdp->sd_vc;
	if (vcp != NULL)
		smb_vc_rele(vcp, &scred);
/*
	smb_flushq(&sdp->sd_rqlist);
	smb_flushq(&sdp->sd_rplist);
*/
	dev->si_drv1 = NULL;
	free(sdp, M_NSMBDEV);
	destroy_dev(dev);
	splx(s);
	return 0;
}

static int
nsmb_dev_ioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct thread *td)
{
	struct smb_dev *sdp;
	struct smb_vc *vcp;
	struct smb_share *ssp;
	struct smb_cred scred;
	int error = 0;

	SMB_CHECKMINOR(dev);
	if ((sdp->sd_flags & NSMBFL_OPEN) == 0)
		return EBADF;

	smb_makescred(&scred, td, NULL);
	switch (cmd) {
	    case SMBIOC_OPENSESSION:
		if (sdp->sd_vc)
			return EISCONN;
		error = smb_usr_opensession((struct smbioc_ossn*)data,
		    &scred, &vcp);
		if (error)
			break;
		sdp->sd_vc = vcp;
		smb_vc_unlock(vcp, 0, td);
		sdp->sd_level = SMBL_VC;
		break;
	    case SMBIOC_OPENSHARE:
		if (sdp->sd_share)
			return EISCONN;
		if (sdp->sd_vc == NULL)
			return ENOTCONN;
		error = smb_usr_openshare(sdp->sd_vc,
		    (struct smbioc_oshare*)data, &scred, &ssp);
		if (error)
			break;
		sdp->sd_share = ssp;
		smb_share_unlock(ssp, 0, td);
		sdp->sd_level = SMBL_SHARE;
		break;
	    case SMBIOC_REQUEST:
		if (sdp->sd_share == NULL)
			return ENOTCONN;
		error = smb_usr_simplerequest(sdp->sd_share,
		    (struct smbioc_rq*)data, &scred);
		break;
	    case SMBIOC_T2RQ:
		if (sdp->sd_share == NULL)
			return ENOTCONN;
		error = smb_usr_t2request(sdp->sd_share,
		    (struct smbioc_t2rq*)data, &scred);
		break;
	    case SMBIOC_SETFLAGS: {
		struct smbioc_flags *fl = (struct smbioc_flags*)data;
		int on;

		if (fl->ioc_level == SMBL_VC) {
			if (fl->ioc_mask & SMBV_PERMANENT) {
				on = fl->ioc_flags & SMBV_PERMANENT;
				if ((vcp = sdp->sd_vc) == NULL)
					return ENOTCONN;
				error = smb_vc_get(vcp, LK_EXCLUSIVE, &scred);
				if (error)
					break;
				if (on && (vcp->obj.co_flags & SMBV_PERMANENT) == 0) {
					vcp->obj.co_flags |= SMBV_PERMANENT;
					smb_vc_ref(vcp);
				} else if (!on && (vcp->obj.co_flags & SMBV_PERMANENT)) {
					vcp->obj.co_flags &= ~SMBV_PERMANENT;
					smb_vc_rele(vcp, &scred);
				}
				smb_vc_put(vcp, &scred);
			} else
				error = EINVAL;
		} else if (fl->ioc_level == SMBL_SHARE) {
			if (fl->ioc_mask & SMBS_PERMANENT) {
				on = fl->ioc_flags & SMBS_PERMANENT;
				if ((ssp = sdp->sd_share) == NULL)
					return ENOTCONN;
				error = smb_share_get(ssp, LK_EXCLUSIVE, &scred);
				if (error)
					break;
				if (on && (ssp->obj.co_flags & SMBS_PERMANENT) == 0) {
					ssp->obj.co_flags |= SMBS_PERMANENT;
					smb_share_ref(ssp);
				} else if (!on && (ssp->obj.co_flags & SMBS_PERMANENT)) {
					ssp->obj.co_flags &= ~SMBS_PERMANENT;
					smb_share_rele(ssp, &scred);
				}
				smb_share_put(ssp, &scred);
			} else
				error = EINVAL;
			break;
		} else
			error = EINVAL;
		break;
	    }
	    case SMBIOC_LOOKUP:
		if (sdp->sd_vc || sdp->sd_share)
			return EISCONN;
		vcp = NULL;
		ssp = NULL;
		error = smb_usr_lookup((struct smbioc_lookup*)data, &scred,
		    &vcp, &ssp);
		if (error)
			break;
		if (vcp) {
			sdp->sd_vc = vcp;
			smb_vc_unlock(vcp, 0, td);
			sdp->sd_level = SMBL_VC;
		}
		if (ssp) {
			sdp->sd_share = ssp;
			smb_share_unlock(ssp, 0, td);
			sdp->sd_level = SMBL_SHARE;
		}
		break;
	    case SMBIOC_READ: case SMBIOC_WRITE: {
		struct smbioc_rw *rwrq = (struct smbioc_rw*)data;
		struct uio auio;
		struct iovec iov;

		if ((ssp = sdp->sd_share) == NULL)
			return ENOTCONN;
		iov.iov_base = rwrq->ioc_base;
		iov.iov_len = rwrq->ioc_cnt;
		auio.uio_iov = &iov;
		auio.uio_iovcnt = 1;
		auio.uio_offset = rwrq->ioc_offset;
		auio.uio_resid = rwrq->ioc_cnt;
		auio.uio_segflg = UIO_USERSPACE;
		auio.uio_rw = (cmd == SMBIOC_READ) ? UIO_READ : UIO_WRITE;
		auio.uio_td = td;
		if (cmd == SMBIOC_READ)
			error = smb_read(ssp, rwrq->ioc_fh, &auio, &scred);
		else
			error = smb_write(ssp, rwrq->ioc_fh, &auio, &scred);
		rwrq->ioc_cnt -= auio.uio_resid;
		break;
	    }
	    default:
		error = ENODEV;
	}
	return error;
}

static int
nsmb_dev_read(dev_t dev, struct uio *uio, int flag)
{
	return EACCES;
}

static int
nsmb_dev_write(dev_t dev, struct uio *uio, int flag)
{
	return EACCES;
}

static int
nsmb_dev_poll(dev_t dev, int events, struct thread *td)
{
	return ENODEV;
}

static int
nsmb_dev_load(module_t mod, int cmd, void *arg)
{
	int error = 0;

	switch (cmd) {
	    case MOD_LOAD:
		error = smb_sm_init();
		if (error)
			break;
		error = smb_iod_init();
		if (error) {
			smb_sm_done();
			break;
		}
		cdevsw_add(&nsmb_cdevsw);
		nsmb_dev_tag = EVENTHANDLER_REGISTER(dev_clone,
		    nsmb_dev_clone, 0, 1000);
		printf("netsmb_dev: loaded\n");
		break;
	    case MOD_UNLOAD:
		smb_iod_done();
		error = smb_sm_done();
		/*
		 * NOTE(review): the smb_sm_done() result is discarded
		 * here — presumably deliberate; confirm.
		 */
		error = 0;
		EVENTHANDLER_DEREGISTER(dev_clone, nsmb_dev_tag);
		cdevsw_remove(&nsmb_cdevsw);
		printf("netsmb_dev: unloaded\n");
		break;
	    default:
		error = EINVAL;
		break;
	}
	return error;
}

DEV_MODULE (dev_netsmb, nsmb_dev_load, 0);

/*
 * Convert a file descriptor to appropriate smb_share pointer.
 *
 * NOTE(review): this is a unified-diff hunk; the '+' lines add
 * FILEDESC locking and an fhold() reference on the returned file.
 */
static struct file*
nsmb_getfp(struct filedesc* fdp, int fd, int flag)
{
	struct file* fp;

+	FILEDESC_LOCK(fdp);
	if (((u_int)fd) >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[fd]) == NULL ||
-	    (fp->f_flag & flag) == 0)
+	    (fp->f_flag & flag) == 0) {
+		FILEDESC_UNLOCK(fdp);
		return (NULL);
+	}
+	fhold(fp);
+	FILEDESC_UNLOCK(fdp);
	return (fp);
}

/*
 * Diff hunk: every early return now drops the file reference taken
 * by nsmb_getfp() above.
 */
int
smb_dev2share(int fd, int mode, struct smb_cred *scred,
	struct smb_share **sspp)
{
	struct file *fp;
	struct vnode *vp;
	struct smb_dev *sdp;
	struct smb_share *ssp;
	dev_t dev;
	int error;

	fp = nsmb_getfp(scred->scr_td->td_proc->p_fd, fd, FREAD | FWRITE);
	if (fp == NULL)
		return EBADF;
	vp = (struct vnode*)fp->f_data;
-	if (vp == NULL)
+	if (vp == NULL) {
+		fdrop(fp, curthread);
		return EBADF;
+	}
	dev = vn_todev(vp);
-	if (dev == NODEV)
+	if (dev == NODEV) {
+		fdrop(fp, curthread);
		return EBADF;
+	}
	SMB_CHECKMINOR(dev);
	ssp = sdp->sd_share;
-	if (ssp == NULL)
+	if (ssp == NULL) {
+		fdrop(fp, curthread);
		return ENOTCONN;
+	}
	error = smb_share_get(ssp, LK_EXCLUSIVE, scred);
-	if (error)
-		return error;
-	*sspp = ssp;
-	return 0;
+	if (error == 0)
+		*sspp = ssp;
+	fdrop(fp, curthread);
+	return error;
}

Index: head/sys/sys/fcntl.h
===================================================================
--- head/sys/sys/fcntl.h	(revision 89305)
+++ head/sys/sys/fcntl.h	(revision 89306)
@@ -1,211 +1,212 @@
/*-
 * Copyright (c) 1983, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2.
    Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)fcntl.h	8.3 (Berkeley) 1/21/94
 * $FreeBSD$
 */

#ifndef _SYS_FCNTL_H_
#define	_SYS_FCNTL_H_

/*
 * This file includes the definitions for open and fcntl
 * described by POSIX for <fcntl.h>; it also includes
 * related kernel definitions.
 */

#ifndef _KERNEL
/* NOTE(review): #include target lost in extraction — recover from original. */
#include
#endif

/*
 * File status flags: these are used by open(2), fcntl(2).
 * They are also used (indirectly) in the kernel file structure f_flags,
 * which is a superset of the open/fcntl flags.  Open flags and f_flags
 * are inter-convertible using OFLAGS(fflags) and FFLAGS(oflags).
 * Open/fcntl flags begin with O_; kernel-internal flags begin with F.
 */
/* open-only flags */
#define	O_RDONLY	0x0000		/* open for reading only */
#define	O_WRONLY	0x0001		/* open for writing only */
#define	O_RDWR		0x0002		/* open for reading and writing */
#define	O_ACCMODE	0x0003		/* mask for above modes */

/*
 * Kernel encoding of open mode; separate read and write bits that are
 * independently testable: 1 greater than the above.
 *
 * XXX
 * FREAD and FWRITE are excluded from the #ifdef _KERNEL so that TIOCFLUSH,
 * which was documented to use FREAD/FWRITE, continues to work.
 */
#ifndef _POSIX_SOURCE
#define	FREAD		0x0001
#define	FWRITE		0x0002
#endif
#define	O_NONBLOCK	0x0004		/* no delay */
#define	O_APPEND	0x0008		/* set append mode */
#ifndef _POSIX_SOURCE
#define	O_SHLOCK	0x0010		/* open with shared file lock */
#define	O_EXLOCK	0x0020		/* open with exclusive file lock */
#define	O_ASYNC		0x0040		/* signal pgrp when data ready */
#define	O_FSYNC		0x0080		/* synchronous writes */
#define	O_NOFOLLOW	0x0100		/* don't follow symlinks */
#endif
#define	O_CREAT		0x0200		/* create if nonexistent */
#define	O_TRUNC		0x0400		/* truncate to zero length */
#define	O_EXCL		0x0800		/* error if already exists */
#ifdef _KERNEL
/*
 * Diff hunk: FMARK/FDEFER move out of f_flag into the new f_gcflags
 * field, hence the renumbering to small values.
 */
-#define	FMARK		0x1000		/* mark during gc() */
-#define	FDEFER		0x2000		/* defer for next gc pass */
+/* FMARK/FDEFER kept in f_gcflags */
+#define	FMARK		0x1		/* mark during gc() */
+#define	FDEFER		0x2		/* defer for next gc pass */
#define	FHASLOCK	0x4000		/* descriptor holds advisory lock */
#endif

/* Defined by POSIX 1003.1; BSD default, but must be distinct from O_RDONLY. */
#define	O_NOCTTY	0x8000		/* don't assign controlling terminal */

/* Attempt to bypass buffer cache */
#define	O_DIRECT	0x00010000

#ifdef _KERNEL
/* convert from open() flags to/from fflags; convert O_RD/WR to FREAD/FWRITE */
#define	FFLAGS(oflags)	((oflags) + 1)
#define	OFLAGS(fflags)	((fflags) - 1)

/* bits to save after open */
#define	FMASK	(FREAD|FWRITE|FAPPEND|FASYNC|FFSYNC|FNONBLOCK|O_DIRECT)
/* bits settable by fcntl(F_SETFL, ...) */
#define	FCNTLFLAGS	(FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FPOSIXSHM|O_DIRECT)
#endif

/*
 * The O_* flags used to have only F* names, which were used in the kernel
 * and by fcntl.  We retain the F* names for the kernel f_flag field
 * and for backward compatibility for fcntl.
 */
#ifndef _POSIX_SOURCE
#define	FAPPEND		O_APPEND	/* kernel/compat */
#define	FASYNC		O_ASYNC		/* kernel/compat */
#define	FFSYNC		O_FSYNC		/* kernel */
#define	FNONBLOCK	O_NONBLOCK	/* kernel */
#define	FNDELAY		O_NONBLOCK	/* compat */
#define	O_NDELAY	O_NONBLOCK	/* compat */
#endif

/*
 * We are out of bits in f_flag (which is a short).  However,
 * the flag bits not set in FMASK are only meaningful in the
 * initial open syscall.  Those bits can thus be given a
 * different meaning for fcntl(2).
 */
#ifndef _POSIX_SOURCE
/*
 * Set by shm_open(3) to get automatic MAP_ASYNC behavior
 * for POSIX shared memory objects (which are otherwise
 * implemented as plain files).
 */
#define	FPOSIXSHM	O_NOFOLLOW
#endif

/*
 * Constants used for fcntl(2)
 */

/* command values */
#define	F_DUPFD		0		/* duplicate file descriptor */
#define	F_GETFD		1		/* get file descriptor flags */
#define	F_SETFD		2		/* set file descriptor flags */
#define	F_GETFL		3		/* get file status flags */
#define	F_SETFL		4		/* set file status flags */
#ifndef _POSIX_SOURCE
#define	F_GETOWN	5		/* get SIGIO/SIGURG proc/pgrp */
#define	F_SETOWN	6		/* set SIGIO/SIGURG proc/pgrp */
#endif
#define	F_GETLK		7		/* get record locking information */
#define	F_SETLK		8		/* set record locking information */
#define	F_SETLKW	9		/* F_SETLK; wait if blocked */

/* file descriptor flags (F_GETFD, F_SETFD) */
#define	FD_CLOEXEC	1		/* close-on-exec flag */

/* record locking flags (F_GETLK, F_SETLK, F_SETLKW) */
#define	F_RDLCK		1		/* shared or read lock */
#define	F_UNLCK		2		/* unlock */
#define	F_WRLCK		3		/* exclusive or write lock */

#ifdef _KERNEL
#define	F_WAIT		0x010		/* Wait until lock is granted */
#define	F_FLOCK		0x020		/* Use flock(2) semantics for lock */
#define	F_POSIX		0x040		/* Use POSIX semantics for lock */
#endif

/*
 * Advisory file segment locking data type -
 * information passed to system by user
 */
struct flock {
	off_t	l_start;	/* starting offset */
	off_t	l_len;		/* len = 0 means until end of file */
	pid_t	l_pid;		/* lock owner */
	short	l_type;		/* lock type: read/write, etc. */
	short	l_whence;	/* type of l_start */
};

#ifndef _POSIX_SOURCE
/* lock operations for flock(2) */
#define	LOCK_SH		0x01		/* shared file lock */
#define	LOCK_EX		0x02		/* exclusive file lock */
#define	LOCK_NB		0x04		/* don't block when locking */
#define	LOCK_UN		0x08		/* unlock file */
#endif

#ifndef _KERNEL
/* NOTE(review): #include target lost in extraction — recover from original. */
#include

__BEGIN_DECLS
int	open __P((const char *, int, ...));
int	creat __P((const char *, mode_t));
int	fcntl __P((int, int, ...));
#ifndef _POSIX_SOURCE
int	flock __P((int, int));
#endif /* !_POSIX_SOURCE */
__END_DECLS
#endif

#endif /* !_SYS_FCNTL_H_ */

Index: head/sys/sys/file.h
===================================================================
--- head/sys/sys/file.h	(revision 89305)
+++ head/sys/sys/file.h	(revision 89306)
@@ -1,243 +1,272 @@
/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4.
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)file.h 8.3 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _SYS_FILE_H_ #define _SYS_FILE_H_ #ifndef _KERNEL #include #include #endif #ifdef _KERNEL +#include +#include +#include +#include #include struct stat; struct thread; struct uio; struct knote; struct vnode; struct socket; /* * Kernel descriptor table. * One entry for each open kernel vnode and socket. + * + * Below is the list of locks that protects members in struct file. 
+ * + * (fl) filelist_lock + * (f) f_mtx in struct file + * none not locked */ struct file { - LIST_ENTRY(file) f_list;/* list of active files */ - short f_FILLER3; /* (old f_flag) */ + LIST_ENTRY(file) f_list;/* (fl) list of active files */ + short f_gcflag; /* used by thread doing fd garbage collection */ #define DTYPE_VNODE 1 /* file */ #define DTYPE_SOCKET 2 /* communications endpoint */ #define DTYPE_PIPE 3 /* pipe */ #define DTYPE_FIFO 4 /* fifo (named pipe) */ #define DTYPE_KQUEUE 5 /* event queue */ short f_type; /* descriptor type */ - int f_count; /* reference count */ - int f_msgcount; /* references from message queue */ + int f_count; /* (f) reference count */ + int f_msgcount; /* (f) references from message queue */ struct ucred *f_cred; /* credentials associated with descriptor */ struct fileops { int (*fo_read) __P((struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td)); int (*fo_write) __P((struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td)); #define FOF_OFFSET 1 int (*fo_ioctl) __P((struct file *fp, u_long com, caddr_t data, struct thread *td)); int (*fo_poll) __P((struct file *fp, int events, struct ucred *cred, struct thread *td)); int (*fo_kqfilter) __P((struct file *fp, struct knote *kn)); int (*fo_stat) __P((struct file *fp, struct stat *sb, struct thread *td)); int (*fo_close) __P((struct file *fp, struct thread *td)); } *f_ops; int f_seqcount; /* * count of sequential accesses -- cleared * by most seek operations. 
*/ off_t f_nextoff; /* * offset of next expected read or write */ off_t f_offset; caddr_t f_data; /* vnode or socket */ u_int f_flag; /* see fcntl.h */ + struct mtx f_mtx; /* mutex to protect data */ }; #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_FILE); #endif LIST_HEAD(filelist, file); -extern struct filelist filehead; /* head of list of open files */ +extern struct filelist filehead; /* (fl) head of list of open files */ extern struct fileops vnops; extern struct fileops badfileops; extern int maxfiles; /* kernel limit on number of open files */ extern int maxfilesperproc; /* per process limit on number of open files */ -extern int nfiles; /* actual number of open files */ +extern int nfiles; /* (fl) actual number of open files */ +extern struct sx filelist_lock; /* sx to protect filelist and nfiles */ -static __inline void fhold __P((struct file *fp)); +static __inline struct file * fhold __P((struct file *fp)); +static __inline struct file * fhold_locked __P((struct file *fp)); int fget __P((struct thread *td, int fd, struct file **fpp)); int fget_read __P((struct thread *td, int fd, struct file **fpp)); int fget_write __P((struct thread *td, int fd, struct file **fpp)); int fdrop __P((struct file *fp, struct thread *td)); +int fdrop_locked __P((struct file *fp, struct thread *td)); +/* Lock a file. 
*/ +/*#define FILE_LOCK_DEBUG*/ +#ifdef FILE_LOCK_DEBUG +#define FILE_LOCK(f) \ + do { \ + printf("FLCK: %p %s %d\n", &(f)->f_mtx, __FILE__, __LINE__); \ + mtx_lock(&(f)->f_mtx); \ + } while (0) +#define FILE_UNLOCK(f) \ + do { \ + printf("FREL: %p %s %d\n", &(f)->f_mtx, __FILE__, __LINE__); \ + mtx_unlock(&(f)->f_mtx); \ + } while (0) +#else +#define FILE_LOCK(f) mtx_lock(&(f)->f_mtx) +#define FILE_UNLOCK(f) mtx_unlock(&(f)->f_mtx) +#endif +#define FILE_LOCKED(f) mtx_owned(&(f)->f_mtx) +#define FILE_LOCK_ASSERT(f, type) mtx_assert(&(f)->f_mtx, (type)) + int fgetvp __P((struct thread *td, int fd, struct vnode **vpp)); int fgetvp_read __P((struct thread *td, int fd, struct vnode **vpp)); int fgetvp_write __P((struct thread *td, int fd, struct vnode **vpp)); int fgetsock __P((struct thread *td, int fd, struct socket **spp, u_int *fflagp)); void fputsock __P((struct socket *sp)); -static __inline void -fhold(fp) +static __inline struct file * +fhold_locked(fp) struct file *fp; { +#ifdef INVARIANTS + FILE_LOCK_ASSERT(fp, MA_OWNED); +#endif fp->f_count++; + return (fp); } +static __inline struct file * +fhold(fp) + struct file *fp; +{ + + FILE_LOCK(fp); + fhold_locked(fp); + FILE_UNLOCK(fp); + return (fp); +} + static __inline int fo_read __P((struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td)); static __inline int fo_write __P((struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td)); static __inline int fo_ioctl __P((struct file *fp, u_long com, caddr_t data, struct thread *td)); static __inline int fo_poll __P((struct file *fp, int events, struct ucred *cred, struct thread *td)); static __inline int fo_stat __P((struct file *fp, struct stat *sb, struct thread *td)); static __inline int fo_close __P((struct file *fp, struct thread *td)); static __inline int fo_kqfilter __P((struct file *fp, struct knote *kn)); +struct proc; +struct file *ffind_hold(struct thread *, int fd); +struct file *ffind_lock(struct 
thread *, int fd); static __inline int fo_read(fp, uio, cred, flags, td) struct file *fp; struct uio *uio; struct ucred *cred; struct thread *td; int flags; { - int error; - fhold(fp); - error = (*fp->f_ops->fo_read)(fp, uio, cred, flags, td); - fdrop(fp, td); - return (error); + return ((*fp->f_ops->fo_read)(fp, uio, cred, flags, td)); } static __inline int fo_write(fp, uio, cred, flags, td) struct file *fp; struct uio *uio; struct ucred *cred; struct thread *td; int flags; { - int error; - - fhold(fp); - error = (*fp->f_ops->fo_write)(fp, uio, cred, flags, td); - fdrop(fp, td); - return (error); + return ((*fp->f_ops->fo_write)(fp, uio, cred, flags, td)); } static __inline int fo_ioctl(fp, com, data, td) struct file *fp; u_long com; caddr_t data; struct thread *td; { - int error; - - fhold(fp); - error = (*fp->f_ops->fo_ioctl)(fp, com, data, td); - fdrop(fp, td); - return (error); + return ((*fp->f_ops->fo_ioctl)(fp, com, data, td)); } static __inline int fo_poll(fp, events, cred, td) struct file *fp; int events; struct ucred *cred; struct thread *td; { - int error; - - fhold(fp); - error = (*fp->f_ops->fo_poll)(fp, events, cred, td); - fdrop(fp, td); - return (error); + /* select(2) and poll(2) hold file descriptors. 
*/ + return ((*fp->f_ops->fo_poll)(fp, events, cred, td)); } static __inline int fo_stat(fp, sb, td) struct file *fp; struct stat *sb; struct thread *td; { - int error; - - fhold(fp); - error = (*fp->f_ops->fo_stat)(fp, sb, td); - fdrop(fp, td); - return (error); + return ((*fp->f_ops->fo_stat)(fp, sb, td)); } static __inline int fo_close(fp, td) struct file *fp; struct thread *td; { return ((*fp->f_ops->fo_close)(fp, td)); } static __inline int fo_kqfilter(fp, kn) struct file *fp; struct knote *kn; { return ((*fp->f_ops->fo_kqfilter)(fp, kn)); } #endif /* _KERNEL */ #endif /* !SYS_FILE_H */ Index: head/sys/sys/filedesc.h =================================================================== --- head/sys/sys/filedesc.h (revision 89305) +++ head/sys/sys/filedesc.h (revision 89306) @@ -1,150 +1,175 @@ /* * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)filedesc.h 8.1 (Berkeley) 6/2/93 * $FreeBSD$ */ #ifndef _SYS_FILEDESC_H_ #define _SYS_FILEDESC_H_ +#include +#include #include +#include /* * This structure is used for the management of descriptors. It may be * shared by multiple processes. * * A process is initially started out with NDFILE descriptors stored within * this structure, selected to be enough for typical applications based on * the historical limit of 20 open files (and the usage of descriptors by * shells). If these descriptors are exhausted, a larger descriptor table * may be allocated, up to a process' resource limit; the internal arrays * are then unused. The initial expansion is set to NDEXTENT; each time * it runs out, it is doubled until the resource limit is reached. NDEXTENT * should be selected to be the biggest multiple of OFILESIZE (see below) * that will fit in a power-of-two sized piece of memory. */ #define NDFILE 20 #define NDEXTENT 50 /* 250 bytes in 256-byte alloc. 
*/ struct filedesc { struct file **fd_ofiles; /* file structures for open files */ char *fd_ofileflags; /* per-process open file flags */ struct vnode *fd_cdir; /* current directory */ struct vnode *fd_rdir; /* root directory */ struct vnode *fd_jdir; /* jail root directory */ int fd_nfiles; /* number of open files allocated */ - u_short fd_lastfile; /* high-water mark of fd_ofiles */ - u_short fd_freefile; /* approx. next free file */ + int fd_lastfile; /* high-water mark of fd_ofiles */ + int fd_freefile; /* approx. next free file */ u_short fd_cmask; /* mask for file creation */ u_short fd_refcnt; /* reference count */ int fd_knlistsize; /* size of knlist */ struct klist *fd_knlist; /* list of attached knotes */ u_long fd_knhashmask; /* size of knhash */ struct klist *fd_knhash; /* hash table for attached knotes */ + struct mtx fd_mtx; /* mtx to protect the members of struct filedesc */ }; /* * Basic allocation of descriptors: * one of the above, plus arrays for NDFILE descriptors. */ struct filedesc0 { struct filedesc fd_fd; /* * These arrays are used when the number of open files is * <= NDFILE, and are then pointed to by the pointers above. */ struct file *fd_dfiles[NDFILE]; char fd_dfileflags[NDFILE]; }; /* * Per-process open flags. */ #define UF_EXCLOSE 0x01 /* auto-close on exec */ #if 0 #define UF_MAPPED 0x02 /* mapped from device */ #endif /* * Storage required per open file descriptor. */ #define OFILESIZE (sizeof(struct file *) + sizeof(char)) /* * This structure holds the information needed to send a SIGIO or * a SIGURG signal to a process or process group when new data arrives * on a device or socket. The structure is placed on an SLIST belonging * to the proc or pgrp so that the entire list may be revoked when the * process exits or the process group disappears. */ struct sigio { union { struct proc *siu_proc; /* process to receive SIGIO/SIGURG */ struct pgrp *siu_pgrp; /* process group to receive ... 
*/ } sio_u; SLIST_ENTRY(sigio) sio_pgsigio; /* sigio's for process or group */ struct sigio **sio_myref; /* location of the pointer that holds * the reference to this structure */ struct ucred *sio_ucred; /* current credentials */ pid_t sio_pgid; /* pgid for signals */ }; #define sio_proc sio_u.siu_proc #define sio_pgrp sio_u.siu_pgrp SLIST_HEAD(sigiolst, sigio); #ifdef _KERNEL + +/* Lock a file descriptor table. */ +/*#define FILEDESC_LOCK_DEBUG*/ +#ifdef FILEDESC_LOCK_DEBUG +#define FILEDESC_LOCK(fd) \ + do { \ + printf("FD_LCK: %p %s %d\n", &(fd)->fd_mtx, __FILE__, __LINE__); \ + mtx_lock(&(fd)->fd_mtx); \ + } while (0) +#define FILEDESC_UNLOCK(fd) \ + do { \ + printf("FD_REL: %p %s %d\n", &(fd)->fd_mtx, __FILE__, __LINE__); \ + mtx_unlock(&(fd)->fd_mtx); \ + } while (0) +#else +#define FILEDESC_LOCK(fd) mtx_lock(&(fd)->fd_mtx) +#define FILEDESC_UNLOCK(fd) mtx_unlock(&(fd)->fd_mtx) +#endif +#define FILEDESC_LOCKED(fd) mtx_owned(&(fd)->fd_mtx) +#define FILEDESC_LOCK_ASSERT(fd, type) mtx_assert(&(fd)->fd_mtx, (type)) + int closef __P((struct file *fp, struct thread *p)); int dupfdopen __P((struct thread *td, struct filedesc *fdp, int indx, int dfd, int mode, int error)); int falloc __P((struct thread *p, struct file **resultfp, int *resultfd)); int fdalloc __P((struct thread *p, int want, int *result)); int fdavail __P((struct thread *td, int n)); void fdcloseexec __P((struct thread *td)); struct filedesc *fdcopy __P((struct thread *td)); void fdfree __P((struct thread *td)); struct filedesc *fdinit __P((struct thread *td)); struct filedesc *fdshare __P((struct proc *p)); void ffree __P((struct file *fp)); pid_t fgetown __P((struct sigio *sigio)); int fsetown __P((pid_t pgid, struct sigio **sigiop)); void funsetown __P((struct sigio *sigio)); void funsetownlst __P((struct sigiolst *sigiolst)); struct file *holdfp __P((struct filedesc *fdp, int fd, int flag)); int getvnode __P((struct filedesc *fdp, int fd, struct file **fpp)); void setugidsafety __P((struct thread 
*td)); #endif /* _KERNEL */ #endif /* !_SYS_FILEDESC_H_ */ Index: head/sys/ufs/ffs/ffs_alloc.c =================================================================== --- head/sys/ufs/ffs/ffs_alloc.c (revision 89305) +++ head/sys/ufs/ffs/ffs_alloc.c (revision 89306) @@ -1,2046 +1,2049 @@ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_alloc.c 8.18 (Berkeley) 5/26/95 * $FreeBSD$ */ #include "opt_quota.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include typedef ufs_daddr_t allocfcn_t __P((struct inode *ip, int cg, ufs_daddr_t bpref, int size)); static ufs_daddr_t ffs_alloccg __P((struct inode *, int, ufs_daddr_t, int)); static ufs_daddr_t ffs_alloccgblk __P((struct inode *, struct buf *, ufs_daddr_t)); #ifdef DIAGNOSTIC static int ffs_checkblk __P((struct inode *, ufs_daddr_t, long)); #endif static ufs_daddr_t ffs_clusteralloc __P((struct inode *, int, ufs_daddr_t, int)); static ino_t ffs_dirpref __P((struct inode *)); static ufs_daddr_t ffs_fragextend __P((struct inode *, int, long, int, int)); static void ffs_fserr __P((struct fs *, u_int, char *)); static u_long ffs_hashalloc __P((struct inode *, int, long, int, allocfcn_t *)); static ino_t ffs_nodealloccg __P((struct inode *, int, ufs_daddr_t, int)); static ufs_daddr_t ffs_mapsearch __P((struct fs *, struct cg *, ufs_daddr_t, int)); /* * Allocate a block in the file system. * * The size of the requested block is given, which must be some * multiple of fs_fsize and <= fs_bsize. * A preference may be optionally specified. If a preference is given * the following hierarchy is used to allocate a block: * 1) allocate the requested block. * 2) allocate a rotationally optimal block in the same cylinder. 
* 3) allocate a block in the same cylinder group. * 4) quadradically rehash into other cylinder groups, until an * available block is located. * If no block preference is given the following heirarchy is used * to allocate a block: * 1) allocate a block in the cylinder group that contains the * inode for the file. * 2) quadradically rehash into other cylinder groups, until an * available block is located. */ int ffs_alloc(ip, lbn, bpref, size, cred, bnp) register struct inode *ip; ufs_daddr_t lbn, bpref; int size; struct ucred *cred; ufs_daddr_t *bnp; { register struct fs *fs; ufs_daddr_t bno; int cg; #ifdef QUOTA int error; #endif *bnp = 0; fs = ip->i_fs; #ifdef DIAGNOSTIC if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) { printf("dev = %s, bsize = %ld, size = %d, fs = %s\n", devtoname(ip->i_dev), (long)fs->fs_bsize, size, fs->fs_fsmnt); panic("ffs_alloc: bad size"); } if (cred == NOCRED) panic("ffs_alloc: missing credential"); #endif /* DIAGNOSTIC */ if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0) goto nospace; if (suser_xxx(cred, NULL, PRISON_ROOT) && freespace(fs, fs->fs_minfree) - numfrags(fs, size) < 0) goto nospace; #ifdef QUOTA error = chkdq(ip, (long)btodb(size), cred, 0); if (error) return (error); #endif if (bpref >= fs->fs_size) bpref = 0; if (bpref == 0) cg = ino_to_cg(fs, ip->i_number); else cg = dtog(fs, bpref); bno = (ufs_daddr_t)ffs_hashalloc(ip, cg, (long)bpref, size, ffs_alloccg); if (bno > 0) { ip->i_blocks += btodb(size); ip->i_flag |= IN_CHANGE | IN_UPDATE; *bnp = bno; return (0); } #ifdef QUOTA /* * Restore user's disk quota because allocation failed. */ (void) chkdq(ip, (long)-btodb(size), cred, FORCE); #endif nospace: ffs_fserr(fs, cred->cr_uid, "file system full"); uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt); return (ENOSPC); } /* * Reallocate a fragment to a bigger size * * The number and size of the old block is given, and a preference * and new size is also specified. 
The allocator attempts to extend * the original block. Failing that, the regular block allocator is * invoked to get an appropriate block. */ int ffs_realloccg(ip, lbprev, bpref, osize, nsize, cred, bpp) register struct inode *ip; ufs_daddr_t lbprev; ufs_daddr_t bpref; int osize, nsize; struct ucred *cred; struct buf **bpp; { register struct fs *fs; struct buf *bp; int cg, request, error; ufs_daddr_t bprev, bno; *bpp = 0; fs = ip->i_fs; #ifdef DIAGNOSTIC if (ITOV(ip)->v_mount->mnt_kern_flag & MNTK_SUSPENDED) panic("ffs_realloccg: allocation on suspended filesystem"); if ((u_int)osize > fs->fs_bsize || fragoff(fs, osize) != 0 || (u_int)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) { printf( "dev = %s, bsize = %ld, osize = %d, nsize = %d, fs = %s\n", devtoname(ip->i_dev), (long)fs->fs_bsize, osize, nsize, fs->fs_fsmnt); panic("ffs_realloccg: bad size"); } if (cred == NOCRED) panic("ffs_realloccg: missing credential"); #endif /* DIAGNOSTIC */ if (suser_xxx(cred, NULL, PRISON_ROOT) && freespace(fs, fs->fs_minfree) - numfrags(fs, nsize - osize) < 0) goto nospace; if ((bprev = ip->i_db[lbprev]) == 0) { printf("dev = %s, bsize = %ld, bprev = %ld, fs = %s\n", devtoname(ip->i_dev), (long)fs->fs_bsize, (long)bprev, fs->fs_fsmnt); panic("ffs_realloccg: bad bprev"); } /* * Allocate the extra space in the buffer. */ error = bread(ITOV(ip), lbprev, osize, NOCRED, &bp); if (error) { brelse(bp); return (error); } if( bp->b_blkno == bp->b_lblkno) { if( lbprev >= NDADDR) panic("ffs_realloccg: lbprev out of range"); bp->b_blkno = fsbtodb(fs, bprev); } #ifdef QUOTA error = chkdq(ip, (long)btodb(nsize - osize), cred, 0); if (error) { brelse(bp); return (error); } #endif /* * Check for extension in the existing location. 
*/ cg = dtog(fs, bprev); bno = ffs_fragextend(ip, cg, (long)bprev, osize, nsize); if (bno) { if (bp->b_blkno != fsbtodb(fs, bno)) panic("ffs_realloccg: bad blockno"); ip->i_blocks += btodb(nsize - osize); ip->i_flag |= IN_CHANGE | IN_UPDATE; allocbuf(bp, nsize); bp->b_flags |= B_DONE; bzero((char *)bp->b_data + osize, (u_int)nsize - osize); *bpp = bp; return (0); } /* * Allocate a new disk location. */ if (bpref >= fs->fs_size) bpref = 0; switch ((int)fs->fs_optim) { case FS_OPTSPACE: /* * Allocate an exact sized fragment. Although this makes * best use of space, we will waste time relocating it if * the file continues to grow. If the fragmentation is * less than half of the minimum free reserve, we choose * to begin optimizing for time. */ request = nsize; if (fs->fs_minfree <= 5 || fs->fs_cstotal.cs_nffree > (off_t)fs->fs_dsize * fs->fs_minfree / (2 * 100)) break; log(LOG_NOTICE, "%s: optimization changed from SPACE to TIME\n", fs->fs_fsmnt); fs->fs_optim = FS_OPTTIME; break; case FS_OPTTIME: /* * At this point we have discovered a file that is trying to * grow a small fragment to a larger fragment. To save time, * we allocate a full sized block, then free the unused portion. * If the file continues to grow, the `ffs_fragextend' call * above will be able to grow it in place without further * copying. If aberrant programs cause disk fragmentation to * grow within 2% of the free reserve, we choose to begin * optimizing for space. 
*/ request = fs->fs_bsize; if (fs->fs_cstotal.cs_nffree < (off_t)fs->fs_dsize * (fs->fs_minfree - 2) / 100) break; log(LOG_NOTICE, "%s: optimization changed from TIME to SPACE\n", fs->fs_fsmnt); fs->fs_optim = FS_OPTSPACE; break; default: printf("dev = %s, optim = %ld, fs = %s\n", devtoname(ip->i_dev), (long)fs->fs_optim, fs->fs_fsmnt); panic("ffs_realloccg: bad optim"); /* NOTREACHED */ } bno = (ufs_daddr_t)ffs_hashalloc(ip, cg, (long)bpref, request, ffs_alloccg); if (bno > 0) { bp->b_blkno = fsbtodb(fs, bno); if (!DOINGSOFTDEP(ITOV(ip))) ffs_blkfree(ip, bprev, (long)osize); if (nsize < request) ffs_blkfree(ip, bno + numfrags(fs, nsize), (long)(request - nsize)); ip->i_blocks += btodb(nsize - osize); ip->i_flag |= IN_CHANGE | IN_UPDATE; allocbuf(bp, nsize); bp->b_flags |= B_DONE; bzero((char *)bp->b_data + osize, (u_int)nsize - osize); *bpp = bp; return (0); } #ifdef QUOTA /* * Restore user's disk quota because allocation failed. */ (void) chkdq(ip, (long)-btodb(nsize - osize), cred, FORCE); #endif brelse(bp); nospace: /* * no space available */ ffs_fserr(fs, cred->cr_uid, "file system full"); uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt); return (ENOSPC); } /* * Reallocate a sequence of blocks into a contiguous sequence of blocks. * * The vnode and an array of buffer pointers for a range of sequential * logical blocks to be made contiguous is given. The allocator attempts * to find a range of sequential blocks starting as close as possible to * an fs_rotdelay offset from the end of the allocation for the logical * block immediately preceding the current range. If successful, the * physical block numbers in the buffer pointers and in the inode are * changed to reflect the new allocation. If unsuccessful, the allocation * is left unchanged. The success in doing the reallocation is returned. * Note that the error return is not reflected back to the user. Rather * the previous block allocation will be used. 
*/ SYSCTL_NODE(_vfs, OID_AUTO, ffs, CTLFLAG_RW, 0, "FFS filesystem"); static int doasyncfree = 1; SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncfree, CTLFLAG_RW, &doasyncfree, 0, ""); static int doreallocblks = 1; SYSCTL_INT(_vfs_ffs, OID_AUTO, doreallocblks, CTLFLAG_RW, &doreallocblks, 0, ""); #ifdef DEBUG static volatile int prtrealloc = 0; #endif int ffs_reallocblks(ap) struct vop_reallocblks_args /* { struct vnode *a_vp; struct cluster_save *a_buflist; } */ *ap; { struct fs *fs; struct inode *ip; struct vnode *vp; struct buf *sbp, *ebp; ufs_daddr_t *bap, *sbap, *ebap = 0; struct cluster_save *buflist; ufs_daddr_t start_lbn, end_lbn, soff, newblk, blkno; struct indir start_ap[NIADDR + 1], end_ap[NIADDR + 1], *idp; int i, len, start_lvl, end_lvl, pref, ssize; if (doreallocblks == 0) return (ENOSPC); vp = ap->a_vp; ip = VTOI(vp); fs = ip->i_fs; if (fs->fs_contigsumsize <= 0) return (ENOSPC); buflist = ap->a_buflist; len = buflist->bs_nchildren; start_lbn = buflist->bs_children[0]->b_lblkno; end_lbn = start_lbn + len - 1; #ifdef DIAGNOSTIC for (i = 0; i < len; i++) if (!ffs_checkblk(ip, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 1"); for (i = 1; i < len; i++) if (buflist->bs_children[i]->b_lblkno != start_lbn + i) panic("ffs_reallocblks: non-logical cluster"); blkno = buflist->bs_children[0]->b_blkno; ssize = fsbtodb(fs, fs->fs_frag); for (i = 1; i < len - 1; i++) if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize)) panic("ffs_reallocblks: non-physical cluster %d", i); #endif /* * If the latest allocation is in a new cylinder group, assume that * the filesystem has decided to move and do not force it back to * the previous cylinder group. 
*/ if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) != dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno))) return (ENOSPC); if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) || ufs_getlbns(vp, end_lbn, end_ap, &end_lvl)) return (ENOSPC); /* * Get the starting offset and block map for the first block. */ if (start_lvl == 0) { sbap = &ip->i_db[0]; soff = start_lbn; } else { idp = &start_ap[start_lvl - 1]; if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) { brelse(sbp); return (ENOSPC); } sbap = (ufs_daddr_t *)sbp->b_data; soff = idp->in_off; } /* * Find the preferred location for the cluster. */ pref = ffs_blkpref(ip, start_lbn, soff, sbap); /* * If the block range spans two block maps, get the second map. */ if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) { ssize = len; } else { #ifdef DIAGNOSTIC if (start_ap[start_lvl-1].in_lbn == idp->in_lbn) panic("ffs_reallocblk: start == end"); #endif ssize = len - (idp->in_off + 1); if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp)) goto fail; ebap = (ufs_daddr_t *)ebp->b_data; } /* * Search the block map looking for an allocation of the desired size. */ if ((newblk = (ufs_daddr_t)ffs_hashalloc(ip, dtog(fs, pref), (long)pref, len, ffs_clusteralloc)) == 0) goto fail; /* * We have found a new contiguous block. * * First we have to replace the old block pointers with the new * block pointers in the inode and indirect blocks associated * with the file. 
*/ #ifdef DEBUG if (prtrealloc) printf("realloc: ino %d, lbns %d-%d\n\told:", ip->i_number, start_lbn, end_lbn); #endif blkno = newblk; for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) { if (i == ssize) { bap = ebap; soff = -i; } #ifdef DIAGNOSTIC if (!ffs_checkblk(ip, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 2"); if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap) panic("ffs_reallocblks: alloc mismatch"); #endif #ifdef DEBUG if (prtrealloc) printf(" %d,", *bap); #endif if (DOINGSOFTDEP(vp)) { if (sbap == &ip->i_db[0] && i < ssize) softdep_setup_allocdirect(ip, start_lbn + i, blkno, *bap, fs->fs_bsize, fs->fs_bsize, buflist->bs_children[i]); else softdep_setup_allocindir_page(ip, start_lbn + i, i < ssize ? sbp : ebp, soff + i, blkno, *bap, buflist->bs_children[i]); } *bap++ = blkno; } /* * Next we must write out the modified inode and indirect blocks. * For strict correctness, the writes should be synchronous since * the old block values may have been written to disk. In practise * they are almost never written, but if we are concerned about * strict correctness, the `doasyncfree' flag should be set to zero. * * The test on `doasyncfree' should be changed to test a flag * that shows whether the associated buffers and inodes have * been written. The flag should be set when the cluster is * started and cleared whenever the buffer or inode is flushed. * We can then check below to see if it is set, and do the * synchronous write only when it has been cleared. */ if (sbap != &ip->i_db[0]) { if (doasyncfree) bdwrite(sbp); else bwrite(sbp); } else { ip->i_flag |= IN_CHANGE | IN_UPDATE; if (!doasyncfree) UFS_UPDATE(vp, 1); } if (ssize < len) { if (doasyncfree) bdwrite(ebp); else bwrite(ebp); } /* * Last, free the old blocks and assign the new blocks to the buffers. 
*/ #ifdef DEBUG if (prtrealloc) printf("\n\tnew:"); #endif for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) { if (!DOINGSOFTDEP(vp)) ffs_blkfree(ip, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize); buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno); #ifdef DIAGNOSTIC if (!ffs_checkblk(ip, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 3"); #endif #ifdef DEBUG if (prtrealloc) printf(" %d,", blkno); #endif } #ifdef DEBUG if (prtrealloc) { prtrealloc--; printf("\n"); } #endif return (0); fail: if (ssize < len) brelse(ebp); if (sbap != &ip->i_db[0]) brelse(sbp); return (ENOSPC); } /* * Allocate an inode in the file system. * * If allocating a directory, use ffs_dirpref to select the inode. * If allocating in a directory, the following hierarchy is followed: * 1) allocate the preferred inode. * 2) allocate an inode in the same cylinder group. * 3) quadradically rehash into other cylinder groups, until an * available inode is located. * If no inode preference is given the following heirarchy is used * to allocate an inode: * 1) allocate an inode in cylinder group 0. * 2) quadradically rehash into other cylinder groups, until an * available inode is located. */ int ffs_valloc(pvp, mode, cred, vpp) struct vnode *pvp; int mode; struct ucred *cred; struct vnode **vpp; { register struct inode *pip; register struct fs *fs; register struct inode *ip; ino_t ino, ipref; int cg, error; *vpp = NULL; pip = VTOI(pvp); fs = pip->i_fs; if (fs->fs_cstotal.cs_nifree == 0) goto noinodes; if ((mode & IFMT) == IFDIR) ipref = ffs_dirpref(pip); else ipref = pip->i_number; if (ipref >= fs->fs_ncg * fs->fs_ipg) ipref = 0; cg = ino_to_cg(fs, ipref); /* * Track number of dirs created one after another * in a same cg without intervening by files. 
*/ if ((mode & IFMT) == IFDIR) { if (fs->fs_contigdirs[cg] < 255) fs->fs_contigdirs[cg]++; } else { if (fs->fs_contigdirs[cg] > 0) fs->fs_contigdirs[cg]--; } ino = (ino_t)ffs_hashalloc(pip, cg, (long)ipref, mode, (allocfcn_t *)ffs_nodealloccg); if (ino == 0) goto noinodes; error = VFS_VGET(pvp->v_mount, ino, vpp); if (error) { UFS_VFREE(pvp, ino, mode); return (error); } ip = VTOI(*vpp); if (ip->i_mode) { printf("mode = 0%o, inum = %lu, fs = %s\n", ip->i_mode, (u_long)ip->i_number, fs->fs_fsmnt); panic("ffs_valloc: dup alloc"); } if (ip->i_blocks && (fs->fs_flags & FS_UNCLEAN) == 0) { /* XXX */ printf("free inode %s/%lu had %ld blocks\n", fs->fs_fsmnt, (u_long)ino, (long)ip->i_blocks); ip->i_blocks = 0; } ip->i_flags = 0; /* * Set up a new generation number for this inode. */ if (ip->i_gen == 0 || ++ip->i_gen == 0) ip->i_gen = random() / 2 + 1; return (0); noinodes: ffs_fserr(fs, cred->cr_uid, "out of inodes"); uprintf("\n%s: create/symlink failed, no inodes free\n", fs->fs_fsmnt); return (ENOSPC); } /* * Find a cylinder group to place a directory. * * The policy implemented by this algorithm is to allocate a * directory inode in the same cylinder group as its parent * directory, but also to reserve space for its files inodes * and data. Restrict the number of directories which may be * allocated one after another in the same cylinder group * without intervening allocation of files. * * If we allocate a first level directory then force allocation * in another cylinder group. */ static ino_t ffs_dirpref(pip) struct inode *pip; { register struct fs *fs; int cg, prefcg, dirsize, cgsize; int avgifree, avgbfree, avgndir, curdirsize; int minifree, minbfree, maxndir; int mincg, minndir; int maxcontigdirs; fs = pip->i_fs; avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg; avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg; /* * Force allocation in another cg if creating a first level dir. 
*/ if (ITOV(pip)->v_flag & VROOT) { prefcg = arc4random() % fs->fs_ncg; mincg = prefcg; minndir = fs->fs_ipg; for (cg = prefcg; cg < fs->fs_ncg; cg++) if (fs->fs_cs(fs, cg).cs_ndir < minndir && fs->fs_cs(fs, cg).cs_nifree >= avgifree && fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { mincg = cg; minndir = fs->fs_cs(fs, cg).cs_ndir; } for (cg = 0; cg < prefcg; cg++) if (fs->fs_cs(fs, cg).cs_ndir < minndir && fs->fs_cs(fs, cg).cs_nifree >= avgifree && fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { mincg = cg; minndir = fs->fs_cs(fs, cg).cs_ndir; } return ((ino_t)(fs->fs_ipg * mincg)); } /* * Count various limits which used for * optimal allocation of a directory inode. */ maxndir = min(avgndir + fs->fs_ipg / 16, fs->fs_ipg); minifree = avgifree - fs->fs_ipg / 4; if (minifree < 0) minifree = 0; minbfree = avgbfree - fs->fs_fpg / fs->fs_frag / 4; if (minbfree < 0) minbfree = 0; cgsize = fs->fs_fsize * fs->fs_fpg; dirsize = fs->fs_avgfilesize * fs->fs_avgfpdir; curdirsize = avgndir ? (cgsize - avgbfree * fs->fs_bsize) / avgndir : 0; if (dirsize < curdirsize) dirsize = curdirsize; maxcontigdirs = min(cgsize / dirsize, 255); if (fs->fs_avgfpdir > 0) maxcontigdirs = min(maxcontigdirs, fs->fs_ipg / fs->fs_avgfpdir); if (maxcontigdirs == 0) maxcontigdirs = 1; /* * Limit number of dirs in one cg and reserve space for * regular files, but only if we have no deficit in * inodes or space. */ prefcg = ino_to_cg(fs, pip->i_number); for (cg = prefcg; cg < fs->fs_ncg; cg++) if (fs->fs_cs(fs, cg).cs_ndir < maxndir && fs->fs_cs(fs, cg).cs_nifree >= minifree && fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { if (fs->fs_contigdirs[cg] < maxcontigdirs) return ((ino_t)(fs->fs_ipg * cg)); } for (cg = 0; cg < prefcg; cg++) if (fs->fs_cs(fs, cg).cs_ndir < maxndir && fs->fs_cs(fs, cg).cs_nifree >= minifree && fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { if (fs->fs_contigdirs[cg] < maxcontigdirs) return ((ino_t)(fs->fs_ipg * cg)); } /* * This is a backstop when we have deficit in space. 
*/ for (cg = prefcg; cg < fs->fs_ncg; cg++) if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) return ((ino_t)(fs->fs_ipg * cg)); for (cg = 0; cg < prefcg; cg++) if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) break; return ((ino_t)(fs->fs_ipg * cg)); } /* * Select the desired position for the next block in a file. The file is * logically divided into sections. The first section is composed of the * direct blocks. Each additional section contains fs_maxbpg blocks. * * If no blocks have been allocated in the first section, the policy is to * request a block in the same cylinder group as the inode that describes * the file. If no blocks have been allocated in any other section, the * policy is to place the section in a cylinder group with a greater than * average number of free blocks. An appropriate cylinder group is found * by using a rotor that sweeps the cylinder groups. When a new group of * blocks is needed, the sweep begins in the cylinder group following the * cylinder group from which the previous allocation was made. The sweep * continues until a cylinder group with greater than the average number * of free blocks is found. If the allocation is for the first block in an * indirect block, the information on the previous allocation is unavailable; * here a best guess is made based upon the logical block number being * allocated. * * If a section is already partially allocated, the policy is to * contiguously allocate fs_maxcontig blocks. The end of one of these * contiguous blocks and the beginning of the next is physically separated * so that the disk head will be in transit between them for at least * fs_rotdelay milliseconds. This is to allow time for the processor to * schedule another I/O transfer. 
*/ ufs_daddr_t ffs_blkpref(ip, lbn, indx, bap) struct inode *ip; ufs_daddr_t lbn; int indx; ufs_daddr_t *bap; { register struct fs *fs; register int cg; int avgbfree, startcg; ufs_daddr_t nextblk; fs = ip->i_fs; if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { if (lbn < NDADDR + NINDIR(fs)) { cg = ino_to_cg(fs, ip->i_number); return (fs->fs_fpg * cg + fs->fs_frag); } /* * Find a cylinder with greater than average number of * unused data blocks. */ if (indx == 0 || bap[indx - 1] == 0) startcg = ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg; else startcg = dtog(fs, bap[indx - 1]) + 1; startcg %= fs->fs_ncg; avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; for (cg = startcg; cg < fs->fs_ncg; cg++) if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { fs->fs_cgrotor = cg; return (fs->fs_fpg * cg + fs->fs_frag); } for (cg = 0; cg <= startcg; cg++) if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { fs->fs_cgrotor = cg; return (fs->fs_fpg * cg + fs->fs_frag); } return (0); } /* * One or more previous blocks have been laid out. If less * than fs_maxcontig previous blocks are contiguous, the * next block is requested contiguously, otherwise it is * requested rotationally delayed by fs_rotdelay milliseconds. */ nextblk = bap[indx - 1] + fs->fs_frag; if (fs->fs_rotdelay == 0 || indx < fs->fs_maxcontig || bap[indx - fs->fs_maxcontig] + blkstofrags(fs, fs->fs_maxcontig) != nextblk) return (nextblk); /* * Here we convert ms of delay to frags as: * (frags) = (ms) * (rev/sec) * (sect/rev) / * ((sect/frag) * (ms/sec)) * then round up to the next block. */ nextblk += roundup(fs->fs_rotdelay * fs->fs_rps * fs->fs_nsect / (NSPF(fs) * 1000), fs->fs_frag); return (nextblk); } /* * Implement the cylinder overflow algorithm. * * The policy implemented by this algorithm is: * 1) allocate the block in its requested cylinder group. * 2) quadradically rehash on the cylinder group number. * 3) brute force search for a free block. 
*/ /*VARARGS5*/ static u_long ffs_hashalloc(ip, cg, pref, size, allocator) struct inode *ip; int cg; long pref; int size; /* size for data blocks, mode for inodes */ allocfcn_t *allocator; { register struct fs *fs; long result; /* XXX why not same type as we return? */ int i, icg = cg; #ifdef DIAGNOSTIC if (ITOV(ip)->v_mount->mnt_kern_flag & MNTK_SUSPENDED) panic("ffs_hashalloc: allocation on suspended filesystem"); #endif fs = ip->i_fs; /* * 1: preferred cylinder group */ result = (*allocator)(ip, cg, pref, size); if (result) return (result); /* * 2: quadratic rehash */ for (i = 1; i < fs->fs_ncg; i *= 2) { cg += i; if (cg >= fs->fs_ncg) cg -= fs->fs_ncg; result = (*allocator)(ip, cg, 0, size); if (result) return (result); } /* * 3: brute force search * Note that we start at i == 2, since 0 was checked initially, * and 1 is always checked in the quadratic rehash. */ cg = (icg + 2) % fs->fs_ncg; for (i = 2; i < fs->fs_ncg; i++) { result = (*allocator)(ip, cg, 0, size); if (result) return (result); cg++; if (cg == fs->fs_ncg) cg = 0; } return (0); } /* * Determine whether a fragment can be extended. * * Check to see if the necessary fragments are available, and * if they are, allocate them. 
*/ static ufs_daddr_t ffs_fragextend(ip, cg, bprev, osize, nsize) struct inode *ip; int cg; long bprev; int osize, nsize; { register struct fs *fs; register struct cg *cgp; struct buf *bp; long bno; int frags, bbase; int i, error; u_int8_t *blksfree; fs = ip->i_fs; if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize)) return (0); frags = numfrags(fs, nsize); bbase = fragnum(fs, bprev); if (bbase > fragnum(fs, (bprev + frags - 1))) { /* cannot extend across a block boundary */ return (0); } error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, NOCRED, &bp); if (error) { brelse(bp); return (0); } cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp)) { brelse(bp); return (0); } bp->b_xflags |= BX_BKGRDWRITE; cgp->cg_time = time_second; bno = dtogd(fs, bprev); blksfree = cg_blksfree(cgp); for (i = numfrags(fs, osize); i < frags; i++) if (isclr(blksfree, bno + i)) { brelse(bp); return (0); } /* * the current fragment can be extended * deduct the count on fragment being extended into * increase the count on the remaining fragment (if any) * allocate the extended piece */ for (i = frags; i < fs->fs_frag - bbase; i++) if (isclr(blksfree, bno + i)) break; cgp->cg_frsum[i - numfrags(fs, osize)]--; if (i != frags) cgp->cg_frsum[i - frags]++; for (i = numfrags(fs, osize); i < frags; i++) { clrbit(blksfree, bno + i); cgp->cg_cs.cs_nffree--; fs->fs_cstotal.cs_nffree--; fs->fs_cs(fs, cg).cs_nffree--; } fs->fs_fmod = 1; if (DOINGSOFTDEP(ITOV(ip))) softdep_setup_blkmapdep(bp, fs, bprev); if (fs->fs_active != 0) atomic_clear_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg)); bdwrite(bp); return (bprev); } /* * Determine whether a block can be allocated. * * Check to see if a block of the appropriate size is available, * and if it is, allocate it. 
*/ static ufs_daddr_t ffs_alloccg(ip, cg, bpref, size) struct inode *ip; int cg; ufs_daddr_t bpref; int size; { register struct fs *fs; register struct cg *cgp; struct buf *bp; register int i; ufs_daddr_t bno, blkno; int allocsiz, error, frags; u_int8_t *blksfree; fs = ip->i_fs; if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize) return (0); error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, NOCRED, &bp); if (error) { brelse(bp); return (0); } cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp) || (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) { brelse(bp); return (0); } bp->b_xflags |= BX_BKGRDWRITE; cgp->cg_time = time_second; if (size == fs->fs_bsize) { bno = ffs_alloccgblk(ip, bp, bpref); if (fs->fs_active != 0) atomic_clear_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg)); bdwrite(bp); return (bno); } /* * check to see if any fragments are already available * allocsiz is the size which will be allocated, hacking * it down to a smaller size if necessary */ blksfree = cg_blksfree(cgp); frags = numfrags(fs, size); for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++) if (cgp->cg_frsum[allocsiz] != 0) break; if (allocsiz == fs->fs_frag) { /* * no fragments were available, so a block will be * allocated, and hacked up */ if (cgp->cg_cs.cs_nbfree == 0) { brelse(bp); return (0); } bno = ffs_alloccgblk(ip, bp, bpref); bpref = dtogd(fs, bno); for (i = frags; i < fs->fs_frag; i++) setbit(blksfree, bpref + i); i = fs->fs_frag - frags; cgp->cg_cs.cs_nffree += i; fs->fs_cstotal.cs_nffree += i; fs->fs_cs(fs, cg).cs_nffree += i; fs->fs_fmod = 1; cgp->cg_frsum[i]++; if (fs->fs_active != 0) atomic_clear_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg)); bdwrite(bp); return (bno); } bno = ffs_mapsearch(fs, cgp, bpref, allocsiz); if (bno < 0) { brelse(bp); return (0); } for (i = 0; i < frags; i++) clrbit(blksfree, bno + i); cgp->cg_cs.cs_nffree -= frags; fs->fs_cstotal.cs_nffree -= frags; fs->fs_cs(fs, cg).cs_nffree -= frags; fs->fs_fmod = 1; 
cgp->cg_frsum[allocsiz]--; if (frags != allocsiz) cgp->cg_frsum[allocsiz - frags]++; blkno = cg * fs->fs_fpg + bno; if (DOINGSOFTDEP(ITOV(ip))) softdep_setup_blkmapdep(bp, fs, blkno); if (fs->fs_active != 0) atomic_clear_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg)); bdwrite(bp); return ((u_long)blkno); } /* * Allocate a block in a cylinder group. * * This algorithm implements the following policy: * 1) allocate the requested block. * 2) allocate a rotationally optimal block in the same cylinder. * 3) allocate the next available block on the block rotor for the * specified cylinder group. * Note that this routine only allocates fs_bsize blocks; these * blocks may be fragmented by the routine that allocates them. */ static ufs_daddr_t ffs_alloccgblk(ip, bp, bpref) struct inode *ip; struct buf *bp; ufs_daddr_t bpref; { struct fs *fs; struct cg *cgp; ufs_daddr_t bno, blkno; int cylno, pos, delta; short *cylbp; register int i; u_int8_t *blksfree; fs = ip->i_fs; cgp = (struct cg *)bp->b_data; blksfree = cg_blksfree(cgp); if (bpref == 0 || dtog(fs, bpref) != cgp->cg_cgx) { bpref = cgp->cg_rotor; goto norot; } bpref = blknum(fs, bpref); bpref = dtogd(fs, bpref); /* * if the requested block is available, use it */ if (ffs_isblock(fs, blksfree, fragstoblks(fs, bpref))) { bno = bpref; goto gotit; } if (fs->fs_nrpos <= 1 || fs->fs_cpc == 0) { /* * Block layout information is not available. * Leaving bpref unchanged means we take the * next available free block following the one * we just allocated. Hopefully this will at * least hit a track cache on drives of unknown * geometry (e.g. SCSI). */ goto norot; } /* * check for a block available on the same cylinder */ cylno = cbtocylno(fs, bpref); if (cg_blktot(cgp)[cylno] == 0) goto norot; /* * check the summary information to see if a block is * available in the requested cylinder starting at the * requested rotational position and proceeding around. 
*/ cylbp = cg_blks(fs, cgp, cylno); pos = cbtorpos(fs, bpref); for (i = pos; i < fs->fs_nrpos; i++) if (cylbp[i] > 0) break; if (i == fs->fs_nrpos) for (i = 0; i < pos; i++) if (cylbp[i] > 0) break; if (cylbp[i] > 0) { /* * found a rotational position, now find the actual * block. A panic if none is actually there. */ pos = cylno % fs->fs_cpc; bno = (cylno - pos) * fs->fs_spc / NSPB(fs); if (fs_postbl(fs, pos)[i] == -1) { printf("pos = %d, i = %d, fs = %s\n", pos, i, fs->fs_fsmnt); panic("ffs_alloccgblk: cyl groups corrupted"); } for (i = fs_postbl(fs, pos)[i];; ) { if (ffs_isblock(fs, blksfree, bno + i)) { bno = blkstofrags(fs, (bno + i)); goto gotit; } delta = fs_rotbl(fs)[i]; if (delta <= 0 || delta + i > fragstoblks(fs, fs->fs_fpg)) break; i += delta; } printf("pos = %d, i = %d, fs = %s\n", pos, i, fs->fs_fsmnt); panic("ffs_alloccgblk: can't find blk in cyl"); } norot: /* * no blocks in the requested cylinder, so take next * available one in this cylinder group. */ bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag); if (bno < 0) return (0); cgp->cg_rotor = bno; gotit: blkno = fragstoblks(fs, bno); ffs_clrblock(fs, blksfree, (long)blkno); ffs_clusteracct(fs, cgp, blkno, -1); cgp->cg_cs.cs_nbfree--; fs->fs_cstotal.cs_nbfree--; fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--; cylno = cbtocylno(fs, bno); cg_blks(fs, cgp, cylno)[cbtorpos(fs, bno)]--; cg_blktot(cgp)[cylno]--; fs->fs_fmod = 1; blkno = cgp->cg_cgx * fs->fs_fpg + bno; if (DOINGSOFTDEP(ITOV(ip))) softdep_setup_blkmapdep(bp, fs, blkno); return (blkno); } /* * Determine whether a cluster can be allocated. * * We do not currently check for optimal rotational layout if there * are multiple choices in the same cylinder group. Instead we just * take the first one that we find following bpref. 
*/ static ufs_daddr_t ffs_clusteralloc(ip, cg, bpref, len) struct inode *ip; int cg; ufs_daddr_t bpref; int len; { register struct fs *fs; register struct cg *cgp; struct buf *bp; int i, got, run, bno, bit, map; u_char *mapp; int32_t *lp; u_int8_t *blksfree; fs = ip->i_fs; if (fs->fs_maxcluster[cg] < len) return (0); if (bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, NOCRED, &bp)) goto fail; cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp)) goto fail; bp->b_xflags |= BX_BKGRDWRITE; /* * Check to see if a cluster of the needed size (or bigger) is * available in this cylinder group. */ lp = &cg_clustersum(cgp)[len]; for (i = len; i <= fs->fs_contigsumsize; i++) if (*lp++ > 0) break; if (i > fs->fs_contigsumsize) { /* * This is the first time looking for a cluster in this * cylinder group. Update the cluster summary information * to reflect the true maximum sized cluster so that * future cluster allocation requests can avoid reading * the cylinder group map only to find no clusters. */ lp = &cg_clustersum(cgp)[len - 1]; for (i = len - 1; i > 0; i--) if (*lp-- > 0) break; fs->fs_maxcluster[cg] = i; goto fail; } /* * Search the cluster map to find a big enough cluster. * We take the first one that we find, even if it is larger * than we need as we prefer to get one close to the previous * block allocation. We do not search before the current * preference point as we do not want to allocate a block * that is allocated before the previous one (as we will * then have to wait for another pass of the elevator * algorithm before it will be read). We prefer to fail and * be recalled to try an allocation in the next cylinder group. 
*/ if (dtog(fs, bpref) != cg) bpref = 0; else bpref = fragstoblks(fs, dtogd(fs, blknum(fs, bpref))); mapp = &cg_clustersfree(cgp)[bpref / NBBY]; map = *mapp++; bit = 1 << (bpref % NBBY); for (run = 0, got = bpref; got < cgp->cg_nclusterblks; got++) { if ((map & bit) == 0) { run = 0; } else { run++; if (run == len) break; } if ((got & (NBBY - 1)) != (NBBY - 1)) { bit <<= 1; } else { map = *mapp++; bit = 1; } } if (got >= cgp->cg_nclusterblks) goto fail; /* * Allocate the cluster that we have found. */ blksfree = cg_blksfree(cgp); for (i = 1; i <= len; i++) if (!ffs_isblock(fs, blksfree, got - run + i)) panic("ffs_clusteralloc: map mismatch"); bno = cg * fs->fs_fpg + blkstofrags(fs, got - run + 1); if (dtog(fs, bno) != cg) panic("ffs_clusteralloc: allocated out of group"); len = blkstofrags(fs, len); for (i = 0; i < len; i += fs->fs_frag) if ((got = ffs_alloccgblk(ip, bp, bno + i)) != bno + i) panic("ffs_clusteralloc: lost block"); if (fs->fs_active != 0) atomic_clear_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg)); bdwrite(bp); return (bno); fail: brelse(bp); return (0); } /* * Determine whether an inode can be allocated. * * Check to see if an inode is available, and if it is, * allocate it using the following policy: * 1) allocate the requested inode. * 2) allocate the next available inode after the requested * inode in the specified cylinder group. 
*/ static ino_t ffs_nodealloccg(ip, cg, ipref, mode) struct inode *ip; int cg; ufs_daddr_t ipref; int mode; { register struct fs *fs; register struct cg *cgp; struct buf *bp; u_int8_t *inosused; int error, start, len, loc, map, i; fs = ip->i_fs; if (fs->fs_cs(fs, cg).cs_nifree == 0) return (0); error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, NOCRED, &bp); if (error) { brelse(bp); return (0); } cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp) || cgp->cg_cs.cs_nifree == 0) { brelse(bp); return (0); } bp->b_xflags |= BX_BKGRDWRITE; cgp->cg_time = time_second; inosused = cg_inosused(cgp); if (ipref) { ipref %= fs->fs_ipg; if (isclr(inosused, ipref)) goto gotit; } start = cgp->cg_irotor / NBBY; len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY); loc = skpc(0xff, len, &inosused[start]); if (loc == 0) { len = start + 1; start = 0; loc = skpc(0xff, len, &inosused[0]); if (loc == 0) { printf("cg = %d, irotor = %ld, fs = %s\n", cg, (long)cgp->cg_irotor, fs->fs_fsmnt); panic("ffs_nodealloccg: map corrupted"); /* NOTREACHED */ } } i = start + len - loc; map = inosused[i]; ipref = i * NBBY; for (i = 1; i < (1 << NBBY); i <<= 1, ipref++) { if ((map & i) == 0) { cgp->cg_irotor = ipref; goto gotit; } } printf("fs = %s\n", fs->fs_fsmnt); panic("ffs_nodealloccg: block not in map"); /* NOTREACHED */ gotit: if (DOINGSOFTDEP(ITOV(ip))) softdep_setup_inomapdep(bp, ip, cg * fs->fs_ipg + ipref); setbit(inosused, ipref); cgp->cg_cs.cs_nifree--; fs->fs_cstotal.cs_nifree--; fs->fs_cs(fs, cg).cs_nifree--; fs->fs_fmod = 1; if ((mode & IFMT) == IFDIR) { cgp->cg_cs.cs_ndir++; fs->fs_cstotal.cs_ndir++; fs->fs_cs(fs, cg).cs_ndir++; } bdwrite(bp); return (cg * fs->fs_ipg + ipref); } /* * Free a block or fragment. * * The specified block or fragment is placed back in the * free map. If a fragment is deallocated, a possible * block reassembly is checked. 
*/ void ffs_blkfree(ip, bno, size) register struct inode *ip; ufs_daddr_t bno; long size; { register struct fs *fs; register struct cg *cgp; struct buf *bp; ufs_daddr_t fragno, cgbno; int i, error, cg, blk, frags, bbase; u_int8_t *blksfree; #ifdef DIAGNOSTIC struct vnode *vp; #endif fs = ip->i_fs; #ifdef DIAGNOSTIC if ((vp = ITOV(ip)) != NULL && vp->v_mount != NULL && (vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED)) panic("ffs_blkfree: deallocation on suspended filesystem"); if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0 || fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) { printf("dev=%s, bno = %ld, bsize = %ld, size = %ld, fs = %s\n", devtoname(ip->i_dev), (long)bno, (long)fs->fs_bsize, size, fs->fs_fsmnt); panic("ffs_blkfree: bad size"); } #endif if ((ip->i_devvp->v_flag & VCOPYONWRITE) && ffs_snapblkfree(ip, bno, size)) return; VOP_FREEBLKS(ip->i_devvp, fsbtodb(fs, bno), size); cg = dtog(fs, bno); if ((u_int)bno >= fs->fs_size) { printf("bad block %ld, ino %lu\n", (long)bno, (u_long)ip->i_number); ffs_fserr(fs, ip->i_uid, "bad block"); return; } error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, NOCRED, &bp); if (error) { brelse(bp); return; } cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp)) { brelse(bp); return; } bp->b_xflags |= BX_BKGRDWRITE; cgp->cg_time = time_second; cgbno = dtogd(fs, bno); blksfree = cg_blksfree(cgp); if (size == fs->fs_bsize) { fragno = fragstoblks(fs, cgbno); if (!ffs_isfreeblock(fs, blksfree, fragno)) { printf("dev = %s, block = %ld, fs = %s\n", devtoname(ip->i_dev), (long)bno, fs->fs_fsmnt); panic("ffs_blkfree: freeing free block"); } ffs_setblock(fs, blksfree, fragno); ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; fs->fs_cstotal.cs_nbfree++; fs->fs_cs(fs, cg).cs_nbfree++; i = cbtocylno(fs, cgbno); cg_blks(fs, cgp, i)[cbtorpos(fs, cgbno)]++; cg_blktot(cgp)[i]++; } else { bbase = cgbno - fragnum(fs, cgbno); /* * decrement the counts associated with the old frags */ blk = 
blkmap(fs, blksfree, bbase); ffs_fragacct(fs, blk, cgp->cg_frsum, -1); /* * deallocate the fragment */ frags = numfrags(fs, size); for (i = 0; i < frags; i++) { if (isset(blksfree, cgbno + i)) { printf("dev = %s, block = %ld, fs = %s\n", devtoname(ip->i_dev), (long)(bno + i), fs->fs_fsmnt); panic("ffs_blkfree: freeing free frag"); } setbit(blksfree, cgbno + i); } cgp->cg_cs.cs_nffree += i; fs->fs_cstotal.cs_nffree += i; fs->fs_cs(fs, cg).cs_nffree += i; /* * add back in counts associated with the new frags */ blk = blkmap(fs, blksfree, bbase); ffs_fragacct(fs, blk, cgp->cg_frsum, 1); /* * if a complete block has been reassembled, account for it */ fragno = fragstoblks(fs, bbase); if (ffs_isblock(fs, blksfree, fragno)) { cgp->cg_cs.cs_nffree -= fs->fs_frag; fs->fs_cstotal.cs_nffree -= fs->fs_frag; fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag; ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; fs->fs_cstotal.cs_nbfree++; fs->fs_cs(fs, cg).cs_nbfree++; i = cbtocylno(fs, bbase); cg_blks(fs, cgp, i)[cbtorpos(fs, bbase)]++; cg_blktot(cgp)[i]++; } } fs->fs_fmod = 1; if (fs->fs_active != 0) atomic_clear_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg)); bdwrite(bp); } #ifdef DIAGNOSTIC /* * Verify allocation of a block or fragment. Returns true if block or * fragment is allocated, false if it is free. 
*/ static int ffs_checkblk(ip, bno, size) struct inode *ip; ufs_daddr_t bno; long size; { struct fs *fs; struct cg *cgp; struct buf *bp; int i, error, frags, free; u_int8_t *blksfree; fs = ip->i_fs; if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) { printf("bsize = %ld, size = %ld, fs = %s\n", (long)fs->fs_bsize, size, fs->fs_fsmnt); panic("ffs_checkblk: bad size"); } if ((u_int)bno >= fs->fs_size) panic("ffs_checkblk: bad block %d", bno); error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, dtog(fs, bno))), (int)fs->fs_cgsize, NOCRED, &bp); if (error) panic("ffs_checkblk: cg bread failed"); cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp)) panic("ffs_checkblk: cg magic mismatch"); bp->b_xflags |= BX_BKGRDWRITE; blksfree = cg_blksfree(cgp); bno = dtogd(fs, bno); if (size == fs->fs_bsize) { free = ffs_isblock(fs, blksfree, fragstoblks(fs, bno)); } else { frags = numfrags(fs, size); for (free = 0, i = 0; i < frags; i++) if (isset(blksfree, bno + i)) free++; if (free != 0 && free != frags) panic("ffs_checkblk: partially free fragment"); } brelse(bp); return (!free); } #endif /* DIAGNOSTIC */ /* * Free an inode. */ int ffs_vfree(pvp, ino, mode) struct vnode *pvp; ino_t ino; int mode; { if (DOINGSOFTDEP(pvp)) { softdep_freefile(pvp, ino, mode); return (0); } return (ffs_freefile(VTOI(pvp), ino, mode)); } /* * Do the actual free operation. * The specified inode is placed back in the free map. 
*/ int ffs_freefile(pip, ino, mode) struct inode *pip; ino_t ino; int mode; { register struct fs *fs; register struct cg *cgp; struct buf *bp; int error, cg; u_int8_t *inosused; fs = pip->i_fs; if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg) panic("ffs_vfree: range: dev = (%d,%d), ino = %d, fs = %s", major(pip->i_dev), minor(pip->i_dev), ino, fs->fs_fsmnt); cg = ino_to_cg(fs, ino); error = bread(pip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, NOCRED, &bp); if (error) { brelse(bp); return (error); } cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp)) { brelse(bp); return (0); } bp->b_xflags |= BX_BKGRDWRITE; cgp->cg_time = time_second; inosused = cg_inosused(cgp); ino %= fs->fs_ipg; if (isclr(inosused, ino)) { printf("dev = %s, ino = %lu, fs = %s\n", devtoname(pip->i_dev), (u_long)ino + cg * fs->fs_ipg, fs->fs_fsmnt); if (fs->fs_ronly == 0) panic("ffs_vfree: freeing free inode"); } clrbit(inosused, ino); if (ino < cgp->cg_irotor) cgp->cg_irotor = ino; cgp->cg_cs.cs_nifree++; fs->fs_cstotal.cs_nifree++; fs->fs_cs(fs, cg).cs_nifree++; if ((mode & IFMT) == IFDIR) { cgp->cg_cs.cs_ndir--; fs->fs_cstotal.cs_ndir--; fs->fs_cs(fs, cg).cs_ndir--; } fs->fs_fmod = 1; bdwrite(bp); return (0); } /* * Find a block of the specified size in the specified cylinder group. * * It is a panic if a request is made to find a block if none are * available. 
*/ static ufs_daddr_t ffs_mapsearch(fs, cgp, bpref, allocsiz) register struct fs *fs; register struct cg *cgp; ufs_daddr_t bpref; int allocsiz; { ufs_daddr_t bno; int start, len, loc, i; int blk, field, subfield, pos; u_int8_t *blksfree; /* * find the fragment by searching through the free block * map for an appropriate bit pattern */ if (bpref) start = dtogd(fs, bpref) / NBBY; else start = cgp->cg_frotor / NBBY; blksfree = cg_blksfree(cgp); len = howmany(fs->fs_fpg, NBBY) - start; loc = scanc((u_int)len, (u_char *)&blksfree[start], (u_char *)fragtbl[fs->fs_frag], (u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); if (loc == 0) { len = start + 1; start = 0; loc = scanc((u_int)len, (u_char *)&blksfree[0], (u_char *)fragtbl[fs->fs_frag], (u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); if (loc == 0) { printf("start = %d, len = %d, fs = %s\n", start, len, fs->fs_fsmnt); panic("ffs_alloccg: map corrupted"); /* NOTREACHED */ } } bno = (start + len - loc) * NBBY; cgp->cg_frotor = bno; /* * found the byte in the map * sift through the bits to find the selected frag */ for (i = bno + NBBY; bno < i; bno += fs->fs_frag) { blk = blkmap(fs, blksfree, bno); blk <<= 1; field = around[allocsiz]; subfield = inside[allocsiz]; for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) { if ((blk & field) == subfield) return (bno + pos); field <<= 1; subfield <<= 1; } } printf("bno = %lu, fs = %s\n", (u_long)bno, fs->fs_fsmnt); panic("ffs_alloccg: block not in map"); return (-1); } /* * Update the cluster map because of an allocation or free. * * Cnt == 1 means free; cnt == -1 means allocating. */ void ffs_clusteracct(fs, cgp, blkno, cnt) struct fs *fs; struct cg *cgp; ufs_daddr_t blkno; int cnt; { int32_t *sump; int32_t *lp; u_char *freemapp, *mapp; int i, start, end, forw, back, map, bit; if (fs->fs_contigsumsize <= 0) return; freemapp = cg_clustersfree(cgp); sump = cg_clustersum(cgp); /* * Allocate or clear the actual block. 
*/ if (cnt > 0) setbit(freemapp, blkno); else clrbit(freemapp, blkno); /* * Find the size of the cluster going forward. */ start = blkno + 1; end = start + fs->fs_contigsumsize; if (end >= cgp->cg_nclusterblks) end = cgp->cg_nclusterblks; mapp = &freemapp[start / NBBY]; map = *mapp++; bit = 1 << (start % NBBY); for (i = start; i < end; i++) { if ((map & bit) == 0) break; if ((i & (NBBY - 1)) != (NBBY - 1)) { bit <<= 1; } else { map = *mapp++; bit = 1; } } forw = i - start; /* * Find the size of the cluster going backward. */ start = blkno - 1; end = start - fs->fs_contigsumsize; if (end < 0) end = -1; mapp = &freemapp[start / NBBY]; map = *mapp--; bit = 1 << (start % NBBY); for (i = start; i > end; i--) { if ((map & bit) == 0) break; if ((i & (NBBY - 1)) != 0) { bit >>= 1; } else { map = *mapp--; bit = 1 << (NBBY - 1); } } back = start - i; /* * Account for old cluster and the possibly new forward and * back clusters. */ i = back + forw + 1; if (i > fs->fs_contigsumsize) i = fs->fs_contigsumsize; sump[i] += cnt; if (back > 0) sump[back] -= cnt; if (forw > 0) sump[forw] -= cnt; /* * Update cluster summary information. */ lp = &sump[fs->fs_contigsumsize]; for (i = fs->fs_contigsumsize; i > 0; i--) if (*lp-- > 0) break; fs->fs_maxcluster[cgp->cg_cgx] = i; } /* * Fserr prints the name of a file system with an error diagnostic. * * The form of the error message is: * fs: error message */ static void ffs_fserr(fs, uid, cp) struct fs *fs; u_int uid; char *cp; { struct proc *p = curproc; /* XXX */ log(LOG_ERR, "pid %d (%s), uid %d on %s: %s\n", p ? p->p_pid : -1, p ? p->p_comm : "-", uid, fs->fs_fsmnt, cp); } /* * This function provides the capability for the fsck program to * update an active filesystem. Six operations are provided: * * adjrefcnt(inode, amt) - adjusts the reference count on the * specified inode by the specified amount. Under normal * operation the count should always go down. Decrementing * the count to zero will cause the inode to be freed. 
* adjblkcnt(inode, amt) - adjust the number of blocks used to * by the specifed amount. * freedirs(inode, count) - directory inodes [inode..inode + count - 1] * are marked as free. Inodes should never have to be marked * as in use. * freefiles(inode, count) - file inodes [inode..inode + count - 1] * are marked as free. Inodes should never have to be marked * as in use. * freeblks(blockno, size) - blocks [blockno..blockno + size - 1] * are marked as free. Blocks should never have to be marked * as in use. * setflags(flags, set/clear) - the fs_flags field has the specified * flags set (second parameter +1) or cleared (second parameter -1). */ static int sysctl_ffs_fsck __P((SYSCTL_HANDLER_ARGS)); SYSCTL_PROC(_vfs_ffs, FFS_ADJ_REFCNT, adjrefcnt, CTLFLAG_WR|CTLTYPE_STRUCT, 0, 0, sysctl_ffs_fsck, "S,fsck", "Adjust Inode Reference Count"); SYSCTL_NODE(_vfs_ffs, FFS_ADJ_BLKCNT, adjblkcnt, CTLFLAG_WR, sysctl_ffs_fsck, "Adjust Inode Used Blocks Count"); SYSCTL_NODE(_vfs_ffs, FFS_DIR_FREE, freedirs, CTLFLAG_WR, sysctl_ffs_fsck, "Free Range of Directory Inodes"); SYSCTL_NODE(_vfs_ffs, FFS_FILE_FREE, freefiles, CTLFLAG_WR, sysctl_ffs_fsck, "Free Range of File Inodes"); SYSCTL_NODE(_vfs_ffs, FFS_BLK_FREE, freeblks, CTLFLAG_WR, sysctl_ffs_fsck, "Free Range of Blocks"); SYSCTL_NODE(_vfs_ffs, FFS_SET_FLAGS, setflags, CTLFLAG_WR, sysctl_ffs_fsck, "Change Filesystem Flags"); #ifdef DEBUG static int fsckcmds = 0; SYSCTL_INT(_debug, OID_AUTO, fsckcmds, CTLFLAG_RW, &fsckcmds, 0, ""); #endif /* DEBUG */ static int sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS) { struct fsck_cmd cmd; struct inode tip; struct ufsmount *ump; struct vnode *vp; struct inode *ip; struct mount *mp; struct fs *fs; ufs_daddr_t blkno; long blkcnt, blksize; struct file *fp; int filetype, error; if (req->newlen > sizeof cmd) return (EBADRPC); if ((error = SYSCTL_IN(req, &cmd, sizeof cmd)) != 0) return (error); if (cmd.version != FFS_CMD_VERSION) return (ERPCMISMATCH); if ((error = getvnode(curproc->p_fd, cmd.handle, &fp)) != 
0) return (error); vn_start_write((struct vnode *)fp->f_data, &mp, V_WAIT); if (mp == 0 || strncmp(mp->mnt_stat.f_fstypename, "ufs", MFSNAMELEN)) { vn_finished_write(mp); + fdrop(fp, curthread); return (EINVAL); } if (mp->mnt_flag & MNT_RDONLY) { vn_finished_write(mp); + fdrop(fp, curthread); return (EROFS); } ump = VFSTOUFS(mp); fs = ump->um_fs; filetype = IFREG; switch (oidp->oid_number) { case FFS_SET_FLAGS: #ifdef DEBUG if (fsckcmds) printf("%s: %s flags\n", mp->mnt_stat.f_mntonname, cmd.size > 0 ? "set" : "clear"); #endif /* DEBUG */ if (cmd.size > 0) fs->fs_flags |= (long)cmd.value; else fs->fs_flags &= ~(long)cmd.value; break; case FFS_ADJ_REFCNT: #ifdef DEBUG if (fsckcmds) { printf("%s: adjust inode %d count by %ld\n", mp->mnt_stat.f_mntonname, (ino_t)cmd.value, cmd.size); } #endif /* DEBUG */ if ((error = VFS_VGET(mp, (ino_t)cmd.value, &vp)) != 0) break; ip = VTOI(vp); ip->i_nlink += cmd.size; ip->i_effnlink += cmd.size; ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(vp)) softdep_change_linkcnt(ip); vput(vp); break; case FFS_ADJ_BLKCNT: #ifdef DEBUG if (fsckcmds) { printf("%s: adjust inode %d block count by %ld\n", mp->mnt_stat.f_mntonname, (ino_t)cmd.value, cmd.size); } #endif /* DEBUG */ if ((error = VFS_VGET(mp, (ino_t)cmd.value, &vp)) != 0) break; ip = VTOI(vp); ip->i_blocks += cmd.size; ip->i_flag |= IN_CHANGE; vput(vp); break; case FFS_DIR_FREE: filetype = IFDIR; /* fall through */ case FFS_FILE_FREE: #ifdef DEBUG if (fsckcmds) { if (cmd.size == 1) printf("%s: free %s inode %d\n", mp->mnt_stat.f_mntonname, filetype == IFDIR ? "directory" : "file", (ino_t)cmd.value); else printf("%s: free %s inodes %d-%d\n", mp->mnt_stat.f_mntonname, filetype == IFDIR ? 
"directory" : "file", (ino_t)cmd.value, (ino_t)(cmd.value + cmd.size - 1)); } #endif /* DEBUG */ tip.i_devvp = ump->um_devvp; tip.i_dev = ump->um_dev; tip.i_fs = fs; while (cmd.size > 0) { if ((error = ffs_freefile(&tip, cmd.value, filetype))) break; cmd.size -= 1; cmd.value += 1; } break; case FFS_BLK_FREE: #ifdef DEBUG if (fsckcmds) { if (cmd.size == 1) printf("%s: free block %d\n", mp->mnt_stat.f_mntonname, (ufs_daddr_t)cmd.value); else printf("%s: free blocks %d-%ld\n", mp->mnt_stat.f_mntonname, (ufs_daddr_t)cmd.value, (ufs_daddr_t)cmd.value + cmd.size - 1); } #endif /* DEBUG */ tip.i_number = ROOTINO; tip.i_devvp = ump->um_devvp; tip.i_dev = ump->um_dev; tip.i_fs = fs; tip.i_size = cmd.size * fs->fs_fsize; tip.i_uid = 0; tip.i_vnode = NULL; blkno = (ufs_daddr_t)cmd.value; blkcnt = cmd.size; blksize = fs->fs_frag - (blkno % fs->fs_frag); while (blkcnt > 0) { if (blksize > blkcnt) blksize = blkcnt; ffs_blkfree(&tip, blkno, blksize * fs->fs_fsize); blkno += blksize; blkcnt -= blksize; blksize = fs->fs_frag; } break; default: #ifdef DEBUG if (fsckcmds) { printf("Invalid request %d from fsck\n", oidp->oid_number); } #endif /* DEBUG */ error = EINVAL; break; } + fdrop(fp, curthread); vn_finished_write(mp); return (error); } Index: head/sys/vm/vm_mmap.c =================================================================== --- head/sys/vm/vm_mmap.c (revision 89305) +++ head/sys/vm/vm_mmap.c (revision 89306) @@ -1,1284 +1,1281 @@ /* * Copyright (c) 1988 University of Utah. * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$ * * @(#)vm_mmap.c 8.4 (Berkeley) 1/12/94 * $FreeBSD$ */ /* * Mapped file (mmap) interface to VM */ #include "opt_bleed.h" #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef _SYS_SYSPROTO_H_ struct sbrk_args { int incr; }; #endif static int max_proc_mmap; SYSCTL_INT(_vm, OID_AUTO, max_proc_mmap, CTLFLAG_RW, &max_proc_mmap, 0, ""); /* * Set the maximum number of vm_map_entry structures per process. Roughly * speaking vm_map_entry structures are tiny, so allowing them to eat 1/100 * of our KVM malloc space still results in generous limits. We want a * default that is good enough to prevent the kernel running out of resources * if attacked from compromised user account but generous enough such that * multi-threaded processes are not unduly inconvenienced. 
*/ static void vmmapentry_rsrc_init __P((void *)); SYSINIT(vmmersrc, SI_SUB_KVM_RSRC, SI_ORDER_FIRST, vmmapentry_rsrc_init, NULL) static void vmmapentry_rsrc_init(dummy) void *dummy; { max_proc_mmap = vm_kmem_size / sizeof(struct vm_map_entry); max_proc_mmap /= 100; } /* * MPSAFE */ /* ARGSUSED */ int sbrk(td, uap) struct thread *td; struct sbrk_args *uap; { /* Not yet implemented */ /* mtx_lock(&Giant); */ /* mtx_unlock(&Giant); */ return (EOPNOTSUPP); } #ifndef _SYS_SYSPROTO_H_ struct sstk_args { int incr; }; #endif /* * MPSAFE */ /* ARGSUSED */ int sstk(td, uap) struct thread *td; struct sstk_args *uap; { /* Not yet implemented */ /* mtx_lock(&Giant); */ /* mtx_unlock(&Giant); */ return (EOPNOTSUPP); } #if defined(COMPAT_43) || defined(COMPAT_SUNOS) #ifndef _SYS_SYSPROTO_H_ struct getpagesize_args { int dummy; }; #endif /* ARGSUSED */ int ogetpagesize(td, uap) struct thread *td; struct getpagesize_args *uap; { /* MP SAFE */ td->td_retval[0] = PAGE_SIZE; return (0); } #endif /* COMPAT_43 || COMPAT_SUNOS */ /* * Memory Map (mmap) system call. Note that the file offset * and address are allowed to be NOT page aligned, though if * the MAP_FIXED flag it set, both must have the same remainder * modulo the PAGE_SIZE (POSIX 1003.1b). If the address is not * page-aligned, the actual mapping starts at trunc_page(addr) * and the return value is adjusted up by the page offset. * * Generally speaking, only character devices which are themselves * memory-based, such as a video framebuffer, can be mmap'd. Otherwise * there would be no cache coherency between a descriptor and a VM mapping * both to the same character device. * * Block devices can be mmap'd no matter what they represent. Cache coherency * is maintained as long as you do not write directly to the underlying * character device. 
*/ #ifndef _SYS_SYSPROTO_H_ struct mmap_args { void *addr; size_t len; int prot; int flags; int fd; long pad; off_t pos; }; #endif /* * MPSAFE */ int mmap(td, uap) struct thread *td; struct mmap_args *uap; { - struct filedesc *fdp = td->td_proc->p_fd; struct file *fp = NULL; struct vnode *vp; vm_offset_t addr; vm_size_t size, pageoff; vm_prot_t prot, maxprot; void *handle; int flags, error; int disablexworkaround; off_t pos; struct vmspace *vms = td->td_proc->p_vmspace; vm_object_t obj; addr = (vm_offset_t) uap->addr; size = uap->len; prot = uap->prot & VM_PROT_ALL; flags = uap->flags; pos = uap->pos; + fp = NULL; /* make sure mapping fits into numeric range etc */ if ((ssize_t) uap->len < 0 || ((flags & MAP_ANON) && uap->fd != -1)) return (EINVAL); if (flags & MAP_STACK) { if ((uap->fd != -1) || ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE))) return (EINVAL); flags |= MAP_ANON; pos = 0; } /* * Align the file position to a page boundary, * and save its page offset component. */ pageoff = (pos & PAGE_MASK); pos -= pageoff; /* Adjust size for rounding (on both ends). */ size += pageoff; /* low end... */ size = (vm_size_t) round_page(size); /* hi end */ /* * Check for illegal addresses. Watch out for address wrap... Note * that VM_*_ADDRESS are not constants due to casts (argh). */ if (flags & MAP_FIXED) { /* * The specified address must have the same remainder * as the file offset taken modulo PAGE_SIZE, so it * should be aligned after adjustment by pageoff. */ addr -= pageoff; if (addr & PAGE_MASK) return (EINVAL); /* Address range must be all in user VM space. */ if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS) return (EINVAL); #ifndef i386 if (VM_MIN_ADDRESS > 0 && addr < VM_MIN_ADDRESS) return (EINVAL); #endif if (addr + size < addr) return (EINVAL); } /* * XXX for non-fixed mappings where no hint is provided or * the hint would fall in the potential heap space, * place it after the end of the largest possible heap. 
* * There should really be a pmap call to determine a reasonable * location. */ else if (addr == 0 || (addr >= round_page((vm_offset_t)vms->vm_taddr) && addr < round_page((vm_offset_t)vms->vm_daddr + maxdsiz))) addr = round_page((vm_offset_t)vms->vm_daddr + maxdsiz); mtx_lock(&Giant); /* syscall marked mp-safe but isn't */ if (flags & MAP_ANON) { /* * Mapping blank space is trivial. */ handle = NULL; maxprot = VM_PROT_ALL; pos = 0; } else { /* * Mapping file, get fp for validation. Obtain vnode and make * sure it is of appropriate type. + * don't let the descriptor disappear on us if we block */ - if (((unsigned) uap->fd) >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[uap->fd]) == NULL) { + fp = ffind_hold(td, uap->fd); + if (fp == NULL) { error = EBADF; - goto done2; + goto done; } if (fp->f_type != DTYPE_VNODE) { error = EINVAL; - goto done2; + goto done; } /* - * don't let the descriptor disappear on us if we block - */ - fhold(fp); - - /* * POSIX shared-memory objects are defined to have * kernel persistence, and are not defined to support * read(2)/write(2) -- or even open(2). Thus, we can * use MAP_ASYNC to trade on-disk coherence for speed. * The shm_open(3) library routine turns on the FPOSIXSHM * flag to request this behavior. */ if (fp->f_flag & FPOSIXSHM) flags |= MAP_NOSYNC; vp = (struct vnode *) fp->f_data; if (vp->v_type != VREG && vp->v_type != VCHR) { error = EINVAL; goto done; } if (vp->v_type == VREG) { /* * Get the proper underlying object */ if (VOP_GETVOBJECT(vp, &obj) != 0) { error = EINVAL; goto done; } vp = (struct vnode*)obj->handle; } /* * XXX hack to handle use of /dev/zero to map anon memory (ala * SunOS). */ if ((vp->v_type == VCHR) && (vp->v_rdev->si_devsw->d_flags & D_MMAP_ANON)) { handle = NULL; maxprot = VM_PROT_ALL; flags |= MAP_ANON; pos = 0; } else { /* * cdevs does not provide private mappings of any kind. */ /* * However, for XIG X server to continue to work, * we should allow the superuser to do it anyway. 
* We only allow it at securelevel < 1. * (Because the XIG X server writes directly to video * memory via /dev/mem, it should never work at any * other securelevel. * XXX this will have to go */ if (securelevel_ge(td->td_proc->p_ucred, 1)) disablexworkaround = 1; else disablexworkaround = suser_td(td); if (vp->v_type == VCHR && disablexworkaround && (flags & (MAP_PRIVATE|MAP_COPY))) { error = EINVAL; goto done; } /* * Ensure that file and memory protections are * compatible. Note that we only worry about * writability if mapping is shared; in this case, * current and max prot are dictated by the open file. * XXX use the vnode instead? Problem is: what * credentials do we use for determination? What if * proc does a setuid? */ maxprot = VM_PROT_EXECUTE; /* ??? */ if (fp->f_flag & FREAD) { maxprot |= VM_PROT_READ; } else if (prot & PROT_READ) { error = EACCES; goto done; } /* * If we are sharing potential changes (either via * MAP_SHARED or via the implicit sharing of character * device mappings), and we are trying to get write * permission although we opened it without asking * for it, bail out. Check for superuser, only if * we're at securelevel < 1, to allow the XIG X server * to continue to work. */ if ((flags & MAP_SHARED) != 0 || (vp->v_type == VCHR && disablexworkaround)) { if ((fp->f_flag & FWRITE) != 0) { struct vattr va; if ((error = VOP_GETATTR(vp, &va, td->td_proc->p_ucred, td))) { goto done; } if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) == 0) { maxprot |= VM_PROT_WRITE; } else if (prot & PROT_WRITE) { error = EPERM; goto done; } } else if ((prot & PROT_WRITE) != 0) { error = EACCES; goto done; } } else { maxprot |= VM_PROT_WRITE; } handle = (void *)vp; } } /* * Do not allow more then a certain number of vm_map_entry structures * per process. Scale with the number of rforks sharing the map * to make the limit reasonable for threads. 
*/ if (max_proc_mmap && vms->vm_map.nentries >= max_proc_mmap * vms->vm_refcnt) { error = ENOMEM; goto done; } mtx_unlock(&Giant); error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot, flags, handle, pos); if (error == 0) td->td_retval[0] = (register_t) (addr + pageoff); mtx_lock(&Giant); done: if (fp) fdrop(fp, td); -done2: mtx_unlock(&Giant); return (error); } #ifdef COMPAT_43 #ifndef _SYS_SYSPROTO_H_ struct ommap_args { caddr_t addr; int len; int prot; int flags; int fd; long pos; }; #endif int ommap(td, uap) struct thread *td; struct ommap_args *uap; { struct mmap_args nargs; static const char cvtbsdprot[8] = { 0, PROT_EXEC, PROT_WRITE, PROT_EXEC | PROT_WRITE, PROT_READ, PROT_EXEC | PROT_READ, PROT_WRITE | PROT_READ, PROT_EXEC | PROT_WRITE | PROT_READ, }; #define OMAP_ANON 0x0002 #define OMAP_COPY 0x0020 #define OMAP_SHARED 0x0010 #define OMAP_FIXED 0x0100 nargs.addr = uap->addr; nargs.len = uap->len; nargs.prot = cvtbsdprot[uap->prot & 0x7]; nargs.flags = 0; if (uap->flags & OMAP_ANON) nargs.flags |= MAP_ANON; if (uap->flags & OMAP_COPY) nargs.flags |= MAP_COPY; if (uap->flags & OMAP_SHARED) nargs.flags |= MAP_SHARED; else nargs.flags |= MAP_PRIVATE; if (uap->flags & OMAP_FIXED) nargs.flags |= MAP_FIXED; nargs.fd = uap->fd; nargs.pos = uap->pos; return (mmap(td, &nargs)); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct msync_args { void *addr; int len; int flags; }; #endif /* * MPSAFE */ int msync(td, uap) struct thread *td; struct msync_args *uap; { vm_offset_t addr; vm_size_t size, pageoff; int flags; vm_map_t map; int rv; addr = (vm_offset_t) uap->addr; size = uap->len; flags = uap->flags; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return(EINVAL); if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE)) return (EINVAL); mtx_lock(&Giant); map = &td->td_proc->p_vmspace->vm_map; /* * XXX Gak! 
If size is zero we are supposed to sync "all modified * pages with the region containing addr". Unfortunately, we don't * really keep track of individual mmaps so we approximate by flushing * the range of the map entry containing addr. This can be incorrect * if the region splits or is coalesced with a neighbor. */ if (size == 0) { vm_map_entry_t entry; vm_map_lock_read(map); rv = vm_map_lookup_entry(map, addr, &entry); vm_map_unlock_read(map); if (rv == FALSE) { rv = -1; goto done2; } addr = entry->start; size = entry->end - entry->start; } /* * Clean the pages and interpret the return value. */ rv = vm_map_clean(map, addr, addr + size, (flags & MS_ASYNC) == 0, (flags & MS_INVALIDATE) != 0); done2: mtx_unlock(&Giant); switch (rv) { case KERN_SUCCESS: return(0); case KERN_INVALID_ADDRESS: return (EINVAL); /* Sun returns ENOMEM? */ case KERN_FAILURE: return (EIO); default: return (EINVAL); } } #ifndef _SYS_SYSPROTO_H_ struct munmap_args { void *addr; size_t len; }; #endif /* * MPSAFE */ int munmap(td, uap) struct thread *td; struct munmap_args *uap; { vm_offset_t addr; vm_size_t size, pageoff; vm_map_t map; addr = (vm_offset_t) uap->addr; size = uap->len; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return(EINVAL); if (size == 0) return (0); /* * Check for illegal addresses. Watch out for address wrap... Note * that VM_*_ADDRESS are not constants due to casts (argh). */ if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS) return (EINVAL); #ifndef i386 if (VM_MIN_ADDRESS > 0 && addr < VM_MIN_ADDRESS) return (EINVAL); #endif mtx_lock(&Giant); map = &td->td_proc->p_vmspace->vm_map; /* * Make sure entire range is allocated. 
*/ if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE)) { mtx_unlock(&Giant); return (EINVAL); } /* returns nothing but KERN_SUCCESS anyway */ (void) vm_map_remove(map, addr, addr + size); mtx_unlock(&Giant); return (0); } #if 0 void munmapfd(td, fd) struct thread *td; int fd; { /* * XXX should unmap any regions mapped to this file */ + FILEDESC_LOCK(p->p_fd); td->td_proc->p_fd->fd_ofileflags[fd] &= ~UF_MAPPED; + FILEDESC_UNLOCK(p->p_fd); } #endif #ifndef _SYS_SYSPROTO_H_ struct mprotect_args { const void *addr; size_t len; int prot; }; #endif /* * MPSAFE */ int mprotect(td, uap) struct thread *td; struct mprotect_args *uap; { vm_offset_t addr; vm_size_t size, pageoff; vm_prot_t prot; int ret; addr = (vm_offset_t) uap->addr; size = uap->len; prot = uap->prot & VM_PROT_ALL; #if defined(VM_PROT_READ_IS_EXEC) if (prot & VM_PROT_READ) prot |= VM_PROT_EXECUTE; #endif pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return(EINVAL); mtx_lock(&Giant); ret = vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr, addr + size, prot, FALSE); mtx_unlock(&Giant); switch (ret) { case KERN_SUCCESS: return (0); case KERN_PROTECTION_FAILURE: return (EACCES); } return (EINVAL); } #ifndef _SYS_SYSPROTO_H_ struct minherit_args { void *addr; size_t len; int inherit; }; #endif /* * MPSAFE */ int minherit(td, uap) struct thread *td; struct minherit_args *uap; { vm_offset_t addr; vm_size_t size, pageoff; vm_inherit_t inherit; int ret; addr = (vm_offset_t)uap->addr; size = uap->len; inherit = uap->inherit; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return(EINVAL); mtx_lock(&Giant); ret = vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr, addr+size, inherit); mtx_unlock(&Giant); switch (ret) { case KERN_SUCCESS: return (0); case KERN_PROTECTION_FAILURE: return (EACCES); } return (EINVAL); } #ifndef _SYS_SYSPROTO_H_ 
struct madvise_args { void *addr; size_t len; int behav; }; #endif /* * MPSAFE */ /* ARGSUSED */ int madvise(td, uap) struct thread *td; struct madvise_args *uap; { vm_offset_t start, end; int ret; /* * Check for illegal behavior */ if (uap->behav < 0 || uap->behav > MADV_CORE) return (EINVAL); /* * Check for illegal addresses. Watch out for address wrap... Note * that VM_*_ADDRESS are not constants due to casts (argh). */ if (VM_MAXUSER_ADDRESS > 0 && ((vm_offset_t) uap->addr + uap->len) > VM_MAXUSER_ADDRESS) return (EINVAL); #ifndef i386 if (VM_MIN_ADDRESS > 0 && uap->addr < VM_MIN_ADDRESS) return (EINVAL); #endif if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr) return (EINVAL); /* * Since this routine is only advisory, we default to conservative * behavior. */ start = trunc_page((vm_offset_t) uap->addr); end = round_page((vm_offset_t) uap->addr + uap->len); mtx_lock(&Giant); ret = vm_map_madvise(&td->td_proc->p_vmspace->vm_map, start, end, uap->behav); mtx_unlock(&Giant); return (ret ? EINVAL : 0); } #ifndef _SYS_SYSPROTO_H_ struct mincore_args { const void *addr; size_t len; char *vec; }; #endif /* * MPSAFE */ /* ARGSUSED */ int mincore(td, uap) struct thread *td; struct mincore_args *uap; { vm_offset_t addr, first_addr; vm_offset_t end, cend; pmap_t pmap; vm_map_t map; char *vec; int error = 0; int vecindex, lastvecindex; vm_map_entry_t current; vm_map_entry_t entry; int mincoreinfo; unsigned int timestamp; /* * Make sure that the addresses presented are valid for user * mode. 
*/ first_addr = addr = trunc_page((vm_offset_t) uap->addr); end = addr + (vm_size_t)round_page(uap->len); if (VM_MAXUSER_ADDRESS > 0 && end > VM_MAXUSER_ADDRESS) return (EINVAL); if (end < addr) return (EINVAL); /* * Address of byte vector */ vec = uap->vec; mtx_lock(&Giant); map = &td->td_proc->p_vmspace->vm_map; pmap = vmspace_pmap(td->td_proc->p_vmspace); vm_map_lock_read(map); RestartScan: timestamp = map->timestamp; if (!vm_map_lookup_entry(map, addr, &entry)) entry = entry->next; /* * Do this on a map entry basis so that if the pages are not * in the current processes address space, we can easily look * up the pages elsewhere. */ lastvecindex = -1; for (current = entry; (current != &map->header) && (current->start < end); current = current->next) { /* * ignore submaps (for now) or null objects */ if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) || current->object.vm_object == NULL) continue; /* * limit this scan to the current map entry and the * limits for the mincore call */ if (addr < current->start) addr = current->start; cend = current->end; if (cend > end) cend = end; /* * scan this entry one page at a time */ while (addr < cend) { /* * Check pmap first, it is likely faster, also * it can provide info as to whether we are the * one referencing or modifying the page. */ mincoreinfo = pmap_mincore(pmap, addr); if (!mincoreinfo) { vm_pindex_t pindex; vm_ooffset_t offset; vm_page_t m; /* * calculate the page index into the object */ offset = current->offset + (addr - current->start); pindex = OFF_TO_IDX(offset); m = vm_page_lookup(current->object.vm_object, pindex); /* * if the page is resident, then gather information about * it. */ if (m) { mincoreinfo = MINCORE_INCORE; if (m->dirty || pmap_is_modified(m)) mincoreinfo |= MINCORE_MODIFIED_OTHER; if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) { vm_page_flag_set(m, PG_REFERENCED); mincoreinfo |= MINCORE_REFERENCED_OTHER; } } } /* * subyte may page fault. 
In case it needs to modify * the map, we release the lock. */ vm_map_unlock_read(map); /* * calculate index into user supplied byte vector */ vecindex = OFF_TO_IDX(addr - first_addr); /* * If we have skipped map entries, we need to make sure that * the byte vector is zeroed for those skipped entries. */ while ((lastvecindex + 1) < vecindex) { error = subyte( vec + lastvecindex, 0); if (error) { error = EFAULT; goto done2; } ++lastvecindex; } /* * Pass the page information to the user */ error = subyte( vec + vecindex, mincoreinfo); if (error) { error = EFAULT; goto done2; } /* * If the map has changed, due to the subyte, the previous * output may be invalid. */ vm_map_lock_read(map); if (timestamp != map->timestamp) goto RestartScan; lastvecindex = vecindex; addr += PAGE_SIZE; } } /* * subyte may page fault. In case it needs to modify * the map, we release the lock. */ vm_map_unlock_read(map); /* * Zero the last entries in the byte vector. */ vecindex = OFF_TO_IDX(end - first_addr); while ((lastvecindex + 1) < vecindex) { error = subyte( vec + lastvecindex, 0); if (error) { error = EFAULT; goto done2; } ++lastvecindex; } /* * If the map has changed, due to the subyte, the previous * output may be invalid. 
*/ vm_map_lock_read(map); if (timestamp != map->timestamp) goto RestartScan; vm_map_unlock_read(map); done2: mtx_unlock(&Giant); return (error); } #ifndef _SYS_SYSPROTO_H_ struct mlock_args { const void *addr; size_t len; }; #endif /* * MPSAFE */ int mlock(td, uap) struct thread *td; struct mlock_args *uap; { vm_offset_t addr; vm_size_t size, pageoff; int error; addr = (vm_offset_t) uap->addr; size = uap->len; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); /* disable wrap around */ if (addr + size < addr) return (EINVAL); if (atop(size) + cnt.v_wire_count > vm_page_max_wired) return (EAGAIN); #ifdef pmap_wired_count if (size + ptoa(pmap_wired_count(vm_map_pmap(&td->td_proc->p_vmspace->vm_map))) > td->td_proc->p_rlimit[RLIMIT_MEMLOCK].rlim_cur) return (ENOMEM); #else error = suser_td(td); if (error) return (error); #endif mtx_lock(&Giant); error = vm_map_user_pageable(&td->td_proc->p_vmspace->vm_map, addr, addr + size, FALSE); mtx_unlock(&Giant); return (error == KERN_SUCCESS ? 
0 : ENOMEM); } #ifndef _SYS_SYSPROTO_H_ struct mlockall_args { int how; }; #endif /* * MPSAFE */ int mlockall(td, uap) struct thread *td; struct mlockall_args *uap; { /* mtx_lock(&Giant); */ /* mtx_unlock(&Giant); */ return 0; } #ifndef _SYS_SYSPROTO_H_ struct mlockall_args { int how; }; #endif /* * MPSAFE */ int munlockall(td, uap) struct thread *td; struct munlockall_args *uap; { /* mtx_lock(&Giant); */ /* mtx_unlock(&Giant); */ return 0; } #ifndef _SYS_SYSPROTO_H_ struct munlock_args { const void *addr; size_t len; }; #endif /* * MPSAFE */ int munlock(td, uap) struct thread *td; struct munlock_args *uap; { vm_offset_t addr; vm_size_t size, pageoff; int error; addr = (vm_offset_t) uap->addr; size = uap->len; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); /* disable wrap around */ if (addr + size < addr) return (EINVAL); #ifndef pmap_wired_count error = suser_td(td); if (error) return (error); #endif mtx_lock(&Giant); error = vm_map_user_pageable(&td->td_proc->p_vmspace->vm_map, addr, addr + size, TRUE); mtx_unlock(&Giant); return (error == KERN_SUCCESS ? 0 : ENOMEM); } /* * vm_mmap() * * MPSAFE * * Internal version of mmap. Currently used by mmap, exec, and sys5 * shared memory. Handle is either a vnode pointer or NULL for MAP_ANON. */ int vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t maxprot, int flags, void *handle, vm_ooffset_t foff) { boolean_t fitit; vm_object_t object; struct vnode *vp = NULL; objtype_t type; int rv = KERN_SUCCESS; vm_ooffset_t objsize; int docow; struct thread *td = curthread; if (size == 0) return (0); objsize = size = round_page(size); /* * We currently can only deal with page aligned file offsets. 
* The check is here rather than in the syscall because the * kernel calls this function internally for other mmaping * operations (such as in exec) and non-aligned offsets will * cause pmap inconsistencies...so we want to be sure to * disallow this in all cases. */ if (foff & PAGE_MASK) return (EINVAL); if ((flags & MAP_FIXED) == 0) { fitit = TRUE; *addr = round_page(*addr); mtx_lock(&Giant); } else { if (*addr != trunc_page(*addr)) return (EINVAL); fitit = FALSE; mtx_lock(&Giant); (void) vm_map_remove(map, *addr, *addr + size); } /* * Lookup/allocate object. */ if (flags & MAP_ANON) { type = OBJT_DEFAULT; /* * Unnamed anonymous regions always start at 0. */ if (handle == 0) foff = 0; } else { vp = (struct vnode *) handle; if (vp->v_type == VCHR) { type = OBJT_DEVICE; handle = (void *)(intptr_t)vp->v_rdev; } else { struct vattr vat; int error; error = VOP_GETATTR(vp, &vat, td->td_proc->p_ucred, td); if (error) { mtx_unlock(&Giant); return (error); } objsize = round_page(vat.va_size); type = OBJT_VNODE; /* * if it is a regular file without any references * we do not need to sync it. */ if (vp->v_type == VREG && vat.va_nlink == 0) { flags |= MAP_NOSYNC; } } } if (handle == NULL) { object = NULL; docow = 0; } else { object = vm_pager_allocate(type, handle, objsize, prot, foff); if (object == NULL) { mtx_unlock(&Giant); return (type == OBJT_DEVICE ? EINVAL : ENOMEM); } docow = MAP_PREFAULT_PARTIAL; } /* * Force device mappings to be shared. 
*/ if (type == OBJT_DEVICE || type == OBJT_PHYS) { flags &= ~(MAP_PRIVATE|MAP_COPY); flags |= MAP_SHARED; } if ((flags & (MAP_ANON|MAP_SHARED)) == 0) docow |= MAP_COPY_ON_WRITE; if (flags & MAP_NOSYNC) docow |= MAP_DISABLE_SYNCER; if (flags & MAP_NOCORE) docow |= MAP_DISABLE_COREDUMP; #if defined(VM_PROT_READ_IS_EXEC) if (prot & VM_PROT_READ) prot |= VM_PROT_EXECUTE; if (maxprot & VM_PROT_READ) maxprot |= VM_PROT_EXECUTE; #endif if (fitit) *addr = pmap_addr_hint(object, *addr, size); if (flags & MAP_STACK) rv = vm_map_stack (map, *addr, size, prot, maxprot, docow); else rv = vm_map_find(map, object, foff, addr, size, fitit, prot, maxprot, docow); if (rv != KERN_SUCCESS) { /* * Lose the object reference. Will destroy the * object if it's an unnamed anonymous mapping * or named anonymous without other references. */ vm_object_deallocate(object); } else if (flags & MAP_SHARED) { /* * Shared memory is also shared with children. */ rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE); if (rv != KERN_SUCCESS) (void) vm_map_remove(map, *addr, *addr + size); } mtx_unlock(&Giant); switch (rv) { case KERN_SUCCESS: return (0); case KERN_INVALID_ADDRESS: case KERN_NO_SPACE: return (ENOMEM); case KERN_PROTECTION_FAILURE: return (EACCES); default: return (EINVAL); } }