diff --git a/lib/libc/sys/open.2 b/lib/libc/sys/open.2 --- a/lib/libc/sys/open.2 +++ b/lib/libc/sys/open.2 @@ -28,7 +28,7 @@ .\" @(#)open.2 8.2 (Berkeley) 11/16/93 .\" $FreeBSD$ .\" -.Dd February 23, 2021 +.Dd March 18, 2021 .Dt OPEN 2 .Os .Sh NAME @@ -168,6 +168,7 @@ O_CLOEXEC set FD_CLOEXEC upon open O_VERIFY verify the contents of the file O_RESOLVE_BENEATH path resolution must not cross the fd directory +O_PATH record only the target path in the opened descriptor .Ed .Pp Opening a file with @@ -316,6 +317,50 @@ .Fn *at family of functions. .Pp +.Dv O_PATH +returns a file descriptor that can be used as a directory file descriptor for +.Xr openat 2 +and other system calls taking a file descriptor argument, such as +.Xr fstatat 2 +and others. +Otherwise, the functionality of the returned file descriptor is limited to +descriptor-level operations. +It can be used for +.Bl -tag -width SCM_RIGHTS -offset indent -compact +.It Xr fcntl 2 +but advisory locking is not allowed +.It Xr dup 2 +.It Xr close 2 +.It Xr fstat 2 +.It Xr fexecve 2 +requires that +.Dv O_EXEC +was also specified at open time +.It Dv SCM_RIGHTS +can be passed over a +.Xr unix 4 +socket using a +.Dv SCM_RIGHTS +message +.It Xr kqueue 2 +for use with +.Dv EVFILT_VNODE +.El +However, operations like +.Xr read 2 , +.Xr ftruncate 2 , +and any others that operate on the file and not on the descriptor (except +.Xr fstat 2 ), +are not allowed. +A file opened with the +.Dv O_PATH +flag does not prevent a non-forced unmount of the volume it belongs to. +See also the description of the +.Dv AT_EMPTY_PATH +flag for +.Xr fstatat 2 +and related syscalls. +.Pp If successful, .Fn open returns a non-negative integer, termed a file descriptor.
diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c --- a/sys/kern/kern_descrip.c +++ b/sys/kern/kern_descrip.c @@ -61,6 +61,7 @@ #include #include #include +#include #include #include #include @@ -610,7 +611,7 @@ error = fget_unlocked(fdp, fd, &cap_flock_rights, &fp); if (error != 0) break; - if (fp->f_type != DTYPE_VNODE) { + if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) { error = EBADF; fdrop(fp, td); break; @@ -715,7 +716,7 @@ error = fget_unlocked(fdp, fd, &cap_flock_rights, &fp); if (error != 0) break; - if (fp->f_type != DTYPE_VNODE) { + if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) { error = EBADF; fdrop(fp, td); break; @@ -3424,7 +3425,7 @@ error = EINVAL; } else { *vpp = fp->f_vnode; - vrefact(*vpp); + vref(*vpp); } fdrop(fp, td); @@ -3460,7 +3461,7 @@ *havecaps = caps; *vpp = fp->f_vnode; - vrefact(*vpp); + vref(*vpp); fdrop(fp, td); return (0); @@ -3544,7 +3545,7 @@ error = fget(td, uap->fd, &cap_flock_rights, &fp); if (error != 0) return (error); - if (fp->f_type != DTYPE_VNODE) { + if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) { fdrop(fp, td); return (EOPNOTSUPP); } @@ -4960,6 +4961,38 @@ .fo_fill_kinfo = badfo_fill_kinfo, }; +static int +path_poll(struct file *fp, int events, struct ucred *active_cred, + struct thread *td) +{ + return (POLLNVAL); +} + +static int +path_close(struct file *fp, struct thread *td) +{ + MPASS(fp->f_type == DTYPE_VNODE); + fp->f_ops = &badfileops; + vdrop(fp->f_vnode); + return (0); +} + +struct fileops path_fileops = { + .fo_read = badfo_readwrite, + .fo_write = badfo_readwrite, + .fo_truncate = badfo_truncate, + .fo_ioctl = badfo_ioctl, + .fo_poll = path_poll, + .fo_kqfilter = vn_kqfilter_opath, + .fo_stat = vn_statfile, + .fo_close = path_close, + .fo_chmod = badfo_chmod, + .fo_chown = badfo_chown, + .fo_sendfile = badfo_sendfile, + .fo_fill_kinfo = vn_fill_kinfo, + .fo_flags = DFLAG_PASSABLE, +}; + int invfo_rdwr(struct file *fp, struct uio *uio, struct ucred
*active_cred, int flags, struct thread *td) diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c --- a/sys/kern/vfs_lookup.c +++ b/sys/kern/vfs_lookup.c @@ -360,8 +360,10 @@ if (cnp->cn_flags & AUDITVNODE2) AUDIT_ARG_ATFD2(ndp->ni_dirfd); /* - * Effectively inlined fgetvp_rights, because we need to - * inspect the file as well as grabbing the vnode. + * Effectively inlined fgetvp_rights, because + * we need to inspect the file as well as + * grabbing the vnode. O_PATH files are not + * rejected here, to implement O_PATH semantics. */ error = fget_cap(td, ndp->ni_dirfd, &rights, &dfp, &ndp->ni_filecaps); @@ -378,7 +380,7 @@ error = ENOTDIR; } else { *dpp = dfp->f_vnode; - vrefact(*dpp); + vref(*dpp); if ((dfp->f_flag & FSEARCH) != 0) cnp->cn_flags |= NOEXECCHECK; diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c --- a/sys/kern/vfs_syscalls.c +++ b/sys/kern/vfs_syscalls.c @@ -373,7 +373,7 @@ int error; AUDIT_ARG_FD(fd); - error = getvnode(td, fd, &cap_fstatfs_rights, &fp); + error = getvnode_path(td, fd, &cap_fstatfs_rights, &fp); if (error != 0) return (error); vp = fp->f_vnode; @@ -889,12 +889,12 @@ int error; AUDIT_ARG_FD(uap->fd); - error = getvnode(td, uap->fd, &cap_fchdir_rights, + error = getvnode_path(td, uap->fd, &cap_fchdir_rights, &fp); if (error != 0) return (error); vp = fp->f_vnode; - vrefact(vp); + vref(vp); fdrop(fp, td); vn_lock(vp, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(vp); @@ -1021,9 +1021,10 @@ static __inline void flags_to_rights(int flags, cap_rights_t *rightsp) { - if (flags & O_EXEC) { cap_rights_set_one(rightsp, CAP_FEXECVE); + if (flags & O_PATH) + return; } else { switch ((flags & O_ACCMODE)) { case O_RDONLY: @@ -1110,11 +1111,15 @@ AUDIT_ARG_MODE(mode); cap_rights_init_one(&rights, CAP_LOOKUP); flags_to_rights(flags, &rights); + /* * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags - * may be specified. + * may be specified. On the other hand, for O_PATH any mode + * except O_EXEC is ignored.
*/ - if (flags & O_EXEC) { + if ((flags & O_PATH) != 0) { + flags &= ~(O_CREAT | O_ACCMODE); + } else if ((flags & O_EXEC) != 0) { if (flags & O_ACCMODE) return (EINVAL); } else if ((flags & O_ACCMODE) == O_ACCMODE) { @@ -1174,14 +1179,22 @@ * files that switched type in the cdevsw fdopen() method. */ fp->f_vnode = vp; + /* * If the file wasn't claimed by devfs bind it to the normal * vnode operations here. */ if (fp->f_ops == &badfileops) { - KASSERT(vp->v_type != VFIFO, + flags |= fp->f_flag & FKQALLOWED; + KASSERT(vp->v_type != VFIFO || (flags & O_PATH) != 0, ("Unexpected fifo fp %p vp %p", fp, vp)); - finit_vnode(fp, flags, NULL, &vnops); + if ((flags & O_PATH) != 0) { + finit_vnode(fp, flags, NULL, &path_fileops); + vhold(vp); + vunref(vp); + } else { + finit_vnode(fp, flags, NULL, &vnops); + } } VOP_UNLOCK(vp); @@ -1862,7 +1875,7 @@ fp = NULL; if (fd != FD_NONE) { - error = getvnode(td, fd, &cap_no_rights, &fp); + error = getvnode_path(td, fd, &cap_no_rights, &fp); if (error != 0) return (error); } @@ -1881,8 +1894,8 @@ if (vp->v_type == VDIR && oldinum == 0) { error = EPERM; /* POSIX */ } else if (oldinum != 0 && - ((error = VOP_STAT(vp, &sb, td->td_ucred, NOCRED, td)) == 0) && - sb.st_ino != oldinum) { + ((error = VOP_STAT(vp, &sb, td->td_ucred, NOCRED, td)) == 0) && + sb.st_ino != oldinum) { error = EIDRM; /* Identifier removed */ } else if (fp != NULL && fp->f_vnode != vp) { if (VN_IS_DOOMED(fp->f_vnode)) @@ -3818,8 +3831,8 @@ fp = NULL; if (fd != FD_NONE) { - error = getvnode(td, fd, cap_rights_init_one(&rights, CAP_LOOKUP), - &fp); + error = getvnode(td, fd, cap_rights_init_one(&rights, + CAP_LOOKUP), &fp); if (error != 0) return (error); } @@ -4230,12 +4243,13 @@ } /* - * Convert a user file descriptor to a kernel file entry and check that, if it - * is a capability, the correct rights are present. A reference on the file - * entry is held upon returning. + * This variant of getvnode() allows O_PATH files.
The caller must + * ensure that the returned file and vnode are only used in ways + * compatible with O_PATH semantics. */ int -getvnode(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) +getvnode_path(struct thread *td, int fd, cap_rights_t *rightsp, + struct file **fpp) { struct file *fp; int error; @@ -4260,10 +4274,35 @@ fdrop(fp, td); return (EINVAL); } + *fpp = fp; return (0); } +/* + * Convert a user file descriptor to a kernel file entry and check that, + * if it is a capability, the correct rights are present. O_PATH files are + * rejected with EBADF. A reference on the file entry is held on success. + */ +int +getvnode(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) +{ + int error; + + error = getvnode_path(td, fd, rightsp, fpp); + + /* + * Filter out O_PATH file descriptors; most getvnode() callers + * do not call fo_ methods. + */ + if (error == 0 && (*fpp)->f_ops == &path_fileops) { + fdrop(*fpp, td); + error = EBADF; + } + + return (error); +} + /* * Get an (NFS) file handle. */ diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -102,7 +102,6 @@ static fo_ioctl_t vn_ioctl; static fo_poll_t vn_poll; static fo_kqfilter_t vn_kqfilter; -static fo_stat_t vn_statfile; static fo_close_t vn_closefile; static fo_mmap_t vn_mmap; static fo_fallocate_t vn_fallocate; @@ -386,31 +385,38 @@ accmode_t accmode; int error; - if (vp->v_type == VLNK) - return (EMLINK); + if (vp->v_type == VLNK) { + if ((fmode & O_PATH) == 0 || (fmode & FEXEC) != 0) + return (EMLINK); + } if (vp->v_type == VSOCK) return (EOPNOTSUPP); if (vp->v_type != VDIR && fmode & O_DIRECTORY) return (ENOTDIR); + accmode = 0; - if (fmode & (FWRITE | O_TRUNC)) { - if (vp->v_type == VDIR) - return (EISDIR); - accmode |= VWRITE; + if ((fmode & O_PATH) == 0) { + if ((fmode & (FWRITE | O_TRUNC)) != 0) { + if (vp->v_type == VDIR) + return (EISDIR); + accmode |= VWRITE; + } + if ((fmode & FREAD) != 0) + accmode |= VREAD; + if ((fmode & O_APPEND) && (fmode &
FWRITE)) + accmode |= VAPPEND; +#ifdef MAC + if ((fmode & O_CREAT) != 0) + accmode |= VCREAT; +#endif } - if (fmode & FREAD) - accmode |= VREAD; - if (fmode & FEXEC) + if ((fmode & FEXEC) != 0) accmode |= VEXEC; - if ((fmode & O_APPEND) && (fmode & FWRITE)) - accmode |= VAPPEND; #ifdef MAC - if (fmode & O_CREAT) - accmode |= VCREAT; - if (fmode & O_VERIFY) + if ((fmode & O_VERIFY) != 0) accmode |= VVERIFY; error = mac_vnode_check_open(cred, vp, accmode); - if (error) + if (error != 0) return (error); accmode &= ~(VCREAT | VVERIFY); @@ -420,6 +426,13 @@ if (error != 0) return (error); } + if ((fmode & O_PATH) != 0) { + error = VOP_ACCESS(vp, VREAD, cred, td); + if (error == 0) + fp->f_flag |= FKQALLOWED; + return (0); + } + if (vp->v_type == VFIFO && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) vn_lock(vp, LK_UPGRADE | LK_RETRY); error = VOP_OPEN(vp, fmode, cred, td, fp); @@ -1616,7 +1629,7 @@ /* * File table vnode stat routine. */ -static int +int vn_statfile(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { @@ -1775,7 +1788,7 @@ vp = fp->f_vnode; fp->f_ops = &badfileops; - ref= (fp->f_flag & FHASLOCK) != 0 && fp->f_type == DTYPE_VNODE; + ref = (fp->f_flag & FHASLOCK) != 0 && fp->f_type == DTYPE_VNODE; error = vn_close1(vp, fp->f_flag, fp->f_cred, td, ref); @@ -2130,6 +2143,14 @@ return (VOP_KQFILTER(fp->f_vnode, kn)); } +int +vn_kqfilter_opath(struct file *fp, struct knote *kn) +{ + if ((fp->f_flag & FKQALLOWED) == 0) + return (EBADF); + return (vn_kqfilter(fp, kn)); +} + /* * Simplified in-kernel wrapper calls for extended attribute access. * Both calls pass in a NULL credential, authorizing as "kernel" access.
diff --git a/sys/sys/fcntl.h b/sys/sys/fcntl.h --- a/sys/sys/fcntl.h +++ b/sys/sys/fcntl.h @@ -135,7 +135,7 @@ #if __BSD_VISIBLE #define O_VERIFY 0x00200000 /* open only after verification */ -/* #define O_UNUSED1 0x00400000 */ /* Was O_BENEATH */ +#define O_PATH 0x00400000 /* fd is only a path */ #define O_RESOLVE_BENEATH 0x00800000 /* Do not allow name resolution to walk out of cwd */ #endif @@ -153,13 +153,17 @@ #define FREVOKE O_VERIFY /* Only for fo_close() from half-succeeded open */ #define FOPENFAILED O_TTY_INIT +/* Only for O_PATH files which passed the VOP_ACCESS(VREAD) check on open */ +#define FKQALLOWED O_RESOLVE_BENEATH /* convert from open() flags to/from fflags; convert O_RD/WR to FREAD/FWRITE */ #define FFLAGS(oflags) ((oflags) & O_EXEC ? (oflags) : (oflags) + 1) -#define OFLAGS(fflags) ((fflags) & O_EXEC ? (fflags) : (fflags) - 1) +#define OFLAGS(fflags) \ + (((fflags) & (O_EXEC | O_PATH)) != 0 ? (fflags) : (fflags) - 1) /* bits to save after open */ -#define FMASK (FREAD|FWRITE|FAPPEND|FASYNC|FFSYNC|FDSYNC|FNONBLOCK|O_DIRECT|FEXEC) +#define FMASK (FREAD|FWRITE|FAPPEND|FASYNC|FFSYNC|FDSYNC|FNONBLOCK| \ + O_DIRECT|FEXEC|O_PATH) /* bits settable by fcntl(F_SETFL, ...)
*/ #define FCNTLFLAGS (FAPPEND|FASYNC|FFSYNC|FDSYNC|FNONBLOCK|FRDAHEAD|O_DIRECT) diff --git a/sys/sys/file.h b/sys/sys/file.h --- a/sys/sys/file.h +++ b/sys/sys/file.h @@ -239,6 +239,7 @@ extern struct fileops vnops; extern struct fileops badfileops; +extern struct fileops path_fileops; extern struct fileops socketops; extern int maxfiles; /* kernel limit on number of open files */ extern int maxfilesperproc; /* per process limit on number of open files */ @@ -262,10 +263,11 @@ fo_chmod_t invfo_chmod; fo_chown_t invfo_chown; fo_sendfile_t invfo_sendfile; - +fo_stat_t vn_statfile; fo_sendfile_t vn_sendfile; fo_seek_t vn_seek; fo_fill_kinfo_t vn_fill_kinfo; +fo_kqfilter_t vn_kqfilter_opath; int vn_fill_kinfo_vnode(struct vnode *vp, struct kinfo_file *kif); void finit(struct file *, u_int, short, void *, struct fileops *); diff --git a/sys/sys/filedesc.h b/sys/sys/filedesc.h --- a/sys/sys/filedesc.h +++ b/sys/sys/filedesc.h @@ -265,6 +265,8 @@ struct filedesc *fdp, struct proc *leader); int getvnode(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp); +int getvnode_path(struct thread *td, int fd, cap_rights_t *rightsp, + struct file **fpp); void mountcheckdirs(struct vnode *olddp, struct vnode *newdp); int fget_cap_locked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,