Allow ".." path components in capability-mode (strictrelative) lookups when
the vfs.lookup_cap_dotdot sysctl is set (it defaults to 0, keeping the old
behaviour of refusing every ".." with ENOTCAPABLE).

When enabled, namei() sets ndp->ni_cap_dotdot and lookup() records each
directory it walks in ndp->ni_cap_tracker, holding it with vhold().  A ".."
step is accepted only if the resulting directory is already on that list;
otherwise, or if the result lives on a mount without MNT_LOCAL, the lookup
fails with ENOTCAPABLE.  The list is drained (vdrop + uma_zfree) on both the
success and error exits of namei().

Review notes:
  * nt_zone is created with UMA_ALIGN_PTR, the same alignment argument that
    namei_zone uses; uma_zcreate() expects an alignment mask here, not a
    byte count such as sizeof(void *).
  * The "Handle .." comment now enumerates cases 0-5, so its count is
    updated from five to six.

Index: sys/kern/vfs_lookup.c
===================================================================
--- sys/kern/vfs_lookup.c
+++ sys/kern/vfs_lookup.c
@@ -73,21 +73,28 @@
     "unsigned long");
 SDT_PROBE_DEFINE2(vfs, namei, lookup, return, "int", "struct vnode *");
 
-/*
- * Allocation zone for namei
- */
+/* Allocation zone for namei. */
 uma_zone_t namei_zone;
-/*
- * Placeholder vnode for mp traversal
- */
+
+/* Placeholder vnode for mp traversal. */
 static struct vnode *vp_crossmp;
 
+struct nameicap_tracker {
+	struct vnode *dp;
+	TAILQ_ENTRY(nameicap_tracker) nm_link;
+};
+
+/* Zone for cap mode tracker elements used for dotdot capability checks. */
+static uma_zone_t nt_zone;
+
 static void
 nameiinit(void *dummy __unused)
 {
 
 	namei_zone = uma_zcreate("NAMEI", MAXPATHLEN, NULL, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, 0);
+	nt_zone = uma_zcreate("rentr", sizeof(struct nameicap_tracker),
+	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	getnewvnode("crossmp", NULL, &dead_vnodeops, &vp_crossmp);
 	vn_lock(vp_crossmp, LK_EXCLUSIVE);
 	VN_LOCK_ASHARE(vp_crossmp);
@@ -98,10 +105,68 @@
 static int lookup_shared = 1;
 SYSCTL_INT(_vfs, OID_AUTO, lookup_shared, CTLFLAG_RWTUN, &lookup_shared, 0,
     "Enables/Disables shared locks for path name translation");
+static int lookup_cap_dotdot = 0;
+SYSCTL_INT(_vfs, OID_AUTO, lookup_cap_dotdot, CTLFLAG_RWTUN,
+    &lookup_cap_dotdot, 0,
+    "Enables/Disables use of \"..\" components for capability mode lookups");
+
+static void
+nameicap_tracker_add(struct nameidata *ndp, struct vnode *dp)
+{
+	struct nameicap_tracker *nt;
+
+	if (ndp->ni_cap_dotdot == 0 || dp->v_type != VDIR)
+		return;
+	nt = uma_zalloc(nt_zone, M_WAITOK);
+	vhold(dp);
+	nt->dp = dp;
+	TAILQ_INSERT_TAIL(&ndp->ni_cap_tracker, nt, nm_link);
+}
+
+static void
+nameicap_cleanup(struct nameidata *ndp)
+{
+	struct nameicap_tracker *nt, *nt1;
+
+	KASSERT(TAILQ_EMPTY(&ndp->ni_cap_tracker) ||
+	    ndp->ni_cap_dotdot != 0, ("not strictrelative"));
+	TAILQ_FOREACH_SAFE(nt, &ndp->ni_cap_tracker, nm_link, nt1) {
+		TAILQ_REMOVE(&ndp->ni_cap_tracker, nt, nm_link);
+		vdrop(nt->dp);
+		uma_zfree(nt_zone, nt);
+	}
+}
+
+/*
+ * For dotdot lookups in capability mode, only allow the component
+ * lookup to succeed if the resulting directory was already traversed
+ * during the operation.  Also fail dotdot lookups for non-local
+ * filesystems, where external agents might assist local lookups to
+ * escape the compartment.
+ */
+static int
+nameicap_check_dotdot(struct nameidata *ndp, struct vnode *dp)
+{
+	struct nameicap_tracker *nt;
+	struct mount *mp;
+
+	if (ndp->ni_cap_dotdot == 0 || dp == NULL || dp->v_type != VDIR)
+		return (0);
+	mp = dp->v_mount;
+	if (mp != NULL && (mp->mnt_flag & MNT_LOCAL) == 0)
+		return (ENOTCAPABLE);
+	TAILQ_FOREACH_REVERSE(nt, &ndp->ni_cap_tracker, nameicap_tracker_head,
+	    nm_link) {
+		if (dp == nt->dp)
+			return (0);
+	}
+	return (ENOTCAPABLE);
+}
 
 static void
 namei_cleanup_cnp(struct componentname *cnp)
 {
+
 	uma_zfree(namei_zone, cnp->cn_pnbuf);
 #ifdef DIAGNOSTIC
 	cnp->cn_pnbuf = NULL;
@@ -158,12 +223,16 @@
 	char *cp;		/* pointer into pathname argument */
 	struct vnode *dp;	/* the directory we are searching */
 	struct iovec aiov;		/* uio for reading symbolic links */
+	struct componentname *cnp;
+	struct thread *td;
+	struct proc *p;
+	cap_rights_t rights;
 	struct uio auio;
 	int error, linklen, startdir_used;
-	struct componentname *cnp = &ndp->ni_cnd;
-	struct thread *td = cnp->cn_thread;
-	struct proc *p = td->td_proc;
 
+	cnp = &ndp->ni_cnd;
+	td = cnp->cn_thread;
+	p = td->td_proc;
 	ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_thread->td_ucred;
 	KASSERT(cnp->cn_cred && p, ("namei: bad cred/proc"));
 	KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0,
@@ -175,6 +244,8 @@
 	if (!lookup_shared)
 		cnp->cn_flags &= ~LOCKSHARED;
 	fdp = p->p_fd;
+	TAILQ_INIT(&ndp->ni_cap_tracker);
+	ndp->ni_cap_dotdot = 0;
 
 	/* We will set this ourselves if we need it. */
 	cnp->cn_flags &= ~TRAILINGSLASH;
@@ -186,11 +257,11 @@
 	if ((cnp->cn_flags & HASBUF) == 0)
 		cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
 	if (ndp->ni_segflg == UIO_SYSSPACE)
-		error = copystr(ndp->ni_dirp, cnp->cn_pnbuf,
-			    MAXPATHLEN, (size_t *)&ndp->ni_pathlen);
+		error = copystr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
+		    &ndp->ni_pathlen);
 	else
-		error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf,
-			    MAXPATHLEN, (size_t *)&ndp->ni_pathlen);
+		error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
+		    &ndp->ni_pathlen);
 
 	/*
 	 * Don't allow empty pathnames.
@@ -200,9 +271,17 @@
 
 #ifdef CAPABILITY_MODE
 	/*
-	 * In capability mode, lookups must be "strictly relative" (i.e.
-	 * not an absolute path, and not containing '..' components) to
-	 * a real file descriptor, not the pseudo-descriptor AT_FDCWD.
+	 * In capability mode, lookups must be restricted to happen in
+	 * the subtree with the root specified by the file descriptor:
+	 * - The root must be a real file descriptor, not the pseudo-descriptor
+	 *   AT_FDCWD.
+	 * - The passed path must be relative and not absolute.
+	 * - If lookup_cap_dotdot is disabled, the path must not contain
+	 *   '..' components.
+	 * - If lookup_cap_dotdot is enabled, we verify that all '..'
+	 *   component lookups result in directories which were
+	 *   previously walked by us, which prevents an escape from
+	 *   the relative root.
 	 */
 	if (error == 0 && IN_CAPABILITY_MODE(td) &&
 	    (cnp->cn_flags & NOCAPCHECK) == 0) {
@@ -258,8 +337,6 @@
 			dp = fdp->fd_cdir;
 			VREF(dp);
 		} else {
-			cap_rights_t rights;
-
 			rights = ndp->ni_rightsneeded;
 			cap_rights_set(&rights, CAP_LOOKUP);
 
@@ -297,6 +374,8 @@
 			vrele(dp);
 		goto out;
 	}
+	if (ndp->ni_strictrelative != 0 && lookup_cap_dotdot != 0)
+		ndp->ni_cap_dotdot = 1;
 	SDT_PROBE3(vfs, namei, lookup, entry, dp, cnp->cn_pnbuf,
 	    cnp->cn_flags);
 	for (;;) {
@@ -313,7 +392,7 @@
 				namei_cleanup_cnp(cnp);
 			} else
 				cnp->cn_flags |= HASBUF;
-
+			nameicap_cleanup(ndp);
 			SDT_PROBE2(vfs, namei, lookup, return, 0, ndp->ni_vp);
 			return (0);
 		}
@@ -387,6 +466,7 @@
 out:
 	vrele(ndp->ni_rootdir);
 	namei_cleanup_cnp(cnp);
+	nameicap_cleanup(ndp);
 	SDT_PROBE2(vfs, namei, lookup, return, error, NULL);
 	return (error);
 }
@@ -583,6 +663,8 @@
 		goto bad;
 	}
 
+	nameicap_tracker_add(ndp, dp);
+
 	/*
 	 * Check for degenerate name (e.g. / or "")
 	 * which is a way of talking about a directory,
@@ -618,9 +700,8 @@
 
 	/*
-	 * Handle "..": five special cases.
-	 * 0. If doing a capability lookup, return ENOTCAPABLE (this is a
-	 *    fairly conservative design choice, but it's the only one that we
-	 *    are satisfied guarantees the property we're looking for).
+	 * Handle "..": six special cases.
+	 * 0. If doing a capability lookup and lookup_cap_dotdot is
+	 *    disabled, return ENOTCAPABLE.
 	 * 1. Return an error if this is the last component of
 	 *    the name and the operation is DELETE or RENAME.
 	 * 2. If at root directory (e.g. after chroot)
@@ -632,9 +713,14 @@
 	 *    .. in the other filesystem.
 	 * 4. If the vnode is the top directory of
 	 *    the jail or chroot, don't let them out.
+	 * 5. If doing a capability lookup and lookup_cap_dotdot is
+	 *    enabled, return ENOTCAPABLE if the lookup would escape
+	 *    from the initial file descriptor directory.  Checks are
+	 *    done by ensuring that namei() already traversed the
+	 *    result of dotdot lookup.
 	 */
 	if (cnp->cn_flags & ISDOTDOT) {
-		if (ndp->ni_strictrelative != 0) {
+		if (ndp->ni_strictrelative != 0 && ndp->ni_cap_dotdot == 0) {
 #ifdef KTRACE
 			if (KTRPOINT(curthread, KTR_CAPFAIL))
 				ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
@@ -676,6 +762,14 @@
 			vn_lock(dp,
 			    compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags |
 			    LK_RETRY, ISDOTDOT));
+			error = nameicap_check_dotdot(ndp, dp);
+			if (error != 0) {
+#ifdef KTRACE
+				if (KTRPOINT(curthread, KTR_CAPFAIL))
+					ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
+#endif
+				goto bad;
+			}
 		}
 	}
 
@@ -735,6 +829,7 @@
 			vn_lock(dp,
 			    compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags |
 			    LK_RETRY, cnp->cn_flags));
+			nameicap_tracker_add(ndp, dp);
 			goto unionlookup;
 		}
 
@@ -855,6 +950,16 @@
 			vrele(ndp->ni_dvp);
 		goto dirloop;
 	}
+	if (cnp->cn_flags & ISDOTDOT) {
+		error = nameicap_check_dotdot(ndp, ndp->ni_vp);
+		if (error != 0) {
+#ifdef KTRACE
+			if (KTRPOINT(curthread, KTR_CAPFAIL))
+				ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
+#endif
+			goto bad2;
+		}
+	}
 	if (*ndp->ni_next == '/') {
 		cnp->cn_nameptr = ndp->ni_next;
 		while (*cnp->cn_nameptr == '/') {
Index: sys/sys/namei.h
===================================================================
--- sys/sys/namei.h
+++ sys/sys/namei.h
@@ -55,6 +55,9 @@
 	long	cn_namelen;	/* length of looked up component */
 };
 
+struct nameicap_tracker;
+TAILQ_HEAD(nameicap_tracker_head, nameicap_tracker);
+
 /*
  * Encapsulation of namei parameters.
  */
@@ -72,7 +75,7 @@
 	struct	vnode *ni_rootdir;	/* logical root directory */
 	struct	vnode *ni_topdir;	/* logical top directory */
 	int	ni_dirfd;		/* starting directory for *at functions */
-	int	ni_strictrelative;	/* relative lookup only; no '..' */
+	int	ni_strictrelative;	/* relative lookup only */
 	/*
 	 * Results: returned from namei
 	 */
@@ -94,6 +97,8 @@
 	 * through the VOP interface.
 	 */
 	struct componentname ni_cnd;
+	struct nameicap_tracker_head ni_cap_tracker;
+	int	ni_cap_dotdot;	/* ".." in strictrelative case */
 };
 
 #ifdef _KERNEL