Index: sys/kern/vfs_lookup.c =================================================================== --- sys/kern/vfs_lookup.c +++ sys/kern/vfs_lookup.c @@ -73,21 +73,28 @@ "unsigned long"); SDT_PROBE_DEFINE2(vfs, namei, lookup, return, "int", "struct vnode *"); -/* - * Allocation zone for namei - */ +/* Allocation zone for namei. */ uma_zone_t namei_zone; -/* - * Placeholder vnode for mp traversal - */ + +/* Placeholder vnode for mp traversal. */ static struct vnode *vp_crossmp; +struct nameicap_tracker { + struct vnode *dp; + TAILQ_ENTRY(nameicap_tracker) nm_link; +}; + +/* Zone for cap mode tracker elements used for dotdot capability checks. */ +static uma_zone_t nt_zone; + static void nameiinit(void *dummy __unused) { namei_zone = uma_zcreate("NAMEI", MAXPATHLEN, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + nt_zone = uma_zcreate("rentr", sizeof(struct nameicap_tracker), + NULL, NULL, NULL, NULL, sizeof(void *), 0); getnewvnode("crossmp", NULL, &dead_vnodeops, &vp_crossmp); vn_lock(vp_crossmp, LK_EXCLUSIVE); VN_LOCK_ASHARE(vp_crossmp); @@ -100,8 +107,62 @@ "Enables/Disables shared locks for path name translation"); static void +nameicap_tracker_add(struct nameidata *ndp, struct vnode *dp) +{ + struct nameicap_tracker *nt; + + if (ndp->ni_strictrelative == 0 || dp->v_type != VDIR) + return; + nt = uma_zalloc(nt_zone, M_WAITOK); + vhold(dp); + nt->dp = dp; + TAILQ_INSERT_TAIL(&ndp->ni_cap_tracker, nt, nm_link); +} + +static void +nameicap_cleanup(struct nameidata *ndp) +{ + struct nameicap_tracker *nt, *nt1; + + KASSERT(TAILQ_EMPTY(&ndp->ni_cap_tracker) || + ndp->ni_strictrelative != 0, ("not strictrelative")); + TAILQ_FOREACH_SAFE(nt, &ndp->ni_cap_tracker, nm_link, nt1) { + TAILQ_REMOVE(&ndp->ni_cap_tracker, nt, nm_link); + vdrop(nt->dp); + uma_zfree(nt_zone, nt); + } +} + +/* + * For dotdot lookups in capability mode, only allow the component + * lookup to succeed if the resulting directory was already traversed + * during the operation. Also fail dotdot lookups for non-local + * filesystems, where external agents might assist local lookups to + * escape the compartment. + */ +static int +nameicap_check_dotdot(struct nameidata *ndp, struct vnode *dp) +{ + struct nameicap_tracker *nt; + struct mount *mp; + + if (ndp->ni_strictrelative == 0 || dp == NULL || dp->v_type != VDIR) + return (0); + mp = dp->v_mount; + if (mp != NULL && (mp->mnt_flag & MNT_LOCAL) == 0) + return (ENOTCAPABLE); + TAILQ_FOREACH_REVERSE(nt, &ndp->ni_cap_tracker, nameicap_tracker_head, + nm_link) { + if (dp == nt->dp) + return (0); + } + return (ENOTCAPABLE); +} + +static void namei_cleanup_cnp(struct componentname *cnp) { + uma_zfree(namei_zone, cnp->cn_pnbuf); #ifdef DIAGNOSTIC cnp->cn_pnbuf = NULL; @@ -158,12 +219,16 @@ char *cp; /* pointer into pathname argument */ struct vnode *dp; /* the directory we are searching */ struct iovec aiov; /* uio for reading symbolic links */ + struct componentname *cnp; + struct thread *td; + struct proc *p; + cap_rights_t rights; struct uio auio; int error, linklen, startdir_used; - struct componentname *cnp = &ndp->ni_cnd; - struct thread *td = cnp->cn_thread; - struct proc *p = td->td_proc; + cnp = &ndp->ni_cnd; + td = cnp->cn_thread; + p = td->td_proc; ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_thread->td_ucred; KASSERT(cnp->cn_cred && p, ("namei: bad cred/proc")); KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0, @@ -175,6 +240,7 @@ if (!lookup_shared) cnp->cn_flags &= ~LOCKSHARED; fdp = p->p_fd; + TAILQ_INIT(&ndp->ni_cap_tracker); /* We will set this ourselves if we need it. */ cnp->cn_flags &= ~TRAILINGSLASH; @@ -200,9 +266,14 @@ #ifdef CAPABILITY_MODE /* - * In capability mode, lookups must be "strictly relative" (i.e. - * not an absolute path, and not containing '..' components) to - * a real file descriptor, not the pseudo-descriptor AT_FDCWD. + * In capability mode, lookups must be restricted to happen in + * the subtree with the root specified by the file descriptor: + * - The root must be real file descriptor, not the pseudo-descriptor + * AT_FDCWD. + * - The passed path must be relative and not absolute. + * - We verify that all '..' components lookups result in the + * directories which were previously walked by us, which + * prevents an escape from the relative root. */ if (error == 0 && IN_CAPABILITY_MODE(td) && (cnp->cn_flags & NOCAPCHECK) == 0) { @@ -258,8 +329,6 @@ dp = fdp->fd_cdir; VREF(dp); } else { - cap_rights_t rights; - rights = ndp->ni_rightsneeded; cap_rights_set(&rights, CAP_LOOKUP); @@ -299,6 +368,7 @@ } SDT_PROBE3(vfs, namei, lookup, entry, dp, cnp->cn_pnbuf, cnp->cn_flags); + nameicap_tracker_add(ndp, dp); for (;;) { ndp->ni_startdir = dp; error = lookup(ndp); @@ -313,7 +383,7 @@ namei_cleanup_cnp(cnp); } else cnp->cn_flags |= HASBUF; - + nameicap_cleanup(ndp); SDT_PROBE2(vfs, namei, lookup, return, 0, ndp->ni_vp); return (0); } @@ -387,6 +457,7 @@ out: vrele(ndp->ni_rootdir); namei_cleanup_cnp(cnp); + nameicap_cleanup(ndp); SDT_PROBE2(vfs, namei, lookup, return, error, NULL); return (error); } @@ -616,11 +687,10 @@ goto success; } + nameicap_tracker_add(ndp, dp); + /* * Handle "..": five special cases. - * 0. If doing a capability lookup, return ENOTCAPABLE (this is a - * fairly conservative design choice, but it's the only one that we - * are satisfied guarantees the property we're looking for). * 1. Return an error if this is the last component of * the name and the operation is DELETE or RENAME. * 2. If at root directory (e.g. after chroot) @@ -632,16 +702,12 @@ * .. in the other filesystem. * 4. If the vnode is the top directory of * the jail or chroot, don't let them out. + * 5. If doing a capability lookup, return ENOTCAPABLE if the + * lookup would escape from the initial file descriptor + * directory. Checks are done by ensuring that namei() + * already traversed the result of dotdot lookup. */ if (cnp->cn_flags & ISDOTDOT) { - if (ndp->ni_strictrelative != 0) { -#ifdef KTRACE - if (KTRPOINT(curthread, KTR_CAPFAIL)) - ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL); -#endif - error = ENOTCAPABLE; - goto bad; - } if ((cnp->cn_flags & ISLASTCN) != 0 && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { error = EINVAL; @@ -676,6 +742,14 @@ vn_lock(dp, compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY, ISDOTDOT)); + error = nameicap_check_dotdot(ndp, dp); + if (error != 0) { +#ifdef KTRACE + if (KTRPOINT(curthread, KTR_CAPFAIL)) + ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL); +#endif + goto bad; + } } } @@ -735,6 +809,7 @@ vn_lock(dp, compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY, cnp->cn_flags)); + nameicap_tracker_add(ndp, dp); goto unionlookup; } @@ -855,6 +930,16 @@ vrele(ndp->ni_dvp); goto dirloop; } + if (cnp->cn_flags & ISDOTDOT) { + error = nameicap_check_dotdot(ndp, ndp->ni_vp); + if (error != 0) { +#ifdef KTRACE + if (KTRPOINT(curthread, KTR_CAPFAIL)) + ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL); +#endif + goto bad2; + } + } if (*ndp->ni_next == '/') { cnp->cn_nameptr = ndp->ni_next; while (*cnp->cn_nameptr == '/') { Index: sys/sys/namei.h =================================================================== --- sys/sys/namei.h +++ sys/sys/namei.h @@ -55,6 +55,9 @@ long cn_namelen; /* length of looked up component */ }; +struct nameicap_tracker; +TAILQ_HEAD(nameicap_tracker_head, nameicap_tracker); + /* * Encapsulation of namei parameters. */ @@ -94,6 +97,7 @@ * through the VOP interface. */ struct componentname ni_cnd; + struct nameicap_tracker_head ni_cap_tracker; }; #ifdef _KERNEL