diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c --- a/sys/kern/vfs_default.c +++ b/sys/kern/vfs_default.c @@ -75,15 +75,9 @@ static int vop_nolookup(struct vop_lookup_args *); static int vop_norename(struct vop_rename_args *); static int vop_nostrategy(struct vop_strategy_args *); -static int get_next_dirent(struct vnode *vp, struct dirent **dpp, - char *dirbuf, int dirbuflen, off_t *off, - char **cpos, int *len, int *eofflag, - struct thread *td); static int dirent_exists(struct vnode *vp, const char *dirname, struct thread *td); -#define DIRENT_MINSIZE (sizeof(struct dirent) - (MAXNAMLEN+1) + 4) - static int vop_stdis_text(struct vop_is_text_args *ap); static int vop_stdunset_text(struct vop_unset_text_args *ap); static int vop_stdadd_writecount(struct vop_add_writecount_args *ap); @@ -281,109 +275,54 @@ return (EOPNOTSUPP); } -static int -get_next_dirent(struct vnode *vp, struct dirent **dpp, char *dirbuf, - int dirbuflen, off_t *off, char **cpos, int *len, - int *eofflag, struct thread *td) -{ - int error, reclen; - struct uio uio; - struct iovec iov; - struct dirent *dp; - - KASSERT(VOP_ISLOCKED(vp), ("vp %p is not locked", vp)); - KASSERT(vp->v_type == VDIR, ("vp %p is not a directory", vp)); - - if (*len == 0) { - iov.iov_base = dirbuf; - iov.iov_len = dirbuflen; - - uio.uio_iov = &iov; - uio.uio_iovcnt = 1; - uio.uio_offset = *off; - uio.uio_resid = dirbuflen; - uio.uio_segflg = UIO_SYSSPACE; - uio.uio_rw = UIO_READ; - uio.uio_td = td; - - *eofflag = 0; - -#ifdef MAC - error = mac_vnode_check_readdir(td->td_ucred, vp); - if (error == 0) -#endif - error = VOP_READDIR(vp, &uio, td->td_ucred, eofflag, - NULL, NULL); - if (error) - return (error); - - *off = uio.uio_offset; - - *cpos = dirbuf; - *len = (dirbuflen - uio.uio_resid); - - if (*len == 0) - return (ENOENT); - } - - dp = (struct dirent *)(*cpos); - reclen = dp->d_reclen; - *dpp = dp; - - /* check for malformed directory.. 
*/ - if (reclen < DIRENT_MINSIZE) - return (EINVAL); - - *cpos += reclen; - *len -= reclen; - - return (0); -} - /* - * Check if a named file exists in a given directory vnode. + * Check if a named file exists in a given directory vnode + * + * Returns 0 if the file exists, ENOENT if it doesn't, or errors returned by + * vfs_next_dirent(). */ static int dirent_exists(struct vnode *vp, const char *dirname, struct thread *td) { - char *dirbuf, *cpos; - int error, eofflag, dirbuflen, len, found; + int error, eofflag; + size_t dirbuflen, len; + char *dirbuf; off_t off; struct dirent *dp; struct vattr va; - KASSERT(VOP_ISLOCKED(vp), ("vp %p is not locked", vp)); - KASSERT(vp->v_type == VDIR, ("vp %p is not a directory", vp)); - - found = 0; - error = VOP_GETATTR(vp, &va, td->td_ucred); - if (error) - return (found); + if (error != 0) + return (error); - dirbuflen = DEV_BSIZE; + dirbuflen = MAX(DEV_BSIZE, GENERIC_MAXDIRSIZ); if (dirbuflen < va.va_blocksize) dirbuflen = va.va_blocksize; dirbuf = (char *)malloc(dirbuflen, M_TEMP, M_WAITOK); - off = 0; len = 0; - do { - error = get_next_dirent(vp, &dp, dirbuf, dirbuflen, &off, - &cpos, &len, &eofflag, td); - if (error) + off = 0; + eofflag = 0; + + for (;;) { + error = vfs_next_dirent(vp, td, dirbuf, dirbuflen, + &dp, &len, &off, &eofflag); + if (error != 0) goto out; + if (len == 0) + break; + if (dp->d_type != DT_WHT && dp->d_fileno != 0 && - strcmp(dp->d_name, dirname) == 0) { - found = 1; + strcmp(dp->d_name, dirname) == 0) goto out; - } - } while (len > 0 || !eofflag); + } + + error = ENOENT; out: free(dirbuf, M_TEMP); - return (found); + return (error); } int @@ -737,26 +676,23 @@ int vop_stdvptocnp(struct vop_vptocnp_args *ap) { - struct vnode *vp = ap->a_vp; - struct vnode **dvp = ap->a_vpp; - struct ucred *cred; - char *buf = ap->a_buf; - size_t *buflen = ap->a_buflen; - char *dirbuf, *cpos; - int i, error, eofflag, dirbuflen, flags, locked, len, covered; off_t off; ino_t fileno; - struct vattr va; - struct 
nameidata nd; - struct thread *td; + struct vnode *const vp = ap->a_vp; + struct vnode **dvp = ap->a_vpp; + char *buf = ap->a_buf; + size_t *buflen = ap->a_buflen; + char *dirbuf; + size_t dirbuflen, len; + struct thread *const td = curthread; + struct ucred *const cred = td->td_ucred; struct dirent *dp; struct vnode *mvp; - - i = *buflen; - error = 0; - covered = 0; - td = curthread; - cred = td->td_ucred; + struct vattr va; + struct nameidata nd; + int i = *buflen; + int error = 0, covered = 0; + int eofflag, flags, locked; if (vp->v_type != VDIR) return (ENOENT); @@ -794,7 +730,7 @@ fileno = va.va_fileid; - dirbuflen = DEV_BSIZE; + dirbuflen = MAX(DEV_BSIZE, GENERIC_MAXDIRSIZ); if (dirbuflen < va.va_blocksize) dirbuflen = va.va_blocksize; dirbuf = (char *)malloc(dirbuflen, M_TEMP, M_WAITOK); @@ -804,21 +740,26 @@ goto out; } - off = 0; len = 0; - do { + off = 0; + eofflag = 0; + + for (;;) { /* call VOP_READDIR of parent */ - error = get_next_dirent(*dvp, &dp, dirbuf, dirbuflen, &off, - &cpos, &len, &eofflag, td); + error = vfs_next_dirent(*dvp, td, + dirbuf, dirbuflen, &dp, &len, &off, &eofflag); if (error) goto out; + if (len == 0) + break; + if ((dp->d_type != DT_WHT) && (dp->d_fileno == fileno)) { if (covered) { VOP_UNLOCK(*dvp); vn_lock(mvp, LK_SHARED | LK_RETRY); - if (dirent_exists(mvp, dp->d_name, td)) { + if (dirent_exists(mvp, dp->d_name, td) == 0) { error = ENOENT; VOP_UNLOCK(mvp); vn_lock(*dvp, LK_SHARED | LK_RETRY); @@ -841,7 +782,7 @@ } goto out; } - } while (len > 0 || !eofflag); + } error = ENOENT; out: diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -6383,73 +6383,223 @@ } /* - * Returns whether the directory is empty or not. - * If it is empty, the return value is 0; otherwise - * the return value is an error value (which may - * be ENOTEMPTY). 
+ * Returns successive directory entries through a caller-provided buffer.
+ *
+ * This function automatically refills the provided buffer with calls to
+ * VOP_READDIR() (after MAC permission checks).
+ *
+ * 'td' supplies the credentials and is recorded in the uio handed to
+ * VOP_READDIR(). 'dirbuf' is the caller's buffer to fill and 'dirbuflen' its
+ * allocated size. 'dirbuf' must be properly aligned to access 'struct dirent'
+ * structures and 'dirbuflen' must be at least GENERIC_MAXDIRSIZ to avoid
+ * VOP_READDIR() returning EINVAL (the latter is not a strong guarantee (yet);
+ * but EINVAL will always be returned if this requirement is not met). '*dpp'
+ * points to the current directory entry in the buffer and '*len' contains the
+ * number of valid bytes left in 'dirbuf' starting at '*dpp' (that is,
+ * including this entry).
+ *
+ * On the first call (or when restarting the read), '*len' must have been set
+ * to 0, '*off' to 0 (or any valid start offset) and '*eofflag' to 0. There are
+ * no more entries as soon as '*len' is 0 after a call that returned 0. Calling
+ * this function again after such a condition is considered an error and EINVAL
+ * will be returned. Other possible error codes are those of VOP_READDIR() or
+ * EINTEGRITY if the returned entries do not pass coherency tests.
+ *
+ * '*off' and '*eofflag' are internal state the caller should not tamper with,
+ * except as explained in the previous paragraph. '*off' is the next directory
+ * offset to read from to refill the buffer. '*eofflag' is set to 0 or 1 by the
+ * last internal call to VOP_READDIR() that returned without error, indicating
+ * whether it reached the end of the directory, and to 2 by this function after
+ * all entries have been read.
+ */
+int
+vfs_next_dirent(struct vnode *vp, struct thread *td,
+    char *dirbuf, size_t dirbuflen,
+    struct dirent **dpp, size_t *len, off_t *off, int *eofflag)
+{
+	struct uio uio;
+	struct iovec iov;
+	struct dirent *dp;
+	size_t reclen;
+	int error;
+
+	if (__predict_false(dirbuflen < GENERIC_MAXDIRSIZ))
+		/* Don't take any chances in this case */
+		return (EINVAL);
+
+	MPASS2((uintptr_t)dirbuf < (uintptr_t)dirbuf + dirbuflen,
+	    "Address space overflow");
+	KASSERT(VOP_ISLOCKED(vp), ("vp %p is not locked", vp));
+	KASSERT(vp->v_type == VDIR, ("vp %p is not a directory", vp));
+
+	if (*len != 0) {
+		dp = *dpp;
+
+		/*
+		 * '*dpp' is reset to NULL when an incoherent entry is reported
+		 * below.  A caller that ignored that error and called again
+		 * must get an error rather than a NULL dereference.
+		 */
+		if (__predict_false(dp == NULL))
+			return (EINVAL);
+
+		MPASS2((uintptr_t)dirbuf <= (uintptr_t)dp &&
+		    (uintptr_t)dp + *len <= (uintptr_t)dirbuf + dirbuflen,
+		    "Filled range not inside buffer");
+
+		/*
+		 * The entry being stepped over was validated before it was
+		 * handed out, so 'reclen' cannot be 0 here and the walk always
+		 * makes progress.
+		 */
+		reclen = dp->d_reclen;
+		if (reclen >= *len)
+			/* End of buffer reached */
+			*len = 0;
+		else {
+			dp = (struct dirent *)((char *)dp + reclen);
+			*len -= reclen;
+		}
+	}
+
+	if (*len == 0) {
+		/* Have to refill; there is no current entry until we do. */
+		dp = NULL;
+
+		switch (*eofflag) {
+		case 0:
+			break;
+
+		case 1:
+			/* Nothing more to read. */
+			*eofflag = 2; /* Remember the caller reached EOF. */
+			error = 0;
+			goto out;
+
+		default:
+			/* The caller didn't test for EOF */
+			error = EINVAL;
+			goto out;
+		}
+
+		iov.iov_base = dirbuf;
+		iov.iov_len = dirbuflen;
+
+		uio.uio_iov = &iov;
+		uio.uio_iovcnt = 1;
+		uio.uio_offset = *off;
+		uio.uio_resid = dirbuflen;
+		uio.uio_segflg = UIO_SYSSPACE;
+		uio.uio_rw = UIO_READ;
+		uio.uio_td = td;
+
+#ifdef MAC
+		error = mac_vnode_check_readdir(td->td_ucred, vp);
+		if (error == 0)
+#endif
+			error = VOP_READDIR(vp, &uio, td->td_ucred, eofflag,
+			    NULL, NULL);
+		if (error != 0)
+			goto out;
+
+		*len = dirbuflen - uio.uio_resid;
+		*off = uio.uio_offset;
+
+		if (*len == 0) {
+			/* Sanity check on INVARIANTS */
+			MPASS(*eofflag != 0);
+			/*
+			 * EOF is reported to the caller by this very call, so
+			 * record it as already seen: calling again must fail
+			 * with EINVAL instead of returning EOF a second time.
+			 */
+			*eofflag = 2;
+			goto out;
+		}
+
+		/*
+		 * Normalize the flag returned by VOP_READDIR(), since 2 is
+		 * used above as the "EOF already reported" marker.
+		 */
+		if (*eofflag != 0)
+			*eofflag = 1;
+
+		dp = (struct dirent *)dirbuf;
+	}
+
+	/*
+	 * Validate the entry at run time and not only under INVARIANTS: a
+	 * record length of 0 coming from a corrupted or buggy filesystem would
+	 * otherwise never advance 'dp' nor shrink '*len', and callers would
+	 * loop forever on the same entry.
+	 */
+	if (__predict_false(*len < GENERIC_MINDIRSIZ ||
+	    dp->d_reclen < GENERIC_MINDIRSIZ)) {
+		dp = NULL;
+		error = EINTEGRITY;
+		goto out;
+	}
+
+	error = 0;
+out:
+	*dpp = dp;
+
+	return (error);
+}
+
+/*
+ * Returns whether some directory is empty.
+ *
+ * If it is empty, returns 0, and if not, ENOTEMPTY. Other return values are
+ * actual errors.
*/ int vfs_emptydir(struct vnode *vp) { - struct uio uio; - struct iovec iov; - struct dirent *dirent, *dp, *endp; - int error, eof; - - error = 0; - eof = 0; + struct thread *const td = curthread; + char *dirbuf; + size_t dirbuflen, len; + off_t off; + int eofflag, error; + struct dirent *dp; + struct vattr va; ASSERT_VOP_LOCKED(vp, "vfs_emptydir"); VNPASS(vp->v_type == VDIR, vp); - dirent = malloc(sizeof(struct dirent), M_TEMP, M_WAITOK); - iov.iov_base = dirent; - iov.iov_len = sizeof(struct dirent); + error = VOP_GETATTR(vp, &va, td->td_ucred); + if (error != 0) + return (error); - uio.uio_iov = &iov; - uio.uio_iovcnt = 1; - uio.uio_offset = 0; - uio.uio_resid = sizeof(struct dirent); - uio.uio_segflg = UIO_SYSSPACE; - uio.uio_rw = UIO_READ; - uio.uio_td = curthread; + dirbuflen = max(DEV_BSIZE, GENERIC_MAXDIRSIZ); + if (dirbuflen < va.va_blocksize) + dirbuflen = va.va_blocksize; + dirbuf = (char *)malloc(dirbuflen, M_TEMP, M_WAITOK); - while (eof == 0 && error == 0) { - error = VOP_READDIR(vp, &uio, curthread->td_ucred, &eof, - NULL, NULL); + len = 0; + off = 0; + eofflag = 0; + + for (;;) { + error = vfs_next_dirent(vp, td, dirbuf, dirbuflen, + &dp, &len, &off, &eofflag); if (error != 0) + goto end; + + if (len == 0) { + /* EOF */ + error = 0; + goto end; + } + + /* This should never happen. */ + if (dp->d_fileno == 0) + continue; + + /* Skip whiteouts. Unionfs operates on filesystems only and not + * on hierarchies, so these whiteouts would be shadowed on the + * system hierarchy but not for a union using the filesystem of + * their directories as the upper layer. Additionally, unionfs + * currently transparently exposes union-specific metadata of + * its upper layer, meaning that whiteouts can be seen through + * the union view in empty directories. Taking into account + * these whiteouts would then prevent mounting another + * filesystem on such effectively empty directories. 
*/ + if (dp->d_type == DT_WHT) + continue; + + /* Any file in the directory which is not '.' or '..' indicates + * the directory is not empty. */ + switch (dp->d_namlen) { + case 2: + if (dp->d_name[1] != '.') { + /* Can't be '..' (nor '.') */ + error = ENOTEMPTY; + goto end; + } + /* FALLTHROUGH */ + case 1: + /* Can't be '..' nor '.' */ + if (dp->d_name[0] != '.') { + error = ENOTEMPTY; + goto end; + } break; - endp = (void *)((uint8_t *)dirent + - sizeof(struct dirent) - uio.uio_resid); - for (dp = dirent; dp < endp; - dp = (void *)((uint8_t *)dp + GENERIC_DIRSIZ(dp))) { - if (dp->d_type == DT_WHT) - continue; - if (dp->d_namlen == 0) - continue; - if (dp->d_type != DT_DIR && - dp->d_type != DT_UNKNOWN) { - error = ENOTEMPTY; - break; - } - if (dp->d_namlen > 2) { - error = ENOTEMPTY; - break; - } - if (dp->d_namlen == 1 && - dp->d_name[0] != '.') { - error = ENOTEMPTY; - break; - } - if (dp->d_namlen == 2 && - dp->d_name[1] != '.') { - error = ENOTEMPTY; - break; - } - uio.uio_resid = sizeof(struct dirent); + + default: + error = ENOTEMPTY; + goto end; } } - free(dirent, M_TEMP); + +end: + free(dirbuf, M_TEMP); return (error); } diff --git a/sys/sys/dirent.h b/sys/sys/dirent.h --- a/sys/sys/dirent.h +++ b/sys/sys/dirent.h @@ -65,7 +65,7 @@ struct dirent { ino_t d_fileno; /* file number of entry */ - off_t d_off; /* directory offset of entry */ + off_t d_off; /* directory offset of next entry */ __uint16_t d_reclen; /* length of this record */ __uint8_t d_type; /* file type, see below */ __uint8_t d_pad0; @@ -122,11 +122,18 @@ #define _GENERIC_DIRLEN(namlen) \ ((__offsetof(struct dirent, d_name) + (namlen) + 1 + 7) & ~7) #define _GENERIC_DIRSIZ(dp) _GENERIC_DIRLEN((dp)->d_namlen) +#define _GENERIC_MINDIRSIZ _GENERIC_DIRLEN(1) /* Name must not be empty */ +#define _GENERIC_MAXDIRSIZ _GENERIC_DIRLEN(MAXNAMLEN) +/* Keep this assert as long as sizeof(struct dirent) is used as the maximum + * entry size. 
*/ +_Static_assert(_GENERIC_MAXDIRSIZ == sizeof(struct dirent), + "'struct dirent' size must be a multiple of 8 (see _GENERIC_DIRLEN())"); #endif /* __BSD_VISIBLE */ #ifdef _KERNEL #define GENERIC_DIRSIZ(dp) _GENERIC_DIRSIZ(dp) - +#define GENERIC_MINDIRSIZ _GENERIC_MINDIRSIZ +#define GENERIC_MAXDIRSIZ _GENERIC_MAXDIRSIZ /* * Ensure that padding bytes are zeroed and that the name is NUL-terminated. */ diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -1100,6 +1100,9 @@ int vfs_kqfilter(struct vop_kqfilter_args *); struct dirent; int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off); +int vfs_next_dirent(struct vnode *vp, struct thread *td, + char *dirbuf, size_t dirbuflen, + struct dirent **dpp, size_t *len, off_t *off, int *eofflag); int vfs_emptydir(struct vnode *vp); int vfs_unixify_accmode(accmode_t *accmode);