Index: head/sys/kern/kern_acl.c =================================================================== --- head/sys/kern/kern_acl.c (revision 75570) +++ head/sys/kern/kern_acl.c (revision 75571) @@ -1,714 +1,768 @@ /*- * Copyright (c) 1999, 2000, 2001 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Developed by the TrustedBSD Project. * Support for POSIX.1e access control lists. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_ACL, "acl", "access control list"); static int vacl_set_acl(struct proc *p, struct vnode *vp, acl_type_t type, struct acl *aclp); static int vacl_get_acl(struct proc *p, struct vnode *vp, acl_type_t type, struct acl *aclp); -static int vacl_aclcheck(struct proc *p, struct vnode *vp, acl_type_t type, - struct acl *aclp); +static int vacl_aclcheck(struct proc *p, struct vnode *vp, + acl_type_t type, struct acl *aclp); /* * Implement a version of vaccess() that understands POSIX.1e ACL semantics. * Return 0 on success, else an errno value. Should be merged into * vaccess() eventually. */ int -vaccess_acl_posix1e(enum vtype type, struct acl *acl, mode_t acc_mode, - struct ucred *cred, int *privused) +vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid, + struct acl *acl, mode_t acc_mode, struct ucred *cred, int *privused) { struct acl_entry *acl_other, *acl_mask; mode_t dac_granted; mode_t cap_granted; mode_t acl_mask_granted; int group_matched, i; /* * Look for a normal, non-privileged way to access the file/directory * as requested. If it exists, go with that. Otherwise, attempt * to use privileges granted via cap_granted. In some cases, * which privileges to use may be ambiguous due to "best match", * in which case fall back on first match for the time being. */ if (privused != NULL) *privused = 0; /* * Determine privileges now, but don't apply until we've found * a DAC match that has failed to allow access. 
*/ #ifndef CAPABILITIES if (suser_xxx(cred, NULL, PRISON_ROOT) == 0) cap_granted = (VEXEC | VREAD | VWRITE | VADMIN); else cap_granted = 0; #else cap_granted = 0; if (type == VDIR) { if ((acc_mode & VEXEC) && !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT)) cap_granted |= VEXEC; } else { if ((acc_mode & VEXEC) && !cap_check(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT)) cap_granted |= VEXEC; } if ((acc_mode & VREAD) && !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT)) cap_granted |= VREAD; if ((acc_mode & VWRITE) && !cap_check(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT)) cap_granted |= VWRITE; if ((acc_mode & VADMIN) && !cap_check(cred, NULL, CAP_FOWNER, PRISON_ROOT)) cap_granted |= VADMIN; #endif /* CAPABILITIES */ /* * Check the owner. * Also, record locations of ACL_MASK and ACL_OTHER for reference * later if the owner doesn't match. */ acl_mask = acl_other = NULL; for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_USER_OBJ: - if (acl->acl_entry[i].ae_id != cred->cr_uid) + if (file_uid != cred->cr_uid) break; dac_granted = 0; dac_granted |= VADMIN; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; if ((acc_mode & dac_granted) == acc_mode) return (0); if ((acc_mode & (dac_granted | cap_granted)) == acc_mode) { if (privused != NULL) *privused = 1; return (0); } goto error; case ACL_MASK: acl_mask = &acl->acl_entry[i]; break; case ACL_OTHER: acl_other = &acl->acl_entry[i]; break; default: } } /* * Checks against ACL_USER, ACL_GROUP_OBJ, and ACL_GROUP fields * are masked by an ACL_MASK entry, if any. As such, first identify * the ACL_MASK field, then iterate through identifying potential * user matches, then group matches. If there is no ACL_MASK, * assume that the mask allows all requests to succeed. * Also keep track of the location of ACL_OTHER for later consumption. 
*/ if (acl_other == NULL) { /* * XXX: This should never happen. Only properly formatted * ACLs should be passed to vaccess_acl_posix1e. * Should make this a panic post-debugging. */ printf("vaccess_acl_posix1e: ACL_OTHER missing\n"); return (EPERM); } if (acl_mask != NULL) { acl_mask_granted = 0; if (acl_mask->ae_perm & ACL_EXECUTE) acl_mask_granted |= VEXEC; if (acl_mask->ae_perm & ACL_READ) acl_mask_granted |= VREAD; if (acl_mask->ae_perm & ACL_WRITE) acl_mask_granted |= VWRITE; } else acl_mask_granted = VEXEC | VREAD | VWRITE; /* * We have to check each type even if we know ACL_MASK will reject, * as we need to know what match there might have been, and * therefore what further types we might be allowed to check. * Do the checks twice -- once without privilege, and a second time * with, if there was a match. */ /* * Check ACL_USER ACL entries. */ for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_USER: if (acl->acl_entry[i].ae_id != cred->cr_uid) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; dac_granted &= acl_mask_granted; if ((acc_mode & dac_granted) == acc_mode) return (0); - if ((acc_mode & (dac_granted | cap_granted)) == - acc_mode) { - if (privused != NULL) - *privused = 1; - return (0); - } - goto error; + if ((acc_mode & (dac_granted | cap_granted)) != + acc_mode) + goto error; + + if (privused != NULL) + *privused = 1; + return (0); } } /* * Group match is best-match, not first-match, so find a * "best" match. Iterate across, testing each potential group * match. Make sure we keep track of whether we found a match * or not, so that we know if we can move on to ACL_OTHER. 
*/ group_matched = 0; for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_GROUP_OBJ: + if (file_gid != cred->cr_groups[0]) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + + if ((acc_mode & dac_granted) == acc_mode) + return (0); + + group_matched = 1; + break; + case ACL_GROUP: - if (groupmember(acl->acl_entry[i].ae_id, cred)) { - dac_granted = 0; - if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) - dac_granted |= VEXEC; - if (acl->acl_entry[i].ae_perm & ACL_READ) - dac_granted |= VREAD; - if (acl->acl_entry[i].ae_perm & ACL_WRITE) - dac_granted |= VWRITE; - dac_granted &= acl_mask_granted; + if (!groupmember(acl->acl_entry[i].ae_id, cred)) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; - if ((acc_mode & dac_granted) == acc_mode) - return (0); + if ((acc_mode & dac_granted) == acc_mode) + return (0); - group_matched = 1; - } + group_matched = 1; + break; + default: } } if (group_matched == 1) { /* * There was a match, but it did not grant rights via * pure DAC. Try again, this time with privilege. 
*/ for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_GROUP_OBJ: - case ACL_GROUP: - if (groupmember(acl->acl_entry[i].ae_id, - cred)) { - dac_granted = 0; - if (acl->acl_entry[i].ae_perm & - ACL_EXECUTE) + if (file_gid != cred->cr_groups[0]) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; - if (acl->acl_entry[i].ae_perm & - ACL_READ) - dac_granted |= VREAD; - if (acl->acl_entry[i].ae_perm & - ACL_WRITE) - dac_granted |= VWRITE; - dac_granted &= acl_mask_granted; - if ((acc_mode & (dac_granted | - cap_granted)) == acc_mode) { - if (privused != NULL) - *privused = 1; - return (0); - } - } + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + + if ((acc_mode & (dac_granted | cap_granted)) != + acc_mode) + break; + + if (privused != NULL) + *privused = 1; + return (0); + + case ACL_GROUP: + if (!groupmember(acl->acl_entry[i].ae_id, + cred)) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + + if ((acc_mode & (dac_granted | cap_granted)) != + acc_mode) + break; + + if (privused != NULL) + *privused = 1; + return (0); + default: } } /* * Even with privilege, group membership was not sufficient. * Return failure. */ goto error; } /* * Fall back on ACL_OTHER. ACL_MASK is not applied to ACL_OTHER. 
*/ dac_granted = 0; if (acl_other->ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl_other->ae_perm & ACL_READ) dac_granted |= VREAD; if (acl_other->ae_perm & ACL_WRITE) dac_granted |= VWRITE; if ((acc_mode & dac_granted) == acc_mode) return (0); if ((acc_mode & (dac_granted | cap_granted)) == acc_mode) { if (privused != NULL) *privused = 1; return (0); } error: return ((acc_mode & VADMIN) ? EPERM : EACCES); } /* * For the purposes of file systems maintaining the _OBJ entries in an * inode with a mode_t field, this routine converts a mode_t entry * to an acl_perm_t. */ acl_perm_t acl_posix1e_mode_to_perm(acl_tag_t tag, mode_t mode) { acl_perm_t perm = 0; switch(tag) { case ACL_USER_OBJ: if (mode & S_IXUSR) perm |= ACL_EXECUTE; if (mode & S_IRUSR) perm |= ACL_READ; if (mode & S_IWUSR) perm |= ACL_WRITE; return (perm); case ACL_GROUP_OBJ: if (mode & S_IXGRP) perm |= ACL_EXECUTE; if (mode & S_IRGRP) perm |= ACL_READ; if (mode & S_IWGRP) perm |= ACL_WRITE; return (perm); case ACL_OTHER: if (mode & S_IXOTH) perm |= ACL_EXECUTE; if (mode & S_IROTH) perm |= ACL_READ; if (mode & S_IWOTH) perm |= ACL_WRITE; return (perm); default: printf("acl_posix1e_mode_to_perm: invalid tag (%d)\n", tag); return (0); } } /* * Given inode information (uid, gid, mode), return an acl entry of the * appropriate type. */ struct acl_entry acl_posix1e_mode_to_entry(acl_tag_t tag, uid_t uid, gid_t gid, mode_t mode) { struct acl_entry acl_entry; acl_entry.ae_tag = tag; acl_entry.ae_perm = acl_posix1e_mode_to_perm(tag, mode); switch(tag) { case ACL_USER_OBJ: acl_entry.ae_id = uid; break; case ACL_GROUP_OBJ: acl_entry.ae_id = gid; break; case ACL_OTHER: acl_entry.ae_id = 0; break; default: acl_entry.ae_id = 0; printf("acl_posix1e_mode_to_entry: invalid tag (%d)\n", tag); } return (acl_entry); } /* * Utility function to generate a file mode given appropriate ACL entries. 
*/ mode_t acl_posix1e_perms_to_mode(struct acl_entry *acl_user_obj_entry, struct acl_entry *acl_group_obj_entry, struct acl_entry *acl_other_entry) { mode_t mode; mode = 0; if (acl_user_obj_entry->ae_perm & ACL_EXECUTE) mode |= S_IXUSR; if (acl_user_obj_entry->ae_perm & ACL_READ) mode |= S_IRUSR; if (acl_user_obj_entry->ae_perm & ACL_WRITE) mode |= S_IWUSR; if (acl_group_obj_entry->ae_perm & ACL_EXECUTE) mode |= S_IXGRP; if (acl_group_obj_entry->ae_perm & ACL_READ) mode |= S_IRGRP; if (acl_group_obj_entry->ae_perm & ACL_WRITE) mode |= S_IWGRP; if (acl_other_entry->ae_perm & ACL_EXECUTE) mode |= S_IXOTH; if (acl_other_entry->ae_perm & ACL_READ) mode |= S_IROTH; if (acl_other_entry->ae_perm & ACL_WRITE) mode |= S_IWOTH; return (mode); } /* * Perform a syntactic check of the ACL, sufficient to allow an * implementing file system to determine if it should accept this and * rely on the POSIX.1e ACL properties. */ int acl_posix1e_check(struct acl *acl) { int num_acl_user_obj, num_acl_user, num_acl_group_obj, num_acl_group; int num_acl_mask, num_acl_other, i; /* * Verify that the number of entries does not exceed the maximum * defined for acl_t. * Verify that the correct number of various sorts of ae_tags are * present: * Exactly one ACL_USER_OBJ * Exactly one ACL_GROUP_OBJ * Exactly one ACL_OTHER * If any ACL_USER or ACL_GROUP entries appear, then exactly one * ACL_MASK entry must also appear. * Verify that all ae_perm entries are in ACL_PERM_BITS. * Verify all ae_tag entries are understood by this implementation. * Note: Does not check for uniqueness of qualifier (ae_id) field. */ num_acl_user_obj = num_acl_user = num_acl_group_obj = num_acl_group = num_acl_mask = num_acl_other = 0; if (acl->acl_cnt > ACL_MAX_ENTRIES || acl->acl_cnt < 0) return (EINVAL); for (i = 0; i < acl->acl_cnt; i++) { /* * Check for a valid tag. 
*/ switch(acl->acl_entry[i].ae_tag) { case ACL_USER_OBJ: + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ + if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) + return (EINVAL); num_acl_user_obj++; break; case ACL_GROUP_OBJ: + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ + if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) + return (EINVAL); num_acl_group_obj++; break; case ACL_USER: + if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID) + return (EINVAL); num_acl_user++; break; case ACL_GROUP: + if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID) + return (EINVAL); num_acl_group++; break; case ACL_OTHER: + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ + if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) + return (EINVAL); num_acl_other++; break; case ACL_MASK: + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ + if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) + return (EINVAL); num_acl_mask++; break; default: return (EINVAL); } /* * Check for valid perm entries. */ if ((acl->acl_entry[i].ae_perm | ACL_PERM_BITS) != ACL_PERM_BITS) return (EINVAL); } if ((num_acl_user_obj != 1) || (num_acl_group_obj != 1) || (num_acl_other != 1) || (num_acl_mask != 0 && num_acl_mask != 1)) return (EINVAL); if (((num_acl_group != 0) || (num_acl_user != 0)) && (num_acl_mask != 1)) return (EINVAL); return (0); } /* * These calls wrap the real vnode operations, and are called by the * syscall code once the syscall has converted the path or file * descriptor to a vnode (unlocked). The aclp pointer is assumed * still to point to userland, so this should not be consumed within * the kernel except by syscall code. Other code should directly * invoke VOP_{SET,GET}ACL. */ /* * Given a vnode, set its ACL. 
*/ static int vacl_set_acl(struct proc *p, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernacl; int error; error = copyin(aclp, &inkernacl, sizeof(struct acl)); if (error) return(error); VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); error = VOP_SETACL(vp, type, &inkernacl, p->p_ucred, p); VOP_UNLOCK(vp, 0, p); return(error); } /* * Given a vnode, get its ACL. */ static int vacl_get_acl(struct proc *p, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernelacl; int error; VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); error = VOP_GETACL(vp, type, &inkernelacl, p->p_ucred, p); VOP_UNLOCK(vp, 0, p); if (error == 0) error = copyout(&inkernelacl, aclp, sizeof(struct acl)); return (error); } /* * Given a vnode, delete its ACL. */ static int vacl_delete(struct proc *p, struct vnode *vp, acl_type_t type) { int error; VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); error = VOP_SETACL(vp, ACL_TYPE_DEFAULT, 0, p->p_ucred, p); VOP_UNLOCK(vp, 0, p); return (error); } /* * Given a vnode, check whether an ACL is appropriate for it */ static int vacl_aclcheck(struct proc *p, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernelacl; int error; error = copyin(aclp, &inkernelacl, sizeof(struct acl)); if (error) return(error); error = VOP_ACLCHECK(vp, type, &inkernelacl, p->p_ucred, p); return (error); } /* * syscalls -- convert the path/fd to a vnode, and call vacl_whatever. * Don't need to lock, as the vacl_ code will get/release any locks * required. */ /* * Given a file path, get an ACL for it */ int __acl_get_file(struct proc *p, struct __acl_get_file_args *uap) { struct nameidata nd; int error; /* what flags are required here -- possible not LOCKLEAF? 
*/ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); error = namei(&nd); if (error) return(error); error = vacl_get_acl(p, nd.ni_vp, SCARG(uap, type), SCARG(uap, aclp)); NDFREE(&nd, 0); return (error); } /* * Given a file path, set an ACL for it */ int __acl_set_file(struct proc *p, struct __acl_set_file_args *uap) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); error = namei(&nd); if (error) return(error); error = vacl_set_acl(p, nd.ni_vp, SCARG(uap, type), SCARG(uap, aclp)); NDFREE(&nd, 0); return (error); } /* * Given a file descriptor, get an ACL for it */ int __acl_get_fd(struct proc *p, struct __acl_get_fd_args *uap) { struct file *fp; int error; error = getvnode(p->p_fd, SCARG(uap, filedes), &fp); if (error) return(error); return vacl_get_acl(p, (struct vnode *)fp->f_data, SCARG(uap, type), SCARG(uap, aclp)); } /* * Given a file descriptor, set an ACL for it */ int __acl_set_fd(struct proc *p, struct __acl_set_fd_args *uap) { struct file *fp; int error; error = getvnode(p->p_fd, SCARG(uap, filedes), &fp); if (error) return(error); return vacl_set_acl(p, (struct vnode *)fp->f_data, SCARG(uap, type), SCARG(uap, aclp)); } /* * Given a file path, delete an ACL from it. */ int __acl_delete_file(struct proc *p, struct __acl_delete_file_args *uap) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); error = namei(&nd); if (error) return(error); error = vacl_delete(p, nd.ni_vp, SCARG(uap, type)); NDFREE(&nd, 0); return (error); } /* * Given a file path, delete an ACL from it. 
*/ int __acl_delete_fd(struct proc *p, struct __acl_delete_fd_args *uap) { struct file *fp; int error; error = getvnode(p->p_fd, SCARG(uap, filedes), &fp); if (error) return(error); error = vacl_delete(p, (struct vnode *)fp->f_data, SCARG(uap, type)); return (error); } /* * Given a file path, check an ACL for it */ int __acl_aclcheck_file(struct proc *p, struct __acl_aclcheck_file_args *uap) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); error = namei(&nd); if (error) return(error); error = vacl_aclcheck(p, nd.ni_vp, SCARG(uap, type), SCARG(uap, aclp)); NDFREE(&nd, 0); return (error); } /* * Given a file descriptor, check an ACL for it */ int __acl_aclcheck_fd(struct proc *p, struct __acl_aclcheck_fd_args *uap) { struct file *fp; int error; error = getvnode(p->p_fd, SCARG(uap, filedes), &fp); if (error) return(error); return vacl_aclcheck(p, (struct vnode *)fp->f_data, SCARG(uap, type), SCARG(uap, aclp)); } Index: head/sys/kern/subr_acl_posix1e.c =================================================================== --- head/sys/kern/subr_acl_posix1e.c (revision 75570) +++ head/sys/kern/subr_acl_posix1e.c (revision 75571) @@ -1,714 +1,768 @@ /*- * Copyright (c) 1999, 2000, 2001 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Developed by the TrustedBSD Project. * Support for POSIX.1e access control lists. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_ACL, "acl", "access control list"); static int vacl_set_acl(struct proc *p, struct vnode *vp, acl_type_t type, struct acl *aclp); static int vacl_get_acl(struct proc *p, struct vnode *vp, acl_type_t type, struct acl *aclp); -static int vacl_aclcheck(struct proc *p, struct vnode *vp, acl_type_t type, - struct acl *aclp); +static int vacl_aclcheck(struct proc *p, struct vnode *vp, + acl_type_t type, struct acl *aclp); /* * Implement a version of vaccess() that understands POSIX.1e ACL semantics. * Return 0 on success, else an errno value. Should be merged into * vaccess() eventually. 
*/ int -vaccess_acl_posix1e(enum vtype type, struct acl *acl, mode_t acc_mode, - struct ucred *cred, int *privused) +vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid, + struct acl *acl, mode_t acc_mode, struct ucred *cred, int *privused) { struct acl_entry *acl_other, *acl_mask; mode_t dac_granted; mode_t cap_granted; mode_t acl_mask_granted; int group_matched, i; /* * Look for a normal, non-privileged way to access the file/directory * as requested. If it exists, go with that. Otherwise, attempt * to use privileges granted via cap_granted. In some cases, * which privileges to use may be ambiguous due to "best match", * in which case fall back on first match for the time being. */ if (privused != NULL) *privused = 0; /* * Determine privileges now, but don't apply until we've found * a DAC match that has failed to allow access. */ #ifndef CAPABILITIES if (suser_xxx(cred, NULL, PRISON_ROOT) == 0) cap_granted = (VEXEC | VREAD | VWRITE | VADMIN); else cap_granted = 0; #else cap_granted = 0; if (type == VDIR) { if ((acc_mode & VEXEC) && !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT)) cap_granted |= VEXEC; } else { if ((acc_mode & VEXEC) && !cap_check(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT)) cap_granted |= VEXEC; } if ((acc_mode & VREAD) && !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT)) cap_granted |= VREAD; if ((acc_mode & VWRITE) && !cap_check(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT)) cap_granted |= VWRITE; if ((acc_mode & VADMIN) && !cap_check(cred, NULL, CAP_FOWNER, PRISON_ROOT)) cap_granted |= VADMIN; #endif /* CAPABILITIES */ /* * Check the owner. * Also, record locations of ACL_MASK and ACL_OTHER for reference * later if the owner doesn't match. 
*/ acl_mask = acl_other = NULL; for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_USER_OBJ: - if (acl->acl_entry[i].ae_id != cred->cr_uid) + if (file_uid != cred->cr_uid) break; dac_granted = 0; dac_granted |= VADMIN; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; if ((acc_mode & dac_granted) == acc_mode) return (0); if ((acc_mode & (dac_granted | cap_granted)) == acc_mode) { if (privused != NULL) *privused = 1; return (0); } goto error; case ACL_MASK: acl_mask = &acl->acl_entry[i]; break; case ACL_OTHER: acl_other = &acl->acl_entry[i]; break; default: } } /* * Checks against ACL_USER, ACL_GROUP_OBJ, and ACL_GROUP fields * are masked by an ACL_MASK entry, if any. As such, first identify * the ACL_MASK field, then iterate through identifying potential * user matches, then group matches. If there is no ACL_MASK, * assume that the mask allows all requests to succeed. * Also keep track of the location of ACL_OTHER for later consumption. */ if (acl_other == NULL) { /* * XXX: This should never happen. Only properly formatted * ACLs should be passed to vaccess_acl_posix1e. * Should make this a panic post-debugging. */ printf("vaccess_acl_posix1e: ACL_OTHER missing\n"); return (EPERM); } if (acl_mask != NULL) { acl_mask_granted = 0; if (acl_mask->ae_perm & ACL_EXECUTE) acl_mask_granted |= VEXEC; if (acl_mask->ae_perm & ACL_READ) acl_mask_granted |= VREAD; if (acl_mask->ae_perm & ACL_WRITE) acl_mask_granted |= VWRITE; } else acl_mask_granted = VEXEC | VREAD | VWRITE; /* * We have to check each type even if we know ACL_MASK will reject, * as we need to know what match there might have been, and * therefore what further types we might be allowed to check. * Do the checks twice -- once without privilege, and a second time * with, if there was a match. */ /* * Check ACL_USER ACL entries. 
*/ for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_USER: if (acl->acl_entry[i].ae_id != cred->cr_uid) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; dac_granted &= acl_mask_granted; if ((acc_mode & dac_granted) == acc_mode) return (0); - if ((acc_mode & (dac_granted | cap_granted)) == - acc_mode) { - if (privused != NULL) - *privused = 1; - return (0); - } - goto error; + if ((acc_mode & (dac_granted | cap_granted)) != + acc_mode) + goto error; + + if (privused != NULL) + *privused = 1; + return (0); } } /* * Group match is best-match, not first-match, so find a * "best" match. Iterate across, testing each potential group * match. Make sure we keep track of whether we found a match * or not, so that we know if we can move on to ACL_OTHER. */ group_matched = 0; for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_GROUP_OBJ: + if (file_gid != cred->cr_groups[0]) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + + if ((acc_mode & dac_granted) == acc_mode) + return (0); + + group_matched = 1; + break; + case ACL_GROUP: - if (groupmember(acl->acl_entry[i].ae_id, cred)) { - dac_granted = 0; - if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) - dac_granted |= VEXEC; - if (acl->acl_entry[i].ae_perm & ACL_READ) - dac_granted |= VREAD; - if (acl->acl_entry[i].ae_perm & ACL_WRITE) - dac_granted |= VWRITE; - dac_granted &= acl_mask_granted; + if (!groupmember(acl->acl_entry[i].ae_id, cred)) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & 
ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; - if ((acc_mode & dac_granted) == acc_mode) - return (0); + if ((acc_mode & dac_granted) == acc_mode) + return (0); - group_matched = 1; - } + group_matched = 1; + break; + default: } } if (group_matched == 1) { /* * There was a match, but it did not grant rights via * pure DAC. Try again, this time with privilege. */ for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_GROUP_OBJ: - case ACL_GROUP: - if (groupmember(acl->acl_entry[i].ae_id, - cred)) { - dac_granted = 0; - if (acl->acl_entry[i].ae_perm & - ACL_EXECUTE) + if (file_gid != cred->cr_groups[0]) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; - if (acl->acl_entry[i].ae_perm & - ACL_READ) - dac_granted |= VREAD; - if (acl->acl_entry[i].ae_perm & - ACL_WRITE) - dac_granted |= VWRITE; - dac_granted &= acl_mask_granted; - if ((acc_mode & (dac_granted | - cap_granted)) == acc_mode) { - if (privused != NULL) - *privused = 1; - return (0); - } - } + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + + if ((acc_mode & (dac_granted | cap_granted)) != + acc_mode) + break; + + if (privused != NULL) + *privused = 1; + return (0); + + case ACL_GROUP: + if (!groupmember(acl->acl_entry[i].ae_id, + cred)) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + + if ((acc_mode & (dac_granted | cap_granted)) != + acc_mode) + break; + + if (privused != NULL) + *privused = 1; + return (0); + default: } } /* * Even with privilege, group membership was not sufficient. 
* Return failure. */ goto error; } /* * Fall back on ACL_OTHER. ACL_MASK is not applied to ACL_OTHER. */ dac_granted = 0; if (acl_other->ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl_other->ae_perm & ACL_READ) dac_granted |= VREAD; if (acl_other->ae_perm & ACL_WRITE) dac_granted |= VWRITE; if ((acc_mode & dac_granted) == acc_mode) return (0); if ((acc_mode & (dac_granted | cap_granted)) == acc_mode) { if (privused != NULL) *privused = 1; return (0); } error: return ((acc_mode & VADMIN) ? EPERM : EACCES); } /* * For the purposes of file systems maintaining the _OBJ entries in an * inode with a mode_t field, this routine converts a mode_t entry * to an acl_perm_t. */ acl_perm_t acl_posix1e_mode_to_perm(acl_tag_t tag, mode_t mode) { acl_perm_t perm = 0; switch(tag) { case ACL_USER_OBJ: if (mode & S_IXUSR) perm |= ACL_EXECUTE; if (mode & S_IRUSR) perm |= ACL_READ; if (mode & S_IWUSR) perm |= ACL_WRITE; return (perm); case ACL_GROUP_OBJ: if (mode & S_IXGRP) perm |= ACL_EXECUTE; if (mode & S_IRGRP) perm |= ACL_READ; if (mode & S_IWGRP) perm |= ACL_WRITE; return (perm); case ACL_OTHER: if (mode & S_IXOTH) perm |= ACL_EXECUTE; if (mode & S_IROTH) perm |= ACL_READ; if (mode & S_IWOTH) perm |= ACL_WRITE; return (perm); default: printf("acl_posix1e_mode_to_perm: invalid tag (%d)\n", tag); return (0); } } /* * Given inode information (uid, gid, mode), return an acl entry of the * appropriate type. */ struct acl_entry acl_posix1e_mode_to_entry(acl_tag_t tag, uid_t uid, gid_t gid, mode_t mode) { struct acl_entry acl_entry; acl_entry.ae_tag = tag; acl_entry.ae_perm = acl_posix1e_mode_to_perm(tag, mode); switch(tag) { case ACL_USER_OBJ: acl_entry.ae_id = uid; break; case ACL_GROUP_OBJ: acl_entry.ae_id = gid; break; case ACL_OTHER: acl_entry.ae_id = 0; break; default: acl_entry.ae_id = 0; printf("acl_posix1e_mode_to_entry: invalid tag (%d)\n", tag); } return (acl_entry); } /* * Utility function to generate a file mode given appropriate ACL entries. 
*/ mode_t acl_posix1e_perms_to_mode(struct acl_entry *acl_user_obj_entry, struct acl_entry *acl_group_obj_entry, struct acl_entry *acl_other_entry) { mode_t mode; mode = 0; if (acl_user_obj_entry->ae_perm & ACL_EXECUTE) mode |= S_IXUSR; if (acl_user_obj_entry->ae_perm & ACL_READ) mode |= S_IRUSR; if (acl_user_obj_entry->ae_perm & ACL_WRITE) mode |= S_IWUSR; if (acl_group_obj_entry->ae_perm & ACL_EXECUTE) mode |= S_IXGRP; if (acl_group_obj_entry->ae_perm & ACL_READ) mode |= S_IRGRP; if (acl_group_obj_entry->ae_perm & ACL_WRITE) mode |= S_IWGRP; if (acl_other_entry->ae_perm & ACL_EXECUTE) mode |= S_IXOTH; if (acl_other_entry->ae_perm & ACL_READ) mode |= S_IROTH; if (acl_other_entry->ae_perm & ACL_WRITE) mode |= S_IWOTH; return (mode); } /* * Perform a syntactic check of the ACL, sufficient to allow an * implementing file system to determine if it should accept this and * rely on the POSIX.1e ACL properties. */ int acl_posix1e_check(struct acl *acl) { int num_acl_user_obj, num_acl_user, num_acl_group_obj, num_acl_group; int num_acl_mask, num_acl_other, i; /* * Verify that the number of entries does not exceed the maximum * defined for acl_t. * Verify that the correct number of various sorts of ae_tags are * present: * Exactly one ACL_USER_OBJ * Exactly one ACL_GROUP_OBJ * Exactly one ACL_OTHER * If any ACL_USER or ACL_GROUP entries appear, then exactly one * ACL_MASK entry must also appear. * Verify that all ae_perm entries are in ACL_PERM_BITS. * Verify all ae_tag entries are understood by this implementation. * Note: Does not check for uniqueness of qualifier (ae_id) field. */ num_acl_user_obj = num_acl_user = num_acl_group_obj = num_acl_group = num_acl_mask = num_acl_other = 0; if (acl->acl_cnt > ACL_MAX_ENTRIES || acl->acl_cnt < 0) return (EINVAL); for (i = 0; i < acl->acl_cnt; i++) { /* * Check for a valid tag. 
*/ switch(acl->acl_entry[i].ae_tag) { case ACL_USER_OBJ: + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ + if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) + return (EINVAL); num_acl_user_obj++; break; case ACL_GROUP_OBJ: + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ + if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) + return (EINVAL); num_acl_group_obj++; break; case ACL_USER: + if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID) + return (EINVAL); num_acl_user++; break; case ACL_GROUP: + if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID) + return (EINVAL); num_acl_group++; break; case ACL_OTHER: + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ + if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) + return (EINVAL); num_acl_other++; break; case ACL_MASK: + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ + if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) + return (EINVAL); num_acl_mask++; break; default: return (EINVAL); } /* * Check for valid perm entries. */ if ((acl->acl_entry[i].ae_perm | ACL_PERM_BITS) != ACL_PERM_BITS) return (EINVAL); } if ((num_acl_user_obj != 1) || (num_acl_group_obj != 1) || (num_acl_other != 1) || (num_acl_mask != 0 && num_acl_mask != 1)) return (EINVAL); if (((num_acl_group != 0) || (num_acl_user != 0)) && (num_acl_mask != 1)) return (EINVAL); return (0); } /* * These calls wrap the real vnode operations, and are called by the * syscall code once the syscall has converted the path or file * descriptor to a vnode (unlocked). The aclp pointer is assumed * still to point to userland, so this should not be consumed within * the kernel except by syscall code. Other code should directly * invoke VOP_{SET,GET}ACL. */ /* * Given a vnode, set its ACL. 
*/ static int vacl_set_acl(struct proc *p, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernacl; int error; error = copyin(aclp, &inkernacl, sizeof(struct acl)); if (error) return(error); VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); error = VOP_SETACL(vp, type, &inkernacl, p->p_ucred, p); VOP_UNLOCK(vp, 0, p); return(error); } /* * Given a vnode, get its ACL. */ static int vacl_get_acl(struct proc *p, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernelacl; int error; VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); error = VOP_GETACL(vp, type, &inkernelacl, p->p_ucred, p); VOP_UNLOCK(vp, 0, p); if (error == 0) error = copyout(&inkernelacl, aclp, sizeof(struct acl)); return (error); } /* * Given a vnode, delete its ACL. */ static int vacl_delete(struct proc *p, struct vnode *vp, acl_type_t type) { int error; VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); error = VOP_SETACL(vp, ACL_TYPE_DEFAULT, 0, p->p_ucred, p); VOP_UNLOCK(vp, 0, p); return (error); } /* * Given a vnode, check whether an ACL is appropriate for it */ static int vacl_aclcheck(struct proc *p, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernelacl; int error; error = copyin(aclp, &inkernelacl, sizeof(struct acl)); if (error) return(error); error = VOP_ACLCHECK(vp, type, &inkernelacl, p->p_ucred, p); return (error); } /* * syscalls -- convert the path/fd to a vnode, and call vacl_whatever. * Don't need to lock, as the vacl_ code will get/release any locks * required. */ /* * Given a file path, get an ACL for it */ int __acl_get_file(struct proc *p, struct __acl_get_file_args *uap) { struct nameidata nd; int error; /* what flags are required here -- possible not LOCKLEAF? 
*/ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); error = namei(&nd); if (error) return(error); error = vacl_get_acl(p, nd.ni_vp, SCARG(uap, type), SCARG(uap, aclp)); NDFREE(&nd, 0); return (error); } /* * Given a file path, set an ACL for it */ int __acl_set_file(struct proc *p, struct __acl_set_file_args *uap) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); error = namei(&nd); if (error) return(error); error = vacl_set_acl(p, nd.ni_vp, SCARG(uap, type), SCARG(uap, aclp)); NDFREE(&nd, 0); return (error); } /* * Given a file descriptor, get an ACL for it */ int __acl_get_fd(struct proc *p, struct __acl_get_fd_args *uap) { struct file *fp; int error; error = getvnode(p->p_fd, SCARG(uap, filedes), &fp); if (error) return(error); return vacl_get_acl(p, (struct vnode *)fp->f_data, SCARG(uap, type), SCARG(uap, aclp)); } /* * Given a file descriptor, set an ACL for it */ int __acl_set_fd(struct proc *p, struct __acl_set_fd_args *uap) { struct file *fp; int error; error = getvnode(p->p_fd, SCARG(uap, filedes), &fp); if (error) return(error); return vacl_set_acl(p, (struct vnode *)fp->f_data, SCARG(uap, type), SCARG(uap, aclp)); } /* * Given a file path, delete an ACL from it. */ int __acl_delete_file(struct proc *p, struct __acl_delete_file_args *uap) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); error = namei(&nd); if (error) return(error); error = vacl_delete(p, nd.ni_vp, SCARG(uap, type)); NDFREE(&nd, 0); return (error); } /* * Given a file path, delete an ACL from it. 
*/ int __acl_delete_fd(struct proc *p, struct __acl_delete_fd_args *uap) { struct file *fp; int error; error = getvnode(p->p_fd, SCARG(uap, filedes), &fp); if (error) return(error); error = vacl_delete(p, (struct vnode *)fp->f_data, SCARG(uap, type)); return (error); } /* * Given a file path, check an ACL for it */ int __acl_aclcheck_file(struct proc *p, struct __acl_aclcheck_file_args *uap) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); error = namei(&nd); if (error) return(error); error = vacl_aclcheck(p, nd.ni_vp, SCARG(uap, type), SCARG(uap, aclp)); NDFREE(&nd, 0); return (error); } /* * Given a file descriptor, check an ACL for it */ int __acl_aclcheck_fd(struct proc *p, struct __acl_aclcheck_fd_args *uap) { struct file *fp; int error; error = getvnode(p->p_fd, SCARG(uap, filedes), &fp); if (error) return(error); return vacl_aclcheck(p, (struct vnode *)fp->f_data, SCARG(uap, type), SCARG(uap, aclp)); } Index: head/sys/kern/vfs_acl.c =================================================================== --- head/sys/kern/vfs_acl.c (revision 75570) +++ head/sys/kern/vfs_acl.c (revision 75571) @@ -1,714 +1,768 @@ /*- * Copyright (c) 1999, 2000, 2001 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Developed by the TrustedBSD Project. * Support for POSIX.1e access control lists. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_ACL, "acl", "access control list"); static int vacl_set_acl(struct proc *p, struct vnode *vp, acl_type_t type, struct acl *aclp); static int vacl_get_acl(struct proc *p, struct vnode *vp, acl_type_t type, struct acl *aclp); -static int vacl_aclcheck(struct proc *p, struct vnode *vp, acl_type_t type, - struct acl *aclp); +static int vacl_aclcheck(struct proc *p, struct vnode *vp, + acl_type_t type, struct acl *aclp); /* * Implement a version of vaccess() that understands POSIX.1e ACL semantics. * Return 0 on success, else an errno value. Should be merged into * vaccess() eventually. */ int -vaccess_acl_posix1e(enum vtype type, struct acl *acl, mode_t acc_mode, - struct ucred *cred, int *privused) +vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid, + struct acl *acl, mode_t acc_mode, struct ucred *cred, int *privused) { struct acl_entry *acl_other, *acl_mask; mode_t dac_granted; mode_t cap_granted; mode_t acl_mask_granted; int group_matched, i; /* * Look for a normal, non-privileged way to access the file/directory * as requested. If it exists, go with that. Otherwise, attempt * to use privileges granted via cap_granted. 
In some cases, * which privileges to use may be ambiguous due to "best match", * in which case fall back on first match for the time being. */ if (privused != NULL) *privused = 0; /* * Determine privileges now, but don't apply until we've found * a DAC match that has failed to allow access. */ #ifndef CAPABILITIES if (suser_xxx(cred, NULL, PRISON_ROOT) == 0) cap_granted = (VEXEC | VREAD | VWRITE | VADMIN); else cap_granted = 0; #else cap_granted = 0; if (type == VDIR) { if ((acc_mode & VEXEC) && !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT)) cap_granted |= VEXEC; } else { if ((acc_mode & VEXEC) && !cap_check(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT)) cap_granted |= VEXEC; } if ((acc_mode & VREAD) && !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT)) cap_granted |= VREAD; if ((acc_mode & VWRITE) && !cap_check(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT)) cap_granted |= VWRITE; if ((acc_mode & VADMIN) && !cap_check(cred, NULL, CAP_FOWNER, PRISON_ROOT)) cap_granted |= VADMIN; #endif /* CAPABILITIES */ /* * Check the owner. * Also, record locations of ACL_MASK and ACL_OTHER for reference * later if the owner doesn't match. 
*/ acl_mask = acl_other = NULL; for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_USER_OBJ: - if (acl->acl_entry[i].ae_id != cred->cr_uid) + if (file_uid != cred->cr_uid) break; dac_granted = 0; dac_granted |= VADMIN; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; if ((acc_mode & dac_granted) == acc_mode) return (0); if ((acc_mode & (dac_granted | cap_granted)) == acc_mode) { if (privused != NULL) *privused = 1; return (0); } goto error; case ACL_MASK: acl_mask = &acl->acl_entry[i]; break; case ACL_OTHER: acl_other = &acl->acl_entry[i]; break; default: } } /* * Checks against ACL_USER, ACL_GROUP_OBJ, and ACL_GROUP fields * are masked by an ACL_MASK entry, if any. As such, first identify * the ACL_MASK field, then iterate through identifying potential * user matches, then group matches. If there is no ACL_MASK, * assume that the mask allows all requests to succeed. * Also keep track of the location of ACL_OTHER for later consumption. */ if (acl_other == NULL) { /* * XXX: This should never happen. Only properly formatted * ACLs should be passed to vaccess_acl_posix1e. * Should make this a panic post-debugging. */ printf("vaccess_acl_posix1e: ACL_OTHER missing\n"); return (EPERM); } if (acl_mask != NULL) { acl_mask_granted = 0; if (acl_mask->ae_perm & ACL_EXECUTE) acl_mask_granted |= VEXEC; if (acl_mask->ae_perm & ACL_READ) acl_mask_granted |= VREAD; if (acl_mask->ae_perm & ACL_WRITE) acl_mask_granted |= VWRITE; } else acl_mask_granted = VEXEC | VREAD | VWRITE; /* * We have to check each type even if we know ACL_MASK will reject, * as we need to know what match there might have been, and * therefore what further types we might be allowed to check. * Do the checks twice -- once without privilege, and a second time * with, if there was a match. */ /* * Check ACL_USER ACL entries. 
*/ for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_USER: if (acl->acl_entry[i].ae_id != cred->cr_uid) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; dac_granted &= acl_mask_granted; if ((acc_mode & dac_granted) == acc_mode) return (0); - if ((acc_mode & (dac_granted | cap_granted)) == - acc_mode) { - if (privused != NULL) - *privused = 1; - return (0); - } - goto error; + if ((acc_mode & (dac_granted | cap_granted)) != + acc_mode) + goto error; + + if (privused != NULL) + *privused = 1; + return (0); } } /* * Group match is best-match, not first-match, so find a * "best" match. Iterate across, testing each potential group * match. Make sure we keep track of whether we found a match * or not, so that we know if we can move on to ACL_OTHER. */ group_matched = 0; for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_GROUP_OBJ: + if (file_gid != cred->cr_groups[0]) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + + if ((acc_mode & dac_granted) == acc_mode) + return (0); + + group_matched = 1; + break; + case ACL_GROUP: - if (groupmember(acl->acl_entry[i].ae_id, cred)) { - dac_granted = 0; - if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) - dac_granted |= VEXEC; - if (acl->acl_entry[i].ae_perm & ACL_READ) - dac_granted |= VREAD; - if (acl->acl_entry[i].ae_perm & ACL_WRITE) - dac_granted |= VWRITE; - dac_granted &= acl_mask_granted; + if (!groupmember(acl->acl_entry[i].ae_id, cred)) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & 
ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; - if ((acc_mode & dac_granted) == acc_mode) - return (0); + if ((acc_mode & dac_granted) == acc_mode) + return (0); - group_matched = 1; - } + group_matched = 1; + break; + default: } } if (group_matched == 1) { /* * There was a match, but it did not grant rights via * pure DAC. Try again, this time with privilege. */ for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_GROUP_OBJ: - case ACL_GROUP: - if (groupmember(acl->acl_entry[i].ae_id, - cred)) { - dac_granted = 0; - if (acl->acl_entry[i].ae_perm & - ACL_EXECUTE) + if (file_gid != cred->cr_groups[0]) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; - if (acl->acl_entry[i].ae_perm & - ACL_READ) - dac_granted |= VREAD; - if (acl->acl_entry[i].ae_perm & - ACL_WRITE) - dac_granted |= VWRITE; - dac_granted &= acl_mask_granted; - if ((acc_mode & (dac_granted | - cap_granted)) == acc_mode) { - if (privused != NULL) - *privused = 1; - return (0); - } - } + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + + if ((acc_mode & (dac_granted | cap_granted)) != + acc_mode) + break; + + if (privused != NULL) + *privused = 1; + return (0); + + case ACL_GROUP: + if (!groupmember(acl->acl_entry[i].ae_id, + cred)) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + + if ((acc_mode & (dac_granted | cap_granted)) != + acc_mode) + break; + + if (privused != NULL) + *privused = 1; + return (0); + default: } } /* * Even with privilege, group membership was not sufficient. 
* Return failure. */ goto error; } /* * Fall back on ACL_OTHER. ACL_MASK is not applied to ACL_OTHER. */ dac_granted = 0; if (acl_other->ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl_other->ae_perm & ACL_READ) dac_granted |= VREAD; if (acl_other->ae_perm & ACL_WRITE) dac_granted |= VWRITE; if ((acc_mode & dac_granted) == acc_mode) return (0); if ((acc_mode & (dac_granted | cap_granted)) == acc_mode) { if (privused != NULL) *privused = 1; return (0); } error: return ((acc_mode & VADMIN) ? EPERM : EACCES); } /* * For the purposes of file systems maintaining the _OBJ entries in an * inode with a mode_t field, this routine converts a mode_t entry * to an acl_perm_t. */ acl_perm_t acl_posix1e_mode_to_perm(acl_tag_t tag, mode_t mode) { acl_perm_t perm = 0; switch(tag) { case ACL_USER_OBJ: if (mode & S_IXUSR) perm |= ACL_EXECUTE; if (mode & S_IRUSR) perm |= ACL_READ; if (mode & S_IWUSR) perm |= ACL_WRITE; return (perm); case ACL_GROUP_OBJ: if (mode & S_IXGRP) perm |= ACL_EXECUTE; if (mode & S_IRGRP) perm |= ACL_READ; if (mode & S_IWGRP) perm |= ACL_WRITE; return (perm); case ACL_OTHER: if (mode & S_IXOTH) perm |= ACL_EXECUTE; if (mode & S_IROTH) perm |= ACL_READ; if (mode & S_IWOTH) perm |= ACL_WRITE; return (perm); default: printf("acl_posix1e_mode_to_perm: invalid tag (%d)\n", tag); return (0); } } /* * Given inode information (uid, gid, mode), return an acl entry of the * appropriate type. */ struct acl_entry acl_posix1e_mode_to_entry(acl_tag_t tag, uid_t uid, gid_t gid, mode_t mode) { struct acl_entry acl_entry; acl_entry.ae_tag = tag; acl_entry.ae_perm = acl_posix1e_mode_to_perm(tag, mode); switch(tag) { case ACL_USER_OBJ: acl_entry.ae_id = uid; break; case ACL_GROUP_OBJ: acl_entry.ae_id = gid; break; case ACL_OTHER: acl_entry.ae_id = 0; break; default: acl_entry.ae_id = 0; printf("acl_posix1e_mode_to_entry: invalid tag (%d)\n", tag); } return (acl_entry); } /* * Utility function to generate a file mode given appropriate ACL entries. 
*/ mode_t acl_posix1e_perms_to_mode(struct acl_entry *acl_user_obj_entry, struct acl_entry *acl_group_obj_entry, struct acl_entry *acl_other_entry) { mode_t mode; mode = 0; if (acl_user_obj_entry->ae_perm & ACL_EXECUTE) mode |= S_IXUSR; if (acl_user_obj_entry->ae_perm & ACL_READ) mode |= S_IRUSR; if (acl_user_obj_entry->ae_perm & ACL_WRITE) mode |= S_IWUSR; if (acl_group_obj_entry->ae_perm & ACL_EXECUTE) mode |= S_IXGRP; if (acl_group_obj_entry->ae_perm & ACL_READ) mode |= S_IRGRP; if (acl_group_obj_entry->ae_perm & ACL_WRITE) mode |= S_IWGRP; if (acl_other_entry->ae_perm & ACL_EXECUTE) mode |= S_IXOTH; if (acl_other_entry->ae_perm & ACL_READ) mode |= S_IROTH; if (acl_other_entry->ae_perm & ACL_WRITE) mode |= S_IWOTH; return (mode); } /* * Perform a syntactic check of the ACL, sufficient to allow an * implementing file system to determine if it should accept this and * rely on the POSIX.1e ACL properties. */ int acl_posix1e_check(struct acl *acl) { int num_acl_user_obj, num_acl_user, num_acl_group_obj, num_acl_group; int num_acl_mask, num_acl_other, i; /* * Verify that the number of entries does not exceed the maximum * defined for acl_t. * Verify that the correct number of various sorts of ae_tags are * present: * Exactly one ACL_USER_OBJ * Exactly one ACL_GROUP_OBJ * Exactly one ACL_OTHER * If any ACL_USER or ACL_GROUP entries appear, then exactly one * ACL_MASK entry must also appear. * Verify that all ae_perm entries are in ACL_PERM_BITS. * Verify all ae_tag entries are understood by this implementation. * Note: Does not check for uniqueness of qualifier (ae_id) field. */ num_acl_user_obj = num_acl_user = num_acl_group_obj = num_acl_group = num_acl_mask = num_acl_other = 0; if (acl->acl_cnt > ACL_MAX_ENTRIES || acl->acl_cnt < 0) return (EINVAL); for (i = 0; i < acl->acl_cnt; i++) { /* * Check for a valid tag. 
*/ switch(acl->acl_entry[i].ae_tag) { case ACL_USER_OBJ: + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ + if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) + return (EINVAL); num_acl_user_obj++; break; case ACL_GROUP_OBJ: + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ + if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) + return (EINVAL); num_acl_group_obj++; break; case ACL_USER: + if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID) + return (EINVAL); num_acl_user++; break; case ACL_GROUP: + if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID) + return (EINVAL); num_acl_group++; break; case ACL_OTHER: + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ + if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) + return (EINVAL); num_acl_other++; break; case ACL_MASK: + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ + if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) + return (EINVAL); num_acl_mask++; break; default: return (EINVAL); } /* * Check for valid perm entries. */ if ((acl->acl_entry[i].ae_perm | ACL_PERM_BITS) != ACL_PERM_BITS) return (EINVAL); } if ((num_acl_user_obj != 1) || (num_acl_group_obj != 1) || (num_acl_other != 1) || (num_acl_mask != 0 && num_acl_mask != 1)) return (EINVAL); if (((num_acl_group != 0) || (num_acl_user != 0)) && (num_acl_mask != 1)) return (EINVAL); return (0); } /* * These calls wrap the real vnode operations, and are called by the * syscall code once the syscall has converted the path or file * descriptor to a vnode (unlocked). The aclp pointer is assumed * still to point to userland, so this should not be consumed within * the kernel except by syscall code. Other code should directly * invoke VOP_{SET,GET}ACL. */ /* * Given a vnode, set its ACL. 
*/ static int vacl_set_acl(struct proc *p, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernacl; int error; error = copyin(aclp, &inkernacl, sizeof(struct acl)); if (error) return(error); VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); error = VOP_SETACL(vp, type, &inkernacl, p->p_ucred, p); VOP_UNLOCK(vp, 0, p); return(error); } /* * Given a vnode, get its ACL. */ static int vacl_get_acl(struct proc *p, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernelacl; int error; VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); error = VOP_GETACL(vp, type, &inkernelacl, p->p_ucred, p); VOP_UNLOCK(vp, 0, p); if (error == 0) error = copyout(&inkernelacl, aclp, sizeof(struct acl)); return (error); } /* * Given a vnode, delete its ACL. */ static int vacl_delete(struct proc *p, struct vnode *vp, acl_type_t type) { int error; VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); error = VOP_SETACL(vp, ACL_TYPE_DEFAULT, 0, p->p_ucred, p); VOP_UNLOCK(vp, 0, p); return (error); } /* * Given a vnode, check whether an ACL is appropriate for it */ static int vacl_aclcheck(struct proc *p, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernelacl; int error; error = copyin(aclp, &inkernelacl, sizeof(struct acl)); if (error) return(error); error = VOP_ACLCHECK(vp, type, &inkernelacl, p->p_ucred, p); return (error); } /* * syscalls -- convert the path/fd to a vnode, and call vacl_whatever. * Don't need to lock, as the vacl_ code will get/release any locks * required. */ /* * Given a file path, get an ACL for it */ int __acl_get_file(struct proc *p, struct __acl_get_file_args *uap) { struct nameidata nd; int error; /* what flags are required here -- possible not LOCKLEAF? 
*/ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); error = namei(&nd); if (error) return(error); error = vacl_get_acl(p, nd.ni_vp, SCARG(uap, type), SCARG(uap, aclp)); NDFREE(&nd, 0); return (error); } /* * Given a file path, set an ACL for it */ int __acl_set_file(struct proc *p, struct __acl_set_file_args *uap) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); error = namei(&nd); if (error) return(error); error = vacl_set_acl(p, nd.ni_vp, SCARG(uap, type), SCARG(uap, aclp)); NDFREE(&nd, 0); return (error); } /* * Given a file descriptor, get an ACL for it */ int __acl_get_fd(struct proc *p, struct __acl_get_fd_args *uap) { struct file *fp; int error; error = getvnode(p->p_fd, SCARG(uap, filedes), &fp); if (error) return(error); return vacl_get_acl(p, (struct vnode *)fp->f_data, SCARG(uap, type), SCARG(uap, aclp)); } /* * Given a file descriptor, set an ACL for it */ int __acl_set_fd(struct proc *p, struct __acl_set_fd_args *uap) { struct file *fp; int error; error = getvnode(p->p_fd, SCARG(uap, filedes), &fp); if (error) return(error); return vacl_set_acl(p, (struct vnode *)fp->f_data, SCARG(uap, type), SCARG(uap, aclp)); } /* * Given a file path, delete an ACL from it. */ int __acl_delete_file(struct proc *p, struct __acl_delete_file_args *uap) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); error = namei(&nd); if (error) return(error); error = vacl_delete(p, nd.ni_vp, SCARG(uap, type)); NDFREE(&nd, 0); return (error); } /* * Given a file path, delete an ACL from it. 
*/ int __acl_delete_fd(struct proc *p, struct __acl_delete_fd_args *uap) { struct file *fp; int error; error = getvnode(p->p_fd, SCARG(uap, filedes), &fp); if (error) return(error); error = vacl_delete(p, (struct vnode *)fp->f_data, SCARG(uap, type)); return (error); } /* * Given a file path, check an ACL for it */ int __acl_aclcheck_file(struct proc *p, struct __acl_aclcheck_file_args *uap) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); error = namei(&nd); if (error) return(error); error = vacl_aclcheck(p, nd.ni_vp, SCARG(uap, type), SCARG(uap, aclp)); NDFREE(&nd, 0); return (error); } /* * Given a file descriptor, check an ACL for it */ int __acl_aclcheck_fd(struct proc *p, struct __acl_aclcheck_fd_args *uap) { struct file *fp; int error; error = getvnode(p->p_fd, SCARG(uap, filedes), &fp); if (error) return(error); return vacl_aclcheck(p, (struct vnode *)fp->f_data, SCARG(uap, type), SCARG(uap, aclp)); } Index: head/sys/sys/vnode.h =================================================================== --- head/sys/sys/vnode.h (revision 75570) +++ head/sys/sys/vnode.h (revision 75571) @@ -1,659 +1,660 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. 
* 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vnode.h 8.7 (Berkeley) 2/4/94 * $FreeBSD$ */ #ifndef _SYS_VNODE_H_ #define _SYS_VNODE_H_ #include #include #include #include #include #include /* * The vnode is the focus of all file activity in UNIX. There is a * unique vnode allocated for each active file, each current directory, * each mounted-on file, text file, and the root. */ /* * Vnode types. VNON means no type. */ enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD }; /* * Vnode tag types. * These are for the benefit of external programs only (e.g., pstat) * and should NEVER be inspected by the kernel. */ enum vtagtype { VT_NON, VT_UFS, VT_NFS, VT_MFS, VT_PC, VT_LFS, VT_LOFS, VT_FDESC, VT_PORTAL, VT_NULL, VT_UMAP, VT_KERNFS, VT_PROCFS, VT_AFS, VT_ISOFS, VT_UNION, VT_MSDOSFS, VT_DEVFS, VT_TFS, VT_VFS, VT_CODA, VT_NTFS, VT_HPFS, VT_NWFS, VT_PSEUDOFS, VT_SMBFS }; /* * Each underlying filesystem allocates its own private area and hangs * it from v_data. If non-null, this area is freed in getnewvnode(). 
*/ TAILQ_HEAD(buflists, buf); typedef int vop_t __P((void *)); struct namecache; /* * Reading or writing any of these items requires holding the appropriate lock. * v_freelist is locked by the global vnode_free_list mutex. * v_mntvnodes is locked by the global mntvnodes mutex. * v_flag, v_usecount, v_holdcount and v_writecount are * locked by the v_interlock mutex. * v_pollinfo is locked by the lock contained inside it. */ struct vnode { u_long v_flag; /* vnode flags (see below) */ int v_usecount; /* reference count of users */ int v_writecount; /* reference count of writers */ int v_holdcnt; /* page & buffer references */ u_long v_id; /* capability identifier */ struct mount *v_mount; /* ptr to vfs we are in */ vop_t **v_op; /* vnode operations vector */ TAILQ_ENTRY(vnode) v_freelist; /* vnode freelist */ LIST_ENTRY(vnode) v_mntvnodes; /* vnodes for mount point */ struct buflists v_cleanblkhd; /* clean blocklist head */ struct buflists v_dirtyblkhd; /* dirty blocklist head */ LIST_ENTRY(vnode) v_synclist; /* vnodes with dirty buffers */ long v_numoutput; /* num of writes in progress */ enum vtype v_type; /* vnode type */ union { struct mount *vu_mountedhere;/* ptr to mounted vfs (VDIR) */ struct socket *vu_socket; /* unix ipc (VSOCK) */ struct { struct specinfo *vu_specinfo; /* device (VCHR, VBLK) */ SLIST_ENTRY(vnode) vu_specnext; } vu_spec; struct fifoinfo *vu_fifoinfo; /* fifo (VFIFO) */ } v_un; struct nqlease *v_lease; /* Soft reference to lease */ daddr_t v_lastw; /* last write (write cluster) */ daddr_t v_cstart; /* start block of cluster */ daddr_t v_lasta; /* last allocation */ int v_clen; /* length of current cluster */ struct vm_object *v_object; /* Place to store VM object */ struct mtx v_interlock; /* lock on usecount and flag */ struct lock v_lock; /* used if fs don't have one */ struct lock *v_vnlock; /* pointer to vnode lock */ enum vtagtype v_tag; /* type of underlying data */ void *v_data; /* private data for fs */ LIST_HEAD(, namecache) 
v_cache_src; /* Cache entries from us */ TAILQ_HEAD(, namecache) v_cache_dst; /* Cache entries to us */ struct vnode *v_dd; /* .. vnode */ u_long v_ddid; /* .. capability identifier */ struct { struct mtx vpi_lock; /* lock to protect below */ struct selinfo vpi_selinfo; /* identity of poller(s) */ short vpi_events; /* what they are looking for */ short vpi_revents; /* what has happened */ } v_pollinfo; struct proc *v_vxproc; /* proc owning VXLOCK */ #ifdef DEBUG_LOCKS const char *filename; /* Source file doing locking */ int line; /* Line number doing locking */ #endif }; #define v_mountedhere v_un.vu_mountedhere #define v_socket v_un.vu_socket #define v_rdev v_un.vu_spec.vu_specinfo #define v_specnext v_un.vu_spec.vu_specnext #define v_fifoinfo v_un.vu_fifoinfo #define VN_POLLEVENT(vp, events) \ do { \ if ((vp)->v_pollinfo.vpi_events & (events)) \ vn_pollevent((vp), (events)); \ } while (0) /* * Vnode flags. */ #define VROOT 0x00001 /* root of its file system */ #define VTEXT 0x00002 /* vnode is a pure text prototype */ #define VSYSTEM 0x00004 /* vnode being used by kernel */ #define VISTTY 0x00008 /* vnode represents a tty */ #define VXLOCK 0x00100 /* vnode is locked to change underlying type */ #define VXWANT 0x00200 /* process is waiting for vnode */ #define VBWAIT 0x00400 /* waiting for output to complete */ /* open for business 0x00800 */ /* open for business 0x01000 */ #define VOBJBUF 0x02000 /* Allocate buffers in VM object */ #define VCOPYONWRITE 0x04000 /* vnode is doing copy-on-write */ #define VAGE 0x08000 /* Insert vnode at head of free list */ #define VOLOCK 0x10000 /* vnode is locked waiting for an object */ #define VOWANT 0x20000 /* a process is waiting for VOLOCK */ #define VDOOMED 0x40000 /* This vnode is being recycled */ #define VFREE 0x80000 /* This vnode is on the freelist */ /* open for business 0x100000 */ #define VONWORKLST 0x200000 /* On syncer work-list */ #define VMOUNT 0x400000 /* Mount in progress */ /* * Vnode attributes. 
A field value of VNOVAL represents a field whose value * is unavailable (getattr) or which is not to be changed (setattr). */ struct vattr { enum vtype va_type; /* vnode type (for create) */ u_short va_mode; /* files access mode and type */ short va_nlink; /* number of references to file */ uid_t va_uid; /* owner user id */ gid_t va_gid; /* owner group id */ udev_t va_fsid; /* file system id */ long va_fileid; /* file id */ u_quad_t va_size; /* file size in bytes */ long va_blocksize; /* blocksize preferred for i/o */ struct timespec va_atime; /* time of last access */ struct timespec va_mtime; /* time of last modification */ struct timespec va_ctime; /* time file changed */ u_long va_gen; /* generation number of file */ u_long va_flags; /* flags defined for file */ udev_t va_rdev; /* device the special file represents */ u_quad_t va_bytes; /* bytes of disk space held by file */ u_quad_t va_filerev; /* file modification number */ u_int va_vaflags; /* operations flags, see below */ long va_spare; /* remain quad aligned */ }; /* * Flags for va_vaflags. */ #define VA_UTIMES_NULL 0x01 /* utimes argument was NULL */ #define VA_EXCLUSIVE 0x02 /* exclusive create request */ /* * Flags for ioflag. (high 16 bits used to ask for read-ahead and * help with write clustering) */ #define IO_UNIT 0x01 /* do I/O as atomic unit */ #define IO_APPEND 0x02 /* append write to end */ #define IO_SYNC 0x04 /* do I/O synchronously */ #define IO_NODELOCKED 0x08 /* underlying node already locked */ #define IO_NDELAY 0x10 /* FNDELAY flag set in file table */ #define IO_VMIO 0x20 /* data already in VMIO space */ #define IO_INVAL 0x40 /* invalidate after I/O */ #define IO_ASYNC 0x80 /* bawrite rather then bdwrite */ /* * Modes. Some values same as Ixxx entries from inode.h for now. 
*/ #define VADMIN 010000 /* permission to administer vnode */ #define VSUID 004000 /* set user id on execution */ #define VSGID 002000 /* set group id on execution */ #define VSVTX 001000 /* save swapped text even after use */ #define VREAD 000400 /* read, write, execute permissions */ #define VWRITE 000200 #define VEXEC 000100 /* * Token indicating no attribute value yet assigned. */ #define VNOVAL (-1) #ifdef _KERNEL #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_VNODE); #endif /* * Convert between vnode types and inode formats (since POSIX.1 * defines mode word of stat structure in terms of inode formats). */ extern enum vtype iftovt_tab[]; extern int vttoif_tab[]; #define IFTOVT(mode) (iftovt_tab[((mode) & S_IFMT) >> 12]) #define VTTOIF(indx) (vttoif_tab[(int)(indx)]) #define MAKEIMODE(indx, mode) (int)(VTTOIF(indx) | (mode)) /* * Flags to various vnode functions. */ #define SKIPSYSTEM 0x0001 /* vflush: skip vnodes marked VSYSTEM */ #define FORCECLOSE 0x0002 /* vflush: force file closure */ #define WRITECLOSE 0x0004 /* vflush: only close writable files */ #define DOCLOSE 0x0008 /* vclean: close active files */ #define V_SAVE 0x0001 /* vinvalbuf: sync file first */ #define REVOKEALL 0x0001 /* vop_revoke: revoke all aliases */ #define V_WAIT 0x0001 /* vn_start_write: sleep for suspend */ #define V_NOWAIT 0x0002 /* vn_start_write: don't sleep for suspend */ #define V_XSLEEP 0x0004 /* vn_start_write: just return after sleep */ #define VREF(vp) vref(vp) #ifdef DIAGNOSTIC #define VATTR_NULL(vap) vattr_null(vap) #else #define VATTR_NULL(vap) (*(vap) = va_null) /* initialize a vattr */ #endif /* DIAGNOSTIC */ #define NULLVP ((struct vnode *)NULL) #define VNODEOP_SET(f) \ C_SYSINIT(f##init, SI_SUB_VFS, SI_ORDER_SECOND, vfs_add_vnodeops, &f); \ C_SYSUNINIT(f##uninit, SI_SUB_VFS, SI_ORDER_SECOND, vfs_rm_vnodeops, &f); /* * Global vnode data. */ extern struct vnode *rootvnode; /* root (i.e. 
"/") vnode */ extern int desiredvnodes; /* number of vnodes desired */ extern time_t syncdelay; /* max time to delay syncing data */ extern time_t filedelay; /* time to delay syncing files */ extern time_t dirdelay; /* time to delay syncing directories */ extern time_t metadelay; /* time to delay syncing metadata */ extern struct vm_zone *namei_zone; extern int prtactive; /* nonzero to call vprint() */ extern struct vattr va_null; /* predefined null vattr structure */ extern int vfs_ioopt; /* * Macro/function to check for client cache inconsistency w.r.t. leasing. */ #define LEASE_READ 0x1 /* Check lease for readers */ #define LEASE_WRITE 0x2 /* Check lease for modifiers */ extern void (*lease_updatetime) __P((int deltat)); #define VSHOULDFREE(vp) \ (!((vp)->v_flag & (VFREE|VDOOMED)) && \ !(vp)->v_holdcnt && !(vp)->v_usecount && \ (!(vp)->v_object || \ !((vp)->v_object->ref_count || (vp)->v_object->resident_page_count))) #define VSHOULDBUSY(vp) \ (((vp)->v_flag & VFREE) && \ ((vp)->v_holdcnt || (vp)->v_usecount)) #define VI_LOCK(vp) mtx_lock(&(vp)->v_interlock) #define VI_TRYLOCK(vp) mtx_trylock(&(vp)->v_interlock) #define VI_UNLOCK(vp) mtx_unlock(&(vp)->v_interlock) #endif /* _KERNEL */ /* * Mods for extensibility. */ /* * Flags for vdesc_flags: */ #define VDESC_MAX_VPS 16 /* Low order 16 flag bits are reserved for willrele flags for vp arguments. */ #define VDESC_VP0_WILLRELE 0x0001 #define VDESC_VP1_WILLRELE 0x0002 #define VDESC_VP2_WILLRELE 0x0004 #define VDESC_VP3_WILLRELE 0x0008 #define VDESC_NOMAP_VPP 0x0100 #define VDESC_VPP_WILLRELE 0x0200 /* * VDESC_NO_OFFSET is used to identify the end of the offset list * and in places where no such field exists. */ #define VDESC_NO_OFFSET -1 /* * This structure describes the vnode operation taking place. 
*/ struct vnodeop_desc { int vdesc_offset; /* offset in vector--first for speed */ char *vdesc_name; /* a readable name for debugging */ int vdesc_flags; /* VDESC_* flags */ /* * These ops are used by bypass routines to map and locate arguments. * Creds and procs are not needed in bypass routines, but sometimes * they are useful to (for example) transport layers. * Nameidata is useful because it has a cred in it. */ int *vdesc_vp_offsets; /* list ended by VDESC_NO_OFFSET */ int vdesc_vpp_offset; /* return vpp location */ int vdesc_cred_offset; /* cred location, if any */ int vdesc_proc_offset; /* proc location, if any */ int vdesc_componentname_offset; /* if any */ /* * Finally, we've got a list of private data (about each operation) * for each transport layer. (Support to manage this list is not * yet part of BSD.) */ caddr_t *vdesc_transports; }; #ifdef _KERNEL /* * A list of all the operation descs. */ extern struct vnodeop_desc *vnodeop_descs[]; /* * Interlock for scanning list of vnodes attached to a mountpoint */ extern struct mtx mntvnode_mtx; /* * This macro is very helpful in defining those offsets in the vdesc struct. * * This is stolen from X11R4. I ignored all the fancy stuff for * Crays, so if you decide to port this to such a serious machine, * you might want to consult Intrinsic.h's XtOffset{,Of,To}. */ #define VOPARG_OFFSET(p_type,field) \ ((int) (((char *) (&(((p_type)NULL)->field))) - ((char *) NULL))) #define VOPARG_OFFSETOF(s_type,field) \ VOPARG_OFFSET(s_type*,field) #define VOPARG_OFFSETTO(S_TYPE,S_OFFSET,STRUCT_P) \ ((S_TYPE)(((char*)(STRUCT_P))+(S_OFFSET))) /* * This structure is used to configure the new vnodeops vector. 
*/ struct vnodeopv_entry_desc { struct vnodeop_desc *opve_op; /* which operation this is */ vop_t *opve_impl; /* code implementing this operation */ }; struct vnodeopv_desc { /* ptr to the ptr to the vector where op should go */ vop_t ***opv_desc_vector_p; struct vnodeopv_entry_desc *opv_desc_ops; /* null terminated list */ }; /* * A generic structure. * This can be used by bypass routines to identify generic arguments. */ struct vop_generic_args { struct vnodeop_desc *a_desc; /* other random data follows, presumably */ }; #ifdef DEBUG_VFS_LOCKS /* * Macros to aid in tracing VFS locking problems. Not totally * reliable since if the process sleeps between changing the lock * state and checking it with the assert, some other process could * change the state. They are good enough for debugging a single * filesystem using a single-threaded test. I find that 'cvs co src' * is a pretty good test. */ /* * [dfr] Kludge until I get around to fixing all the vfs locking. */ #define IS_LOCKING_VFS(vp) ((vp)->v_tag == VT_UFS \ || (vp)->v_tag == VT_MFS \ || (vp)->v_tag == VT_NFS \ || (vp)->v_tag == VT_LFS \ || (vp)->v_tag == VT_ISOFS \ || (vp)->v_tag == VT_MSDOSFS \ || (vp)->v_tag == VT_DEVFS) #define ASSERT_VOP_LOCKED(vp, str) \ do { \ struct vnode *_vp = (vp); \ \ if (_vp && IS_LOCKING_VFS(_vp) && !VOP_ISLOCKED(_vp, NULL)) \ panic("%s: %p is not locked but should be", str, _vp); \ } while (0) #define ASSERT_VOP_UNLOCKED(vp, str) \ do { \ struct vnode *_vp = (vp); \ int lockstate; \ \ if (_vp && IS_LOCKING_VFS(_vp)) { \ lockstate = VOP_ISLOCKED(_vp, curproc); \ if (lockstate == LK_EXCLUSIVE) \ panic("%s: %p is locked but should not be", \ str, _vp); \ } \ } while (0) #define ASSERT_VOP_ELOCKED(vp, str) \ do { \ struct vnode *_vp = (vp); \ \ if (_vp && IS_LOCKING_VFS(_vp) && \ VOP_ISLOCKED(_vp, curproc) != LK_EXCLUSIVE) \ panic("%s: %p is not exclusive locked but should be", \ str, _vp); \ } while (0) #define ASSERT_VOP_ELOCKED_OTHER(vp, str) \ do { \ struct vnode *_vp = (vp); \ 
\ if (_vp && IS_LOCKING_VFS(_vp) && \ VOP_ISLOCKED(_vp, curproc) != LK_EXCLOTHER) \ panic("%s: %p is not exclusive locked by another proc", \ str, _vp); \ } while (0) #define ASSERT_VOP_SLOCKED(vp, str) \ do { \ struct vnode *_vp = (vp); \ \ if (_vp && IS_LOCKING_VFS(_vp) && \ VOP_ISLOCKED(_vp, NULL) != LK_SHARED) \ panic("%s: %p is not locked shared but should be", \ str, _vp); \ } while (0) #else #define ASSERT_VOP_LOCKED(vp, str) #define ASSERT_VOP_UNLOCKED(vp, str) #endif /* * VOCALL calls an op given an ops vector. We break it out because BSD's * vclean changes the ops vector and then wants to call ops with the old * vector. */ #define VOCALL(OPSV,OFF,AP) (( *((OPSV)[(OFF)])) (AP)) /* * This call works for vnodes in the kernel. */ #define VCALL(VP,OFF,AP) VOCALL((VP)->v_op,(OFF),(AP)) #define VDESC(OP) (& __CONCAT(OP,_desc)) #define VOFFSET(OP) (VDESC(OP)->vdesc_offset) /* * VMIO support inline */ extern int vmiodirenable; static __inline int vn_canvmio(struct vnode *vp) { if (vp && (vp->v_type == VREG || (vmiodirenable && vp->v_type == VDIR))) return(TRUE); return(FALSE); } /* * Finally, include the default set of vnode operations. */ #include "vnode_if.h" /* * Public vnode manipulation functions. */ struct componentname; struct file; struct mount; struct nameidata; struct ostat; struct proc; struct stat; struct nstat; struct ucred; struct uio; struct vattr; struct vnode; struct vop_bwrite_args; extern int (*lease_check_hook) __P((struct vop_lease_args *)); struct vnode *addaliasu __P((struct vnode *vp, udev_t nvp_rdev)); int bdevvp __P((dev_t dev, struct vnode **vpp)); /* cache_* may belong in namei.h. 
*/ void cache_enter __P((struct vnode *dvp, struct vnode *vp, struct componentname *cnp)); int cache_lookup __P((struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)); void cache_purge __P((struct vnode *vp)); void cache_purgevfs __P((struct mount *mp)); void cvtstat __P((struct stat *st, struct ostat *ost)); void cvtnstat __P((struct stat *sb, struct nstat *nsb)); int getnewvnode __P((enum vtagtype tag, struct mount *mp, vop_t **vops, struct vnode **vpp)); int lease_check __P((struct vop_lease_args *ap)); int spec_vnoperate __P((struct vop_generic_args *)); int speedup_syncer __P((void)); int textvp_fullpath __P((struct proc *p, char **retbuf, char **retfreebuf)); int vaccess __P((enum vtype type, mode_t file_mode, uid_t uid, gid_t gid, mode_t acc_mode, struct ucred *cred, int *privused)); -int vaccess_acl_posix1e __P((enum vtype type, struct acl *acl, - mode_t acc_mode, struct ucred *cred, int *privused)); +int vaccess_acl_posix1e __P((enum vtype type, uid_t file_uid, + gid_t file_gid, struct acl *acl, mode_t acc_mode, + struct ucred *cred, int *privused)); void vattr_null __P((struct vattr *vap)); int vcount __P((struct vnode *vp)); void vdrop __P((struct vnode *)); int vfinddev __P((dev_t dev, enum vtype type, struct vnode **vpp)); void vfs_add_vnodeops __P((const void *)); void vfs_rm_vnodeops __P((const void *)); int vflush __P((struct mount *mp, struct vnode *skipvp, int flags)); int vget __P((struct vnode *vp, int lockflag, struct proc *p)); void vgone __P((struct vnode *vp)); void vgonel __P((struct vnode *vp, struct proc *p)); void vhold __P((struct vnode *)); int vinvalbuf __P((struct vnode *vp, int save, struct ucred *cred, struct proc *p, int slpflag, int slptimeo)); int vtruncbuf __P((struct vnode *vp, struct ucred *cred, struct proc *p, off_t length, int blksize)); void vprint __P((char *label, struct vnode *vp)); int vrecycle __P((struct vnode *vp, struct mtx *inter_lkp, struct proc *p)); int vn_close __P((struct vnode *vp, int flags, 
struct ucred *cred, struct proc *p)); void vn_finished_write __P((struct mount *mp)); int vn_isdisk __P((struct vnode *vp, int *errp)); int vn_lock __P((struct vnode *vp, int flags, struct proc *p)); #ifdef DEBUG_LOCKS int debug_vn_lock __P((struct vnode *vp, int flags, struct proc *p, const char *filename, int line)); #define vn_lock(vp,flags,p) debug_vn_lock(vp,flags,p,__FILE__,__LINE__) #endif int vn_open __P((struct nameidata *ndp, int *flagp, int cmode)); void vn_pollevent __P((struct vnode *vp, int events)); void vn_pollgone __P((struct vnode *vp)); int vn_pollrecord __P((struct vnode *vp, struct proc *p, int events)); int vn_rdwr __P((enum uio_rw rw, struct vnode *vp, caddr_t base, int len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *cred, int *aresid, struct proc *p)); int vn_stat __P((struct vnode *vp, struct stat *sb, struct proc *p)); int vn_start_write __P((struct vnode *vp, struct mount **mpp, int flags)); dev_t vn_todev __P((struct vnode *vp)); int vn_write_suspend_wait __P((struct vnode *vp, struct mount *mp, int flags)); int vn_writechk __P((struct vnode *vp)); int vn_extattr_get __P((struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int *buflen, char *buf, struct proc *p)); int vn_extattr_set __P((struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int buflen, char *buf, struct proc *p)); int vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, struct proc *p); int vfs_cache_lookup __P((struct vop_lookup_args *ap)); int vfs_object_create __P((struct vnode *vp, struct proc *p, struct ucred *cred)); void vfs_timestamp __P((struct timespec *)); void vfs_write_resume __P((struct mount *mp)); void vfs_write_suspend __P((struct mount *mp)); int vop_stdbwrite __P((struct vop_bwrite_args *ap)); int vop_stdgetwritemount __P((struct vop_getwritemount_args *)); int vop_stdinactive __P((struct vop_inactive_args *)); int vop_stdislocked __P((struct vop_islocked_args *)); int 
vop_stdlock __P((struct vop_lock_args *)); int vop_stdunlock __P((struct vop_unlock_args *)); int vop_noislocked __P((struct vop_islocked_args *)); int vop_nolock __P((struct vop_lock_args *)); int vop_nopoll __P((struct vop_poll_args *)); int vop_nounlock __P((struct vop_unlock_args *)); int vop_stdpathconf __P((struct vop_pathconf_args *)); int vop_stdpoll __P((struct vop_poll_args *)); int vop_revoke __P((struct vop_revoke_args *)); int vop_sharedlock __P((struct vop_lock_args *)); int vop_eopnotsupp __P((struct vop_generic_args *ap)); int vop_ebadf __P((struct vop_generic_args *ap)); int vop_einval __P((struct vop_generic_args *ap)); int vop_enotty __P((struct vop_generic_args *ap)); int vop_defaultop __P((struct vop_generic_args *ap)); int vop_null __P((struct vop_generic_args *ap)); int vop_panic __P((struct vop_generic_args *ap)); int vop_stdcreatevobject __P((struct vop_createvobject_args *ap)); int vop_stddestroyvobject __P((struct vop_destroyvobject_args *ap)); int vop_stdgetvobject __P((struct vop_getvobject_args *ap)); void vfree __P((struct vnode *)); void vput __P((struct vnode *vp)); void vrele __P((struct vnode *vp)); void vref __P((struct vnode *vp)); void vbusy __P((struct vnode *vp)); extern vop_t **default_vnodeop_p; extern vop_t **spec_vnodeop_p; extern vop_t **dead_vnodeop_p; #endif /* _KERNEL */ #endif /* !_SYS_VNODE_H_ */ Index: head/sys/ufs/ufs/ufs_acl.c =================================================================== --- head/sys/ufs/ufs/ufs_acl.c (revision 75570) +++ head/sys/ufs/ufs/ufs_acl.c (revision 75571) @@ -1,569 +1,493 @@ /*- * Copyright (c) 1999, 2000, 2001 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Developed by the TrustedBSD Project. * Support for POSIX.1e access control lists: UFS-specific support functions. */ #include "opt_ufs.h" #include "opt_quota.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define VN_KNOTE(vp, b) \ KNOTE(&vp->v_pollinfo.vpi_selinfo.si_note, (b)) #ifdef UFS_ACL /* * Synchronize an ACL and an inode by copying over appropriate inode fields * to the passed ACL. Assumes an ACL that would satisfy acl_posix1e_check(), * and may panic if not. */ void ufs_sync_acl_from_inode(struct inode *ip, struct acl *acl) { struct acl_entry *acl_mask, *acl_group_obj; int i; /* * Update ACL_USER_OBJ, ACL_OTHER, but simply identify ACL_MASK * and ACL_GROUP_OBJ for use after we know whether ACL_MASK is * present. 
*/ acl_mask = NULL; acl_group_obj = NULL; for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_USER_OBJ: acl->acl_entry[i].ae_perm = acl_posix1e_mode_to_perm( ACL_USER_OBJ, ip->i_mode); - acl->acl_entry[i].ae_id = ip->i_uid; + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; break; case ACL_GROUP_OBJ: acl_group_obj = &acl->acl_entry[i]; - acl->acl_entry[i].ae_id = ip->i_gid; + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; break; case ACL_OTHER: acl->acl_entry[i].ae_perm = acl_posix1e_mode_to_perm( ACL_OTHER, ip->i_mode); - acl->acl_entry[i].ae_id = 0; + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; break; case ACL_MASK: acl_mask = &acl->acl_entry[i]; - acl->acl_entry[i].ae_id = 0; + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; break; case ACL_USER: case ACL_GROUP: break; default: panic("ufs_sync_acl_from_inode(): bad ae_tag"); } } if (acl_group_obj == NULL) panic("ufs_sync_acl_from_inode(): no ACL_GROUP_OBJ"); if (acl_mask == NULL) { /* * There is no ACL_MASK, so update ACL_GROUP_OBJ. */ acl_group_obj->ae_perm = acl_posix1e_mode_to_perm( ACL_GROUP_OBJ, ip->i_mode); } else { /* * Update the ACL_MASK entry instead of ACL_GROUP_OBJ. */ acl_mask->ae_perm = acl_posix1e_mode_to_perm(ACL_GROUP_OBJ, ip->i_mode); } } /* * Synchronize an inode and an ACL by copying over appropriate ACL fields to * the passed inode. Assumes an ACL that would satisfy acl_posix1e_check(), * and may panic if not. This code will preserve existing use of the * sticky, setugid, and non-permission bits in the mode field. It may * be that the caller wishes to have previously authorized these changes, * and may also want to clear the setugid bits in some situations. */ void ufs_sync_inode_from_acl(struct acl *acl, struct inode *ip, mode_t preserve_mask) { struct acl_entry *acl_mask, *acl_user_obj, *acl_group_obj; struct acl_entry *acl_other; mode_t preserve_mode; int i; /* * Preserve old mode so we can restore appropriate bits of it. 
*/ preserve_mode = (ip->i_mode & preserve_mask); /* * Identify the ACL_MASK and all other entries appearing in the * inode mode. */ acl_user_obj = NULL; acl_group_obj = NULL; acl_other = NULL; acl_mask = NULL; for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_USER_OBJ: acl_user_obj = &acl->acl_entry[i]; - ip->i_uid = acl->acl_entry[i].ae_id; break; case ACL_GROUP_OBJ: acl_group_obj = &acl->acl_entry[i]; - ip->i_gid = acl->acl_entry[i].ae_id; break; case ACL_OTHER: acl_other = &acl->acl_entry[i]; break; case ACL_MASK: acl_mask = &acl->acl_entry[i]; break; case ACL_USER: case ACL_GROUP: break; default: panic("ufs_sync_inode_from_acl(): bad ae_tag"); } } if (acl_user_obj == NULL || acl_group_obj == NULL || acl_other == NULL) panic("ufs_sync_inode_from_acl(): missing ae_tags"); if (acl_mask == NULL) { /* * There is no ACL_MASK, so use the ACL_GROUP_OBJ entry. */ - ip->i_mode &= ~ALLPERMS; + ip->i_mode &= ~(S_IRWXU|S_IRWXG|S_IRWXO); ip->i_mode |= acl_posix1e_perms_to_mode(acl_user_obj, acl_group_obj, acl_other); } else { /* * Use the ACL_MASK entry. */ - ip->i_mode &= ~ALLPERMS; + ip->i_mode &= ~(S_IRWXU|S_IRWXG|S_IRWXO); ip->i_mode |= acl_posix1e_perms_to_mode(acl_user_obj, acl_mask, acl_other); } ip->i_mode |= preserve_mode; } /* * Retrieve the ACL on a file. * * As part of the ACL is stored in the inode, and the rest in an EA, * assemble both into a final ACL product. Right now this is not done * very efficiently. */ int ufs_getacl(ap) struct vop_getacl_args /* { struct vnode *vp; struct acl_type_t type; struct acl *aclp; struct ucred *cred; struct proc *p; } */ *ap; { struct inode *ip = VTOI(ap->a_vp); int error, len; /* * Attempt to retrieve the ACL based on the ACL type. 
*/ bzero(ap->a_aclp, sizeof(*ap->a_aclp)); len = sizeof(*ap->a_aclp); switch(ap->a_type) { case ACL_TYPE_ACCESS: /* * ACL_TYPE_ACCESS ACLs may or may not be stored in the * EA, as they are in fact a combination of the inode * ownership/permissions and the EA contents. If the * EA is present, merge the two in a temporary ACL * storage, otherwise just return the inode contents. */ error = vn_extattr_get(ap->a_vp, IO_NODELOCKED, POSIX1E_ACL_ACCESS_EXTATTR_NAMESPACE, POSIX1E_ACL_ACCESS_EXTATTR_NAME, &len, (char *) ap->a_aclp, ap->a_p); switch (error) { /* XXX: Will be ENOATTR. */ /* XXX: If ufs_getacl() should work on file systems without * the EA configured, add case EOPNOTSUPP here. */ case ENOENT: /* * Legitimately no ACL set on object, purely * emulate it through the inode. These fields will * be updated when the ACL is synchronized with * the inode later. */ ap->a_aclp->acl_cnt = 3; ap->a_aclp->acl_entry[0].ae_tag = ACL_USER_OBJ; - ap->a_aclp->acl_entry[0].ae_id = 0; + ap->a_aclp->acl_entry[0].ae_id = ACL_UNDEFINED_ID; ap->a_aclp->acl_entry[0].ae_perm = 0; ap->a_aclp->acl_entry[1].ae_tag = ACL_GROUP_OBJ; - ap->a_aclp->acl_entry[1].ae_id = 0; + ap->a_aclp->acl_entry[1].ae_id = ACL_UNDEFINED_ID; ap->a_aclp->acl_entry[1].ae_perm = 0; ap->a_aclp->acl_entry[2].ae_tag = ACL_OTHER; - ap->a_aclp->acl_entry[2].ae_id = 0; + ap->a_aclp->acl_entry[2].ae_id = ACL_UNDEFINED_ID; ap->a_aclp->acl_entry[2].ae_perm = 0; ufs_sync_acl_from_inode(ip, ap->a_aclp); error = 0; break; case 0: if (len != sizeof(*ap->a_aclp)) { /* * A short (or long) read, meaning that for * some reason the ACL is corrupted. Return * EPERM since the object DAC protections * are unsafe. 
*/ printf("ufs_getacl(): Loaded invalid ACL (" "%d bytes)\n", len); return (EPERM); } ufs_sync_acl_from_inode(ip, ap->a_aclp); break; default: } break; case ACL_TYPE_DEFAULT: if (ap->a_vp->v_type != VDIR) { error = EINVAL; break; } error = vn_extattr_get(ap->a_vp, IO_NODELOCKED, POSIX1E_ACL_DEFAULT_EXTATTR_NAMESPACE, POSIX1E_ACL_DEFAULT_EXTATTR_NAME, &len, (char *) ap->a_aclp, ap->a_p); /* * Unlike ACL_TYPE_ACCESS, there is no relationship between * the inode contents and the ACL, and it is therefore * possible for the request for the ACL to fail since the * ACL is undefined. In this situation, return success * and an empty ACL, as required by POSIX.1e. */ switch (error) { /* XXX: Will be ENOATTR. */ /* XXX: If ufs_getacl() should work on file systems without * the EA configured, add case EOPNOTSUPP here. */ case ENOENT: bzero(ap->a_aclp, sizeof(*ap->a_aclp)); ap->a_aclp->acl_cnt = 0; error = 0; break; case 0: if (len != sizeof(*ap->a_aclp)) { /* * A short (or long) read, meaning that for * some reason the ACL is corrupted. Return * EPERM since the object default DAC * protections are unsafe. */ printf("ufs_getacl(): Loaded invalid ACL (" "%d bytes)\n", len); return (EPERM); } break; default: } break; default: error = EINVAL; } return (error); } /* * Set the ACL on a file. * * As part of the ACL is stored in the inode, and the rest in an EA, * this is necessarily non-atomic, and has complex authorization. * As ufs_setacl() includes elements of ufs_chown() and ufs_chmod(), * a fair number of different access checks may be required to go ahead * with the operation at all. 
*/ int ufs_setacl(ap) struct vop_setacl_args /* { struct vnode *vp; acl_type_t type; struct acl *aclp; struct ucred *cred; struct proc *p; } */ *ap; { struct inode *ip = VTOI(ap->a_vp); - struct acl_entry *acl_user_obj, *acl_group_obj, *acl_other; mode_t old_mode, preserve_mask; - uid_t old_uid, new_uid = 0; - gid_t old_gid, new_gid = 0; - int error, i; + int error; /* * If this is a set operation rather than a delete operation, * invoke VOP_ACLCHECK() on the passed ACL to determine if it is * valid for the target. This will include a check on ap->a_type. */ if (ap->a_aclp != NULL) { /* * Set operation. */ error = VOP_ACLCHECK(ap->a_vp, ap->a_type, ap->a_aclp, ap->a_cred, ap->a_p); if (error != 0) return (error); } else { /* * Delete operation. * POSIX.1e allows only deletion of the default ACL on a * directory (ACL_TYPE_DEFAULT). */ if (ap->a_type != ACL_TYPE_DEFAULT) return (EINVAL); if (ap->a_vp->v_type != VDIR) return (ENOTDIR); } if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); /* * Authorize the ACL operation. */ if (ip->i_flags & (IMMUTABLE | APPEND)) return (EPERM); /* * Must hold VADMIN (be file owner) or have appropriate privilege. */ if ((error = VOP_ACCESS(ap->a_vp, VADMIN, ap->a_cred, ap->a_p))) return (error); - /* - * ACL_TYPE_ACCESS may involve the changing of ownership, sticky - * bit, setugid bits on the file or directory. As such, it requires - * special handling to identify these changes, and to authorize - * them. - * ACL_TYPE_DEFAULT does not require this, and ap->a_aclp should - * not be dereferenced without a NULL check, as it may be a delete - * operation. - */ switch(ap->a_type) { case ACL_TYPE_ACCESS: - /* - * Identify ACL_USER_OBJ, ACL_GROUP_OBJ, and determine if - * they have changed. If so, authorize in the style of - * ufs_chown(). While we're at it, identify ACL_OTHER. 
- */ - acl_user_obj = acl_group_obj = acl_other = NULL; - for (i = 0; i < ap->a_aclp->acl_cnt; i++) - switch(ap->a_aclp->acl_entry[i].ae_tag) { - case ACL_USER_OBJ: - acl_user_obj = &ap->a_aclp->acl_entry[i]; - new_uid = acl_user_obj->ae_id; - break; - case ACL_GROUP_OBJ: - acl_group_obj = &ap->a_aclp->acl_entry[i]; - new_gid = acl_group_obj->ae_id; - break; - case ACL_OTHER: - acl_other = &ap->a_aclp->acl_entry[i]; - break; - default: - } - old_uid = ip->i_uid; - old_gid = ip->i_gid; - - /* - * Authorize changes to base object ownership in the style - * of ufs_chown(). - */ - if (new_uid != old_uid && (error = suser_xxx(ap->a_cred, - ap->a_p, PRISON_ROOT))) - return (error); - if (new_gid != old_gid && !groupmember(new_gid, ap->a_cred) && - (error = suser_xxx(ap->a_cred, ap->a_p, PRISON_ROOT))) - return (error); - - case ACL_TYPE_DEFAULT: - /* - * ACL_TYPE_DEFAULT can literally be written straight into - * the EA unhindered, as it has gone through sanity checking - * already. - */ - break; - - default: - panic("ufs_setacl(): unknown acl type\n"); - } - - switch(ap->a_type) { - case ACL_TYPE_ACCESS: error = vn_extattr_set(ap->a_vp, IO_NODELOCKED, POSIX1E_ACL_ACCESS_EXTATTR_NAMESPACE, POSIX1E_ACL_ACCESS_EXTATTR_NAME, sizeof(*ap->a_aclp), (char *) ap->a_aclp, ap->a_p); break; case ACL_TYPE_DEFAULT: if (ap->a_aclp == NULL) { error = vn_extattr_rm(ap->a_vp, IO_NODELOCKED, POSIX1E_ACL_DEFAULT_EXTATTR_NAMESPACE, POSIX1E_ACL_DEFAULT_EXTATTR_NAME, ap->a_p); /* * Attempting to delete a non-present default ACL * will return success for portability purposes. * (TRIX) */ /* XXX: the ENOENT here will eventually be ENOATTR. */ if (error == EINVAL) error = 0; } else error = vn_extattr_set(ap->a_vp, IO_NODELOCKED, POSIX1E_ACL_DEFAULT_EXTATTR_NAMESPACE, POSIX1E_ACL_DEFAULT_EXTATTR_NAME, sizeof(*ap->a_aclp), (char *) ap->a_aclp, ap->a_p); break; default: error = EINVAL; } /* * Map lack of attribute definition in UFS_EXTATTR into lack of * support for ACLs on the file system. 
*/ /* XXX: ENOENT here will eventually be ENOATTR. */ if (error == ENOENT) return (EOPNOTSUPP); if (error != 0) return (error); if (ap->a_type == ACL_TYPE_ACCESS) { /* * Now that the EA is successfully updated, update the * inode and mark it as changed. */ - old_uid = ip->i_uid; - old_gid = ip->i_gid; old_mode = ip->i_mode; preserve_mask = ISVTX | ISGID | ISUID; ufs_sync_inode_from_acl(ap->a_aclp, ip, preserve_mask); - - /* - * Clear the ISGID and ISUID bits if the ownership has - * changed, or appropriate privilege is not available. - * XXX: This should probably be a check for broadening - * availability of the bits, but it's not clear from the - * spec. - */ - if (suser_xxx(ap->a_cred, NULL, PRISON_ROOT) && - (ip->i_gid != old_gid || ip->i_uid != old_uid)) - ip->i_mode &= ~(ISUID | ISGID); ip->i_flag |= IN_CHANGE; } VN_KNOTE(ap->a_vp, NOTE_ATTRIB); return (0); } /* * Check the validity of an ACL for a file. */ int ufs_aclcheck(ap) struct vop_aclcheck_args /* { struct vnode *vp; acl_type_t type; struct acl *aclp; struct ucred *cred; struct proc *p; } */ *ap; { /* * Verify we understand this type of ACL, and that it applies * to this kind of object. * Rely on the acl_posix1e_check() routine to verify the contents. */ switch(ap->a_type) { case ACL_TYPE_ACCESS: break; case ACL_TYPE_DEFAULT: if (ap->a_vp->v_type != VDIR) return (EINVAL); break; default: return (EINVAL); } return (acl_posix1e_check(ap->a_aclp)); } #endif /* !UFS_ACL */ Index: head/sys/ufs/ufs/ufs_vnops.c =================================================================== --- head/sys/ufs/ufs/ufs_vnops.c (revision 75570) +++ head/sys/ufs/ufs/ufs_vnops.c (revision 75571) @@ -1,2586 +1,2586 @@ /* * Copyright (c) 1982, 1986, 1989, 1993, 1995 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. 
or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ufs_vnops.c 8.27 (Berkeley) 5/27/95 * $FreeBSD$ */ #include "opt_quota.h" #include "opt_suiddir.h" #include "opt_ufs.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* XXX */ #include #include #include #include #include #include #include #include #include #include static int ufs_access __P((struct vop_access_args *)); static int ufs_advlock __P((struct vop_advlock_args *)); static int ufs_chmod __P((struct vnode *, int, struct ucred *, struct proc *)); static int ufs_chown __P((struct vnode *, uid_t, gid_t, struct ucred *, struct proc *)); static int ufs_close __P((struct vop_close_args *)); static int ufs_create __P((struct vop_create_args *)); static int ufs_getattr __P((struct vop_getattr_args *)); static int ufs_link __P((struct vop_link_args *)); static int ufs_makeinode __P((int mode, struct vnode *, struct vnode **, struct componentname *)); static int ufs_missingop __P((struct vop_generic_args *ap)); static int ufs_mkdir __P((struct vop_mkdir_args *)); static int ufs_mknod __P((struct vop_mknod_args *)); static int ufs_open __P((struct vop_open_args *)); static int ufs_pathconf __P((struct vop_pathconf_args *)); static int ufs_print __P((struct vop_print_args *)); static int ufs_readlink __P((struct vop_readlink_args *)); static int ufs_remove __P((struct vop_remove_args *)); static int ufs_rename __P((struct vop_rename_args *)); static int ufs_rmdir __P((struct vop_rmdir_args *)); static int ufs_setattr __P((struct vop_setattr_args *)); static int ufs_strategy __P((struct vop_strategy_args *)); static int ufs_symlink __P((struct vop_symlink_args *)); static int ufs_whiteout __P((struct vop_whiteout_args *)); static int ufsfifo_close __P((struct vop_close_args *)); static int ufsfifo_read __P((struct vop_read_args *)); static int ufsfifo_write __P((struct vop_write_args *)); static int ufsspec_close __P((struct vop_close_args 
*)); static int ufsspec_read __P((struct vop_read_args *)); static int ufsspec_write __P((struct vop_write_args *)); static int filt_ufsread __P((struct knote *kn, long hint)); static int filt_ufsvnode __P((struct knote *kn, long hint)); static void filt_ufsdetach __P((struct knote *kn)); static int ufs_kqfilter __P((struct vop_kqfilter_args *ap)); union _qcvt { int64_t qcvt; int32_t val[2]; }; #define SETHIGH(q, h) { \ union _qcvt tmp; \ tmp.qcvt = (q); \ tmp.val[_QUAD_HIGHWORD] = (h); \ (q) = tmp.qcvt; \ } #define SETLOW(q, l) { \ union _qcvt tmp; \ tmp.qcvt = (q); \ tmp.val[_QUAD_LOWWORD] = (l); \ (q) = tmp.qcvt; \ } #define VN_KNOTE(vp, b) \ KNOTE(&vp->v_pollinfo.vpi_selinfo.si_note, (b)) /* * A virgin directory (no blushing please). */ static struct dirtemplate mastertemplate = { 0, 12, DT_DIR, 1, ".", 0, DIRBLKSIZ - 12, DT_DIR, 2, ".." }; static struct odirtemplate omastertemplate = { 0, 12, 1, ".", 0, DIRBLKSIZ - 12, 2, ".." }; void ufs_itimes(vp) struct vnode *vp; { struct inode *ip; struct timespec ts; ip = VTOI(vp); if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE)) == 0) return; if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { vfs_timestamp(&ts); if ((vp->v_type == VBLK || vp->v_type == VCHR) && !DOINGSOFTDEP(vp)) ip->i_flag |= IN_LAZYMOD; else ip->i_flag |= IN_MODIFIED; if (ip->i_flag & IN_ACCESS) { ip->i_atime = ts.tv_sec; ip->i_atimensec = ts.tv_nsec; } if (ip->i_flag & IN_UPDATE) { ip->i_mtime = ts.tv_sec; ip->i_mtimensec = ts.tv_nsec; ip->i_modrev++; } if (ip->i_flag & IN_CHANGE) { ip->i_ctime = ts.tv_sec; ip->i_ctimensec = ts.tv_nsec; } } ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE); } /* * Create a regular file */ int ufs_create(ap) struct vop_create_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { int error; error = ufs_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode), ap->a_dvp, ap->a_vpp, ap->a_cnp); if (error) return (error); VN_KNOTE(ap->a_dvp, NOTE_WRITE); 
return (0); } /* * Mknod vnode call */ /* ARGSUSED */ int ufs_mknod(ap) struct vop_mknod_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { struct vattr *vap = ap->a_vap; struct vnode **vpp = ap->a_vpp; struct inode *ip; ino_t ino; int error; error = ufs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode), ap->a_dvp, vpp, ap->a_cnp); if (error) return (error); VN_KNOTE(ap->a_dvp, NOTE_WRITE); ip = VTOI(*vpp); ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; if (vap->va_rdev != VNOVAL) { /* * Want to be able to use this to make badblock * inodes, so don't truncate the dev number. */ ip->i_rdev = vap->va_rdev; } /* * Remove inode, then reload it through VFS_VGET so it is * checked to see if it is an alias of an existing entry in * the inode cache. */ vput(*vpp); (*vpp)->v_type = VNON; ino = ip->i_number; /* Save this before vgone() invalidates ip. */ vgone(*vpp); error = VFS_VGET(ap->a_dvp->v_mount, ino, vpp); if (error) { *vpp = NULL; return (error); } return (0); } /* * Open called. * * Nothing to do. */ /* ARGSUSED */ int ufs_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct proc *a_p; } */ *ap; { /* * Files marked append-only must be opened for appending. */ if ((VTOI(ap->a_vp)->i_flags & APPEND) && (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE) return (EPERM); return (0); } /* * Close called. * * Update the times on the inode. 
*/ /* ARGSUSED */ int ufs_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; mtx_lock(&vp->v_interlock); if (vp->v_usecount > 1) ufs_itimes(vp); mtx_unlock(&vp->v_interlock); return (0); } int ufs_access(ap) struct vop_access_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); mode_t mode = ap->a_mode; int error; #ifdef UFS_ACL struct acl *acl; int len; #endif /* * Disallow write attempts on read-only file systems; * unless the file is a socket, fifo, or a block or * character device resident on the file system. */ if (mode & VWRITE) { switch (vp->v_type) { case VDIR: case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); #ifdef QUOTA if ((error = getinoquota(ip)) != 0) return (error); #endif break; default: break; } } /* If immutable bit set, nobody gets to write it. */ if ((mode & VWRITE) && (ip->i_flags & (IMMUTABLE | SF_SNAPSHOT))) return (EPERM); #ifdef UFS_ACL MALLOC(acl, struct acl *, sizeof(*acl), M_ACL, M_WAITOK); len = sizeof(*acl); error = VOP_GETACL(vp, ACL_TYPE_ACCESS, acl, ap->a_cred, ap->a_p); switch (error) { case EOPNOTSUPP: error = vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid, ap->a_mode, ap->a_cred, NULL); break; case 0: - error = vaccess_acl_posix1e(vp->v_type, acl, ap->a_mode, - ap->a_cred, NULL); + error = vaccess_acl_posix1e(vp->v_type, ip->i_uid, ip->i_gid, + acl, ap->a_mode, ap->a_cred, NULL); break; default: printf("ufs_access(): Error retrieving ACL on object (%d).\n", error); /* * XXX: Fall back until debugged. Should eventually * possibly log an error, and return EPERM for safety. 
*/ error = vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid, ap->a_mode, ap->a_cred, NULL); } FREE(acl, M_ACL); #else error = vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid, ap->a_mode, ap->a_cred, NULL); #endif return (error); } /* ARGSUSED */ int ufs_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct inode *ip = VTOI(vp); register struct vattr *vap = ap->a_vap; ufs_itimes(vp); /* * Copy from inode table */ vap->va_fsid = dev2udev(ip->i_dev); vap->va_fileid = ip->i_number; vap->va_mode = ip->i_mode & ~IFMT; vap->va_nlink = VFSTOUFS(vp->v_mount)->um_i_effnlink_valid ? ip->i_effnlink : ip->i_nlink; vap->va_uid = ip->i_uid; vap->va_gid = ip->i_gid; vap->va_rdev = ip->i_rdev; vap->va_size = ip->i_din.di_size; vap->va_atime.tv_sec = ip->i_atime; vap->va_atime.tv_nsec = ip->i_atimensec; vap->va_mtime.tv_sec = ip->i_mtime; vap->va_mtime.tv_nsec = ip->i_mtimensec; vap->va_ctime.tv_sec = ip->i_ctime; vap->va_ctime.tv_nsec = ip->i_ctimensec; vap->va_flags = ip->i_flags; vap->va_gen = ip->i_gen; vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize; vap->va_bytes = dbtob((u_quad_t)ip->i_blocks); vap->va_type = IFTOVT(ip->i_mode); vap->va_filerev = ip->i_modrev; return (0); } /* * Set attribute vnode op. called from several syscalls */ int ufs_setattr(ap) struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct vattr *vap = ap->a_vap; struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); struct ucred *cred = ap->a_cred; struct proc *p = ap->a_p; int error; /* * Check for unsettable attributes. 
*/ if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } if (vap->va_flags != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); /* * Callers may only modify the file flags on objects they * have VADMIN rights for. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, p))) return (error); /* * Unprivileged processes and privileged processes in * jail() are not permitted to set system flags. * Privileged processes not in jail() may only set system * flags if the securelevel <= 0. */ if (!suser_xxx(cred, NULL, 0)) { if ((ip->i_flags & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND)) && securelevel > 0) return (EPERM); /* Snapshot flag cannot be set or cleared */ if (((vap->va_flags & SF_SNAPSHOT) != 0 && (ip->i_flags & SF_SNAPSHOT) == 0) || ((vap->va_flags & SF_SNAPSHOT) == 0 && (ip->i_flags & SF_SNAPSHOT) != 0)) return (EPERM); ip->i_flags = vap->va_flags; } else { if (ip->i_flags & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND) || (vap->va_flags & UF_SETTABLE) != vap->va_flags) return (EPERM); ip->i_flags &= SF_SETTABLE; ip->i_flags |= (vap->va_flags & UF_SETTABLE); } ip->i_flag |= IN_CHANGE; if (vap->va_flags & (IMMUTABLE | APPEND)) return (0); } if (ip->i_flags & (IMMUTABLE | APPEND)) return (EPERM); /* * Go through the fields and update iff not VNOVAL. */ if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if ((error = ufs_chown(vp, vap->va_uid, vap->va_gid, cred, p)) != 0) return (error); } if (vap->va_size != VNOVAL) { /* * Disallow write attempts on read-only file systems; * unless the file is a socket, fifo, or a block or * character device resident on the file system. 
*/ switch (vp->v_type) { case VDIR: return (EISDIR); case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if ((ip->i_flags & SF_SNAPSHOT) != 0) return (EPERM); break; default: break; } if ((error = UFS_TRUNCATE(vp, vap->va_size, 0, cred, p)) != 0) return (error); } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if ((ip->i_flags & SF_SNAPSHOT) != 0) return (EPERM); /* * From utimes(2): * If times is NULL, ... The caller must be the owner of * the file, have permission to write the file, or be the * super-user. * If times is non-NULL, ... The caller must be the owner of * the file or be the super-user. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, p)) && ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || (error = VOP_ACCESS(vp, VWRITE, cred, p)))) return (error); if (vap->va_atime.tv_sec != VNOVAL) ip->i_flag |= IN_ACCESS; if (vap->va_mtime.tv_sec != VNOVAL) ip->i_flag |= IN_CHANGE | IN_UPDATE; ufs_itimes(vp); if (vap->va_atime.tv_sec != VNOVAL) { ip->i_atime = vap->va_atime.tv_sec; ip->i_atimensec = vap->va_atime.tv_nsec; } if (vap->va_mtime.tv_sec != VNOVAL) { ip->i_mtime = vap->va_mtime.tv_sec; ip->i_mtimensec = vap->va_mtime.tv_nsec; } error = UFS_UPDATE(vp, 0); if (error) return (error); } error = 0; if (vap->va_mode != (mode_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if ((ip->i_flags & SF_SNAPSHOT) != 0 && (vap->va_mode & (S_IXUSR | S_IWUSR | S_IXGRP | S_IWGRP | S_IXOTH | S_IWOTH))) return (EPERM); error = ufs_chmod(vp, (int)vap->va_mode, cred, p); } VN_KNOTE(vp, NOTE_ATTRIB); return (error); } /* * Change the mode on a file. * Inode must be locked before calling. */ static int ufs_chmod(vp, mode, cred, p) register struct vnode *vp; register int mode; register struct ucred *cred; struct proc *p; { register struct inode *ip = VTOI(vp); int error; /* * To modify the permissions on a file, must possess VADMIN * for that file. 
*/ if ((error = VOP_ACCESS(vp, VADMIN, cred, p))) return (error); /* * Privileged processes may set the sticky bit on non-directories, * as well as set the setgid bit on a file with a group that the * process is not a member of. */ if (suser_xxx(cred, NULL, PRISON_ROOT)) { if (vp->v_type != VDIR && (mode & S_ISTXT)) return (EFTYPE); if (!groupmember(ip->i_gid, cred) && (mode & ISGID)) return (EPERM); } ip->i_mode &= ~ALLPERMS; ip->i_mode |= (mode & ALLPERMS); ip->i_flag |= IN_CHANGE; return (0); } /* * Perform chown operation on inode ip; * inode must be locked prior to call. */ static int ufs_chown(vp, uid, gid, cred, p) register struct vnode *vp; uid_t uid; gid_t gid; struct ucred *cred; struct proc *p; { register struct inode *ip = VTOI(vp); uid_t ouid; gid_t ogid; int error = 0; #ifdef QUOTA register int i; long change; #endif if (uid == (uid_t)VNOVAL) uid = ip->i_uid; if (gid == (gid_t)VNOVAL) gid = ip->i_gid; /* * To modify the ownership of a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, p))) return (error); /* * To change the owner of a file, or change the group of a file * to a group of which we are not a member, the caller must * have privilege. 
*/ if ((uid != ip->i_uid || (gid != ip->i_gid && !groupmember(gid, cred))) && (error = suser_xxx(cred, p, PRISON_ROOT))) return (error); ogid = ip->i_gid; ouid = ip->i_uid; #ifdef QUOTA if ((error = getinoquota(ip)) != 0) return (error); if (ouid == uid) { dqrele(vp, ip->i_dquot[USRQUOTA]); ip->i_dquot[USRQUOTA] = NODQUOT; } if (ogid == gid) { dqrele(vp, ip->i_dquot[GRPQUOTA]); ip->i_dquot[GRPQUOTA] = NODQUOT; } change = ip->i_blocks; (void) chkdq(ip, -change, cred, CHOWN); (void) chkiq(ip, -1, cred, CHOWN); for (i = 0; i < MAXQUOTAS; i++) { dqrele(vp, ip->i_dquot[i]); ip->i_dquot[i] = NODQUOT; } #endif ip->i_gid = gid; ip->i_uid = uid; #ifdef QUOTA if ((error = getinoquota(ip)) == 0) { if (ouid == uid) { dqrele(vp, ip->i_dquot[USRQUOTA]); ip->i_dquot[USRQUOTA] = NODQUOT; } if (ogid == gid) { dqrele(vp, ip->i_dquot[GRPQUOTA]); ip->i_dquot[GRPQUOTA] = NODQUOT; } if ((error = chkdq(ip, change, cred, CHOWN)) == 0) { if ((error = chkiq(ip, 1, cred, CHOWN)) == 0) goto good; else (void) chkdq(ip, -change, cred, CHOWN|FORCE); } for (i = 0; i < MAXQUOTAS; i++) { dqrele(vp, ip->i_dquot[i]); ip->i_dquot[i] = NODQUOT; } } ip->i_gid = ogid; ip->i_uid = ouid; if (getinoquota(ip) == 0) { if (ouid == uid) { dqrele(vp, ip->i_dquot[USRQUOTA]); ip->i_dquot[USRQUOTA] = NODQUOT; } if (ogid == gid) { dqrele(vp, ip->i_dquot[GRPQUOTA]); ip->i_dquot[GRPQUOTA] = NODQUOT; } (void) chkdq(ip, change, cred, FORCE|CHOWN); (void) chkiq(ip, 1, cred, FORCE|CHOWN); (void) getinoquota(ip); } return (error); good: if (getinoquota(ip)) panic("ufs_chown: lost quota"); #endif /* QUOTA */ ip->i_flag |= IN_CHANGE; if (suser_xxx(cred, NULL, PRISON_ROOT) && (ouid != uid || ogid != gid)) ip->i_mode &= ~(ISUID | ISGID); return (0); } int ufs_remove(ap) struct vop_remove_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct inode *ip; struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; int error; ip = VTOI(vp); if ((ip->i_flags & (NOUNLINK | IMMUTABLE | 
APPEND)) || (VTOI(dvp)->i_flags & APPEND)) { error = EPERM; goto out; } error = ufs_dirremove(dvp, ip, ap->a_cnp->cn_flags, 0); VN_KNOTE(vp, NOTE_DELETE); VN_KNOTE(dvp, NOTE_WRITE); out: return (error); } /* * link vnode call */ int ufs_link(ap) struct vop_link_args /* { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct vnode *vp = ap->a_vp; struct vnode *tdvp = ap->a_tdvp; struct componentname *cnp = ap->a_cnp; struct proc *p = cnp->cn_proc; struct inode *ip; struct direct newdir; int error; #ifdef DIAGNOSTIC if ((cnp->cn_flags & HASBUF) == 0) panic("ufs_link: no name"); #endif if (tdvp->v_mount != vp->v_mount) { error = EXDEV; goto out2; } if (tdvp != vp && (error = vn_lock(vp, LK_EXCLUSIVE, p))) { goto out2; } ip = VTOI(vp); if ((nlink_t)ip->i_nlink >= LINK_MAX) { error = EMLINK; goto out1; } if (ip->i_flags & (IMMUTABLE | APPEND)) { error = EPERM; goto out1; } ip->i_effnlink++; ip->i_nlink++; ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(vp)) softdep_change_linkcnt(ip); error = UFS_UPDATE(vp, !(DOINGSOFTDEP(vp) | DOINGASYNC(vp))); if (!error) { ufs_makedirentry(ip, cnp, &newdir); error = ufs_direnter(tdvp, vp, &newdir, cnp, NULL); } if (error) { ip->i_effnlink--; ip->i_nlink--; ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(vp)) softdep_change_linkcnt(ip); } out1: if (tdvp != vp) VOP_UNLOCK(vp, 0, p); out2: VN_KNOTE(vp, NOTE_LINK); VN_KNOTE(tdvp, NOTE_WRITE); return (error); } /* * whiteout vnode call */ int ufs_whiteout(ap) struct vop_whiteout_args /* { struct vnode *a_dvp; struct componentname *a_cnp; int a_flags; } */ *ap; { struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct direct newdir; int error = 0; switch (ap->a_flags) { case LOOKUP: /* 4.4 format directories support whiteout operations */ if (dvp->v_mount->mnt_maxsymlinklen > 0) return (0); return (EOPNOTSUPP); case CREATE: /* create a new directory whiteout */ #ifdef DIAGNOSTIC if ((cnp->cn_flags & SAVENAME) == 0) panic("ufs_whiteout: missing 
name"); if (dvp->v_mount->mnt_maxsymlinklen <= 0) panic("ufs_whiteout: old format filesystem"); #endif newdir.d_ino = WINO; newdir.d_namlen = cnp->cn_namelen; bcopy(cnp->cn_nameptr, newdir.d_name, (unsigned)cnp->cn_namelen + 1); newdir.d_type = DT_WHT; error = ufs_direnter(dvp, NULL, &newdir, cnp, NULL); break; case DELETE: /* remove an existing directory whiteout */ #ifdef DIAGNOSTIC if (dvp->v_mount->mnt_maxsymlinklen <= 0) panic("ufs_whiteout: old format filesystem"); #endif cnp->cn_flags &= ~DOWHITEOUT; error = ufs_dirremove(dvp, NULL, cnp->cn_flags, 0); break; default: panic("ufs_whiteout: unknown op"); } return (error); } /* * Rename system call. * rename("foo", "bar"); * is essentially * unlink("bar"); * link("foo", "bar"); * unlink("foo"); * but ``atomically''. Can't do full commit without saving state in the * inode on disk which isn't feasible at this time. Best we can do is * always guarantee the target exists. * * Basic algorithm is: * * 1) Bump link count on source while we're linking it to the * target. This also ensure the inode won't be deleted out * from underneath us while we work (it may be truncated by * a concurrent `trunc' or `open' for creation). * 2) Link source to destination. If destination already exists, * delete it first. * 3) Unlink source reference to inode if still around. If a * directory was moved and the parent of the destination * is different from the source, patch the ".." entry in the * directory. 
*/ int ufs_rename(ap) struct vop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; } */ *ap; { struct vnode *tvp = ap->a_tvp; register struct vnode *tdvp = ap->a_tdvp; struct vnode *fvp = ap->a_fvp; struct vnode *fdvp = ap->a_fdvp; struct componentname *tcnp = ap->a_tcnp; struct componentname *fcnp = ap->a_fcnp; struct proc *p = fcnp->cn_proc; struct inode *ip, *xp, *dp; struct direct newdir; int doingdirectory = 0, oldparent = 0, newparent = 0; int error = 0, ioflag; #ifdef DIAGNOSTIC if ((tcnp->cn_flags & HASBUF) == 0 || (fcnp->cn_flags & HASBUF) == 0) panic("ufs_rename: no name"); #endif /* * Check for cross-device rename. */ if ((fvp->v_mount != tdvp->v_mount) || (tvp && (fvp->v_mount != tvp->v_mount))) { error = EXDEV; abortit: if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); vrele(fdvp); vrele(fvp); return (error); } if (tvp && ((VTOI(tvp)->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (VTOI(tdvp)->i_flags & APPEND))) { error = EPERM; goto abortit; } /* * Check if just deleting a link name or if we've lost a race. * If another process completes the same rename after we've looked * up the source and have blocked looking up the target, then the * source and target inodes may be identical now although the * names were never linked. */ if (fvp == tvp) { if (fvp->v_type == VDIR) { /* * Linked directories are impossible, so we must * have lost the race. Pretend that the rename * completed before the lookup. */ #ifdef UFS_RENAME_DEBUG printf("ufs_rename: fvp == tvp for directories\n"); #endif error = ENOENT; goto abortit; } /* Release destination completely. */ vput(tdvp); vput(tvp); /* * Delete source. There is another race now that everything * is unlocked, but this doesn't cause any new complications. * Relookup() may find a file that is unrelated to the * original one, or it may fail. Too bad. 
*/ vrele(fdvp); vrele(fvp); fcnp->cn_flags &= ~MODMASK; fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; if ((fcnp->cn_flags & SAVESTART) == 0) panic("ufs_rename: lost from startdir"); fcnp->cn_nameiop = DELETE; VREF(fdvp); error = relookup(fdvp, &fvp, fcnp); if (error == 0) vrele(fdvp); if (fvp == NULL) { #ifdef UFS_RENAME_DEBUG printf("ufs_rename: from name disappeared\n"); #endif return (ENOENT); } error = VOP_REMOVE(fdvp, fvp, fcnp); if (fdvp == fvp) vrele(fdvp); else vput(fdvp); vput(fvp); return (error); } if ((error = vn_lock(fvp, LK_EXCLUSIVE, p)) != 0) goto abortit; dp = VTOI(fdvp); ip = VTOI(fvp); if (ip->i_nlink >= LINK_MAX) { VOP_UNLOCK(fvp, 0, p); error = EMLINK; goto abortit; } if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (dp->i_flags & APPEND)) { VOP_UNLOCK(fvp, 0, p); error = EPERM; goto abortit; } if ((ip->i_mode & IFMT) == IFDIR) { /* * Avoid ".", "..", and aliases of "." for obvious reasons. */ if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || dp == ip || (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT || (ip->i_flag & IN_RENAME)) { VOP_UNLOCK(fvp, 0, p); error = EINVAL; goto abortit; } ip->i_flag |= IN_RENAME; oldparent = dp->i_number; doingdirectory = 1; } VN_KNOTE(fdvp, NOTE_WRITE); /* XXX right place? */ vrele(fdvp); /* * When the target exists, both the directory * and target vnodes are returned locked. */ dp = VTOI(tdvp); xp = NULL; if (tvp) xp = VTOI(tvp); /* * 1) Bump link count while we're moving stuff * around. If we crash somewhere before * completing our work, the link count * may be wrong, but correctable. */ ip->i_effnlink++; ip->i_nlink++; ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(fvp)) softdep_change_linkcnt(ip); if ((error = UFS_UPDATE(fvp, !(DOINGSOFTDEP(fvp) | DOINGASYNC(fvp)))) != 0) { VOP_UNLOCK(fvp, 0, p); goto bad; } /* * If ".." 
must be changed (ie the directory gets a new * parent) then the source directory must not be in the * directory heirarchy above the target, as this would * orphan everything below the source directory. Also * the user must have write permission in the source so * as to be able to change "..". We must repeat the call * to namei, as the parent directory is unlocked by the * call to checkpath(). */ error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_proc); VOP_UNLOCK(fvp, 0, p); if (oldparent != dp->i_number) newparent = dp->i_number; if (doingdirectory && newparent) { if (error) /* write access check above */ goto bad; if (xp != NULL) vput(tvp); error = ufs_checkpath(ip, dp, tcnp->cn_cred); if (error) goto out; if ((tcnp->cn_flags & SAVESTART) == 0) panic("ufs_rename: lost to startdir"); VREF(tdvp); error = relookup(tdvp, &tvp, tcnp); if (error) goto out; vrele(tdvp); dp = VTOI(tdvp); xp = NULL; if (tvp) xp = VTOI(tvp); } /* * 2) If target doesn't exist, link the target * to the source and unlink the source. * Otherwise, rewrite the target directory * entry to reference the source inode and * expunge the original entry's existence. */ if (xp == NULL) { if (dp->i_dev != ip->i_dev) panic("ufs_rename: EXDEV"); /* * Account for ".." in new directory. * When source and destination have the same * parent we don't fool with the link count. 
*/ if (doingdirectory && newparent) { if ((nlink_t)dp->i_nlink >= LINK_MAX) { error = EMLINK; goto bad; } dp->i_effnlink++; dp->i_nlink++; dp->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(tdvp)) softdep_change_linkcnt(dp); error = UFS_UPDATE(tdvp, !(DOINGSOFTDEP(tdvp) | DOINGASYNC(tdvp))); if (error) goto bad; } ufs_makedirentry(ip, tcnp, &newdir); error = ufs_direnter(tdvp, NULL, &newdir, tcnp, NULL); if (error) { if (doingdirectory && newparent) { dp->i_effnlink--; dp->i_nlink--; dp->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(tdvp)) softdep_change_linkcnt(dp); (void)UFS_UPDATE(tdvp, 1); } goto bad; } VN_KNOTE(tdvp, NOTE_WRITE); vput(tdvp); } else { if (xp->i_dev != dp->i_dev || xp->i_dev != ip->i_dev) panic("ufs_rename: EXDEV"); /* * Short circuit rename(foo, foo). */ if (xp->i_number == ip->i_number) panic("ufs_rename: same file"); /* * If the parent directory is "sticky", then the caller * must possess VADMIN for the parent directory, or the * destination of the rename. This implements append-only * directories. */ if ((dp->i_mode & S_ISTXT) && VOP_ACCESS(tdvp, VADMIN, tcnp->cn_cred, p) && VOP_ACCESS(tvp, VADMIN, tcnp->cn_cred, p)) { error = EPERM; goto bad; } /* * Target must be empty if a directory and have no links * to it. Also, ensure source and target are compatible * (both directories, or both not directories). */ if ((xp->i_mode&IFMT) == IFDIR) { if ((xp->i_effnlink > 2) || !ufs_dirempty(xp, dp->i_number, tcnp->cn_cred)) { error = ENOTEMPTY; goto bad; } if (!doingdirectory) { error = ENOTDIR; goto bad; } cache_purge(tdvp); } else if (doingdirectory) { error = EISDIR; goto bad; } error = ufs_dirrewrite(dp, xp, ip->i_number, IFTODT(ip->i_mode), (doingdirectory && newparent) ? 
newparent : doingdirectory); if (error) goto bad; if (doingdirectory) { if (!newparent) { dp->i_effnlink--; if (DOINGSOFTDEP(tdvp)) softdep_change_linkcnt(dp); } xp->i_effnlink--; if (DOINGSOFTDEP(tvp)) softdep_change_linkcnt(xp); } if (doingdirectory && !DOINGSOFTDEP(tvp)) { /* * Truncate inode. The only stuff left in the directory * is "." and "..". The "." reference is inconsequential * since we are quashing it. We have removed the "." * reference and the reference in the parent directory, * but there may be other hard links. The soft * dependency code will arrange to do these operations * after the parent directory entry has been deleted on * disk, so when running with that code we avoid doing * them now. */ if (!newparent) { dp->i_nlink--; dp->i_flag |= IN_CHANGE; } xp->i_nlink--; xp->i_flag |= IN_CHANGE; ioflag = DOINGASYNC(tvp) ? 0 : IO_SYNC; if ((error = UFS_TRUNCATE(tvp, (off_t)0, ioflag, tcnp->cn_cred, tcnp->cn_proc)) != 0) goto bad; } VN_KNOTE(tdvp, NOTE_WRITE); vput(tdvp); VN_KNOTE(tvp, NOTE_DELETE); vput(tvp); xp = NULL; } /* * 3) Unlink the source. */ fcnp->cn_flags &= ~MODMASK; fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; if ((fcnp->cn_flags & SAVESTART) == 0) panic("ufs_rename: lost from startdir"); VREF(fdvp); error = relookup(fdvp, &fvp, fcnp); if (error == 0) vrele(fdvp); if (fvp != NULL) { xp = VTOI(fvp); dp = VTOI(fdvp); } else { /* * From name has disappeared. */ if (doingdirectory) panic("ufs_rename: lost dir entry"); vrele(ap->a_fvp); return (0); } /* * Ensure that the directory entry still exists and has not * changed while the new name has been entered. If the source is * a file then the entry may have been unlinked or renamed. In * either case there is no further work to be done. If the source * is a directory then it cannot have been rmdir'ed; the IN_RENAME * flag ensures that it cannot be moved by another rename or removed * by a rmdir. 
 */
	if (xp != ip) {
		if (doingdirectory)
			panic("ufs_rename: lost dir entry");
	} else {
		/*
		 * If the source is a directory with a
		 * new parent, the link count of the old
		 * parent directory must be decremented
		 * and ".." set to point to the new parent.
		 */
		if (doingdirectory && newparent) {
			xp->i_offset = mastertemplate.dot_reclen;
			ufs_dirrewrite(xp, dp, newparent, DT_DIR, 0);
			cache_purge(fdvp);
		}
		error = ufs_dirremove(fdvp, xp, fcnp->cn_flags, 0);
		xp->i_flag &= ~IN_RENAME;
	}
	VN_KNOTE(fvp, NOTE_RENAME);
	if (dp)
		vput(fdvp);
	if (xp)
		vput(fvp);
	vrele(ap->a_fvp);
	return (error);

bad:
	if (xp)
		vput(ITOV(xp));
	vput(ITOV(dp));
out:
	/*
	 * Error recovery: undo the provisional link count bump on the
	 * source inode and clear the in-rename marker.  The source vnode
	 * must be relocked before its counts can be touched.
	 */
	if (doingdirectory)
		ip->i_flag &= ~IN_RENAME;
	if (vn_lock(fvp, LK_EXCLUSIVE, p) == 0) {
		ip->i_effnlink--;
		ip->i_nlink--;
		ip->i_flag |= IN_CHANGE;
		ip->i_flag &= ~IN_RENAME;
		if (DOINGSOFTDEP(fvp))
			softdep_change_linkcnt(ip);
		vput(fvp);
	} else
		vrele(fvp);
	return (error);
}

/*
 * Mkdir system call
 */
int
ufs_mkdir(ap)
	struct vop_mkdir_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
		struct vattr *a_vap;
	} */ *ap;
{
	register struct vnode *dvp = ap->a_dvp;
	register struct vattr *vap = ap->a_vap;
	register struct componentname *cnp = ap->a_cnp;
	register struct inode *ip, *dp;
	struct vnode *tvp;
	struct buf *bp;
	struct dirtemplate dirtemplate, *dtp;
	struct direct newdir;
#ifdef UFS_ACL
	struct acl *acl, *dacl;
#endif
	int error, dmode;
	long blkoff;

#ifdef DIAGNOSTIC
	if ((cnp->cn_flags & HASBUF) == 0)
		panic("ufs_mkdir: no name");
#endif
	dp = VTOI(dvp);
	if ((nlink_t)dp->i_nlink >= LINK_MAX) {
		error = EMLINK;
		goto out;
	}
	dmode = vap->va_mode & 0777;
	dmode |= IFDIR;
	/*
	 * Must simulate part of ufs_makeinode here to acquire the inode,
	 * but not have it entered in the parent directory. The entry is
	 * made later after writing "." and ".." entries.
	 */
	error = UFS_VALLOC(dvp, dmode, cnp->cn_cred, &tvp);
	if (error)
		goto out;
	ip = VTOI(tvp);
	/* New directory inherits its group from the parent directory. */
	ip->i_gid = dp->i_gid;
#ifdef SUIDDIR
	{
#ifdef QUOTA
		struct ucred ucred, *ucp;
		ucp = cnp->cn_cred;
#endif
		/*
		 * If we are hacking owners here, (only do this where told to)
		 * and we are not giving it TO root, (would subvert quotas)
		 * then go ahead and give it to the other user.
		 * The new directory also inherits the SUID bit.
		 * If user's UID and dir UID are the same,
		 * 'give it away' so that the SUID is still forced on.
		 */
		if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) &&
		    (dp->i_mode & ISUID) && dp->i_uid) {
			dmode |= ISUID;
			ip->i_uid = dp->i_uid;
#ifdef QUOTA
			if (dp->i_uid != cnp->cn_cred->cr_uid) {
				/*
				 * Make sure the correct user gets charged
				 * for the space.
				 * Make a dummy credential for the victim.
				 * XXX This seems to never be accessed out of
				 * our context so a stack variable is ok.
				 */
				ucred.cr_ref = 1;
				ucred.cr_uid = ip->i_uid;
				ucred.cr_ngroups = 1;
				ucred.cr_groups[0] = dp->i_gid;
				ucp = &ucred;
			}
#endif
		} else
			ip->i_uid = cnp->cn_cred->cr_uid;
#ifdef QUOTA
		if ((error = getinoquota(ip)) ||
		    (error = chkiq(ip, 1, ucp, 0))) {
			UFS_VFREE(tvp, ip->i_number, dmode);
			vput(tvp);
			return (error);
		}
#endif
	}
#else	/* !SUIDDIR */
	ip->i_uid = cnp->cn_cred->cr_uid;
#ifdef QUOTA
	if ((error = getinoquota(ip)) ||
	    (error = chkiq(ip, 1, cnp->cn_cred, 0))) {
		UFS_VFREE(tvp, ip->i_number, dmode);
		vput(tvp);
		return (error);
	}
#endif
#endif	/* !SUIDDIR */
	ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
#ifdef UFS_ACL
	MALLOC(acl, struct acl *, sizeof(*acl), M_ACL, M_WAITOK);
	MALLOC(dacl, struct acl *, sizeof(*acl), M_ACL, M_WAITOK);

	/*
	 * Retrieve default ACL from parent, if any.
	 */
	error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, acl, cnp->cn_cred,
	    cnp->cn_proc);
	switch (error) {
	case 0:
		/*
		 * Retrieved a default ACL, so merge mode and ACL if
		 * necessary.
		 */
		if (acl->acl_cnt != 0) {
			/*
			 * Two possible ways for default ACL to not be
			 * present.  First, the EA can be undefined,
			 * or second, the default ACL can be blank.
			 * If it's blank, fall through to the it's
			 * not defined case.
			 */
			ip->i_mode = dmode;
			/* Keep a copy to install as the new dir's default. */
			*dacl = *acl;
			ufs_sync_acl_from_inode(ip, acl);
			break;
		}
		/* FALLTHROUGH */
	case EOPNOTSUPP:
		/*
		 * Just use the mode as-is.
		 */
		ip->i_mode = dmode;
		FREE(acl, M_ACL);
		FREE(dacl, M_ACL);
		dacl = acl = NULL;
		break;
	default:
		UFS_VFREE(tvp, ip->i_number, dmode);
		vput(tvp);
		return (error);
	}
#else	/* !UFS_ACL */
	ip->i_mode = dmode;
#endif	/* !UFS_ACL */
	tvp->v_type = VDIR;	/* Rest init'd in getnewvnode(). */
	ip->i_effnlink = 2;
	ip->i_nlink = 2;
	if (DOINGSOFTDEP(tvp))
		softdep_change_linkcnt(ip);
	if (cnp->cn_flags & ISWHITEOUT)
		ip->i_flags |= UF_OPAQUE;

	/*
	 * Bump link count in parent directory to reflect work done below.
	 * Should be done before reference is created so cleanup is
	 * possible if we crash.
	 */
	dp->i_effnlink++;
	dp->i_nlink++;
	dp->i_flag |= IN_CHANGE;
	if (DOINGSOFTDEP(dvp))
		softdep_change_linkcnt(dp);
	error = UFS_UPDATE(tvp, !(DOINGSOFTDEP(dvp) | DOINGASYNC(dvp)));
	if (error)
		goto bad;
#ifdef UFS_ACL
	if (acl != NULL) {
		/*
		 * XXX: If we abort now, will Soft Updates notify the extattr
		 * code that the EAs for the file need to be released?
		 */
		error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, acl, cnp->cn_cred,
		    cnp->cn_proc);
		if (error == 0)
			error = VOP_SETACL(tvp, ACL_TYPE_DEFAULT, dacl,
			    cnp->cn_cred, cnp->cn_proc);
		switch (error) {
		case 0:
			break;
		case EOPNOTSUPP:
			/*
			 * XXX: This should not happen, as EOPNOTSUPP above
			 * was supposed to free acl.
			 */
			printf("ufs_mkdir: VOP_GETACL() but no VOP_SETACL()\n");
			/* panic("ufs_mkdir: VOP_GETACL() but no VOP_SETACL()"); */
			break;
		default:
			FREE(acl, M_ACL);
			FREE(dacl, M_ACL);
			goto bad;
		}
		FREE(acl, M_ACL);
		FREE(dacl, M_ACL);
	}
#endif /* !UFS_ACL */

	/*
	 * Initialize directory with "." and ".." from static template.
	 */
	if (dvp->v_mount->mnt_maxsymlinklen > 0 )
		dtp = &mastertemplate;
	else
		dtp = (struct dirtemplate *)&omastertemplate;
	dirtemplate = *dtp;
	dirtemplate.dot_ino = ip->i_number;
	dirtemplate.dotdot_ino = dp->i_number;
	if ((error = VOP_BALLOC(tvp, (off_t)0, DIRBLKSIZ, cnp->cn_cred,
	    B_CLRBUF, &bp)) != 0)
		goto bad;
	ip->i_size = DIRBLKSIZ;
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	vnode_pager_setsize(tvp, (u_long)ip->i_size);
	bcopy((caddr_t)&dirtemplate, (caddr_t)bp->b_data, sizeof dirtemplate);
	if (DOINGSOFTDEP(tvp)) {
		/*
		 * Ensure that the entire newly allocated block is a
		 * valid directory so that future growth within the
		 * block does not have to ensure that the block is
		 * written before the inode.
		 */
		blkoff = DIRBLKSIZ;
		while (blkoff < bp->b_bcount) {
			((struct direct *)
			    (bp->b_data + blkoff))->d_reclen = DIRBLKSIZ;
			blkoff += DIRBLKSIZ;
		}
	}
	if ((error = UFS_UPDATE(tvp, !(DOINGSOFTDEP(tvp) |
	    DOINGASYNC(tvp)))) != 0) {
		(void)BUF_WRITE(bp);
		goto bad;
	}
	/*
	 * Directory set up, now install its entry in the parent directory.
	 *
	 * If we are not doing soft dependencies, then we must write out the
	 * buffer containing the new directory body before entering the new
	 * name in the parent. If we are doing soft dependencies, then the
	 * buffer containing the new directory body will be passed to and
	 * released in the soft dependency code after the code has attached
	 * an appropriate ordering dependency to the buffer which ensures that
	 * the buffer is written before the new name is written in the parent.
	 */
	if (DOINGASYNC(dvp))
		bdwrite(bp);
	else if (!DOINGSOFTDEP(dvp) && ((error = BUF_WRITE(bp))))
		goto bad;
	ufs_makedirentry(ip, cnp, &newdir);
	error = ufs_direnter(dvp, tvp, &newdir, cnp, bp);

bad:
	if (error == 0) {
		VN_KNOTE(dvp, NOTE_WRITE);
		*ap->a_vpp = tvp;
	} else {
		/* Roll back the provisional parent link count bump. */
		dp->i_effnlink--;
		dp->i_nlink--;
		dp->i_flag |= IN_CHANGE;
		if (DOINGSOFTDEP(dvp))
			softdep_change_linkcnt(dp);
		/*
		 * No need to do an explicit VOP_TRUNCATE here, vrele will
		 * do this for us because we set the link count to 0.
		 */
		ip->i_effnlink = 0;
		ip->i_nlink = 0;
		ip->i_flag |= IN_CHANGE;
		if (DOINGSOFTDEP(tvp))
			softdep_change_linkcnt(ip);
		vput(tvp);
	}
out:
	return (error);
}

/*
 * Rmdir system call.
 */
int
ufs_rmdir(ap)
	struct vop_rmdir_args /* {
		struct vnode *a_dvp;
		struct vnode *a_vp;
		struct componentname *a_cnp;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	struct vnode *dvp = ap->a_dvp;
	struct componentname *cnp = ap->a_cnp;
	struct inode *ip, *dp;
	int error, ioflag;

	ip = VTOI(vp);
	dp = VTOI(dvp);

	/*
	 * Do not remove a directory that is in the process of being renamed.
	 * Verify the directory is empty (and valid). Rmdir ".." will not be
	 * valid since ".." will contain a reference to the current directory
	 * and thus be non-empty. Do not allow the removal of mounted on
	 * directories (this can happen when an NFS exported filesystem
	 * tries to remove a locally mounted on directory).
	 */
	error = 0;
	if (ip->i_flag & IN_RENAME) {
		error = EINVAL;
		goto out;
	}
	if (ip->i_effnlink != 2 ||
	    !ufs_dirempty(ip, dp->i_number, cnp->cn_cred)) {
		error = ENOTEMPTY;
		goto out;
	}
	if ((dp->i_flags & APPEND)
	    || (ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))) {
		error = EPERM;
		goto out;
	}
	if (vp->v_mountedhere != 0) {
		error = EINVAL;
		goto out;
	}
	/*
	 * Delete reference to directory before purging
	 * inode.  If we crash in between, the directory
	 * will be reattached to lost+found,
	 */
	dp->i_effnlink--;
	ip->i_effnlink--;
	if (DOINGSOFTDEP(vp)) {
		softdep_change_linkcnt(dp);
		softdep_change_linkcnt(ip);
	}
	error = ufs_dirremove(dvp, ip, cnp->cn_flags, 1);
	if (error) {
		/* Directory entry removal failed; restore the counts. */
		dp->i_effnlink++;
		ip->i_effnlink++;
		if (DOINGSOFTDEP(vp)) {
			softdep_change_linkcnt(dp);
			softdep_change_linkcnt(ip);
		}
		goto out;
	}
	VN_KNOTE(dvp, NOTE_WRITE | NOTE_LINK);
	cache_purge(dvp);
	/*
	 * Truncate inode. The only stuff left in the directory is "." and
	 * "..". The "." reference is inconsequential since we are quashing
	 * it. The soft dependency code will arrange to do these operations
	 * after the parent directory entry has been deleted on disk, so
	 * when running with that code we avoid doing them now.
	 */
	if (!DOINGSOFTDEP(vp)) {
		dp->i_nlink--;
		dp->i_flag |= IN_CHANGE;
		ip->i_nlink--;
		ip->i_flag |= IN_CHANGE;
		ioflag = DOINGASYNC(vp) ? 0 : IO_SYNC;
		error = UFS_TRUNCATE(vp, (off_t)0, ioflag, cnp->cn_cred,
		    cnp->cn_proc);
	}
	cache_purge(vp);
out:
	VN_KNOTE(vp, NOTE_DELETE);
	return (error);
}

/*
 * symlink -- make a symbolic link
 */
int
ufs_symlink(ap)
	struct vop_symlink_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
		struct vattr *a_vap;
		char *a_target;
	} */ *ap;
{
	register struct vnode *vp, **vpp = ap->a_vpp;
	register struct inode *ip;
	int len, error;

	error = ufs_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp,
	    vpp, ap->a_cnp);
	if (error)
		return (error);
	VN_KNOTE(ap->a_dvp, NOTE_WRITE);
	vp = *vpp;
	len = strlen(ap->a_target);
	if (len < vp->v_mount->mnt_maxsymlinklen) {
		/* Short target: store it directly in the inode (fastlink). */
		ip = VTOI(vp);
		bcopy(ap->a_target, (char *)ip->i_shortlink, len);
		ip->i_size = len;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	} else
		error = vn_rdwr(UIO_WRITE, vp, ap->a_target, len, (off_t)0,
		    UIO_SYSSPACE, IO_NODELOCKED, ap->a_cnp->cn_cred, (int *)0,
		    (struct proc *)0);
	if (error)
		vput(vp);
	return (error);
}

/*
 * Vnode op for reading directories.
 *
 * The routine below assumes that the on-disk format of a directory
 * is the same as that defined by <sys/dirent.h>. If the on-disk
 * format changes, then it will be necessary to do a conversion
 * from the on-disk format that read returns to the format defined
 * by <sys/dirent.h>.
 */
int
ufs_readdir(ap)
	struct vop_readdir_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		struct ucred *a_cred;
		int *a_eofflag;
		int *ncookies;
		u_long **a_cookies;
	} */ *ap;
{
	register struct uio *uio = ap->a_uio;
	int error;
	size_t count, lost;
	off_t off;

	if (ap->a_ncookies != NULL)
		/*
		 * Ensure that the block is aligned.  The caller can use
		 * the cookies to determine where in the block to start.
		 */
		uio->uio_offset &= ~(DIRBLKSIZ - 1);
	off = uio->uio_offset;
	count = uio->uio_resid;
	/* Make sure we don't return partial entries. */
	if (count <= ((uio->uio_offset + count) & (DIRBLKSIZ -1)))
		return (EINVAL);
	count -= (uio->uio_offset + count) & (DIRBLKSIZ -1);
	lost = uio->uio_resid - count;
	uio->uio_resid = count;
	uio->uio_iov->iov_len = count;
#	if (BYTE_ORDER == LITTLE_ENDIAN)
	if (ap->a_vp->v_mount->mnt_maxsymlinklen > 0) {
		error = VOP_READ(ap->a_vp, uio, 0, ap->a_cred);
	} else {
		/*
		 * Old filesystem format: d_namlen and d_type are swapped
		 * on little-endian machines, so read into a bounce buffer
		 * and swap them before copying out.
		 */
		struct dirent *dp, *edp;
		struct uio auio;
		struct iovec aiov;
		caddr_t dirbuf;
		int readcnt;
		u_char tmp;

		auio = *uio;
		auio.uio_iov = &aiov;
		auio.uio_iovcnt = 1;
		auio.uio_segflg = UIO_SYSSPACE;
		aiov.iov_len = count;
		MALLOC(dirbuf, caddr_t, count, M_TEMP, M_WAITOK);
		aiov.iov_base = dirbuf;
		error = VOP_READ(ap->a_vp, &auio, 0, ap->a_cred);
		if (error == 0) {
			readcnt = count - auio.uio_resid;
			edp = (struct dirent *)&dirbuf[readcnt];
			for (dp = (struct dirent *)dirbuf; dp < edp; ) {
				tmp = dp->d_namlen;
				dp->d_namlen = dp->d_type;
				dp->d_type = tmp;
				if (dp->d_reclen > 0) {
					dp = (struct dirent *)
					    ((char *)dp + dp->d_reclen);
				} else {
					/* Corrupt record; stop here. */
					error = EIO;
					break;
				}
			}
			if (dp >= edp)
				error = uiomove(dirbuf, readcnt, uio);
		}
		FREE(dirbuf, M_TEMP);
	}
#	else
	error = VOP_READ(ap->a_vp, uio, 0, ap->a_cred);
#	endif
	if (!error && ap->a_ncookies != NULL) {
		/* Generate NFS seek cookies, one per returned entry. */
		struct dirent* dpStart;
		struct dirent* dpEnd;
		struct dirent* dp;
		int ncookies;
		u_long *cookies;
		u_long *cookiep;

		if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
			panic("ufs_readdir: unexpected uio from NFS server");
		dpStart = (struct dirent *)
		    (uio->uio_iov->iov_base - (uio->uio_offset - off));
		dpEnd = (struct dirent *) uio->uio_iov->iov_base;
		for (dp = dpStart, ncookies = 0;
		     dp < dpEnd;
		     dp = (struct dirent *)((caddr_t) dp + dp->d_reclen))
			ncookies++;
		MALLOC(cookies, u_long *, ncookies * sizeof(u_long), M_TEMP,
		    M_WAITOK);
		for (dp = dpStart, cookiep = cookies;
		     dp < dpEnd;
		     dp = (struct dirent *)((caddr_t) dp + dp->d_reclen)) {
			off += dp->d_reclen;
			*cookiep++ = (u_long) off;
		}
		*ap->a_ncookies = ncookies;
		*ap->a_cookies = cookies;
	}
	uio->uio_resid += lost;
	if (ap->a_eofflag)
		*ap->a_eofflag = VTOI(ap->a_vp)->i_size <= uio->uio_offset;
	return (error);
}

/*
 * Return target name of a symbolic link
 */
int
ufs_readlink(ap)
	struct vop_readlink_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		struct ucred *a_cred;
	} */ *ap;
{
	register struct vnode *vp = ap->a_vp;
	register struct inode *ip = VTOI(vp);
	int isize;

	isize = ip->i_size;
	if ((isize < vp->v_mount->mnt_maxsymlinklen) ||
	    (ip->i_din.di_blocks == 0)) { /* XXX - for old fastlink support */
		uiomove((char *)ip->i_shortlink, isize, ap->a_uio);
		return (0);
	}
	return (VOP_READ(vp, ap->a_uio, 0, ap->a_cred));
}

/*
 * Calculate the logical to physical mapping if not done already,
 * then call the device strategy routine.
 *
 * In order to be able to swap to a file, the VOP_BMAP operation may not
 * deadlock on memory.  See ufs_bmap() for details.
 */
int
ufs_strategy(ap)
	struct vop_strategy_args /* {
		struct vnode *a_vp;
		struct buf *a_bp;
	} */ *ap;
{
	register struct buf *bp = ap->a_bp;
	register struct vnode *vp = ap->a_vp;
	register struct inode *ip;
	int error;

	ip = VTOI(vp);
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		panic("ufs_strategy: spec");
	if (bp->b_blkno == bp->b_lblkno) {
		error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno,
		    NULL, NULL);
		if (error) {
			bp->b_error = error;
			bp->b_ioflags |= BIO_ERROR;
			bufdone(bp);
			return (error);
		}
		if ((long)bp->b_blkno == -1)
			vfs_bio_clrbuf(bp);
	}
	if ((long)bp->b_blkno == -1) {
		/* Hole in the file: nothing to transfer. */
		bufdone(bp);
		return (0);
	}
	vp = ip->i_devvp;
	bp->b_dev = vp->v_rdev;
	VOP_STRATEGY(vp, bp);
	return (0);
}

/*
 * Print out the contents of an inode.
*/ int ufs_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct inode *ip = VTOI(vp); printf("tag VT_UFS, ino %lu, on dev %s (%d, %d)", (u_long)ip->i_number, devtoname(ip->i_dev), major(ip->i_dev), minor(ip->i_dev)); if (vp->v_type == VFIFO) fifo_printinfo(vp); lockmgr_printinfo(&vp->v_lock); printf("\n"); return (0); } /* * Read wrapper for special devices. */ int ufsspec_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { int error, resid; struct inode *ip; struct uio *uio; uio = ap->a_uio; resid = uio->uio_resid; error = VOCALL(spec_vnodeop_p, VOFFSET(vop_read), ap); /* * The inode may have been revoked during the call, so it must not * be accessed blindly here or in the other wrapper functions. */ ip = VTOI(ap->a_vp); if (ip != NULL && (uio->uio_resid != resid || (error == 0 && resid != 0))) ip->i_flag |= IN_ACCESS; return (error); } /* * Write wrapper for special devices. */ int ufsspec_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { int error, resid; struct inode *ip; struct uio *uio; uio = ap->a_uio; resid = uio->uio_resid; error = VOCALL(spec_vnodeop_p, VOFFSET(vop_write), ap); ip = VTOI(ap->a_vp); if (ip != NULL && (uio->uio_resid != resid || (error == 0 && resid != 0))) VTOI(ap->a_vp)->i_flag |= IN_CHANGE | IN_UPDATE; return (error); } /* * Close wrapper for special devices. * * Update the times on the inode then do device close. */ int ufsspec_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; mtx_lock(&vp->v_interlock); if (vp->v_usecount > 1) ufs_itimes(vp); mtx_unlock(&vp->v_interlock); return (VOCALL(spec_vnodeop_p, VOFFSET(vop_close), ap)); } /* * Read wrapper for fifos. 
*/ int ufsfifo_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { int error, resid; struct inode *ip; struct uio *uio; uio = ap->a_uio; resid = uio->uio_resid; error = VOCALL(fifo_vnodeop_p, VOFFSET(vop_read), ap); ip = VTOI(ap->a_vp); if ((ap->a_vp->v_mount->mnt_flag & MNT_NOATIME) == 0 && ip != NULL && (uio->uio_resid != resid || (error == 0 && resid != 0))) VTOI(ap->a_vp)->i_flag |= IN_ACCESS; return (error); } /* * Write wrapper for fifos. */ int ufsfifo_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { int error, resid; struct inode *ip; struct uio *uio; uio = ap->a_uio; resid = uio->uio_resid; error = VOCALL(fifo_vnodeop_p, VOFFSET(vop_write), ap); ip = VTOI(ap->a_vp); if (ip != NULL && (uio->uio_resid != resid || (error == 0 && resid != 0))) VTOI(ap->a_vp)->i_flag |= IN_CHANGE | IN_UPDATE; return (error); } /* * Close wrapper for fifos. * * Update the times on the inode then do device close. */ int ufsfifo_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; mtx_lock(&vp->v_interlock); if (vp->v_usecount > 1) ufs_itimes(vp); mtx_unlock(&vp->v_interlock); return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_close), ap)); } /* * Return POSIX pathconf information applicable to ufs filesystems. 
 */
int
ufs_pathconf(ap)
	struct vop_pathconf_args /* {
		struct vnode *a_vp;
		int a_name;
		int *a_retval;
	} */ *ap;
{

	switch (ap->a_name) {
	case _PC_LINK_MAX:
		*ap->a_retval = LINK_MAX;
		return (0);
	case _PC_NAME_MAX:
		*ap->a_retval = NAME_MAX;
		return (0);
	case _PC_PATH_MAX:
		*ap->a_retval = PATH_MAX;
		return (0);
	case _PC_PIPE_BUF:
		*ap->a_retval = PIPE_BUF;
		return (0);
	case _PC_CHOWN_RESTRICTED:
		*ap->a_retval = 1;
		return (0);
	case _PC_NO_TRUNC:
		*ap->a_retval = 1;
		return (0);
	default:
		return (EINVAL);
	}
	/* NOTREACHED */
}

/*
 * Advisory record locking support
 */
int
ufs_advlock(ap)
	struct vop_advlock_args /* {
		struct vnode *a_vp;
		caddr_t a_id;
		int a_op;
		struct flock *a_fl;
		int a_flags;
	} */ *ap;
{
	register struct inode *ip = VTOI(ap->a_vp);

	/* Delegate to the generic local-lock code; i_size bounds EOF locks. */
	return (lf_advlock(ap, &(ip->i_lockf), ip->i_size));
}

/*
 * Initialize the vnode associated with a new inode, handle aliased
 * vnodes.
 */
int
ufs_vinit(mntp, specops, fifoops, vpp)
	struct mount *mntp;
	vop_t **specops;
	vop_t **fifoops;
	struct vnode **vpp;
{
	struct inode *ip;
	struct vnode *vp;
	struct timeval tv;

	vp = *vpp;
	ip = VTOI(vp);
	/* Derive the vnode type from the inode mode, then pick the op vector. */
	switch(vp->v_type = IFTOVT(ip->i_mode)) {
	case VCHR:
	case VBLK:
		vp->v_op = specops;
		/*
		 * addaliasu() may return a different vnode for this
		 * device alias; the inode must track the replacement.
		 */
		vp = addaliasu(vp, ip->i_rdev);
		ip->i_vnode = vp;
		break;
	case VFIFO:
		vp->v_op = fifoops;
		break;
	default:
		break;
	}
	if (ip->i_number == ROOTINO)
		vp->v_flag |= VROOT;
	/*
	 * Initialize modrev times
	 * (4294 ~= 2^32 / 10^6: scales microseconds into the low word).
	 */
	getmicrouptime(&tv);
	SETHIGH(ip->i_modrev, tv.tv_sec);
	SETLOW(ip->i_modrev, tv.tv_usec * 4294);
	*vpp = vp;
	return (0);
}

/*
 * Allocate a new inode.
 * Vnode dvp must be locked.
 */
int
ufs_makeinode(mode, dvp, vpp, cnp)
	int mode;
	struct vnode *dvp;
	struct vnode **vpp;
	struct componentname *cnp;
{
	register struct inode *ip, *pdir;
	struct direct newdir;
	struct vnode *tvp;
#ifdef UFS_ACL
	struct acl *acl;
#endif
	int error;

	pdir = VTOI(dvp);
#ifdef DIAGNOSTIC
	if ((cnp->cn_flags & HASBUF) == 0)
		panic("ufs_makeinode: no name");
#endif
	*vpp = NULL;
	if ((mode & IFMT) == 0)
		mode |= IFREG;
	error = UFS_VALLOC(dvp, mode, cnp->cn_cred, &tvp);
	if (error)
		return (error);
	ip = VTOI(tvp);
	/* New file inherits its group from the parent directory. */
	ip->i_gid = pdir->i_gid;
#ifdef SUIDDIR
	{
#ifdef QUOTA
		struct ucred ucred, *ucp;
		ucp = cnp->cn_cred;
#endif
		/*
		 * If we are not the owner of the directory,
		 * and we are hacking owners here, (only do this where told to)
		 * and we are not giving it TO root, (would subvert quotas)
		 * then go ahead and give it to the other user.
		 * Note that this drops off the execute bits for security.
		 */
		if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) &&
		    (pdir->i_mode & ISUID) &&
		    (pdir->i_uid != cnp->cn_cred->cr_uid) && pdir->i_uid) {
			ip->i_uid = pdir->i_uid;
			mode &= ~07111;
#ifdef QUOTA
			/*
			 * Make sure the correct user gets charged
			 * for the space.
			 * Quickly knock up a dummy credential for the victim.
			 * XXX This seems to never be accessed out of our
			 * context so a stack variable is ok.
			 */
			ucred.cr_ref = 1;
			ucred.cr_uid = ip->i_uid;
			ucred.cr_ngroups = 1;
			ucred.cr_groups[0] = pdir->i_gid;
			ucp = &ucred;
#endif
		} else
			ip->i_uid = cnp->cn_cred->cr_uid;
#ifdef QUOTA
		if ((error = getinoquota(ip)) ||
		    (error = chkiq(ip, 1, ucp, 0))) {
			UFS_VFREE(tvp, ip->i_number, mode);
			vput(tvp);
			return (error);
		}
#endif
	}
#else	/* !SUIDDIR */
	ip->i_uid = cnp->cn_cred->cr_uid;
#ifdef QUOTA
	if ((error = getinoquota(ip)) ||
	    (error = chkiq(ip, 1, cnp->cn_cred, 0))) {
		UFS_VFREE(tvp, ip->i_number, mode);
		vput(tvp);
		return (error);
	}
#endif
#endif	/* !SUIDDIR */
	ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
#ifdef UFS_ACL
	MALLOC(acl, struct acl *, sizeof(*acl), M_ACL, M_WAITOK);
	/*
	 * Retrieve default ACL for parent, if any.
	 */
	error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, acl, cnp->cn_cred,
	    cnp->cn_proc);
	switch (error) {
	case 0:
		/*
		 * Retrieved a default ACL, so merge mode and ACL if
		 * necessary.
		 */
		if (acl->acl_cnt != 0) {
			/*
			 * Two possible ways for default ACL to not be
			 * present.  First, the EA can be undefined,
			 * or second, the default ACL can be blank.
			 * If it's blank, fall through to the it's
			 * not defined case.
			 */
			ip->i_mode = mode;
			ufs_sync_acl_from_inode(ip, acl);
			break;
		}
		/* FALLTHROUGH */
	case EOPNOTSUPP:
		/*
		 * Just use the mode as-is.
		 */
		ip->i_mode = mode;
		FREE(acl, M_ACL);
		acl = NULL;
		break;
	default:
		UFS_VFREE(tvp, ip->i_number, mode);
		vput(tvp);
		return (error);
	}
#else	/* !UFS_ACL */
	ip->i_mode = mode;
#endif	/* !UFS_ACL */
	tvp->v_type = IFTOVT(mode);	/* Rest init'd in getnewvnode(). */
	ip->i_effnlink = 1;
	ip->i_nlink = 1;
	if (DOINGSOFTDEP(tvp))
		softdep_change_linkcnt(ip);
	/* Strip the setgid bit unless the creator is in the file's group. */
	if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred) &&
	    suser_xxx(cnp->cn_cred, NULL, PRISON_ROOT))
		ip->i_mode &= ~ISGID;

	if (cnp->cn_flags & ISWHITEOUT)
		ip->i_flags |= UF_OPAQUE;

	/*
	 * Make sure inode goes to disk before directory entry.
	 */
	error = UFS_UPDATE(tvp, !(DOINGSOFTDEP(tvp) | DOINGASYNC(tvp)));
	if (error)
		goto bad;
#ifdef UFS_ACL
	if (acl != NULL) {
		/*
		 * XXX: If we abort now, will Soft Updates notify the extattr
		 * code that the EAs for the file need to be released?
		 */
		error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, acl, cnp->cn_cred,
		    cnp->cn_proc);
		switch (error) {
		case 0:
			break;
		case EOPNOTSUPP:
			/*
			 * XXX: This should not happen, as EOPNOTSUPP above was
			 * supposed to free acl.
			 */
			printf("ufs_makeinode: VOP_GETACL() but no "
			    "VOP_SETACL()\n");
			/* panic("ufs_makeinode: VOP_GETACL() but no "
			    "VOP_SETACL()"); */
			break;
		default:
			FREE(acl, M_ACL);
			goto bad;
		}
		FREE(acl, M_ACL);
	}
#endif /* !UFS_ACL */
	ufs_makedirentry(ip, cnp, &newdir);
	error = ufs_direnter(dvp, tvp, &newdir, cnp, NULL);
	if (error)
		goto bad;
	*vpp = tvp;
	return (0);

bad:
	/*
	 * Write error occurred trying to update the inode
	 * or the directory so must deallocate the inode.
	 */
	ip->i_effnlink = 0;
	ip->i_nlink = 0;
	ip->i_flag |= IN_CHANGE;
	if (DOINGSOFTDEP(tvp))
		softdep_change_linkcnt(ip);
	vput(tvp);
	return (error);
}

/*
 * Catch-all for vnode operations a UFS child filesystem must provide
 * itself; reaching this entry is a programming error.
 */
static int
ufs_missingop(ap)
	struct vop_generic_args *ap;
{

	panic("no vop function for %s in ufs child",
	    ap->a_desc->vdesc_name);
	return (EOPNOTSUPP);
}

static struct filterops ufsread_filtops =
	{ 1, NULL, filt_ufsdetach, filt_ufsread };
static struct filterops ufsvnode_filtops =
	{ 1, NULL, filt_ufsdetach, filt_ufsvnode };

/*
 * Attach a knote to this vnode's note list, selecting the filter
 * implementation from the requested kevent filter type.
 */
static int
ufs_kqfilter(ap)
	struct vop_kqfilter_args /* {
		struct vnode *a_vp;
		struct knote *a_kn;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	struct knote *kn = ap->a_kn;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &ufsread_filtops;
		break;
	case EVFILT_VNODE:
		kn->kn_fop = &ufsvnode_filtops;
		break;
	default:
		return (1);
	}
	kn->kn_hook = (caddr_t)vp;

	mtx_lock(&vp->v_pollinfo.vpi_lock);
	SLIST_INSERT_HEAD(&vp->v_pollinfo.vpi_selinfo.si_note, kn, kn_selnext);
	mtx_unlock(&vp->v_pollinfo.vpi_lock);

	return (0);
}

/*
 * Detach a knote from the vnode it was attached to by ufs_kqfilter().
 */
static void
filt_ufsdetach(struct knote *kn)
{
	struct vnode *vp = (struct vnode *)kn->kn_hook;

	mtx_lock(&vp->v_pollinfo.vpi_lock);
	SLIST_REMOVE(&vp->v_pollinfo.vpi_selinfo.si_note,
	    kn, knote, kn_selnext);
	mtx_unlock(&vp->v_pollinfo.vpi_lock);
}

/*ARGSUSED*/
static int
filt_ufsread(struct knote *kn, long hint)
{
	struct vnode *vp = (struct vnode *)kn->kn_hook;
	struct inode *ip = VTOI(vp);

	/*
	 * filesystem is gone, so set the EOF flag and schedule
	 * the knote for deletion.
	 */
	if (hint == NOTE_REVOKE) {
		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
		return (1);
	}
	/* Report the number of readable bytes past the file offset. */
	kn->kn_data = ip->i_size - kn->kn_fp->f_offset;
	return (kn->kn_data != 0);
}

static int
filt_ufsvnode(struct knote *kn, long hint)
{

	/* Accumulate the events the listener asked for; fire on revoke. */
	if (kn->kn_sfflags & hint)
		kn->kn_fflags |= hint;
	if (hint == NOTE_REVOKE) {
		kn->kn_flags |= EV_EOF;
		return (1);
	}
	return (kn->kn_fflags != 0);
}

/* Global vfs data structures for ufs. */
static vop_t **ufs_vnodeop_p;
static struct vnodeopv_entry_desc ufs_vnodeop_entries[] = {
	{ &vop_default_desc,		(vop_t *) vop_defaultop },
	{ &vop_fsync_desc,		(vop_t *) ufs_missingop },
	{ &vop_read_desc,		(vop_t *) ufs_missingop },
	{ &vop_reallocblks_desc,	(vop_t *) ufs_missingop },
	{ &vop_write_desc,		(vop_t *) ufs_missingop },
	{ &vop_access_desc,		(vop_t *) ufs_access },
	{ &vop_advlock_desc,		(vop_t *) ufs_advlock },
	{ &vop_bmap_desc,		(vop_t *) ufs_bmap },
	{ &vop_cachedlookup_desc,	(vop_t *) ufs_lookup },
	{ &vop_close_desc,		(vop_t *) ufs_close },
	{ &vop_create_desc,		(vop_t *) ufs_create },
	{ &vop_getattr_desc,		(vop_t *) ufs_getattr },
	{ &vop_inactive_desc,		(vop_t *) ufs_inactive },
	{ &vop_islocked_desc,		(vop_t *) vop_stdislocked },
	{ &vop_link_desc,		(vop_t *) ufs_link },
	{ &vop_lock_desc,		(vop_t *) vop_stdlock },
	{ &vop_lookup_desc,		(vop_t *) vfs_cache_lookup },
	{ &vop_mkdir_desc,		(vop_t *) ufs_mkdir },
	{ &vop_mknod_desc,		(vop_t *) ufs_mknod },
	{ &vop_open_desc,		(vop_t *) ufs_open },
	{ &vop_pathconf_desc,		(vop_t *) ufs_pathconf },
	{ &vop_poll_desc,		(vop_t *) vop_stdpoll },
	{ &vop_kqfilter_desc,		(vop_t *) ufs_kqfilter },
	{ &vop_getwritemount_desc,	(vop_t *) vop_stdgetwritemount },
	{ &vop_print_desc,		(vop_t *) ufs_print },
	{ &vop_readdir_desc,		(vop_t *) ufs_readdir },
	{ &vop_readlink_desc,		(vop_t *) ufs_readlink },
	{ &vop_reclaim_desc,		(vop_t *) ufs_reclaim },
	{ &vop_remove_desc,		(vop_t *) ufs_remove },
	{ &vop_rename_desc,		(vop_t *) ufs_rename },
	{ &vop_rmdir_desc,		(vop_t *) ufs_rmdir },
	{ &vop_setattr_desc,		(vop_t *) ufs_setattr },
	{ &vop_strategy_desc,		(vop_t *) ufs_strategy },
	{ &vop_symlink_desc,		(vop_t *) ufs_symlink },
	{ &vop_unlock_desc,		(vop_t *) vop_stdunlock },
	{ &vop_whiteout_desc,		(vop_t *) ufs_whiteout },
#ifdef UFS_ACL
	{ &vop_getacl_desc,		(vop_t *) ufs_getacl },
	{ &vop_setacl_desc,		(vop_t *) ufs_setacl },
	{ &vop_aclcheck_desc,		(vop_t *) ufs_aclcheck },
#endif
	{ NULL, NULL }
};
static struct vnodeopv_desc ufs_vnodeop_opv_desc =
	{ &ufs_vnodeop_p, ufs_vnodeop_entries };

static vop_t **ufs_specop_p;
static struct vnodeopv_entry_desc ufs_specop_entries[] = {
	{ &vop_default_desc,		(vop_t *) spec_vnoperate },
	{ &vop_fsync_desc,		(vop_t *) ufs_missingop },
	{ &vop_access_desc,		(vop_t *) ufs_access },
	{ &vop_close_desc,		(vop_t *) ufsspec_close },
	{ &vop_getattr_desc,		(vop_t *) ufs_getattr },
	{ &vop_inactive_desc,		(vop_t *) ufs_inactive },
	{ &vop_islocked_desc,		(vop_t *) vop_stdislocked },
	{ &vop_lock_desc,		(vop_t *) vop_stdlock },
	{ &vop_print_desc,		(vop_t *) ufs_print },
	{ &vop_read_desc,		(vop_t *) ufsspec_read },
	{ &vop_reclaim_desc,		(vop_t *) ufs_reclaim },
	{ &vop_setattr_desc,		(vop_t *) ufs_setattr },
	{ &vop_unlock_desc,		(vop_t *) vop_stdunlock },
	{ &vop_write_desc,		(vop_t *) ufsspec_write },
#ifdef UFS_ACL
	{ &vop_getacl_desc,		(vop_t *) ufs_getacl },
	{ &vop_setacl_desc,		(vop_t *) ufs_setacl },
	{ &vop_aclcheck_desc,		(vop_t *) ufs_aclcheck },
#endif
	{ NULL, NULL }
};
static struct vnodeopv_desc ufs_specop_opv_desc =
	{ &ufs_specop_p, ufs_specop_entries };

static vop_t **ufs_fifoop_p;
static struct vnodeopv_entry_desc ufs_fifoop_entries[] = {
	{ &vop_default_desc,		(vop_t *) fifo_vnoperate },
	{ &vop_fsync_desc,		(vop_t *) ufs_missingop },
	{ &vop_access_desc,		(vop_t *) ufs_access },
	{ &vop_close_desc,		(vop_t *) ufsfifo_close },
	{ &vop_getattr_desc,		(vop_t *) ufs_getattr },
	{ &vop_inactive_desc,		(vop_t *) ufs_inactive },
	{ &vop_islocked_desc,		(vop_t *) vop_stdislocked },
	{ &vop_lock_desc,		(vop_t *) vop_stdlock },
	{ &vop_print_desc,		(vop_t *) ufs_print },
	{ &vop_read_desc,		(vop_t *) ufsfifo_read },
	{ &vop_reclaim_desc,		(vop_t *) ufs_reclaim },
	{ &vop_setattr_desc,		(vop_t *) ufs_setattr },
	{ &vop_unlock_desc,		(vop_t *) vop_stdunlock },
	{ &vop_write_desc,		(vop_t *) ufsfifo_write },
#ifdef UFS_ACL
	{ &vop_getacl_desc,		(vop_t *) ufs_getacl },
	{ &vop_setacl_desc,		(vop_t *) ufs_setacl },
	{ &vop_aclcheck_desc,		(vop_t *) ufs_aclcheck },
#endif
	{ NULL, NULL }
};
static struct vnodeopv_desc ufs_fifoop_opv_desc =
	{ &ufs_fifoop_p, ufs_fifoop_entries };

VNODEOP_SET(ufs_vnodeop_opv_desc);
VNODEOP_SET(ufs_specop_opv_desc);
VNODEOP_SET(ufs_fifoop_opv_desc);

/* Dispatch a generic vnode operation through the ufs vnodeop vector. */
int
ufs_vnoperate(ap)
	struct vop_generic_args /* {
		struct vnodeop_desc *a_desc;
	} */ *ap;
{
	return (VOCALL(ufs_vnodeop_p, ap->a_desc->vdesc_offset, ap));
}

/* Dispatch a generic vnode operation through the ufs fifo vector. */
int
ufs_vnoperatefifo(ap)
	struct vop_generic_args /* {
		struct vnodeop_desc *a_desc;
	} */ *ap;
{
	return (VOCALL(ufs_fifoop_p, ap->a_desc->vdesc_offset, ap));
}

/* Dispatch a generic vnode operation through the ufs special-device vector. */
int
ufs_vnoperatespec(ap)
	struct vop_generic_args /* {
		struct vnodeop_desc *a_desc;
	} */ *ap;
{
	return (VOCALL(ufs_specop_p, ap->a_desc->vdesc_offset, ap));
}