Index: head/bin/df/df.c =================================================================== --- head/bin/df/df.c (revision 122536) +++ head/bin/df/df.c (revision 122537) @@ -1,587 +1,589 @@ /* * Copyright (c) 1980, 1990, 1993, 1994 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #if 0 #ifndef lint static const char copyright[] = "@(#) Copyright (c) 1980, 1990, 1993, 1994\n\ The Regents of the University of California. All rights reserved.\n"; #endif /* not lint */ #ifndef lint static char sccsid[] = "@(#)df.c 8.9 (Berkeley) 5/8/95"; #endif /* not lint */ #endif #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include "extern.h" #define UNITS_SI 1 #define UNITS_2 2 #define KILO_SZ(n) (n) #define MEGA_SZ(n) ((n) * (n)) #define GIGA_SZ(n) ((n) * (n) * (n)) #define TERA_SZ(n) ((n) * (n) * (n) * (n)) #define PETA_SZ(n) ((n) * (n) * (n) * (n) * (n)) #define KILO_2_SZ (KILO_SZ(1024ULL)) #define MEGA_2_SZ (MEGA_SZ(1024ULL)) #define GIGA_2_SZ (GIGA_SZ(1024ULL)) #define TERA_2_SZ (TERA_SZ(1024ULL)) #define PETA_2_SZ (PETA_SZ(1024ULL)) #define KILO_SI_SZ (KILO_SZ(1000ULL)) #define MEGA_SI_SZ (MEGA_SZ(1000ULL)) #define GIGA_SI_SZ (GIGA_SZ(1000ULL)) #define TERA_SI_SZ (TERA_SZ(1000ULL)) #define PETA_SI_SZ (PETA_SZ(1000ULL)) /* Maximum widths of various fields. 
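A minimal standalone sketch (not part of this diff) of how the nested size macros above expand: the base-2 table is built from powers of 1024ULL and the SI table from powers of 1000ULL, so GIGA_2_SZ is 1024^3 while GIGA_SI_SZ is 1000^3.

/* Sketch: expansion of the unit-size macros defined above. */
#include <stdio.h>
#include <inttypes.h>

#define KILO_SZ(n) (n)
#define MEGA_SZ(n) ((n) * (n))
#define GIGA_SZ(n) ((n) * (n) * (n))

int
main(void)
{
	printf("GIGA_2_SZ  = %ju\n", (uintmax_t)GIGA_SZ(1024ULL)); /* 1073741824 */
	printf("GIGA_SI_SZ = %ju\n", (uintmax_t)GIGA_SZ(1000ULL)); /* 1000000000 */
	return (0);
}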
*/ struct maxwidths { size_t mntfrom; size_t total; size_t used; size_t avail; size_t iused; size_t ifree; }; static uintmax_t vals_si [] = { 1, KILO_SI_SZ, MEGA_SI_SZ, GIGA_SI_SZ, TERA_SI_SZ, PETA_SI_SZ }; static uintmax_t vals_base2[] = { 1, KILO_2_SZ, MEGA_2_SZ, GIGA_2_SZ, TERA_2_SZ, PETA_2_SZ }; static uintmax_t *valp; typedef enum { NONE, KILO, MEGA, GIGA, TERA, PETA, UNIT_MAX } unit_t; static unit_t unitp [] = { NONE, KILO, MEGA, GIGA, TERA, PETA }; static char *getmntpt(const char *); -static size_t longwidth(long); +static size_t int64width(int64_t); static char *makenetvfslist(void); -static void prthuman(const struct statfs *, size_t); +static void prthuman(const struct statfs *, int64_t); static void prthumanval(double); static void prtstat(struct statfs *, struct maxwidths *); static size_t regetmntinfo(struct statfs **, long, const char **); static unit_t unit_adjust(double *); static void update_maxwidths(struct maxwidths *, const struct statfs *); static void usage(void); static __inline u_int max(u_int a, u_int b) { return (a > b ? a : b); } static int aflag = 0, hflag, iflag, nflag; static struct ufs_args mdev; int main(int argc, char *argv[]) { struct stat stbuf; struct statfs statfsbuf, *mntbuf; struct maxwidths maxwidths; const char *fstype; char *mntpath, *mntpt; const char **vfslist; size_t i, mntsize; int ch, rv; fstype = "ufs"; vfslist = NULL; while ((ch = getopt(argc, argv, "abgHhiklmnPt:")) != -1) switch (ch) { case 'a': aflag = 1; break; case 'b': /* FALLTHROUGH */ case 'P': putenv("BLOCKSIZE=512"); hflag = 0; break; case 'g': putenv("BLOCKSIZE=1g"); hflag = 0; break; case 'H': hflag = UNITS_SI; valp = vals_si; break; case 'h': hflag = UNITS_2; valp = vals_base2; break; case 'i': iflag = 1; break; case 'k': putenv("BLOCKSIZE=1k"); hflag = 0; break; case 'l': if (vfslist != NULL) errx(1, "-l and -t are mutually exclusive."); vfslist = makevfslist(makenetvfslist()); break; case 'm': putenv("BLOCKSIZE=1m"); hflag = 0; break; case 'n': nflag = 1; break; case 't': if (vfslist != NULL) errx(1, "only one -t option may be specified"); fstype = optarg; vfslist = makevfslist(optarg); break; case '?': default: usage(); } argc -= optind; argv += optind; mntsize = getmntinfo(&mntbuf, MNT_NOWAIT); bzero(&maxwidths, sizeof(maxwidths)); for (i = 0; i < mntsize; i++) update_maxwidths(&maxwidths, &mntbuf[i]); rv = 0; if (!*argv) { mntsize = regetmntinfo(&mntbuf, mntsize, vfslist); bzero(&maxwidths, sizeof(maxwidths)); for (i = 0; i < mntsize; i++) update_maxwidths(&maxwidths, &mntbuf[i]); for (i = 0; i < mntsize; i++) { if (aflag || (mntbuf[i].f_flags & MNT_IGNORE) == 0) prtstat(&mntbuf[i], &maxwidths); } exit(rv); } for (; *argv; argv++) { if (stat(*argv, &stbuf) < 0) { if ((mntpt = getmntpt(*argv)) == 0) { warn("%s", *argv); rv = 1; continue; } } else if (S_ISCHR(stbuf.st_mode)) { if ((mntpt = getmntpt(*argv)) == 0) { mdev.fspec = *argv; mntpath = strdup("/tmp/df.XXXXXX"); if (mntpath == NULL) { warn("strdup failed"); rv = 1; continue; } mntpt = mkdtemp(mntpath); if (mntpt == NULL) { warn("mkdtemp(\"%s\") failed", mntpath); rv = 1; free(mntpath); continue; } if (mount(fstype, mntpt, MNT_RDONLY, &mdev) != 0) { warn("%s", *argv); rv = 1; (void)rmdir(mntpt); free(mntpath); continue; } else if (statfs(mntpt, &statfsbuf) == 0) { statfsbuf.f_mntonname[0] = '\0'; prtstat(&statfsbuf, &maxwidths); } else { warn("%s", *argv); rv = 1; } (void)unmount(mntpt, 0); (void)rmdir(mntpt); free(mntpath); continue; } } else mntpt = *argv; /* * Statfs does not take a `wait' flag, so we cannot * 
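The no-argument path of main() above hangs entirely off getmntinfo(3); a small standalone sketch (not part of the diff) of that call, using MNT_NOWAIT as df does so that cached, possibly stale statistics come back without stalling on an unresponsive filesystem.

/* Sketch: the getmntinfo()/MNT_NOWAIT pattern df is built around. */
#include <sys/param.h>
#include <sys/ucred.h>
#include <sys/mount.h>
#include <stdio.h>

int
main(void)
{
	struct statfs *mntbuf;
	int i, mntsize;

	mntsize = getmntinfo(&mntbuf, MNT_NOWAIT);
	for (i = 0; i < mntsize; i++)
		printf("%s on %s (%s)\n", mntbuf[i].f_mntfromname,
		    mntbuf[i].f_mntonname, mntbuf[i].f_fstypename);
	return (mntsize == 0);
}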
implement nflag here. */ if (statfs(mntpt, &statfsbuf) < 0) { warn("%s", mntpt); rv = 1; continue; } /* * Check to make sure the arguments we've been given are * satisfied. Return an error if we have been asked to * list a mount point that does not match the other args * we've been given (-l, -t, etc.). */ if (checkvfsname(statfsbuf.f_fstypename, vfslist)) { rv = 1; continue; } if (argc == 1) { bzero(&maxwidths, sizeof(maxwidths)); update_maxwidths(&maxwidths, &statfsbuf); } prtstat(&statfsbuf, &maxwidths); } return (rv); } static char * getmntpt(const char *name) { size_t mntsize, i; struct statfs *mntbuf; mntsize = getmntinfo(&mntbuf, MNT_NOWAIT); for (i = 0; i < mntsize; i++) { if (!strcmp(mntbuf[i].f_mntfromname, name)) return (mntbuf[i].f_mntonname); } return (0); } /* * Make a pass over the file system info in ``mntbuf'' filtering out * file system types not in vfslist and possibly re-stating to get * current (not cached) info. Returns the new count of valid statfs bufs. */ static size_t regetmntinfo(struct statfs **mntbufp, long mntsize, const char **vfslist) { int i, j; struct statfs *mntbuf; if (vfslist == NULL) return (nflag ? mntsize : getmntinfo(mntbufp, MNT_WAIT)); mntbuf = *mntbufp; for (j = 0, i = 0; i < mntsize; i++) { if (checkvfsname(mntbuf[i].f_fstypename, vfslist)) continue; if (!nflag) (void)statfs(mntbuf[i].f_mntonname,&mntbuf[j]); else if (i != j) mntbuf[j] = mntbuf[i]; j++; } return (j); } /* * Output in "human-readable" format. Uses 3 digits max and puts * unit suffixes at the end. Makes output compact and easy to read, * especially on huge disks. * */ static unit_t unit_adjust(double *val) { double abval; unit_t unit; int unit_sz; abval = fabs(*val); unit_sz = abval ? ilogb(abval) / 10 : 0; if (unit_sz >= (int)UNIT_MAX) { unit = NONE; } else { unit = unitp[unit_sz]; *val /= (double)valp[unit_sz]; } return (unit); } static void -prthuman(const struct statfs *sfsp, size_t used) +prthuman(const struct statfs *sfsp, int64_t used) { prthumanval((double)sfsp->f_blocks * (double)sfsp->f_bsize); prthumanval((double)used * (double)sfsp->f_bsize); prthumanval((double)sfsp->f_bavail * (double)sfsp->f_bsize); } static void prthumanval(double bytes) { unit_t unit; unit = unit_adjust(&bytes); if (bytes == 0) (void)printf(" 0B"); else if (bytes > 10) (void)printf(" %5.0f%c", bytes, "BKMGTPE"[unit]); else (void)printf(" %5.1f%c", bytes, "BKMGTPE"[unit]); } /* * Convert statfs returned file system size into BLOCKSIZE units. * Attempts to avoid overflow for large file systems. */ #define fsbtoblk(num, fsbs, bs) \ (((fsbs) != 0 && (fsbs) < (bs)) ? \ (num) / ((bs) / (fsbs)) : (num) * ((fsbs) / (bs))) /* * Print out status about a file system. 
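The heart of unit_adjust() above is the ilogb(abval) / 10 expression: ilogb() returns floor(log2(x)), so dividing by 10 counts how many whole factors of 2^10 (1024) fit in the value, and that count indexes both unitp[] and valp[]. A standalone sketch (not part of the diff) of the same idea:

/* Sketch: picking a unit the way unit_adjust() does. */
#include <math.h>
#include <stdio.h>

int
main(void)
{
	double vals[4] = { 512.0, 2048.0, 3.5e6, 7.2e9 };
	const char units[] = "BKMGTPE";
	int i, idx;

	for (i = 0; i < 4; i++) {
		idx = ilogb(vals[i]) / 10;	/* 0=B, 1=K, 2=M, 3=G, ... */
		printf("%14.0f -> %6.1f%c\n", vals[i],
		    vals[i] / pow(1024.0, idx), units[idx]);
	}
	return (0);
}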
*/ static void prtstat(struct statfs *sfsp, struct maxwidths *mwp) { - static long blocksize; + static u_long blocksize; static int headerlen, timesthrough = 0; static const char *header; - size_t used, availblks, inodes; + int64_t used, availblks, inodes; if (++timesthrough == 1) { mwp->mntfrom = max(mwp->mntfrom, strlen("Filesystem")); if (hflag) { header = " Size"; mwp->total = mwp->used = mwp->avail = strlen(header); } else { header = getbsize(&headerlen, &blocksize); mwp->total = max(mwp->total, (u_int)headerlen); } mwp->used = max(mwp->used, strlen("Used")); mwp->avail = max(mwp->avail, strlen("Avail")); (void)printf("%-*s %-*s %*s %*s Capacity", (u_int)mwp->mntfrom, "Filesystem", (u_int)mwp->total, header, (u_int)mwp->used, "Used", (u_int)mwp->avail, "Avail"); if (iflag) { mwp->iused = max(mwp->iused, strlen(" iused")); mwp->ifree = max(mwp->ifree, strlen("ifree")); (void)printf(" %*s %*s %%iused", (u_int)mwp->iused - 2, "iused", (u_int)mwp->ifree, "ifree"); } (void)printf(" Mounted on\n"); } (void)printf("%-*s", (u_int)mwp->mntfrom, sfsp->f_mntfromname); used = sfsp->f_blocks - sfsp->f_bfree; availblks = sfsp->f_bavail + used; if (hflag) { prthuman(sfsp, used); } else { - (void)printf(" %*ld %*ld %*ld", - (u_int)mwp->total, fsbtoblk(sfsp->f_blocks, sfsp->f_bsize, blocksize), - (u_int)mwp->used, fsbtoblk(used, sfsp->f_bsize, blocksize), - (u_int)mwp->avail, fsbtoblk(sfsp->f_bavail, sfsp->f_bsize, - blocksize)); + (void)printf(" %*qd %*qd %*qd", + (u_int)mwp->total, + (intmax_t)fsbtoblk(sfsp->f_blocks, sfsp->f_bsize, blocksize), + (u_int)mwp->used, + (intmax_t)fsbtoblk(used, sfsp->f_bsize, blocksize), + (u_int)mwp->avail, + (intmax_t)fsbtoblk(sfsp->f_bavail, sfsp->f_bsize, blocksize)); } (void)printf(" %5.0f%%", availblks == 0 ? 100.0 : (double)used / (double)availblks * 100.0); if (iflag) { inodes = sfsp->f_files; used = inodes - sfsp->f_ffree; - (void)printf(" %*lu %*lu %4.0f%% ", - (u_int)mwp->iused, (u_long)used, - (u_int)mwp->ifree, sfsp->f_ffree, - inodes == 0 ? 100.0 : (double)used / (double)inodes * 100.0); + (void)printf(" %*qd %*qd %4.0f%% ", + (u_int)mwp->iused, (intmax_t)used, + (u_int)mwp->ifree, (intmax_t)sfsp->f_ffree, + inodes == 0 ? 100.0 : (double)used / (double)inodes * 100.0); } else (void)printf(" "); (void)printf(" %s\n", sfsp->f_mntonname); } /* * Update the maximum field-width information in `mwp' based on * the file system specified by `sfsp'. 
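prtstat() derives the Capacity column from used = f_blocks - f_bfree and availblks = f_bavail + used; because f_bavail excludes the reserved blocks (and can even go negative once the reserve is dipped into), the percentage can legitimately exceed 100. A small sketch (not part of the diff) of that arithmetic with hypothetical numbers:

/* Sketch: the Capacity computation used by prtstat(). */
#include <stdio.h>
#include <stdint.h>

static double
capacity(int64_t blocks, int64_t bfree, int64_t bavail)
{
	int64_t used = blocks - bfree;
	int64_t availblks = bavail + used;

	return (availblks == 0 ? 100.0 : (double)used / availblks * 100.0);
}

int
main(void)
{
	/* hypothetical: 1000 blocks, 80 free, only 30 available to users */
	printf("%.0f%%\n", capacity(1000, 80, 30));	/* prints 97% */
	return (0);
}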
*/ static void update_maxwidths(struct maxwidths *mwp, const struct statfs *sfsp) { - static long blocksize = 0; + static u_long blocksize = 0; int dummy; if (blocksize == 0) getbsize(&dummy, &blocksize); mwp->mntfrom = max(mwp->mntfrom, strlen(sfsp->f_mntfromname)); - mwp->total = max(mwp->total, longwidth(fsbtoblk(sfsp->f_blocks, + mwp->total = max(mwp->total, int64width( + fsbtoblk((int64_t)sfsp->f_blocks, sfsp->f_bsize, blocksize))); + mwp->used = max(mwp->used, int64width(fsbtoblk((int64_t)sfsp->f_blocks - + (int64_t)sfsp->f_bfree, sfsp->f_bsize, blocksize))); + mwp->avail = max(mwp->avail, int64width(fsbtoblk(sfsp->f_bavail, sfsp->f_bsize, blocksize))); - mwp->used = max(mwp->used, longwidth(fsbtoblk(sfsp->f_blocks - - sfsp->f_bfree, sfsp->f_bsize, blocksize))); - mwp->avail = max(mwp->avail, longwidth(fsbtoblk(sfsp->f_bavail, - sfsp->f_bsize, blocksize))); - mwp->iused = max(mwp->iused, longwidth(sfsp->f_files - + mwp->iused = max(mwp->iused, int64width((int64_t)sfsp->f_files - sfsp->f_ffree)); - mwp->ifree = max(mwp->ifree, longwidth(sfsp->f_ffree)); + mwp->ifree = max(mwp->ifree, int64width(sfsp->f_ffree)); } /* Return the width in characters of the specified long. */ static size_t -longwidth(long val) +int64width(int64_t val) { size_t len; len = 0; /* Negative or zero values require one extra digit. */ if (val <= 0) { val = -val; len++; } while (val > 0) { len++; val /= 10; } return (len); } static void usage(void) { (void)fprintf(stderr, "usage: df [-b | -H | -h | -k | -m | -P] [-ailn] [-t type] [file | filesystem ...]\n"); exit(EX_USAGE); } static char * makenetvfslist(void) { char *str, *strptr, **listptr; struct xvfsconf *xvfsp, *keep_xvfsp; size_t buflen; int cnt, i, maxvfsconf; if (sysctlbyname("vfs.conflist", NULL, &buflen, NULL, 0) < 0) { warn("sysctl(vfs.conflist)"); return (NULL); } xvfsp = malloc(buflen); if (xvfsp == NULL) { warnx("malloc failed"); return (NULL); } keep_xvfsp = xvfsp; if (sysctlbyname("vfs.conflist", xvfsp, &buflen, NULL, 0) < 0) { warn("sysctl(vfs.conflist)"); free(keep_xvfsp); return (NULL); } maxvfsconf = buflen / sizeof(struct xvfsconf); if ((listptr = malloc(sizeof(char*) * maxvfsconf)) == NULL) { warnx("malloc failed"); free(keep_xvfsp); return (NULL); } for (cnt = 0, i = 0; i < maxvfsconf; i++) { if (xvfsp->vfc_flags & VFCF_NETWORK) { listptr[cnt++] = strdup(xvfsp->vfc_name); if (listptr[cnt-1] == NULL) { warnx("malloc failed"); free(listptr); free(keep_xvfsp); return (NULL); } } xvfsp++; } if (cnt == 0 || (str = malloc(sizeof(char) * (32 * cnt + cnt + 2))) == NULL) { if (cnt > 0) warnx("malloc failed"); free(listptr); free(keep_xvfsp); return (NULL); } *str = 'n'; *(str + 1) = 'o'; for (i = 0, strptr = str + 2; i < cnt; i++, strptr++) { strncpy(strptr, listptr[i], 32); strptr += strlen(listptr[i]); *strptr = ','; free(listptr[i]); } *(--strptr) = NULL; free(keep_xvfsp); free(listptr); return (str); } Index: head/sys/kern/syscalls.master =================================================================== --- head/sys/kern/syscalls.master (revision 122536) +++ head/sys/kern/syscalls.master (revision 122537) @@ -1,643 +1,645 @@ $FreeBSD$ ; from: @(#)syscalls.master 8.2 (Berkeley) 1/13/94 ; ; System call name/number master file. ; Processed to created init_sysent.c, syscalls.c and syscall.h. 
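A quick standalone check (not part of the diff) of the int64width() helper introduced above: zero and negative values get one extra character, matching the '-' sign or the lone zero digit in the printed column.

#include <stdint.h>
#include <stdio.h>

static size_t
int64width(int64_t val)
{
	size_t len = 0;

	if (val <= 0) {			/* room for '-' or the single '0' */
		val = -val;
		len++;
	}
	while (val > 0) {
		len++;
		val /= 10;
	}
	return (len);
}

int
main(void)
{
	printf("%zu %zu %zu\n", int64width(0), int64width(-5),
	    int64width(1234));		/* prints: 1 2 4 */
	return (0);
}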
; Columns: number [M]type nargs namespc name alt{name,tag,rtyp}/comments ; number system call number, must be in order ; type one of [M]STD, [M]OBSOL, [M]UNIMPL, [M]COMPAT, [M]CPT_NOA, ; [M]LIBCOMPAT, [M]NODEF, [M]NOARGS, [M]NOPROTO, [M]NOIMPL, ; [M]NOSTD, [M]COMPAT4 ; namespc one of POSIX, BSD, NOHIDE ; name psuedo-prototype of syscall routine ; If one of the following alts is different, then all appear: ; altname name of system call if different ; alttag name of args struct tag if different from [o]`name'"_args" ; altrtyp return type if not int (bogus - syscalls always return int) ; for UNIMPL/OBSOL, name continues with comments ; types: ; [M] e.g. like MSTD -- means the system call is MP-safe. If no ; M prefix is used, the syscall wrapper will obtain the Giant ; lock for the syscall. ; STD always included ; COMPAT included on COMPAT #ifdef ; COMPAT4 included on COMPAT4 #ifdef (FreeBSD 4 compat) ; LIBCOMPAT included on COMPAT #ifdef, and placed in syscall.h ; OBSOL obsolete, not included in system, only specifies name ; UNIMPL not implemented, placeholder only ; NOSTD implemented but as a lkm that can be statically ; compiled in sysent entry will be filled with lkmsys ; so the SYSCALL_MODULE macro works ; ; Please copy any additions and changes to the following compatability tables: ; sys/ia64/ia32/syscalls.master (take a best guess) ; [other 64 bit platforms with an alternate 32 bit syscall table go here too] ; #ifdef's, etc. may be included, and are copied to the output files. #include #include #include ; Reserved/unimplemented system calls in the range 0-150 inclusive ; are reserved for use in future Berkeley releases. ; Additional system calls implemented in vendor and other ; redistributions should be placed in the reserved range at the end ; of the current calls. 0 STD NOHIDE { int nosys(void); } syscall nosys_args int 1 MSTD NOHIDE { void sys_exit(int rval); } exit sys_exit_args void 2 MSTD POSIX { int fork(void); } 3 MSTD POSIX { ssize_t read(int fd, void *buf, size_t nbyte); } 4 MSTD POSIX { ssize_t write(int fd, const void *buf, size_t nbyte); } 5 STD POSIX { int open(char *path, int flags, int mode); } ; XXX should be { int open(const char *path, int flags, ...); } ; but we're not ready for `const' or varargs. ; XXX man page says `mode_t mode'. 6 MSTD POSIX { int close(int fd); } 7 MSTD BSD { int wait4(int pid, int *status, int options, \ struct rusage *rusage); } wait4 wait_args int 8 COMPAT BSD { int creat(char *path, int mode); } 9 STD POSIX { int link(char *path, char *link); } 10 STD POSIX { int unlink(char *path); } 11 OBSOL NOHIDE execv 12 STD POSIX { int chdir(char *path); } 13 STD BSD { int fchdir(int fd); } 14 STD POSIX { int mknod(char *path, int mode, int dev); } 15 STD POSIX { int chmod(char *path, int mode); } 16 STD POSIX { int chown(char *path, int uid, int gid); } 17 MSTD BSD { int obreak(char *nsize); } break obreak_args int -18 STD BSD { int getfsstat(struct statfs *buf, long bufsize, \ +18 COMPAT4 BSD { int getfsstat(struct ostatfs *buf, long bufsize, \ int flags); } 19 COMPAT POSIX { long lseek(int fd, long offset, int whence); } 20 MSTD POSIX { pid_t getpid(void); } 21 STD BSD { int mount(char *type, char *path, int flags, \ caddr_t data); } ; XXX `path' should have type `const char *' but we're not ready for that. 
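The switch of entry 18 to COMPAT4 goes with the widened statfs structure (the old layout survives as ostatfs for FreeBSD 4 binaries). A userland sketch (not part of the diff) of statfs(2) as seen after this change, printing the counters through uintmax_t/intmax_t casts since they may no longer fit in a long:

#include <sys/param.h>
#include <sys/mount.h>
#include <stdio.h>
#include <stdint.h>

int
main(int argc, char *argv[])
{
	struct statfs sfs;

	if (argc < 2 || statfs(argv[1], &sfs) != 0) {
		perror("statfs");
		return (1);
	}
	printf("%s: %ju blocks of %ju bytes, %jd available\n",
	    sfs.f_mntonname, (uintmax_t)sfs.f_blocks,
	    (uintmax_t)sfs.f_bsize, (intmax_t)sfs.f_bavail);
	return (0);
}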
22 STD BSD { int unmount(char *path, int flags); } 23 MSTD POSIX { int setuid(uid_t uid); } 24 MSTD POSIX { uid_t getuid(void); } 25 MSTD POSIX { uid_t geteuid(void); } 26 MSTD BSD { int ptrace(int req, pid_t pid, caddr_t addr, \ int data); } 27 MSTD BSD { int recvmsg(int s, struct msghdr *msg, int flags); } 28 MSTD BSD { int sendmsg(int s, caddr_t msg, int flags); } 29 MSTD BSD { int recvfrom(int s, caddr_t buf, size_t len, \ int flags, caddr_t from, int *fromlenaddr); } 30 MSTD BSD { int accept(int s, caddr_t name, int *anamelen); } 31 MSTD BSD { int getpeername(int fdes, caddr_t asa, int *alen); } 32 MSTD BSD { int getsockname(int fdes, caddr_t asa, int *alen); } 33 STD POSIX { int access(char *path, int flags); } 34 STD BSD { int chflags(char *path, int flags); } 35 STD BSD { int fchflags(int fd, int flags); } 36 STD BSD { int sync(void); } 37 MSTD POSIX { int kill(int pid, int signum); } 38 COMPAT POSIX { int stat(char *path, struct ostat *ub); } 39 MSTD POSIX { pid_t getppid(void); } 40 COMPAT POSIX { int lstat(char *path, struct ostat *ub); } 41 MSTD POSIX { int dup(u_int fd); } 42 MSTD POSIX { int pipe(void); } 43 MSTD POSIX { gid_t getegid(void); } 44 MSTD BSD { int profil(caddr_t samples, size_t size, \ size_t offset, u_int scale); } 45 MSTD BSD { int ktrace(const char *fname, int ops, int facs, \ int pid); } 46 MCOMPAT POSIX { int sigaction(int signum, struct osigaction *nsa, \ struct osigaction *osa); } 47 MSTD POSIX { gid_t getgid(void); } 48 MCOMPAT POSIX { int sigprocmask(int how, osigset_t mask); } ; XXX note nonstandard (bogus) calling convention - the libc stub passes ; us the mask, not a pointer to it, and we return the old mask as the ; (int) return value. 49 MSTD BSD { int getlogin(char *namebuf, u_int namelen); } 50 MSTD BSD { int setlogin(char *namebuf); } 51 MSTD BSD { int acct(char *path); } 52 MCOMPAT POSIX { int sigpending(void); } 53 MSTD BSD { int sigaltstack(stack_t *ss, stack_t *oss); } 54 MSTD POSIX { int ioctl(int fd, u_long com, caddr_t data); } 55 MSTD BSD { int reboot(int opt); } 56 STD POSIX { int revoke(char *path); } 57 STD POSIX { int symlink(char *path, char *link); } 58 STD POSIX { int readlink(char *path, char *buf, int count); } 59 MSTD POSIX { int execve(char *fname, char **argv, char **envv); } 60 MSTD POSIX { int umask(int newmask); } umask umask_args int 61 STD BSD { int chroot(char *path); } 62 MCOMPAT POSIX { int fstat(int fd, struct ostat *sb); } 63 MCOMPAT BSD { int getkerninfo(int op, char *where, size_t *size, \ int arg); } getkerninfo getkerninfo_args int 64 MCOMPAT BSD { int getpagesize(void); } \ getpagesize getpagesize_args int 65 MSTD BSD { int msync(void *addr, size_t len, int flags); } 66 MSTD BSD { int vfork(void); } 67 OBSOL NOHIDE vread 68 OBSOL NOHIDE vwrite 69 MSTD BSD { int sbrk(int incr); } 70 MSTD BSD { int sstk(int incr); } 71 MCOMPAT BSD { int mmap(void *addr, int len, int prot, \ int flags, int fd, long pos); } 72 MSTD BSD { int ovadvise(int anom); } vadvise ovadvise_args int 73 MSTD BSD { int munmap(void *addr, size_t len); } 74 MSTD BSD { int mprotect(const void *addr, size_t len, int prot); } 75 MSTD BSD { int madvise(void *addr, size_t len, int behav); } 76 OBSOL NOHIDE vhangup 77 OBSOL NOHIDE vlimit 78 MSTD BSD { int mincore(const void *addr, size_t len, \ char *vec); } 79 MSTD POSIX { int getgroups(u_int gidsetsize, gid_t *gidset); } 80 MSTD POSIX { int setgroups(u_int gidsetsize, gid_t *gidset); } 81 MSTD POSIX { int getpgrp(void); } 82 MSTD POSIX { int setpgid(int pid, int pgid); } 83 MSTD BSD { int 
setitimer(u_int which, struct itimerval *itv, \ struct itimerval *oitv); } 84 MCOMPAT BSD { int wait(void); } 85 MSTD BSD { int swapon(char *name); } 86 MSTD BSD { int getitimer(u_int which, struct itimerval *itv); } 87 MCOMPAT BSD { int gethostname(char *hostname, u_int len); } \ gethostname gethostname_args int 88 MCOMPAT BSD { int sethostname(char *hostname, u_int len); } \ sethostname sethostname_args int 89 MSTD BSD { int getdtablesize(void); } 90 MSTD POSIX { int dup2(u_int from, u_int to); } 91 UNIMPL BSD getdopt 92 MSTD POSIX { int fcntl(int fd, int cmd, long arg); } ; XXX should be { int fcntl(int fd, int cmd, ...); } ; but we're not ready for varargs. 93 MSTD BSD { int select(int nd, fd_set *in, fd_set *ou, \ fd_set *ex, struct timeval *tv); } 94 UNIMPL BSD setdopt 95 STD POSIX { int fsync(int fd); } 96 MSTD BSD { int setpriority(int which, int who, int prio); } 97 MSTD BSD { int socket(int domain, int type, int protocol); } 98 MSTD BSD { int connect(int s, caddr_t name, int namelen); } 99 MCPT_NOA BSD { int accept(int s, caddr_t name, int *anamelen); } \ accept accept_args int 100 MSTD BSD { int getpriority(int which, int who); } 101 MCOMPAT BSD { int send(int s, caddr_t buf, int len, int flags); } 102 MCOMPAT BSD { int recv(int s, caddr_t buf, int len, int flags); } 103 MCOMPAT BSD { int sigreturn(struct osigcontext *sigcntxp); } 104 MSTD BSD { int bind(int s, caddr_t name, int namelen); } 105 MSTD BSD { int setsockopt(int s, int level, int name, \ caddr_t val, int valsize); } 106 MSTD BSD { int listen(int s, int backlog); } 107 OBSOL NOHIDE vtimes 108 MCOMPAT BSD { int sigvec(int signum, struct sigvec *nsv, \ struct sigvec *osv); } 109 MCOMPAT BSD { int sigblock(int mask); } 110 MCOMPAT BSD { int sigsetmask(int mask); } 111 MCOMPAT POSIX { int sigsuspend(osigset_t mask); } ; XXX note nonstandard (bogus) calling convention - the libc stub passes ; us the mask, not a pointer to it. 
112 MCOMPAT BSD { int sigstack(struct sigstack *nss, \ struct sigstack *oss); } 113 MCOMPAT BSD { int recvmsg(int s, struct omsghdr *msg, int flags); } 114 MCOMPAT BSD { int sendmsg(int s, caddr_t msg, int flags); } 115 OBSOL NOHIDE vtrace 116 MSTD BSD { int gettimeofday(struct timeval *tp, \ struct timezone *tzp); } 117 MSTD BSD { int getrusage(int who, struct rusage *rusage); } 118 MSTD BSD { int getsockopt(int s, int level, int name, \ caddr_t val, int *avalsize); } 119 UNIMPL NOHIDE resuba (BSD/OS 2.x) 120 MSTD BSD { int readv(int fd, struct iovec *iovp, u_int iovcnt); } 121 MSTD BSD { int writev(int fd, struct iovec *iovp, \ u_int iovcnt); } 122 MSTD BSD { int settimeofday(struct timeval *tv, \ struct timezone *tzp); } 123 STD BSD { int fchown(int fd, int uid, int gid); } 124 STD BSD { int fchmod(int fd, int mode); } 125 MCPT_NOA BSD { int recvfrom(int s, caddr_t buf, size_t len, \ int flags, caddr_t from, int *fromlenaddr); } \ recvfrom recvfrom_args int 126 MSTD BSD { int setreuid(int ruid, int euid); } 127 MSTD BSD { int setregid(int rgid, int egid); } 128 STD POSIX { int rename(char *from, char *to); } 129 COMPAT BSD { int truncate(char *path, long length); } 130 COMPAT BSD { int ftruncate(int fd, long length); } 131 MSTD BSD { int flock(int fd, int how); } 132 STD POSIX { int mkfifo(char *path, int mode); } 133 MSTD BSD { int sendto(int s, caddr_t buf, size_t len, \ int flags, caddr_t to, int tolen); } 134 MSTD BSD { int shutdown(int s, int how); } 135 MSTD BSD { int socketpair(int domain, int type, int protocol, \ int *rsv); } 136 STD POSIX { int mkdir(char *path, int mode); } 137 STD POSIX { int rmdir(char *path); } 138 STD BSD { int utimes(char *path, struct timeval *tptr); } 139 OBSOL NOHIDE 4.2 sigreturn 140 MSTD BSD { int adjtime(struct timeval *delta, \ struct timeval *olddelta); } 141 MCOMPAT BSD { int getpeername(int fdes, caddr_t asa, int *alen); } 142 MCOMPAT BSD { long gethostid(void); } 143 MCOMPAT BSD { int sethostid(long hostid); } 144 MCOMPAT BSD { int getrlimit(u_int which, struct orlimit *rlp); } 145 MCOMPAT BSD { int setrlimit(u_int which, struct orlimit *rlp); } 146 MCOMPAT BSD { int killpg(int pgid, int signum); } 147 MSTD POSIX { int setsid(void); } 148 STD BSD { int quotactl(char *path, int cmd, int uid, \ caddr_t arg); } 149 MCOMPAT BSD { int quota(void); } 150 MCPT_NOA BSD { int getsockname(int fdec, caddr_t asa, int *alen); }\ getsockname getsockname_args int ; Syscalls 151-180 inclusive are reserved for vendor-specific ; system calls. (This includes various calls added for compatibity ; with other Unix variants.) ; Some of these calls are now supported by BSD... 151 UNIMPL NOHIDE sem_lock (BSD/OS 2.x) 152 UNIMPL NOHIDE sem_wakeup (BSD/OS 2.x) 153 UNIMPL NOHIDE asyncdaemon (BSD/OS 2.x) 154 UNIMPL NOHIDE nosys ; 155 is initialized by the NFS code, if present. 
155 MNOIMPL BSD { int nfssvc(int flag, caddr_t argp); } 156 COMPAT BSD { int getdirentries(int fd, char *buf, u_int count, \ long *basep); } -157 STD BSD { int statfs(char *path, struct statfs *buf); } -158 STD BSD { int fstatfs(int fd, struct statfs *buf); } +157 COMPAT4 BSD { int statfs(char *path, struct ostatfs *buf); } +158 COMPAT4 BSD { int fstatfs(int fd, struct ostatfs *buf); } 159 UNIMPL NOHIDE nosys 160 UNIMPL NOHIDE nosys 161 STD BSD { int getfh(char *fname, struct fhandle *fhp); } 162 MSTD BSD { int getdomainname(char *domainname, int len); } 163 MSTD BSD { int setdomainname(char *domainname, int len); } 164 MSTD BSD { int uname(struct utsname *name); } 165 MSTD BSD { int sysarch(int op, char *parms); } 166 MSTD BSD { int rtprio(int function, pid_t pid, \ struct rtprio *rtp); } 167 UNIMPL NOHIDE nosys 168 UNIMPL NOHIDE nosys ; 169 is initialized by the SYSVSEM code if present or loaded 169 MNOSTD BSD { int semsys(int which, int a2, int a3, int a4, \ int a5); } ; 169 is initialized by the SYSVMSG code if present or loaded ; XXX should be { int semsys(int which, ...); } 170 MNOSTD BSD { int msgsys(int which, int a2, int a3, int a4, \ int a5, int a6); } ; 169 is initialized by the SYSVSHM code if present or loaded ; XXX should be { int msgsys(int which, ...); } 171 MNOSTD BSD { int shmsys(int which, int a2, int a3, int a4); } ; XXX should be { int shmsys(int which, ...); } 172 UNIMPL NOHIDE nosys 173 MSTD POSIX { ssize_t pread(int fd, void *buf, size_t nbyte, \ int pad, off_t offset); } 174 MSTD POSIX { ssize_t pwrite(int fd, const void *buf, \ size_t nbyte, int pad, off_t offset); } 175 UNIMPL NOHIDE nosys 176 MSTD BSD { int ntp_adjtime(struct timex *tp); } 177 UNIMPL NOHIDE sfork (BSD/OS 2.x) 178 UNIMPL NOHIDE getdescriptor (BSD/OS 2.x) 179 UNIMPL NOHIDE setdescriptor (BSD/OS 2.x) 180 UNIMPL NOHIDE nosys ; Syscalls 181-199 are used by/reserved for BSD 181 MSTD POSIX { int setgid(gid_t gid); } 182 MSTD BSD { int setegid(gid_t egid); } 183 MSTD BSD { int seteuid(uid_t euid); } 184 UNIMPL BSD lfs_bmapv 185 UNIMPL BSD lfs_markv 186 UNIMPL BSD lfs_segclean 187 UNIMPL BSD lfs_segwait 188 STD POSIX { int stat(char *path, struct stat *ub); } 189 MSTD POSIX { int fstat(int fd, struct stat *sb); } 190 STD POSIX { int lstat(char *path, struct stat *ub); } 191 STD POSIX { int pathconf(char *path, int name); } 192 MSTD POSIX { int fpathconf(int fd, int name); } 193 UNIMPL NOHIDE nosys 194 MSTD BSD { int getrlimit(u_int which, \ struct rlimit *rlp); } \ getrlimit __getrlimit_args int 195 MSTD BSD { int setrlimit(u_int which, \ struct rlimit *rlp); } \ setrlimit __setrlimit_args int 196 STD BSD { int getdirentries(int fd, char *buf, u_int count, \ long *basep); } 197 MSTD BSD { caddr_t mmap(caddr_t addr, size_t len, int prot, \ int flags, int fd, int pad, off_t pos); } 198 STD NOHIDE { int nosys(void); } __syscall __syscall_args int 199 STD POSIX { off_t lseek(int fd, int pad, off_t offset, \ int whence); } 200 STD BSD { int truncate(char *path, int pad, off_t length); } 201 STD BSD { int ftruncate(int fd, int pad, off_t length); } 202 MSTD BSD { int __sysctl(int *name, u_int namelen, void *old, \ size_t *oldlenp, void *new, size_t newlen); } \ __sysctl sysctl_args int ; properly, __sysctl should be a NOHIDE, but making an exception ; here allows to avoid one in libc/sys/Makefile.inc. 
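Entries 157/158 keep the old statfs/fstatfs numbers for FreeBSD 4 binaries as COMPAT4 wrappers over ostatfs. A hedged sketch of the narrowing such a compatibility handler has to perform; the struct layouts and helper names here are illustrative only, not the kernel's:

#include <stdint.h>
#include <limits.h>

/* illustrative layouts; the real statfs/ostatfs live in <sys/mount.h> */
struct new_sfs { uint64_t f_blocks, f_bfree; int64_t f_bavail; };
struct old_sfs { long f_blocks, f_bfree, f_bavail; };

static long
narrow(int64_t v)
{
	if (v > LONG_MAX)
		return (LONG_MAX);
	if (v < LONG_MIN)
		return (LONG_MIN);
	return ((long)v);
}

static void
cvt_old(const struct new_sfs *nsp, struct old_sfs *osp)
{
	/* unsigned 64-bit counters are clamped before narrowing */
	osp->f_blocks = narrow(nsp->f_blocks > INT64_MAX ? INT64_MAX :
	    (int64_t)nsp->f_blocks);
	osp->f_bfree = narrow(nsp->f_bfree > INT64_MAX ? INT64_MAX :
	    (int64_t)nsp->f_bfree);
	osp->f_bavail = narrow(nsp->f_bavail);
}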
203 MSTD BSD { int mlock(const void *addr, size_t len); } 204 MSTD BSD { int munlock(const void *addr, size_t len); } 205 STD BSD { int undelete(char *path); } 206 STD BSD { int futimes(int fd, struct timeval *tptr); } 207 MSTD BSD { int getpgid(pid_t pid); } 208 UNIMPL NOHIDE newreboot (NetBSD) 209 MSTD BSD { int poll(struct pollfd *fds, u_int nfds, \ int timeout); } ; ; The following are reserved for loadable syscalls ; 210 NODEF NOHIDE lkmnosys lkmnosys nosys_args int 211 NODEF NOHIDE lkmnosys lkmnosys nosys_args int 212 NODEF NOHIDE lkmnosys lkmnosys nosys_args int 213 NODEF NOHIDE lkmnosys lkmnosys nosys_args int 214 NODEF NOHIDE lkmnosys lkmnosys nosys_args int 215 NODEF NOHIDE lkmnosys lkmnosys nosys_args int 216 NODEF NOHIDE lkmnosys lkmnosys nosys_args int 217 NODEF NOHIDE lkmnosys lkmnosys nosys_args int 218 NODEF NOHIDE lkmnosys lkmnosys nosys_args int 219 NODEF NOHIDE lkmnosys lkmnosys nosys_args int ; ; The following were introduced with NetBSD/4.4Lite-2 ; They are initialized by thier respective modules/sysinits 220 MNOSTD BSD { int __semctl(int semid, int semnum, int cmd, \ union semun *arg); } 221 MNOSTD BSD { int semget(key_t key, int nsems, int semflg); } 222 MNOSTD BSD { int semop(int semid, struct sembuf *sops, \ size_t nsops); } 223 UNIMPL NOHIDE semconfig 224 MNOSTD BSD { int msgctl(int msqid, int cmd, \ struct msqid_ds *buf); } 225 MNOSTD BSD { int msgget(key_t key, int msgflg); } 226 MNOSTD BSD { int msgsnd(int msqid, const void *msgp, size_t msgsz, \ int msgflg); } 227 MNOSTD BSD { int msgrcv(int msqid, void *msgp, size_t msgsz, \ long msgtyp, int msgflg); } 228 MNOSTD BSD { int shmat(int shmid, const void *shmaddr, int shmflg); } 229 MNOSTD BSD { int shmctl(int shmid, int cmd, \ struct shmid_ds *buf); } 230 MNOSTD BSD { int shmdt(const void *shmaddr); } 231 MNOSTD BSD { int shmget(key_t key, size_t size, int shmflg); } ; 232 MSTD POSIX { int clock_gettime(clockid_t clock_id, \ struct timespec *tp); } 233 MSTD POSIX { int clock_settime(clockid_t clock_id, \ const struct timespec *tp); } 234 MSTD POSIX { int clock_getres(clockid_t clock_id, \ struct timespec *tp); } 235 UNIMPL NOHIDE timer_create 236 UNIMPL NOHIDE timer_delete 237 UNIMPL NOHIDE timer_settime 238 UNIMPL NOHIDE timer_gettime 239 UNIMPL NOHIDE timer_getoverrun 240 MSTD POSIX { int nanosleep(const struct timespec *rqtp, \ struct timespec *rmtp); } 241 UNIMPL NOHIDE nosys 242 UNIMPL NOHIDE nosys 243 UNIMPL NOHIDE nosys 244 UNIMPL NOHIDE nosys 245 UNIMPL NOHIDE nosys 246 UNIMPL NOHIDE nosys 247 UNIMPL NOHIDE nosys 248 UNIMPL NOHIDE nosys 249 UNIMPL NOHIDE nosys ; syscall numbers initially used in OpenBSD 250 MSTD BSD { int minherit(void *addr, size_t len, int inherit); } 251 MSTD BSD { int rfork(int flags); } 252 MSTD BSD { int openbsd_poll(struct pollfd *fds, u_int nfds, \ int timeout); } 253 MSTD BSD { int issetugid(void); } 254 STD BSD { int lchown(char *path, int uid, int gid); } 255 UNIMPL NOHIDE nosys 256 UNIMPL NOHIDE nosys 257 UNIMPL NOHIDE nosys 258 UNIMPL NOHIDE nosys 259 UNIMPL NOHIDE nosys 260 UNIMPL NOHIDE nosys 261 UNIMPL NOHIDE nosys 262 UNIMPL NOHIDE nosys 263 UNIMPL NOHIDE nosys 264 UNIMPL NOHIDE nosys 265 UNIMPL NOHIDE nosys 266 UNIMPL NOHIDE nosys 267 UNIMPL NOHIDE nosys 268 UNIMPL NOHIDE nosys 269 UNIMPL NOHIDE nosys 270 UNIMPL NOHIDE nosys 271 UNIMPL NOHIDE nosys 272 STD BSD { int getdents(int fd, char *buf, size_t count); } 273 UNIMPL NOHIDE nosys 274 STD BSD { int lchmod(char *path, mode_t mode); } 275 NOPROTO BSD { int lchown(char *path, uid_t uid, gid_t gid); } netbsd_lchown 
lchown_args int 276 STD BSD { int lutimes(char *path, struct timeval *tptr); } 277 MNOPROTO BSD { int msync(void *addr, size_t len, int flags); } netbsd_msync msync_args int 278 STD BSD { int nstat(char *path, struct nstat *ub); } 279 MSTD BSD { int nfstat(int fd, struct nstat *sb); } 280 STD BSD { int nlstat(char *path, struct nstat *ub); } 281 UNIMPL NOHIDE nosys 282 UNIMPL NOHIDE nosys 283 UNIMPL NOHIDE nosys 284 UNIMPL NOHIDE nosys 285 UNIMPL NOHIDE nosys 286 UNIMPL NOHIDE nosys 287 UNIMPL NOHIDE nosys 288 UNIMPL NOHIDE nosys 289 UNIMPL NOHIDE nosys 290 UNIMPL NOHIDE nosys 291 UNIMPL NOHIDE nosys 292 UNIMPL NOHIDE nosys 293 UNIMPL NOHIDE nosys 294 UNIMPL NOHIDE nosys 295 UNIMPL NOHIDE nosys 296 UNIMPL NOHIDE nosys ; XXX 297 is 300 in NetBSD -297 STD BSD { int fhstatfs(const struct fhandle *u_fhp, struct statfs *buf); } +297 COMPAT4 BSD { int fhstatfs(const struct fhandle *u_fhp, struct ostatfs *buf); } 298 STD BSD { int fhopen(const struct fhandle *u_fhp, int flags); } 299 STD BSD { int fhstat(const struct fhandle *u_fhp, struct stat *sb); } ; syscall numbers for FreeBSD 300 MSTD BSD { int modnext(int modid); } 301 MSTD BSD { int modstat(int modid, struct module_stat* stat); } 302 MSTD BSD { int modfnext(int modid); } 303 MSTD BSD { int modfind(const char *name); } 304 MSTD BSD { int kldload(const char *file); } 305 MSTD BSD { int kldunload(int fileid); } 306 MSTD BSD { int kldfind(const char *file); } 307 MSTD BSD { int kldnext(int fileid); } 308 MSTD BSD { int kldstat(int fileid, struct kld_file_stat* stat); } 309 MSTD BSD { int kldfirstmod(int fileid); } 310 MSTD BSD { int getsid(pid_t pid); } 311 MSTD BSD { int setresuid(uid_t ruid, uid_t euid, uid_t suid); } 312 MSTD BSD { int setresgid(gid_t rgid, gid_t egid, gid_t sgid); } 313 OBSOL NOHIDE signanosleep 314 NOSTD BSD { int aio_return(struct aiocb *aiocbp); } 315 NOSTD BSD { int aio_suspend(struct aiocb * const * aiocbp, int nent, const struct timespec *timeout); } 316 NOSTD BSD { int aio_cancel(int fd, struct aiocb *aiocbp); } 317 NOSTD BSD { int aio_error(struct aiocb *aiocbp); } 318 NOSTD BSD { int aio_read(struct aiocb *aiocbp); } 319 NOSTD BSD { int aio_write(struct aiocb *aiocbp); } 320 NOSTD BSD { int lio_listio(int mode, struct aiocb * const *acb_list, int nent, struct sigevent *sig); } 321 MSTD BSD { int yield(void); } 322 OBSOL NOHIDE thr_sleep 323 OBSOL NOHIDE thr_wakeup 324 MSTD BSD { int mlockall(int how); } 325 MSTD BSD { int munlockall(void); } 326 STD BSD { int __getcwd(u_char *buf, u_int buflen); } 327 MSTD POSIX { int sched_setparam (pid_t pid, const struct sched_param *param); } 328 MSTD POSIX { int sched_getparam (pid_t pid, struct sched_param *param); } 329 MSTD POSIX { int sched_setscheduler (pid_t pid, int policy, const struct sched_param *param); } 330 MSTD POSIX { int sched_getscheduler (pid_t pid); } 331 MSTD POSIX { int sched_yield (void); } 332 MSTD POSIX { int sched_get_priority_max (int policy); } 333 MSTD POSIX { int sched_get_priority_min (int policy); } 334 MSTD POSIX { int sched_rr_get_interval (pid_t pid, struct timespec *interval); } 335 MSTD BSD { int utrace(const void *addr, size_t len); } 336 MCOMPAT4 BSD { int sendfile(int fd, int s, off_t offset, size_t nbytes, \ struct sf_hdtr *hdtr, off_t *sbytes, int flags); } 337 STD BSD { int kldsym(int fileid, int cmd, void *data); } 338 MSTD BSD { int jail(struct jail *jail); } 339 UNIMPL BSD pioctl 340 MSTD POSIX { int sigprocmask(int how, const sigset_t *set, \ sigset_t *oset); } 341 MSTD POSIX { int sigsuspend(const sigset_t *sigmask); } 342 
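Entry 297 gets the same COMPAT4 treatment for the file-handle variant. A usage sketch (not part of the diff) of the pair it belongs to: getfh(2), which is restricted to the superuser, produces a handle that fhstatfs(2) can later turn back into filesystem statistics:

#include <sys/param.h>
#include <sys/mount.h>
#include <stdio.h>
#include <stdint.h>

int
main(int argc, char *argv[])
{
	struct fhandle fh;
	struct statfs sfs;

	if (argc < 2 || getfh(argv[1], &fh) != 0 || fhstatfs(&fh, &sfs) != 0) {
		perror("getfh/fhstatfs");
		return (1);
	}
	printf("%s: %ju blocks free\n", sfs.f_mntonname, (uintmax_t)sfs.f_bfree);
	return (0);
}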
MCOMPAT4 POSIX { int sigaction(int sig, const struct sigaction *act, \ struct sigaction *oact); } 343 MSTD POSIX { int sigpending(sigset_t *set); } 344 MCOMPAT4 BSD { int sigreturn(const struct ucontext4 *sigcntxp); } 345 MSTD NOHIDE { int sigtimedwait(const sigset_t *set, \ siginfo_t *info, const struct timespec *timeout); } 346 MSTD NOHIDE { int sigwaitinfo(const sigset_t *set, \ siginfo_t *info); } 347 MSTD BSD { int __acl_get_file(const char *path, \ acl_type_t type, struct acl *aclp); } 348 MSTD BSD { int __acl_set_file(const char *path, \ acl_type_t type, struct acl *aclp); } 349 MSTD BSD { int __acl_get_fd(int filedes, acl_type_t type, \ struct acl *aclp); } 350 MSTD BSD { int __acl_set_fd(int filedes, acl_type_t type, \ struct acl *aclp); } 351 MSTD BSD { int __acl_delete_file(const char *path, \ acl_type_t type); } 352 MSTD BSD { int __acl_delete_fd(int filedes, acl_type_t type); } 353 MSTD BSD { int __acl_aclcheck_file(const char *path, \ acl_type_t type, struct acl *aclp); } 354 MSTD BSD { int __acl_aclcheck_fd(int filedes, acl_type_t type, \ struct acl *aclp); } 355 STD BSD { int extattrctl(const char *path, int cmd, \ const char *filename, int attrnamespace, \ const char *attrname); } 356 STD BSD { int extattr_set_file(const char *path, \ int attrnamespace, const char *attrname, \ void *data, size_t nbytes); } 357 STD BSD { ssize_t extattr_get_file(const char *path, \ int attrnamespace, const char *attrname, \ void *data, size_t nbytes); } 358 STD BSD { int extattr_delete_file(const char *path, \ int attrnamespace, const char *attrname); } 359 NOSTD BSD { int aio_waitcomplete(struct aiocb **aiocbp, struct timespec *timeout); } 360 MSTD BSD { int getresuid(uid_t *ruid, uid_t *euid, uid_t *suid); } 361 MSTD BSD { int getresgid(gid_t *rgid, gid_t *egid, gid_t *sgid); } 362 MSTD BSD { int kqueue(void); } 363 MSTD BSD { int kevent(int fd, \ const struct kevent *changelist, int nchanges, \ struct kevent *eventlist, int nevents, \ const struct timespec *timeout); } 364 UNIMPL BSD __cap_get_proc 365 UNIMPL BSD __cap_set_proc 366 UNIMPL BSD __cap_get_fd 367 UNIMPL BSD __cap_get_file 368 UNIMPL BSD __cap_set_fd 369 UNIMPL BSD __cap_set_file 370 NODEF NOHIDE lkmressys lkmressys nosys_args int 371 STD BSD { int extattr_set_fd(int fd, int attrnamespace, \ const char *attrname, void *data, \ size_t nbytes); } 372 STD BSD { ssize_t extattr_get_fd(int fd, int attrnamespace, \ const char *attrname, void *data, size_t nbytes); } 373 STD BSD { int extattr_delete_fd(int fd, int attrnamespace, \ const char *attrname); } 374 MSTD BSD { int __setugid(int flag); } 375 NOIMPL BSD { int nfsclnt(int flag, caddr_t argp); } 376 STD BSD { int eaccess(char *path, int flags); } 377 UNIMPL BSD afs_syscall 378 STD BSD { int nmount(struct iovec *iovp, unsigned int iovcnt, \ int flags); } 379 MSTD BSD { int kse_exit(void); } 380 MSTD BSD { int kse_wakeup(struct kse_mailbox *mbx); } 381 STD BSD { int kse_create(struct kse_mailbox *mbx, \ int newgroup); } 382 MSTD BSD { int kse_thr_interrupt(struct kse_thr_mailbox *tmbx, int cmd, long data); } 383 MSTD BSD { int kse_release(struct timespec *timeout); } 384 MSTD BSD { int __mac_get_proc(struct mac *mac_p); } 385 MSTD BSD { int __mac_set_proc(struct mac *mac_p); } 386 MSTD BSD { int __mac_get_fd(int fd, struct mac *mac_p); } 387 MSTD BSD { int __mac_get_file(const char *path_p, \ struct mac *mac_p); } 388 MSTD BSD { int __mac_set_fd(int fd, struct mac *mac_p); } 389 MSTD BSD { int __mac_set_file(const char *path_p, \ struct mac *mac_p); } 390 STD BSD { int kenv(int 
what, const char *name, char *value, \ int len); } 391 STD BSD { int lchflags(const char *path, int flags); } 392 STD BSD { int uuidgen(struct uuid *store, int count); } 393 MSTD BSD { int sendfile(int fd, int s, off_t offset, size_t nbytes, \ struct sf_hdtr *hdtr, off_t *sbytes, int flags); } 394 MSTD BSD { int mac_syscall(const char *policy, int call, \ void *arg); } -395 UNIMPL NOHIDE nosys -396 UNIMPL NOHIDE nosys -397 UNIMPL NOHIDE nosys -398 UNIMPL NOHIDE nosys +395 STD BSD { int getfsstat(struct statfs *buf, long bufsize, \ + int flags); } +396 STD BSD { int statfs(char *path, struct statfs *buf); } +397 STD BSD { int fstatfs(int fd, struct statfs *buf); } +398 STD BSD { int fhstatfs(const struct fhandle *u_fhp, \ + struct statfs *buf); } 399 UNIMPL NOHIDE nosys 400 MNOSTD BSD { int ksem_close(semid_t id); } 401 MNOSTD BSD { int ksem_post(semid_t id); } 402 MNOSTD BSD { int ksem_wait(semid_t id); } 403 MNOSTD BSD { int ksem_trywait(semid_t id); } 404 MNOSTD BSD { int ksem_init(semid_t *idp, unsigned int value); } 405 MNOSTD BSD { int ksem_open(semid_t *idp, const char *name, \ int oflag, mode_t mode, unsigned int value); } 406 MNOSTD BSD { int ksem_unlink(const char *name); } 407 MNOSTD BSD { int ksem_getvalue(semid_t id, int *val); } 408 MNOSTD BSD { int ksem_destroy(semid_t id); } 409 MSTD BSD { int __mac_get_pid(pid_t pid, struct mac *mac_p); } 410 MSTD BSD { int __mac_get_link(const char *path_p, \ struct mac *mac_p); } 411 MSTD BSD { int __mac_set_link(const char *path_p, \ struct mac *mac_p); } 412 STD BSD { int extattr_set_link(const char *path, \ int attrnamespace, const char *attrname, \ void *data, size_t nbytes); } 413 STD BSD { ssize_t extattr_get_link(const char *path, \ int attrnamespace, const char *attrname, \ void *data, size_t nbytes); } 414 STD BSD { int extattr_delete_link(const char *path, \ int attrnamespace, const char *attrname); } 415 MSTD BSD { int __mac_execve(char *fname, char **argv, \ char **envv, struct mac *mac_p); } 416 MSTD POSIX { int sigaction(int sig, const struct sigaction *act, \ struct sigaction *oact); } 417 MSTD BSD { int sigreturn(const struct __ucontext *sigcntxp); } 418 UNIMPL BSD __xstat 419 UNIMPL BSD __xfstat 420 UNIMPL BSD __xlstat 421 MSTD BSD { int getcontext(struct __ucontext *ucp); } 422 MSTD BSD { int setcontext(const struct __ucontext *ucp); } 423 MSTD BSD { int swapcontext(struct __ucontext *oucp, \ const struct __ucontext *ucp); } 424 MSTD BSD { int swapoff(const char *name); } 425 MSTD BSD { int __acl_get_link(const char *path, \ acl_type_t type, struct acl *aclp); } 426 MSTD BSD { int __acl_set_link(const char *path, \ acl_type_t type, struct acl *aclp); } 427 MSTD BSD { int __acl_delete_link(const char *path, \ acl_type_t type); } 428 MSTD BSD { int __acl_aclcheck_link(const char *path, \ acl_type_t type, struct acl *aclp); } 429 MSTD NOHIDE { int sigwait(const sigset_t *set, int *sig); } 430 MSTD BSD { int thr_create(ucontext_t *ctx, thr_id_t *id, int flags); } 431 MSTD BSD { void thr_exit(void); } 432 MSTD BSD { int thr_self(thr_id_t *id); } 433 MSTD BSD { int thr_kill(thr_id_t id, int sig); } 434 MSTD BSD { int _umtx_lock(struct umtx *umtx); } 435 MSTD BSD { int _umtx_unlock(struct umtx *umtx); } 436 MSTD BSD { int jail_attach(int jid); } 437 STD BSD { ssize_t extattr_list_fd(int fd, int attrnamespace, \ void *data, size_t nbytes); } 438 STD BSD { ssize_t extattr_list_file(const char *path, \ int attrnamespace, void *data, size_t nbytes); } 439 STD BSD { ssize_t extattr_list_link(const char *path, \ int attrnamespace, 
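The four new slots 395-398 carry the statfs family forward with the enlarged structure, while the old numbers stay behind as COMPAT4 entries. A usage sketch (not part of the diff) of the getfsstat(2) pattern: a NULL buffer yields the number of mounted filesystems, and a second call fills a buffer of that size:

#include <sys/param.h>
#include <sys/ucred.h>
#include <sys/mount.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	struct statfs *buf;
	int i, n;

	n = getfsstat(NULL, 0, MNT_NOWAIT);	/* count only */
	if (n <= 0)
		return (1);
	buf = malloc((size_t)n * sizeof(*buf));
	if (buf == NULL)
		return (1);
	n = getfsstat(buf, n * sizeof(*buf), MNT_NOWAIT);
	for (i = 0; i < n; i++)
		printf("%s %s\n", buf[i].f_fstypename, buf[i].f_mntonname);
	free(buf);
	return (0);
}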
void *data, size_t nbytes); } ; Please copy any additions and changes to the following compatability tables: ; sys/ia64/ia32/syscalls.master (take a best guess) ; [other 64 bit platforms with an alternate 32 bit syscall table go here too] Index: head/sys/kern/vfs_bio.c =================================================================== --- head/sys/kern/vfs_bio.c (revision 122536) +++ head/sys/kern/vfs_bio.c (revision 122537) @@ -1,3812 +1,3812 @@ /* * Copyright (c) 1994,1997 John S. Dyson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. */ /* * this file contains a new buffer I/O scheme implementing a coherent * VM object and buffer cache scheme. Pains have been taken to make * sure that the performance degradation associated with schemes such * as this is not realized. * * Author: John S. Dyson * Significant help during the development and debugging phases * had been provided by David Greenman, also of the FreeBSD core team. * * see man buf(9) for more info. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_directio.h" #include "opt_swap.h" static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer"); struct bio_ops bioops; /* I/O operation notification */ struct buf_ops buf_ops_bio = { "buf_ops_bio", bwrite }; /* * XXX buf is global because kern_shutdown.c and ffs_checkoverlap has * carnal knowledge of buffers. This knowledge should be moved to vfs_bio.c. 
*/ struct buf *buf; /* buffer header pool */ static struct proc *bufdaemonproc; static void vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to); static void vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to); static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m); static void vfs_clean_pages(struct buf * bp); static void vfs_setdirty(struct buf *bp); static void vfs_vmio_release(struct buf *bp); static void vfs_backgroundwritedone(struct buf *bp); static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno); static int flushbufqueues(int flushdeps); static void buf_daemon(void); void bremfreel(struct buf * bp); int vmiodirenable = TRUE; SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0, "Use the VM system for directory writes"); int runningbufspace; SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0, "Amount of presently outstanding async buffer io"); static int bufspace; SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0, "KVA memory used for bufs"); static int maxbufspace; SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0, "Maximum allowed value of bufspace (including buf_daemon)"); static int bufmallocspace; SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0, "Amount of malloced memory for buffers"); static int maxbufmallocspace; SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0, "Maximum amount of malloced memory for buffers"); static int lobufspace; SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0, "Minimum amount of buffers we want to have"); static int hibufspace; SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0, "Maximum allowed value of bufspace (excluding buf_daemon)"); static int bufreusecnt; SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW, &bufreusecnt, 0, "Number of times we have reused a buffer"); static int buffreekvacnt; SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0, "Number of times we have freed the KVA space from some buffer"); static int bufdefragcnt; SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0, "Number of times we have had to repeat buffer allocation to defragment"); static int lorunningspace; SYSCTL_INT(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW, &lorunningspace, 0, "Minimum preferred space used for in-progress I/O"); static int hirunningspace; SYSCTL_INT(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, &hirunningspace, 0, "Maximum amount of space to use for in-progress I/O"); static int dirtybufferflushes; SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes, 0, "Number of bdwrite to bawrite conversions to limit dirty buffers"); static int altbufferflushes; SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes, 0, "Number of fsync flushes to limit dirty buffers"); static int recursiveflushes; SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes, 0, "Number of flushes skipped due to being recursive"); static int numdirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0, "Number of buffers that are dirty (has unwritten changes) at the moment"); static int lodirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0, "How many buffers we want to have free before bufdaemon can sleep"); static int hidirtybuffers; SYSCTL_INT(_vfs, 
OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0, "When the number of dirty buffers is considered severe"); static int dirtybufthresh; SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh, 0, "Number of bdwrite to bawrite conversions to clear dirty buffers"); static int numfreebuffers; SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0, "Number of free buffers"); static int lofreebuffers; SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0, "XXX Unused"); static int hifreebuffers; SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0, "XXX Complicatedly unused"); static int getnewbufcalls; SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0, "Number of calls to getnewbuf"); static int getnewbufrestarts; SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0, "Number of times getnewbuf has had to restart a buffer aquisition"); static int dobkgrdwrite = 1; SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0, "Do background writes (honoring the BV_BKGRDWRITE flag)?"); /* * Wakeup point for bufdaemon, as well as indicator of whether it is already * active. Set to 1 when the bufdaemon is already "on" the queue, 0 when it * is idling. */ static int bd_request; /* * This lock synchronizes access to bd_request. */ static struct mtx bdlock; /* * bogus page -- for I/O to/from partially complete buffers * this is a temporary solution to the problem, but it is not * really that bad. it would be better to split the buffer * for input in the case of buffers partially already in memory, * but the code is intricate enough already. */ vm_page_t bogus_page; /* * Synchronization (sleep/wakeup) variable for active buffer space requests. * Set when wait starts, cleared prior to wakeup(). * Used in runningbufwakeup() and waitrunningbufspace(). */ static int runningbufreq; /* * This lock protects the runningbufreq and synchronizes runningbufwakeup and * waitrunningbufspace(). */ static struct mtx rbreqlock; /* * Synchronization (sleep/wakeup) variable for buffer requests. * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done * by and/or. * Used in numdirtywakeup(), bufspacewakeup(), bufcountwakeup(), bwillwrite(), * getnewbuf(), and getblk(). */ static int needsbuffer; /* * Lock that protects needsbuffer and the sleeps/wakeups surrounding it. */ static struct mtx nblock; /* * Lock that protects against bwait()/bdone()/B_DONE races. */ static struct mtx bdonelock; /* * Definitions for the buffer free lists. */ #define BUFFER_QUEUES 5 /* number of free buffer queues */ #define QUEUE_NONE 0 /* on no queue */ #define QUEUE_CLEAN 1 /* non-B_DELWRI buffers */ #define QUEUE_DIRTY 2 /* B_DELWRI buffers */ #define QUEUE_EMPTYKVA 3 /* empty buffer headers w/KVA assignment */ #define QUEUE_EMPTY 4 /* empty buffer headers */ /* Queues for free buffers with various properties */ static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } }; /* Lock for the bufqueues */ static struct mtx bqlock; /* * Single global constant for BUF_WMESG, to avoid getting multiple references. * buf_wmesg is referred from macros. 
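Each SYSCTL_INT(_vfs, ...) declaration above surfaces as a vfs.* OID, so the buffer-cache accounting can be watched from userland. A small sketch (not part of the diff) reading two of them with sysctlbyname(3):

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int bufspace, numdirtybuffers;
	size_t len = sizeof(int);

	if (sysctlbyname("vfs.bufspace", &bufspace, &len, NULL, 0) != 0)
		return (1);
	len = sizeof(int);
	if (sysctlbyname("vfs.numdirtybuffers", &numdirtybuffers, &len, NULL, 0) != 0)
		return (1);
	printf("bufspace=%d numdirtybuffers=%d\n", bufspace, numdirtybuffers);
	return (0);
}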
*/ const char *buf_wmesg = BUF_WMESG; #define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */ #define VFS_BIO_NEED_DIRTYFLUSH 0x02 /* waiting for dirty buffer flush */ #define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */ #define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */ #ifdef DIRECTIO extern void ffs_rawread_setup(void); #endif /* DIRECTIO */ /* * numdirtywakeup: * * If someone is blocked due to there being too many dirty buffers, * and numdirtybuffers is now reasonable, wake them up. */ static __inline void numdirtywakeup(int level) { if (numdirtybuffers <= level) { mtx_lock(&nblock); if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) { needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH; wakeup(&needsbuffer); } mtx_unlock(&nblock); } } /* * bufspacewakeup: * * Called when buffer space is potentially available for recovery. * getnewbuf() will block on this flag when it is unable to free * sufficient buffer space. Buffer space becomes recoverable when * bp's get placed back in the queues. */ static __inline void bufspacewakeup(void) { /* * If someone is waiting for BUF space, wake them up. Even * though we haven't freed the kva space yet, the waiting * process will be able to now. */ mtx_lock(&nblock); if (needsbuffer & VFS_BIO_NEED_BUFSPACE) { needsbuffer &= ~VFS_BIO_NEED_BUFSPACE; wakeup(&needsbuffer); } mtx_unlock(&nblock); } /* * runningbufwakeup() - in-progress I/O accounting. * */ static __inline void runningbufwakeup(struct buf *bp) { if (bp->b_runningbufspace) { atomic_subtract_int(&runningbufspace, bp->b_runningbufspace); bp->b_runningbufspace = 0; mtx_lock(&rbreqlock); if (runningbufreq && runningbufspace <= lorunningspace) { runningbufreq = 0; wakeup(&runningbufreq); } mtx_unlock(&rbreqlock); } } /* * bufcountwakeup: * * Called when a buffer has been added to one of the free queues to * account for the buffer and to wakeup anyone waiting for free buffers. * This typically occurs when large amounts of metadata are being handled * by the buffer cache ( else buffer space runs out first, usually ). */ static __inline void bufcountwakeup(void) { atomic_add_int(&numfreebuffers, 1); mtx_lock(&nblock); if (needsbuffer) { needsbuffer &= ~VFS_BIO_NEED_ANY; if (numfreebuffers >= hifreebuffers) needsbuffer &= ~VFS_BIO_NEED_FREE; wakeup(&needsbuffer); } mtx_unlock(&nblock); } /* * waitrunningbufspace() * * runningbufspace is a measure of the amount of I/O currently * running. This routine is used in async-write situations to * prevent creating huge backups of pending writes to a device. * Only asynchronous writes are governed by this function. * * Reads will adjust runningbufspace, but will not block based on it. * The read load has a side effect of reducing the allowed write load. * * This does NOT turn an async write into a sync write. It waits * for earlier writes to complete and generally returns before the * caller's write has reached the device. */ static __inline void waitrunningbufspace(void) { mtx_lock(&rbreqlock); while (runningbufspace > hirunningspace) { ++runningbufreq; msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0); } mtx_unlock(&rbreqlock); } /* * vfs_buf_test_cache: * * Called when a buffer is extended. This function clears the B_CACHE * bit if the newly extended portion of the buffer does not contain * valid data. 
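waitrunningbufspace() and runningbufwakeup() above form a classic high/low watermark throttle around runningbufspace, built from a mutex plus msleep()/wakeup(). A userland pthreads analogue (not part of the diff; names and numbers are illustrative) of the same shape:

#include <pthread.h>

static pthread_mutex_t rblock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t rbcond = PTHREAD_COND_INITIALIZER;
static long runningspace;
static const long lorunning = 512 * 1024, hirunning = 1024 * 1024;

static void
running_start(long iosize)		/* async write queued */
{
	pthread_mutex_lock(&rblock);
	runningspace += iosize;
	pthread_mutex_unlock(&rblock);
}

static void
running_done(long iosize)		/* I/O completion path */
{
	pthread_mutex_lock(&rblock);
	runningspace -= iosize;
	if (runningspace <= lorunning)	/* back under the low watermark */
		pthread_cond_broadcast(&rbcond);
	pthread_mutex_unlock(&rblock);
}

static void
wait_running_space(void)		/* called before issuing another async write */
{
	pthread_mutex_lock(&rblock);
	while (runningspace > hirunning)
		pthread_cond_wait(&rbcond, &rblock);
	pthread_mutex_unlock(&rblock);
}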
*/ static __inline__ void vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, vm_page_t m) { GIANT_REQUIRED; VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); if (bp->b_flags & B_CACHE) { int base = (foff + off) & PAGE_MASK; if (vm_page_is_valid(m, base, size) == 0) bp->b_flags &= ~B_CACHE; } } /* Wake up the buffer deamon if necessary */ static __inline__ void bd_wakeup(int dirtybuflevel) { mtx_lock(&bdlock); if (bd_request == 0 && numdirtybuffers >= dirtybuflevel) { bd_request = 1; wakeup(&bd_request); } mtx_unlock(&bdlock); } /* * bd_speedup - speedup the buffer cache flushing code */ static __inline__ void bd_speedup(void) { bd_wakeup(1); } /* * Calculating buffer cache scaling values and reserve space for buffer * headers. This is called during low level kernel initialization and * may be called more then once. We CANNOT write to the memory area * being reserved at this time. */ caddr_t kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est) { /* * physmem_est is in pages. Convert it to kilobytes (assumes * PAGE_SIZE is >= 1K) */ physmem_est = physmem_est * (PAGE_SIZE / 1024); /* * The nominal buffer size (and minimum KVA allocation) is BKVASIZE. * For the first 64MB of ram nominally allocate sufficient buffers to * cover 1/4 of our ram. Beyond the first 64MB allocate additional * buffers to cover 1/20 of our ram over 64MB. When auto-sizing * the buffer cache we limit the eventual kva reservation to * maxbcache bytes. * * factor represents the 1/4 x ram conversion. */ if (nbuf == 0) { int factor = 4 * BKVASIZE / 1024; nbuf = 50; if (physmem_est > 4096) nbuf += min((physmem_est - 4096) / factor, 65536 / factor); if (physmem_est > 65536) nbuf += (physmem_est - 65536) * 2 / (factor * 5); if (maxbcache && nbuf > maxbcache / BKVASIZE) nbuf = maxbcache / BKVASIZE; } #if 0 /* * Do not allow the buffer_map to be more then 1/2 the size of the * kernel_map. */ if (nbuf > (kernel_map->max_offset - kernel_map->min_offset) / (BKVASIZE * 2)) { nbuf = (kernel_map->max_offset - kernel_map->min_offset) / (BKVASIZE * 2); printf("Warning: nbufs capped at %d\n", nbuf); } #endif /* * swbufs are used as temporary holders for I/O, such as paging I/O. * We have no less then 16 and no more then 256. */ nswbuf = max(min(nbuf/4, 256), 16); #ifdef NSWBUF_MIN if (nswbuf < NSWBUF_MIN) nswbuf = NSWBUF_MIN; #endif #ifdef DIRECTIO ffs_rawread_setup(); #endif /* * Reserve space for the buffer cache buffers */ swbuf = (void *)v; v = (caddr_t)(swbuf + nswbuf); buf = (void *)v; v = (caddr_t)(buf + nbuf); return(v); } /* Initialize the buffer subsystem. Called before use of any buffers. 
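kern_vfs_bio_buffer_alloc() above sizes nbuf from the memory estimate: roughly 1/4 of the first 64 MB of RAM and 1/20 of the rest, expressed in BKVASIZE chunks. A worked sketch (not part of the diff) of that formula, assuming BKVASIZE is 16384 bytes, the usual value but an assumption here:

#include <stdio.h>

#define BKVASIZE 16384
#define MIN(a, b) ((a) < (b) ? (a) : (b))

static long
size_nbuf(long physmem_kb)
{
	long factor = 4 * BKVASIZE / 1024;	/* 64 */
	long nbuf = 50;

	if (physmem_kb > 4096)
		nbuf += MIN((physmem_kb - 4096) / factor, 65536 / factor);
	if (physmem_kb > 65536)
		nbuf += (physmem_kb - 65536) * 2 / (factor * 5);
	return (nbuf);
}

int
main(void)
{
	printf("%ld\n", size_nbuf(256 * 1024));	/* 256 MB of RAM -> 2302 buffers */
	return (0);
}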
*/ void bufinit(void) { struct buf *bp; int i; GIANT_REQUIRED; mtx_init(&bqlock, "buf queue lock", NULL, MTX_DEF); mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF); mtx_init(&nblock, "needsbuffer lock", NULL, MTX_DEF); mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF); mtx_init(&bdonelock, "bdone lock", NULL, MTX_DEF); /* next, make a null set of free lists */ for (i = 0; i < BUFFER_QUEUES; i++) TAILQ_INIT(&bufqueues[i]); /* finally, initialize each buffer header and stick on empty q */ for (i = 0; i < nbuf; i++) { bp = &buf[i]; bzero(bp, sizeof *bp); bp->b_flags = B_INVAL; /* we're just an empty header */ bp->b_dev = NODEV; bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = QUEUE_EMPTY; bp->b_vflags = 0; bp->b_xflags = 0; LIST_INIT(&bp->b_dep); BUF_LOCKINIT(bp); TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); } /* * maxbufspace is the absolute maximum amount of buffer space we are * allowed to reserve in KVM and in real terms. The absolute maximum * is nominally used by buf_daemon. hibufspace is the nominal maximum * used by most other processes. The differential is required to * ensure that buf_daemon is able to run when other processes might * be blocked waiting for buffer space. * * maxbufspace is based on BKVASIZE. Allocating buffers larger then * this may result in KVM fragmentation which is not handled optimally * by the system. */ maxbufspace = nbuf * BKVASIZE; hibufspace = imax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10); lobufspace = hibufspace - MAXBSIZE; lorunningspace = 512 * 1024; hirunningspace = 1024 * 1024; /* * Limit the amount of malloc memory since it is wired permanently into * the kernel space. Even though this is accounted for in the buffer * allocation, we don't want the malloced region to grow uncontrolled. * The malloc scheme improves memory utilization significantly on average * (small) directories. */ maxbufmallocspace = hibufspace / 20; /* * Reduce the chance of a deadlock occuring by limiting the number * of delayed-write dirty buffers we allow to stack up. */ hidirtybuffers = nbuf / 4 + 20; dirtybufthresh = hidirtybuffers * 9 / 10; numdirtybuffers = 0; /* * To support extreme low-memory systems, make sure hidirtybuffers cannot * eat up all available buffer space. This occurs when our minimum cannot * be met. We try to size hidirtybuffers to 3/4 our buffer space assuming * BKVASIZE'd (8K) buffers. */ while (hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) { hidirtybuffers >>= 1; } lodirtybuffers = hidirtybuffers / 2; /* * Try to keep the number of free buffers in the specified range, * and give special processes (e.g. like buf_daemon) access to an * emergency reserve. */ lofreebuffers = nbuf / 18 + 5; hifreebuffers = 2 * lofreebuffers; numfreebuffers = nbuf; /* * Maximum number of async ops initiated per buf_daemon loop. This is * somewhat of a hack at the moment, we really need to limit ourselves * based on the number of bytes of I/O in-transit that were initiated * from buf_daemon. */ bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED); } /* * bfreekva() - free the kva allocation for a buffer. * * Must be called at splbio() or higher as this is the only locking for * buffer_map. * * Since this call frees up buffer space, we call bufspacewakeup(). 
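*
* The main consumer is getnewbuf() below, which calls bfreekva() for
* example when it defragments buffer_map and when it discards a buffer
* while bufspace is at or above hibufspace.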
*/ static void bfreekva(struct buf * bp) { GIANT_REQUIRED; if (bp->b_kvasize) { atomic_add_int(&buffreekvacnt, 1); atomic_subtract_int(&bufspace, bp->b_kvasize); vm_map_delete(buffer_map, (vm_offset_t) bp->b_kvabase, (vm_offset_t) bp->b_kvabase + bp->b_kvasize ); bp->b_kvasize = 0; bufspacewakeup(); } } /* * bremfree: * * Remove the buffer from the appropriate free list. */ void bremfree(struct buf * bp) { mtx_lock(&bqlock); bremfreel(bp); mtx_unlock(&bqlock); } void bremfreel(struct buf * bp) { int s = splbio(); int old_qindex = bp->b_qindex; GIANT_REQUIRED; if (bp->b_qindex != QUEUE_NONE) { KASSERT(BUF_REFCNT(bp) == 1, ("bremfree: bp %p not locked",bp)); TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); bp->b_qindex = QUEUE_NONE; } else { if (BUF_REFCNT(bp) <= 1) panic("bremfree: removing a buffer not on a queue"); } /* * Fixup numfreebuffers count. If the buffer is invalid or not * delayed-write, and it was on the EMPTY, LRU, or AGE queues, * the buffer was free and we must decrement numfreebuffers. */ if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) { switch(old_qindex) { case QUEUE_DIRTY: case QUEUE_CLEAN: case QUEUE_EMPTY: case QUEUE_EMPTYKVA: atomic_subtract_int(&numfreebuffers, 1); break; default: break; } } splx(s); } /* * Get a buffer with the specified data. Look in the cache first. We * must clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE * is set, the buffer is valid and we do not have to do anything ( see * getblk() ). This is really just a special case of breadn(). */ int bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred, struct buf ** bpp) { return (breadn(vp, blkno, size, 0, 0, 0, cred, bpp)); } /* * Operates like bread, but also starts asynchronous I/O on * read-ahead blocks. We must clear BIO_ERROR and B_INVAL prior * to initiating I/O . If B_CACHE is set, the buffer is valid * and we do not have to do anything. */ int breadn(struct vnode * vp, daddr_t blkno, int size, daddr_t * rablkno, int *rabsize, int cnt, struct ucred * cred, struct buf ** bpp) { struct buf *bp, *rabp; int i; int rv = 0, readwait = 0; *bpp = bp = getblk(vp, blkno, size, 0, 0, 0); /* if not found in cache, do some I/O */ if ((bp->b_flags & B_CACHE) == 0) { if (curthread != PCPU_GET(idlethread)) curthread->td_proc->p_stats->p_ru.ru_inblock++; bp->b_iocmd = BIO_READ; bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; if (bp->b_rcred == NOCRED && cred != NOCRED) bp->b_rcred = crhold(cred); vfs_busy_pages(bp, 0); bp->b_iooffset = dbtob(bp->b_blkno); if (vp->v_type == VCHR) VOP_SPECSTRATEGY(vp, bp); else VOP_STRATEGY(vp, bp); ++readwait; } for (i = 0; i < cnt; i++, rablkno++, rabsize++) { if (inmem(vp, *rablkno)) continue; rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0); if ((rabp->b_flags & B_CACHE) == 0) { if (curthread != PCPU_GET(idlethread)) curthread->td_proc->p_stats->p_ru.ru_inblock++; rabp->b_flags |= B_ASYNC; rabp->b_flags &= ~B_INVAL; rabp->b_ioflags &= ~BIO_ERROR; rabp->b_iocmd = BIO_READ; if (rabp->b_rcred == NOCRED && cred != NOCRED) rabp->b_rcred = crhold(cred); vfs_busy_pages(rabp, 0); BUF_KERNPROC(rabp); rabp->b_iooffset = dbtob(rabp->b_blkno); if (vp->v_type == VCHR) VOP_SPECSTRATEGY(vp, rabp); else VOP_STRATEGY(vp, rabp); } else { brelse(rabp); } } if (readwait) { rv = bufwait(bp); } return (rv); } /* * Write, release buffer on completion. (Done by iodone * if async). Do not bother writing anything if the buffer * is invalid. * * Note that we set B_CACHE here, indicating that buffer is * fully valid and thus cacheable. 
This is true even of NFS * now so we set it generally. This could be set either here * or in biodone() since the I/O is synchronous. We put it * here. */ int bwrite(struct buf * bp) { int oldflags, s; struct buf *newbp; if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } oldflags = bp->b_flags; if (BUF_REFCNT(bp) == 0) panic("bwrite: buffer is not busy???"); s = splbio(); /* * If a background write is already in progress, delay * writing this block if it is asynchronous. Otherwise * wait for the background write to complete. */ VI_LOCK(bp->b_vp); if (bp->b_vflags & BV_BKGRDINPROG) { if (bp->b_flags & B_ASYNC) { VI_UNLOCK(bp->b_vp); splx(s); bdwrite(bp); return (0); } bp->b_vflags |= BV_BKGRDWAIT; msleep(&bp->b_xflags, VI_MTX(bp->b_vp), PRIBIO, "bwrbg", 0); if (bp->b_vflags & BV_BKGRDINPROG) panic("bwrite: still writing"); } VI_UNLOCK(bp->b_vp); /* Mark the buffer clean */ bundirty(bp); /* * If this buffer is marked for background writing and we * do not have to wait for it, make a copy and write the * copy so as to leave this buffer ready for further use. * * This optimization eats a lot of memory. If we have a page * or buffer shortfall we can't do it. */ if (dobkgrdwrite && (bp->b_xflags & BX_BKGRDWRITE) && (bp->b_flags & B_ASYNC) && !vm_page_count_severe() && !buf_dirty_count_severe()) { if (bp->b_iodone != NULL) { printf("bp->b_iodone = %p\n", bp->b_iodone); panic("bwrite: need chained iodone"); } /* get a new block */ newbp = geteblk(bp->b_bufsize); /* * set it to be identical to the old block. We have to * set b_lblkno and BKGRDMARKER before calling bgetvp() * to avoid confusing the splay tree and gbincore(). */ memcpy(newbp->b_data, bp->b_data, bp->b_bufsize); newbp->b_lblkno = bp->b_lblkno; newbp->b_xflags |= BX_BKGRDMARKER; VI_LOCK(bp->b_vp); bp->b_vflags |= BV_BKGRDINPROG; bgetvp(bp->b_vp, newbp); VI_UNLOCK(bp->b_vp); newbp->b_blkno = bp->b_blkno; newbp->b_offset = bp->b_offset; newbp->b_iodone = vfs_backgroundwritedone; newbp->b_flags |= B_ASYNC; newbp->b_flags &= ~B_INVAL; /* move over the dependencies */ if (LIST_FIRST(&bp->b_dep) != NULL) buf_movedeps(bp, newbp); /* * Initiate write on the copy, release the original to * the B_LOCKED queue so that it cannot go away until * the background write completes. If not locked it could go * away and then be reconstituted while it was being written. * If the reconstituted buffer were written, we could end up * with two background copies being written at the same time. */ bqrelse(bp); bp = newbp; } bp->b_flags &= ~B_DONE; bp->b_ioflags &= ~BIO_ERROR; bp->b_flags |= B_WRITEINPROG | B_CACHE; bp->b_iocmd = BIO_WRITE; VI_LOCK(bp->b_vp); bp->b_vp->v_numoutput++; VI_UNLOCK(bp->b_vp); vfs_busy_pages(bp, 1); /* * Normal bwrites pipeline writes */ bp->b_runningbufspace = bp->b_bufsize; atomic_add_int(&runningbufspace, bp->b_runningbufspace); if (curthread != PCPU_GET(idlethread)) curthread->td_proc->p_stats->p_ru.ru_oublock++; splx(s); if (oldflags & B_ASYNC) BUF_KERNPROC(bp); bp->b_iooffset = dbtob(bp->b_blkno); if (bp->b_vp->v_type == VCHR) VOP_SPECSTRATEGY(bp->b_vp, bp); else VOP_STRATEGY(bp->b_vp, bp); if ((oldflags & B_ASYNC) == 0) { int rtval = bufwait(bp); brelse(bp); return (rtval); } else { /* * don't allow the async write to saturate the I/O * system. We will not deadlock here because * we are blocking waiting for I/O that is already in-progress * to complete. We do not block here if it is the update * or syncer daemon trying to clean up as that can lead * to deadlock. 
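*
* The throttle is waitrunningbufspace(): it sleeps while runningbufspace
* exceeds hirunningspace (1MB as set in bufinit()), and runningbufwakeup()
* wakes it once completing writes bring runningbufspace back down to
* lorunningspace.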
*/ if (curthread->td_proc != bufdaemonproc && curthread->td_proc != updateproc) waitrunningbufspace(); } return (0); } /* * Complete a background write started from bwrite. */ static void vfs_backgroundwritedone(bp) struct buf *bp; { struct buf *origbp; /* * Find the original buffer that we are writing. */ VI_LOCK(bp->b_vp); if ((origbp = gbincore(bp->b_vp, bp->b_lblkno)) == NULL) panic("backgroundwritedone: lost buffer"); /* * Clear the BV_BKGRDINPROG flag in the original buffer * and awaken it if it is waiting for the write to complete. * If BV_BKGRDINPROG is not set in the original buffer it must * have been released and re-instantiated - which is not legal. */ KASSERT((origbp->b_vflags & BV_BKGRDINPROG), ("backgroundwritedone: lost buffer2")); origbp->b_vflags &= ~BV_BKGRDINPROG; if (origbp->b_vflags & BV_BKGRDWAIT) { origbp->b_vflags &= ~BV_BKGRDWAIT; wakeup(&origbp->b_xflags); } VI_UNLOCK(bp->b_vp); /* * Process dependencies then return any unfinished ones. */ if (LIST_FIRST(&bp->b_dep) != NULL) buf_complete(bp); if (LIST_FIRST(&bp->b_dep) != NULL) buf_movedeps(bp, origbp); /* * This buffer is marked B_NOCACHE, so when it is released * by biodone, it will be tossed. We mark it with BIO_READ * to avoid biodone doing a second vwakeup. */ bp->b_flags |= B_NOCACHE; bp->b_iocmd = BIO_READ; bp->b_flags &= ~(B_CACHE | B_DONE); bp->b_iodone = 0; bufdone(bp); } /* * Delayed write. (Buffer is marked dirty). Do not bother writing * anything if the buffer is marked invalid. * * Note that since the buffer must be completely valid, we can safely * set B_CACHE. In fact, we have to set B_CACHE here rather then in * biodone() in order to prevent getblk from writing the buffer * out synchronously. */ void bdwrite(struct buf * bp) { struct thread *td = curthread; struct vnode *vp; struct buf *nbp; GIANT_REQUIRED; if (BUF_REFCNT(bp) == 0) panic("bdwrite: buffer is not busy"); if (bp->b_flags & B_INVAL) { brelse(bp); return; } /* * If we have too many dirty buffers, don't create any more. * If we are wildly over our limit, then force a complete * cleanup. Otherwise, just keep the situation from getting * out of control. Note that we have to avoid a recursive * disaster and not try to clean up after our own cleanup! */ vp = bp->b_vp; VI_LOCK(vp); if (td->td_pflags & TDP_COWINPROGRESS) { recursiveflushes++; } else if (vp != NULL && vp->v_dirtybufcnt > dirtybufthresh + 10) { VI_UNLOCK(vp); (void) VOP_FSYNC(vp, td->td_ucred, MNT_NOWAIT, td); VI_LOCK(vp); altbufferflushes++; } else if (vp != NULL && vp->v_dirtybufcnt > dirtybufthresh) { /* * Try to find a buffer to flush. */ TAILQ_FOREACH(nbp, &vp->v_dirtyblkhd, b_vnbufs) { if ((nbp->b_vflags & BV_BKGRDINPROG) || buf_countdeps(nbp, 0) || BUF_LOCK(nbp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) continue; if (bp == nbp) panic("bdwrite: found ourselves"); VI_UNLOCK(vp); if (nbp->b_flags & B_CLUSTEROK) { vfs_bio_awrite(nbp); } else { bremfree(nbp); bawrite(nbp); } VI_LOCK(vp); dirtybufferflushes++; break; } } VI_UNLOCK(vp); bdirty(bp); /* * Set B_CACHE, indicating that the buffer is fully valid. This is * true even of NFS now. */ bp->b_flags |= B_CACHE; /* * This bmap keeps the system from needing to do the bmap later, * perhaps when the system is attempting to do a sync. Since it * is likely that the indirect block -- or whatever other datastructure * that the filesystem needs is still in memory now, it is a good * thing to do this. Note also, that if the pageout daemon is * requesting a sync -- there might not be enough memory to do * the bmap then... 
So, this is important to do. */ if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) { VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); } /* * Set the *dirty* buffer range based upon the VM system dirty pages. */ vfs_setdirty(bp); /* * We need to do this here to satisfy the vnode_pager and the * pageout daemon, so that it thinks that the pages have been * "cleaned". Note that since the pages are in a delayed write * buffer -- the VFS layer "will" see that the pages get written * out on the next sync, or perhaps the cluster will be completed. */ vfs_clean_pages(bp); bqrelse(bp); /* * Wakeup the buffer flushing daemon if we have a lot of dirty * buffers (midpoint between our recovery point and our stall * point). */ bd_wakeup((lodirtybuffers + hidirtybuffers) / 2); /* * note: we cannot initiate I/O from a bdwrite even if we wanted to, * due to the softdep code. */ } /* * bdirty: * * Turn buffer into delayed write request. We must clear BIO_READ and * B_RELBUF, and we must set B_DELWRI. We reassign the buffer to * itself to properly update it in the dirty/clean lists. We mark it * B_DONE to ensure that any asynchronization of the buffer properly * clears B_DONE ( else a panic will occur later ). * * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty() * should only be called if the buffer is known-good. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * Must be called at splbio(). * The buffer must be on QUEUE_NONE. */ void bdirty(bp) struct buf *bp; { KASSERT(bp->b_qindex == QUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); bp->b_flags &= ~(B_RELBUF); bp->b_iocmd = BIO_WRITE; if ((bp->b_flags & B_DELWRI) == 0) { bp->b_flags |= B_DONE | B_DELWRI; reassignbuf(bp, bp->b_vp); atomic_add_int(&numdirtybuffers, 1); bd_wakeup((lodirtybuffers + hidirtybuffers) / 2); } } /* * bundirty: * * Clear B_DELWRI for buffer. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * Must be called at splbio(). * The buffer must be on QUEUE_NONE. */ void bundirty(bp) struct buf *bp; { KASSERT(bp->b_qindex == QUEUE_NONE, ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex)); if (bp->b_flags & B_DELWRI) { bp->b_flags &= ~B_DELWRI; reassignbuf(bp, bp->b_vp); atomic_subtract_int(&numdirtybuffers, 1); numdirtywakeup(lodirtybuffers); } /* * Since it is now being written, we can clear its deferred write flag. */ bp->b_flags &= ~B_DEFERRED; } /* * bawrite: * * Asynchronous write. Start output on a buffer, but do not wait for * it to complete. The buffer is released when the output completes. * * bwrite() ( or the VOP routine anyway ) is responsible for handling * B_INVAL buffers. Not us. */ void bawrite(struct buf * bp) { bp->b_flags |= B_ASYNC; (void) BUF_WRITE(bp); } /* * bwillwrite: * * Called prior to the locking of any vnodes when we are expecting to * write. We do not want to starve the buffer cache with too many * dirty buffers so we block here. By blocking prior to the locking * of any vnodes we attempt to avoid the situation where a locked vnode * prevents the various system daemons from flushing related buffers. 
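*
* Typical usage (illustrative): a code path that is about to dirty
* buffers, such as a file write, calls bwillwrite() before taking any
* vnode locks; if numdirtybuffers has reached hidirtybuffers the thread
* sleeps here, holding no vnode locks, until buf_daemon has flushed
* enough dirty buffers.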
*/ void bwillwrite(void) { if (numdirtybuffers >= hidirtybuffers) { int s; mtx_lock(&Giant); s = splbio(); mtx_lock(&nblock); while (numdirtybuffers >= hidirtybuffers) { bd_wakeup(1); needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH; msleep(&needsbuffer, &nblock, (PRIBIO + 4), "flswai", 0); } splx(s); mtx_unlock(&nblock); mtx_unlock(&Giant); } } /* * Return true if we have too many dirty buffers. */ int buf_dirty_count_severe(void) { return(numdirtybuffers >= hidirtybuffers); } /* * brelse: * * Release a busy buffer and, if requested, free its resources. The * buffer will be stashed in the appropriate bufqueue[] allowing it * to be accessed later as a cache entity or reused for other purposes. */ void brelse(struct buf * bp) { int s; GIANT_REQUIRED; KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); s = splbio(); if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) && !(bp->b_flags & B_INVAL)) { /* * Failed write, redirty. Must clear BIO_ERROR to prevent * pages from being scrapped. If B_INVAL is set then * this case is not run and the next case is run to * destroy the buffer. B_INVAL can occur if the buffer * is outside the range supported by the underlying device. */ bp->b_ioflags &= ~BIO_ERROR; bdirty(bp); } else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) || (bp->b_ioflags & BIO_ERROR) || bp->b_iocmd == BIO_DELETE || (bp->b_bufsize <= 0)) { /* * Either a failed I/O or we were asked to free or not * cache the buffer. */ bp->b_flags |= B_INVAL; if (LIST_FIRST(&bp->b_dep) != NULL) buf_deallocate(bp); if (bp->b_flags & B_DELWRI) { atomic_subtract_int(&numdirtybuffers, 1); numdirtywakeup(lodirtybuffers); } bp->b_flags &= ~(B_DELWRI | B_CACHE); if ((bp->b_flags & B_VMIO) == 0) { if (bp->b_bufsize) allocbuf(bp, 0); if (bp->b_vp) brelvp(bp); } } /* * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_release() * is called with B_DELWRI set, the underlying pages may wind up * getting freed causing a previous write (bdwrite()) to get 'lost' * because pages associated with a B_DELWRI bp are marked clean. * * We still allow the B_INVAL case to call vfs_vmio_release(), even * if B_DELWRI is set. * * If B_DELWRI is not set we may have to set B_RELBUF if we are low * on pages to return pages to the VM page queues. */ if (bp->b_flags & B_DELWRI) bp->b_flags &= ~B_RELBUF; else if (vm_page_count_severe()) { /* * XXX This lock may not be necessary since BKGRDINPROG * cannot be set while we hold the buf lock, it can only be * cleared if it is already pending. */ if (bp->b_vp) { VI_LOCK(bp->b_vp); if (!(bp->b_vflags & BV_BKGRDINPROG)) bp->b_flags |= B_RELBUF; VI_UNLOCK(bp->b_vp); } else bp->b_flags |= B_RELBUF; } /* * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer * constituted, not even NFS buffers now. Two flags effect this. If * B_INVAL, the struct buf is invalidated but the VM object is kept * around ( i.e. so it is trivial to reconstitute the buffer later ). * * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be * invalidated. BIO_ERROR cannot be set for a failed write unless the * buffer is also B_INVAL because it hits the re-dirtying code above. * * Normally we can do this whether a buffer is B_DELWRI or not. If * the buffer is an NFS buffer, it is tracking piecemeal writes or * the commit state and we cannot afford to lose the buffer. If the * buffer has a background write in progress, we need to keep it * around to prevent it from being reconstituted and starting a second * background write. 
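*
* Concretely, the test below skips the per-page rundown only for a
* B_DELWRI buffer whose vnode lives on a network filesystem (VFCF_NETWORK)
* and is not a disk device; every other VMIO buffer goes through the
* normal page invalidation/release handling.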
*/ if ((bp->b_flags & B_VMIO) && !(bp->b_vp->v_mount != NULL && (bp->b_vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 && !vn_isdisk(bp->b_vp, NULL) && (bp->b_flags & B_DELWRI)) ) { int i, j, resid; vm_page_t m; off_t foff; vm_pindex_t poff; vm_object_t obj; struct vnode *vp; vp = bp->b_vp; obj = bp->b_object; /* * Get the base offset and length of the buffer. Note that * in the VMIO case if the buffer block size is not * page-aligned then b_data pointer may not be page-aligned. * But our b_pages[] array *IS* page aligned. * * block sizes less then DEV_BSIZE (usually 512) are not * supported due to the page granularity bits (m->valid, * m->dirty, etc...). * * See man buf(9) for more information */ resid = bp->b_bufsize; foff = bp->b_offset; VM_OBJECT_LOCK(obj); for (i = 0; i < bp->b_npages; i++) { int had_bogus = 0; m = bp->b_pages[i]; vm_page_lock_queues(); vm_page_flag_clear(m, PG_ZERO); vm_page_unlock_queues(); /* * If we hit a bogus page, fixup *all* the bogus pages * now. */ if (m == bogus_page) { poff = OFF_TO_IDX(bp->b_offset); had_bogus = 1; for (j = i; j < bp->b_npages; j++) { vm_page_t mtmp; mtmp = bp->b_pages[j]; if (mtmp == bogus_page) { mtmp = vm_page_lookup(obj, poff + j); if (!mtmp) { panic("brelse: page missing\n"); } bp->b_pages[j] = mtmp; } } if ((bp->b_flags & B_INVAL) == 0) { pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } m = bp->b_pages[i]; } if ((bp->b_flags & B_NOCACHE) || (bp->b_ioflags & BIO_ERROR)) { int poffset = foff & PAGE_MASK; int presid = resid > (PAGE_SIZE - poffset) ? (PAGE_SIZE - poffset) : resid; KASSERT(presid >= 0, ("brelse: extra page")); vm_page_lock_queues(); vm_page_set_invalid(m, poffset, presid); vm_page_unlock_queues(); if (had_bogus) printf("avoided corruption bug in bogus_page/brelse code\n"); } resid -= PAGE_SIZE - (foff & PAGE_MASK); foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; } VM_OBJECT_UNLOCK(obj); if (bp->b_flags & (B_INVAL | B_RELBUF)) vfs_vmio_release(bp); } else if (bp->b_flags & B_VMIO) { if (bp->b_flags & (B_INVAL | B_RELBUF)) { vfs_vmio_release(bp); } } if (bp->b_qindex != QUEUE_NONE) panic("brelse: free buffer onto another queue???"); if (BUF_REFCNT(bp) > 1) { /* do not release to free list */ BUF_UNLOCK(bp); splx(s); return; } /* enqueue */ mtx_lock(&bqlock); /* buffers with no memory */ if (bp->b_bufsize == 0) { bp->b_flags |= B_INVAL; bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 1"); if (bp->b_kvasize) { bp->b_qindex = QUEUE_EMPTYKVA; } else { bp->b_qindex = QUEUE_EMPTY; } TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); bp->b_dev = NODEV; /* buffers with junk contents */ } else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) || (bp->b_ioflags & BIO_ERROR)) { bp->b_flags |= B_INVAL; bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 2"); bp->b_qindex = QUEUE_CLEAN; TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist); bp->b_dev = NODEV; /* remaining buffers */ } else { if (bp->b_flags & B_DELWRI) bp->b_qindex = QUEUE_DIRTY; else bp->b_qindex = QUEUE_CLEAN; if (bp->b_flags & B_AGE) TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); else TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist); } mtx_unlock(&bqlock); /* * If B_INVAL and B_DELWRI is set, clear B_DELWRI. We have already * placed the buffer on the correct queue. We must also disassociate * the device and vnode for a B_INVAL buffer so gbincore() doesn't * find it. 
*/ if (bp->b_flags & B_INVAL) { if (bp->b_flags & B_DELWRI) bundirty(bp); if (bp->b_vp) brelvp(bp); } /* * Fixup numfreebuffers count. The bp is on an appropriate queue * unless locked. We then bump numfreebuffers if it is not B_DELWRI. * We've already handled the B_INVAL case ( B_DELWRI will be clear * if B_INVAL is set ). */ if (!(bp->b_flags & B_DELWRI)) bufcountwakeup(); /* * Something we can maybe free or reuse */ if (bp->b_bufsize || bp->b_kvasize) bufspacewakeup(); bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT); if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("brelse: not dirty"); /* unlock */ BUF_UNLOCK(bp); splx(s); } /* * Release a buffer back to the appropriate queue but do not try to free * it. The buffer is expected to be used again soon. * * bqrelse() is used by bdwrite() to requeue a delayed write, and used by * biodone() to requeue an async I/O on completion. It is also used when * known good buffers need to be requeued but we think we may need the data * again soon. * * XXX we should be able to leave the B_RELBUF hint set on completion. */ void bqrelse(struct buf * bp) { int s; s = splbio(); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); if (bp->b_qindex != QUEUE_NONE) panic("bqrelse: free buffer onto another queue???"); if (BUF_REFCNT(bp) > 1) { /* do not release to free list */ BUF_UNLOCK(bp); splx(s); return; } mtx_lock(&bqlock); /* buffers with stale but valid contents */ if (bp->b_flags & B_DELWRI) { bp->b_qindex = QUEUE_DIRTY; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist); } else { /* * XXX This lock may not be necessary since BKGRDINPROG * cannot be set while we hold the buf lock, it can only be * cleared if it is already pending. */ VI_LOCK(bp->b_vp); if (!vm_page_count_severe() || bp->b_vflags & BV_BKGRDINPROG) { VI_UNLOCK(bp->b_vp); bp->b_qindex = QUEUE_CLEAN; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist); } else { /* * We are too low on memory, we have to try to free * the buffer (most importantly: the wired pages * making up its backing store) *now*. */ VI_UNLOCK(bp->b_vp); mtx_unlock(&bqlock); splx(s); brelse(bp); return; } } mtx_unlock(&bqlock); if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI)) bufcountwakeup(); /* * Something we can maybe free or reuse. */ if (bp->b_bufsize && !(bp->b_flags & B_DELWRI)) bufspacewakeup(); bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("bqrelse: not dirty"); /* unlock */ BUF_UNLOCK(bp); splx(s); } /* Give pages used by the bp back to the VM system (where possible) */ static void vfs_vmio_release(bp) struct buf *bp; { int i; vm_page_t m; GIANT_REQUIRED; if (bp->b_object != NULL) VM_OBJECT_LOCK(bp->b_object); vm_page_lock_queues(); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; bp->b_pages[i] = NULL; /* * In order to keep page LRU ordering consistent, put * everything on the inactive queue. */ vm_page_unwire(m, 0); /* * We don't mess with busy pages, it is * the responsibility of the process that * busied the pages to deal with them. */ if ((m->flags & PG_BUSY) || (m->busy != 0)) continue; if (m->wire_count == 0) { vm_page_flag_clear(m, PG_ZERO); /* * Might as well free the page if we can and it has * no valid data. 
We also free the page if the * buffer was used for direct I/O */ if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && m->hold_count == 0) { vm_page_busy(m); pmap_remove_all(m); vm_page_free(m); } else if (bp->b_flags & B_DIRECT) { vm_page_try_to_free(m); } else if (vm_page_count_severe()) { vm_page_try_to_cache(m); } } } vm_page_unlock_queues(); if (bp->b_object != NULL) VM_OBJECT_UNLOCK(bp->b_object); pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); if (bp->b_bufsize) { bufspacewakeup(); bp->b_bufsize = 0; } bp->b_npages = 0; bp->b_flags &= ~B_VMIO; if (bp->b_vp) brelvp(bp); } /* * Check to see if a block at a particular lbn is available for a clustered * write. */ static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno) { struct buf *bpa; int match; match = 0; /* If the buf isn't in core skip it */ if ((bpa = gbincore(vp, lblkno)) == NULL) return (0); /* If the buf is busy we don't want to wait for it */ if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) return (0); /* Only cluster with valid clusterable delayed write buffers */ if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) != (B_DELWRI | B_CLUSTEROK)) goto done; if (bpa->b_bufsize != size) goto done; /* * Check to see if it is in the expected place on disk and that the * block has been mapped. */ if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno)) match = 1; done: BUF_UNLOCK(bpa); return (match); } /* * vfs_bio_awrite: * * Implement clustered async writes for clearing out B_DELWRI buffers. * This is much better then the old way of writing only one buffer at * a time. Note that we may not be presented with the buffers in the * correct order, so we search for the cluster in both directions. */ int vfs_bio_awrite(struct buf * bp) { int i; int j; daddr_t lblkno = bp->b_lblkno; struct vnode *vp = bp->b_vp; int s; int ncl; int nwritten; int size; int maxcl; s = splbio(); /* * right now we support clustered writing only to regular files. If * we find a clusterable block we could be in the middle of a cluster * rather then at the beginning. */ if ((vp->v_type == VREG) && (vp->v_mount != 0) && /* Only on nodes that have the size info */ (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { size = vp->v_mount->mnt_stat.f_iosize; maxcl = MAXPHYS / size; VI_LOCK(vp); for (i = 1; i < maxcl; i++) if (vfs_bio_clcheck(vp, size, lblkno + i, bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0) break; for (j = 1; i + j <= maxcl && j <= lblkno; j++) if (vfs_bio_clcheck(vp, size, lblkno - j, bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0) break; VI_UNLOCK(vp); --j; ncl = i + j; /* * this is a possible cluster write */ if (ncl != 1) { BUF_UNLOCK(bp); nwritten = cluster_wbuild(vp, size, lblkno - j, ncl); splx(s); return nwritten; } } bremfree(bp); bp->b_flags |= B_ASYNC; splx(s); /* * default (old) behavior, writing out only one block * * XXX returns b_bufsize instead of b_bcount for nwritten? */ nwritten = bp->b_bufsize; (void) BUF_WRITE(bp); return nwritten; } /* * getnewbuf: * * Find and initialize a new buffer header, freeing up existing buffers * in the bufqueues as necessary. The new buffer is returned locked. * * Important: B_INVAL is not set. If the caller wishes to throw the * buffer away, the caller must set B_INVAL prior to calling brelse(). 
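*
* A minimal caller sketch (illustrative; getblk() and geteblk() below are
* the real callers):
*
*	bp = getnewbuf(slpflag, slptimeo, size, maxsize);
*	if (bp == NULL)
*		...			(sleep was interrupted; retry or fail)
*	...
*	bp->b_flags |= B_INVAL;		(only if throwing the buffer away)
*	brelse(bp);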
* * We block if: * We have insufficient buffer headers * We have insufficient buffer space * buffer_map is too fragmented ( space reservation fails ) * If we have to flush dirty buffers ( but we try to avoid this ) * * To avoid VFS layer recursion we do not flush dirty buffers ourselves. * Instead we ask the buf daemon to do it for us. We attempt to * avoid piecemeal wakeups of the pageout daemon. */ static struct buf * getnewbuf(int slpflag, int slptimeo, int size, int maxsize) { struct buf *bp; struct buf *nbp; int defrag = 0; int nqindex; static int flushingbufs; GIANT_REQUIRED; /* * We can't afford to block since we might be holding a vnode lock, * which may prevent system daemons from running. We deal with * low-memory situations by proactively returning memory and running * async I/O rather then sync I/O. */ atomic_add_int(&getnewbufcalls, 1); atomic_subtract_int(&getnewbufrestarts, 1); restart: atomic_add_int(&getnewbufrestarts, 1); /* * Setup for scan. If we do not have enough free buffers, * we setup a degenerate case that immediately fails. Note * that if we are specially marked process, we are allowed to * dip into our reserves. * * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN * * We start with EMPTYKVA. If the list is empty we backup to EMPTY. * However, there are a number of cases (defragging, reusing, ...) * where we cannot backup. */ mtx_lock(&bqlock); nqindex = QUEUE_EMPTYKVA; nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]); if (nbp == NULL) { /* * If no EMPTYKVA buffers and we are either * defragging or reusing, locate a CLEAN buffer * to free or reuse. If bufspace useage is low * skip this step so we can allocate a new buffer. */ if (defrag || bufspace >= lobufspace) { nqindex = QUEUE_CLEAN; nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]); } /* * If we could not find or were not allowed to reuse a * CLEAN buffer, check to see if it is ok to use an EMPTY * buffer. We can only use an EMPTY buffer if allocating * its KVA would not otherwise run us out of buffer space. */ if (nbp == NULL && defrag == 0 && bufspace + maxsize < hibufspace) { nqindex = QUEUE_EMPTY; nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); } } /* * Run scan, possibly freeing data and/or kva mappings on the fly * depending. */ while ((bp = nbp) != NULL) { int qindex = nqindex; /* * Calculate next bp ( we can only use it if we do not block * or do other fancy things ). */ if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) { switch(qindex) { case QUEUE_EMPTY: nqindex = QUEUE_EMPTYKVA; if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]))) break; /* FALLTHROUGH */ case QUEUE_EMPTYKVA: nqindex = QUEUE_CLEAN; if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]))) break; /* FALLTHROUGH */ case QUEUE_CLEAN: /* * nbp is NULL. */ break; } } if (bp->b_vp) { VI_LOCK(bp->b_vp); if (bp->b_vflags & BV_BKGRDINPROG) { VI_UNLOCK(bp->b_vp); continue; } VI_UNLOCK(bp->b_vp); } /* * Sanity Checks */ KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistant queue %d bp %p", qindex, bp)); /* * Note: we no longer distinguish between VMIO and non-VMIO * buffers. */ KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex)); /* * If we are defragging then we need a buffer with * b_kvasize != 0. XXX this situation should no longer * occur, if defrag is non-zero the buffer's b_kvasize * should also be non-zero at this point. XXX */ if (defrag && bp->b_kvasize == 0) { printf("Warning: defrag empty buffer %p\n", bp); continue; } /* * Start freeing the bp. This is somewhat involved. 
nbp * remains valid only for QUEUE_EMPTY[KVA] bp's. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) panic("getnewbuf: locked buf"); bremfreel(bp); mtx_unlock(&bqlock); if (qindex == QUEUE_CLEAN) { if (bp->b_flags & B_VMIO) { bp->b_flags &= ~B_ASYNC; vfs_vmio_release(bp); } if (bp->b_vp) brelvp(bp); } /* * NOTE: nbp is now entirely invalid. We can only restart * the scan from this point on. * * Get the rest of the buffer freed up. b_kva* is still * valid after this operation. */ if (bp->b_rcred != NOCRED) { crfree(bp->b_rcred); bp->b_rcred = NOCRED; } if (bp->b_wcred != NOCRED) { crfree(bp->b_wcred); bp->b_wcred = NOCRED; } if (LIST_FIRST(&bp->b_dep) != NULL) buf_deallocate(bp); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 3"); if (bp->b_bufsize) allocbuf(bp, 0); bp->b_flags = 0; bp->b_ioflags = 0; bp->b_xflags = 0; bp->b_vflags = 0; bp->b_dev = NODEV; bp->b_vp = NULL; bp->b_blkno = bp->b_lblkno = 0; bp->b_offset = NOOFFSET; bp->b_iodone = 0; bp->b_error = 0; bp->b_resid = 0; bp->b_bcount = 0; bp->b_npages = 0; bp->b_dirtyoff = bp->b_dirtyend = 0; bp->b_magic = B_MAGIC_BIO; bp->b_op = &buf_ops_bio; bp->b_object = NULL; LIST_INIT(&bp->b_dep); /* * If we are defragging then free the buffer. */ if (defrag) { bp->b_flags |= B_INVAL; bfreekva(bp); brelse(bp); defrag = 0; goto restart; } /* * If we are overcomitted then recover the buffer and its * KVM space. This occurs in rare situations when multiple * processes are blocked in getnewbuf() or allocbuf(). */ if (bufspace >= hibufspace) flushingbufs = 1; if (flushingbufs && bp->b_kvasize != 0) { bp->b_flags |= B_INVAL; bfreekva(bp); brelse(bp); goto restart; } if (bufspace < lobufspace) flushingbufs = 0; break; } /* * If we exhausted our list, sleep as appropriate. We may have to * wakeup various daemons and write out some dirty buffers. * * Generally we are sleeping due to insufficient buffer space. */ if (bp == NULL) { int flags; char *waitmsg; mtx_unlock(&bqlock); if (defrag) { flags = VFS_BIO_NEED_BUFSPACE; waitmsg = "nbufkv"; } else if (bufspace >= hibufspace) { waitmsg = "nbufbs"; flags = VFS_BIO_NEED_BUFSPACE; } else { waitmsg = "newbuf"; flags = VFS_BIO_NEED_ANY; } bd_speedup(); /* heeeelp */ mtx_lock(&nblock); needsbuffer |= flags; while (needsbuffer & flags) { if (msleep(&needsbuffer, &nblock, (PRIBIO + 4) | slpflag, waitmsg, slptimeo)) { mtx_unlock(&nblock); return (NULL); } } mtx_unlock(&nblock); } else { /* * We finally have a valid bp. We aren't quite out of the * woods, we still have to reserve kva space. In order * to keep fragmentation sane we only allocate kva in * BKVASIZE chunks. */ maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; if (maxsize != bp->b_kvasize) { vm_offset_t addr = 0; bfreekva(bp); if (vm_map_findspace(buffer_map, vm_map_min(buffer_map), maxsize, &addr)) { /* * Uh oh. Buffer map is to fragmented. We * must defragment the map. */ atomic_add_int(&bufdefragcnt, 1); defrag = 1; bp->b_flags |= B_INVAL; brelse(bp); goto restart; } if (addr) { vm_map_insert(buffer_map, NULL, 0, addr, addr + maxsize, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); bp->b_kvabase = (caddr_t) addr; bp->b_kvasize = maxsize; atomic_add_int(&bufspace, bp->b_kvasize); atomic_add_int(&bufreusecnt, 1); } } bp->b_saveaddr = bp->b_kvabase; bp->b_data = bp->b_saveaddr; } return(bp); } /* * buf_daemon: * * buffer flushing daemon. Buffers are normally flushed by the * update daemon but if it cannot keep up this process starts to * take the load in an attempt to prevent getnewbuf() from blocking. 
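*
* The daemon is kicked via bd_wakeup()/bd_speedup(): for example,
* getnewbuf() calls bd_speedup() before sleeping on needsbuffer, and
* bdwrite() calls bd_wakeup() with the midpoint between lodirtybuffers
* and hidirtybuffers as the trigger level.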
*/ static struct kproc_desc buf_kp = { "bufdaemon", buf_daemon, &bufdaemonproc }; SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp) static void buf_daemon() { int s; mtx_lock(&Giant); /* * This process needs to be suspended prior to shutdown sync. */ EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc, SHUTDOWN_PRI_LAST); /* * This process is allowed to take the buffer cache to the limit */ s = splbio(); mtx_lock(&bdlock); for (;;) { bd_request = 0; mtx_unlock(&bdlock); kthread_suspend_check(bufdaemonproc); /* * Do the flush. Limit the amount of in-transit I/O we * allow to build up, otherwise we would completely saturate * the I/O system. Wakeup any waiting processes before we * normally would so they can run in parallel with our drain. */ while (numdirtybuffers > lodirtybuffers) { if (flushbufqueues(0) == 0) { /* * Could not find any buffers without rollback * dependencies, so just write the first one * in the hopes of eventually making progress. */ flushbufqueues(1); break; } waitrunningbufspace(); numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2); } /* * Only clear bd_request if we have reached our low water * mark. The buf_daemon normally waits 1 second and * then incrementally flushes any dirty buffers that have * built up, within reason. * * If we were unable to hit our low water mark and couldn't * find any flushable buffers, we sleep half a second. * Otherwise we loop immediately. */ mtx_lock(&bdlock); if (numdirtybuffers <= lodirtybuffers) { /* * We reached our low water mark, reset the * request and sleep until we are needed again. * The sleep is just so the suspend code works. */ bd_request = 0; msleep(&bd_request, &bdlock, PVM, "psleep", hz); } else { /* * We couldn't find any flushable dirty buffers but * still have too many dirty buffers, we * have to sleep and try again. (rare) */ msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10); } } } /* * flushbufqueues: * * Try to flush a buffer in the dirty queue. We must be careful to * free up B_INVAL buffers instead of write them, which NFS is * particularly sensitive to. */ int flushwithdeps = 0; SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps, 0, "Number of buffers flushed with dependecies that require rollbacks"); static int flushbufqueues(int flushdeps) { struct thread *td = curthread; struct vnode *vp; struct mount *mp; struct buf *bp; int hasdeps; mtx_lock(&bqlock); TAILQ_FOREACH(bp, &bufqueues[QUEUE_DIRTY], b_freelist) { if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) continue; KASSERT((bp->b_flags & B_DELWRI), ("unexpected clean buffer %p", bp)); VI_LOCK(bp->b_vp); if ((bp->b_vflags & BV_BKGRDINPROG) != 0) { VI_UNLOCK(bp->b_vp); BUF_UNLOCK(bp); continue; } VI_UNLOCK(bp->b_vp); if (bp->b_flags & B_INVAL) { bremfreel(bp); mtx_unlock(&bqlock); brelse(bp); return (1); } if (LIST_FIRST(&bp->b_dep) != NULL && buf_countdeps(bp, 0)) { if (flushdeps == 0) { BUF_UNLOCK(bp); continue; } hasdeps = 1; } else hasdeps = 0; /* * We must hold the lock on a vnode before writing * one of its buffers. Otherwise we may confuse, or * in the case of a snapshot vnode, deadlock the * system. * * The lock order here is the reverse of the normal * of vnode followed by buf lock. This is ok because * the NOWAIT will prevent deadlock. 
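*
* In other words, the buf lock is already held here, so the vnode is only
* locked with LK_NOWAIT below; if that fails the buffer is unlocked and
* skipped rather than waited for.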
*/ vp = bp->b_vp; if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { BUF_UNLOCK(bp); continue; } if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT, td) == 0) { mtx_unlock(&bqlock); vfs_bio_awrite(bp); vn_finished_write(mp); VOP_UNLOCK(vp, 0, td); flushwithdeps += hasdeps; return (1); } vn_finished_write(mp); BUF_UNLOCK(bp); } mtx_unlock(&bqlock); return (0); } /* * Check to see if a block is currently memory resident. */ struct buf * incore(struct vnode * vp, daddr_t blkno) { struct buf *bp; int s = splbio(); VI_LOCK(vp); bp = gbincore(vp, blkno); VI_UNLOCK(vp); splx(s); return (bp); } /* * Returns true if no I/O is needed to access the * associated VM object. This is like incore except * it also hunts around in the VM system for the data. */ int inmem(struct vnode * vp, daddr_t blkno) { vm_object_t obj; vm_offset_t toff, tinc, size; vm_page_t m; vm_ooffset_t off; GIANT_REQUIRED; ASSERT_VOP_LOCKED(vp, "inmem"); if (incore(vp, blkno)) return 1; if (vp->v_mount == NULL) return 0; if (VOP_GETVOBJECT(vp, &obj) != 0 || (vp->v_vflag & VV_OBJBUF) == 0) return 0; size = PAGE_SIZE; if (size > vp->v_mount->mnt_stat.f_iosize) size = vp->v_mount->mnt_stat.f_iosize; off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize; VM_OBJECT_LOCK(obj); for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { m = vm_page_lookup(obj, OFF_TO_IDX(off + toff)); if (!m) goto notinmem; tinc = size; if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK)) tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK); if (vm_page_is_valid(m, (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0) goto notinmem; } VM_OBJECT_UNLOCK(obj); return 1; notinmem: VM_OBJECT_UNLOCK(obj); return (0); } /* * vfs_setdirty: * * Sets the dirty range for a buffer based on the status of the dirty * bits in the pages comprising the buffer. * * The range is limited to the size of the buffer. * * This routine is primarily used by NFS, but is generalized for the * B_VMIO case. */ static void vfs_setdirty(struct buf *bp) { int i; vm_object_t object; GIANT_REQUIRED; /* * Degenerate case - empty buffer */ if (bp->b_bufsize == 0) return; /* * We qualify the scan for modified pages on whether the * object has been flushed yet. The OBJ_WRITEABLE flag * is not cleared simply by protecting pages off. */ if ((bp->b_flags & B_VMIO) == 0) return; object = bp->b_pages[0]->object; VM_OBJECT_LOCK(object); if ((object->flags & OBJ_WRITEABLE) && !(object->flags & OBJ_MIGHTBEDIRTY)) printf("Warning: object %p writeable but not mightbedirty\n", object); if (!(object->flags & OBJ_WRITEABLE) && (object->flags & OBJ_MIGHTBEDIRTY)) printf("Warning: object %p mightbedirty but not writeable\n", object); if (object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) { vm_offset_t boffset; vm_offset_t eoffset; vm_page_lock_queues(); /* * test the pages to see if they have been modified directly * by users through the VM system. */ for (i = 0; i < bp->b_npages; i++) { vm_page_flag_clear(bp->b_pages[i], PG_ZERO); vm_page_test_dirty(bp->b_pages[i]); } /* * Calculate the encompassing dirty range, boffset and eoffset, * (eoffset - boffset) bytes. */ for (i = 0; i < bp->b_npages; i++) { if (bp->b_pages[i]->dirty) break; } boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); for (i = bp->b_npages - 1; i >= 0; --i) { if (bp->b_pages[i]->dirty) { break; } } eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); vm_page_unlock_queues(); /* * Fit it to the buffer. 
*/ if (eoffset > bp->b_bcount) eoffset = bp->b_bcount; /* * If we have a good dirty range, merge with the existing * dirty range. */ if (boffset < eoffset) { if (bp->b_dirtyoff > boffset) bp->b_dirtyoff = boffset; if (bp->b_dirtyend < eoffset) bp->b_dirtyend = eoffset; } } VM_OBJECT_UNLOCK(object); } /* * getblk: * * Get a block given a specified block and offset into a file/device. * The buffers B_DONE bit will be cleared on return, making it almost * ready for an I/O initiation. B_INVAL may or may not be set on * return. The caller should clear B_INVAL prior to initiating a * READ. * * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for * an existing buffer. * * For a VMIO buffer, B_CACHE is modified according to the backing VM. * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set * and then cleared based on the backing VM. If the previous buffer is * non-0-sized but invalid, B_CACHE will be cleared. * * If getblk() must create a new buffer, the new buffer is returned with * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which * case it is returned with B_INVAL clear and B_CACHE set based on the * backing VM. * * getblk() also forces a BUF_WRITE() for any B_DELWRI buffer whos * B_CACHE bit is clear. * * What this means, basically, is that the caller should use B_CACHE to * determine whether the buffer is fully valid or not and should clear * B_INVAL prior to issuing a read. If the caller intends to validate * the buffer by loading its data area with something, the caller needs * to clear B_INVAL. If the caller does this without issuing an I/O, * the caller should set B_CACHE ( as an optimization ), else the caller * should issue the I/O and biodone() will set B_CACHE if the I/O was * a write attempt or if it was a successfull read. If the caller * intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR * prior to issuing the READ. biodone() will *not* clear B_INVAL. */ struct buf * getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo, int flags) { struct buf *bp; int s; int error; ASSERT_VOP_LOCKED(vp, "getblk"); if (size > MAXBSIZE) panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE); s = splbio(); loop: /* * Block if we are low on buffers. Certain processes are allowed * to completely exhaust the buffer cache. * * If this check ever becomes a bottleneck it may be better to * move it into the else, when gbincore() fails. At the moment * it isn't a problem. * * XXX remove if 0 sections (clean this up after its proven) */ if (numfreebuffers == 0) { if (curthread == PCPU_GET(idlethread)) return NULL; mtx_lock(&nblock); needsbuffer |= VFS_BIO_NEED_ANY; mtx_unlock(&nblock); } VI_LOCK(vp); if ((bp = gbincore(vp, blkno))) { int lockflags; /* * Buffer is in-core. If the buffer is not busy, it must * be on a queue. */ lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK; if (flags & GB_LOCK_NOWAIT) lockflags |= LK_NOWAIT; error = BUF_TIMELOCK(bp, lockflags, VI_MTX(vp), "getblk", slpflag, slptimeo); /* * If we slept and got the lock we have to restart in case * the buffer changed identities. */ if (error == ENOLCK) goto loop; /* We timed out or were interrupted. */ else if (error) return (NULL); /* * The buffer is locked. B_CACHE is cleared if the buffer is * invalid. Otherwise, for a non-VMIO buffer, B_CACHE is set * and for a VMIO buffer B_CACHE is adjusted according to the * backing VM cache. 
*/ if (bp->b_flags & B_INVAL) bp->b_flags &= ~B_CACHE; else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0) bp->b_flags |= B_CACHE; bremfree(bp); /* * check for size inconsistancies for non-VMIO case. */ if (bp->b_bcount != size) { if ((bp->b_flags & B_VMIO) == 0 || (size > bp->b_kvasize)) { if (bp->b_flags & B_DELWRI) { bp->b_flags |= B_NOCACHE; BUF_WRITE(bp); } else { if ((bp->b_flags & B_VMIO) && (LIST_FIRST(&bp->b_dep) == NULL)) { bp->b_flags |= B_RELBUF; brelse(bp); } else { bp->b_flags |= B_NOCACHE; BUF_WRITE(bp); } } goto loop; } } /* * If the size is inconsistant in the VMIO case, we can resize * the buffer. This might lead to B_CACHE getting set or * cleared. If the size has not changed, B_CACHE remains * unchanged from its previous state. */ if (bp->b_bcount != size) allocbuf(bp, size); KASSERT(bp->b_offset != NOOFFSET, ("getblk: no buffer offset")); /* * A buffer with B_DELWRI set and B_CACHE clear must * be committed before we can return the buffer in * order to prevent the caller from issuing a read * ( due to B_CACHE not being set ) and overwriting * it. * * Most callers, including NFS and FFS, need this to * operate properly either because they assume they * can issue a read if B_CACHE is not set, or because * ( for example ) an uncached B_DELWRI might loop due * to softupdates re-dirtying the buffer. In the latter * case, B_CACHE is set after the first write completes, * preventing further loops. * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE * above while extending the buffer, we cannot allow the * buffer to remain with B_CACHE set after the write * completes or it will represent a corrupt state. To * deal with this we set B_NOCACHE to scrap the buffer * after the write. * * We might be able to do something fancy, like setting * B_CACHE in bwrite() except if B_DELWRI is already set, * so the below call doesn't set B_CACHE, but that gets real * confusing. This is much easier. */ if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { bp->b_flags |= B_NOCACHE; BUF_WRITE(bp); goto loop; } splx(s); bp->b_flags &= ~B_DONE; } else { int bsize, maxsize, vmio; off_t offset; /* * Buffer is not in-core, create new buffer. The buffer * returned by getnewbuf() is locked. Note that the returned * buffer is also considered valid (not marked B_INVAL). */ VI_UNLOCK(vp); /* * If the user does not want us to create the buffer, bail out * here. */ if (flags & GB_NOCREAT) { splx(s); return NULL; } if (vn_isdisk(vp, NULL)) bsize = DEV_BSIZE; else if (vp->v_mountedhere) bsize = vp->v_mountedhere->mnt_stat.f_iosize; else if (vp->v_mount) bsize = vp->v_mount->mnt_stat.f_iosize; else bsize = size; offset = blkno * bsize; vmio = (VOP_GETVOBJECT(vp, NULL) == 0) && (vp->v_vflag & VV_OBJBUF); maxsize = vmio ? size + (offset & PAGE_MASK) : size; maxsize = imax(maxsize, bsize); if ((bp = getnewbuf(slpflag, slptimeo, size, maxsize)) == NULL) { if (slpflag || slptimeo) { splx(s); return NULL; } goto loop; } /* * This code is used to make sure that a buffer is not * created while the getnewbuf routine is blocked. * This can be a problem whether the vnode is locked or not. * If the buffer is created out from under us, we have to * throw away the one we just created. There is now window * race because we are safely running at splbio() from the * point of the duplicate buffer creation through to here, * and we've locked the buffer. 
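*
* If a duplicate buffer did appear, the gbincore() check below catches it;
* the buffer we just allocated is marked B_INVAL, released with brelse(),
* and the lookup is retried.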
* * Note: this must occur before we associate the buffer * with the vp especially considering limitations in * the splay tree implementation when dealing with duplicate * lblkno's. */ VI_LOCK(vp); if (gbincore(vp, blkno)) { VI_UNLOCK(vp); bp->b_flags |= B_INVAL; brelse(bp); goto loop; } /* * Insert the buffer into the hash, so that it can * be found by incore. */ bp->b_blkno = bp->b_lblkno = blkno; bp->b_offset = offset; bgetvp(vp, bp); VI_UNLOCK(vp); /* * set B_VMIO bit. allocbuf() the buffer bigger. Since the * buffer size starts out as 0, B_CACHE will be set by * allocbuf() for the VMIO case prior to it testing the * backing store for validity. */ if (vmio) { bp->b_flags |= B_VMIO; #if defined(VFS_BIO_DEBUG) if (vp->v_type != VREG) printf("getblk: vmioing file type %d???\n", vp->v_type); #endif VOP_GETVOBJECT(vp, &bp->b_object); } else { bp->b_flags &= ~B_VMIO; bp->b_object = NULL; } allocbuf(bp, size); splx(s); bp->b_flags &= ~B_DONE; } KASSERT(BUF_REFCNT(bp) == 1, ("getblk: bp %p not locked",bp)); return (bp); } /* * Get an empty, disassociated buffer of given size. The buffer is initially * set to B_INVAL. */ struct buf * geteblk(int size) { struct buf *bp; int s; int maxsize; maxsize = (size + BKVAMASK) & ~BKVAMASK; s = splbio(); while ((bp = getnewbuf(0, 0, size, maxsize)) == 0) continue; splx(s); allocbuf(bp, size); bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ KASSERT(BUF_REFCNT(bp) == 1, ("geteblk: bp %p not locked",bp)); return (bp); } /* * This code constitutes the buffer memory from either anonymous system * memory (in the case of non-VMIO operations) or from an associated * VM object (in the case of VMIO operations). This code is able to * resize a buffer up or down. * * Note that this code is tricky, and has many complications to resolve * deadlock or inconsistant data situations. Tread lightly!!! * There are B_CACHE and B_DELWRI interactions that must be dealt with by * the caller. Calling this code willy nilly can result in the loss of data. * * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with * B_CACHE for the non-VMIO case. */ int allocbuf(struct buf *bp, int size) { int newbsize, mbsize; int i; GIANT_REQUIRED; if (BUF_REFCNT(bp) == 0) panic("allocbuf: buffer not busy"); if (bp->b_kvasize < size) panic("allocbuf: buffer too small"); if ((bp->b_flags & B_VMIO) == 0) { caddr_t origbuf; int origbufsize; /* * Just get anonymous memory from the kernel. Don't * mess with B_CACHE. */ mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); if (bp->b_flags & B_MALLOC) newbsize = mbsize; else newbsize = round_page(size); if (newbsize < bp->b_bufsize) { /* * malloced buffers are not shrunk */ if (bp->b_flags & B_MALLOC) { if (newbsize) { bp->b_bcount = size; } else { free(bp->b_data, M_BIOBUF); if (bp->b_bufsize) { atomic_subtract_int( &bufmallocspace, bp->b_bufsize); bufspacewakeup(); bp->b_bufsize = 0; } bp->b_saveaddr = bp->b_kvabase; bp->b_data = bp->b_saveaddr; bp->b_bcount = 0; bp->b_flags &= ~B_MALLOC; } return 1; } vm_hold_free_pages( bp, (vm_offset_t) bp->b_data + newbsize, (vm_offset_t) bp->b_data + bp->b_bufsize); } else if (newbsize > bp->b_bufsize) { /* * We only use malloced memory on the first allocation. * and revert to page-allocated memory when the buffer * grows. */ /* * There is a potential smp race here that could lead * to bufmallocspace slightly passing the max. It * is probably extremely rare and not worth worrying * over. 
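*
* The malloc()-backed path below is only used for a buffer's very first
* allocation, when the request is at most half a page and the global
* malloced-buffer space is still under maxbufmallocspace; anything larger,
* or a buffer that later grows, uses the page-backed vm_hold_load_pages()
* scheme instead.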
*/ if ( (bufmallocspace < maxbufmallocspace) && (bp->b_bufsize == 0) && (mbsize <= PAGE_SIZE/2)) { bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK); bp->b_bufsize = mbsize; bp->b_bcount = size; bp->b_flags |= B_MALLOC; atomic_add_int(&bufmallocspace, mbsize); return 1; } origbuf = NULL; origbufsize = 0; /* * If the buffer is growing on its other-than-first allocation, * then we revert to the page-allocation scheme. */ if (bp->b_flags & B_MALLOC) { origbuf = bp->b_data; origbufsize = bp->b_bufsize; bp->b_data = bp->b_kvabase; if (bp->b_bufsize) { atomic_subtract_int(&bufmallocspace, bp->b_bufsize); bufspacewakeup(); bp->b_bufsize = 0; } bp->b_flags &= ~B_MALLOC; newbsize = round_page(newbsize); } vm_hold_load_pages( bp, (vm_offset_t) bp->b_data + bp->b_bufsize, (vm_offset_t) bp->b_data + newbsize); if (origbuf) { bcopy(origbuf, bp->b_data, origbufsize); free(origbuf, M_BIOBUF); } } } else { int desiredpages; newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); desiredpages = (size == 0) ? 0 : num_pages((bp->b_offset & PAGE_MASK) + newbsize); if (bp->b_flags & B_MALLOC) panic("allocbuf: VMIO buffer can't be malloced"); /* * Set B_CACHE initially if buffer is 0 length or will become * 0-length. */ if (size == 0 || bp->b_bufsize == 0) bp->b_flags |= B_CACHE; if (newbsize < bp->b_bufsize) { /* * DEV_BSIZE aligned new buffer size is less then the * DEV_BSIZE aligned existing buffer size. Figure out * if we have to remove any pages. */ if (desiredpages < bp->b_npages) { vm_page_t m; vm_page_lock_queues(); for (i = desiredpages; i < bp->b_npages; i++) { /* * the page is not freed here -- it * is the responsibility of * vnode_pager_setsize */ m = bp->b_pages[i]; KASSERT(m != bogus_page, ("allocbuf: bogus page found")); while (vm_page_sleep_if_busy(m, TRUE, "biodep")) vm_page_lock_queues(); bp->b_pages[i] = NULL; vm_page_unwire(m, 0); } vm_page_unlock_queues(); pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) + (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages)); bp->b_npages = desiredpages; } } else if (size > bp->b_bcount) { /* * We are growing the buffer, possibly in a * byte-granular fashion. */ struct vnode *vp; vm_object_t obj; vm_offset_t toff; vm_offset_t tinc; /* * Step 1, bring in the VM pages from the object, * allocating them if necessary. We must clear * B_CACHE if these pages are not valid for the * range covered by the buffer. */ vp = bp->b_vp; obj = bp->b_object; VM_OBJECT_LOCK(obj); while (bp->b_npages < desiredpages) { vm_page_t m; vm_pindex_t pi; pi = OFF_TO_IDX(bp->b_offset) + bp->b_npages; if ((m = vm_page_lookup(obj, pi)) == NULL) { /* * note: must allocate system pages * since blocking here could intefere * with paging I/O, no matter which * process we are. */ m = vm_page_alloc(obj, pi, VM_ALLOC_SYSTEM | VM_ALLOC_WIRED); if (m == NULL) { atomic_add_int(&vm_pageout_deficit, desiredpages - bp->b_npages); VM_OBJECT_UNLOCK(obj); VM_WAIT; VM_OBJECT_LOCK(obj); } else { vm_page_lock_queues(); vm_page_wakeup(m); vm_page_unlock_queues(); bp->b_flags &= ~B_CACHE; bp->b_pages[bp->b_npages] = m; ++bp->b_npages; } continue; } /* * We found a page. If we have to sleep on it, * retry because it might have gotten freed out * from under us. * * We can only test PG_BUSY here. Blocking on * m->busy might lead to a deadlock: * * vm_fault->getpages->cluster_read->allocbuf * */ vm_page_lock_queues(); if (vm_page_sleep_if_busy(m, FALSE, "pgtblk")) continue; /* * We have a good page. Should we wakeup the * page daemon? 
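*
* The heuristic below wakes the pagedaemon when the page we found is
* sitting on a PQ_CACHE queue and the system's free plus cached page
* counts have dropped below their combined minimum targets.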
*/ if ((curproc != pageproc) && ((m->queue - m->pc) == PQ_CACHE) && ((cnt.v_free_count + cnt.v_cache_count) < (cnt.v_free_min + cnt.v_cache_min))) { pagedaemon_wakeup(); } vm_page_flag_clear(m, PG_ZERO); vm_page_wire(m); vm_page_unlock_queues(); bp->b_pages[bp->b_npages] = m; ++bp->b_npages; } /* * Step 2. We've loaded the pages into the buffer, * we have to figure out if we can still have B_CACHE * set. Note that B_CACHE is set according to the * byte-granular range ( bcount and size ), new the * aligned range ( newbsize ). * * The VM test is against m->valid, which is DEV_BSIZE * aligned. Needless to say, the validity of the data * needs to also be DEV_BSIZE aligned. Note that this * fails with NFS if the server or some other client * extends the file's EOF. If our buffer is resized, * B_CACHE may remain set! XXX */ toff = bp->b_bcount; tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK); while ((bp->b_flags & B_CACHE) && toff < size) { vm_pindex_t pi; if (tinc > (size - toff)) tinc = size - toff; pi = ((bp->b_offset & PAGE_MASK) + toff) >> PAGE_SHIFT; vfs_buf_test_cache( bp, bp->b_offset, toff, tinc, bp->b_pages[pi] ); toff += tinc; tinc = PAGE_SIZE; } VM_OBJECT_UNLOCK(obj); /* * Step 3, fixup the KVM pmap. Remember that * bp->b_data is relative to bp->b_offset, but * bp->b_offset may be offset into the first page. */ bp->b_data = (caddr_t) trunc_page((vm_offset_t)bp->b_data); pmap_qenter( (vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages ); bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | (vm_offset_t)(bp->b_offset & PAGE_MASK)); } } if (newbsize < bp->b_bufsize) bufspacewakeup(); bp->b_bufsize = newbsize; /* actual buffer allocation */ bp->b_bcount = size; /* requested buffer size */ return 1; } void biodone(struct bio *bp) { mtx_lock(&bdonelock); bp->bio_flags |= BIO_DONE; if (bp->bio_done == NULL) wakeup(bp); mtx_unlock(&bdonelock); if (bp->bio_done != NULL) bp->bio_done(bp); } /* * Wait for a BIO to finish. * * XXX: resort to a timeout for now. The optimal locking (if any) for this * case is not yet clear. */ int biowait(struct bio *bp, const char *wchan) { mtx_lock(&bdonelock); while ((bp->bio_flags & BIO_DONE) == 0) msleep(bp, &bdonelock, PRIBIO, wchan, hz / 10); mtx_unlock(&bdonelock); if (bp->bio_error != 0) return (bp->bio_error); if (!(bp->bio_flags & BIO_ERROR)) return (0); return (EIO); } void biofinish(struct bio *bp, struct devstat *stat, int error) { if (error) { bp->bio_error = error; bp->bio_flags |= BIO_ERROR; } if (stat != NULL) devstat_end_transaction_bio(stat, bp); biodone(bp); } /* * bufwait: * * Wait for buffer I/O completion, returning error status. The buffer * is left locked and B_DONE on return. B_EINTR is converted into an EINTR * error and cleared. */ int bufwait(register struct buf * bp) { int s; s = splbio(); if (bp->b_iocmd == BIO_READ) bwait(bp, PRIBIO, "biord"); else bwait(bp, PRIBIO, "biowr"); splx(s); if (bp->b_flags & B_EINTR) { bp->b_flags &= ~B_EINTR; return (EINTR); } if (bp->b_ioflags & BIO_ERROR) { return (bp->b_error ? bp->b_error : EIO); } else { return (0); } } /* * Call back function from struct bio back up to struct buf. * The corresponding initialization lives in sys/conf.h:DEV_STRATEGY(). */ static void bufdonebio(struct bio *bp) { /* Device drivers may or may not hold giant, hold it here. 
*/ mtx_lock(&Giant); bufdone(bp->bio_caller2); mtx_unlock(&Giant); } void dev_strategy(struct buf *bp) { if ((!bp->b_iocmd) || (bp->b_iocmd & (bp->b_iocmd - 1))) panic("b_iocmd botch"); if (bp->b_flags & B_PHYS) bp->b_io.bio_offset = bp->b_offset; else bp->b_io.bio_offset = dbtob(bp->b_blkno); bp->b_io.bio_done = bufdonebio; bp->b_io.bio_caller2 = bp; (*devsw(bp->b_io.bio_dev)->d_strategy)(&bp->b_io); } /* * bufdone: * * Finish I/O on a buffer, optionally calling a completion function. * This is usually called from an interrupt so process blocking is * not allowed. * * biodone is also responsible for setting B_CACHE in a B_VMIO bp. * In a non-VMIO bp, B_CACHE will be set on the next getblk() * assuming B_INVAL is clear. * * For the VMIO case, we set B_CACHE if the op was a read and no * read error occured, or if the op was a write. B_CACHE is never * set if the buffer is invalid or otherwise uncacheable. * * biodone does not mess with B_INVAL, allowing the I/O routine or the * initiator to leave B_INVAL set to brelse the buffer out of existance * in the biodone routine. */ void bufdone(struct buf *bp) { int s; void (*biodone)(struct buf *); GIANT_REQUIRED; s = splbio(); KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, BUF_REFCNT(bp))); KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp)); bp->b_flags |= B_DONE; runningbufwakeup(bp); if (bp->b_iocmd == BIO_DELETE) { brelse(bp); splx(s); return; } if (bp->b_iocmd == BIO_WRITE) { vwakeup(bp); } /* call optional completion function if requested */ if (bp->b_iodone != NULL) { biodone = bp->b_iodone; bp->b_iodone = NULL; (*biodone) (bp); splx(s); return; } if (LIST_FIRST(&bp->b_dep) != NULL) buf_complete(bp); if (bp->b_flags & B_VMIO) { int i; vm_ooffset_t foff; vm_page_t m; vm_object_t obj; int iosize; struct vnode *vp = bp->b_vp; obj = bp->b_object; #if defined(VFS_BIO_DEBUG) mp_fixme("usecount and vflag accessed without locks."); if (vp->v_usecount == 0) { panic("biodone: zero vnode ref count"); } if ((vp->v_vflag & VV_OBJBUF) == 0) { panic("biodone: vnode is not setup for merged cache"); } #endif foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("biodone: no buffer offset")); VM_OBJECT_LOCK(obj); #if defined(VFS_BIO_DEBUG) if (obj->paging_in_progress < bp->b_npages) { printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n", obj->paging_in_progress, bp->b_npages); } #endif /* * Set B_CACHE if the op was a normal read and no error * occured. B_CACHE is set for writes in the b*write() * routines. */ iosize = bp->b_bcount - bp->b_resid; if (bp->b_iocmd == BIO_READ && !(bp->b_flags & (B_INVAL|B_NOCACHE)) && !(bp->b_ioflags & BIO_ERROR)) { bp->b_flags |= B_CACHE; } vm_page_lock_queues(); for (i = 0; i < bp->b_npages; i++) { int bogusflag = 0; int resid; resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff; if (resid > iosize) resid = iosize; /* * cleanup bogus pages, restoring the originals */ m = bp->b_pages[i]; if (m == bogus_page) { bogusflag = 1; m = vm_page_lookup(obj, OFF_TO_IDX(foff)); if (m == NULL) panic("biodone: page disappeared!"); bp->b_pages[i] = m; pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } #if defined(VFS_BIO_DEBUG) if (OFF_TO_IDX(foff) != m->pindex) { printf( "biodone: foff(%jd)/m->pindex(%ju) mismatch\n", (intmax_t)foff, (uintmax_t)m->pindex); } #endif /* * In the write case, the valid and clean bits are * already changed correctly ( see bdwrite() ), so we * only need to do this here in the read case. 
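 *
 * As a worked example of the per-page walk above (values invented
 * for illustration): with bp->b_offset = 0x1e00 and a 4K PAGE_SIZE,
 * the first iteration computes
 *
 *	resid = ((0x1e00 + 0x1000) & ~0xfff) - 0x1e00 = 0x200
 *
 * i.e. only the tail of the first page belongs to this buffer; every
 * later iteration advances foff to a page boundary and covers a full
 * 0x1000 bytes, except possibly the last, which is clipped to
 * whatever remains of iosize.
 *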
*/ if ((bp->b_iocmd == BIO_READ) && !bogusflag && resid > 0) { vfs_page_set_valid(bp, foff, i, m); } vm_page_flag_clear(m, PG_ZERO); /* * when debugging new filesystems or buffer I/O methods, this * is the most common error that pops up. if you see this, you * have not set the page busy flag correctly!!! */ if (m->busy == 0) { printf("biodone: page busy < 0, " "pindex: %d, foff: 0x(%x,%x), " "resid: %d, index: %d\n", (int) m->pindex, (int)(foff >> 32), (int) foff & 0xffffffff, resid, i); if (!vn_isdisk(vp, NULL)) - printf(" iosize: %ld, lblkno: %jd, flags: 0x%x, npages: %d\n", - bp->b_vp->v_mount->mnt_stat.f_iosize, + printf(" iosize: %jd, lblkno: %jd, flags: 0x%x, npages: %d\n", + (intmax_t)bp->b_vp->v_mount->mnt_stat.f_iosize, (intmax_t) bp->b_lblkno, bp->b_flags, bp->b_npages); else printf(" VDEV, lblkno: %jd, flags: 0x%x, npages: %d\n", (intmax_t) bp->b_lblkno, bp->b_flags, bp->b_npages); printf(" valid: 0x%lx, dirty: 0x%lx, wired: %d\n", (u_long)m->valid, (u_long)m->dirty, m->wire_count); panic("biodone: page busy < 0\n"); } vm_page_io_finish(m); vm_object_pip_subtract(obj, 1); foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; iosize -= resid; } vm_page_unlock_queues(); vm_object_pip_wakeupn(obj, 0); VM_OBJECT_UNLOCK(obj); } /* * For asynchronous completions, release the buffer now. The brelse * will do a wakeup there if necessary - so no need to do a wakeup * here in the async case. The sync case always needs to do a wakeup. */ if (bp->b_flags & B_ASYNC) { if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR)) brelse(bp); else bqrelse(bp); } else { bdone(bp); } splx(s); } /* * This routine is called in lieu of iodone in the case of * incomplete I/O. This keeps the busy status for pages * consistant. */ void vfs_unbusy_pages(struct buf * bp) { int i; GIANT_REQUIRED; runningbufwakeup(bp); if (bp->b_flags & B_VMIO) { vm_object_t obj; obj = bp->b_object; VM_OBJECT_LOCK(obj); vm_page_lock_queues(); for (i = 0; i < bp->b_npages; i++) { vm_page_t m = bp->b_pages[i]; if (m == bogus_page) { m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i); if (!m) { panic("vfs_unbusy_pages: page missing\n"); } bp->b_pages[i] = m; pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } vm_object_pip_subtract(obj, 1); vm_page_flag_clear(m, PG_ZERO); vm_page_io_finish(m); } vm_page_unlock_queues(); vm_object_pip_wakeupn(obj, 0); VM_OBJECT_UNLOCK(obj); } } /* * vfs_page_set_valid: * * Set the valid bits in a page based on the supplied offset. The * range is restricted to the buffer's size. * * This routine is typically called after a read completes. */ static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m) { vm_ooffset_t soff, eoff; mtx_assert(&vm_page_queue_mtx, MA_OWNED); /* * Start and end offsets in buffer. eoff - soff may not cross a * page boundry or cross the end of the buffer. The end of the * buffer, in this case, is our file EOF, not the allocation size * of the buffer. */ soff = off; eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; /* * Set valid range. This is typically the entire buffer and thus the * entire page. */ if (eoff > soff) { vm_page_set_validclean( m, (vm_offset_t) (soff & PAGE_MASK), (vm_offset_t) (eoff - soff) ); } } /* * This routine is called before a device strategy routine. * It is used to tell the VM system that paging I/O is in * progress, and treat the pages associated with the buffer * almost as being PG_BUSY. 
Also the object paging_in_progress * flag is handled to make sure that the object doesn't become * inconsistant. * * Since I/O has not been initiated yet, certain buffer flags * such as BIO_ERROR or B_INVAL may be in an inconsistant state * and should be ignored. */ void vfs_busy_pages(struct buf * bp, int clear_modify) { int i, bogus; if (bp->b_flags & B_VMIO) { vm_object_t obj; vm_ooffset_t foff; obj = bp->b_object; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_busy_pages: no buffer offset")); vfs_setdirty(bp); VM_OBJECT_LOCK(obj); retry: vm_page_lock_queues(); for (i = 0; i < bp->b_npages; i++) { vm_page_t m = bp->b_pages[i]; if (vm_page_sleep_if_busy(m, FALSE, "vbpage")) goto retry; } bogus = 0; for (i = 0; i < bp->b_npages; i++) { vm_page_t m = bp->b_pages[i]; vm_page_flag_clear(m, PG_ZERO); if ((bp->b_flags & B_CLUSTER) == 0) { vm_object_pip_add(obj, 1); vm_page_io_start(m); } /* * When readying a buffer for a read ( i.e * clear_modify == 0 ), it is important to do * bogus_page replacement for valid pages in * partially instantiated buffers. Partially * instantiated buffers can, in turn, occur when * reconstituting a buffer from its VM backing store * base. We only have to do this if B_CACHE is * clear ( which causes the I/O to occur in the * first place ). The replacement prevents the read * I/O from overwriting potentially dirty VM-backed * pages. XXX bogus page replacement is, uh, bogus. * It may not work properly with small-block devices. * We need to find a better way. */ pmap_remove_all(m); if (clear_modify) vfs_page_set_valid(bp, foff, i, m); else if (m->valid == VM_PAGE_BITS_ALL && (bp->b_flags & B_CACHE) == 0) { bp->b_pages[i] = bogus_page; bogus++; } foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(obj); if (bogus) pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } } /* * Tell the VM system that the pages associated with this buffer * are clean. This is used for delayed writes where the data is * going to go to disk eventually without additional VM intevention. * * Note that while we only really need to clean through to b_bcount, we * just go ahead and clean through to b_bufsize. */ static void vfs_clean_pages(struct buf * bp) { int i; if (bp->b_flags & B_VMIO) { vm_ooffset_t foff; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_clean_pages: no buffer offset")); VM_OBJECT_LOCK(bp->b_object); vm_page_lock_queues(); for (i = 0; i < bp->b_npages; i++) { vm_page_t m = bp->b_pages[i]; vm_ooffset_t noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; vm_ooffset_t eoff = noff; if (eoff > bp->b_offset + bp->b_bufsize) eoff = bp->b_offset + bp->b_bufsize; vfs_page_set_valid(bp, foff, i, m); /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */ foff = noff; } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(bp->b_object); } } /* * vfs_bio_set_validclean: * * Set the range within the buffer to valid and clean. The range is * relative to the beginning of the buffer, b_offset. Note that b_offset * itself may be offset from the beginning of the first page. * */ void vfs_bio_set_validclean(struct buf *bp, int base, int size) { if (bp->b_flags & B_VMIO) { int i; int n; /* * Fixup base to be relative to beginning of first page. * Set initial n to be the maximum number of bytes in the * first page that can be validated. 
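 *
 * For instance (illustrative values only): with 4K pages, if
 * (bp->b_offset & PAGE_MASK) == 0x200 and the caller passes
 * base = 0x300, size = 0x1800, then
 *
 *	base = 0x300 + 0x200 = 0x500
 *	n    = 0x1000 - 0x500 = 0xb00
 *
 * so the first page (i = base / PAGE_SIZE = 0) gets bytes
 * [0x500, 0x1000) marked valid and clean, and the loop continues
 * into page 1 with the remaining 0xd00 bytes and n reset to
 * PAGE_SIZE.
 *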
*/ base += (bp->b_offset & PAGE_MASK); n = PAGE_SIZE - (base & PAGE_MASK); VM_OBJECT_LOCK(bp->b_object); vm_page_lock_queues(); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { vm_page_t m = bp->b_pages[i]; if (n > size) n = size; vm_page_set_validclean(m, base & PAGE_MASK, n); base += n; size -= n; n = PAGE_SIZE; } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(bp->b_object); } } /* * vfs_bio_clrbuf: * * clear a buffer. This routine essentially fakes an I/O, so we need * to clear BIO_ERROR and B_INVAL. * * Note that while we only theoretically need to clear through b_bcount, * we go ahead and clear through b_bufsize. */ void vfs_bio_clrbuf(struct buf *bp) { int i, mask = 0; caddr_t sa, ea; GIANT_REQUIRED; if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) { bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; if (bp->b_object != NULL) VM_OBJECT_LOCK(bp->b_object); if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) && (bp->b_offset & PAGE_MASK) == 0) { mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1; if (bp->b_pages[0] != bogus_page) VM_OBJECT_LOCK_ASSERT(bp->b_pages[0]->object, MA_OWNED); if ((bp->b_pages[0]->valid & mask) == mask) goto unlock; if (((bp->b_pages[0]->flags & PG_ZERO) == 0) && ((bp->b_pages[0]->valid & mask) == 0)) { bzero(bp->b_data, bp->b_bufsize); bp->b_pages[0]->valid |= mask; goto unlock; } } ea = sa = bp->b_data; for(i=0;i<bp->b_npages;i++,sa=ea) { int j = ((vm_offset_t)sa & PAGE_MASK) / DEV_BSIZE; ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE); ea = (caddr_t)(vm_offset_t)ulmin( (u_long)(vm_offset_t)ea, (u_long)(vm_offset_t)bp->b_data + bp->b_bufsize); mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; if (bp->b_pages[i] != bogus_page) VM_OBJECT_LOCK_ASSERT(bp->b_pages[i]->object, MA_OWNED); if ((bp->b_pages[i]->valid & mask) == mask) continue; if ((bp->b_pages[i]->valid & mask) == 0) { if ((bp->b_pages[i]->flags & PG_ZERO) == 0) { bzero(sa, ea - sa); } } else { for (; sa < ea; sa += DEV_BSIZE, j++) { if (((bp->b_pages[i]->flags & PG_ZERO) == 0) && (bp->b_pages[i]->valid & (1<<j)) == 0) bzero(sa, DEV_BSIZE); } } bp->b_pages[i]->valid |= mask; vm_page_lock_queues(); vm_page_flag_clear(bp->b_pages[i], PG_ZERO); vm_page_unlock_queues(); } unlock: if (bp->b_object != NULL) VM_OBJECT_UNLOCK(bp->b_object); bp->b_resid = 0; } else { clrbuf(bp); } } /* * vm_hold_load_pages and vm_hold_free_pages get pages into * a buffers address space. The pages are anonymous and are * not associated with a file object. */ static void vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; int index; to = round_page(to); from = round_page(from); index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; VM_OBJECT_LOCK(kernel_object); for (pg = from; pg < to; pg += PAGE_SIZE, index++) { tryagain: /* * note: must allocate system pages since blocking here * could intefere with paging I/O, no matter which * process we are.
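 *
 * (Informal gloss: VM_ALLOC_SYSTEM lets the allocation dig deeper
 * into the free-page reserve than an ordinary VM_ALLOC_NORMAL
 * request would, and VM_ALLOC_WIRED hands the page back already
 * wired, so the only remaining failure mode is the VM_WAIT/tryagain
 * loop below.  The pindex passed in,
 * (pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT, simply names the page
 * by its offset within kernel_object, i.e. by its kernel virtual
 * address.)
 *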
*/ p = vm_page_alloc(kernel_object, ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), VM_ALLOC_SYSTEM | VM_ALLOC_WIRED); if (!p) { atomic_add_int(&vm_pageout_deficit, (to - pg) >> PAGE_SHIFT); VM_OBJECT_UNLOCK(kernel_object); VM_WAIT; VM_OBJECT_LOCK(kernel_object); goto tryagain; } p->valid = VM_PAGE_BITS_ALL; pmap_qenter(pg, &p, 1); bp->b_pages[index] = p; vm_page_lock_queues(); vm_page_wakeup(p); vm_page_unlock_queues(); } VM_OBJECT_UNLOCK(kernel_object); bp->b_npages = index; } /* Return pages associated with this buf to the vm system */ static void vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; int index, newnpages; GIANT_REQUIRED; from = round_page(from); to = round_page(to); newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; VM_OBJECT_LOCK(kernel_object); for (pg = from; pg < to; pg += PAGE_SIZE, index++) { p = bp->b_pages[index]; if (p && (index < bp->b_npages)) { if (p->busy) { printf( "vm_hold_free_pages: blkno: %jd, lblkno: %jd\n", (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno); } bp->b_pages[index] = NULL; pmap_qremove(pg, 1); vm_page_lock_queues(); vm_page_busy(p); vm_page_unwire(p, 0); vm_page_free(p); vm_page_unlock_queues(); } } VM_OBJECT_UNLOCK(kernel_object); bp->b_npages = newnpages; } /* * Map an IO request into kernel virtual address space. * * All requests are (re)mapped into kernel VA space. * Notice that we use b_bufsize for the size of the buffer * to be mapped. b_bcount might be modified by the driver. * * Note that even if the caller determines that the address space should * be valid, a race or a smaller-file mapped into a larger space may * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST * check the return value. */ int vmapbuf(struct buf *bp) { caddr_t addr, kva; vm_prot_t prot; int pidx, i; struct vm_page *m; struct pmap *pmap = &curproc->p_vmspace->vm_pmap; GIANT_REQUIRED; if (bp->b_bufsize < 0) return (-1); prot = (bp->b_iocmd == BIO_READ) ? VM_PROT_READ | VM_PROT_WRITE : VM_PROT_READ; for (addr = (caddr_t)trunc_page((vm_offset_t)bp->b_data), pidx = 0; addr < bp->b_data + bp->b_bufsize; addr += PAGE_SIZE, pidx++) { /* * Do the vm_fault if needed; do the copy-on-write thing * when reading stuff off device into memory. * * NOTE! Must use pmap_extract() because addr may be in * the userland address space, and kextract is only guarenteed * to work for the kernland address space (see: sparc64 port). */ retry: if (vm_fault_quick(addr >= bp->b_data ? addr : bp->b_data, prot) < 0) { vm_page_lock_queues(); for (i = 0; i < pidx; ++i) { vm_page_unhold(bp->b_pages[i]); bp->b_pages[i] = NULL; } vm_page_unlock_queues(); return(-1); } m = pmap_extract_and_hold(pmap, (vm_offset_t)addr, prot); if (m == NULL) goto retry; bp->b_pages[pidx] = m; } if (pidx > btoc(MAXPHYS)) panic("vmapbuf: mapped more than MAXPHYS"); pmap_qenter((vm_offset_t)bp->b_saveaddr, bp->b_pages, pidx); kva = bp->b_saveaddr; bp->b_npages = pidx; bp->b_saveaddr = bp->b_data; bp->b_data = kva + (((vm_offset_t) bp->b_data) & PAGE_MASK); return(0); } /* * Free the io map PTEs associated with this IO operation. * We also invalidate the TLB entries and restore the original b_addr. 
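 *
 * The usual pairing, sketched informally (a physio(9)-style caller;
 * error handling omitted and names invented):
 *
 *	bp->b_data    = user_buffer;	-- userland address
 *	bp->b_bufsize = user_length;
 *	if (vmapbuf(bp) < 0)		-- fault in, hold, remap into KVA
 *		return (EFAULT);
 *	(hand bp to the driver and wait for the I/O to complete)
 *	vunmapbuf(bp);			-- drop the mappings, unhold the
 *					   pages, restore the original b_data
 *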
*/ void vunmapbuf(struct buf *bp) { int pidx; int npages; GIANT_REQUIRED; npages = bp->b_npages; pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages); vm_page_lock_queues(); for (pidx = 0; pidx < npages; pidx++) vm_page_unhold(bp->b_pages[pidx]); vm_page_unlock_queues(); bp->b_data = bp->b_saveaddr; } void bdone(struct buf *bp) { mtx_lock(&bdonelock); bp->b_flags |= B_DONE; wakeup(bp); mtx_unlock(&bdonelock); } void bwait(struct buf *bp, u_char pri, const char *wchan) { mtx_lock(&bdonelock); while ((bp->b_flags & B_DONE) == 0) msleep(bp, &bdonelock, pri, wchan, 0); mtx_unlock(&bdonelock); } #include "opt_ddb.h" #ifdef DDB #include /* DDB command to show buffer data */ DB_SHOW_COMMAND(buffer, db_show_buffer) { /* get args */ struct buf *bp = (struct buf *)addr; if (!have_addr) { db_printf("usage: show buffer \n"); return; } db_printf("b_flags = 0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS); db_printf( "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n" "b_dev = (%d,%d), b_data = %p, b_blkno = %jd\n", bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, major(bp->b_dev), minor(bp->b_dev), bp->b_data, (intmax_t)bp->b_blkno); if (bp->b_npages) { int i; db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages); for (i = 0; i < bp->b_npages; i++) { vm_page_t m; m = bp->b_pages[i]; db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object, (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m)); if ((i + 1) < bp->b_npages) db_printf(","); } db_printf("\n"); } } #endif /* DDB */ Index: head/sys/kern/vfs_cluster.c =================================================================== --- head/sys/kern/vfs_cluster.c (revision 122536) +++ head/sys/kern/vfs_cluster.c (revision 122537) @@ -1,1035 +1,1035 @@ /*- * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * Modifications/enhancements: * Copyright (c) 1995 John S. Dyson. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_debug_cluster.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(CLUSTERDEBUG) static int rcluster= 0; SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, "Debug VFS clustering code"); #endif static MALLOC_DEFINE(M_SEGMENT, "cluster_save buffer", "cluster_save buffer"); static struct cluster_save * cluster_collectbufs(struct vnode *vp, struct buf *last_bp); static struct buf * cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn, daddr_t blkno, long size, int run, struct buf *fbp); static int write_behind = 1; SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0, "Cluster write-behind; 0: disable, 1: enable, 2: backed off"); static int read_max = 8; SYSCTL_INT(_vfs, OID_AUTO, read_max, CTLFLAG_RW, &read_max, 0, "Cluster read-ahead max block count"); /* Page expended to mark partially backed buffers */ extern vm_page_t bogus_page; /* * Number of physical bufs (pbufs) this subsystem is allowed. * Manipulated by vm_pager.c */ extern int cluster_pbuf_freecnt; /* * Read data to a buf, including read-ahead if we find this to be beneficial. * cluster_read replaces bread. */ int cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp) struct vnode *vp; u_quad_t filesize; daddr_t lblkno; long size; struct ucred *cred; long totread; int seqcount; struct buf **bpp; { struct buf *bp, *rbp, *reqbp; daddr_t blkno, origblkno; int maxra, racluster; int error, ncontig; int i; error = 0; /* * Try to limit the amount of read-ahead by a few * ad-hoc parameters. This needs work!!! */ racluster = vp->v_mount->mnt_iosize_max / size; maxra = seqcount; maxra = min(read_max, maxra); maxra = min(nbuf/8, maxra); if (((u_quad_t)(lblkno + maxra + 1) * size) > filesize) maxra = (filesize / size) - lblkno; /* * get the requested block */ *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0, 0); origblkno = lblkno; /* * if it is in the cache, then check to see if the reads have been * sequential. If they have, then try some read-ahead, otherwise * back-off on prospective read-aheads. */ if (bp->b_flags & B_CACHE) { if (!seqcount) { return 0; } else if ((bp->b_flags & B_RAM) == 0) { return 0; } else { int s; bp->b_flags &= ~B_RAM; /* * We do the spl here so that there is no window * between the incore and the b_usecount increment * below. We opt to keep the spl out of the loop * for efficiency. */ s = splbio(); VI_LOCK(vp); for (i = 1; i < maxra; i++) { /* * Stop if the buffer does not exist or it * is invalid (about to go away?) */ rbp = gbincore(vp, lblkno+i); if (rbp == NULL || (rbp->b_flags & B_INVAL)) break; /* * Set another read-ahead mark so we know * to check again. 
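 *
 * Example (made-up values): with racluster = 4 and maxra = 8 the
 * scan below re-marks the already-cached read-ahead buffers at
 * i = 3 and i = 7, i.e. every racluster'th buffer, with the last
 * buffer scanned (i = maxra - 1) always marked.  A later sequential
 * read that hits one of those B_RAM buffers then triggers another
 * round of read-ahead instead of silently running off the end of
 * what was read ahead this time.
 *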
*/ if (((i % racluster) == (racluster - 1)) || (i == (maxra - 1))) rbp->b_flags |= B_RAM; } VI_UNLOCK(vp); splx(s); if (i >= maxra) { return 0; } lblkno += i; } reqbp = bp = NULL; /* * If it isn't in the cache, then get a chunk from * disk if sequential, otherwise just get the block. */ } else { off_t firstread = bp->b_offset; int nblks; KASSERT(bp->b_offset != NOOFFSET, ("cluster_read: no buffer offset")); ncontig = 0; /* * Compute the total number of blocks that we should read * synchronously. */ if (firstread + totread > filesize) totread = filesize - firstread; nblks = howmany(totread, size); if (nblks > racluster) nblks = racluster; /* * Now compute the number of contiguous blocks. */ if (nblks > 1) { error = VOP_BMAP(vp, lblkno, NULL, &blkno, &ncontig, NULL); /* * If this failed to map just do the original block. */ if (error || blkno == -1) ncontig = 0; } /* * If we have contiguous data available do a cluster * otherwise just read the requested block. */ if (ncontig) { /* Account for our first block. */ ncontig = min(ncontig + 1, nblks); if (ncontig < nblks) nblks = ncontig; bp = cluster_rbuild(vp, filesize, lblkno, blkno, size, nblks, bp); lblkno += (bp->b_bufsize / size); } else { bp->b_flags |= B_RAM; bp->b_iocmd = BIO_READ; lblkno += 1; } } /* * handle the synchronous read so that it is available ASAP. */ if (bp) { if ((bp->b_flags & B_CLUSTER) == 0) { vfs_busy_pages(bp, 0); } bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; if ((bp->b_flags & B_ASYNC) || bp->b_iodone != NULL) BUF_KERNPROC(bp); bp->b_iooffset = dbtob(bp->b_blkno); error = VOP_STRATEGY(vp, bp); curproc->p_stats->p_ru.ru_inblock++; if (error) return (error); } /* * If we have been doing sequential I/O, then do some read-ahead. */ while (lblkno < (origblkno + maxra)) { error = VOP_BMAP(vp, lblkno, NULL, &blkno, &ncontig, NULL); if (error) break; if (blkno == -1) break; /* * We could throttle ncontig here by maxra but we might as * well read the data if it is contiguous. We're throttled * by racluster anyway. */ if (ncontig) { ncontig = min(ncontig + 1, racluster); rbp = cluster_rbuild(vp, filesize, lblkno, blkno, size, ncontig, NULL); lblkno += (rbp->b_bufsize / size); if (rbp->b_flags & B_DELWRI) { bqrelse(rbp); continue; } } else { rbp = getblk(vp, lblkno, size, 0, 0, 0); lblkno += 1; if (rbp->b_flags & B_DELWRI) { bqrelse(rbp); continue; } rbp->b_flags |= B_ASYNC | B_RAM; rbp->b_iocmd = BIO_READ; rbp->b_blkno = blkno; } if (rbp->b_flags & B_CACHE) { rbp->b_flags &= ~B_ASYNC; bqrelse(rbp); continue; } if ((rbp->b_flags & B_CLUSTER) == 0) { vfs_busy_pages(rbp, 0); } rbp->b_flags &= ~B_INVAL; rbp->b_ioflags &= ~BIO_ERROR; if ((rbp->b_flags & B_ASYNC) || rbp->b_iodone != NULL) BUF_KERNPROC(rbp); rbp->b_iooffset = dbtob(rbp->b_blkno); (void) VOP_STRATEGY(vp, rbp); curproc->p_stats->p_ru.ru_inblock++; } if (reqbp) return (bufwait(reqbp)); else return (error); } /* * If blocks are contiguous on disk, use this to provide clustered * read ahead. We will read as many blocks as possible sequentially * and then parcel them up into logical blocks in the buffer hash table. 
*/ static struct buf * cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp) struct vnode *vp; u_quad_t filesize; daddr_t lbn; daddr_t blkno; long size; int run; struct buf *fbp; { struct buf *bp, *tbp; daddr_t bn; int i, inc, j; GIANT_REQUIRED; KASSERT(size == vp->v_mount->mnt_stat.f_iosize, - ("cluster_rbuild: size %ld != filesize %ld\n", - size, vp->v_mount->mnt_stat.f_iosize)); + ("cluster_rbuild: size %ld != filesize %jd\n", + size, (intmax_t)vp->v_mount->mnt_stat.f_iosize)); /* * avoid a division */ while ((u_quad_t) size * (lbn + run) > filesize) { --run; } if (fbp) { tbp = fbp; tbp->b_iocmd = BIO_READ; } else { tbp = getblk(vp, lbn, size, 0, 0, 0); if (tbp->b_flags & B_CACHE) return tbp; tbp->b_flags |= B_ASYNC | B_RAM; tbp->b_iocmd = BIO_READ; } tbp->b_blkno = blkno; if( (tbp->b_flags & B_MALLOC) || ((tbp->b_flags & B_VMIO) == 0) || (run <= 1) ) return tbp; bp = trypbuf(&cluster_pbuf_freecnt); if (bp == 0) return tbp; /* * We are synthesizing a buffer out of vm_page_t's, but * if the block size is not page aligned then the starting * address may not be either. Inherit the b_data offset * from the original buffer. */ bp->b_data = (char *)((vm_offset_t)bp->b_data | ((vm_offset_t)tbp->b_data & PAGE_MASK)); bp->b_flags = B_ASYNC | B_CLUSTER | B_VMIO; bp->b_iocmd = BIO_READ; bp->b_iodone = cluster_callback; bp->b_blkno = blkno; bp->b_lblkno = lbn; bp->b_offset = tbp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("cluster_rbuild: no buffer offset")); pbgetvp(vp, bp); TAILQ_INIT(&bp->b_cluster.cluster_head); bp->b_bcount = 0; bp->b_bufsize = 0; bp->b_npages = 0; inc = btodb(size); for (bn = blkno, i = 0; i < run; ++i, bn += inc) { if (i != 0) { if ((bp->b_npages * PAGE_SIZE) + round_page(size) > vp->v_mount->mnt_iosize_max) { break; } tbp = getblk(vp, lbn + i, size, 0, 0, GB_LOCK_NOWAIT); /* Don't wait around for locked bufs. */ if (tbp == NULL) break; /* * Stop scanning if the buffer is fully valid * (marked B_CACHE), or locked (may be doing a * background write), or if the buffer is not * VMIO backed. The clustering code can only deal * with VMIO-backed buffers. */ VI_LOCK(bp->b_vp); if ((tbp->b_vflags & BV_BKGRDINPROG) || (tbp->b_flags & B_CACHE) || (tbp->b_flags & B_VMIO) == 0) { VI_UNLOCK(bp->b_vp); bqrelse(tbp); break; } VI_UNLOCK(bp->b_vp); /* * The buffer must be completely invalid in order to * take part in the cluster. If it is partially valid * then we stop. */ VM_OBJECT_LOCK(tbp->b_object); for (j = 0;j < tbp->b_npages; j++) { VM_OBJECT_LOCK_ASSERT(tbp->b_pages[j]->object, MA_OWNED); if (tbp->b_pages[j]->valid) break; } VM_OBJECT_UNLOCK(tbp->b_object); if (j != tbp->b_npages) { bqrelse(tbp); break; } /* * Set a read-ahead mark as appropriate */ if ((fbp && (i == 1)) || (i == (run - 1))) tbp->b_flags |= B_RAM; /* * Set the buffer up for an async read (XXX should * we do this only if we do not wind up brelse()ing?). * Set the block number if it isn't set, otherwise * if it is make sure it matches the block number we * expect. 
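 *
 * (Gloss: a buffer whose b_blkno still equals b_lblkno has never
 * been translated by VOP_BMAP, so it is safe to assign it the
 * physical block we expect, bn = blkno + i * btodb(size).  If it has
 * already been translated and disagrees with bn, the file is not
 * actually contiguous here, so the buffer is released and the
 * cluster is terminated at the previous block.)
 *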
*/ tbp->b_flags |= B_ASYNC; tbp->b_iocmd = BIO_READ; if (tbp->b_blkno == tbp->b_lblkno) { tbp->b_blkno = bn; } else if (tbp->b_blkno != bn) { brelse(tbp); break; } } /* * XXX fbp from caller may not be B_ASYNC, but we are going * to biodone() it in cluster_callback() anyway */ BUF_KERNPROC(tbp); TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, tbp, b_cluster.cluster_entry); VM_OBJECT_LOCK(tbp->b_object); vm_page_lock_queues(); for (j = 0; j < tbp->b_npages; j += 1) { vm_page_t m; m = tbp->b_pages[j]; vm_page_io_start(m); vm_object_pip_add(m->object, 1); if ((bp->b_npages == 0) || (bp->b_pages[bp->b_npages-1] != m)) { bp->b_pages[bp->b_npages] = m; bp->b_npages++; } if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) tbp->b_pages[j] = bogus_page; } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(tbp->b_object); /* * XXX shouldn't this be += size for both, like in * cluster_wbuild()? * * Don't inherit tbp->b_bufsize as it may be larger due to * a non-page-aligned size. Instead just aggregate using * 'size'. */ if (tbp->b_bcount != size) printf("warning: tbp->b_bcount wrong %ld vs %ld\n", tbp->b_bcount, size); if (tbp->b_bufsize != size) printf("warning: tbp->b_bufsize wrong %ld vs %ld\n", tbp->b_bufsize, size); bp->b_bcount += size; bp->b_bufsize += size; } /* * Fully valid pages in the cluster are already good and do not need * to be re-read from disk. Replace the page with bogus_page */ VM_OBJECT_LOCK(bp->b_object); for (j = 0; j < bp->b_npages; j++) { VM_OBJECT_LOCK_ASSERT(bp->b_pages[j]->object, MA_OWNED); if ((bp->b_pages[j]->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) { bp->b_pages[j] = bogus_page; } } VM_OBJECT_UNLOCK(bp->b_object); if (bp->b_bufsize > bp->b_kvasize) panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n", bp->b_bufsize, bp->b_kvasize); bp->b_kvasize = bp->b_bufsize; pmap_qenter(trunc_page((vm_offset_t) bp->b_data), (vm_page_t *)bp->b_pages, bp->b_npages); return (bp); } /* * Cleanup after a clustered read or write. * This is complicated by the fact that any of the buffers might have * extra memory (if there were no empty buffer headers at allocbuf time) * that we will need to shift around. */ void cluster_callback(bp) struct buf *bp; { struct buf *nbp, *tbp; int error = 0; GIANT_REQUIRED; /* * Must propogate errors to all the components. */ if (bp->b_ioflags & BIO_ERROR) error = bp->b_error; pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); /* * Move memory from the large cluster buffer into the component * buffers and mark IO as done on these. */ for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head); tbp; tbp = nbp) { nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry); if (error) { tbp->b_ioflags |= BIO_ERROR; tbp->b_error = error; } else { tbp->b_dirtyoff = tbp->b_dirtyend = 0; tbp->b_flags &= ~B_INVAL; tbp->b_ioflags &= ~BIO_ERROR; /* * XXX the bdwrite()/bqrelse() issued during * cluster building clears B_RELBUF (see bqrelse() * comment). If direct I/O was specified, we have * to restore it here to allow the buffer and VM * to be freed. */ if (tbp->b_flags & B_DIRECT) tbp->b_flags |= B_RELBUF; } bufdone(tbp); } relpbuf(bp, &cluster_pbuf_freecnt); } /* * cluster_wbuild_wb: * * Implement modified write build for cluster. 
* * write_behind = 0 write behind disabled * write_behind = 1 write behind normal (default) * write_behind = 2 write behind backed-off */ static __inline int cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len) { int r = 0; switch(write_behind) { case 2: if (start_lbn < len) break; start_lbn -= len; /* FALLTHROUGH */ case 1: r = cluster_wbuild(vp, size, start_lbn, len); /* FALLTHROUGH */ default: /* FALLTHROUGH */ break; } return(r); } /* * Do clustered write for FFS. * * Three cases: * 1. Write is not sequential (write asynchronously) * Write is sequential: * 2. beginning of cluster - begin cluster * 3. middle of a cluster - add to cluster * 4. end of a cluster - asynchronously write cluster */ void cluster_write(bp, filesize, seqcount) struct buf *bp; u_quad_t filesize; int seqcount; { struct vnode *vp; daddr_t lbn; int maxclen, cursize; int lblocksize; int async; vp = bp->b_vp; if (vp->v_type == VREG) { async = vp->v_mount->mnt_flag & MNT_ASYNC; lblocksize = vp->v_mount->mnt_stat.f_iosize; } else { async = 0; lblocksize = bp->b_bufsize; } lbn = bp->b_lblkno; KASSERT(bp->b_offset != NOOFFSET, ("cluster_write: no buffer offset")); /* Initialize vnode to beginning of file. */ if (lbn == 0) vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 || (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) { maxclen = vp->v_mount->mnt_iosize_max / lblocksize - 1; if (vp->v_clen != 0) { /* * Next block is not sequential. * * If we are not writing at end of file, the process * seeked to another point in the file since its last * write, or we have reached our maximum cluster size, * then push the previous cluster. Otherwise try * reallocating to make it sequential. * * Change to algorithm: only push previous cluster if * it was sequential from the point of view of the * seqcount heuristic, otherwise leave the buffer * intact so we can potentially optimize the I/O * later on in the buf_daemon or update daemon * flush. */ cursize = vp->v_lastw - vp->v_cstart + 1; if (((u_quad_t) bp->b_offset + lblocksize) != filesize || lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) { if (!async && seqcount > 0) { cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, cursize); } } else { struct buf **bpp, **endbp; struct cluster_save *buflist; buflist = cluster_collectbufs(vp, bp); endbp = &buflist->bs_children [buflist->bs_nchildren - 1]; if (VOP_REALLOCBLKS(vp, buflist)) { /* * Failed, push the previous cluster * if *really* writing sequentially * in the logical file (seqcount > 1), * otherwise delay it in the hopes that * the low level disk driver can * optimize the write ordering. */ for (bpp = buflist->bs_children; bpp < endbp; bpp++) brelse(*bpp); free(buflist, M_SEGMENT); if (seqcount > 1) { cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, cursize); } } else { /* * Succeeded, keep building cluster. */ for (bpp = buflist->bs_children; bpp <= endbp; bpp++) bdwrite(*bpp); free(buflist, M_SEGMENT); vp->v_lastw = lbn; vp->v_lasta = bp->b_blkno; return; } } } /* * Consider beginning a cluster. If at end of file, make * cluster as large as possible, otherwise find size of * existing cluster. 
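 *
 * Informal summary of the code below: when the block being written
 * ends at the current end of file (or the buffer already has a disk
 * address), VOP_BMAP is skipped and maxclen keeps its maximal value
 * (mnt_iosize_max / lblocksize - 1), so the cluster may grow as
 * large as the device allows.  Otherwise VOP_BMAP reports how many
 * blocks past lbn are physically contiguous; if the block cannot be
 * mapped at all, the buffer is pushed out with bawrite() and no
 * cluster is started.  v_clen then records how many further
 * sequential writes we are willing to gather: zero contiguity on a
 * non-async mount also means bawrite(), while in the common case the
 * buffer is delayed with bdwrite() and becomes the first block
 * (v_cstart) of the new cluster.
 *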
*/ if ((vp->v_type == VREG) && ((u_quad_t) bp->b_offset + lblocksize) != filesize && (bp->b_blkno == bp->b_lblkno) && (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) || bp->b_blkno == -1)) { bawrite(bp); vp->v_clen = 0; vp->v_lasta = bp->b_blkno; vp->v_cstart = lbn + 1; vp->v_lastw = lbn; return; } vp->v_clen = maxclen; if (!async && maxclen == 0) { /* I/O not contiguous */ vp->v_cstart = lbn + 1; bawrite(bp); } else { /* Wait for rest of cluster */ vp->v_cstart = lbn; bdwrite(bp); } } else if (lbn == vp->v_cstart + vp->v_clen) { /* * At end of cluster, write it out if seqcount tells us we * are operating sequentially, otherwise let the buf or * update daemon handle it. */ bdwrite(bp); if (seqcount > 1) cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_clen + 1); vp->v_clen = 0; vp->v_cstart = lbn + 1; } else if (vm_page_count_severe()) { /* * We are low on memory, get it going NOW */ bawrite(bp); } else { /* * In the middle of a cluster, so just delay the I/O for now. */ bdwrite(bp); } vp->v_lastw = lbn; vp->v_lasta = bp->b_blkno; } /* * This is an awful lot like cluster_rbuild...wish they could be combined. * The last lbn argument is the current block on which I/O is being * performed. Check to see that it doesn't fall in the middle of * the current block (if last_bp == NULL). */ int cluster_wbuild(vp, size, start_lbn, len) struct vnode *vp; long size; daddr_t start_lbn; int len; { struct buf *bp, *tbp; int i, j, s; int totalwritten = 0; int dbsize = btodb(size); GIANT_REQUIRED; while (len > 0) { s = splbio(); /* * If the buffer is not delayed-write (i.e. dirty), or it * is delayed-write but either locked or inval, it cannot * partake in the clustered write. */ VI_LOCK(vp); if ((tbp = gbincore(vp, start_lbn)) == NULL || (tbp->b_vflags & BV_BKGRDINPROG)) { VI_UNLOCK(vp); ++start_lbn; --len; splx(s); continue; } if (BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, VI_MTX(vp))) { ++start_lbn; --len; splx(s); continue; } if ((tbp->b_flags & (B_INVAL | B_DELWRI)) != B_DELWRI) { BUF_UNLOCK(tbp); ++start_lbn; --len; splx(s); continue; } bremfree(tbp); tbp->b_flags &= ~B_DONE; splx(s); /* * Extra memory in the buffer, punt on this buffer. * XXX we could handle this in most cases, but we would * have to push the extra memory down to after our max * possible cluster size and then potentially pull it back * up if the cluster was terminated prematurely--too much * hassle. */ if (((tbp->b_flags & (B_CLUSTEROK | B_MALLOC | B_VMIO)) != (B_CLUSTEROK | B_VMIO)) || (tbp->b_bcount != tbp->b_bufsize) || (tbp->b_bcount != size) || (len == 1) || ((bp = getpbuf(&cluster_pbuf_freecnt)) == NULL)) { totalwritten += tbp->b_bufsize; bawrite(tbp); ++start_lbn; --len; continue; } /* * We got a pbuf to make the cluster in. * so initialise it. */ TAILQ_INIT(&bp->b_cluster.cluster_head); bp->b_bcount = 0; bp->b_magic = tbp->b_magic; bp->b_op = tbp->b_op; bp->b_bufsize = 0; bp->b_npages = 0; if (tbp->b_wcred != NOCRED) bp->b_wcred = crhold(tbp->b_wcred); bp->b_blkno = tbp->b_blkno; bp->b_lblkno = tbp->b_lblkno; bp->b_offset = tbp->b_offset; /* * We are synthesizing a buffer out of vm_page_t's, but * if the block size is not page aligned then the starting * address may not be either. Inherit the b_data offset * from the original buffer. 
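 *
 * Concretely (example values only): with 4K pages and a 2K block
 * size, tbp->b_data may begin 0x800 bytes into its first page.  The
 * pbuf's KVA is page aligned, so OR-ing in (tbp->b_data & PAGE_MASK)
 * gives the cluster buffer the same 0x800 starting offset; once
 * pmap_qenter() maps the shared pages, byte 0 of the cluster buffer
 * and byte 0 of the original buffer refer to the same physical
 * location.
 *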
*/ bp->b_data = (char *)((vm_offset_t)bp->b_data | ((vm_offset_t)tbp->b_data & PAGE_MASK)); bp->b_flags |= B_CLUSTER | (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT)); bp->b_iodone = cluster_callback; pbgetvp(vp, bp); /* * From this location in the file, scan forward to see * if there are buffers with adjacent data that need to * be written as well. */ for (i = 0; i < len; ++i, ++start_lbn) { if (i != 0) { /* If not the first buffer */ s = splbio(); /* * If the adjacent data is not even in core it * can't need to be written. */ VI_LOCK(vp); if ((tbp = gbincore(vp, start_lbn)) == NULL || (tbp->b_vflags & BV_BKGRDINPROG)) { VI_UNLOCK(vp); splx(s); break; } /* * If it IS in core, but has different * characteristics, or is locked (which * means it could be undergoing a background * I/O or be in a weird state), then don't * cluster with it. */ if (BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, VI_MTX(vp))) { splx(s); break; } if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK | B_INVAL | B_DELWRI | B_NEEDCOMMIT)) != (B_DELWRI | B_CLUSTEROK | (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) || tbp->b_wcred != bp->b_wcred) { BUF_UNLOCK(tbp); splx(s); break; } /* * Check that the combined cluster * would make sense with regard to pages * and would not be too large */ if ((tbp->b_bcount != size) || ((bp->b_blkno + (dbsize * i)) != tbp->b_blkno) || ((tbp->b_npages + bp->b_npages) > (vp->v_mount->mnt_iosize_max / PAGE_SIZE))) { BUF_UNLOCK(tbp); splx(s); break; } /* * Ok, it's passed all the tests, * so remove it from the free list * and mark it busy. We will use it. */ bremfree(tbp); tbp->b_flags &= ~B_DONE; splx(s); } /* end of code for non-first buffers only */ /* check for latent dependencies to be handled */ if ((LIST_FIRST(&tbp->b_dep)) != NULL) { tbp->b_iocmd = BIO_WRITE; buf_start(tbp); } /* * If the IO is via the VM then we do some * special VM hackery (yuck). Since the buffer's * block size may not be page-aligned it is possible * for a page to be shared between two buffers. We * have to get rid of the duplication when building * the cluster. */ if (tbp->b_flags & B_VMIO) { vm_page_t m; if (i != 0) { /* if not first buffer */ for (j = 0; j < tbp->b_npages; j += 1) { m = tbp->b_pages[j]; if (m->flags & PG_BUSY) { bqrelse(tbp); goto finishcluster; } } } if (tbp->b_object != NULL) VM_OBJECT_LOCK(tbp->b_object); vm_page_lock_queues(); for (j = 0; j < tbp->b_npages; j += 1) { m = tbp->b_pages[j]; vm_page_io_start(m); vm_object_pip_add(m->object, 1); if ((bp->b_npages == 0) || (bp->b_pages[bp->b_npages - 1] != m)) { bp->b_pages[bp->b_npages] = m; bp->b_npages++; } } vm_page_unlock_queues(); if (tbp->b_object != NULL) VM_OBJECT_UNLOCK(tbp->b_object); } bp->b_bcount += size; bp->b_bufsize += size; s = splbio(); bundirty(tbp); tbp->b_flags &= ~B_DONE; tbp->b_ioflags &= ~BIO_ERROR; tbp->b_flags |= B_ASYNC; tbp->b_iocmd = BIO_WRITE; reassignbuf(tbp, tbp->b_vp); /* put on clean list */ VI_LOCK(tbp->b_vp); ++tbp->b_vp->v_numoutput; VI_UNLOCK(tbp->b_vp); splx(s); BUF_KERNPROC(tbp); TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, tbp, b_cluster.cluster_entry); } finishcluster: pmap_qenter(trunc_page((vm_offset_t) bp->b_data), (vm_page_t *) bp->b_pages, bp->b_npages); if (bp->b_bufsize > bp->b_kvasize) panic( "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n", bp->b_bufsize, bp->b_kvasize); bp->b_kvasize = bp->b_bufsize; totalwritten += bp->b_bufsize; bp->b_dirtyoff = 0; bp->b_dirtyend = bp->b_bufsize; bawrite(bp); len -= i; } return totalwritten; } /* * Collect together all the buffers in a cluster. 
* Plus add one additional buffer. */ static struct cluster_save * cluster_collectbufs(vp, last_bp) struct vnode *vp; struct buf *last_bp; { struct cluster_save *buflist; struct buf *bp; daddr_t lbn; int i, len; len = vp->v_lastw - vp->v_cstart + 1; buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist), M_SEGMENT, M_WAITOK); buflist->bs_nchildren = 0; buflist->bs_children = (struct buf **) (buflist + 1); for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) { (void) bread(vp, lbn, last_bp->b_bcount, NOCRED, &bp); buflist->bs_children[i] = bp; if (bp->b_blkno == bp->b_lblkno) VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); } buflist->bs_children[i] = bp = last_bp; if (bp->b_blkno == bp->b_lblkno) VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); buflist->bs_nchildren = i + 1; return (buflist); } Index: head/sys/kern/vfs_extattr.c =================================================================== --- head/sys/kern/vfs_extattr.c (revision 122536) +++ head/sys/kern/vfs_extattr.c (revision 122537) @@ -1,4508 +1,4791 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94 */ #include __FBSDID("$FreeBSD$"); /* For 4.3 integer FS ID compatibility */ #include "opt_compat.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int chroot_refuse_vdir_fds(struct filedesc *fdp); static int getutimes(const struct timeval *, enum uio_seg, struct timespec *); static int setfown(struct thread *td, struct vnode *, uid_t, gid_t); static int setfmode(struct thread *td, struct vnode *, int); static int setfflags(struct thread *td, struct vnode *, int); static int setutimes(struct thread *td, struct vnode *, const struct timespec *, int, int); static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred, struct thread *td); static int extattr_list_vp(struct vnode *vp, int attrnamespace, void *data, size_t nbytes, struct thread *td); int (*union_dircheckp)(struct thread *td, struct vnode **, struct file *); int (*softdep_fsync_hook)(struct vnode *); /* * The module initialization routine for POSIX asynchronous I/O will * set this to the version of AIO that it implements. (Zero means * that it is not implemented.) This value is used here by pathconf() * and in kern_descrip.c by fpathconf(). */ int async_io_version; /* * Sync each mounted filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct sync_args { int dummy; }; #endif #ifdef DEBUG static int syncprt = 0; SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, ""); #endif /* ARGSUSED */ int sync(td, uap) struct thread *td; struct sync_args *uap; { struct mount *mp, *nmp; int asyncflag; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } if ((mp->mnt_flag & MNT_RDONLY) == 0 && vn_start_write(NULL, &mp, V_NOWAIT) == 0) { asyncflag = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &= ~MNT_ASYNC; vfs_msync(mp, MNT_NOWAIT); VFS_SYNC(mp, MNT_NOWAIT, ((td != NULL) ? td->td_ucred : NOCRED), td); mp->mnt_flag |= asyncflag; vn_finished_write(mp); } mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, td); } mtx_unlock(&mountlist_mtx); #if 0 /* * XXX don't call vfs_bufstats() yet because that routine * was not imported in the Lite2 merge. */ #ifdef DIAGNOSTIC if (syncprt) vfs_bufstats(); #endif /* DIAGNOSTIC */ #endif return (0); } /* XXX PRISON: could be per prison flag */ static int prison_quotas; #if 0 SYSCTL_INT(_kern_prison, OID_AUTO, quotas, CTLFLAG_RW, &prison_quotas, 0, ""); #endif /* * Change filesystem quotas. */ #ifndef _SYS_SYSPROTO_H_ struct quotactl_args { char *path; int cmd; int uid; caddr_t arg; }; #endif /* ARGSUSED */ int quotactl(td, uap) struct thread *td; register struct quotactl_args /* { char *path; int cmd; int uid; caddr_t arg; } */ *uap; { struct mount *mp; int error; struct nameidata nd; if (jailed(td->td_ucred) && !prison_quotas) return (EPERM); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH); vrele(nd.ni_vp); if (error) return (error); error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg, td); vn_finished_write(mp); return (error); } /* * Get filesystem statistics. 
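 *
 * From userland this is reached as the statfs(2) system call; a
 * minimal, illustrative caller (needs <sys/param.h>, <sys/mount.h>
 * and <stdio.h>) looks like:
 *
 *	struct statfs fs;
 *
 *	if (statfs("/usr", &fs) == 0)
 *		printf("%s on %s: %jd of %jd blocks free\n",
 *		    fs.f_mntfromname, fs.f_mntonname,
 *		    (intmax_t)fs.f_bfree, (intmax_t)fs.f_blocks);
 *
 * Note that for an unprivileged caller the kernel hands back a copy
 * with f_fsid zeroed, as done at the bottom of the function below.
 *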
*/ #ifndef _SYS_SYSPROTO_H_ struct statfs_args { char *path; struct statfs *buf; }; #endif /* ARGSUSED */ int statfs(td, uap) struct thread *td; register struct statfs_args /* { char *path; struct statfs *buf; } */ *uap; { - register struct mount *mp; - register struct statfs *sp; + struct mount *mp; + struct statfs *sp, sb; int error; struct nameidata nd; - struct statfs sb; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); mp = nd.ni_vp->v_mount; sp = &mp->mnt_stat; NDFREE(&nd, NDF_ONLY_PNBUF); vrele(nd.ni_vp); #ifdef MAC error = mac_check_mount_stat(td->td_ucred, mp); if (error) return (error); #endif + /* + * Set these in case the underlying filesystem fails to do so. + */ + sp->f_version = STATFS_VERSION; + sp->f_namemax = NAME_MAX; + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; error = VFS_STATFS(mp, sp, td); if (error) return (error); - sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; if (suser(td)) { bcopy(sp, &sb, sizeof(sb)); sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; sp = &sb; } return (copyout(sp, uap->buf, sizeof(*sp))); } /* * Get filesystem statistics. */ #ifndef _SYS_SYSPROTO_H_ struct fstatfs_args { int fd; struct statfs *buf; }; #endif /* ARGSUSED */ int fstatfs(td, uap) struct thread *td; register struct fstatfs_args /* { int fd; struct statfs *buf; } */ *uap; { struct file *fp; struct mount *mp; - register struct statfs *sp; + struct statfs *sp, sb; int error; - struct statfs sb; if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); mp = fp->f_vnode->v_mount; fdrop(fp, td); if (mp == NULL) return (EBADF); #ifdef MAC error = mac_check_mount_stat(td->td_ucred, mp); if (error) return (error); #endif sp = &mp->mnt_stat; + /* + * Set these in case the underlying filesystem fails to do so. + */ + sp->f_version = STATFS_VERSION; + sp->f_namemax = NAME_MAX; + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; error = VFS_STATFS(mp, sp, td); if (error) return (error); - sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; if (suser(td)) { bcopy(sp, &sb, sizeof(sb)); sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; sp = &sb; } return (copyout(sp, uap->buf, sizeof(*sp))); } /* * Get statistics on all filesystems. */ #ifndef _SYS_SYSPROTO_H_ struct getfsstat_args { struct statfs *buf; long bufsize; int flags; }; #endif int getfsstat(td, uap) struct thread *td; register struct getfsstat_args /* { struct statfs *buf; long bufsize; int flags; } */ *uap; { - register struct mount *mp, *nmp; - register struct statfs *sp; + struct mount *mp, *nmp; + struct statfs *sp, sb; caddr_t sfsp; long count, maxcount, error; maxcount = uap->bufsize / sizeof(struct statfs); sfsp = (caddr_t)uap->buf; count = 0; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { #ifdef MAC if (mac_check_mount_stat(td->td_ucred, mp) != 0) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } #endif if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } if (sfsp && count < maxcount) { sp = &mp->mnt_stat; /* + * Set these in case the underlying filesystem + * fails to do so. + */ + sp->f_version = STATFS_VERSION; + sp->f_namemax = NAME_MAX; + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + /* * If MNT_NOWAIT or MNT_LAZY is specified, do not * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY * overrides MNT_WAIT. 
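 *
 * (In practice this is what keeps getfsstat(2) -- and hence
 * getmntinfo(3) with MNT_NOWAIT, as used by df(1) -- cheap: with
 * MNT_NOWAIT and without MNT_WAIT the cached mnt_stat is copied out
 * as-is instead of calling down into every filesystem.)
 *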
*/ if (((uap->flags & (MNT_LAZY|MNT_NOWAIT)) == 0 || (uap->flags & MNT_WAIT)) && (error = VFS_STATFS(mp, sp, td))) { mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, td); continue; } - sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + if (suser(td)) { + bcopy(sp, &sb, sizeof(sb)); + sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; + sp = &sb; + } error = copyout(sp, sfsp, sizeof(*sp)); if (error) { vfs_unbusy(mp, td); return (error); } sfsp += sizeof(*sp); } count++; mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, td); } mtx_unlock(&mountlist_mtx); if (sfsp && count > maxcount) td->td_retval[0] = maxcount; else td->td_retval[0] = count; return (0); } +#ifdef COMPAT_FREEBSD4 /* + * Get old format filesystem statistics. + */ +static void cvtstatfs(struct thread *, struct statfs *, struct ostatfs *); + +#ifndef _SYS_SYSPROTO_H_ +struct freebsd4_statfs_args { + char *path; + struct ostatfs *buf; +}; +#endif +/* ARGSUSED */ +int +freebsd4_statfs(td, uap) + struct thread *td; + struct freebsd4_statfs_args /* { + char *path; + struct ostatfs *buf; + } */ *uap; +{ + struct mount *mp; + struct statfs *sp; + struct ostatfs osb; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td); + if ((error = namei(&nd)) != 0) + return (error); + mp = nd.ni_vp->v_mount; + sp = &mp->mnt_stat; + NDFREE(&nd, NDF_ONLY_PNBUF); + vrele(nd.ni_vp); +#ifdef MAC + error = mac_check_mount_stat(td->td_ucred, mp); + if (error) + return (error); +#endif + error = VFS_STATFS(mp, sp, td); + if (error) + return (error); + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + cvtstatfs(td, sp, &osb); + return (copyout(&osb, uap->buf, sizeof(osb))); +} + +/* + * Get filesystem statistics. + */ +#ifndef _SYS_SYSPROTO_H_ +struct freebsd4_fstatfs_args { + int fd; + struct ostatfs *buf; +}; +#endif +/* ARGSUSED */ +int +freebsd4_fstatfs(td, uap) + struct thread *td; + struct freebsd4_fstatfs_args /* { + int fd; + struct ostatfs *buf; + } */ *uap; +{ + struct file *fp; + struct mount *mp; + struct statfs *sp; + struct ostatfs osb; + int error; + + if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) + return (error); + mp = fp->f_vnode->v_mount; + fdrop(fp, td); + if (mp == NULL) + return (EBADF); +#ifdef MAC + error = mac_check_mount_stat(td->td_ucred, mp); + if (error) + return (error); +#endif + sp = &mp->mnt_stat; + error = VFS_STATFS(mp, sp, td); + if (error) + return (error); + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + cvtstatfs(td, sp, &osb); + return (copyout(&osb, uap->buf, sizeof(osb))); +} + +/* + * Get statistics on all filesystems. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct freebsd4_getfsstat_args { + struct ostatfs *buf; + long bufsize; + int flags; +}; +#endif +int +freebsd4_getfsstat(td, uap) + struct thread *td; + register struct freebsd4_getfsstat_args /* { + struct ostatfs *buf; + long bufsize; + int flags; + } */ *uap; +{ + struct mount *mp, *nmp; + struct statfs *sp; + struct ostatfs osb; + caddr_t sfsp; + long count, maxcount, error; + + maxcount = uap->bufsize / sizeof(struct ostatfs); + sfsp = (caddr_t)uap->buf; + count = 0; + mtx_lock(&mountlist_mtx); + for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { +#ifdef MAC + if (mac_check_mount_stat(td->td_ucred, mp) != 0) { + nmp = TAILQ_NEXT(mp, mnt_list); + continue; + } +#endif + if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { + nmp = TAILQ_NEXT(mp, mnt_list); + continue; + } + if (sfsp && count < maxcount) { + sp = &mp->mnt_stat; + /* + * If MNT_NOWAIT or MNT_LAZY is specified, do not + * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY + * overrides MNT_WAIT. + */ + if (((uap->flags & (MNT_LAZY|MNT_NOWAIT)) == 0 || + (uap->flags & MNT_WAIT)) && + (error = VFS_STATFS(mp, sp, td))) { + mtx_lock(&mountlist_mtx); + nmp = TAILQ_NEXT(mp, mnt_list); + vfs_unbusy(mp, td); + continue; + } + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + cvtstatfs(td, sp, &osb); + error = copyout(&osb, sfsp, sizeof(osb)); + if (error) { + vfs_unbusy(mp, td); + return (error); + } + sfsp += sizeof(osb); + } + count++; + mtx_lock(&mountlist_mtx); + nmp = TAILQ_NEXT(mp, mnt_list); + vfs_unbusy(mp, td); + } + mtx_unlock(&mountlist_mtx); + if (sfsp && count > maxcount) + td->td_retval[0] = maxcount; + else + td->td_retval[0] = count; + return (0); +} + +/* + * Implement fstatfs() for (NFS) file handles. + */ +#ifndef _SYS_SYSPROTO_H_ +struct freebsd4_fhstatfs_args { + struct fhandle *u_fhp; + struct ostatfs *buf; +}; +#endif +int +freebsd4_fhstatfs(td, uap) + struct thread *td; + struct freebsd4_fhstatfs_args /* { + struct fhandle *u_fhp; + struct ostatfs *buf; + } */ *uap; +{ + struct statfs *sp; + struct mount *mp; + struct vnode *vp; + struct ostatfs osb; + fhandle_t fh; + int error; + + /* + * Must be super user + */ + error = suser(td); + if (error) + return (error); + + if ((error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t))) != 0) + return (error); + + if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) + return (ESTALE); + if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp))) + return (error); + mp = vp->v_mount; + sp = &mp->mnt_stat; + vput(vp); +#ifdef MAC + error = mac_check_mount_stat(td->td_ucred, mp); + if (error) + return (error); +#endif + if ((error = VFS_STATFS(mp, sp, td)) != 0) + return (error); + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + cvtstatfs(td, sp, &osb); + return (copyout(&osb, uap->buf, sizeof(osb))); +} + +/* + * Convert a new format statfs structure to an old format statfs structure. 
+ */ +static void +cvtstatfs(td, nsp, osp) + struct thread *td; + struct statfs *nsp; + struct ostatfs *osp; +{ + + bzero(osp, sizeof(*osp)); + osp->f_bsize = MIN(nsp->f_bsize, LONG_MAX); + osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX); + osp->f_blocks = MIN(nsp->f_blocks, LONG_MAX); + osp->f_bfree = MIN(nsp->f_bfree, LONG_MAX); + osp->f_bavail = MIN(nsp->f_bavail, LONG_MAX); + osp->f_files = MIN(nsp->f_files, LONG_MAX); + osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX); + osp->f_owner = nsp->f_owner; + osp->f_type = nsp->f_type; + osp->f_flags = nsp->f_flags; + osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX); + osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX); + osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX); + osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX); + bcopy(nsp->f_fstypename, osp->f_fstypename, + MIN(MFSNAMELEN, OMNAMELEN)); + bcopy(nsp->f_mntonname, osp->f_mntonname, + MIN(MFSNAMELEN, OMNAMELEN)); + bcopy(nsp->f_mntfromname, osp->f_mntfromname, + MIN(MFSNAMELEN, OMNAMELEN)); + if (suser(td)) { + osp->f_fsid.val[0] = osp->f_fsid.val[1] = 0; + } else { + osp->f_fsid = nsp->f_fsid; + } +} +#endif /* COMPAT_FREEBSD4 */ + +/* * Change current working directory to a given file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchdir_args { int fd; }; #endif /* ARGSUSED */ int fchdir(td, uap) struct thread *td; struct fchdir_args /* { int fd; } */ *uap; { register struct filedesc *fdp = td->td_proc->p_fd; struct vnode *vp, *tdp, *vpold; struct mount *mp; struct file *fp; int error; if ((error = getvnode(fdp, uap->fd, &fp)) != 0) return (error); vp = fp->f_vnode; VREF(vp); fdrop(fp, td); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (vp->v_type != VDIR) error = ENOTDIR; #ifdef MAC else if ((error = mac_check_vnode_chdir(td->td_ucred, vp)) != 0) { } #endif else error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); while (!error && (mp = vp->v_mountedhere) != NULL) { if (vfs_busy(mp, 0, 0, td)) continue; error = VFS_ROOT(mp, &tdp); vfs_unbusy(mp, td); if (error) break; vput(vp); vp = tdp; } if (error) { vput(vp); return (error); } VOP_UNLOCK(vp, 0, td); FILEDESC_LOCK(fdp); vpold = fdp->fd_cdir; fdp->fd_cdir = vp; FILEDESC_UNLOCK(fdp); vrele(vpold); return (0); } /* * Change current working directory (``.''). */ #ifndef _SYS_SYSPROTO_H_ struct chdir_args { char *path; }; #endif /* ARGSUSED */ int chdir(td, uap) struct thread *td; struct chdir_args /* { char *path; } */ *uap; { return (kern_chdir(td, uap->path, UIO_USERSPACE)); } int kern_chdir(struct thread *td, char *path, enum uio_seg pathseg) { register struct filedesc *fdp = td->td_proc->p_fd; int error; struct nameidata nd; struct vnode *vp; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); if ((error = change_dir(nd.ni_vp, td)) != 0) { vput(nd.ni_vp); NDFREE(&nd, NDF_ONLY_PNBUF); return (error); } VOP_UNLOCK(nd.ni_vp, 0, td); NDFREE(&nd, NDF_ONLY_PNBUF); FILEDESC_LOCK(fdp); vp = fdp->fd_cdir; fdp->fd_cdir = nd.ni_vp; FILEDESC_UNLOCK(fdp); vrele(vp); return (0); } /* * Helper function for raised chroot(2) security function: Refuse if * any filedescriptors are open directories. 
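For illustration only (not part of this change): cvtstatfs() above caps each 64-bit counter at LONG_MAX before copying it into the old long-sized field of struct ostatfs. A minimal sketch of that clamping idea; clamp_to_long() is a hypothetical helper name, not something added by this diff:

#include <limits.h>
#include <stdint.h>

/* Cap a 64-bit counter so it still fits the old ABI's long-sized field. */
static long
clamp_to_long(uint64_t v)
{
	return (v > LONG_MAX ? LONG_MAX : (long)v);
}

Writing osp->f_blocks = clamp_to_long(nsp->f_blocks) behaves the same as the MIN(nsp->f_blocks, LONG_MAX) expressions used above.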
*/ static int chroot_refuse_vdir_fds(fdp) struct filedesc *fdp; { struct vnode *vp; struct file *fp; int fd; FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); for (fd = 0; fd < fdp->fd_nfiles ; fd++) { fp = fget_locked(fdp, fd); if (fp == NULL) continue; if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; if (vp->v_type == VDIR) return (EPERM); } } return (0); } /* * This sysctl determines if we will allow a process to chroot(2) if it * has a directory open: * 0: disallowed for all processes. * 1: allowed for processes that were not already chroot(2)'ed. * 2: allowed for all processes. */ static int chroot_allow_open_directories = 1; SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW, &chroot_allow_open_directories, 0, ""); /* * Change notion of root (``/'') directory. */ #ifndef _SYS_SYSPROTO_H_ struct chroot_args { char *path; }; #endif /* ARGSUSED */ int chroot(td, uap) struct thread *td; struct chroot_args /* { char *path; } */ *uap; { int error; struct nameidata nd; error = suser_cred(td->td_ucred, PRISON_ROOT); if (error) return (error); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->path, td); mtx_lock(&Giant); error = namei(&nd); if (error) goto error; if ((error = change_dir(nd.ni_vp, td)) != 0) goto e_vunlock; #ifdef MAC if ((error = mac_check_vnode_chroot(td->td_ucred, nd.ni_vp))) goto e_vunlock; #endif VOP_UNLOCK(nd.ni_vp, 0, td); error = change_root(nd.ni_vp, td); vrele(nd.ni_vp); NDFREE(&nd, NDF_ONLY_PNBUF); mtx_unlock(&Giant); return (error); e_vunlock: vput(nd.ni_vp); error: mtx_unlock(&Giant); NDFREE(&nd, NDF_ONLY_PNBUF); return (error); } /* * Common routine for chroot and chdir. Callers must provide a locked vnode * instance. */ int change_dir(vp, td) struct vnode *vp; struct thread *td; { int error; ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked"); if (vp->v_type != VDIR) return (ENOTDIR); #ifdef MAC error = mac_check_vnode_chdir(td->td_ucred, vp); if (error) return (error); #endif error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); return (error); } /* * Common routine for kern_chroot() and jail_attach(). The caller is * responsible for invoking suser() and mac_check_chroot() to authorize this * operation. */ int change_root(vp, td) struct vnode *vp; struct thread *td; { struct filedesc *fdp; struct vnode *oldvp; int error; mtx_assert(&Giant, MA_OWNED); fdp = td->td_proc->p_fd; FILEDESC_LOCK(fdp); if (chroot_allow_open_directories == 0 || (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) { error = chroot_refuse_vdir_fds(fdp); if (error) { FILEDESC_UNLOCK(fdp); return (error); } } oldvp = fdp->fd_rdir; fdp->fd_rdir = vp; VREF(fdp->fd_rdir); if (!fdp->fd_jdir) { fdp->fd_jdir = vp; VREF(fdp->fd_jdir); } FILEDESC_UNLOCK(fdp); vrele(oldvp); return (0); } /* * Check permissions, allocate an open file structure, * and call the device open routine if any. 
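For illustration only (not part of this change): the kern.chroot_allow_open_directories knob declared above can be read from userland with sysctlbyname(3); a minimal sketch:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int val;
	size_t len = sizeof(val);

	if (sysctlbyname("kern.chroot_allow_open_directories", &val, &len,
	    NULL, 0) != 0)
		return (1);
	/* 0: never, 1: only if not already chrooted, 2: always. */
	printf("chroot_allow_open_directories = %d\n", val);
	return (0);
}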
*/ #ifndef _SYS_SYSPROTO_H_ struct open_args { char *path; int flags; int mode; }; #endif int open(td, uap) struct thread *td; register struct open_args /* { char *path; int flags; int mode; } */ *uap; { return (kern_open(td, uap->path, UIO_USERSPACE, uap->flags, uap->mode)); } int kern_open(struct thread *td, char *path, enum uio_seg pathseg, int flags, int mode) { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; struct file *fp; struct vnode *vp; struct vattr vat; struct mount *mp; int cmode; struct file *nfp; int type, indx, error; struct flock lf; struct nameidata nd; if ((flags & O_ACCMODE) == O_ACCMODE) return (EINVAL); flags = FFLAGS(flags); error = falloc(td, &nfp, &indx); if (error) return (error); /* An extra reference on `nfp' has been held for us by falloc(). */ fp = nfp; cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT; NDINIT(&nd, LOOKUP, FOLLOW, pathseg, path, td); td->td_dupfd = -1; /* XXX check for fdopen */ error = vn_open(&nd, &flags, cmode, indx); if (error) { /* * If the vn_open replaced the method vector, something * wonderous happened deep below and we just pass it up * pretending we know what we do. */ if (error == ENXIO && fp->f_ops != &badfileops) { fdrop(fp, td); td->td_retval[0] = indx; return (0); } /* * release our own reference */ fdrop(fp, td); /* * handle special fdopen() case. bleh. dupfdopen() is * responsible for dropping the old contents of ofiles[indx] * if it succeeds. */ if ((error == ENODEV || error == ENXIO) && td->td_dupfd >= 0 && /* XXX from fdopen */ (error = dupfdopen(td, fdp, indx, td->td_dupfd, flags, error)) == 0) { td->td_retval[0] = indx; return (0); } /* * Clean up the descriptor, but only if another thread hadn't * replaced or closed it. */ FILEDESC_LOCK(fdp); if (fdp->fd_ofiles[indx] == fp) { fdp->fd_ofiles[indx] = NULL; FILEDESC_UNLOCK(fdp); fdrop(fp, td); } else FILEDESC_UNLOCK(fdp); if (error == ERESTART) error = EINTR; return (error); } td->td_dupfd = 0; NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; /* * There should be 2 references on the file, one from the descriptor * table, and one for us. * * Handle the case where someone closed the file (via its file * descriptor) while we were blocked. The end result should look * like opening the file succeeded but it was immediately closed. */ FILEDESC_LOCK(fdp); FILE_LOCK(fp); if (fp->f_count == 1) { KASSERT(fdp->fd_ofiles[indx] != fp, ("Open file descriptor lost all refs")); FILEDESC_UNLOCK(fdp); FILE_UNLOCK(fp); VOP_UNLOCK(vp, 0, td); vn_close(vp, flags & FMASK, fp->f_cred, td); fdrop(fp, td); td->td_retval[0] = indx; return 0; } fp->f_vnode = vp; fp->f_data = vp; fp->f_flag = flags & FMASK; fp->f_ops = &vnops; fp->f_seqcount = 1; fp->f_type = (vp->v_type == VFIFO ? 
DTYPE_FIFO : DTYPE_VNODE); FILEDESC_UNLOCK(fdp); FILE_UNLOCK(fp); /* assert that vn_open created a backing object if one is needed */ KASSERT(!vn_canvmio(vp) || VOP_GETVOBJECT(vp, NULL) == 0, ("open: vmio vnode has no backing object after vn_open")); VOP_UNLOCK(vp, 0, td); if (flags & (O_EXLOCK | O_SHLOCK)) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (flags & O_EXLOCK) lf.l_type = F_WRLCK; else lf.l_type = F_RDLCK; type = F_FLOCK; if ((flags & FNONBLOCK) == 0) type |= F_WAIT; if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) goto bad; fp->f_flag |= FHASLOCK; } if (flags & O_TRUNC) { if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto bad; VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); VATTR_NULL(&vat); vat.va_size = 0; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); #ifdef MAC error = mac_check_vnode_write(td->td_ucred, fp->f_cred, vp); if (error == 0) #endif error = VOP_SETATTR(vp, &vat, td->td_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); if (error) goto bad; } /* * Release our private reference, leaving the one associated with * the descriptor table intact. */ fdrop(fp, td); td->td_retval[0] = indx; return (0); bad: FILEDESC_LOCK(fdp); if (fdp->fd_ofiles[indx] == fp) { fdp->fd_ofiles[indx] = NULL; FILEDESC_UNLOCK(fdp); fdrop(fp, td); } else FILEDESC_UNLOCK(fdp); fdrop(fp, td); return (error); } #ifdef COMPAT_43 /* * Create a file. */ #ifndef _SYS_SYSPROTO_H_ struct ocreat_args { char *path; int mode; }; #endif int ocreat(td, uap) struct thread *td; register struct ocreat_args /* { char *path; int mode; } */ *uap; { struct open_args /* { char *path; int flags; int mode; } */ nuap; nuap.path = uap->path; nuap.mode = uap->mode; nuap.flags = O_WRONLY | O_CREAT | O_TRUNC; return (open(td, &nuap)); } #endif /* COMPAT_43 */ /* * Create a special file. 
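For illustration only (not part of this change): the O_EXLOCK/O_SHLOCK handling in kern_open() above takes an advisory flock-style lock at open time, and with O_NONBLOCK the open fails instead of waiting for the lock. A minimal userland sketch; /tmp/lockfile is only an example path:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd;

	fd = open("/tmp/lockfile", O_RDWR | O_CREAT | O_EXLOCK | O_NONBLOCK,
	    0644);
	if (fd == -1) {
		perror("open");	/* EAGAIN if another process holds the lock */
		return (1);
	}
	/* ... exclusive access while fd stays open ... */
	close(fd);
	return (0);
}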
*/ #ifndef _SYS_SYSPROTO_H_ struct mknod_args { char *path; int mode; int dev; }; #endif /* ARGSUSED */ int mknod(td, uap) struct thread *td; register struct mknod_args /* { char *path; int mode; int dev; } */ *uap; { return (kern_mknod(td, uap->path, UIO_USERSPACE, uap->mode, uap->dev)); } int kern_mknod(struct thread *td, char *path, enum uio_seg pathseg, int mode, int dev) { struct vnode *vp; struct mount *mp; struct vattr vattr; int error; int whiteout = 0; struct nameidata nd; switch (mode & S_IFMT) { case S_IFCHR: case S_IFBLK: error = suser(td); break; default: error = suser_cred(td->td_ucred, PRISON_ROOT); break; } if (error) return (error); restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if (vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); vrele(vp); if (vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); return (EEXIST); } else { VATTR_NULL(&vattr); FILEDESC_LOCK(td->td_proc->p_fd); vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask; FILEDESC_UNLOCK(td->td_proc->p_fd); vattr.va_rdev = dev; whiteout = 0; switch (mode & S_IFMT) { case S_IFMT: /* used by badsect to flag bad sectors */ vattr.va_type = VBAD; break; case S_IFCHR: vattr.va_type = VCHR; break; case S_IFBLK: vattr.va_type = VBLK; break; case S_IFWHT: whiteout = 1; break; default: error = EINVAL; break; } } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } #ifdef MAC if (error == 0 && !whiteout) error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); #endif if (!error) { VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); if (whiteout) error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE); else { error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); if (error == 0) vput(nd.ni_vp); } } NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mknod"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "mknod"); return (error); } /* * Create a named pipe. 
*/ #ifndef _SYS_SYSPROTO_H_ struct mkfifo_args { char *path; int mode; }; #endif /* ARGSUSED */ int mkfifo(td, uap) struct thread *td; register struct mkfifo_args /* { char *path; int mode; } */ *uap; { return (kern_mkfifo(td, uap->path, UIO_USERSPACE, uap->mode)); } int kern_mkfifo(struct thread *td, char *path, enum uio_seg pathseg, int mode) { struct mount *mp; struct vattr vattr; int error; struct nameidata nd; restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); if (nd.ni_vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); vrele(nd.ni_vp); if (nd.ni_vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); vattr.va_type = VFIFO; FILEDESC_LOCK(td->td_proc->p_fd); vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask; FILEDESC_UNLOCK(td->td_proc->p_fd); #ifdef MAC error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); if (error) goto out; #endif VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); if (error == 0) vput(nd.ni_vp); #ifdef MAC out: #endif NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); return (error); } /* * Make a hard file link. */ #ifndef _SYS_SYSPROTO_H_ struct link_args { char *path; char *link; }; #endif /* ARGSUSED */ int link(td, uap) struct thread *td; register struct link_args /* { char *path; char *link; } */ *uap; { return (kern_link(td, uap->path, uap->link, UIO_USERSPACE)); } int kern_link(struct thread *td, char *path, char *link, enum uio_seg segflg) { struct vnode *vp; struct mount *mp; struct nameidata nd; int error; bwillwrite(); NDINIT(&nd, LOOKUP, FOLLOW|NOOBJ, segflg, path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (vp->v_type == VDIR) { vrele(vp); return (EPERM); /* POSIX */ } if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { vrele(vp); return (error); } NDINIT(&nd, CREATE, LOCKPARENT | NOOBJ | SAVENAME, segflg, link, td); if ((error = namei(&nd)) == 0) { if (nd.ni_vp != NULL) { vrele(nd.ni_vp); if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); error = EEXIST; } else if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td)) == 0) { VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); #ifdef MAC error = mac_check_vnode_link(td->td_ucred, nd.ni_dvp, vp, &nd.ni_cnd); if (error == 0) #endif error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd); VOP_UNLOCK(vp, 0, td); vput(nd.ni_dvp); } NDFREE(&nd, NDF_ONLY_PNBUF); } vrele(vp); vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "link"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "link"); return (error); } /* * Make a symbolic link. 
*/ #ifndef _SYS_SYSPROTO_H_ struct symlink_args { char *path; char *link; }; #endif /* ARGSUSED */ int symlink(td, uap) struct thread *td; register struct symlink_args /* { char *path; char *link; } */ *uap; { return (kern_symlink(td, uap->path, uap->link, UIO_USERSPACE)); } int kern_symlink(struct thread *td, char *path, char *link, enum uio_seg segflg) { struct mount *mp; struct vattr vattr; char *syspath; int error; struct nameidata nd; if (segflg == UIO_SYSSPACE) { syspath = path; } else { syspath = uma_zalloc(namei_zone, M_WAITOK); if ((error = copyinstr(path, syspath, MAXPATHLEN, NULL)) != 0) goto out; } restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT | NOOBJ | SAVENAME, segflg, link, td); if ((error = namei(&nd)) != 0) goto out; if (nd.ni_vp) { NDFREE(&nd, NDF_ONLY_PNBUF); vrele(nd.ni_vp); if (nd.ni_vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); error = EEXIST; goto out; } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); FILEDESC_LOCK(td->td_proc->p_fd); vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask; FILEDESC_UNLOCK(td->td_proc->p_fd); #ifdef MAC vattr.va_type = VLNK; error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); if (error) goto out2; #endif VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath); if (error == 0) vput(nd.ni_vp); #ifdef MAC out2: #endif NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "symlink"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "symlink"); out: if (segflg != UIO_SYSSPACE) uma_zfree(namei_zone, syspath); return (error); } /* * Delete a whiteout from the filesystem. */ /* ARGSUSED */ int undelete(td, uap) struct thread *td; register struct undelete_args /* { char *path; } */ *uap; { int error; struct mount *mp; struct nameidata nd; restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT|DOWHITEOUT, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return (error); if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp) vrele(nd.ni_vp); if (nd.ni_vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "undelete"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "undelete"); return (error); } /* * Delete a name from the filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct unlink_args { char *path; }; #endif /* ARGSUSED */ int unlink(td, uap) struct thread *td; struct unlink_args /* { char *path; } */ *uap; { return (kern_unlink(td, uap->path, UIO_USERSPACE)); } int kern_unlink(struct thread *td, char *path, enum uio_seg pathseg) { struct mount *mp; struct vnode *vp; int error; struct nameidata nd; restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT|LOCKLEAF, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if (vp->v_type == VDIR) error = EPERM; /* POSIX */ else { /* * The root of a mounted filesystem cannot be deleted. 
* * XXX: can this only be a VDIR case? */ if (vp->v_vflag & VV_ROOT) error = EBUSY; } if (error == 0) { if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); if (vp == nd.ni_dvp) vrele(vp); else vput(vp); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } #ifdef MAC error = mac_check_vnode_delete(td->td_ucred, nd.ni_dvp, vp, &nd.ni_cnd); if (error) goto out; #endif VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); #ifdef MAC out: #endif vn_finished_write(mp); } NDFREE(&nd, NDF_ONLY_PNBUF); if (vp == nd.ni_dvp) vrele(vp); else vput(vp); vput(nd.ni_dvp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "unlink"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "unlink"); return (error); } /* * Reposition read/write file offset. */ #ifndef _SYS_SYSPROTO_H_ struct lseek_args { int fd; int pad; off_t offset; int whence; }; #endif int lseek(td, uap) struct thread *td; register struct lseek_args /* { int fd; int pad; off_t offset; int whence; } */ *uap; { struct ucred *cred = td->td_ucred; struct file *fp; struct vnode *vp; struct vattr vattr; off_t offset; int error, noneg; if ((error = fget(td, uap->fd, &fp)) != 0) return (error); if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) { fdrop(fp, td); return (ESPIPE); } vp = fp->f_vnode; noneg = (vp->v_type != VCHR); offset = uap->offset; switch (uap->whence) { case L_INCR: if (noneg && (fp->f_offset < 0 || (offset > 0 && fp->f_offset > OFF_MAX - offset))) { error = EOVERFLOW; break; } offset += fp->f_offset; break; case L_XTND: vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_GETATTR(vp, &vattr, cred, td); VOP_UNLOCK(vp, 0, td); if (error) break; if (noneg && (vattr.va_size > OFF_MAX || (offset > 0 && vattr.va_size > OFF_MAX - offset))) { error = EOVERFLOW; break; } offset += vattr.va_size; break; case L_SET: break; default: error = EINVAL; } if (error == 0 && noneg && offset < 0) error = EINVAL; if (error != 0) { fdrop(fp, td); return (error); } fp->f_offset = offset; *(off_t *)(td->td_retval) = fp->f_offset; fdrop(fp, td); return (0); } #if defined(COMPAT_43) || defined(COMPAT_SUNOS) /* * Reposition read/write file offset. */ #ifndef _SYS_SYSPROTO_H_ struct olseek_args { int fd; long offset; int whence; }; #endif int olseek(td, uap) struct thread *td; register struct olseek_args /* { int fd; long offset; int whence; } */ *uap; { struct lseek_args /* { int fd; int pad; off_t offset; int whence; } */ nuap; int error; nuap.fd = uap->fd; nuap.offset = uap->offset; nuap.whence = uap->whence; error = lseek(td, &nuap); return (error); } #endif /* COMPAT_43 */ /* * Check access permissions using passed credentials. */ static int vn_access(vp, user_flags, cred, td) struct vnode *vp; int user_flags; struct ucred *cred; struct thread *td; { int error, flags; /* Flags == 0 means only check for existence. */ error = 0; if (user_flags) { flags = 0; if (user_flags & R_OK) flags |= VREAD; if (user_flags & W_OK) flags |= VWRITE; if (user_flags & X_OK) flags |= VEXEC; #ifdef MAC error = mac_check_vnode_access(cred, vp, flags); if (error) return (error); #endif if ((flags & VWRITE) == 0 || (error = vn_writechk(vp)) == 0) error = VOP_ACCESS(vp, flags, cred, td); } return (error); } /* * Check access permissions using "real" credentials. 
*/ #ifndef _SYS_SYSPROTO_H_ struct access_args { char *path; int flags; }; #endif int access(td, uap) struct thread *td; register struct access_args /* { char *path; int flags; } */ *uap; { return (kern_access(td, uap->path, UIO_USERSPACE, uap->flags)); } int kern_access(struct thread *td, char *path, enum uio_seg pathseg, int flags) { struct ucred *cred, *tmpcred; register struct vnode *vp; int error; struct nameidata nd; /* * Create and modify a temporary credential instead of one that * is potentially shared. This could also mess up socket * buffer accounting which can run in an interrupt context. * * XXX - Depending on how "threads" are finally implemented, it * may be better to explicitly pass the credential to namei() * rather than to modify the potentially shared process structure. */ cred = td->td_ucred; tmpcred = crdup(cred); tmpcred->cr_uid = cred->cr_ruid; tmpcred->cr_groups[0] = cred->cr_rgid; td->td_ucred = tmpcred; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, pathseg, path, td); if ((error = namei(&nd)) != 0) goto out1; vp = nd.ni_vp; error = vn_access(vp, flags, tmpcred, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); out1: td->td_ucred = cred; crfree(tmpcred); return (error); } /* * Check access permissions using "effective" credentials. */ #ifndef _SYS_SYSPROTO_H_ struct eaccess_args { char *path; int flags; }; #endif int eaccess(td, uap) struct thread *td; register struct eaccess_args /* { char *path; int flags; } */ *uap; { struct nameidata nd; struct vnode *vp; int error; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; error = vn_access(vp, uap->flags, td->td_ucred, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); return (error); } #if defined(COMPAT_43) || defined(COMPAT_SUNOS) /* * Get file status; this version follows links. */ #ifndef _SYS_SYSPROTO_H_ struct ostat_args { char *path; struct ostat *ub; }; #endif /* ARGSUSED */ int ostat(td, uap) struct thread *td; register struct ostat_args /* { char *path; struct ostat *ub; } */ *uap; { struct stat sb; struct ostat osb; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td); vput(nd.ni_vp); if (error) return (error); cvtstat(&sb, &osb); error = copyout(&osb, uap->ub, sizeof (osb)); return (error); } /* * Get file status; this version does not follow links. */ #ifndef _SYS_SYSPROTO_H_ struct olstat_args { char *path; struct ostat *ub; }; #endif /* ARGSUSED */ int olstat(td, uap) struct thread *td; register struct olstat_args /* { char *path; struct ostat *ub; } */ *uap; { struct vnode *vp; struct stat sb; struct ostat osb; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); if (error) return (error); cvtstat(&sb, &osb); error = copyout(&osb, uap->ub, sizeof (osb)); return (error); } /* * Convert from an old to a new stat structure. 
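For illustration only (not part of this change): access(2) checks permissions against the real uid/gid via the temporary credential built in kern_access() above, while eaccess(2) uses the effective credential; the two only give different answers in a set-id program. A minimal sketch; /etc/master.passwd is just an example of a root-only file:

#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	const char *path = "/etc/master.passwd";

	printf("access():  %s\n", access(path, R_OK) == 0 ? "ok" : "denied");
	printf("eaccess(): %s\n", eaccess(path, R_OK) == 0 ? "ok" : "denied");
	return (0);
}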
*/ void cvtstat(st, ost) struct stat *st; struct ostat *ost; { ost->st_dev = st->st_dev; ost->st_ino = st->st_ino; ost->st_mode = st->st_mode; ost->st_nlink = st->st_nlink; ost->st_uid = st->st_uid; ost->st_gid = st->st_gid; ost->st_rdev = st->st_rdev; if (st->st_size < (quad_t)1 << 32) ost->st_size = st->st_size; else ost->st_size = -2; ost->st_atime = st->st_atime; ost->st_mtime = st->st_mtime; ost->st_ctime = st->st_ctime; ost->st_blksize = st->st_blksize; ost->st_blocks = st->st_blocks; ost->st_flags = st->st_flags; ost->st_gen = st->st_gen; } #endif /* COMPAT_43 || COMPAT_SUNOS */ /* * Get file status; this version follows links. */ #ifndef _SYS_SYSPROTO_H_ struct stat_args { char *path; struct stat *ub; }; #endif /* ARGSUSED */ int stat(td, uap) struct thread *td; register struct stat_args /* { char *path; struct stat *ub; } */ *uap; { struct stat sb; int error; struct nameidata nd; #ifdef LOOKUP_SHARED NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | NOOBJ, UIO_USERSPACE, uap->path, td); #else NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, uap->path, td); #endif if ((error = namei(&nd)) != 0) return (error); error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_vp); if (error) return (error); error = copyout(&sb, uap->ub, sizeof (sb)); return (error); } /* * Get file status; this version does not follow links. */ #ifndef _SYS_SYSPROTO_H_ struct lstat_args { char *path; struct stat *ub; }; #endif /* ARGSUSED */ int lstat(td, uap) struct thread *td; register struct lstat_args /* { char *path; struct stat *ub; } */ *uap; { int error; struct vnode *vp; struct stat sb; struct nameidata nd; NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); if (error) return (error); error = copyout(&sb, uap->ub, sizeof (sb)); return (error); } /* * Implementation of the NetBSD stat() function. * XXX This should probably be collapsed with the FreeBSD version, * as the differences are only due to vn_stat() clearing spares at * the end of the structures. vn_stat could be split to avoid this, * and thus collapse the following to close to zero code. */ void cvtnstat(sb, nsb) struct stat *sb; struct nstat *nsb; { bzero(nsb, sizeof *nsb); nsb->st_dev = sb->st_dev; nsb->st_ino = sb->st_ino; nsb->st_mode = sb->st_mode; nsb->st_nlink = sb->st_nlink; nsb->st_uid = sb->st_uid; nsb->st_gid = sb->st_gid; nsb->st_rdev = sb->st_rdev; nsb->st_atimespec = sb->st_atimespec; nsb->st_mtimespec = sb->st_mtimespec; nsb->st_ctimespec = sb->st_ctimespec; nsb->st_size = sb->st_size; nsb->st_blocks = sb->st_blocks; nsb->st_blksize = sb->st_blksize; nsb->st_flags = sb->st_flags; nsb->st_gen = sb->st_gen; nsb->st_birthtimespec = sb->st_birthtimespec; } #ifndef _SYS_SYSPROTO_H_ struct nstat_args { char *path; struct nstat *ub; }; #endif /* ARGSUSED */ int nstat(td, uap) struct thread *td; register struct nstat_args /* { char *path; struct nstat *ub; } */ *uap; { struct stat sb; struct nstat nsb; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td); vput(nd.ni_vp); if (error) return (error); cvtnstat(&sb, &nsb); error = copyout(&nsb, uap->ub, sizeof (nsb)); return (error); } /* * NetBSD lstat. 
Get file status; this version does not follow links. */ #ifndef _SYS_SYSPROTO_H_ struct lstat_args { char *path; struct stat *ub; }; #endif /* ARGSUSED */ int nlstat(td, uap) struct thread *td; register struct nlstat_args /* { char *path; struct nstat *ub; } */ *uap; { int error; struct vnode *vp; struct stat sb; struct nstat nsb; struct nameidata nd; NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td); vput(vp); if (error) return (error); cvtnstat(&sb, &nsb); error = copyout(&nsb, uap->ub, sizeof (nsb)); return (error); } /* * Get configurable pathname variables. */ #ifndef _SYS_SYSPROTO_H_ struct pathconf_args { char *path; int name; }; #endif /* ARGSUSED */ int pathconf(td, uap) struct thread *td; register struct pathconf_args /* { char *path; int name; } */ *uap; { int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); /* If asynchronous I/O is available, it works for all files. */ if (uap->name == _PC_ASYNC_IO) td->td_retval[0] = async_io_version; else error = VOP_PATHCONF(nd.ni_vp, uap->name, td->td_retval); vput(nd.ni_vp); return (error); } /* * Return target name of a symbolic link. */ #ifndef _SYS_SYSPROTO_H_ struct readlink_args { char *path; char *buf; int count; }; #endif /* ARGSUSED */ int readlink(td, uap) struct thread *td; register struct readlink_args /* { char *path; char *buf; int count; } */ *uap; { return (kern_readlink(td, uap->path, UIO_USERSPACE, uap->buf, UIO_USERSPACE, uap->count)); } int kern_readlink(struct thread *td, char *path, enum uio_seg pathseg, char *buf, enum uio_seg bufseg, int count) { register struct vnode *vp; struct iovec aiov; struct uio auio; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; #ifdef MAC error = mac_check_vnode_readlink(td->td_ucred, vp); if (error) { vput(vp); return (error); } #endif if (vp->v_type != VLNK) error = EINVAL; else { aiov.iov_base = buf; aiov.iov_len = count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = bufseg; auio.uio_td = td; auio.uio_resid = count; error = VOP_READLINK(vp, &auio, td->td_ucred); } vput(vp); td->td_retval[0] = count - auio.uio_resid; return (error); } /* * Common implementation code for chflags() and fchflags(). */ static int setfflags(td, vp, flags) struct thread *td; struct vnode *vp; int flags; { int error; struct mount *mp; struct vattr vattr; /* * Prevent non-root users from setting flags on devices. When * a device is reused, users can retain ownership of the device * if they are allowed to set flags and programs assume that * chown can't fail when done as root. 
*/ if (vp->v_type == VCHR || vp->v_type == VBLK) { error = suser_cred(td->td_ucred, PRISON_ROOT); if (error) return (error); } if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); VATTR_NULL(&vattr); vattr.va_flags = flags; #ifdef MAC error = mac_check_vnode_setflags(td->td_ucred, vp, vattr.va_flags); if (error == 0) #endif error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return (error); } /* * Change flags of a file given a path name. */ #ifndef _SYS_SYSPROTO_H_ struct chflags_args { char *path; int flags; }; #endif /* ARGSUSED */ int chflags(td, uap) struct thread *td; register struct chflags_args /* { char *path; int flags; } */ *uap; { int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfflags(td, nd.ni_vp, uap->flags); vrele(nd.ni_vp); return error; } /* * Same as chflags() but doesn't follow symlinks. */ int lchflags(td, uap) struct thread *td; register struct lchflags_args /* { char *path; int flags; } */ *uap; { int error; struct nameidata nd; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfflags(td, nd.ni_vp, uap->flags); vrele(nd.ni_vp); return error; } /* * Change flags of a file given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchflags_args { int fd; int flags; }; #endif /* ARGSUSED */ int fchflags(td, uap) struct thread *td; register struct fchflags_args /* { int fd; int flags; } */ *uap; { struct file *fp; int error; if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); error = setfflags(td, fp->f_vnode, uap->flags); fdrop(fp, td); return (error); } /* * Common implementation code for chmod(), lchmod() and fchmod(). */ static int setfmode(td, vp, mode) struct thread *td; struct vnode *vp; int mode; { int error; struct mount *mp; struct vattr vattr; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); VATTR_NULL(&vattr); vattr.va_mode = mode & ALLPERMS; #ifdef MAC error = mac_check_vnode_setmode(td->td_ucred, vp, vattr.va_mode); if (error == 0) #endif error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return error; } /* * Change mode of a file given path name. */ #ifndef _SYS_SYSPROTO_H_ struct chmod_args { char *path; int mode; }; #endif /* ARGSUSED */ int chmod(td, uap) struct thread *td; register struct chmod_args /* { char *path; int mode; } */ *uap; { return (kern_chmod(td, uap->path, UIO_USERSPACE, uap->mode)); } int kern_chmod(struct thread *td, char *path, enum uio_seg pathseg, int mode) { int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfmode(td, nd.ni_vp, mode); vrele(nd.ni_vp); return error; } /* * Change mode of a file given path name (don't follow links.) 
*/ #ifndef _SYS_SYSPROTO_H_ struct lchmod_args { char *path; int mode; }; #endif /* ARGSUSED */ int lchmod(td, uap) struct thread *td; register struct lchmod_args /* { char *path; int mode; } */ *uap; { int error; struct nameidata nd; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfmode(td, nd.ni_vp, uap->mode); vrele(nd.ni_vp); return error; } /* * Change mode of a file given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchmod_args { int fd; int mode; }; #endif /* ARGSUSED */ int fchmod(td, uap) struct thread *td; register struct fchmod_args /* { int fd; int mode; } */ *uap; { struct file *fp; int error; if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); error = setfmode(td, fp->f_vnode, uap->mode); fdrop(fp, td); return (error); } /* * Common implementation for chown(), lchown(), and fchown() */ static int setfown(td, vp, uid, gid) struct thread *td; struct vnode *vp; uid_t uid; gid_t gid; { int error; struct mount *mp; struct vattr vattr; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); VATTR_NULL(&vattr); vattr.va_uid = uid; vattr.va_gid = gid; #ifdef MAC error = mac_check_vnode_setowner(td->td_ucred, vp, vattr.va_uid, vattr.va_gid); if (error == 0) #endif error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return error; } /* * Set ownership given a path name. */ #ifndef _SYS_SYSPROTO_H_ struct chown_args { char *path; int uid; int gid; }; #endif /* ARGSUSED */ int chown(td, uap) struct thread *td; register struct chown_args /* { char *path; int uid; int gid; } */ *uap; { return (kern_chown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid)); } int kern_chown(struct thread *td, char *path, enum uio_seg pathseg, int uid, int gid) { int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfown(td, nd.ni_vp, uid, gid); vrele(nd.ni_vp); return (error); } /* * Set ownership given a path name, do not cross symlinks. */ #ifndef _SYS_SYSPROTO_H_ struct lchown_args { char *path; int uid; int gid; }; #endif /* ARGSUSED */ int lchown(td, uap) struct thread *td; register struct lchown_args /* { char *path; int uid; int gid; } */ *uap; { return (kern_lchown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid)); } int kern_lchown(struct thread *td, char *path, enum uio_seg pathseg, int uid, int gid) { int error; struct nameidata nd; NDINIT(&nd, LOOKUP, NOFOLLOW, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfown(td, nd.ni_vp, uid, gid); vrele(nd.ni_vp); return (error); } /* * Set ownership given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchown_args { int fd; int uid; int gid; }; #endif /* ARGSUSED */ int fchown(td, uap) struct thread *td; register struct fchown_args /* { int fd; int uid; int gid; } */ *uap; { struct file *fp; int error; if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); error = setfown(td, fp->f_vnode, uap->uid, uap->gid); fdrop(fp, td); return (error); } /* * Common implementation code for utimes(), lutimes(), and futimes(). 
*/ static int getutimes(usrtvp, tvpseg, tsp) const struct timeval *usrtvp; enum uio_seg tvpseg; struct timespec *tsp; { struct timeval tv[2]; const struct timeval *tvp; int error; if (usrtvp == NULL) { microtime(&tv[0]); TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]); tsp[1] = tsp[0]; } else { if (tvpseg == UIO_SYSSPACE) { tvp = usrtvp; } else { if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0) return (error); tvp = tv; } TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]); TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]); } return 0; } /* * Common implementation code for utimes(), lutimes(), and futimes(). */ static int setutimes(td, vp, ts, numtimes, nullflag) struct thread *td; struct vnode *vp; const struct timespec *ts; int numtimes; int nullflag; { int error, setbirthtime; struct mount *mp; struct vattr vattr; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); setbirthtime = 0; if (numtimes < 3 && VOP_GETATTR(vp, &vattr, td->td_ucred, td) == 0 && timespeccmp(&ts[1], &vattr.va_birthtime, < )) setbirthtime = 1; VATTR_NULL(&vattr); vattr.va_atime = ts[0]; vattr.va_mtime = ts[1]; if (setbirthtime) vattr.va_birthtime = ts[1]; if (numtimes > 2) vattr.va_birthtime = ts[2]; if (nullflag) vattr.va_vaflags |= VA_UTIMES_NULL; #ifdef MAC error = mac_check_vnode_setutimes(td->td_ucred, vp, vattr.va_atime, vattr.va_mtime); #endif if (error == 0) error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return error; } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct utimes_args { char *path; struct timeval *tptr; }; #endif /* ARGSUSED */ int utimes(td, uap) struct thread *td; register struct utimes_args /* { char *path; struct timeval *tptr; } */ *uap; { return (kern_utimes(td, uap->path, UIO_USERSPACE, uap->tptr, UIO_USERSPACE)); } int kern_utimes(struct thread *td, char *path, enum uio_seg pathseg, struct timeval *tptr, enum uio_seg tptrseg) { struct timespec ts[2]; int error; struct nameidata nd; if ((error = getutimes(tptr, tptrseg, ts)) != 0) return (error); NDINIT(&nd, LOOKUP, FOLLOW, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL); vrele(nd.ni_vp); return (error); } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct lutimes_args { char *path; struct timeval *tptr; }; #endif /* ARGSUSED */ int lutimes(td, uap) struct thread *td; register struct lutimes_args /* { char *path; struct timeval *tptr; } */ *uap; { return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr, UIO_USERSPACE)); } int kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg, struct timeval *tptr, enum uio_seg tptrseg) { struct timespec ts[2]; int error; struct nameidata nd; if ((error = getutimes(tptr, tptrseg, ts)) != 0) return (error); NDINIT(&nd, LOOKUP, NOFOLLOW, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL); vrele(nd.ni_vp); return (error); } /* * Set the access and modification times of a file. 
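For illustration only (not part of this change): passing a NULL timeval pointer makes getutimes() above substitute the current time and setutimes() set VA_UTIMES_NULL, so the call behaves like touch(1). A minimal userland sketch; /tmp/somefile is only an example path:

#include <sys/time.h>
#include <stdio.h>

int
main(void)
{
	if (utimes("/tmp/somefile", NULL) != 0) {
		perror("utimes");
		return (1);
	}
	return (0);
}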
*/ #ifndef _SYS_SYSPROTO_H_ struct futimes_args { int fd; struct timeval *tptr; }; #endif /* ARGSUSED */ int futimes(td, uap) struct thread *td; register struct futimes_args /* { int fd; struct timeval *tptr; } */ *uap; { return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE)); } int kern_futimes(struct thread *td, int fd, struct timeval *tptr, enum uio_seg tptrseg) { struct timespec ts[2]; struct file *fp; int error; if ((error = getutimes(tptr, tptrseg, ts)) != 0) return (error); if ((error = getvnode(td->td_proc->p_fd, fd, &fp)) != 0) return (error); error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL); fdrop(fp, td); return (error); } /* * Truncate a file given its path name. */ #ifndef _SYS_SYSPROTO_H_ struct truncate_args { char *path; int pad; off_t length; }; #endif /* ARGSUSED */ int truncate(td, uap) struct thread *td; register struct truncate_args /* { char *path; int pad; off_t length; } */ *uap; { return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length)); } int kern_truncate(struct thread *td, char *path, enum uio_seg pathseg, off_t length) { struct mount *mp; struct vnode *vp; struct vattr vattr; int error; struct nameidata nd; if (length < 0) return(EINVAL); NDINIT(&nd, LOOKUP, FOLLOW, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { vrele(vp); return (error); } NDFREE(&nd, NDF_ONLY_PNBUF); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (vp->v_type == VDIR) error = EISDIR; #ifdef MAC else if ((error = mac_check_vnode_write(td->td_ucred, NOCRED, vp))) { } #endif else if ((error = vn_writechk(vp)) == 0 && (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) { VATTR_NULL(&vattr); vattr.va_size = length; error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); } vput(vp); vn_finished_write(mp); return (error); } /* * Truncate a file given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct ftruncate_args { int fd; int pad; off_t length; }; #endif /* ARGSUSED */ int ftruncate(td, uap) struct thread *td; register struct ftruncate_args /* { int fd; int pad; off_t length; } */ *uap; { struct mount *mp; struct vattr vattr; struct vnode *vp; struct file *fp; int error; if (uap->length < 0) return(EINVAL); if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); if ((fp->f_flag & FWRITE) == 0) { fdrop(fp, td); return (EINVAL); } vp = fp->f_vnode; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { fdrop(fp, td); return (error); } VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (vp->v_type == VDIR) error = EISDIR; #ifdef MAC else if ((error = mac_check_vnode_write(td->td_ucred, fp->f_cred, vp))) { } #endif else if ((error = vn_writechk(vp)) == 0) { VATTR_NULL(&vattr); vattr.va_size = uap->length; error = VOP_SETATTR(vp, &vattr, fp->f_cred, td); } VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); fdrop(fp, td); return (error); } #if defined(COMPAT_43) || defined(COMPAT_SUNOS) /* * Truncate a file given its path name. */ #ifndef _SYS_SYSPROTO_H_ struct otruncate_args { char *path; long length; }; #endif /* ARGSUSED */ int otruncate(td, uap) struct thread *td; register struct otruncate_args /* { char *path; long length; } */ *uap; { struct truncate_args /* { char *path; int pad; off_t length; } */ nuap; nuap.path = uap->path; nuap.length = uap->length; return (truncate(td, &nuap)); } /* * Truncate a file given a file descriptor. 
*/ #ifndef _SYS_SYSPROTO_H_ struct oftruncate_args { int fd; long length; }; #endif /* ARGSUSED */ int oftruncate(td, uap) struct thread *td; register struct oftruncate_args /* { int fd; long length; } */ *uap; { struct ftruncate_args /* { int fd; int pad; off_t length; } */ nuap; nuap.fd = uap->fd; nuap.length = uap->length; return (ftruncate(td, &nuap)); } #endif /* COMPAT_43 || COMPAT_SUNOS */ /* * Sync an open file. */ #ifndef _SYS_SYSPROTO_H_ struct fsync_args { int fd; }; #endif /* ARGSUSED */ int fsync(td, uap) struct thread *td; struct fsync_args /* { int fd; } */ *uap; { struct vnode *vp; struct mount *mp; struct file *fp; vm_object_t obj; int error; GIANT_REQUIRED; if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); vp = fp->f_vnode; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { fdrop(fp, td); return (error); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (VOP_GETVOBJECT(vp, &obj) == 0) { VM_OBJECT_LOCK(obj); vm_object_page_clean(obj, 0, 0, 0); VM_OBJECT_UNLOCK(obj); } error = VOP_FSYNC(vp, fp->f_cred, MNT_WAIT, td); if (error == 0 && vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP) && softdep_fsync_hook != NULL) error = (*softdep_fsync_hook)(vp); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); fdrop(fp, td); return (error); } /* * Rename files. Source and destination must either both be directories, * or both not be directories. If target is a directory, it must be empty. */ #ifndef _SYS_SYSPROTO_H_ struct rename_args { char *from; char *to; }; #endif /* ARGSUSED */ int rename(td, uap) struct thread *td; register struct rename_args /* { char *from; char *to; } */ *uap; { return (kern_rename(td, uap->from, uap->to, UIO_USERSPACE)); } int kern_rename(struct thread *td, char *from, char *to, enum uio_seg pathseg) { struct mount *mp = NULL; struct vnode *tvp, *fvp, *tdvp; struct nameidata fromnd, tond; int error; bwillwrite(); #ifdef MAC NDINIT(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART, pathseg, from, td); #else NDINIT(&fromnd, DELETE, WANTPARENT | SAVESTART, pathseg, from, td); #endif if ((error = namei(&fromnd)) != 0) return (error); #ifdef MAC error = mac_check_vnode_rename_from(td->td_ucred, fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd); VOP_UNLOCK(fromnd.ni_dvp, 0, td); VOP_UNLOCK(fromnd.ni_vp, 0, td); #endif fvp = fromnd.ni_vp; if (error == 0) error = vn_start_write(fvp, &mp, V_WAIT | PCATCH); if (error != 0) { NDFREE(&fromnd, NDF_ONLY_PNBUF); vrele(fromnd.ni_dvp); vrele(fvp); goto out1; } NDINIT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | NOOBJ, pathseg, to, td); if (fromnd.ni_vp->v_type == VDIR) tond.ni_cnd.cn_flags |= WILLBEDIR; if ((error = namei(&tond)) != 0) { /* Translate error code for rename("dir1", "dir2/."). */ if (error == EISDIR && fvp->v_type == VDIR) error = EINVAL; NDFREE(&fromnd, NDF_ONLY_PNBUF); vrele(fromnd.ni_dvp); vrele(fvp); goto out1; } tdvp = tond.ni_dvp; tvp = tond.ni_vp; if (tvp != NULL) { if (fvp->v_type == VDIR && tvp->v_type != VDIR) { error = ENOTDIR; goto out; } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { error = EISDIR; goto out; } } if (fvp == tdvp) error = EINVAL; /* * If the source is the same as the destination (that is, if they * are links to the same vnode), then there is nothing to do. 
*/ if (fvp == tvp) error = -1; #ifdef MAC else error = mac_check_vnode_rename_to(td->td_ucred, tdvp, tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd); #endif out: if (!error) { VOP_LEASE(tdvp, td, td->td_ucred, LEASE_WRITE); if (fromnd.ni_dvp != tdvp) { VOP_LEASE(fromnd.ni_dvp, td, td->td_ucred, LEASE_WRITE); } if (tvp) { VOP_LEASE(tvp, td, td->td_ucred, LEASE_WRITE); } error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, tond.ni_dvp, tond.ni_vp, &tond.ni_cnd); NDFREE(&fromnd, NDF_ONLY_PNBUF); NDFREE(&tond, NDF_ONLY_PNBUF); } else { NDFREE(&fromnd, NDF_ONLY_PNBUF); NDFREE(&tond, NDF_ONLY_PNBUF); if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); vrele(fromnd.ni_dvp); vrele(fvp); } vrele(tond.ni_startdir); ASSERT_VOP_UNLOCKED(fromnd.ni_dvp, "rename"); ASSERT_VOP_UNLOCKED(fromnd.ni_vp, "rename"); ASSERT_VOP_UNLOCKED(tond.ni_dvp, "rename"); ASSERT_VOP_UNLOCKED(tond.ni_vp, "rename"); out1: vn_finished_write(mp); if (fromnd.ni_startdir) vrele(fromnd.ni_startdir); if (error == -1) return (0); return (error); } /* * Make a directory file. */ #ifndef _SYS_SYSPROTO_H_ struct mkdir_args { char *path; int mode; }; #endif /* ARGSUSED */ int mkdir(td, uap) struct thread *td; register struct mkdir_args /* { char *path; int mode; } */ *uap; { return (kern_mkdir(td, uap->path, UIO_USERSPACE, uap->mode)); } int kern_mkdir(struct thread *td, char *path, enum uio_seg segflg, int mode) { struct mount *mp; struct vnode *vp; struct vattr vattr; int error; struct nameidata nd; restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME, segflg, path, td); nd.ni_cnd.cn_flags |= WILLBEDIR; if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if (vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); vrele(vp); /* * XXX namei called with LOCKPARENT but not LOCKLEAF has * the strange behaviour of leaving the vnode unlocked * if the target is the same vnode as the parent. */ if (vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); vattr.va_type = VDIR; FILEDESC_LOCK(td->td_proc->p_fd); vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask; FILEDESC_UNLOCK(td->td_proc->p_fd); #ifdef MAC error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); if (error) goto out; #endif VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); #ifdef MAC out: #endif NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (!error) vput(nd.ni_vp); vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mkdir"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "mkdir"); return (error); } /* * Remove a directory file. */ #ifndef _SYS_SYSPROTO_H_ struct rmdir_args { char *path; }; #endif /* ARGSUSED */ int rmdir(td, uap) struct thread *td; struct rmdir_args /* { char *path; } */ *uap; { return (kern_rmdir(td, uap->path, UIO_USERSPACE)); } int kern_rmdir(struct thread *td, char *path, enum uio_seg pathseg) { struct mount *mp; struct vnode *vp; int error; struct nameidata nd; restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if (vp->v_type != VDIR) { error = ENOTDIR; goto out; } /* * No rmdir "." please. */ if (nd.ni_dvp == vp) { error = EINVAL; goto out; } /* * The root of a mounted filesystem cannot be deleted. 
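For illustration only (not part of this change): when source and target resolve to the same vnode, kern_rename() above takes the error = -1 shortcut and rename(2) returns success without calling VOP_RENAME(), so renaming one hard link over another link to the same file leaves both names in place (the POSIX-mandated behavior). A minimal sketch; the names "a" and "b" are only examples:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd;

	if ((fd = open("a", O_WRONLY | O_CREAT, 0644)) == -1)
		return (1);
	close(fd);
	if (link("a", "b") == -1)	/* "a" and "b" now share one vnode */
		return (1);
	if (rename("a", "b") == 0)	/* no-op: both names still exist */
		printf("rename of two links to the same file succeeded\n");
	return (0);
}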
*/ if (vp->v_vflag & VV_ROOT) { error = EBUSY; goto out; } #ifdef MAC error = mac_check_vnode_delete(td->td_ucred, nd.ni_dvp, vp, &nd.ni_cnd); if (error) goto out; #endif if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vput(vp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); vn_finished_write(mp); out: NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vput(vp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "rmdir"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "rmdir"); return (error); } #ifdef COMPAT_43 /* * Read a block of directory entries in a filesystem independent format. */ #ifndef _SYS_SYSPROTO_H_ struct ogetdirentries_args { int fd; char *buf; u_int count; long *basep; }; #endif int ogetdirentries(td, uap) struct thread *td; register struct ogetdirentries_args /* { int fd; char *buf; u_int count; long *basep; } */ *uap; { struct vnode *vp; struct file *fp; struct uio auio, kuio; struct iovec aiov, kiov; struct dirent *dp, *edp; caddr_t dirbuf; int error, eofflag, readcnt; long loff; /* XXX arbitrary sanity limit on `count'. */ if (uap->count > 64 * 1024) return (EINVAL); if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); if ((fp->f_flag & FREAD) == 0) { fdrop(fp, td); return (EBADF); } vp = fp->f_vnode; unionread: if (vp->v_type != VDIR) { fdrop(fp, td); return (EINVAL); } aiov.iov_base = uap->buf; aiov.iov_len = uap->count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auio.uio_resid = uap->count; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); loff = auio.uio_offset = fp->f_offset; #ifdef MAC error = mac_check_vnode_readdir(td->td_ucred, vp); if (error) { VOP_UNLOCK(vp, 0, td); fdrop(fp, td); return (error); } #endif # if (BYTE_ORDER != LITTLE_ENDIAN) if (vp->v_mount->mnt_maxsymlinklen <= 0) { error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL); fp->f_offset = auio.uio_offset; } else # endif { kuio = auio; kuio.uio_iov = &kiov; kuio.uio_segflg = UIO_SYSSPACE; kiov.iov_len = uap->count; MALLOC(dirbuf, caddr_t, uap->count, M_TEMP, M_WAITOK); kiov.iov_base = dirbuf; error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag, NULL, NULL); fp->f_offset = kuio.uio_offset; if (error == 0) { readcnt = uap->count - kuio.uio_resid; edp = (struct dirent *)&dirbuf[readcnt]; for (dp = (struct dirent *)dirbuf; dp < edp; ) { # if (BYTE_ORDER == LITTLE_ENDIAN) /* * The expected low byte of * dp->d_namlen is our dp->d_type. * The high MBZ byte of dp->d_namlen * is our dp->d_namlen. */ dp->d_type = dp->d_namlen; dp->d_namlen = 0; # else /* * The dp->d_type is the high byte * of the expected dp->d_namlen, * so must be zero'ed. */ dp->d_type = 0; # endif if (dp->d_reclen > 0) { dp = (struct dirent *) ((char *)dp + dp->d_reclen); } else { error = EIO; break; } } if (dp >= edp) error = uiomove(dirbuf, readcnt, &auio); } FREE(dirbuf, M_TEMP); } VOP_UNLOCK(vp, 0, td); if (error) { fdrop(fp, td); return (error); } if (uap->count == auio.uio_resid) { if (union_dircheckp) { error = union_dircheckp(td, &vp, fp); if (error == -1) goto unionread; if (error) { fdrop(fp, td); return (error); } } /* * XXX We could delay dropping the lock above but * union_dircheckp complicates things. 
*/ vn_lock(vp, LK_EXCLUSIVE|LK_RETRY, td); if ((vp->v_vflag & VV_ROOT) && (vp->v_mount->mnt_flag & MNT_UNION)) { struct vnode *tvp = vp; vp = vp->v_mount->mnt_vnodecovered; VREF(vp); fp->f_vnode = vp; fp->f_data = vp; fp->f_offset = 0; vput(tvp); goto unionread; } VOP_UNLOCK(vp, 0, td); } error = copyout(&loff, uap->basep, sizeof(long)); fdrop(fp, td); td->td_retval[0] = uap->count - auio.uio_resid; return (error); } #endif /* COMPAT_43 */ /* * Read a block of directory entries in a filesystem independent format. */ #ifndef _SYS_SYSPROTO_H_ struct getdirentries_args { int fd; char *buf; u_int count; long *basep; }; #endif int getdirentries(td, uap) struct thread *td; register struct getdirentries_args /* { int fd; char *buf; u_int count; long *basep; } */ *uap; { struct vnode *vp; struct file *fp; struct uio auio; struct iovec aiov; long loff; int error, eofflag; if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); if ((fp->f_flag & FREAD) == 0) { fdrop(fp, td); return (EBADF); } vp = fp->f_vnode; unionread: if (vp->v_type != VDIR) { fdrop(fp, td); return (EINVAL); } aiov.iov_base = uap->buf; aiov.iov_len = uap->count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auio.uio_resid = uap->count; /* vn_lock(vp, LK_SHARED | LK_RETRY, td); */ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); loff = auio.uio_offset = fp->f_offset; #ifdef MAC error = mac_check_vnode_readdir(td->td_ucred, vp); if (error == 0) #endif error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL); fp->f_offset = auio.uio_offset; VOP_UNLOCK(vp, 0, td); if (error) { fdrop(fp, td); return (error); } if (uap->count == auio.uio_resid) { if (union_dircheckp) { error = union_dircheckp(td, &vp, fp); if (error == -1) goto unionread; if (error) { fdrop(fp, td); return (error); } } /* * XXX We could delay dropping the lock above but * union_dircheckp complicates things. */ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if ((vp->v_vflag & VV_ROOT) && (vp->v_mount->mnt_flag & MNT_UNION)) { struct vnode *tvp = vp; vp = vp->v_mount->mnt_vnodecovered; VREF(vp); fp->f_vnode = vp; fp->f_data = vp; fp->f_offset = 0; vput(tvp); goto unionread; } VOP_UNLOCK(vp, 0, td); } if (uap->basep != NULL) { error = copyout(&loff, uap->basep, sizeof(long)); } td->td_retval[0] = uap->count - auio.uio_resid; fdrop(fp, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct getdents_args { int fd; char *buf; size_t count; }; #endif int getdents(td, uap) struct thread *td; register struct getdents_args /* { int fd; char *buf; u_int count; } */ *uap; { struct getdirentries_args ap; ap.fd = uap->fd; ap.buf = uap->buf; ap.count = uap->count; ap.basep = NULL; return getdirentries(td, &ap); } /* * Set the mode mask for creation of filesystem nodes. * * MP SAFE */ #ifndef _SYS_SYSPROTO_H_ struct umask_args { int newmask; }; #endif int umask(td, uap) struct thread *td; struct umask_args /* { int newmask; } */ *uap; { register struct filedesc *fdp; FILEDESC_LOCK(td->td_proc->p_fd); fdp = td->td_proc->p_fd; td->td_retval[0] = fdp->fd_cmask; fdp->fd_cmask = uap->newmask & ALLPERMS; FILEDESC_UNLOCK(td->td_proc->p_fd); return (0); } /* * Void all references to file by ripping underlying filesystem * away from vnode. 
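As a point of reference for the getdirentries() implementation above (an illustrative sketch, not part of this change): userland consumes the same packed, variable-length struct dirent records and advances through them by d_reclen, just as the COMPAT_43 shim does in-kernel. The buffer size and error handling below are arbitrary.

#include <sys/types.h>
#include <dirent.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
list_names(const char *path)
{
	char buf[4096], *cp;
	long base;
	int fd, n;

	if ((fd = open(path, O_RDONLY)) < 0)
		return (-1);
	/* Each call yields packed struct dirent records; walk them by d_reclen. */
	while ((n = getdirentries(fd, buf, sizeof(buf), &base)) > 0) {
		for (cp = buf; cp < buf + n; ) {
			struct dirent *dp = (struct dirent *)cp;

			if (dp->d_reclen == 0)
				break;		/* defensive, as the kernel loop is */
			if (dp->d_fileno != 0)
				printf("%.*s\n", (int)dp->d_namlen, dp->d_name);
			cp += dp->d_reclen;
		}
	}
	(void)close(fd);
	return (n < 0 ? -1 : 0);
}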
*/ #ifndef _SYS_SYSPROTO_H_ struct revoke_args { char *path; }; #endif /* ARGSUSED */ int revoke(td, uap) struct thread *td; register struct revoke_args /* { char *path; } */ *uap; { struct mount *mp; struct vnode *vp; struct vattr vattr; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (vp->v_type != VCHR) { vput(vp); return (EINVAL); } #ifdef MAC error = mac_check_vnode_revoke(td->td_ucred, vp); if (error) { vput(vp); return (error); } #endif error = VOP_GETATTR(vp, &vattr, td->td_ucred, td); if (error) { vput(vp); return (error); } VOP_UNLOCK(vp, 0, td); if (td->td_ucred->cr_uid != vattr.va_uid) { error = suser_cred(td->td_ucred, PRISON_ROOT); if (error) goto out; } if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto out; if (vcount(vp) > 1) VOP_REVOKE(vp, REVOKEALL); vn_finished_write(mp); out: vrele(vp); return (error); } /* * Convert a user file descriptor to a kernel file entry. * The file entry is locked upon returning. */ int getvnode(fdp, fd, fpp) struct filedesc *fdp; int fd; struct file **fpp; { int error; struct file *fp; fp = NULL; if (fdp == NULL) error = EBADF; else { FILEDESC_LOCK(fdp); if ((u_int)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) error = EBADF; else if (fp->f_vnode == NULL) { fp = NULL; error = EINVAL; } else { fhold(fp); error = 0; } FILEDESC_UNLOCK(fdp); } *fpp = fp; return (error); } /* * Get (NFS) file handle */ #ifndef _SYS_SYSPROTO_H_ struct getfh_args { char *fname; fhandle_t *fhp; }; #endif int getfh(td, uap) struct thread *td; register struct getfh_args *uap; { struct nameidata nd; fhandle_t fh; register struct vnode *vp; int error; /* * Must be super user */ error = suser(td); if (error) return (error); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->fname, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; bzero(&fh, sizeof(fh)); fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid; error = VFS_VPTOFH(vp, &fh.fh_fid); vput(vp); if (error) return (error); error = copyout(&fh, uap->fhp, sizeof (fh)); return (error); } /* * syscall for the rpc.lockd to use to translate a NFS file handle into * an open descriptor. * * warning: do not remove the suser() call or this becomes one giant * security hole. */ #ifndef _SYS_SYSPROTO_H_ struct fhopen_args { const struct fhandle *u_fhp; int flags; }; #endif int fhopen(td, uap) struct thread *td; struct fhopen_args /* { const struct fhandle *u_fhp; int flags; } */ *uap; { struct proc *p = td->td_proc; struct mount *mp; struct vnode *vp; struct fhandle fhp; struct vattr vat; struct vattr *vap = &vat; struct flock lf; struct file *fp; register struct filedesc *fdp = p->p_fd; int fmode, mode, error, type; struct file *nfp; int indx; /* * Must be super user */ error = suser(td); if (error) return (error); fmode = FFLAGS(uap->flags); /* why not allow a non-read/write open for our lockd? */ if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT)) return (EINVAL); error = copyin(uap->u_fhp, &fhp, sizeof(fhp)); if (error) return(error); /* find the mount point */ mp = vfs_getvfs(&fhp.fh_fsid); if (mp == NULL) return (ESTALE); /* now give me my vnode, it gets returned to me locked */ error = VFS_FHTOVP(mp, &fhp.fh_fid, &vp); if (error) return (error); /* * from now on we have to make sure not * to forget about the vnode * any error that causes an abort must vput(vp) * just set error = err and 'goto bad;'. 
*/ /* * from vn_open */ if (vp->v_type == VLNK) { error = EMLINK; goto bad; } if (vp->v_type == VSOCK) { error = EOPNOTSUPP; goto bad; } mode = 0; if (fmode & (FWRITE | O_TRUNC)) { if (vp->v_type == VDIR) { error = EISDIR; goto bad; } error = vn_writechk(vp); if (error) goto bad; mode |= VWRITE; } if (fmode & FREAD) mode |= VREAD; if (fmode & O_APPEND) mode |= VAPPEND; #ifdef MAC error = mac_check_vnode_open(td->td_ucred, vp, mode); if (error) goto bad; #endif if (mode) { error = VOP_ACCESS(vp, mode, td->td_ucred, td); if (error) goto bad; } if (fmode & O_TRUNC) { VOP_UNLOCK(vp, 0, td); /* XXX */ if ((error = vn_start_write(NULL, &mp, V_WAIT | PCATCH)) != 0) { vrele(vp); return (error); } VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); /* XXX */ #ifdef MAC /* * We don't yet have fp->f_cred, so use td->td_ucred, which * should be right. */ error = mac_check_vnode_write(td->td_ucred, td->td_ucred, vp); if (error == 0) { #endif VATTR_NULL(vap); vap->va_size = 0; error = VOP_SETATTR(vp, vap, td->td_ucred, td); #ifdef MAC } #endif vn_finished_write(mp); if (error) goto bad; } error = VOP_OPEN(vp, fmode, td->td_ucred, td, -1); if (error) goto bad; /* * Make sure that a VM object is created for VMIO support. */ if (vn_canvmio(vp) == TRUE) { if ((error = vfs_object_create(vp, td, td->td_ucred)) != 0) goto bad; } if (fmode & FWRITE) vp->v_writecount++; /* * end of vn_open code */ if ((error = falloc(td, &nfp, &indx)) != 0) { if (fmode & FWRITE) vp->v_writecount--; goto bad; } /* An extra reference on `nfp' has been held for us by falloc(). */ fp = nfp; nfp->f_vnode = vp; nfp->f_data = vp; nfp->f_flag = fmode & FMASK; nfp->f_ops = &vnops; nfp->f_type = DTYPE_VNODE; if (fmode & (O_EXLOCK | O_SHLOCK)) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (fmode & O_EXLOCK) lf.l_type = F_WRLCK; else lf.l_type = F_RDLCK; type = F_FLOCK; if ((fmode & FNONBLOCK) == 0) type |= F_WAIT; VOP_UNLOCK(vp, 0, td); if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) { /* * The lock request failed. Normally close the * descriptor but handle the case where someone might * have dup()d or close()d it when we weren't looking. */ FILEDESC_LOCK(fdp); if (fdp->fd_ofiles[indx] == fp) { fdp->fd_ofiles[indx] = NULL; FILEDESC_UNLOCK(fdp); fdrop(fp, td); } else FILEDESC_UNLOCK(fdp); /* * release our private reference */ fdrop(fp, td); return(error); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); fp->f_flag |= FHASLOCK; } if ((vp->v_type == VREG) && (VOP_GETVOBJECT(vp, NULL) != 0)) vfs_object_create(vp, td, td->td_ucred); VOP_UNLOCK(vp, 0, td); fdrop(fp, td); td->td_retval[0] = indx; return (0); bad: vput(vp); return (error); } /* * Stat an (NFS) file handle. */ #ifndef _SYS_SYSPROTO_H_ struct fhstat_args { struct fhandle *u_fhp; struct stat *sb; }; #endif int fhstat(td, uap) struct thread *td; register struct fhstat_args /* { struct fhandle *u_fhp; struct stat *sb; } */ *uap; { struct stat sb; fhandle_t fh; struct mount *mp; struct vnode *vp; int error; /* * Must be super user */ error = suser(td); if (error) return (error); error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); if (error) return (error); if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) return (ESTALE); if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp))) return (error); error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td); vput(vp); if (error) return (error); error = copyout(&sb, uap->sb, sizeof(sb)); return (error); } /* * Implement fstatfs() for (NFS) file handles. 
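For context on fhopen() and fhstat() above (a hedged sketch, not part of this change): the handle they consume is the one getfh() produces, so a privileged daemon in the mould of rpc.lockd can translate a path to an opaque fhandle_t once and later reopen the file by handle without walking the name space again. Both calls fail for non-root, as the suser() checks above enforce.

#include <sys/param.h>
#include <sys/mount.h>
#include <fcntl.h>

int
reopen_by_handle(const char *path)
{
	fhandle_t fh;
	int fd;

	if (getfh(path, &fh) != 0)		/* superuser only */
		return (-1);
	/* Possibly much later, or in another process: reopen by handle. */
	if ((fd = fhopen(&fh, O_RDWR)) < 0)	/* also superuser only */
		return (-1);
	return (fd);
}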
*/ #ifndef _SYS_SYSPROTO_H_ struct fhstatfs_args { struct fhandle *u_fhp; struct statfs *buf; }; #endif int fhstatfs(td, uap) struct thread *td; struct fhstatfs_args /* { struct fhandle *u_fhp; struct statfs *buf; } */ *uap; { - struct statfs *sp; + struct statfs *sp, sb; struct mount *mp; struct vnode *vp; - struct statfs sb; fhandle_t fh; int error; /* * Must be super user */ error = suser(td); if (error) return (error); if ((error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t))) != 0) return (error); if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) return (ESTALE); if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp))) return (error); mp = vp->v_mount; sp = &mp->mnt_stat; vput(vp); #ifdef MAC error = mac_check_mount_stat(td->td_ucred, mp); if (error) return (error); #endif + /* + * Set these in case the underlying filesystem fails to do so. + */ + sp->f_version = STATFS_VERSION; + sp->f_namemax = NAME_MAX; + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; if ((error = VFS_STATFS(mp, sp, td)) != 0) return (error); - sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; if (suser(td)) { bcopy(sp, &sb, sizeof(sb)); sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; sp = &sb; } return (copyout(sp, uap->buf, sizeof(*sp))); } /* * Syscall to push extended attribute configuration information into the * VFS. Accepts a path, which it converts to a mountpoint, as well as * a command (int cmd), and attribute name and misc data. For now, the * attribute name is left in userspace for consumption by the VFS_op. * It will probably be changed to be copied into sysspace by the * syscall in the future, once issues with various consumers of the * attribute code have raised their hands. * * Currently this is used only by UFS Extended Attributes. */ int extattrctl(td, uap) struct thread *td; struct extattrctl_args /* { const char *path; int cmd; const char *filename; int attrnamespace; const char *attrname; } */ *uap; { struct vnode *filename_vp; struct nameidata nd; struct mount *mp, *mp_writable; char attrname[EXTATTR_MAXNAMELEN]; int error; /* * uap->attrname is not always defined. We check again later when we * invoke the VFS call so as to pass in NULL there if needed. */ if (uap->attrname != NULL) { error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); } /* * uap->filename is not always defined. If it is, grab a vnode lock, * which VFS_EXTATTRCTL() will later release. */ filename_vp = NULL; if (uap->filename != NULL) { NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->filename, td); error = namei(&nd); if (error) return (error); filename_vp = nd.ni_vp; NDFREE(&nd, NDF_NO_VP_RELE | NDF_NO_VP_UNLOCK); } /* uap->path is always defined. */ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) { if (filename_vp != NULL) vput(filename_vp); return (error); } mp = nd.ni_vp->v_mount; error = vn_start_write(nd.ni_vp, &mp_writable, V_WAIT | PCATCH); NDFREE(&nd, 0); if (error) { if (filename_vp != NULL) vput(filename_vp); return (error); } error = VFS_EXTATTRCTL(mp, uap->cmd, filename_vp, uap->attrnamespace, uap->attrname != NULL ? attrname : NULL, td); vn_finished_write(mp_writable); /* * VFS_EXTATTRCTL will have unlocked, but not de-ref'd, * filename_vp, so vrele it if it is defined. 
*/ if (filename_vp != NULL) vrele(filename_vp); return (error); } /*- * Set a named extended attribute on a file or directory * * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", * kernelspace string pointer "attrname", userspace buffer * pointer "data", buffer length "nbytes", thread "td". * Returns: 0 on success, an error number otherwise * Locks: none * References: vp must be a valid reference for the duration of the call */ static int extattr_set_vp(struct vnode *vp, int attrnamespace, const char *attrname, void *data, size_t nbytes, struct thread *td) { struct mount *mp; struct uio auio; struct iovec aiov; ssize_t cnt; int error; error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error) return (error); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); aiov.iov_base = data; aiov.iov_len = nbytes; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; if (nbytes > INT_MAX) { error = EINVAL; goto done; } auio.uio_resid = nbytes; auio.uio_rw = UIO_WRITE; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; cnt = nbytes; #ifdef MAC error = mac_check_vnode_setextattr(td->td_ucred, vp, attrnamespace, attrname, &auio); if (error) goto done; #endif error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, td->td_ucred, td); cnt -= auio.uio_resid; td->td_retval[0] = cnt; done: VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return (error); } int extattr_set_fd(td, uap) struct thread *td; struct extattr_set_fd_args /* { int fd; int attrnamespace; const char *attrname; void *data; size_t nbytes; } */ *uap; { struct file *fp; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); error = getvnode(td->td_proc->p_fd, uap->fd, &fp); if (error) return (error); error = extattr_set_vp(fp->f_vnode, uap->attrnamespace, attrname, uap->data, uap->nbytes, td); fdrop(fp, td); return (error); } int extattr_set_file(td, uap) struct thread *td; struct extattr_set_file_args /* { const char *path; int attrnamespace; const char *attrname; void *data; size_t nbytes; } */ *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_set_vp(nd.ni_vp, uap->attrnamespace, attrname, uap->data, uap->nbytes, td); vrele(nd.ni_vp); return (error); } int extattr_set_link(td, uap) struct thread *td; struct extattr_set_link_args /* { const char *path; int attrnamespace; const char *attrname; void *data; size_t nbytes; } */ *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_set_vp(nd.ni_vp, uap->attrnamespace, attrname, uap->data, uap->nbytes, td); vrele(nd.ni_vp); return (error); } /*- * Get a named extended attribute on a file or directory * * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", * kernelspace string pointer "attrname", userspace buffer * pointer "data", buffer length "nbytes", thread "td". 
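The extattr_get_vp() routine introduced above implements a probe mode described in its body below: a NULL data pointer asks only for the attribute's current size. A hedged userland sketch of the usual two-call pattern built on that behaviour (the namespace choice and helper name are illustrative):

#include <sys/types.h>
#include <sys/extattr.h>
#include <stdlib.h>

void *
read_attr(const char *path, const char *name, ssize_t *lenp)
{
	ssize_t len;
	void *buf;

	/* Probe: a NULL buffer returns only the attribute's current size. */
	if ((len = extattr_get_file(path, EXTATTR_NAMESPACE_USER, name,
	    NULL, 0)) < 0)
		return (NULL);
	if ((buf = malloc(len)) == NULL)
		return (NULL);
	/* Fetch the data into the buffer we just sized. */
	if ((len = extattr_get_file(path, EXTATTR_NAMESPACE_USER, name,
	    buf, len)) < 0) {
		free(buf);
		return (NULL);
	}
	*lenp = len;
	return (buf);
}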
* Returns: 0 on success, an error number otherwise * Locks: none * References: vp must be a valid reference for the duration of the call */ static int extattr_get_vp(struct vnode *vp, int attrnamespace, const char *attrname, void *data, size_t nbytes, struct thread *td) { struct uio auio, *auiop; struct iovec aiov; ssize_t cnt; size_t size, *sizep; int error; /* * XXX: Temporary API compatibility for applications that know * about this hack ("" means list), but haven't been updated * for the extattr_list_*() system calls yet. This will go * away for FreeBSD 5.3. */ if (strlen(attrname) == 0) return (extattr_list_vp(vp, attrnamespace, data, nbytes, td)); VOP_LEASE(vp, td, td->td_ucred, LEASE_READ); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); /* * Slightly unusual semantics: if the user provides a NULL data * pointer, they don't want to receive the data, just the * maximum read length. */ auiop = NULL; sizep = NULL; cnt = 0; if (data != NULL) { aiov.iov_base = data; aiov.iov_len = nbytes; auio.uio_iov = &aiov; auio.uio_offset = 0; if (nbytes > INT_MAX) { error = EINVAL; goto done; } auio.uio_resid = nbytes; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auiop = &auio; cnt = nbytes; } else sizep = &size; #ifdef MAC error = mac_check_vnode_getextattr(td->td_ucred, vp, attrnamespace, attrname, &auio); if (error) goto done; #endif error = VOP_GETEXTATTR(vp, attrnamespace, attrname, auiop, sizep, td->td_ucred, td); if (auiop != NULL) { cnt -= auio.uio_resid; td->td_retval[0] = cnt; } else td->td_retval[0] = size; done: VOP_UNLOCK(vp, 0, td); return (error); } int extattr_get_fd(td, uap) struct thread *td; struct extattr_get_fd_args /* { int fd; int attrnamespace; const char *attrname; void *data; size_t nbytes; } */ *uap; { struct file *fp; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); error = getvnode(td->td_proc->p_fd, uap->fd, &fp); if (error) return (error); error = extattr_get_vp(fp->f_vnode, uap->attrnamespace, attrname, uap->data, uap->nbytes, td); fdrop(fp, td); return (error); } int extattr_get_file(td, uap) struct thread *td; struct extattr_get_file_args /* { const char *path; int attrnamespace; const char *attrname; void *data; size_t nbytes; } */ *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_get_vp(nd.ni_vp, uap->attrnamespace, attrname, uap->data, uap->nbytes, td); vrele(nd.ni_vp); return (error); } int extattr_get_link(td, uap) struct thread *td; struct extattr_get_link_args /* { const char *path; int attrnamespace; const char *attrname; void *data; size_t nbytes; } */ *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_get_vp(nd.ni_vp, uap->attrnamespace, attrname, uap->data, uap->nbytes, td); vrele(nd.ni_vp); return (error); } /* * extattr_delete_vp(): Delete a named extended attribute on a file or * directory * * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", * kernelspace string pointer 
"attrname", proc "p" * Returns: 0 on success, an error number otherwise * Locks: none * References: vp must be a valid reference for the duration of the call */ static int extattr_delete_vp(struct vnode *vp, int attrnamespace, const char *attrname, struct thread *td) { struct mount *mp; int error; error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error) return (error); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); #ifdef MAC error = mac_check_vnode_deleteextattr(td->td_ucred, vp, attrnamespace, attrname); if (error) goto done; #endif error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, td->td_ucred, td); if (error == EOPNOTSUPP) error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, td->td_ucred, td); #ifdef MAC done: #endif VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return (error); } int extattr_delete_fd(td, uap) struct thread *td; struct extattr_delete_fd_args /* { int fd; int attrnamespace; const char *attrname; } */ *uap; { struct file *fp; struct vnode *vp; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); error = getvnode(td->td_proc->p_fd, uap->fd, &fp); if (error) return (error); vp = fp->f_vnode; error = extattr_delete_vp(vp, uap->attrnamespace, attrname, td); fdrop(fp, td); return (error); } int extattr_delete_file(td, uap) struct thread *td; struct extattr_delete_file_args /* { const char *path; int attrnamespace; const char *attrname; } */ *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return(error); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return(error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_delete_vp(nd.ni_vp, uap->attrnamespace, attrname, td); vrele(nd.ni_vp); return(error); } int extattr_delete_link(td, uap) struct thread *td; struct extattr_delete_link_args /* { const char *path; int attrnamespace; const char *attrname; } */ *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return(error); NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return(error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_delete_vp(nd.ni_vp, uap->attrnamespace, attrname, td); vrele(nd.ni_vp); return(error); } /*- * Retrieve a list of extended attributes on a file or directory. * * Arguments: unlocked vnode "vp", attribute namespace 'attrnamespace", * userspace buffer pointer "data", buffer length "nbytes", * thread "td". 
* Returns: 0 on success, an error number otherwise * Locks: none * References: vp must be a valid reference for the duration of the call */ static int extattr_list_vp(struct vnode *vp, int attrnamespace, void *data, size_t nbytes, struct thread *td) { struct uio auio, *auiop; size_t size, *sizep; struct iovec aiov; ssize_t cnt; int error; VOP_LEASE(vp, td, td->td_ucred, LEASE_READ); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); auiop = NULL; sizep = NULL; cnt = 0; if (data != NULL) { aiov.iov_base = data; aiov.iov_len = nbytes; auio.uio_iov = &aiov; auio.uio_offset = 0; if (nbytes > INT_MAX) { error = EINVAL; goto done; } auio.uio_resid = nbytes; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auiop = &auio; cnt = nbytes; } else sizep = &size; #ifdef MAC error = mac_check_vnode_listextattr(td->td_ucred, vp, attrnamespace); if (error) goto done; #endif error = VOP_LISTEXTATTR(vp, attrnamespace, auiop, sizep, td->td_ucred, td); if (auiop != NULL) { cnt -= auio.uio_resid; td->td_retval[0] = cnt; } else td->td_retval[0] = size; done: VOP_UNLOCK(vp, 0, td); return (error); } int extattr_list_fd(td, uap) struct thread *td; struct extattr_list_fd_args /* { int fd; int attrnamespace; void *data; size_t nbytes; } */ *uap; { struct file *fp; int error; error = getvnode(td->td_proc->p_fd, uap->fd, &fp); if (error) return (error); error = extattr_list_vp(fp->f_vnode, uap->attrnamespace, uap->data, uap->nbytes, td); fdrop(fp, td); return (error); } int extattr_list_file(td, uap) struct thread*td; struct extattr_list_file_args /* { const char *path; int attrnamespace; void *data; size_t nbytes; } */ *uap; { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_list_vp(nd.ni_vp, uap->attrnamespace, uap->data, uap->nbytes, td); vrele(nd.ni_vp); return (error); } int extattr_list_link(td, uap) struct thread*td; struct extattr_list_link_args /* { const char *path; int attrnamespace; void *data; size_t nbytes; } */ *uap; { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_list_vp(nd.ni_vp, uap->attrnamespace, uap->data, uap->nbytes, td); vrele(nd.ni_vp); return (error); } Index: head/sys/kern/vfs_syscalls.c =================================================================== --- head/sys/kern/vfs_syscalls.c (revision 122536) +++ head/sys/kern/vfs_syscalls.c (revision 122537) @@ -1,4508 +1,4791 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94 */ #include __FBSDID("$FreeBSD$"); /* For 4.3 integer FS ID compatibility */ #include "opt_compat.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int chroot_refuse_vdir_fds(struct filedesc *fdp); static int getutimes(const struct timeval *, enum uio_seg, struct timespec *); static int setfown(struct thread *td, struct vnode *, uid_t, gid_t); static int setfmode(struct thread *td, struct vnode *, int); static int setfflags(struct thread *td, struct vnode *, int); static int setutimes(struct thread *td, struct vnode *, const struct timespec *, int, int); static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred, struct thread *td); static int extattr_list_vp(struct vnode *vp, int attrnamespace, void *data, size_t nbytes, struct thread *td); int (*union_dircheckp)(struct thread *td, struct vnode **, struct file *); int (*softdep_fsync_hook)(struct vnode *); /* * The module initialization routine for POSIX asynchronous I/O will * set this to the version of AIO that it implements. (Zero means * that it is not implemented.) This value is used here by pathconf() * and in kern_descrip.c by fpathconf(). */ int async_io_version; /* * Sync each mounted filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct sync_args { int dummy; }; #endif #ifdef DEBUG static int syncprt = 0; SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, ""); #endif /* ARGSUSED */ int sync(td, uap) struct thread *td; struct sync_args *uap; { struct mount *mp, *nmp; int asyncflag; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } if ((mp->mnt_flag & MNT_RDONLY) == 0 && vn_start_write(NULL, &mp, V_NOWAIT) == 0) { asyncflag = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &= ~MNT_ASYNC; vfs_msync(mp, MNT_NOWAIT); VFS_SYNC(mp, MNT_NOWAIT, ((td != NULL) ? 
td->td_ucred : NOCRED), td); mp->mnt_flag |= asyncflag; vn_finished_write(mp); } mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, td); } mtx_unlock(&mountlist_mtx); #if 0 /* * XXX don't call vfs_bufstats() yet because that routine * was not imported in the Lite2 merge. */ #ifdef DIAGNOSTIC if (syncprt) vfs_bufstats(); #endif /* DIAGNOSTIC */ #endif return (0); } /* XXX PRISON: could be per prison flag */ static int prison_quotas; #if 0 SYSCTL_INT(_kern_prison, OID_AUTO, quotas, CTLFLAG_RW, &prison_quotas, 0, ""); #endif /* * Change filesystem quotas. */ #ifndef _SYS_SYSPROTO_H_ struct quotactl_args { char *path; int cmd; int uid; caddr_t arg; }; #endif /* ARGSUSED */ int quotactl(td, uap) struct thread *td; register struct quotactl_args /* { char *path; int cmd; int uid; caddr_t arg; } */ *uap; { struct mount *mp; int error; struct nameidata nd; if (jailed(td->td_ucred) && !prison_quotas) return (EPERM); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH); vrele(nd.ni_vp); if (error) return (error); error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg, td); vn_finished_write(mp); return (error); } /* * Get filesystem statistics. */ #ifndef _SYS_SYSPROTO_H_ struct statfs_args { char *path; struct statfs *buf; }; #endif /* ARGSUSED */ int statfs(td, uap) struct thread *td; register struct statfs_args /* { char *path; struct statfs *buf; } */ *uap; { - register struct mount *mp; - register struct statfs *sp; + struct mount *mp; + struct statfs *sp, sb; int error; struct nameidata nd; - struct statfs sb; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); mp = nd.ni_vp->v_mount; sp = &mp->mnt_stat; NDFREE(&nd, NDF_ONLY_PNBUF); vrele(nd.ni_vp); #ifdef MAC error = mac_check_mount_stat(td->td_ucred, mp); if (error) return (error); #endif + /* + * Set these in case the underlying filesystem fails to do so. + */ + sp->f_version = STATFS_VERSION; + sp->f_namemax = NAME_MAX; + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; error = VFS_STATFS(mp, sp, td); if (error) return (error); - sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; if (suser(td)) { bcopy(sp, &sb, sizeof(sb)); sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; sp = &sb; } return (copyout(sp, uap->buf, sizeof(*sp))); } /* * Get filesystem statistics. */ #ifndef _SYS_SYSPROTO_H_ struct fstatfs_args { int fd; struct statfs *buf; }; #endif /* ARGSUSED */ int fstatfs(td, uap) struct thread *td; register struct fstatfs_args /* { int fd; struct statfs *buf; } */ *uap; { struct file *fp; struct mount *mp; - register struct statfs *sp; + struct statfs *sp, sb; int error; - struct statfs sb; if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); mp = fp->f_vnode->v_mount; fdrop(fp, td); if (mp == NULL) return (EBADF); #ifdef MAC error = mac_check_mount_stat(td->td_ucred, mp); if (error) return (error); #endif sp = &mp->mnt_stat; + /* + * Set these in case the underlying filesystem fails to do so. + */ + sp->f_version = STATFS_VERSION; + sp->f_namemax = NAME_MAX; + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; error = VFS_STATFS(mp, sp, td); if (error) return (error); - sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; if (suser(td)) { bcopy(sp, &sb, sizeof(sb)); sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; sp = &sb; } return (copyout(sp, uap->buf, sizeof(*sp))); } /* * Get statistics on all filesystems. 
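The statfs() and fstatfs() hunks above now pre-seed f_version, f_namemax and f_flags before calling VFS_STATFS(), so a filesystem that has not been taught about the enlarged structure still returns sane values, and f_fsid is zeroed for callers without superuser privilege. Seen from userland the new fields simply appear in struct statfs; a minimal, illustrative consumer (field formatting is arbitrary):

#include <sys/param.h>
#include <sys/mount.h>
#include <stdio.h>

int
show_fs(const char *path)
{
	struct statfs sf;

	if (statfs(path, &sf) != 0)
		return (-1);
	/* f_version and f_namemax are now always filled in by the kernel. */
	printf("%s on %s (%s): namemax %lu, statfs version 0x%x\n",
	    sf.f_mntfromname, sf.f_mntonname, sf.f_fstypename,
	    (unsigned long)sf.f_namemax, (unsigned int)sf.f_version);
	return (0);
}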
*/ #ifndef _SYS_SYSPROTO_H_ struct getfsstat_args { struct statfs *buf; long bufsize; int flags; }; #endif int getfsstat(td, uap) struct thread *td; register struct getfsstat_args /* { struct statfs *buf; long bufsize; int flags; } */ *uap; { - register struct mount *mp, *nmp; - register struct statfs *sp; + struct mount *mp, *nmp; + struct statfs *sp, sb; caddr_t sfsp; long count, maxcount, error; maxcount = uap->bufsize / sizeof(struct statfs); sfsp = (caddr_t)uap->buf; count = 0; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { #ifdef MAC if (mac_check_mount_stat(td->td_ucred, mp) != 0) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } #endif if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } if (sfsp && count < maxcount) { sp = &mp->mnt_stat; /* + * Set these in case the underlying filesystem + * fails to do so. + */ + sp->f_version = STATFS_VERSION; + sp->f_namemax = NAME_MAX; + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + /* * If MNT_NOWAIT or MNT_LAZY is specified, do not * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY * overrides MNT_WAIT. */ if (((uap->flags & (MNT_LAZY|MNT_NOWAIT)) == 0 || (uap->flags & MNT_WAIT)) && (error = VFS_STATFS(mp, sp, td))) { mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, td); continue; } - sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + if (suser(td)) { + bcopy(sp, &sb, sizeof(sb)); + sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; + sp = &sb; + } error = copyout(sp, sfsp, sizeof(*sp)); if (error) { vfs_unbusy(mp, td); return (error); } sfsp += sizeof(*sp); } count++; mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, td); } mtx_unlock(&mountlist_mtx); if (sfsp && count > maxcount) td->td_retval[0] = maxcount; else td->td_retval[0] = count; return (0); } +#ifdef COMPAT_FREEBSD4 /* + * Get old format filesystem statistics. + */ +static void cvtstatfs(struct thread *, struct statfs *, struct ostatfs *); + +#ifndef _SYS_SYSPROTO_H_ +struct freebsd4_statfs_args { + char *path; + struct ostatfs *buf; +}; +#endif +/* ARGSUSED */ +int +freebsd4_statfs(td, uap) + struct thread *td; + struct freebsd4_statfs_args /* { + char *path; + struct ostatfs *buf; + } */ *uap; +{ + struct mount *mp; + struct statfs *sp; + struct ostatfs osb; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td); + if ((error = namei(&nd)) != 0) + return (error); + mp = nd.ni_vp->v_mount; + sp = &mp->mnt_stat; + NDFREE(&nd, NDF_ONLY_PNBUF); + vrele(nd.ni_vp); +#ifdef MAC + error = mac_check_mount_stat(td->td_ucred, mp); + if (error) + return (error); +#endif + error = VFS_STATFS(mp, sp, td); + if (error) + return (error); + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + cvtstatfs(td, sp, &osb); + return (copyout(&osb, uap->buf, sizeof(osb))); +} + +/* + * Get filesystem statistics. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct freebsd4_fstatfs_args { + int fd; + struct ostatfs *buf; +}; +#endif +/* ARGSUSED */ +int +freebsd4_fstatfs(td, uap) + struct thread *td; + struct freebsd4_fstatfs_args /* { + int fd; + struct ostatfs *buf; + } */ *uap; +{ + struct file *fp; + struct mount *mp; + struct statfs *sp; + struct ostatfs osb; + int error; + + if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) + return (error); + mp = fp->f_vnode->v_mount; + fdrop(fp, td); + if (mp == NULL) + return (EBADF); +#ifdef MAC + error = mac_check_mount_stat(td->td_ucred, mp); + if (error) + return (error); +#endif + sp = &mp->mnt_stat; + error = VFS_STATFS(mp, sp, td); + if (error) + return (error); + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + cvtstatfs(td, sp, &osb); + return (copyout(&osb, uap->buf, sizeof(osb))); +} + +/* + * Get statistics on all filesystems. + */ +#ifndef _SYS_SYSPROTO_H_ +struct freebsd4_getfsstat_args { + struct ostatfs *buf; + long bufsize; + int flags; +}; +#endif +int +freebsd4_getfsstat(td, uap) + struct thread *td; + register struct freebsd4_getfsstat_args /* { + struct ostatfs *buf; + long bufsize; + int flags; + } */ *uap; +{ + struct mount *mp, *nmp; + struct statfs *sp; + struct ostatfs osb; + caddr_t sfsp; + long count, maxcount, error; + + maxcount = uap->bufsize / sizeof(struct ostatfs); + sfsp = (caddr_t)uap->buf; + count = 0; + mtx_lock(&mountlist_mtx); + for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { +#ifdef MAC + if (mac_check_mount_stat(td->td_ucred, mp) != 0) { + nmp = TAILQ_NEXT(mp, mnt_list); + continue; + } +#endif + if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { + nmp = TAILQ_NEXT(mp, mnt_list); + continue; + } + if (sfsp && count < maxcount) { + sp = &mp->mnt_stat; + /* + * If MNT_NOWAIT or MNT_LAZY is specified, do not + * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY + * overrides MNT_WAIT. + */ + if (((uap->flags & (MNT_LAZY|MNT_NOWAIT)) == 0 || + (uap->flags & MNT_WAIT)) && + (error = VFS_STATFS(mp, sp, td))) { + mtx_lock(&mountlist_mtx); + nmp = TAILQ_NEXT(mp, mnt_list); + vfs_unbusy(mp, td); + continue; + } + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + cvtstatfs(td, sp, &osb); + error = copyout(&osb, sfsp, sizeof(osb)); + if (error) { + vfs_unbusy(mp, td); + return (error); + } + sfsp += sizeof(osb); + } + count++; + mtx_lock(&mountlist_mtx); + nmp = TAILQ_NEXT(mp, mnt_list); + vfs_unbusy(mp, td); + } + mtx_unlock(&mountlist_mtx); + if (sfsp && count > maxcount) + td->td_retval[0] = maxcount; + else + td->td_retval[0] = count; + return (0); +} + +/* + * Implement fstatfs() for (NFS) file handles. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct freebsd4_fhstatfs_args { + struct fhandle *u_fhp; + struct ostatfs *buf; +}; +#endif +int +freebsd4_fhstatfs(td, uap) + struct thread *td; + struct freebsd4_fhstatfs_args /* { + struct fhandle *u_fhp; + struct ostatfs *buf; + } */ *uap; +{ + struct statfs *sp; + struct mount *mp; + struct vnode *vp; + struct ostatfs osb; + fhandle_t fh; + int error; + + /* + * Must be super user + */ + error = suser(td); + if (error) + return (error); + + if ((error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t))) != 0) + return (error); + + if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) + return (ESTALE); + if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp))) + return (error); + mp = vp->v_mount; + sp = &mp->mnt_stat; + vput(vp); +#ifdef MAC + error = mac_check_mount_stat(td->td_ucred, mp); + if (error) + return (error); +#endif + if ((error = VFS_STATFS(mp, sp, td)) != 0) + return (error); + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + cvtstatfs(td, sp, &osb); + return (copyout(&osb, uap->buf, sizeof(osb))); +} + +/* + * Convert a new format statfs structure to an old format statfs structure. + */ +static void +cvtstatfs(td, nsp, osp) + struct thread *td; + struct statfs *nsp; + struct ostatfs *osp; +{ + + bzero(osp, sizeof(*osp)); + osp->f_bsize = MIN(nsp->f_bsize, LONG_MAX); + osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX); + osp->f_blocks = MIN(nsp->f_blocks, LONG_MAX); + osp->f_bfree = MIN(nsp->f_bfree, LONG_MAX); + osp->f_bavail = MIN(nsp->f_bavail, LONG_MAX); + osp->f_files = MIN(nsp->f_files, LONG_MAX); + osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX); + osp->f_owner = nsp->f_owner; + osp->f_type = nsp->f_type; + osp->f_flags = nsp->f_flags; + osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX); + osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX); + osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX); + osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX); + bcopy(nsp->f_fstypename, osp->f_fstypename, + MIN(MFSNAMELEN, OMNAMELEN)); + bcopy(nsp->f_mntonname, osp->f_mntonname, + MIN(MFSNAMELEN, OMNAMELEN)); + bcopy(nsp->f_mntfromname, osp->f_mntfromname, + MIN(MFSNAMELEN, OMNAMELEN)); + if (suser(td)) { + osp->f_fsid.val[0] = osp->f_fsid.val[1] = 0; + } else { + osp->f_fsid = nsp->f_fsid; + } +} +#endif /* COMPAT_FREEBSD4 */ + +/* * Change current working directory to a given file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchdir_args { int fd; }; #endif /* ARGSUSED */ int fchdir(td, uap) struct thread *td; struct fchdir_args /* { int fd; } */ *uap; { register struct filedesc *fdp = td->td_proc->p_fd; struct vnode *vp, *tdp, *vpold; struct mount *mp; struct file *fp; int error; if ((error = getvnode(fdp, uap->fd, &fp)) != 0) return (error); vp = fp->f_vnode; VREF(vp); fdrop(fp, td); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (vp->v_type != VDIR) error = ENOTDIR; #ifdef MAC else if ((error = mac_check_vnode_chdir(td->td_ucred, vp)) != 0) { } #endif else error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); while (!error && (mp = vp->v_mountedhere) != NULL) { if (vfs_busy(mp, 0, 0, td)) continue; error = VFS_ROOT(mp, &tdp); vfs_unbusy(mp, td); if (error) break; vput(vp); vp = tdp; } if (error) { vput(vp); return (error); } VOP_UNLOCK(vp, 0, td); FILEDESC_LOCK(fdp); vpold = fdp->fd_cdir; fdp->fd_cdir = vp; FILEDESC_UNLOCK(fdp); vrele(vpold); return (0); } /* * Change current working directory (``.''). 
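cvtstatfs() above is the COMPAT_FREEBSD4 shim: each new 64-bit counter is clamped to LONG_MAX with MIN() before landing in the old long-sized field, the name strings are copied into the shorter old buffers, and f_fsid is zeroed unless the caller is superuser. The clamping idiom, reduced to a small, hedged illustration with a hypothetical value:

#include <sys/param.h>	/* MIN() */
#include <limits.h>
#include <stdint.h>

/* Illustrative only: a 64-bit block count degrades silently in the old ABI. */
static long
clamp_to_long(uint64_t v)
{

	return ((long)MIN(v, (uint64_t)LONG_MAX));
}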
*/ #ifndef _SYS_SYSPROTO_H_ struct chdir_args { char *path; }; #endif /* ARGSUSED */ int chdir(td, uap) struct thread *td; struct chdir_args /* { char *path; } */ *uap; { return (kern_chdir(td, uap->path, UIO_USERSPACE)); } int kern_chdir(struct thread *td, char *path, enum uio_seg pathseg) { register struct filedesc *fdp = td->td_proc->p_fd; int error; struct nameidata nd; struct vnode *vp; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); if ((error = change_dir(nd.ni_vp, td)) != 0) { vput(nd.ni_vp); NDFREE(&nd, NDF_ONLY_PNBUF); return (error); } VOP_UNLOCK(nd.ni_vp, 0, td); NDFREE(&nd, NDF_ONLY_PNBUF); FILEDESC_LOCK(fdp); vp = fdp->fd_cdir; fdp->fd_cdir = nd.ni_vp; FILEDESC_UNLOCK(fdp); vrele(vp); return (0); } /* * Helper function for raised chroot(2) security function: Refuse if * any filedescriptors are open directories. */ static int chroot_refuse_vdir_fds(fdp) struct filedesc *fdp; { struct vnode *vp; struct file *fp; int fd; FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); for (fd = 0; fd < fdp->fd_nfiles ; fd++) { fp = fget_locked(fdp, fd); if (fp == NULL) continue; if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; if (vp->v_type == VDIR) return (EPERM); } } return (0); } /* * This sysctl determines if we will allow a process to chroot(2) if it * has a directory open: * 0: disallowed for all processes. * 1: allowed for processes that were not already chroot(2)'ed. * 2: allowed for all processes. */ static int chroot_allow_open_directories = 1; SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW, &chroot_allow_open_directories, 0, ""); /* * Change notion of root (``/'') directory. */ #ifndef _SYS_SYSPROTO_H_ struct chroot_args { char *path; }; #endif /* ARGSUSED */ int chroot(td, uap) struct thread *td; struct chroot_args /* { char *path; } */ *uap; { int error; struct nameidata nd; error = suser_cred(td->td_ucred, PRISON_ROOT); if (error) return (error); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->path, td); mtx_lock(&Giant); error = namei(&nd); if (error) goto error; if ((error = change_dir(nd.ni_vp, td)) != 0) goto e_vunlock; #ifdef MAC if ((error = mac_check_vnode_chroot(td->td_ucred, nd.ni_vp))) goto e_vunlock; #endif VOP_UNLOCK(nd.ni_vp, 0, td); error = change_root(nd.ni_vp, td); vrele(nd.ni_vp); NDFREE(&nd, NDF_ONLY_PNBUF); mtx_unlock(&Giant); return (error); e_vunlock: vput(nd.ni_vp); error: mtx_unlock(&Giant); NDFREE(&nd, NDF_ONLY_PNBUF); return (error); } /* * Common routine for chroot and chdir. Callers must provide a locked vnode * instance. */ int change_dir(vp, td) struct vnode *vp; struct thread *td; { int error; ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked"); if (vp->v_type != VDIR) return (ENOTDIR); #ifdef MAC error = mac_check_vnode_chdir(td->td_ucred, vp); if (error) return (error); #endif error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); return (error); } /* * Common routine for kern_chroot() and jail_attach(). The caller is * responsible for invoking suser() and mac_check_chroot() to authorize this * operation. 
*/ int change_root(vp, td) struct vnode *vp; struct thread *td; { struct filedesc *fdp; struct vnode *oldvp; int error; mtx_assert(&Giant, MA_OWNED); fdp = td->td_proc->p_fd; FILEDESC_LOCK(fdp); if (chroot_allow_open_directories == 0 || (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) { error = chroot_refuse_vdir_fds(fdp); if (error) { FILEDESC_UNLOCK(fdp); return (error); } } oldvp = fdp->fd_rdir; fdp->fd_rdir = vp; VREF(fdp->fd_rdir); if (!fdp->fd_jdir) { fdp->fd_jdir = vp; VREF(fdp->fd_jdir); } FILEDESC_UNLOCK(fdp); vrele(oldvp); return (0); } /* * Check permissions, allocate an open file structure, * and call the device open routine if any. */ #ifndef _SYS_SYSPROTO_H_ struct open_args { char *path; int flags; int mode; }; #endif int open(td, uap) struct thread *td; register struct open_args /* { char *path; int flags; int mode; } */ *uap; { return (kern_open(td, uap->path, UIO_USERSPACE, uap->flags, uap->mode)); } int kern_open(struct thread *td, char *path, enum uio_seg pathseg, int flags, int mode) { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; struct file *fp; struct vnode *vp; struct vattr vat; struct mount *mp; int cmode; struct file *nfp; int type, indx, error; struct flock lf; struct nameidata nd; if ((flags & O_ACCMODE) == O_ACCMODE) return (EINVAL); flags = FFLAGS(flags); error = falloc(td, &nfp, &indx); if (error) return (error); /* An extra reference on `nfp' has been held for us by falloc(). */ fp = nfp; cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT; NDINIT(&nd, LOOKUP, FOLLOW, pathseg, path, td); td->td_dupfd = -1; /* XXX check for fdopen */ error = vn_open(&nd, &flags, cmode, indx); if (error) { /* * If the vn_open replaced the method vector, something * wonderous happened deep below and we just pass it up * pretending we know what we do. */ if (error == ENXIO && fp->f_ops != &badfileops) { fdrop(fp, td); td->td_retval[0] = indx; return (0); } /* * release our own reference */ fdrop(fp, td); /* * handle special fdopen() case. bleh. dupfdopen() is * responsible for dropping the old contents of ofiles[indx] * if it succeeds. */ if ((error == ENODEV || error == ENXIO) && td->td_dupfd >= 0 && /* XXX from fdopen */ (error = dupfdopen(td, fdp, indx, td->td_dupfd, flags, error)) == 0) { td->td_retval[0] = indx; return (0); } /* * Clean up the descriptor, but only if another thread hadn't * replaced or closed it. */ FILEDESC_LOCK(fdp); if (fdp->fd_ofiles[indx] == fp) { fdp->fd_ofiles[indx] = NULL; FILEDESC_UNLOCK(fdp); fdrop(fp, td); } else FILEDESC_UNLOCK(fdp); if (error == ERESTART) error = EINTR; return (error); } td->td_dupfd = 0; NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; /* * There should be 2 references on the file, one from the descriptor * table, and one for us. * * Handle the case where someone closed the file (via its file * descriptor) while we were blocked. The end result should look * like opening the file succeeded but it was immediately closed. */ FILEDESC_LOCK(fdp); FILE_LOCK(fp); if (fp->f_count == 1) { KASSERT(fdp->fd_ofiles[indx] != fp, ("Open file descriptor lost all refs")); FILEDESC_UNLOCK(fdp); FILE_UNLOCK(fp); VOP_UNLOCK(vp, 0, td); vn_close(vp, flags & FMASK, fp->f_cred, td); fdrop(fp, td); td->td_retval[0] = indx; return 0; } fp->f_vnode = vp; fp->f_data = vp; fp->f_flag = flags & FMASK; fp->f_ops = &vnops; fp->f_seqcount = 1; fp->f_type = (vp->v_type == VFIFO ? 
DTYPE_FIFO : DTYPE_VNODE); FILEDESC_UNLOCK(fdp); FILE_UNLOCK(fp); /* assert that vn_open created a backing object if one is needed */ KASSERT(!vn_canvmio(vp) || VOP_GETVOBJECT(vp, NULL) == 0, ("open: vmio vnode has no backing object after vn_open")); VOP_UNLOCK(vp, 0, td); if (flags & (O_EXLOCK | O_SHLOCK)) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (flags & O_EXLOCK) lf.l_type = F_WRLCK; else lf.l_type = F_RDLCK; type = F_FLOCK; if ((flags & FNONBLOCK) == 0) type |= F_WAIT; if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) goto bad; fp->f_flag |= FHASLOCK; } if (flags & O_TRUNC) { if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto bad; VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); VATTR_NULL(&vat); vat.va_size = 0; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); #ifdef MAC error = mac_check_vnode_write(td->td_ucred, fp->f_cred, vp); if (error == 0) #endif error = VOP_SETATTR(vp, &vat, td->td_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); if (error) goto bad; } /* * Release our private reference, leaving the one associated with * the descriptor table intact. */ fdrop(fp, td); td->td_retval[0] = indx; return (0); bad: FILEDESC_LOCK(fdp); if (fdp->fd_ofiles[indx] == fp) { fdp->fd_ofiles[indx] = NULL; FILEDESC_UNLOCK(fdp); fdrop(fp, td); } else FILEDESC_UNLOCK(fdp); fdrop(fp, td); return (error); } #ifdef COMPAT_43 /* * Create a file. */ #ifndef _SYS_SYSPROTO_H_ struct ocreat_args { char *path; int mode; }; #endif int ocreat(td, uap) struct thread *td; register struct ocreat_args /* { char *path; int mode; } */ *uap; { struct open_args /* { char *path; int flags; int mode; } */ nuap; nuap.path = uap->path; nuap.mode = uap->mode; nuap.flags = O_WRONLY | O_CREAT | O_TRUNC; return (open(td, &nuap)); } #endif /* COMPAT_43 */ /* * Create a special file. 
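kern_open() above also covers BSD's open-time locking: O_EXLOCK and O_SHLOCK are translated into an flock-style VOP_ADVLOCK() request, which waits unless FNONBLOCK was given, and FHASLOCK is recorded so the lock is dropped on close. A small userland sketch of the non-blocking variant (path handling is illustrative):

#include <fcntl.h>

int
open_locked(const char *path)
{
	int fd;

	/* Exclusive advisory lock taken atomically with the open itself. */
	fd = open(path, O_RDWR | O_EXLOCK | O_NONBLOCK);
	if (fd < 0)
		return (-1);	/* EAGAIN when another process holds the lock */
	return (fd);
}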
*/ #ifndef _SYS_SYSPROTO_H_ struct mknod_args { char *path; int mode; int dev; }; #endif /* ARGSUSED */ int mknod(td, uap) struct thread *td; register struct mknod_args /* { char *path; int mode; int dev; } */ *uap; { return (kern_mknod(td, uap->path, UIO_USERSPACE, uap->mode, uap->dev)); } int kern_mknod(struct thread *td, char *path, enum uio_seg pathseg, int mode, int dev) { struct vnode *vp; struct mount *mp; struct vattr vattr; int error; int whiteout = 0; struct nameidata nd; switch (mode & S_IFMT) { case S_IFCHR: case S_IFBLK: error = suser(td); break; default: error = suser_cred(td->td_ucred, PRISON_ROOT); break; } if (error) return (error); restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if (vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); vrele(vp); if (vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); return (EEXIST); } else { VATTR_NULL(&vattr); FILEDESC_LOCK(td->td_proc->p_fd); vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask; FILEDESC_UNLOCK(td->td_proc->p_fd); vattr.va_rdev = dev; whiteout = 0; switch (mode & S_IFMT) { case S_IFMT: /* used by badsect to flag bad sectors */ vattr.va_type = VBAD; break; case S_IFCHR: vattr.va_type = VCHR; break; case S_IFBLK: vattr.va_type = VBLK; break; case S_IFWHT: whiteout = 1; break; default: error = EINVAL; break; } } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } #ifdef MAC if (error == 0 && !whiteout) error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); #endif if (!error) { VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); if (whiteout) error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE); else { error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); if (error == 0) vput(nd.ni_vp); } } NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mknod"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "mknod"); return (error); } /* * Create a named pipe. 
*/ #ifndef _SYS_SYSPROTO_H_ struct mkfifo_args { char *path; int mode; }; #endif /* ARGSUSED */ int mkfifo(td, uap) struct thread *td; register struct mkfifo_args /* { char *path; int mode; } */ *uap; { return (kern_mkfifo(td, uap->path, UIO_USERSPACE, uap->mode)); } int kern_mkfifo(struct thread *td, char *path, enum uio_seg pathseg, int mode) { struct mount *mp; struct vattr vattr; int error; struct nameidata nd; restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); if (nd.ni_vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); vrele(nd.ni_vp); if (nd.ni_vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); vattr.va_type = VFIFO; FILEDESC_LOCK(td->td_proc->p_fd); vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask; FILEDESC_UNLOCK(td->td_proc->p_fd); #ifdef MAC error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); if (error) goto out; #endif VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); if (error == 0) vput(nd.ni_vp); #ifdef MAC out: #endif NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); return (error); } /* * Make a hard file link. */ #ifndef _SYS_SYSPROTO_H_ struct link_args { char *path; char *link; }; #endif /* ARGSUSED */ int link(td, uap) struct thread *td; register struct link_args /* { char *path; char *link; } */ *uap; { return (kern_link(td, uap->path, uap->link, UIO_USERSPACE)); } int kern_link(struct thread *td, char *path, char *link, enum uio_seg segflg) { struct vnode *vp; struct mount *mp; struct nameidata nd; int error; bwillwrite(); NDINIT(&nd, LOOKUP, FOLLOW|NOOBJ, segflg, path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (vp->v_type == VDIR) { vrele(vp); return (EPERM); /* POSIX */ } if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { vrele(vp); return (error); } NDINIT(&nd, CREATE, LOCKPARENT | NOOBJ | SAVENAME, segflg, link, td); if ((error = namei(&nd)) == 0) { if (nd.ni_vp != NULL) { vrele(nd.ni_vp); if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); error = EEXIST; } else if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td)) == 0) { VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); #ifdef MAC error = mac_check_vnode_link(td->td_ucred, nd.ni_dvp, vp, &nd.ni_cnd); if (error == 0) #endif error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd); VOP_UNLOCK(vp, 0, td); vput(nd.ni_dvp); } NDFREE(&nd, NDF_ONLY_PNBUF); } vrele(vp); vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "link"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "link"); return (error); } /* * Make a symbolic link. 
*/ #ifndef _SYS_SYSPROTO_H_ struct symlink_args { char *path; char *link; }; #endif /* ARGSUSED */ int symlink(td, uap) struct thread *td; register struct symlink_args /* { char *path; char *link; } */ *uap; { return (kern_symlink(td, uap->path, uap->link, UIO_USERSPACE)); } int kern_symlink(struct thread *td, char *path, char *link, enum uio_seg segflg) { struct mount *mp; struct vattr vattr; char *syspath; int error; struct nameidata nd; if (segflg == UIO_SYSSPACE) { syspath = path; } else { syspath = uma_zalloc(namei_zone, M_WAITOK); if ((error = copyinstr(path, syspath, MAXPATHLEN, NULL)) != 0) goto out; } restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT | NOOBJ | SAVENAME, segflg, link, td); if ((error = namei(&nd)) != 0) goto out; if (nd.ni_vp) { NDFREE(&nd, NDF_ONLY_PNBUF); vrele(nd.ni_vp); if (nd.ni_vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); error = EEXIST; goto out; } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); FILEDESC_LOCK(td->td_proc->p_fd); vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask; FILEDESC_UNLOCK(td->td_proc->p_fd); #ifdef MAC vattr.va_type = VLNK; error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); if (error) goto out2; #endif VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath); if (error == 0) vput(nd.ni_vp); #ifdef MAC out2: #endif NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "symlink"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "symlink"); out: if (segflg != UIO_SYSSPACE) uma_zfree(namei_zone, syspath); return (error); } /* * Delete a whiteout from the filesystem. */ /* ARGSUSED */ int undelete(td, uap) struct thread *td; register struct undelete_args /* { char *path; } */ *uap; { int error; struct mount *mp; struct nameidata nd; restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT|DOWHITEOUT, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return (error); if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp) vrele(nd.ni_vp); if (nd.ni_vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "undelete"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "undelete"); return (error); } /* * Delete a name from the filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct unlink_args { char *path; }; #endif /* ARGSUSED */ int unlink(td, uap) struct thread *td; struct unlink_args /* { char *path; } */ *uap; { return (kern_unlink(td, uap->path, UIO_USERSPACE)); } int kern_unlink(struct thread *td, char *path, enum uio_seg pathseg) { struct mount *mp; struct vnode *vp; int error; struct nameidata nd; restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT|LOCKLEAF, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if (vp->v_type == VDIR) error = EPERM; /* POSIX */ else { /* * The root of a mounted filesystem cannot be deleted. 
* * XXX: can this only be a VDIR case? */ if (vp->v_vflag & VV_ROOT) error = EBUSY; } if (error == 0) { if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); if (vp == nd.ni_dvp) vrele(vp); else vput(vp); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } #ifdef MAC error = mac_check_vnode_delete(td->td_ucred, nd.ni_dvp, vp, &nd.ni_cnd); if (error) goto out; #endif VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); #ifdef MAC out: #endif vn_finished_write(mp); } NDFREE(&nd, NDF_ONLY_PNBUF); if (vp == nd.ni_dvp) vrele(vp); else vput(vp); vput(nd.ni_dvp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "unlink"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "unlink"); return (error); } /* * Reposition read/write file offset. */ #ifndef _SYS_SYSPROTO_H_ struct lseek_args { int fd; int pad; off_t offset; int whence; }; #endif int lseek(td, uap) struct thread *td; register struct lseek_args /* { int fd; int pad; off_t offset; int whence; } */ *uap; { struct ucred *cred = td->td_ucred; struct file *fp; struct vnode *vp; struct vattr vattr; off_t offset; int error, noneg; if ((error = fget(td, uap->fd, &fp)) != 0) return (error); if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) { fdrop(fp, td); return (ESPIPE); } vp = fp->f_vnode; noneg = (vp->v_type != VCHR); offset = uap->offset; switch (uap->whence) { case L_INCR: if (noneg && (fp->f_offset < 0 || (offset > 0 && fp->f_offset > OFF_MAX - offset))) { error = EOVERFLOW; break; } offset += fp->f_offset; break; case L_XTND: vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_GETATTR(vp, &vattr, cred, td); VOP_UNLOCK(vp, 0, td); if (error) break; if (noneg && (vattr.va_size > OFF_MAX || (offset > 0 && vattr.va_size > OFF_MAX - offset))) { error = EOVERFLOW; break; } offset += vattr.va_size; break; case L_SET: break; default: error = EINVAL; } if (error == 0 && noneg && offset < 0) error = EINVAL; if (error != 0) { fdrop(fp, td); return (error); } fp->f_offset = offset; *(off_t *)(td->td_retval) = fp->f_offset; fdrop(fp, td); return (0); } #if defined(COMPAT_43) || defined(COMPAT_SUNOS) /* * Reposition read/write file offset. */ #ifndef _SYS_SYSPROTO_H_ struct olseek_args { int fd; long offset; int whence; }; #endif int olseek(td, uap) struct thread *td; register struct olseek_args /* { int fd; long offset; int whence; } */ *uap; { struct lseek_args /* { int fd; int pad; off_t offset; int whence; } */ nuap; int error; nuap.fd = uap->fd; nuap.offset = uap->offset; nuap.whence = uap->whence; error = lseek(td, &nuap); return (error); } #endif /* COMPAT_43 */ /* * Check access permissions using passed credentials. */ static int vn_access(vp, user_flags, cred, td) struct vnode *vp; int user_flags; struct ucred *cred; struct thread *td; { int error, flags; /* Flags == 0 means only check for existence. */ error = 0; if (user_flags) { flags = 0; if (user_flags & R_OK) flags |= VREAD; if (user_flags & W_OK) flags |= VWRITE; if (user_flags & X_OK) flags |= VEXEC; #ifdef MAC error = mac_check_vnode_access(cred, vp, flags); if (error) return (error); #endif if ((flags & VWRITE) == 0 || (error = vn_writechk(vp)) == 0) error = VOP_ACCESS(vp, flags, cred, td); } return (error); } /* * Check access permissions using "real" credentials. 
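 *
 * (Illustrative sketch, not part of this file: access(2) answers the
 * question for the caller's real uid and gid, which is why kern_access()
 * below installs a crdup()ed credential for the lookup.  Assuming
 * <unistd.h> and <stdio.h>, a set-uid program might probe
 *
 *	if (access("/var/log/messages", R_OK | W_OK) == 0)
 *		printf("invoking user may read and write\n");
 *
 * Nothing is opened; this is purely a permission check.)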
*/ #ifndef _SYS_SYSPROTO_H_ struct access_args { char *path; int flags; }; #endif int access(td, uap) struct thread *td; register struct access_args /* { char *path; int flags; } */ *uap; { return (kern_access(td, uap->path, UIO_USERSPACE, uap->flags)); } int kern_access(struct thread *td, char *path, enum uio_seg pathseg, int flags) { struct ucred *cred, *tmpcred; register struct vnode *vp; int error; struct nameidata nd; /* * Create and modify a temporary credential instead of one that * is potentially shared. This could also mess up socket * buffer accounting which can run in an interrupt context. * * XXX - Depending on how "threads" are finally implemented, it * may be better to explicitly pass the credential to namei() * rather than to modify the potentially shared process structure. */ cred = td->td_ucred; tmpcred = crdup(cred); tmpcred->cr_uid = cred->cr_ruid; tmpcred->cr_groups[0] = cred->cr_rgid; td->td_ucred = tmpcred; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, pathseg, path, td); if ((error = namei(&nd)) != 0) goto out1; vp = nd.ni_vp; error = vn_access(vp, flags, tmpcred, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); out1: td->td_ucred = cred; crfree(tmpcred); return (error); } /* * Check access permissions using "effective" credentials. */ #ifndef _SYS_SYSPROTO_H_ struct eaccess_args { char *path; int flags; }; #endif int eaccess(td, uap) struct thread *td; register struct eaccess_args /* { char *path; int flags; } */ *uap; { struct nameidata nd; struct vnode *vp; int error; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; error = vn_access(vp, uap->flags, td->td_ucred, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); return (error); } #if defined(COMPAT_43) || defined(COMPAT_SUNOS) /* * Get file status; this version follows links. */ #ifndef _SYS_SYSPROTO_H_ struct ostat_args { char *path; struct ostat *ub; }; #endif /* ARGSUSED */ int ostat(td, uap) struct thread *td; register struct ostat_args /* { char *path; struct ostat *ub; } */ *uap; { struct stat sb; struct ostat osb; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td); vput(nd.ni_vp); if (error) return (error); cvtstat(&sb, &osb); error = copyout(&osb, uap->ub, sizeof (osb)); return (error); } /* * Get file status; this version does not follow links. */ #ifndef _SYS_SYSPROTO_H_ struct olstat_args { char *path; struct ostat *ub; }; #endif /* ARGSUSED */ int olstat(td, uap) struct thread *td; register struct olstat_args /* { char *path; struct ostat *ub; } */ *uap; { struct vnode *vp; struct stat sb; struct ostat osb; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); if (error) return (error); cvtstat(&sb, &osb); error = copyout(&osb, uap->ub, sizeof (osb)); return (error); } /* * Convert from an old to a new stat structure. 
*/ void cvtstat(st, ost) struct stat *st; struct ostat *ost; { ost->st_dev = st->st_dev; ost->st_ino = st->st_ino; ost->st_mode = st->st_mode; ost->st_nlink = st->st_nlink; ost->st_uid = st->st_uid; ost->st_gid = st->st_gid; ost->st_rdev = st->st_rdev; if (st->st_size < (quad_t)1 << 32) ost->st_size = st->st_size; else ost->st_size = -2; ost->st_atime = st->st_atime; ost->st_mtime = st->st_mtime; ost->st_ctime = st->st_ctime; ost->st_blksize = st->st_blksize; ost->st_blocks = st->st_blocks; ost->st_flags = st->st_flags; ost->st_gen = st->st_gen; } #endif /* COMPAT_43 || COMPAT_SUNOS */ /* * Get file status; this version follows links. */ #ifndef _SYS_SYSPROTO_H_ struct stat_args { char *path; struct stat *ub; }; #endif /* ARGSUSED */ int stat(td, uap) struct thread *td; register struct stat_args /* { char *path; struct stat *ub; } */ *uap; { struct stat sb; int error; struct nameidata nd; #ifdef LOOKUP_SHARED NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | NOOBJ, UIO_USERSPACE, uap->path, td); #else NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, uap->path, td); #endif if ((error = namei(&nd)) != 0) return (error); error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_vp); if (error) return (error); error = copyout(&sb, uap->ub, sizeof (sb)); return (error); } /* * Get file status; this version does not follow links. */ #ifndef _SYS_SYSPROTO_H_ struct lstat_args { char *path; struct stat *ub; }; #endif /* ARGSUSED */ int lstat(td, uap) struct thread *td; register struct lstat_args /* { char *path; struct stat *ub; } */ *uap; { int error; struct vnode *vp; struct stat sb; struct nameidata nd; NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); if (error) return (error); error = copyout(&sb, uap->ub, sizeof (sb)); return (error); } /* * Implementation of the NetBSD stat() function. * XXX This should probably be collapsed with the FreeBSD version, * as the differences are only due to vn_stat() clearing spares at * the end of the structures. vn_stat could be split to avoid this, * and thus collapse the following to close to zero code. */ void cvtnstat(sb, nsb) struct stat *sb; struct nstat *nsb; { bzero(nsb, sizeof *nsb); nsb->st_dev = sb->st_dev; nsb->st_ino = sb->st_ino; nsb->st_mode = sb->st_mode; nsb->st_nlink = sb->st_nlink; nsb->st_uid = sb->st_uid; nsb->st_gid = sb->st_gid; nsb->st_rdev = sb->st_rdev; nsb->st_atimespec = sb->st_atimespec; nsb->st_mtimespec = sb->st_mtimespec; nsb->st_ctimespec = sb->st_ctimespec; nsb->st_size = sb->st_size; nsb->st_blocks = sb->st_blocks; nsb->st_blksize = sb->st_blksize; nsb->st_flags = sb->st_flags; nsb->st_gen = sb->st_gen; nsb->st_birthtimespec = sb->st_birthtimespec; } #ifndef _SYS_SYSPROTO_H_ struct nstat_args { char *path; struct nstat *ub; }; #endif /* ARGSUSED */ int nstat(td, uap) struct thread *td; register struct nstat_args /* { char *path; struct nstat *ub; } */ *uap; { struct stat sb; struct nstat nsb; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td); vput(nd.ni_vp); if (error) return (error); cvtnstat(&sb, &nsb); error = copyout(&nsb, uap->ub, sizeof (nsb)); return (error); } /* * NetBSD lstat. 
Get file status; this version does not follow links. */ #ifndef _SYS_SYSPROTO_H_ struct nlstat_args { char *path; struct nstat *ub; }; #endif /* ARGSUSED */ int nlstat(td, uap) struct thread *td; register struct nlstat_args /* { char *path; struct nstat *ub; } */ *uap; { int error; struct vnode *vp; struct stat sb; struct nstat nsb; struct nameidata nd; NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td); vput(vp); if (error) return (error); cvtnstat(&sb, &nsb); error = copyout(&nsb, uap->ub, sizeof (nsb)); return (error); } /* * Get configurable pathname variables. */ #ifndef _SYS_SYSPROTO_H_ struct pathconf_args { char *path; int name; }; #endif /* ARGSUSED */ int pathconf(td, uap) struct thread *td; register struct pathconf_args /* { char *path; int name; } */ *uap; { int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); /* If asynchronous I/O is available, it works for all files. */ if (uap->name == _PC_ASYNC_IO) td->td_retval[0] = async_io_version; else error = VOP_PATHCONF(nd.ni_vp, uap->name, td->td_retval); vput(nd.ni_vp); return (error); } /* * Return target name of a symbolic link. */ #ifndef _SYS_SYSPROTO_H_ struct readlink_args { char *path; char *buf; int count; }; #endif /* ARGSUSED */ int readlink(td, uap) struct thread *td; register struct readlink_args /* { char *path; char *buf; int count; } */ *uap; { return (kern_readlink(td, uap->path, UIO_USERSPACE, uap->buf, UIO_USERSPACE, uap->count)); } int kern_readlink(struct thread *td, char *path, enum uio_seg pathseg, char *buf, enum uio_seg bufseg, int count) { register struct vnode *vp; struct iovec aiov; struct uio auio; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; #ifdef MAC error = mac_check_vnode_readlink(td->td_ucred, vp); if (error) { vput(vp); return (error); } #endif if (vp->v_type != VLNK) error = EINVAL; else { aiov.iov_base = buf; aiov.iov_len = count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = bufseg; auio.uio_td = td; auio.uio_resid = count; error = VOP_READLINK(vp, &auio, td->td_ucred); } vput(vp); td->td_retval[0] = count - auio.uio_resid; return (error); } /* * Common implementation code for chflags() and fchflags(). */ static int setfflags(td, vp, flags) struct thread *td; struct vnode *vp; int flags; { int error; struct mount *mp; struct vattr vattr; /* * Prevent non-root users from setting flags on devices. When * a device is reused, users can retain ownership of the device * if they are allowed to set flags and programs assume that * chown can't fail when done as root.
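 *
 * (Illustrative, not part of this revision: setfflags() is the common code
 * behind chflags(2), lchflags(2) and fchflags(2).  An ordinary file owner,
 * assuming <sys/stat.h>, <unistd.h> and <err.h>, might call
 *
 *	if (chflags("/home/user/core", UF_NODUMP) == -1)
 *		warn("chflags");
 *
 * while flag changes on device nodes are reserved to the superuser just
 * below.)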
*/ if (vp->v_type == VCHR || vp->v_type == VBLK) { error = suser_cred(td->td_ucred, PRISON_ROOT); if (error) return (error); } if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); VATTR_NULL(&vattr); vattr.va_flags = flags; #ifdef MAC error = mac_check_vnode_setflags(td->td_ucred, vp, vattr.va_flags); if (error == 0) #endif error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return (error); } /* * Change flags of a file given a path name. */ #ifndef _SYS_SYSPROTO_H_ struct chflags_args { char *path; int flags; }; #endif /* ARGSUSED */ int chflags(td, uap) struct thread *td; register struct chflags_args /* { char *path; int flags; } */ *uap; { int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfflags(td, nd.ni_vp, uap->flags); vrele(nd.ni_vp); return error; } /* * Same as chflags() but doesn't follow symlinks. */ int lchflags(td, uap) struct thread *td; register struct lchflags_args /* { char *path; int flags; } */ *uap; { int error; struct nameidata nd; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfflags(td, nd.ni_vp, uap->flags); vrele(nd.ni_vp); return error; } /* * Change flags of a file given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchflags_args { int fd; int flags; }; #endif /* ARGSUSED */ int fchflags(td, uap) struct thread *td; register struct fchflags_args /* { int fd; int flags; } */ *uap; { struct file *fp; int error; if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); error = setfflags(td, fp->f_vnode, uap->flags); fdrop(fp, td); return (error); } /* * Common implementation code for chmod(), lchmod() and fchmod(). */ static int setfmode(td, vp, mode) struct thread *td; struct vnode *vp; int mode; { int error; struct mount *mp; struct vattr vattr; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); VATTR_NULL(&vattr); vattr.va_mode = mode & ALLPERMS; #ifdef MAC error = mac_check_vnode_setmode(td->td_ucred, vp, vattr.va_mode); if (error == 0) #endif error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return error; } /* * Change mode of a file given path name. */ #ifndef _SYS_SYSPROTO_H_ struct chmod_args { char *path; int mode; }; #endif /* ARGSUSED */ int chmod(td, uap) struct thread *td; register struct chmod_args /* { char *path; int mode; } */ *uap; { return (kern_chmod(td, uap->path, UIO_USERSPACE, uap->mode)); } int kern_chmod(struct thread *td, char *path, enum uio_seg pathseg, int mode) { int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfmode(td, nd.ni_vp, mode); vrele(nd.ni_vp); return error; } /* * Change mode of a file given path name (don't follow links.) 
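 *
 * (Illustrative sketch, not part of this file: lchmod(2) differs from
 * chmod(2) only in the NOFOLLOW lookup below.  Given a symbolic link
 * /tmp/scratch pointing at /var/tmp, and assuming <sys/stat.h>,
 *
 *	lchmod("/tmp/scratch", 0600);	changes the link itself
 *	chmod("/tmp/scratch", 0600);	follows the link to /var/tmp
 *
 * Both end up in setfmode() above.)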
*/ #ifndef _SYS_SYSPROTO_H_ struct lchmod_args { char *path; int mode; }; #endif /* ARGSUSED */ int lchmod(td, uap) struct thread *td; register struct lchmod_args /* { char *path; int mode; } */ *uap; { int error; struct nameidata nd; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfmode(td, nd.ni_vp, uap->mode); vrele(nd.ni_vp); return error; } /* * Change mode of a file given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchmod_args { int fd; int mode; }; #endif /* ARGSUSED */ int fchmod(td, uap) struct thread *td; register struct fchmod_args /* { int fd; int mode; } */ *uap; { struct file *fp; int error; if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); error = setfmode(td, fp->f_vnode, uap->mode); fdrop(fp, td); return (error); } /* * Common implementation for chown(), lchown(), and fchown() */ static int setfown(td, vp, uid, gid) struct thread *td; struct vnode *vp; uid_t uid; gid_t gid; { int error; struct mount *mp; struct vattr vattr; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); VATTR_NULL(&vattr); vattr.va_uid = uid; vattr.va_gid = gid; #ifdef MAC error = mac_check_vnode_setowner(td->td_ucred, vp, vattr.va_uid, vattr.va_gid); if (error == 0) #endif error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return error; } /* * Set ownership given a path name. */ #ifndef _SYS_SYSPROTO_H_ struct chown_args { char *path; int uid; int gid; }; #endif /* ARGSUSED */ int chown(td, uap) struct thread *td; register struct chown_args /* { char *path; int uid; int gid; } */ *uap; { return (kern_chown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid)); } int kern_chown(struct thread *td, char *path, enum uio_seg pathseg, int uid, int gid) { int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfown(td, nd.ni_vp, uid, gid); vrele(nd.ni_vp); return (error); } /* * Set ownership given a path name, do not cross symlinks. */ #ifndef _SYS_SYSPROTO_H_ struct lchown_args { char *path; int uid; int gid; }; #endif /* ARGSUSED */ int lchown(td, uap) struct thread *td; register struct lchown_args /* { char *path; int uid; int gid; } */ *uap; { return (kern_lchown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid)); } int kern_lchown(struct thread *td, char *path, enum uio_seg pathseg, int uid, int gid) { int error; struct nameidata nd; NDINIT(&nd, LOOKUP, NOFOLLOW, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfown(td, nd.ni_vp, uid, gid); vrele(nd.ni_vp); return (error); } /* * Set ownership given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchown_args { int fd; int uid; int gid; }; #endif /* ARGSUSED */ int fchown(td, uap) struct thread *td; register struct fchown_args /* { int fd; int uid; int gid; } */ *uap; { struct file *fp; int error; if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); error = setfown(td, fp->f_vnode, uap->uid, uap->gid); fdrop(fp, td); return (error); } /* * Common implementation code for utimes(), lutimes(), and futimes(). 
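 *
 * (Illustrative, not part of this revision: getutimes() below implements
 * the utimes(2) convention that a NULL times pointer means "now".
 * Assuming <sys/time.h>, both of the following reach setutimes():
 *
 *	struct timeval tv[2] = { { 0, 0 }, { 0, 0 } };
 *
 *	utimes("/tmp/f", NULL);		stamp with the current time
 *	utimes("/tmp/f", tv);		stamp with the epoch
 *
 * The NULL case also sets VA_UTIMES_NULL so the filesystem may apply its
 * relaxed permission check.)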
*/ static int getutimes(usrtvp, tvpseg, tsp) const struct timeval *usrtvp; enum uio_seg tvpseg; struct timespec *tsp; { struct timeval tv[2]; const struct timeval *tvp; int error; if (usrtvp == NULL) { microtime(&tv[0]); TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]); tsp[1] = tsp[0]; } else { if (tvpseg == UIO_SYSSPACE) { tvp = usrtvp; } else { if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0) return (error); tvp = tv; } TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]); TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]); } return 0; } /* * Common implementation code for utimes(), lutimes(), and futimes(). */ static int setutimes(td, vp, ts, numtimes, nullflag) struct thread *td; struct vnode *vp; const struct timespec *ts; int numtimes; int nullflag; { int error, setbirthtime; struct mount *mp; struct vattr vattr; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); setbirthtime = 0; if (numtimes < 3 && VOP_GETATTR(vp, &vattr, td->td_ucred, td) == 0 && timespeccmp(&ts[1], &vattr.va_birthtime, < )) setbirthtime = 1; VATTR_NULL(&vattr); vattr.va_atime = ts[0]; vattr.va_mtime = ts[1]; if (setbirthtime) vattr.va_birthtime = ts[1]; if (numtimes > 2) vattr.va_birthtime = ts[2]; if (nullflag) vattr.va_vaflags |= VA_UTIMES_NULL; #ifdef MAC error = mac_check_vnode_setutimes(td->td_ucred, vp, vattr.va_atime, vattr.va_mtime); #endif if (error == 0) error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return error; } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct utimes_args { char *path; struct timeval *tptr; }; #endif /* ARGSUSED */ int utimes(td, uap) struct thread *td; register struct utimes_args /* { char *path; struct timeval *tptr; } */ *uap; { return (kern_utimes(td, uap->path, UIO_USERSPACE, uap->tptr, UIO_USERSPACE)); } int kern_utimes(struct thread *td, char *path, enum uio_seg pathseg, struct timeval *tptr, enum uio_seg tptrseg) { struct timespec ts[2]; int error; struct nameidata nd; if ((error = getutimes(tptr, tptrseg, ts)) != 0) return (error); NDINIT(&nd, LOOKUP, FOLLOW, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL); vrele(nd.ni_vp); return (error); } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct lutimes_args { char *path; struct timeval *tptr; }; #endif /* ARGSUSED */ int lutimes(td, uap) struct thread *td; register struct lutimes_args /* { char *path; struct timeval *tptr; } */ *uap; { return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr, UIO_USERSPACE)); } int kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg, struct timeval *tptr, enum uio_seg tptrseg) { struct timespec ts[2]; int error; struct nameidata nd; if ((error = getutimes(tptr, tptrseg, ts)) != 0) return (error); NDINIT(&nd, LOOKUP, NOFOLLOW, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL); vrele(nd.ni_vp); return (error); } /* * Set the access and modification times of a file. 
*/ #ifndef _SYS_SYSPROTO_H_ struct futimes_args { int fd; struct timeval *tptr; }; #endif /* ARGSUSED */ int futimes(td, uap) struct thread *td; register struct futimes_args /* { int fd; struct timeval *tptr; } */ *uap; { return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE)); } int kern_futimes(struct thread *td, int fd, struct timeval *tptr, enum uio_seg tptrseg) { struct timespec ts[2]; struct file *fp; int error; if ((error = getutimes(tptr, tptrseg, ts)) != 0) return (error); if ((error = getvnode(td->td_proc->p_fd, fd, &fp)) != 0) return (error); error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL); fdrop(fp, td); return (error); } /* * Truncate a file given its path name. */ #ifndef _SYS_SYSPROTO_H_ struct truncate_args { char *path; int pad; off_t length; }; #endif /* ARGSUSED */ int truncate(td, uap) struct thread *td; register struct truncate_args /* { char *path; int pad; off_t length; } */ *uap; { return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length)); } int kern_truncate(struct thread *td, char *path, enum uio_seg pathseg, off_t length) { struct mount *mp; struct vnode *vp; struct vattr vattr; int error; struct nameidata nd; if (length < 0) return(EINVAL); NDINIT(&nd, LOOKUP, FOLLOW, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { vrele(vp); return (error); } NDFREE(&nd, NDF_ONLY_PNBUF); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (vp->v_type == VDIR) error = EISDIR; #ifdef MAC else if ((error = mac_check_vnode_write(td->td_ucred, NOCRED, vp))) { } #endif else if ((error = vn_writechk(vp)) == 0 && (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) { VATTR_NULL(&vattr); vattr.va_size = length; error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); } vput(vp); vn_finished_write(mp); return (error); } /* * Truncate a file given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct ftruncate_args { int fd; int pad; off_t length; }; #endif /* ARGSUSED */ int ftruncate(td, uap) struct thread *td; register struct ftruncate_args /* { int fd; int pad; off_t length; } */ *uap; { struct mount *mp; struct vattr vattr; struct vnode *vp; struct file *fp; int error; if (uap->length < 0) return(EINVAL); if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); if ((fp->f_flag & FWRITE) == 0) { fdrop(fp, td); return (EINVAL); } vp = fp->f_vnode; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { fdrop(fp, td); return (error); } VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (vp->v_type == VDIR) error = EISDIR; #ifdef MAC else if ((error = mac_check_vnode_write(td->td_ucred, fp->f_cred, vp))) { } #endif else if ((error = vn_writechk(vp)) == 0) { VATTR_NULL(&vattr); vattr.va_size = uap->length; error = VOP_SETATTR(vp, &vattr, fp->f_cred, td); } VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); fdrop(fp, td); return (error); } #if defined(COMPAT_43) || defined(COMPAT_SUNOS) /* * Truncate a file given its path name. */ #ifndef _SYS_SYSPROTO_H_ struct otruncate_args { char *path; long length; }; #endif /* ARGSUSED */ int otruncate(td, uap) struct thread *td; register struct otruncate_args /* { char *path; long length; } */ *uap; { struct truncate_args /* { char *path; int pad; off_t length; } */ nuap; nuap.path = uap->path; nuap.length = uap->length; return (truncate(td, &nuap)); } /* * Truncate a file given a file descriptor. 
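 *
 * (Illustrative, not part of this revision: the old and new entry points
 * funnel into the same code.  A current caller, assuming <unistd.h> and
 * <err.h>, would write
 *
 *	if (ftruncate(fd, 0) == -1)
 *		warn("ftruncate");
 *
 * which requires a descriptor opened for writing, exactly the FWRITE
 * check made in ftruncate() above.)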
*/ #ifndef _SYS_SYSPROTO_H_ struct oftruncate_args { int fd; long length; }; #endif /* ARGSUSED */ int oftruncate(td, uap) struct thread *td; register struct oftruncate_args /* { int fd; long length; } */ *uap; { struct ftruncate_args /* { int fd; int pad; off_t length; } */ nuap; nuap.fd = uap->fd; nuap.length = uap->length; return (ftruncate(td, &nuap)); } #endif /* COMPAT_43 || COMPAT_SUNOS */ /* * Sync an open file. */ #ifndef _SYS_SYSPROTO_H_ struct fsync_args { int fd; }; #endif /* ARGSUSED */ int fsync(td, uap) struct thread *td; struct fsync_args /* { int fd; } */ *uap; { struct vnode *vp; struct mount *mp; struct file *fp; vm_object_t obj; int error; GIANT_REQUIRED; if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); vp = fp->f_vnode; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { fdrop(fp, td); return (error); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (VOP_GETVOBJECT(vp, &obj) == 0) { VM_OBJECT_LOCK(obj); vm_object_page_clean(obj, 0, 0, 0); VM_OBJECT_UNLOCK(obj); } error = VOP_FSYNC(vp, fp->f_cred, MNT_WAIT, td); if (error == 0 && vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP) && softdep_fsync_hook != NULL) error = (*softdep_fsync_hook)(vp); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); fdrop(fp, td); return (error); } /* * Rename files. Source and destination must either both be directories, * or both not be directories. If target is a directory, it must be empty. */ #ifndef _SYS_SYSPROTO_H_ struct rename_args { char *from; char *to; }; #endif /* ARGSUSED */ int rename(td, uap) struct thread *td; register struct rename_args /* { char *from; char *to; } */ *uap; { return (kern_rename(td, uap->from, uap->to, UIO_USERSPACE)); } int kern_rename(struct thread *td, char *from, char *to, enum uio_seg pathseg) { struct mount *mp = NULL; struct vnode *tvp, *fvp, *tdvp; struct nameidata fromnd, tond; int error; bwillwrite(); #ifdef MAC NDINIT(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART, pathseg, from, td); #else NDINIT(&fromnd, DELETE, WANTPARENT | SAVESTART, pathseg, from, td); #endif if ((error = namei(&fromnd)) != 0) return (error); #ifdef MAC error = mac_check_vnode_rename_from(td->td_ucred, fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd); VOP_UNLOCK(fromnd.ni_dvp, 0, td); VOP_UNLOCK(fromnd.ni_vp, 0, td); #endif fvp = fromnd.ni_vp; if (error == 0) error = vn_start_write(fvp, &mp, V_WAIT | PCATCH); if (error != 0) { NDFREE(&fromnd, NDF_ONLY_PNBUF); vrele(fromnd.ni_dvp); vrele(fvp); goto out1; } NDINIT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | NOOBJ, pathseg, to, td); if (fromnd.ni_vp->v_type == VDIR) tond.ni_cnd.cn_flags |= WILLBEDIR; if ((error = namei(&tond)) != 0) { /* Translate error code for rename("dir1", "dir2/."). */ if (error == EISDIR && fvp->v_type == VDIR) error = EINVAL; NDFREE(&fromnd, NDF_ONLY_PNBUF); vrele(fromnd.ni_dvp); vrele(fvp); goto out1; } tdvp = tond.ni_dvp; tvp = tond.ni_vp; if (tvp != NULL) { if (fvp->v_type == VDIR && tvp->v_type != VDIR) { error = ENOTDIR; goto out; } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { error = EISDIR; goto out; } } if (fvp == tdvp) error = EINVAL; /* * If the source is the same as the destination (that is, if they * are links to the same vnode), then there is nothing to do. 
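 *
 * (Illustrative, not part of this file: this is the POSIX "same file"
 * rule.  Assuming <stdio.h> and <unistd.h>,
 *
 *	link("a", "b");
 *	rename("a", "b");
 *
 * leaves both names in place and reports success; the error value of -1
 * set just below is turned back into 0 at the out1 label.)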
*/ if (fvp == tvp) error = -1; #ifdef MAC else error = mac_check_vnode_rename_to(td->td_ucred, tdvp, tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd); #endif out: if (!error) { VOP_LEASE(tdvp, td, td->td_ucred, LEASE_WRITE); if (fromnd.ni_dvp != tdvp) { VOP_LEASE(fromnd.ni_dvp, td, td->td_ucred, LEASE_WRITE); } if (tvp) { VOP_LEASE(tvp, td, td->td_ucred, LEASE_WRITE); } error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, tond.ni_dvp, tond.ni_vp, &tond.ni_cnd); NDFREE(&fromnd, NDF_ONLY_PNBUF); NDFREE(&tond, NDF_ONLY_PNBUF); } else { NDFREE(&fromnd, NDF_ONLY_PNBUF); NDFREE(&tond, NDF_ONLY_PNBUF); if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); vrele(fromnd.ni_dvp); vrele(fvp); } vrele(tond.ni_startdir); ASSERT_VOP_UNLOCKED(fromnd.ni_dvp, "rename"); ASSERT_VOP_UNLOCKED(fromnd.ni_vp, "rename"); ASSERT_VOP_UNLOCKED(tond.ni_dvp, "rename"); ASSERT_VOP_UNLOCKED(tond.ni_vp, "rename"); out1: vn_finished_write(mp); if (fromnd.ni_startdir) vrele(fromnd.ni_startdir); if (error == -1) return (0); return (error); } /* * Make a directory file. */ #ifndef _SYS_SYSPROTO_H_ struct mkdir_args { char *path; int mode; }; #endif /* ARGSUSED */ int mkdir(td, uap) struct thread *td; register struct mkdir_args /* { char *path; int mode; } */ *uap; { return (kern_mkdir(td, uap->path, UIO_USERSPACE, uap->mode)); } int kern_mkdir(struct thread *td, char *path, enum uio_seg segflg, int mode) { struct mount *mp; struct vnode *vp; struct vattr vattr; int error; struct nameidata nd; restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME, segflg, path, td); nd.ni_cnd.cn_flags |= WILLBEDIR; if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if (vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); vrele(vp); /* * XXX namei called with LOCKPARENT but not LOCKLEAF has * the strange behaviour of leaving the vnode unlocked * if the target is the same vnode as the parent. */ if (vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); vattr.va_type = VDIR; FILEDESC_LOCK(td->td_proc->p_fd); vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask; FILEDESC_UNLOCK(td->td_proc->p_fd); #ifdef MAC error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); if (error) goto out; #endif VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); #ifdef MAC out: #endif NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (!error) vput(nd.ni_vp); vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mkdir"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "mkdir"); return (error); } /* * Remove a directory file. */ #ifndef _SYS_SYSPROTO_H_ struct rmdir_args { char *path; }; #endif /* ARGSUSED */ int rmdir(td, uap) struct thread *td; struct rmdir_args /* { char *path; } */ *uap; { return (kern_rmdir(td, uap->path, UIO_USERSPACE)); } int kern_rmdir(struct thread *td, char *path, enum uio_seg pathseg) { struct mount *mp; struct vnode *vp; int error; struct nameidata nd; restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if (vp->v_type != VDIR) { error = ENOTDIR; goto out; } /* * No rmdir "." please. */ if (nd.ni_dvp == vp) { error = EINVAL; goto out; } /* * The root of a mounted filesystem cannot be deleted. 
*/ if (vp->v_vflag & VV_ROOT) { error = EBUSY; goto out; } #ifdef MAC error = mac_check_vnode_delete(td->td_ucred, nd.ni_dvp, vp, &nd.ni_cnd); if (error) goto out; #endif if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vput(vp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); vn_finished_write(mp); out: NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vput(vp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "rmdir"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "rmdir"); return (error); } #ifdef COMPAT_43 /* * Read a block of directory entries in a filesystem independent format. */ #ifndef _SYS_SYSPROTO_H_ struct ogetdirentries_args { int fd; char *buf; u_int count; long *basep; }; #endif int ogetdirentries(td, uap) struct thread *td; register struct ogetdirentries_args /* { int fd; char *buf; u_int count; long *basep; } */ *uap; { struct vnode *vp; struct file *fp; struct uio auio, kuio; struct iovec aiov, kiov; struct dirent *dp, *edp; caddr_t dirbuf; int error, eofflag, readcnt; long loff; /* XXX arbitrary sanity limit on `count'. */ if (uap->count > 64 * 1024) return (EINVAL); if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); if ((fp->f_flag & FREAD) == 0) { fdrop(fp, td); return (EBADF); } vp = fp->f_vnode; unionread: if (vp->v_type != VDIR) { fdrop(fp, td); return (EINVAL); } aiov.iov_base = uap->buf; aiov.iov_len = uap->count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auio.uio_resid = uap->count; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); loff = auio.uio_offset = fp->f_offset; #ifdef MAC error = mac_check_vnode_readdir(td->td_ucred, vp); if (error) { VOP_UNLOCK(vp, 0, td); fdrop(fp, td); return (error); } #endif # if (BYTE_ORDER != LITTLE_ENDIAN) if (vp->v_mount->mnt_maxsymlinklen <= 0) { error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL); fp->f_offset = auio.uio_offset; } else # endif { kuio = auio; kuio.uio_iov = &kiov; kuio.uio_segflg = UIO_SYSSPACE; kiov.iov_len = uap->count; MALLOC(dirbuf, caddr_t, uap->count, M_TEMP, M_WAITOK); kiov.iov_base = dirbuf; error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag, NULL, NULL); fp->f_offset = kuio.uio_offset; if (error == 0) { readcnt = uap->count - kuio.uio_resid; edp = (struct dirent *)&dirbuf[readcnt]; for (dp = (struct dirent *)dirbuf; dp < edp; ) { # if (BYTE_ORDER == LITTLE_ENDIAN) /* * The expected low byte of * dp->d_namlen is our dp->d_type. * The high MBZ byte of dp->d_namlen * is our dp->d_namlen. */ dp->d_type = dp->d_namlen; dp->d_namlen = 0; # else /* * The dp->d_type is the high byte * of the expected dp->d_namlen, * so must be zero'ed. */ dp->d_type = 0; # endif if (dp->d_reclen > 0) { dp = (struct dirent *) ((char *)dp + dp->d_reclen); } else { error = EIO; break; } } if (dp >= edp) error = uiomove(dirbuf, readcnt, &auio); } FREE(dirbuf, M_TEMP); } VOP_UNLOCK(vp, 0, td); if (error) { fdrop(fp, td); return (error); } if (uap->count == auio.uio_resid) { if (union_dircheckp) { error = union_dircheckp(td, &vp, fp); if (error == -1) goto unionread; if (error) { fdrop(fp, td); return (error); } } /* * XXX We could delay dropping the lock above but * union_dircheckp complicates things. 
*/ vn_lock(vp, LK_EXCLUSIVE|LK_RETRY, td); if ((vp->v_vflag & VV_ROOT) && (vp->v_mount->mnt_flag & MNT_UNION)) { struct vnode *tvp = vp; vp = vp->v_mount->mnt_vnodecovered; VREF(vp); fp->f_vnode = vp; fp->f_data = vp; fp->f_offset = 0; vput(tvp); goto unionread; } VOP_UNLOCK(vp, 0, td); } error = copyout(&loff, uap->basep, sizeof(long)); fdrop(fp, td); td->td_retval[0] = uap->count - auio.uio_resid; return (error); } #endif /* COMPAT_43 */ /* * Read a block of directory entries in a filesystem independent format. */ #ifndef _SYS_SYSPROTO_H_ struct getdirentries_args { int fd; char *buf; u_int count; long *basep; }; #endif int getdirentries(td, uap) struct thread *td; register struct getdirentries_args /* { int fd; char *buf; u_int count; long *basep; } */ *uap; { struct vnode *vp; struct file *fp; struct uio auio; struct iovec aiov; long loff; int error, eofflag; if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); if ((fp->f_flag & FREAD) == 0) { fdrop(fp, td); return (EBADF); } vp = fp->f_vnode; unionread: if (vp->v_type != VDIR) { fdrop(fp, td); return (EINVAL); } aiov.iov_base = uap->buf; aiov.iov_len = uap->count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auio.uio_resid = uap->count; /* vn_lock(vp, LK_SHARED | LK_RETRY, td); */ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); loff = auio.uio_offset = fp->f_offset; #ifdef MAC error = mac_check_vnode_readdir(td->td_ucred, vp); if (error == 0) #endif error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL); fp->f_offset = auio.uio_offset; VOP_UNLOCK(vp, 0, td); if (error) { fdrop(fp, td); return (error); } if (uap->count == auio.uio_resid) { if (union_dircheckp) { error = union_dircheckp(td, &vp, fp); if (error == -1) goto unionread; if (error) { fdrop(fp, td); return (error); } } /* * XXX We could delay dropping the lock above but * union_dircheckp complicates things. */ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if ((vp->v_vflag & VV_ROOT) && (vp->v_mount->mnt_flag & MNT_UNION)) { struct vnode *tvp = vp; vp = vp->v_mount->mnt_vnodecovered; VREF(vp); fp->f_vnode = vp; fp->f_data = vp; fp->f_offset = 0; vput(tvp); goto unionread; } VOP_UNLOCK(vp, 0, td); } if (uap->basep != NULL) { error = copyout(&loff, uap->basep, sizeof(long)); } td->td_retval[0] = uap->count - auio.uio_resid; fdrop(fp, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct getdents_args { int fd; char *buf; size_t count; }; #endif int getdents(td, uap) struct thread *td; register struct getdents_args /* { int fd; char *buf; u_int count; } */ *uap; { struct getdirentries_args ap; ap.fd = uap->fd; ap.buf = uap->buf; ap.count = uap->count; ap.basep = NULL; return getdirentries(td, &ap); } /* * Set the mode mask for creation of filesystem nodes. * * MP SAFE */ #ifndef _SYS_SYSPROTO_H_ struct umask_args { int newmask; }; #endif int umask(td, uap) struct thread *td; struct umask_args /* { int newmask; } */ *uap; { register struct filedesc *fdp; FILEDESC_LOCK(td->td_proc->p_fd); fdp = td->td_proc->p_fd; td->td_retval[0] = fdp->fd_cmask; fdp->fd_cmask = uap->newmask & ALLPERMS; FILEDESC_UNLOCK(td->td_proc->p_fd); return (0); } /* * Void all references to file by ripping underlying filesystem * away from vnode. 
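 *
 * (Illustrative sketch, not part of this revision: revoke(2) is typically
 * applied to terminal devices so that stale opens are detached before the
 * device is reused.  Assuming <unistd.h> and <err.h>,
 *
 *	if (revoke("/dev/ttyv1") == -1)
 *		warn("revoke");
 *
 * Only character devices are accepted, and a caller who does not own the
 * node needs superuser privilege, as checked below.)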
*/ #ifndef _SYS_SYSPROTO_H_ struct revoke_args { char *path; }; #endif /* ARGSUSED */ int revoke(td, uap) struct thread *td; register struct revoke_args /* { char *path; } */ *uap; { struct mount *mp; struct vnode *vp; struct vattr vattr; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (vp->v_type != VCHR) { vput(vp); return (EINVAL); } #ifdef MAC error = mac_check_vnode_revoke(td->td_ucred, vp); if (error) { vput(vp); return (error); } #endif error = VOP_GETATTR(vp, &vattr, td->td_ucred, td); if (error) { vput(vp); return (error); } VOP_UNLOCK(vp, 0, td); if (td->td_ucred->cr_uid != vattr.va_uid) { error = suser_cred(td->td_ucred, PRISON_ROOT); if (error) goto out; } if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto out; if (vcount(vp) > 1) VOP_REVOKE(vp, REVOKEALL); vn_finished_write(mp); out: vrele(vp); return (error); } /* * Convert a user file descriptor to a kernel file entry. * The file entry is locked upon returning. */ int getvnode(fdp, fd, fpp) struct filedesc *fdp; int fd; struct file **fpp; { int error; struct file *fp; fp = NULL; if (fdp == NULL) error = EBADF; else { FILEDESC_LOCK(fdp); if ((u_int)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) error = EBADF; else if (fp->f_vnode == NULL) { fp = NULL; error = EINVAL; } else { fhold(fp); error = 0; } FILEDESC_UNLOCK(fdp); } *fpp = fp; return (error); } /* * Get (NFS) file handle */ #ifndef _SYS_SYSPROTO_H_ struct getfh_args { char *fname; fhandle_t *fhp; }; #endif int getfh(td, uap) struct thread *td; register struct getfh_args *uap; { struct nameidata nd; fhandle_t fh; register struct vnode *vp; int error; /* * Must be super user */ error = suser(td); if (error) return (error); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->fname, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; bzero(&fh, sizeof(fh)); fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid; error = VFS_VPTOFH(vp, &fh.fh_fid); vput(vp); if (error) return (error); error = copyout(&fh, uap->fhp, sizeof (fh)); return (error); } /* * syscall for the rpc.lockd to use to translate a NFS file handle into * an open descriptor. * * warning: do not remove the suser() call or this becomes one giant * security hole. */ #ifndef _SYS_SYSPROTO_H_ struct fhopen_args { const struct fhandle *u_fhp; int flags; }; #endif int fhopen(td, uap) struct thread *td; struct fhopen_args /* { const struct fhandle *u_fhp; int flags; } */ *uap; { struct proc *p = td->td_proc; struct mount *mp; struct vnode *vp; struct fhandle fhp; struct vattr vat; struct vattr *vap = &vat; struct flock lf; struct file *fp; register struct filedesc *fdp = p->p_fd; int fmode, mode, error, type; struct file *nfp; int indx; /* * Must be super user */ error = suser(td); if (error) return (error); fmode = FFLAGS(uap->flags); /* why not allow a non-read/write open for our lockd? */ if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT)) return (EINVAL); error = copyin(uap->u_fhp, &fhp, sizeof(fhp)); if (error) return(error); /* find the mount point */ mp = vfs_getvfs(&fhp.fh_fsid); if (mp == NULL) return (ESTALE); /* now give me my vnode, it gets returned to me locked */ error = VFS_FHTOVP(mp, &fhp.fh_fid, &vp); if (error) return (error); /* * from now on we have to make sure not * to forget about the vnode * any error that causes an abort must vput(vp) * just set error = err and 'goto bad;'. 
*/ /* * from vn_open */ if (vp->v_type == VLNK) { error = EMLINK; goto bad; } if (vp->v_type == VSOCK) { error = EOPNOTSUPP; goto bad; } mode = 0; if (fmode & (FWRITE | O_TRUNC)) { if (vp->v_type == VDIR) { error = EISDIR; goto bad; } error = vn_writechk(vp); if (error) goto bad; mode |= VWRITE; } if (fmode & FREAD) mode |= VREAD; if (fmode & O_APPEND) mode |= VAPPEND; #ifdef MAC error = mac_check_vnode_open(td->td_ucred, vp, mode); if (error) goto bad; #endif if (mode) { error = VOP_ACCESS(vp, mode, td->td_ucred, td); if (error) goto bad; } if (fmode & O_TRUNC) { VOP_UNLOCK(vp, 0, td); /* XXX */ if ((error = vn_start_write(NULL, &mp, V_WAIT | PCATCH)) != 0) { vrele(vp); return (error); } VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); /* XXX */ #ifdef MAC /* * We don't yet have fp->f_cred, so use td->td_ucred, which * should be right. */ error = mac_check_vnode_write(td->td_ucred, td->td_ucred, vp); if (error == 0) { #endif VATTR_NULL(vap); vap->va_size = 0; error = VOP_SETATTR(vp, vap, td->td_ucred, td); #ifdef MAC } #endif vn_finished_write(mp); if (error) goto bad; } error = VOP_OPEN(vp, fmode, td->td_ucred, td, -1); if (error) goto bad; /* * Make sure that a VM object is created for VMIO support. */ if (vn_canvmio(vp) == TRUE) { if ((error = vfs_object_create(vp, td, td->td_ucred)) != 0) goto bad; } if (fmode & FWRITE) vp->v_writecount++; /* * end of vn_open code */ if ((error = falloc(td, &nfp, &indx)) != 0) { if (fmode & FWRITE) vp->v_writecount--; goto bad; } /* An extra reference on `nfp' has been held for us by falloc(). */ fp = nfp; nfp->f_vnode = vp; nfp->f_data = vp; nfp->f_flag = fmode & FMASK; nfp->f_ops = &vnops; nfp->f_type = DTYPE_VNODE; if (fmode & (O_EXLOCK | O_SHLOCK)) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (fmode & O_EXLOCK) lf.l_type = F_WRLCK; else lf.l_type = F_RDLCK; type = F_FLOCK; if ((fmode & FNONBLOCK) == 0) type |= F_WAIT; VOP_UNLOCK(vp, 0, td); if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) { /* * The lock request failed. Normally close the * descriptor but handle the case where someone might * have dup()d or close()d it when we weren't looking. */ FILEDESC_LOCK(fdp); if (fdp->fd_ofiles[indx] == fp) { fdp->fd_ofiles[indx] = NULL; FILEDESC_UNLOCK(fdp); fdrop(fp, td); } else FILEDESC_UNLOCK(fdp); /* * release our private reference */ fdrop(fp, td); return(error); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); fp->f_flag |= FHASLOCK; } if ((vp->v_type == VREG) && (VOP_GETVOBJECT(vp, NULL) != 0)) vfs_object_create(vp, td, td->td_ucred); VOP_UNLOCK(vp, 0, td); fdrop(fp, td); td->td_retval[0] = indx; return (0); bad: vput(vp); return (error); } /* * Stat an (NFS) file handle. */ #ifndef _SYS_SYSPROTO_H_ struct fhstat_args { struct fhandle *u_fhp; struct stat *sb; }; #endif int fhstat(td, uap) struct thread *td; register struct fhstat_args /* { struct fhandle *u_fhp; struct stat *sb; } */ *uap; { struct stat sb; fhandle_t fh; struct mount *mp; struct vnode *vp; int error; /* * Must be super user */ error = suser(td); if (error) return (error); error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); if (error) return (error); if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) return (ESTALE); if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp))) return (error); error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td); vput(vp); if (error) return (error); error = copyout(&sb, uap->sb, sizeof(sb)); return (error); } /* * Implement fstatfs() for (NFS) file handles. 
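 *
 * (Illustrative, not part of this revision: the file-handle syscalls exist
 * for privileged NFS helpers such as rpc.lockd.  A superuser caller,
 * assuming <sys/param.h>, <sys/mount.h> and <err.h>, might do
 *
 *	fhandle_t fh;
 *	struct statfs sfs;
 *
 *	if (getfh("/export/data", &fh) == -1 || fhstatfs(&fh, &sfs) == -1)
 *		err(1, "file handle statfs");
 *
 * The change below pre-fills f_version, f_namemax and f_flags so the
 * returned structure is sane even if the filesystem does not set them.)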
*/ #ifndef _SYS_SYSPROTO_H_ struct fhstatfs_args { struct fhandle *u_fhp; struct statfs *buf; }; #endif int fhstatfs(td, uap) struct thread *td; struct fhstatfs_args /* { struct fhandle *u_fhp; struct statfs *buf; } */ *uap; { - struct statfs *sp; + struct statfs *sp, sb; struct mount *mp; struct vnode *vp; - struct statfs sb; fhandle_t fh; int error; /* * Must be super user */ error = suser(td); if (error) return (error); if ((error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t))) != 0) return (error); if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) return (ESTALE); if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp))) return (error); mp = vp->v_mount; sp = &mp->mnt_stat; vput(vp); #ifdef MAC error = mac_check_mount_stat(td->td_ucred, mp); if (error) return (error); #endif + /* + * Set these in case the underlying filesystem fails to do so. + */ + sp->f_version = STATFS_VERSION; + sp->f_namemax = NAME_MAX; + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; if ((error = VFS_STATFS(mp, sp, td)) != 0) return (error); - sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; if (suser(td)) { bcopy(sp, &sb, sizeof(sb)); sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; sp = &sb; } return (copyout(sp, uap->buf, sizeof(*sp))); } /* * Syscall to push extended attribute configuration information into the * VFS. Accepts a path, which it converts to a mountpoint, as well as * a command (int cmd), and attribute name and misc data. For now, the * attribute name is left in userspace for consumption by the VFS_op. * It will probably be changed to be copied into sysspace by the * syscall in the future, once issues with various consumers of the * attribute code have raised their hands. * * Currently this is used only by UFS Extended Attributes. */ int extattrctl(td, uap) struct thread *td; struct extattrctl_args /* { const char *path; int cmd; const char *filename; int attrnamespace; const char *attrname; } */ *uap; { struct vnode *filename_vp; struct nameidata nd; struct mount *mp, *mp_writable; char attrname[EXTATTR_MAXNAMELEN]; int error; /* * uap->attrname is not always defined. We check again later when we * invoke the VFS call so as to pass in NULL there if needed. */ if (uap->attrname != NULL) { error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); } /* * uap->filename is not always defined. If it is, grab a vnode lock, * which VFS_EXTATTRCTL() will later release. */ filename_vp = NULL; if (uap->filename != NULL) { NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->filename, td); error = namei(&nd); if (error) return (error); filename_vp = nd.ni_vp; NDFREE(&nd, NDF_NO_VP_RELE | NDF_NO_VP_UNLOCK); } /* uap->path is always defined. */ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) { if (filename_vp != NULL) vput(filename_vp); return (error); } mp = nd.ni_vp->v_mount; error = vn_start_write(nd.ni_vp, &mp_writable, V_WAIT | PCATCH); NDFREE(&nd, 0); if (error) { if (filename_vp != NULL) vput(filename_vp); return (error); } error = VFS_EXTATTRCTL(mp, uap->cmd, filename_vp, uap->attrnamespace, uap->attrname != NULL ? attrname : NULL, td); vn_finished_write(mp_writable); /* * VFS_EXTATTRCTL will have unlocked, but not de-ref'd, * filename_vp, so vrele it if it is defined. 
*/ if (filename_vp != NULL) vrele(filename_vp); return (error); } /*- * Set a named extended attribute on a file or directory * * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", * kernelspace string pointer "attrname", userspace buffer * pointer "data", buffer length "nbytes", thread "td". * Returns: 0 on success, an error number otherwise * Locks: none * References: vp must be a valid reference for the duration of the call */ static int extattr_set_vp(struct vnode *vp, int attrnamespace, const char *attrname, void *data, size_t nbytes, struct thread *td) { struct mount *mp; struct uio auio; struct iovec aiov; ssize_t cnt; int error; error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error) return (error); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); aiov.iov_base = data; aiov.iov_len = nbytes; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; if (nbytes > INT_MAX) { error = EINVAL; goto done; } auio.uio_resid = nbytes; auio.uio_rw = UIO_WRITE; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; cnt = nbytes; #ifdef MAC error = mac_check_vnode_setextattr(td->td_ucred, vp, attrnamespace, attrname, &auio); if (error) goto done; #endif error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, td->td_ucred, td); cnt -= auio.uio_resid; td->td_retval[0] = cnt; done: VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return (error); } int extattr_set_fd(td, uap) struct thread *td; struct extattr_set_fd_args /* { int fd; int attrnamespace; const char *attrname; void *data; size_t nbytes; } */ *uap; { struct file *fp; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); error = getvnode(td->td_proc->p_fd, uap->fd, &fp); if (error) return (error); error = extattr_set_vp(fp->f_vnode, uap->attrnamespace, attrname, uap->data, uap->nbytes, td); fdrop(fp, td); return (error); } int extattr_set_file(td, uap) struct thread *td; struct extattr_set_file_args /* { const char *path; int attrnamespace; const char *attrname; void *data; size_t nbytes; } */ *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_set_vp(nd.ni_vp, uap->attrnamespace, attrname, uap->data, uap->nbytes, td); vrele(nd.ni_vp); return (error); } int extattr_set_link(td, uap) struct thread *td; struct extattr_set_link_args /* { const char *path; int attrnamespace; const char *attrname; void *data; size_t nbytes; } */ *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_set_vp(nd.ni_vp, uap->attrnamespace, attrname, uap->data, uap->nbytes, td); vrele(nd.ni_vp); return (error); } /*- * Get a named extended attribute on a file or directory * * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", * kernelspace string pointer "attrname", userspace buffer * pointer "data", buffer length "nbytes", thread "td". 
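 *
 * (Illustrative, not part of this file: the userspace entry points are
 * extattr_get_file(2) and friends.  Assuming <sys/types.h> and
 * <sys/extattr.h>, and an arbitrary example attribute named "md5",
 *
 *	char sum[33];
 *	ssize_t n = extattr_get_file("/tmp/f", EXTATTR_NAMESPACE_USER,
 *	    "md5", sum, sizeof(sum));
 *
 * and passing a NULL buffer instead returns only the attribute's size,
 * the "slightly unusual semantics" handled below.)
 *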
* Returns: 0 on success, an error number otherwise * Locks: none * References: vp must be a valid reference for the duration of the call */ static int extattr_get_vp(struct vnode *vp, int attrnamespace, const char *attrname, void *data, size_t nbytes, struct thread *td) { struct uio auio, *auiop; struct iovec aiov; ssize_t cnt; size_t size, *sizep; int error; /* * XXX: Temporary API compatibility for applications that know * about this hack ("" means list), but haven't been updated * for the extattr_list_*() system calls yet. This will go * away for FreeBSD 5.3. */ if (strlen(attrname) == 0) return (extattr_list_vp(vp, attrnamespace, data, nbytes, td)); VOP_LEASE(vp, td, td->td_ucred, LEASE_READ); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); /* * Slightly unusual semantics: if the user provides a NULL data * pointer, they don't want to receive the data, just the * maximum read length. */ auiop = NULL; sizep = NULL; cnt = 0; if (data != NULL) { aiov.iov_base = data; aiov.iov_len = nbytes; auio.uio_iov = &aiov; auio.uio_offset = 0; if (nbytes > INT_MAX) { error = EINVAL; goto done; } auio.uio_resid = nbytes; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auiop = &auio; cnt = nbytes; } else sizep = &size; #ifdef MAC error = mac_check_vnode_getextattr(td->td_ucred, vp, attrnamespace, attrname, &auio); if (error) goto done; #endif error = VOP_GETEXTATTR(vp, attrnamespace, attrname, auiop, sizep, td->td_ucred, td); if (auiop != NULL) { cnt -= auio.uio_resid; td->td_retval[0] = cnt; } else td->td_retval[0] = size; done: VOP_UNLOCK(vp, 0, td); return (error); } int extattr_get_fd(td, uap) struct thread *td; struct extattr_get_fd_args /* { int fd; int attrnamespace; const char *attrname; void *data; size_t nbytes; } */ *uap; { struct file *fp; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); error = getvnode(td->td_proc->p_fd, uap->fd, &fp); if (error) return (error); error = extattr_get_vp(fp->f_vnode, uap->attrnamespace, attrname, uap->data, uap->nbytes, td); fdrop(fp, td); return (error); } int extattr_get_file(td, uap) struct thread *td; struct extattr_get_file_args /* { const char *path; int attrnamespace; const char *attrname; void *data; size_t nbytes; } */ *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_get_vp(nd.ni_vp, uap->attrnamespace, attrname, uap->data, uap->nbytes, td); vrele(nd.ni_vp); return (error); } int extattr_get_link(td, uap) struct thread *td; struct extattr_get_link_args /* { const char *path; int attrnamespace; const char *attrname; void *data; size_t nbytes; } */ *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_get_vp(nd.ni_vp, uap->attrnamespace, attrname, uap->data, uap->nbytes, td); vrele(nd.ni_vp); return (error); } /* * extattr_delete_vp(): Delete a named extended attribute on a file or * directory * * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", * kernelspace string pointer 
"attrname", proc "p" * Returns: 0 on success, an error number otherwise * Locks: none * References: vp must be a valid reference for the duration of the call */ static int extattr_delete_vp(struct vnode *vp, int attrnamespace, const char *attrname, struct thread *td) { struct mount *mp; int error; error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error) return (error); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); #ifdef MAC error = mac_check_vnode_deleteextattr(td->td_ucred, vp, attrnamespace, attrname); if (error) goto done; #endif error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, td->td_ucred, td); if (error == EOPNOTSUPP) error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, td->td_ucred, td); #ifdef MAC done: #endif VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return (error); } int extattr_delete_fd(td, uap) struct thread *td; struct extattr_delete_fd_args /* { int fd; int attrnamespace; const char *attrname; } */ *uap; { struct file *fp; struct vnode *vp; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); error = getvnode(td->td_proc->p_fd, uap->fd, &fp); if (error) return (error); vp = fp->f_vnode; error = extattr_delete_vp(vp, uap->attrnamespace, attrname, td); fdrop(fp, td); return (error); } int extattr_delete_file(td, uap) struct thread *td; struct extattr_delete_file_args /* { const char *path; int attrnamespace; const char *attrname; } */ *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return(error); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return(error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_delete_vp(nd.ni_vp, uap->attrnamespace, attrname, td); vrele(nd.ni_vp); return(error); } int extattr_delete_link(td, uap) struct thread *td; struct extattr_delete_link_args /* { const char *path; int attrnamespace; const char *attrname; } */ *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return(error); NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return(error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_delete_vp(nd.ni_vp, uap->attrnamespace, attrname, td); vrele(nd.ni_vp); return(error); } /*- * Retrieve a list of extended attributes on a file or directory. * * Arguments: unlocked vnode "vp", attribute namespace 'attrnamespace", * userspace buffer pointer "data", buffer length "nbytes", * thread "td". 
* Returns: 0 on success, an error number otherwise * Locks: none * References: vp must be a valid reference for the duration of the call */ static int extattr_list_vp(struct vnode *vp, int attrnamespace, void *data, size_t nbytes, struct thread *td) { struct uio auio, *auiop; size_t size, *sizep; struct iovec aiov; ssize_t cnt; int error; VOP_LEASE(vp, td, td->td_ucred, LEASE_READ); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); auiop = NULL; sizep = NULL; cnt = 0; if (data != NULL) { aiov.iov_base = data; aiov.iov_len = nbytes; auio.uio_iov = &aiov; auio.uio_offset = 0; if (nbytes > INT_MAX) { error = EINVAL; goto done; } auio.uio_resid = nbytes; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auiop = &auio; cnt = nbytes; } else sizep = &size; #ifdef MAC error = mac_check_vnode_listextattr(td->td_ucred, vp, attrnamespace); if (error) goto done; #endif error = VOP_LISTEXTATTR(vp, attrnamespace, auiop, sizep, td->td_ucred, td); if (auiop != NULL) { cnt -= auio.uio_resid; td->td_retval[0] = cnt; } else td->td_retval[0] = size; done: VOP_UNLOCK(vp, 0, td); return (error); } int extattr_list_fd(td, uap) struct thread *td; struct extattr_list_fd_args /* { int fd; int attrnamespace; void *data; size_t nbytes; } */ *uap; { struct file *fp; int error; error = getvnode(td->td_proc->p_fd, uap->fd, &fp); if (error) return (error); error = extattr_list_vp(fp->f_vnode, uap->attrnamespace, uap->data, uap->nbytes, td); fdrop(fp, td); return (error); } int extattr_list_file(td, uap) struct thread*td; struct extattr_list_file_args /* { const char *path; int attrnamespace; void *data; size_t nbytes; } */ *uap; { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_list_vp(nd.ni_vp, uap->attrnamespace, uap->data, uap->nbytes, td); vrele(nd.ni_vp); return (error); } int extattr_list_link(td, uap) struct thread*td; struct extattr_list_link_args /* { const char *path; int attrnamespace; void *data; size_t nbytes; } */ *uap; { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_list_vp(nd.ni_vp, uap->attrnamespace, uap->data, uap->nbytes, td); vrele(nd.ni_vp); return (error); } Index: head/sys/sys/mount.h =================================================================== --- head/sys/sys/mount.h (revision 122536) +++ head/sys/sys/mount.h (revision 122537) @@ -1,550 +1,577 @@ /* * Copyright (c) 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)mount.h 8.21 (Berkeley) 5/20/95 * $FreeBSD$ */ #ifndef _SYS_MOUNT_H_ #define _SYS_MOUNT_H_ #include #include #ifdef _KERNEL #include #include #include #endif typedef struct fsid { int32_t val[2]; } fsid_t; /* filesystem id type */ /* * File identifier. * These are unique per filesystem on a single machine. */ #define MAXFIDSZ 16 struct fid { u_short fid_len; /* length of data in bytes */ u_short fid_reserved; /* force longword alignment */ char fid_data[MAXFIDSZ]; /* data (variable length) */ }; /* * filesystem statistics */ +#define MFSNAMELEN 16 /* length of type name including null */ +#define MNAMELEN 88 /* size of on/from name bufs */ +#define STATFS_VERSION 0x20030518 /* current version number */ +struct statfs { + uint32_t f_version; /* structure version number */ + uint32_t f_type; /* type of filesystem */ + uint64_t f_flags; /* copy of mount exported flags */ + uint64_t f_bsize; /* filesystem fragment size */ + uint64_t f_iosize; /* optimal transfer block size */ + uint64_t f_blocks; /* total data blocks in filesystem */ + uint64_t f_bfree; /* free blocks in filesystem */ + int64_t f_bavail; /* free blocks avail to non-superuser */ + uint64_t f_files; /* total file nodes in filesystem */ + int64_t f_ffree; /* free nodes avail to non-superuser */ + uint64_t f_syncwrites; /* count of sync writes since mount */ + uint64_t f_asyncwrites; /* count of async writes since mount */ + uint64_t f_syncreads; /* count of sync reads since mount */ + uint64_t f_asyncreads; /* count of async reads since mount */ + uint64_t f_spare[10]; /* unused spare */ + uint32_t f_namemax; /* maximum filename length */ + uid_t f_owner; /* user that mounted the filesystem */ + fsid_t f_fsid; /* filesystem id */ + char f_charspare[80]; /* spare string space */ + char f_fstypename[MFSNAMELEN]; /* filesystem type name */ + char f_mntfromname[MNAMELEN]; /* mounted filesystem */ + char f_mntonname[MNAMELEN]; /* directory on which mounted */ +}; -#define MFSNAMELEN 16 /* length of fs type name, including null */ -#define MNAMELEN (88 - 2 * sizeof(long)) /* size of on/from name bufs */ +#ifdef _KERNEL +#define OMFSNAMELEN 16 /* length of fs type name, including null */ +#define OMNAMELEN (88 - 2 * sizeof(long)) /* size of on/from name bufs */ /* XXX getfsstat.2 is out of date with write and read counter changes here. */ /* XXX statfs.2 is out of date with read counter changes here. 
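The replacement statfs layout above moves every counter to a fixed-width type and stamps the structure with STATFS_VERSION. A short userland sketch that reads it back through statfs(2) and prints a few of the widened fields; the path is illustrative:

#include <sys/param.h>
#include <sys/mount.h>

#include <err.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        struct statfs sfs;

        if (statfs("/", &sfs) == -1)
                err(1, "statfs");

        /* f_version should carry STATFS_VERSION (0x20030518). */
        printf("version:  0x%08x\n", (unsigned int)sfs.f_version);
        printf("fs:       %s on %s from %s\n", sfs.f_fstypename,
            sfs.f_mntonname, sfs.f_mntfromname);
        printf("fragment: %ju bytes\n", (uintmax_t)sfs.f_bsize);
        printf("blocks:   %ju total, %ju free, %jd avail\n",
            (uintmax_t)sfs.f_blocks, (uintmax_t)sfs.f_bfree,
            (intmax_t)sfs.f_bavail);
        return (0);
}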
*/ -struct statfs { +struct ostatfs { long f_spare2; /* placeholder */ long f_bsize; /* fundamental filesystem block size */ long f_iosize; /* optimal transfer block size */ long f_blocks; /* total data blocks in filesystem */ long f_bfree; /* free blocks in fs */ long f_bavail; /* free blocks avail to non-superuser */ long f_files; /* total file nodes in filesystem */ long f_ffree; /* free file nodes in fs */ fsid_t f_fsid; /* filesystem id */ uid_t f_owner; /* user that mounted the filesystem */ int f_type; /* type of filesystem */ int f_flags; /* copy of mount exported flags */ long f_syncwrites; /* count of sync writes since mount */ long f_asyncwrites; /* count of async writes since mount */ - char f_fstypename[MFSNAMELEN]; /* fs type name */ - char f_mntonname[MNAMELEN]; /* directory on which mounted */ + char f_fstypename[OMFSNAMELEN]; /* fs type name */ + char f_mntonname[OMNAMELEN]; /* directory on which mounted */ long f_syncreads; /* count of sync reads since mount */ long f_asyncreads; /* count of async reads since mount */ short f_spares1; /* unused spare */ - char f_mntfromname[MNAMELEN];/* mounted filesystem */ + char f_mntfromname[OMNAMELEN];/* mounted filesystem */ short f_spares2; /* unused spare */ /* * XXX on machines where longs are aligned to 8-byte boundaries, there * is an unnamed int32_t here. This spare was after the apparent end * of the struct until we bit off the read counters from f_mntonname. */ long f_spare[2]; /* unused spare */ }; -#ifdef _KERNEL #define MMAXOPTIONLEN 65536 /* maximum length of a mount option */ TAILQ_HEAD(vnodelst, vnode); TAILQ_HEAD(vfsoptlist, vfsopt); struct vfsopt { TAILQ_ENTRY(vfsopt) link; char *name; void *value; int len; }; /* * Structure per mounted filesystem. Each mounted filesystem has an * array of operations and an instance record. The filesystems are * put on a doubly linked list. * * NOTE: mnt_nvnodelist and mnt_reservedvnlist. At the moment vnodes * are linked into mnt_nvnodelist. At some point in the near future the * vnode list will be split into a 'dirty' and 'clean' list. mnt_nvnodelist * will become the dirty list and mnt_reservedvnlist will become the 'clean' * list. Filesystem kld's syncing code should remain compatible since * they only need to scan the dirty vnode list (nvnodelist -> dirtyvnodelist). 
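The ostatfs layout retained above for binary compatibility is built from plain long fields, so its size and padding depend on the ABI; that is why the old name-buffer length was defined as (88 - 2 * sizeof(long)) and why the unnamed int32_t of padding mentioned in its comment appears only on machines with 8-byte longs. A throwaway sketch that makes the dependency visible; it defines nothing from the real headers:

#include <stdio.h>

int
main(void)
{
        /* 80 bytes on ILP32 ABIs, 72 bytes on LP64 ABIs. */
        printf("sizeof(long) = %zu\n", sizeof(long));
        printf("old-style name buffer: %zu bytes\n",
            88 - 2 * sizeof(long));
        return (0);
}

The new structure sidesteps the problem with uint64_t/int64_t counters and a constant MNAMELEN of 88.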
*/ struct mount { TAILQ_ENTRY(mount) mnt_list; /* mount list */ struct vfsops *mnt_op; /* operations on fs */ struct vfsconf *mnt_vfc; /* configuration info */ struct vnode *mnt_vnodecovered; /* vnode we mounted on */ struct vnode *mnt_syncer; /* syncer vnode */ struct vnodelst mnt_nvnodelist; /* list of vnodes this mount */ struct vnodelst mnt_reservedvnlist; /* (future) dirty vnode list */ struct lock mnt_lock; /* mount structure lock */ struct mtx mnt_mtx; /* mount structure interlock */ int mnt_writeopcount; /* write syscalls in progress */ int mnt_flag; /* flags shared with user */ struct vfsoptlist *mnt_opt; /* current mount options */ struct vfsoptlist *mnt_optnew; /* new options passed to fs */ int mnt_kern_flag; /* kernel only flags */ int mnt_maxsymlinklen; /* max size of short symlink */ struct statfs mnt_stat; /* cache of filesystem stats */ struct ucred *mnt_cred; /* credentials of mounter */ qaddr_t mnt_data; /* private data */ time_t mnt_time; /* last time written*/ int mnt_iosize_max; /* max size for clusters, etc */ struct netexport *mnt_export; /* export list */ struct label *mnt_mntlabel; /* MAC label for the mount */ struct label *mnt_fslabel; /* MAC label for the fs */ int mnt_nvnodelistsize; /* # of vnodes on this mount */ }; #define MNT_ILOCK(mp) mtx_lock(&(mp)->mnt_mtx) #define MNT_IUNLOCK(mp) mtx_unlock(&(mp)->mnt_mtx) #endif /* _KERNEL */ /* * User specifiable flags. */ #define MNT_RDONLY 0x00000001 /* read only filesystem */ #define MNT_SYNCHRONOUS 0x00000002 /* filesystem written synchronously */ #define MNT_NOEXEC 0x00000004 /* can't exec from filesystem */ #define MNT_NOSUID 0x00000008 /* don't honor setuid bits on fs */ #define MNT_NODEV 0x00000010 /* don't interpret special files */ #define MNT_UNION 0x00000020 /* union with underlying filesystem */ #define MNT_ASYNC 0x00000040 /* filesystem written asynchronously */ #define MNT_SUIDDIR 0x00100000 /* special handling of SUID on dirs */ #define MNT_SOFTDEP 0x00200000 /* soft updates being done */ #define MNT_NOSYMFOLLOW 0x00400000 /* do not follow symlinks */ #define MNT_JAILDEVFS 0x02000000 /* jail-friendly DEVFS behaviour */ #define MNT_MULTILABEL 0x04000000 /* MAC support for individual objects */ #define MNT_ACLS 0x08000000 /* ACL support enabled */ #define MNT_NOATIME 0x10000000 /* disable update of file access time */ #define MNT_NOCLUSTERR 0x40000000 /* disable cluster read */ #define MNT_NOCLUSTERW 0x80000000 /* disable cluster write */ /* * NFS export related mount flags. */ #define MNT_EXRDONLY 0x00000080 /* exported read only */ #define MNT_EXPORTED 0x00000100 /* filesystem is exported */ #define MNT_DEFEXPORTED 0x00000200 /* exported to the world */ #define MNT_EXPORTANON 0x00000400 /* use anon uid mapping for everyone */ #define MNT_EXKERB 0x00000800 /* exported with Kerberos uid mapping */ #define MNT_EXPUBLIC 0x20000000 /* public export (WebNFS) */ /* * Flags set by internal operations, * but visible to the user. * XXX some of these are not quite right.. (I've never seen the root flag set) */ #define MNT_LOCAL 0x00001000 /* filesystem is stored locally */ #define MNT_QUOTA 0x00002000 /* quotas are enabled on filesystem */ #define MNT_ROOTFS 0x00004000 /* identifies the root filesystem */ #define MNT_USER 0x00008000 /* mounted by a user */ #define MNT_IGNORE 0x00800000 /* do not show entry in df */ /* * Mask of flags that are visible to statfs(). * XXX I think that this could now become (~(MNT_CMDFLAGS)) * but the 'mount' program may need changing to handle this. 
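The user-visible flags above are what statfs() reports in f_flags (filtered through MNT_VISFLAGMASK, defined just below). A small sketch of testing them from userland; the path is illustrative:

#include <sys/param.h>
#include <sys/mount.h>

#include <err.h>
#include <stdio.h>

int
main(void)
{
        struct statfs sfs;

        if (statfs("/tmp", &sfs) == -1)
                err(1, "statfs");

        printf("%s is %s", sfs.f_mntonname,
            (sfs.f_flags & MNT_LOCAL) ? "local" : "not local");
        if (sfs.f_flags & MNT_RDONLY)
                printf(", read-only");
        if (sfs.f_flags & MNT_NOSUID)
                printf(", nosuid");
        printf("\n");
        return (0);
}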
*/ #define MNT_VISFLAGMASK (MNT_RDONLY | MNT_SYNCHRONOUS | MNT_NOEXEC | \ MNT_NOSUID | MNT_NODEV | MNT_UNION | \ MNT_ASYNC | MNT_EXRDONLY | MNT_EXPORTED | \ MNT_DEFEXPORTED | MNT_EXPORTANON| MNT_EXKERB | \ MNT_LOCAL | MNT_USER | MNT_QUOTA | \ MNT_ROOTFS | MNT_NOATIME | MNT_NOCLUSTERR| \ MNT_NOCLUSTERW | MNT_SUIDDIR | MNT_SOFTDEP | \ MNT_IGNORE | MNT_EXPUBLIC | MNT_NOSYMFOLLOW | \ MNT_JAILDEVFS | MNT_MULTILABEL | MNT_ACLS) /* Mask of flags that can be updated. */ #define MNT_UPDATEMASK (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV | \ MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | \ MNT_NOATIME | \ MNT_NOSYMFOLLOW | MNT_IGNORE | MNT_JAILDEVFS | \ MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR | \ MNT_ACLS ) /* * External filesystem command modifier flags. * Unmount can use the MNT_FORCE flag. * XXX These are not STATES and really should be somewhere else. */ #define MNT_UPDATE 0x00010000 /* not a real mount, just an update */ #define MNT_DELEXPORT 0x00020000 /* delete export host lists */ #define MNT_RELOAD 0x00040000 /* reload filesystem data */ #define MNT_FORCE 0x00080000 /* force unmount or readonly change */ #define MNT_SNAPSHOT 0x01000000 /* snapshot the filesystem */ #define MNT_BYFSID 0x08000000 /* specify filesystem by ID. */ #define MNT_CMDFLAGS (MNT_UPDATE | MNT_DELEXPORT | MNT_RELOAD | \ MNT_FORCE | MNT_SNAPSHOT | MNT_BYFSID) /* * Internal filesystem control flags stored in mnt_kern_flag. * * MNTK_UNMOUNT locks the mount entry so that name lookup cannot proceed * past the mount point. This keeps the subtree stable during mounts * and unmounts. * * MNTK_UNMOUNTF permits filesystems to detect a forced unmount while * dounmount() is still waiting to lock the mountpoint. This allows * the filesystem to cancel operations that might otherwise deadlock * with the unmount attempt (used by NFS). */ #define MNTK_UNMOUNTF 0x00000001 /* forced unmount in progress */ #define MNTK_UNMOUNT 0x01000000 /* unmount in progress */ #define MNTK_MWAIT 0x02000000 /* waiting for unmount to finish */ #define MNTK_WANTRDWR 0x04000000 /* upgrade to read/write requested */ #define MNTK_SUSPEND 0x08000000 /* request write suspension */ #define MNTK_SUSPENDED 0x10000000 /* write operations are suspended */ /* * Sysctl CTL_VFS definitions. * * Second level identifier specifies which filesystem. Second level * identifier VFS_VFSCONF returns information about all filesystems. * Second level identifier VFS_GENERIC is non-terminal. */ #define VFS_VFSCONF 0 /* get configured filesystems */ #define VFS_GENERIC 0 /* generic filesystem information */ /* * Third level identifiers for VFS_GENERIC are given below; third * level identifiers for specific filesystems are given in their * mount specific header files. */ #define VFS_MAXTYPENUM 1 /* int: highest defined filesystem type */ #define VFS_CONF 2 /* struct: vfsconf for filesystem given as next argument */ /* * Flags for various system call interfaces. * * waitfor flags to vfs_sync() and getfsstat() */ #define MNT_WAIT 1 /* synchronously wait for I/O to complete */ #define MNT_NOWAIT 2 /* start all I/O, but do not wait for it */ #define MNT_LAZY 3 /* push data not written by filesystem syncer */ /* * Generic file handle */ struct fhandle { fsid_t fh_fsid; /* Filesystem id of mount point */ struct fid fh_fid; /* Filesys specific id */ }; typedef struct fhandle fhandle_t; /* * Export arguments for local filesystem mount calls. 
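fhandle_t just above is the generic file handle consumed by the handle-based syscalls declared near the end of this header. A hedged sketch of using them; the path is illustrative, and getfh()/fhopen() normally require superuser privilege:

#include <sys/param.h>
#include <sys/mount.h>

#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        struct statfs sfs;
        fhandle_t fh;
        int fd;

        /* Translate a path into an fsid + fid pair. */
        if (getfh("/var/log/messages", &fh) == -1)
                err(1, "getfh");

        /* Re-open the file by handle, without using the path again. */
        if ((fd = fhopen(&fh, O_RDONLY)) == -1)
                err(1, "fhopen");

        /* Statistics for the filesystem that holds the handle. */
        if (fhstatfs(&fh, &sfs) == -1)
                err(1, "fhstatfs");
        printf("handle lives on %s\n", sfs.f_mntonname);

        close(fd);
        return (0);
}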
*/ struct export_args { int ex_flags; /* export related flags */ uid_t ex_root; /* mapping for root uid */ struct xucred ex_anon; /* mapping for anonymous user */ struct sockaddr *ex_addr; /* net address to which exported */ u_char ex_addrlen; /* and the net address length */ struct sockaddr *ex_mask; /* mask of valid bits in saddr */ u_char ex_masklen; /* and the smask length */ char *ex_indexfile; /* index file for WebNFS URLs */ }; /* * Structure holding information for a publicly exported filesystem * (WebNFS). Currently the specs allow just for one such filesystem. */ struct nfs_public { int np_valid; /* Do we hold valid information */ fhandle_t np_handle; /* Filehandle for pub fs (internal) */ struct mount *np_mount; /* Mountpoint of exported fs */ char *np_index; /* Index file */ }; /* * Filesystem configuration information. One of these exists for each * type of filesystem supported by the kernel. These are searched at * mount time to identify the requested filesystem. */ struct vfsconf { struct vfsops *vfc_vfsops; /* filesystem operations vector */ char vfc_name[MFSNAMELEN]; /* filesystem type name */ int vfc_typenum; /* historic filesystem type number */ int vfc_refcount; /* number mounted of this type */ int vfc_flags; /* permanent flags */ struct vfsoptdecl *vfc_opts; /* mount options */ struct vfsconf *vfc_next; /* next in list */ }; /* Userland version of the struct vfsconf. */ struct xvfsconf { struct vfsops *vfc_vfsops; /* filesystem operations vector */ char vfc_name[MFSNAMELEN]; /* filesystem type name */ int vfc_typenum; /* historic filesystem type number */ int vfc_refcount; /* number mounted of this type */ int vfc_flags; /* permanent flags */ struct vfsconf *vfc_next; /* next in list */ }; struct ovfsconf { void *vfc_vfsops; char vfc_name[32]; int vfc_index; int vfc_refcount; int vfc_flags; }; /* * NB: these flags refer to IMPLEMENTATION properties, not properties of * any actual mounts; i.e., it does not make sense to change the flags. */ #define VFCF_STATIC 0x00010000 /* statically compiled into kernel */ #define VFCF_NETWORK 0x00020000 /* may get data over the network */ #define VFCF_READONLY 0x00040000 /* writes are not implemented */ #define VFCF_SYNTHETIC 0x00080000 /* data does not represent real files */ #define VFCF_LOOPBACK 0x00100000 /* aliases some other mounted FS */ #define VFCF_UNICODE 0x00200000 /* stores file names as Unicode*/ struct iovec; struct uio; #ifdef _KERNEL #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_MOUNT); #endif extern int maxvfsconf; /* highest defined filesystem type */ extern int nfs_mount_type; /* vfc_typenum for nfs, or -1 */ extern struct vfsconf *vfsconf; /* head of list of filesystem types */ /* * Operations supported on mounted filesystem. 
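struct xvfsconf above is the userland view of a configured filesystem type; getvfsbyname(3), prototyped at the end of this header, fills one in by name. A brief sketch, assuming a ufs filesystem type is configured:

#include <sys/param.h>
#include <sys/mount.h>

#include <err.h>
#include <stdio.h>

int
main(void)
{
        struct xvfsconf vfc;

        if (getvfsbyname("ufs", &vfc) == -1)
                err(1, "getvfsbyname");

        printf("%s: typenum %d, %d mount(s), flags 0x%x\n",
            vfc.vfc_name, vfc.vfc_typenum, vfc.vfc_refcount,
            vfc.vfc_flags);
        return (0);
}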
*/ struct mount_args; struct nameidata; typedef int vfs_mount_t(struct mount *mp, char *path, caddr_t data, struct nameidata *ndp, struct thread *td); typedef int vfs_start_t(struct mount *mp, int flags, struct thread *td); typedef int vfs_unmount_t(struct mount *mp, int mntflags, struct thread *td); typedef int vfs_root_t(struct mount *mp, struct vnode **vpp); typedef int vfs_quotactl_t(struct mount *mp, int cmds, uid_t uid, caddr_t arg, struct thread *td); typedef int vfs_statfs_t(struct mount *mp, struct statfs *sbp, struct thread *td); typedef int vfs_sync_t(struct mount *mp, int waitfor, struct ucred *cred, struct thread *td); typedef int vfs_vget_t(struct mount *mp, ino_t ino, int flags, struct vnode **vpp); typedef int vfs_fhtovp_t(struct mount *mp, struct fid *fhp, struct vnode **vpp); typedef int vfs_checkexp_t(struct mount *mp, struct sockaddr *nam, int *extflagsp, struct ucred **credanonp); typedef int vfs_vptofh_t(struct vnode *vp, struct fid *fhp); typedef int vfs_init_t(struct vfsconf *); typedef int vfs_uninit_t(struct vfsconf *); typedef int vfs_extattrctl_t(struct mount *mp, int cmd, struct vnode *filename_vp, int attrnamespace, const char *attrname, struct thread *td); typedef int vfs_nmount_t(struct mount *mp, struct nameidata *ndp, struct thread *td); struct vfsops { vfs_mount_t *vfs_mount; vfs_start_t *vfs_start; vfs_unmount_t *vfs_unmount; vfs_root_t *vfs_root; vfs_quotactl_t *vfs_quotactl; vfs_statfs_t *vfs_statfs; vfs_sync_t *vfs_sync; vfs_vget_t *vfs_vget; vfs_fhtovp_t *vfs_fhtovp; vfs_checkexp_t *vfs_checkexp; vfs_vptofh_t *vfs_vptofh; vfs_init_t *vfs_init; vfs_uninit_t *vfs_uninit; vfs_extattrctl_t *vfs_extattrctl; /* Additions below are not binary compatible with 5.0 and below. */ vfs_nmount_t *vfs_nmount; }; #define VFS_NMOUNT(MP, NDP, P) (*(MP)->mnt_op->vfs_nmount)(MP, NDP, P) #define VFS_MOUNT(MP, PATH, DATA, NDP, P) \ (*(MP)->mnt_op->vfs_mount)(MP, PATH, DATA, NDP, P) #define VFS_START(MP, FLAGS, P) (*(MP)->mnt_op->vfs_start)(MP, FLAGS, P) #define VFS_UNMOUNT(MP, FORCE, P) (*(MP)->mnt_op->vfs_unmount)(MP, FORCE, P) #define VFS_ROOT(MP, VPP) (*(MP)->mnt_op->vfs_root)(MP, VPP) #define VFS_QUOTACTL(MP,C,U,A,P) (*(MP)->mnt_op->vfs_quotactl)(MP, C, U, A, P) #define VFS_STATFS(MP, SBP, P) (*(MP)->mnt_op->vfs_statfs)(MP, SBP, P) #define VFS_SYNC(MP, WAIT, C, P) (*(MP)->mnt_op->vfs_sync)(MP, WAIT, C, P) #define VFS_VGET(MP, INO, FLAGS, VPP) \ (*(MP)->mnt_op->vfs_vget)(MP, INO, FLAGS, VPP) #define VFS_FHTOVP(MP, FIDP, VPP) \ (*(MP)->mnt_op->vfs_fhtovp)(MP, FIDP, VPP) #define VFS_VPTOFH(VP, FIDP) (*(VP)->v_mount->mnt_op->vfs_vptofh)(VP, FIDP) #define VFS_CHECKEXP(MP, NAM, EXFLG, CRED) \ (*(MP)->mnt_op->vfs_checkexp)(MP, NAM, EXFLG, CRED) #define VFS_EXTATTRCTL(MP, C, FN, NS, N, P) \ (*(MP)->mnt_op->vfs_extattrctl)(MP, C, FN, NS, N, P) #include #define VFS_SET(vfsops, fsname, flags) \ static struct vfsconf fsname ## _vfsconf = { \ &vfsops, \ #fsname, \ -1, \ 0, \ flags \ }; \ static moduledata_t fsname ## _mod = { \ #fsname, \ vfs_modevent, \ & fsname ## _vfsconf \ }; \ DECLARE_MODULE(fsname, fsname ## _mod, SI_SUB_VFS, SI_ORDER_MIDDLE) extern char *mountrootfsname; /* * exported vnode operations */ int dounmount(struct mount *, int, struct thread *td); int kernel_mount(struct iovec *iovp, unsigned int iovcnt, int flags); int kernel_vmount(int flags, ...); int vfs_getopt(struct vfsoptlist *, const char *, void **, int *); int vfs_copyopt(struct vfsoptlist *, const char *, void *, int); int vfs_mount(struct thread *td, const char *type, char *path, int flags, void 
*data); int vfs_setpublicfs /* set publicly exported fs */ (struct mount *, struct netexport *, struct export_args *); int vfs_lock(struct mount *); /* lock a vfs */ void vfs_msync(struct mount *, int); void vfs_unlock(struct mount *); /* unlock a vfs */ int vfs_busy(struct mount *, int, struct mtx *, struct thread *td); int vfs_export /* process mount export info */ (struct mount *, struct export_args *); struct netcred *vfs_export_lookup /* lookup host in fs export list */ (struct mount *, struct sockaddr *); int vfs_allocate_syncvnode(struct mount *); void vfs_getnewfsid(struct mount *); dev_t vfs_getrootfsid(struct mount *); struct mount *vfs_getvfs(fsid_t *); /* return vfs given fsid */ int vfs_modevent(module_t, int, void *); int vfs_mountedon(struct vnode *); /* is a vfs mounted on vp */ void vfs_mountroot(void); /* mount our root filesystem */ int vfs_rootmountalloc(char *, char *, struct mount **); void vfs_mount_destroy(struct mount *, struct thread *); void vfs_unbusy(struct mount *, struct thread *td); void vfs_unmountall(void); int vfs_register(struct vfsconf *); int vfs_unregister(struct vfsconf *); extern TAILQ_HEAD(mntlist, mount) mountlist; /* mounted filesystem list */ extern struct mtx mountlist_mtx; extern struct nfs_public nfs_pub; /* * Declarations for these vfs default operations are located in * kern/vfs_default.c, they should be used instead of making "dummy" * functions or casting entries in the VFS op table to "enopnotsupp()". */ vfs_start_t vfs_stdstart; vfs_root_t vfs_stdroot; vfs_quotactl_t vfs_stdquotactl; vfs_statfs_t vfs_stdstatfs; vfs_sync_t vfs_stdsync; vfs_sync_t vfs_stdnosync; vfs_vget_t vfs_stdvget; vfs_fhtovp_t vfs_stdfhtovp; vfs_checkexp_t vfs_stdcheckexp; vfs_vptofh_t vfs_stdvptofh; vfs_init_t vfs_stdinit; vfs_uninit_t vfs_stduninit; vfs_extattrctl_t vfs_stdextattrctl; /* XXX - these should be indirect functions!!! */ int softdep_fsync(struct vnode *); int softdep_process_worklist(struct mount *); #else /* !_KERNEL */ #include struct stat; __BEGIN_DECLS int fhopen(const struct fhandle *, int); int fhstat(const struct fhandle *, struct stat *); int fhstatfs(const struct fhandle *, struct statfs *); int fstatfs(int, struct statfs *); int getfh(const char *, fhandle_t *); int getfsstat(struct statfs *, long, int); int getmntinfo(struct statfs **, int); int mount(const char *, const char *, int, void *); int nmount(struct iovec *, u_int, int); int statfs(const char *, struct statfs *); int unmount(const char *, int); /* C library stuff */ void endvfsent(void); struct ovfsconf *getvfsbytype(int); struct ovfsconf *getvfsent(void); int getvfsbyname(const char *, struct xvfsconf *); void setvfsent(int); int vfsisloadable(const char *); int vfsload(const char *); __END_DECLS #endif /* _KERNEL */ #endif /* !_SYS_MOUNT_H_ */ Index: head/sys/ufs/ffs/ffs_vfsops.c =================================================================== --- head/sys/ufs/ffs/ffs_vfsops.c (revision 122536) +++ head/sys/ufs/ffs/ffs_vfsops.c (revision 122537) @@ -1,1549 +1,1560 @@ /* * Copyright (c) 1989, 1991, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_vfsops.c 8.31 (Berkeley) 5/20/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include "opt_quota.h" #include "opt_ufs.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include uma_zone_t uma_inode, uma_ufs1, uma_ufs2; static int ffs_sbupdate(struct ufsmount *, int); int ffs_reload(struct mount *,struct ucred *,struct thread *); static int ffs_mountfs(struct vnode *, struct mount *, struct thread *); static void ffs_oldfscompat_read(struct fs *, struct ufsmount *, ufs2_daddr_t); static void ffs_oldfscompat_write(struct fs *, struct ufsmount *); static void ffs_ifree(struct ufsmount *ump, struct inode *ip); static vfs_init_t ffs_init; static vfs_uninit_t ffs_uninit; static vfs_extattrctl_t ffs_extattrctl; static struct vfsops ufs_vfsops = { .vfs_extattrctl = ffs_extattrctl, .vfs_fhtovp = ffs_fhtovp, .vfs_init = ffs_init, .vfs_mount = ffs_mount, .vfs_quotactl = ufs_quotactl, .vfs_root = ufs_root, .vfs_start = ufs_start, .vfs_statfs = ffs_statfs, .vfs_sync = ffs_sync, .vfs_uninit = ffs_uninit, .vfs_unmount = ffs_unmount, .vfs_vget = ffs_vget, .vfs_vptofh = ffs_vptofh, }; VFS_SET(ufs_vfsops, ufs, 0); /* * ffs_mount * * Called when mounting local physical media * * PARAMETERS: * mountroot * mp mount point structure * path NULL (flag for root mount!!!) * data * ndp * p process (user credentials check [statfs]) * * mount * mp mount point structure * path path to mount point * data pointer to argument struct in user space * ndp mount point namei() return (used for * credentials on reload), reused to look * up block device. * p process (user credentials check) * * RETURNS: 0 Success * !0 error number (errno.h) * * LOCK STATE: * * ENTRY * mount point is locked * EXIT * mount point is locked * * NOTES: * A NULL path can be used for a flag since the mount * system call will fail with EFAULT in copyinstr in * namei() if it is a genuine NULL from the user. 
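The data argument described above arrives from userland as a struct ufs_args, whose fspec names the block device to mount. A hedged sketch of the matching mount(2) call; the device node and mount point are illustrative, and struct ufs_args is assumed to come from <ufs/ufs/ufsmount.h>:

#include <sys/param.h>
#include <sys/mount.h>

#include <ufs/ufs/ufsmount.h>

#include <err.h>
#include <string.h>

int
main(void)
{
        struct ufs_args args;

        memset(&args, 0, sizeof(args));
        args.fspec = "/dev/ad0s1d";     /* block device carrying the fs */

        /* Read-only mount of a UFS filesystem on an existing directory. */
        if (mount("ufs", "/mnt", MNT_RDONLY, &args) == -1)
                err(1, "mount");
        return (0);
}

Passing MNT_UPDATE (optionally with MNT_RELOAD or MNT_FORCE) instead re-enters ffs_mount() on the update path shown below.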
*/ int ffs_mount(mp, path, data, ndp, td) struct mount *mp; /* mount struct pointer*/ char *path; /* path to mount point*/ caddr_t data; /* arguments to FS specific mount*/ struct nameidata *ndp; /* mount point credentials*/ struct thread *td; /* process requesting mount*/ { size_t size; struct vnode *devvp; struct ufs_args args; struct ufsmount *ump = 0; struct fs *fs; int error, flags; mode_t accessmode; if (uma_inode == NULL) { uma_inode = uma_zcreate("FFS inode", sizeof(struct inode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_ufs1 = uma_zcreate("FFS1 dinode", sizeof(struct ufs1_dinode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_ufs2 = uma_zcreate("FFS2 dinode", sizeof(struct ufs2_dinode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } /* * Use NULL path to indicate we are mounting the root filesystem. */ if (path == NULL) { if ((error = bdevvp(rootdev, &rootvp))) { printf("ffs_mountroot: can't find rootvp\n"); return (error); } if ((error = ffs_mountfs(rootvp, mp, td)) != 0) return (error); (void)VFS_STATFS(mp, &mp->mnt_stat, td); return (0); } /* * Mounting non-root filesystem or updating a filesystem */ if ((error = copyin(data, (caddr_t)&args, sizeof(struct ufs_args)))!= 0) return (error); /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. */ if (mp->mnt_flag & MNT_UPDATE) { ump = VFSTOUFS(mp); fs = ump->um_fs; devvp = ump->um_devvp; if (fs->fs_ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) { if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0) return (error); /* * Flush any dirty data. */ if ((error = VFS_SYNC(mp, MNT_WAIT, td->td_ucred, td)) != 0) { vn_finished_write(mp); return (error); } /* * Check for and optionally get rid of files open * for writing. */ flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; if (mp->mnt_flag & MNT_SOFTDEP) { error = softdep_flushfiles(mp, flags, td); } else { error = ffs_flushfiles(mp, flags, td); } if (error) { vn_finished_write(mp); return (error); } if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("%s: %s: blocks %jd files %d\n", fs->fs_fsmnt, "update error", (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } fs->fs_ronly = 1; if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) fs->fs_clean = 1; if ((error = ffs_sbupdate(ump, MNT_WAIT)) != 0) { fs->fs_ronly = 0; fs->fs_clean = 0; vn_finished_write(mp); return (error); } vn_finished_write(mp); } if ((mp->mnt_flag & MNT_RELOAD) && (error = ffs_reload(mp, ndp->ni_cnd.cn_cred, td)) != 0) return (error); if (fs->fs_ronly && (mp->mnt_kern_flag & MNTK_WANTRDWR)) { /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. */ if (suser(td)) { vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td); if ((error = VOP_ACCESS(devvp, VREAD | VWRITE, td->td_ucred, td)) != 0) { VOP_UNLOCK(devvp, 0, td); return (error); } VOP_UNLOCK(devvp, 0, td); } fs->fs_flags &= ~FS_UNCLEAN; if (fs->fs_clean == 0) { fs->fs_flags |= FS_UNCLEAN; if ((mp->mnt_flag & MNT_FORCE) || ((fs->fs_flags & FS_NEEDSFSCK) == 0 && (fs->fs_flags & FS_DOSOFTDEP))) { printf("WARNING: %s was not %s\n", fs->fs_fsmnt, "properly dismounted"); } else { printf( "WARNING: R/W mount of %s denied. 
Filesystem is not clean - run fsck\n", fs->fs_fsmnt); return (EPERM); } } if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0) return (error); fs->fs_ronly = 0; fs->fs_clean = 0; if ((error = ffs_sbupdate(ump, MNT_WAIT)) != 0) { vn_finished_write(mp); return (error); } /* check to see if we need to start softdep */ if ((fs->fs_flags & FS_DOSOFTDEP) && (error = softdep_mount(devvp, mp, fs, td->td_ucred))){ vn_finished_write(mp); return (error); } if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); vn_finished_write(mp); } /* * Soft updates is incompatible with "async", * so if we are doing softupdates stop the user * from setting the async flag in an update. * Softdep_mount() clears it in an initial mount * or ro->rw remount. */ if (mp->mnt_flag & MNT_SOFTDEP) mp->mnt_flag &= ~MNT_ASYNC; /* * If not updating name, process export requests. */ if (args.fspec == 0) return (vfs_export(mp, &args.export)); /* * If this is a snapshot request, take the snapshot. */ if (mp->mnt_flag & MNT_SNAPSHOT) return (ffs_snapshot(mp, args.fspec)); } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible block device. */ NDINIT(ndp, LOOKUP, FOLLOW, UIO_USERSPACE, args.fspec, td); if ((error = namei(ndp)) != 0) return (error); NDFREE(ndp, NDF_ONLY_PNBUF); devvp = ndp->ni_vp; if (!vn_isdisk(devvp, &error)) { vrele(devvp); return (error); } /* * If mount by non-root, then verify that user has necessary * permissions on the device. */ if (suser(td)) { accessmode = VREAD; if ((mp->mnt_flag & MNT_RDONLY) == 0) accessmode |= VWRITE; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td); if ((error = VOP_ACCESS(devvp, accessmode, td->td_ucred, td))!= 0){ vput(devvp); return (error); } VOP_UNLOCK(devvp, 0, td); } if (mp->mnt_flag & MNT_UPDATE) { /* * Update only * * If it's not the same vnode, or at least the same device * then it's not correct. */ if (devvp != ump->um_devvp && devvp->v_rdev != ump->um_devvp->v_rdev) error = EINVAL; /* needs translation */ vrele(devvp); if (error) return (error); } else { /* * New mount * * We need the name for the mount point (also used for * "last mounted on") copied in. If an error occurs, * the mount point is discarded by the upper level code. * Note that vfs_mount() populates f_mntonname for us. */ if ((error = ffs_mountfs(devvp, mp, td)) != 0) { vrele(devvp); return (error); } } /* * Save "mounted from" device name info for mount point (NULL pad). */ copyinstr(args.fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size); bzero( mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); /* * Initialize filesystem stat information in mount struct. */ (void)VFS_STATFS(mp, &mp->mnt_stat, td); return (0); } /* * Reload all incore data for a filesystem (used after running fsck on * the root filesystem and finding things to fix). The filesystem must * be mounted read-only. * * Things to do to update the mount: * 1) invalidate all cached meta-data. * 2) re-read superblock from disk. * 3) re-read summary information from disk. * 4) invalidate all inactive vnodes. * 5) invalidate all cached file data. * 6) re-read inode data for all active vnodes. */ int ffs_reload(mp, cred, td) struct mount *mp; struct ucred *cred; struct thread *td; { struct vnode *vp, *nvp, *devvp; struct inode *ip; void *space; struct buf *bp; struct fs *fs, *newfs; ufs2_daddr_t sblockloc; int i, blks, size, error; int32_t *lp; if ((mp->mnt_flag & MNT_RDONLY) == 0) return (EINVAL); /* * Step 1: invalidate all cached meta-data. 
*/ devvp = VFSTOUFS(mp)->um_devvp; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td); error = vinvalbuf(devvp, 0, cred, td, 0, 0); VOP_UNLOCK(devvp, 0, td); if (error) panic("ffs_reload: dirty1"); /* * Only VMIO the backing device if the backing device is a real * block device. */ if (vn_isdisk(devvp, NULL)) { vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td); vfs_object_create(devvp, td, td->td_ucred); VOP_UNLOCK(devvp, 0, td); } /* * Step 2: re-read superblock from disk. */ fs = VFSTOUFS(mp)->um_fs; if ((error = bread(devvp, btodb(fs->fs_sblockloc), fs->fs_sbsize, NOCRED, &bp)) != 0) return (error); newfs = (struct fs *)bp->b_data; if ((newfs->fs_magic != FS_UFS1_MAGIC && newfs->fs_magic != FS_UFS2_MAGIC) || newfs->fs_bsize > MAXBSIZE || newfs->fs_bsize < sizeof(struct fs)) { brelse(bp); return (EIO); /* XXX needs translation */ } /* * Copy pointer fields back into superblock before copying in XXX * new superblock. These should really be in the ufsmount. XXX * Note that important parameters (eg fs_ncg) are unchanged. */ newfs->fs_csp = fs->fs_csp; newfs->fs_maxcluster = fs->fs_maxcluster; newfs->fs_contigdirs = fs->fs_contigdirs; newfs->fs_active = fs->fs_active; sblockloc = fs->fs_sblockloc; bcopy(newfs, fs, (u_int)fs->fs_sbsize); brelse(bp); mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen; ffs_oldfscompat_read(fs, VFSTOUFS(mp), sblockloc); if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("%s: reload pending error: blocks %jd files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } /* * Step 3: re-read summary information from disk. */ blks = howmany(fs->fs_cssize, fs->fs_fsize); space = fs->fs_csp; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, NOCRED, &bp); if (error) return (error); bcopy(bp->b_data, space, (u_int)size); space = (char *)space + size; brelse(bp); } /* * We no longer know anything about clusters per cylinder group. */ if (fs->fs_contigsumsize > 0) { lp = fs->fs_maxcluster; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; } loop: MNT_ILOCK(mp); for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp != NULL; vp = nvp) { if (vp->v_mount != mp) { MNT_IUNLOCK(mp); goto loop; } nvp = TAILQ_NEXT(vp, v_nmntvnodes); VI_LOCK(vp); if (vp->v_iflag & VI_XLOCK) { VI_UNLOCK(vp); continue; } MNT_IUNLOCK(mp); /* * Step 4: invalidate all inactive vnodes. */ if (vp->v_usecount == 0) { vgonel(vp, td); goto loop; } /* * Step 5: invalidate all cached file data. */ if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) { goto loop; } if (vinvalbuf(vp, 0, cred, td, 0, 0)) panic("ffs_reload: dirty2"); /* * Step 6: re-read inode data for all active vnodes. */ ip = VTOI(vp); error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, NOCRED, &bp); if (error) { VOP_UNLOCK(vp, 0, td); vrele(vp); return (error); } ffs_load_inode(bp, ip, fs, ip->i_number); ip->i_effnlink = ip->i_nlink; brelse(bp); VOP_UNLOCK(vp, 0, td); vrele(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); return (0); } /* * Possible superblock locations ordered from most to least likely. 
*/ static int sblock_try[] = SBLOCKSEARCH; /* * Common code for mount and mountroot */ static int ffs_mountfs(devvp, mp, td) struct vnode *devvp; struct mount *mp; struct thread *td; { struct ufsmount *ump; struct buf *bp; struct fs *fs; dev_t dev; void *space; ufs2_daddr_t sblockloc; int error, i, blks, size, ronly; int32_t *lp; struct ucred *cred; size_t strsize; int ncount; dev = devvp->v_rdev; cred = td ? td->td_ucred : NOCRED; /* * Disallow multiple mounts of the same device. * Disallow mounting of a device that is currently in use * (except for root, which might share swap device for miniroot). * Flush out any old buffers remaining from a previous use. */ error = vfs_mountedon(devvp); if (error) return (error); ncount = vcount(devvp); if (ncount > 1 && devvp != rootvp) return (EBUSY); vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td); error = vinvalbuf(devvp, V_SAVE, cred, td, 0, 0); VOP_UNLOCK(devvp, 0, td); if (error) return (error); /* * Only VMIO the backing device if the backing device is a real * block device. * Note that it is optional that the backing device be VMIOed. This * increases the opportunity for metadata caching. */ if (vn_isdisk(devvp, NULL)) { vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td); vfs_object_create(devvp, td, cred); VOP_UNLOCK(devvp, 0, td); } ronly = (mp->mnt_flag & MNT_RDONLY) != 0; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td); /* * XXX: We don't re-VOP_OPEN in FREAD|FWRITE mode if the filesystem * XXX: is subsequently remounted, so open it FREAD|FWRITE from the * XXX: start to avoid getting trashed later on. */ #ifdef notyet error = VOP_OPEN(devvp, ronly ? FREAD : FREAD|FWRITE, FSCRED, td, -1); #else error = VOP_OPEN(devvp, FREAD|FWRITE, FSCRED, td, -1); #endif VOP_UNLOCK(devvp, 0, td); if (error) return (error); if (devvp->v_rdev->si_iosize_max != 0) mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max; if (mp->mnt_iosize_max > MAXPHYS) mp->mnt_iosize_max = MAXPHYS; bp = NULL; ump = NULL; fs = NULL; sblockloc = 0; /* * Try reading the superblock in each of its possible locations. */ for (i = 0; sblock_try[i] != -1; i++) { if ((error = bread(devvp, sblock_try[i] / DEV_BSIZE, SBLOCKSIZE, cred, &bp)) != 0) goto out; fs = (struct fs *)bp->b_data; sblockloc = sblock_try[i]; if ((fs->fs_magic == FS_UFS1_MAGIC || (fs->fs_magic == FS_UFS2_MAGIC && (fs->fs_sblockloc == sblockloc || (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0))) && fs->fs_bsize <= MAXBSIZE && fs->fs_bsize >= sizeof(struct fs)) break; brelse(bp); bp = NULL; } if (sblock_try[i] == -1) { error = EINVAL; /* XXX needs translation */ goto out; } fs->fs_fmod = 0; fs->fs_flags &= ~FS_INDEXDIRS; /* no support for directory indicies */ fs->fs_flags &= ~FS_UNCLEAN; if (fs->fs_clean == 0) { fs->fs_flags |= FS_UNCLEAN; if (ronly || (mp->mnt_flag & MNT_FORCE) || ((fs->fs_flags & FS_NEEDSFSCK) == 0 && (fs->fs_flags & FS_DOSOFTDEP))) { printf( "WARNING: %s was not properly dismounted\n", fs->fs_fsmnt); } else { printf( "WARNING: R/W mount of %s denied. 
Filesystem is not clean - run fsck\n", fs->fs_fsmnt); error = EPERM; goto out; } if ((fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) && (mp->mnt_flag & MNT_FORCE)) { printf("%s: lost blocks %jd files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } } if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("%s: mount pending error: blocks %jd files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK | M_ZERO); ump->um_fs = malloc((u_long)fs->fs_sbsize, M_UFSMNT, M_WAITOK); if (fs->fs_magic == FS_UFS1_MAGIC) { ump->um_fstype = UFS1; ump->um_balloc = ffs_balloc_ufs1; } else { ump->um_fstype = UFS2; ump->um_balloc = ffs_balloc_ufs2; } ump->um_blkatoff = ffs_blkatoff; ump->um_truncate = ffs_truncate; ump->um_update = ffs_update; ump->um_valloc = ffs_valloc; ump->um_vfree = ffs_vfree; ump->um_ifree = ffs_ifree; bcopy(bp->b_data, ump->um_fs, (u_int)fs->fs_sbsize); if (fs->fs_sbsize < SBLOCKSIZE) bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); bp = NULL; fs = ump->um_fs; ffs_oldfscompat_read(fs, ump, sblockloc); fs->fs_ronly = ronly; size = fs->fs_cssize; blks = howmany(size, fs->fs_fsize); if (fs->fs_contigsumsize > 0) size += fs->fs_ncg * sizeof(int32_t); size += fs->fs_ncg * sizeof(u_int8_t); space = malloc((u_long)size, M_UFSMNT, M_WAITOK); fs->fs_csp = space; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, cred, &bp)) != 0) { free(fs->fs_csp, M_UFSMNT); goto out; } bcopy(bp->b_data, space, (u_int)size); space = (char *)space + size; brelse(bp); bp = NULL; } if (fs->fs_contigsumsize > 0) { fs->fs_maxcluster = lp = space; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; space = lp; } size = fs->fs_ncg * sizeof(u_int8_t); fs->fs_contigdirs = (u_int8_t *)space; bzero(fs->fs_contigdirs, size); fs->fs_active = NULL; mp->mnt_data = (qaddr_t)ump; mp->mnt_stat.f_fsid.val[0] = fs->fs_id[0]; mp->mnt_stat.f_fsid.val[1] = fs->fs_id[1]; if (fs->fs_id[0] == 0 || fs->fs_id[1] == 0 || vfs_getvfs(&mp->mnt_stat.f_fsid)) vfs_getnewfsid(mp); mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen; mp->mnt_flag |= MNT_LOCAL; if ((fs->fs_flags & FS_MULTILABEL) != 0) #ifdef MAC mp->mnt_flag |= MNT_MULTILABEL; #else printf( "WARNING: %s: multilabel flag on fs but no MAC support\n", fs->fs_fsmnt); #endif if ((fs->fs_flags & FS_ACLS) != 0) #ifdef UFS_ACL mp->mnt_flag |= MNT_ACLS; #else printf( "WARNING: %s: ACLs flag on fs but no ACLs support\n", fs->fs_fsmnt); #endif ump->um_mountp = mp; ump->um_dev = dev; ump->um_devvp = devvp; ump->um_nindir = fs->fs_nindir; ump->um_bptrtodb = fs->fs_fsbtodb; ump->um_seqinc = fs->fs_frag; for (i = 0; i < MAXQUOTAS; i++) ump->um_quotas[i] = NULLVP; #ifdef UFS_EXTATTR ufs_extattr_uepm_init(&ump->um_extattr); #endif devvp->v_rdev->si_mountpoint = mp; /* * Set FS local "last mounted on" information (NULL pad) */ copystr( mp->mnt_stat.f_mntonname, /* mount point*/ fs->fs_fsmnt, /* copy area*/ sizeof(fs->fs_fsmnt) - 1, /* max size*/ &strsize); /* real size*/ bzero( fs->fs_fsmnt + strsize, sizeof(fs->fs_fsmnt) - strsize); if( mp->mnt_flag & MNT_ROOTFS) { /* * Root mount; update timestamp in mount structure. * this will be used by the common root mount code * to update the system clock. 
*/ mp->mnt_time = fs->fs_time; } if (ronly == 0) { if ((fs->fs_flags & FS_DOSOFTDEP) && (error = softdep_mount(devvp, mp, fs, cred)) != 0) { free(fs->fs_csp, M_UFSMNT); goto out; } if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); fs->fs_fmod = 1; fs->fs_clean = 0; (void) ffs_sbupdate(ump, MNT_WAIT); } #ifdef UFS_EXTATTR #ifdef UFS_EXTATTR_AUTOSTART /* * * Auto-starting does the following: * - check for /.attribute in the fs, and extattr_start if so * - for each file in .attribute, enable that file with * an attribute of the same name. * Not clear how to report errors -- probably eat them. * This would all happen while the filesystem was busy/not * available, so would effectively be "atomic". */ (void) ufs_extattr_autostart(mp, td); #endif /* !UFS_EXTATTR_AUTOSTART */ #endif /* !UFS_EXTATTR */ return (0); out: devvp->v_rdev->si_mountpoint = NULL; if (bp) brelse(bp); /* XXX: see comment above VOP_OPEN */ #ifdef notyet (void)VOP_CLOSE(devvp, ronly ? FREAD : FREAD|FWRITE, cred, td); #else (void)VOP_CLOSE(devvp, FREAD|FWRITE, cred, td); #endif if (ump) { free(ump->um_fs, M_UFSMNT); free(ump, M_UFSMNT); mp->mnt_data = (qaddr_t)0; } return (error); } #include int bigcgs = 0; SYSCTL_INT(_debug, OID_AUTO, bigcgs, CTLFLAG_RW, &bigcgs, 0, ""); /* * Sanity checks for loading old filesystem superblocks. * See ffs_oldfscompat_write below for unwound actions. * * XXX - Parts get retired eventually. * Unfortunately new bits get added. */ static void ffs_oldfscompat_read(fs, ump, sblockloc) struct fs *fs; struct ufsmount *ump; ufs2_daddr_t sblockloc; { off_t maxfilesize; /* * If not yet done, update fs_flags location and value of fs_sblockloc. */ if ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) { fs->fs_flags = fs->fs_old_flags; fs->fs_old_flags |= FS_FLAGS_UPDATED; fs->fs_sblockloc = sblockloc; } /* * If not yet done, update UFS1 superblock with new wider fields. */ if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_maxbsize != fs->fs_bsize) { fs->fs_maxbsize = fs->fs_bsize; fs->fs_time = fs->fs_old_time; fs->fs_size = fs->fs_old_size; fs->fs_dsize = fs->fs_old_dsize; fs->fs_csaddr = fs->fs_old_csaddr; fs->fs_cstotal.cs_ndir = fs->fs_old_cstotal.cs_ndir; fs->fs_cstotal.cs_nbfree = fs->fs_old_cstotal.cs_nbfree; fs->fs_cstotal.cs_nifree = fs->fs_old_cstotal.cs_nifree; fs->fs_cstotal.cs_nffree = fs->fs_old_cstotal.cs_nffree; } if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_old_inodefmt < FS_44INODEFMT) { fs->fs_maxfilesize = (u_quad_t) 1LL << 39; fs->fs_qbmask = ~fs->fs_bmask; fs->fs_qfmask = ~fs->fs_fmask; } if (fs->fs_magic == FS_UFS1_MAGIC) { ump->um_savedmaxfilesize = fs->fs_maxfilesize; maxfilesize = (u_int64_t)0x40000000 * fs->fs_bsize - 1; if (fs->fs_maxfilesize > maxfilesize) fs->fs_maxfilesize = maxfilesize; } /* Compatibility for old filesystems */ if (fs->fs_avgfilesize <= 0) fs->fs_avgfilesize = AVFILESIZ; if (fs->fs_avgfpdir <= 0) fs->fs_avgfpdir = AFPDIR; if (bigcgs) { fs->fs_save_cgsize = fs->fs_cgsize; fs->fs_cgsize = fs->fs_bsize; } } /* * Unwinding superblock updates for old filesystems. * See ffs_oldfscompat_read above for details. * * XXX - Parts get retired eventually. * Unfortunately new bits get added. */ static void ffs_oldfscompat_write(fs, ump) struct fs *fs; struct ufsmount *ump; { /* * Copy back UFS2 updated fields that UFS1 inspects. 
*/ if (fs->fs_magic == FS_UFS1_MAGIC) { fs->fs_old_time = fs->fs_time; fs->fs_old_cstotal.cs_ndir = fs->fs_cstotal.cs_ndir; fs->fs_old_cstotal.cs_nbfree = fs->fs_cstotal.cs_nbfree; fs->fs_old_cstotal.cs_nifree = fs->fs_cstotal.cs_nifree; fs->fs_old_cstotal.cs_nffree = fs->fs_cstotal.cs_nffree; fs->fs_maxfilesize = ump->um_savedmaxfilesize; } if (bigcgs) { fs->fs_cgsize = fs->fs_save_cgsize; fs->fs_save_cgsize = 0; } } /* * unmount system call */ int ffs_unmount(mp, mntflags, td) struct mount *mp; int mntflags; struct thread *td; { struct ufsmount *ump = VFSTOUFS(mp); struct fs *fs; int error, flags; flags = 0; if (mntflags & MNT_FORCE) { flags |= FORCECLOSE; } #ifdef UFS_EXTATTR if ((error = ufs_extattr_stop(mp, td))) { if (error != EOPNOTSUPP) printf("ffs_unmount: ufs_extattr_stop returned %d\n", error); } else { ufs_extattr_uepm_destroy(&ump->um_extattr); } #endif if (mp->mnt_flag & MNT_SOFTDEP) { if ((error = softdep_flushfiles(mp, flags, td)) != 0) return (error); } else { if ((error = ffs_flushfiles(mp, flags, td)) != 0) return (error); } fs = ump->um_fs; if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("%s: unmount pending error: blocks %jd files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } if (fs->fs_ronly == 0) { fs->fs_clean = fs->fs_flags & (FS_UNCLEAN|FS_NEEDSFSCK) ? 0 : 1; error = ffs_sbupdate(ump, MNT_WAIT); if (error) { fs->fs_clean = 0; return (error); } } ump->um_devvp->v_rdev->si_mountpoint = NULL; vinvalbuf(ump->um_devvp, V_SAVE, NOCRED, td, 0, 0); /* XXX: see comment above VOP_OPEN */ #ifdef notyet error = VOP_CLOSE(ump->um_devvp, fs->fs_ronly ? FREAD : FREAD|FWRITE, NOCRED, td); #else error = VOP_CLOSE(ump->um_devvp, FREAD|FWRITE, NOCRED, td); #endif vrele(ump->um_devvp); free(fs->fs_csp, M_UFSMNT); free(fs, M_UFSMNT); free(ump, M_UFSMNT); mp->mnt_data = (qaddr_t)0; mp->mnt_flag &= ~MNT_LOCAL; return (error); } /* * Flush out all the files in a filesystem. */ int ffs_flushfiles(mp, flags, td) struct mount *mp; int flags; struct thread *td; { struct ufsmount *ump; int error; ump = VFSTOUFS(mp); #ifdef QUOTA if (mp->mnt_flag & MNT_QUOTA) { int i; error = vflush(mp, 0, SKIPSYSTEM|flags); if (error) return (error); for (i = 0; i < MAXQUOTAS; i++) { if (ump->um_quotas[i] == NULLVP) continue; quotaoff(td, mp, i); } /* * Here we fall through to vflush again to ensure * that we have gotten rid of all the system vnodes. */ } #endif ASSERT_VOP_LOCKED(ump->um_devvp, "ffs_flushfiles"); if (ump->um_devvp->v_vflag & VV_COPYONWRITE) { if ((error = vflush(mp, 0, SKIPSYSTEM | flags)) != 0) return (error); ffs_snapshot_unmount(mp); /* * Here we fall through to vflush again to ensure * that we have gotten rid of all the system vnodes. */ } /* * Flush all the files. */ if ((error = vflush(mp, 0, flags)) != 0) return (error); /* * Flush filesystem metadata. */ vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_FSYNC(ump->um_devvp, td->td_ucred, MNT_WAIT, td); VOP_UNLOCK(ump->um_devvp, 0, td); return (error); } /* * Get filesystem statistics. 
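The ffs_statfs() handler that follows fills in the per-mount statistics; consumers then derive byte and percentage figures from the block counts. A hedged sketch of that arithmetic using getmntinfo(3); note that f_bsize is the fragment size here and that f_bavail can go negative once the minfree reserve is being consumed:

#include <sys/param.h>
#include <sys/mount.h>

#include <err.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        struct statfs *mntbuf;
        int64_t used, avail;
        double pct;
        int i, n;

        /* MNT_NOWAIT returns cached statistics without querying each fs. */
        if ((n = getmntinfo(&mntbuf, MNT_NOWAIT)) == 0)
                err(1, "getmntinfo");

        for (i = 0; i < n; i++) {
                used = mntbuf[i].f_blocks - mntbuf[i].f_bfree;
                avail = mntbuf[i].f_bavail;     /* may be negative */
                pct = (used + avail) > 0 ?
                    100.0 * used / (used + avail) : 100.0;
                printf("%-20s %ju fragments, %.0f%% used\n",
                    mntbuf[i].f_mntonname, (uintmax_t)mntbuf[i].f_blocks,
                    pct);
        }
        return (0);
}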
*/ int ffs_statfs(mp, sbp, td) struct mount *mp; struct statfs *sbp; struct thread *td; { struct ufsmount *ump; struct fs *fs; ump = VFSTOUFS(mp); fs = ump->um_fs; if (fs->fs_magic != FS_UFS1_MAGIC && fs->fs_magic != FS_UFS2_MAGIC) panic("ffs_statfs"); + sbp->f_version = STATFS_VERSION; sbp->f_bsize = fs->fs_fsize; sbp->f_iosize = fs->fs_bsize; sbp->f_blocks = fs->fs_dsize; sbp->f_bfree = fs->fs_cstotal.cs_nbfree * fs->fs_frag + fs->fs_cstotal.cs_nffree + dbtofsb(fs, fs->fs_pendingblocks); sbp->f_bavail = freespace(fs, fs->fs_minfree) + dbtofsb(fs, fs->fs_pendingblocks); sbp->f_files = fs->fs_ncg * fs->fs_ipg - ROOTINO; sbp->f_ffree = fs->fs_cstotal.cs_nifree + fs->fs_pendinginodes; + sbp->f_namemax = NAME_MAX; if (sbp != &mp->mnt_stat) { + sbp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; sbp->f_type = mp->mnt_vfc->vfc_typenum; + sbp->f_syncwrites = mp->mnt_stat.f_syncwrites; + sbp->f_asyncwrites = mp->mnt_stat.f_asyncwrites; + sbp->f_syncreads = mp->mnt_stat.f_syncreads; + sbp->f_asyncreads = mp->mnt_stat.f_asyncreads; + sbp->f_owner = mp->mnt_stat.f_owner; + sbp->f_fsid = mp->mnt_stat.f_fsid; + bcopy((caddr_t)mp->mnt_stat.f_fstypename, + (caddr_t)&sbp->f_fstypename[0], MFSNAMELEN); bcopy((caddr_t)mp->mnt_stat.f_mntonname, (caddr_t)&sbp->f_mntonname[0], MNAMELEN); bcopy((caddr_t)mp->mnt_stat.f_mntfromname, (caddr_t)&sbp->f_mntfromname[0], MNAMELEN); } return (0); } /* * Go through the disk queues to initiate sandbagged IO; * go through the inodes to write those that have been modified; * initiate the writing of the super block if it has been modified. * * Note: we are always called with the filesystem marked `MPBUSY'. */ int ffs_sync(mp, waitfor, cred, td) struct mount *mp; int waitfor; struct ucred *cred; struct thread *td; { struct vnode *nvp, *vp, *devvp; struct inode *ip; struct ufsmount *ump = VFSTOUFS(mp); struct fs *fs; int error, count, wait, lockreq, allerror = 0; fs = ump->um_fs; if (fs->fs_fmod != 0 && fs->fs_ronly != 0) { /* XXX */ printf("fs = %s\n", fs->fs_fsmnt); panic("ffs_sync: rofs mod"); } /* * Write back each (modified) inode. */ wait = 0; lockreq = LK_EXCLUSIVE | LK_NOWAIT; if (waitfor == MNT_WAIT) { wait = 1; lockreq = LK_EXCLUSIVE; } lockreq |= LK_INTERLOCK; MNT_ILOCK(mp); loop: for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp != NULL; vp = nvp) { /* * If the vnode that we are about to sync is no longer * associated with this mount point, start over. */ if (vp->v_mount != mp) goto loop; /* * Depend on the mntvnode_slock to keep things stable enough * for a quick test. Since there might be hundreds of * thousands of vnodes, we cannot afford even a subroutine * call unless there's a good chance that we have work to do. */ nvp = TAILQ_NEXT(vp, v_nmntvnodes); VI_LOCK(vp); if (vp->v_iflag & VI_XLOCK) { VI_UNLOCK(vp); continue; } ip = VTOI(vp); if (vp->v_type == VNON || ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 && TAILQ_EMPTY(&vp->v_dirtyblkhd))) { VI_UNLOCK(vp); continue; } MNT_IUNLOCK(mp); if ((error = vget(vp, lockreq, td)) != 0) { MNT_ILOCK(mp); if (error == ENOENT) goto loop; continue; } if ((error = VOP_FSYNC(vp, cred, waitfor, td)) != 0) allerror = error; VOP_UNLOCK(vp, 0, td); vrele(vp); MNT_ILOCK(mp); if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp) goto loop; } MNT_IUNLOCK(mp); /* * Force stale filesystem control information to be flushed. 
	/*
	 * Force stale filesystem control information to be flushed.
	 */
	if (waitfor == MNT_WAIT) {
		if ((error = softdep_flushworklist(ump->um_mountp, &count, td)))
			allerror = error;
		/* Flushed work items may create new vnodes to clean */
		if (allerror == 0 && count) {
			MNT_ILOCK(mp);
			goto loop;
		}
	}
#ifdef QUOTA
	qsync(mp);
#endif
	devvp = ump->um_devvp;
	VI_LOCK(devvp);
	if (waitfor != MNT_LAZY &&
	    (devvp->v_numoutput > 0 || TAILQ_FIRST(&devvp->v_dirtyblkhd))) {
		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, td);
		if ((error = VOP_FSYNC(devvp, cred, waitfor, td)) != 0)
			allerror = error;
		VOP_UNLOCK(devvp, 0, td);
		if (allerror == 0 && waitfor == MNT_WAIT) {
			MNT_ILOCK(mp);
			goto loop;
		}
	} else
		VI_UNLOCK(devvp);
	/*
	 * Write back modified superblock.
	 */
	if (fs->fs_fmod != 0 && (error = ffs_sbupdate(ump, waitfor)) != 0)
		allerror = error;
	return (allerror);
}

int
ffs_vget(mp, ino, flags, vpp)
	struct mount *mp;
	ino_t ino;
	int flags;
	struct vnode **vpp;
{
	struct thread *td = curthread;		/* XXX */
	struct fs *fs;
	struct inode *ip;
	struct ufsmount *ump;
	struct buf *bp;
	struct vnode *vp;
	dev_t dev;
	int error;

	ump = VFSTOUFS(mp);
	dev = ump->um_dev;

	/*
	 * We do not lock vnode creation as it is believed to be too
	 * expensive for such a rare case as simultaneous creation of a
	 * vnode for the same ino by different processes.  We just allow
	 * them to race and check later to decide who wins.  Let the race
	 * begin!
	 */
	if ((error = ufs_ihashget(dev, ino, flags, vpp)) != 0)
		return (error);
	if (*vpp != NULL)
		return (0);

	/*
	 * If this MALLOC() is performed after the getnewvnode()
	 * it might block, leaving a vnode with a NULL v_data to be
	 * found by ffs_sync() if a sync happens to fire right then,
	 * which will cause a panic because ffs_sync() blindly
	 * dereferences vp->v_data (as well it should).
	 */
	ip = uma_zalloc(uma_inode, M_WAITOK);

	/* Allocate a new vnode/inode. */
	error = getnewvnode("ufs", mp, ffs_vnodeop_p, &vp);
	if (error) {
		*vpp = NULL;
		uma_zfree(uma_inode, ip);
		return (error);
	}
	bzero((caddr_t)ip, sizeof(struct inode));
	/*
	 * FFS supports recursive locking.
	 */
	vp->v_vnlock->lk_flags |= LK_CANRECURSE;
	vp->v_data = ip;
	ip->i_vnode = vp;
	ip->i_ump = ump;
	ip->i_fs = fs = ump->um_fs;
	ip->i_dev = dev;
	ip->i_number = ino;
#ifdef QUOTA
	{
		int i;
		for (i = 0; i < MAXQUOTAS; i++)
			ip->i_dquot[i] = NODQUOT;
	}
#endif
	/*
	 * Exclusively lock the vnode before adding it to the hash.  Note
	 * that we must not release nor downgrade the lock (despite what
	 * the flags argument says) until it is fully initialized.
	 */
	lockmgr(vp->v_vnlock, LK_EXCLUSIVE, (struct mtx *)0, td);

	/*
	 * Atomically (in terms of ufs_hash operations) check the hash for
	 * a duplicate of the vnode being created and add it to the hash.
	 * If a duplicate vnode was found, it will be vget()ed from the
	 * hash for us.
	 */
	if ((error = ufs_ihashins(ip, flags, vpp)) != 0) {
		vput(vp);
		*vpp = NULL;
		return (error);
	}

	/* We lost the race, then throw away our vnode and return existing */
	if (*vpp != NULL) {
		vput(vp);
		return (0);
	}

	/* Read in the disk contents for the inode, copy into the inode. */
	error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)),
	    (int)fs->fs_bsize, NOCRED, &bp);
	if (error) {
		/*
		 * The inode does not contain anything useful, so it would
		 * be misleading to leave it on its hash chain. With mode
		 * still zero, it will be unlinked and returned to the free
		 * list by vput().
		 */
		brelse(bp);
		vput(vp);
		*vpp = NULL;
		return (error);
	}
	if (ip->i_ump->um_fstype == UFS1)
		ip->i_din1 = uma_zalloc(uma_ufs1, M_WAITOK);
	else
		ip->i_din2 = uma_zalloc(uma_ufs2, M_WAITOK);
	ffs_load_inode(bp, ip, fs, ino);
	if (DOINGSOFTDEP(vp))
		softdep_load_inodeblock(ip);
	else
		ip->i_effnlink = ip->i_nlink;
	bqrelse(bp);

	/*
	 * Initialize the vnode from the inode, check for aliases.
	 * Note that the underlying vnode may have changed.
	 */
	error = ufs_vinit(mp, ffs_specop_p, ffs_fifoop_p, &vp);
	if (error) {
		vput(vp);
		*vpp = NULL;
		return (error);
	}
	/*
	 * Finish inode initialization.
	 */
	VREF(ip->i_devvp);
	/*
	 * Set up a generation number for this inode if it does not
	 * already have one. This should only happen on old filesystems.
	 */
	if (ip->i_gen == 0) {
		ip->i_gen = arc4random() / 2 + 1;
		if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
			ip->i_flag |= IN_MODIFIED;
			DIP(ip, i_gen) = ip->i_gen;
		}
	}
	/*
	 * Ensure that uid and gid are correct. This is a temporary
	 * fix until fsck has been changed to do the update.
	 */
	if (fs->fs_magic == FS_UFS1_MAGIC &&		/* XXX */
	    fs->fs_old_inodefmt < FS_44INODEFMT) {	/* XXX */
		ip->i_uid = ip->i_din1->di_ouid;	/* XXX */
		ip->i_gid = ip->i_din1->di_ogid;	/* XXX */
	}						/* XXX */

#ifdef MAC
	if ((mp->mnt_flag & MNT_MULTILABEL) && ip->i_mode) {
		/*
		 * If this vnode is already allocated, and we're running
		 * multi-label, attempt to perform a label association
		 * from the extended attributes on the inode.
		 */
		error = mac_associate_vnode_extattr(mp, vp);
		if (error) {
			/* ufs_inactive will release ip->i_devvp ref. */
			vput(vp);
			*vpp = NULL;
			return (error);
		}
	}
#endif

	*vpp = vp;
	return (0);
}

/*
 * File handle to vnode
 *
 * Have to be really careful about stale file handles:
 * - check that the inode number is valid
 * - call ffs_vget() to get the locked inode
 * - check for an unallocated inode (i_mode == 0)
 * - check that the given client host has export rights and return
 *   those rights via exflagsp and credanonp
 */
int
ffs_fhtovp(mp, fhp, vpp)
	struct mount *mp;
	struct fid *fhp;
	struct vnode **vpp;
{
	struct ufid *ufhp;
	struct fs *fs;

	ufhp = (struct ufid *)fhp;
	fs = VFSTOUFS(mp)->um_fs;
	if (ufhp->ufid_ino < ROOTINO ||
	    ufhp->ufid_ino >= fs->fs_ncg * fs->fs_ipg)
		return (ESTALE);
	return (ufs_fhtovp(mp, ufhp, vpp));
}

/*
 * Vnode pointer to File handle
 */
/* ARGSUSED */
int
ffs_vptofh(vp, fhp)
	struct vnode *vp;
	struct fid *fhp;
{
	struct inode *ip;
	struct ufid *ufhp;

	ip = VTOI(vp);
	ufhp = (struct ufid *)fhp;
	ufhp->ufid_len = sizeof(struct ufid);
	ufhp->ufid_ino = ip->i_number;
	ufhp->ufid_gen = ip->i_gen;
	return (0);
}

/*
 * Initialize the filesystem.
 */
static int
ffs_init(vfsp)
	struct vfsconf *vfsp;
{

	softdep_initialize();
	return (ufs_init(vfsp));
}

/*
 * Undo the work of ffs_init().
 */
static int
ffs_uninit(vfsp)
	struct vfsconf *vfsp;
{
	int ret;

	ret = ufs_uninit(vfsp);
	softdep_uninitialize();
	return (ret);
}

/*
 * Write a superblock and associated information back to disk.
 */
static int
ffs_sbupdate(mp, waitfor)
	struct ufsmount *mp;
	int waitfor;
{
	struct fs *fs = mp->um_fs;
	struct buf *bp;
	int blks;
	void *space;
	int i, size, error, allerror = 0;

	if (fs->fs_ronly == 1 &&
	    (mp->um_mountp->mnt_flag & (MNT_RDONLY | MNT_UPDATE)) !=
	    (MNT_RDONLY | MNT_UPDATE))
		panic("ffs_sbupdate: write read-only filesystem");
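	/*
	 * Added note: the cylinder group summary area (fs_csp) is written
	 * back in full blocks of fs_frag fragments starting at fs_csaddr,
	 * with the final chunk trimmed to the remaining fragments.  If any
	 * synchronous summary write fails, ffs_sbupdate() returns before
	 * the superblock itself is written.
	 */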
	/*
	 * First write back the summary information.
	 */
	blks = howmany(fs->fs_cssize, fs->fs_fsize);
	space = fs->fs_csp;
	for (i = 0; i < blks; i += fs->fs_frag) {
		size = fs->fs_bsize;
		if (i + fs->fs_frag > blks)
			size = (blks - i) * fs->fs_fsize;
		bp = getblk(mp->um_devvp, fsbtodb(fs, fs->fs_csaddr + i),
		    size, 0, 0, 0);
		bcopy(space, bp->b_data, (u_int)size);
		space = (char *)space + size;
		if (waitfor != MNT_WAIT)
			bawrite(bp);
		else if ((error = bwrite(bp)) != 0)
			allerror = error;
	}
	/*
	 * Now write back the superblock itself. If any errors occurred
	 * up to this point, then fail so that the superblock avoids
	 * being written out as clean.
	 */
	if (allerror)
		return (allerror);
	if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_sblockloc != SBLOCK_UFS1 &&
	    (fs->fs_flags & FS_FLAGS_UPDATED) == 0) {
		printf("%s: correcting fs_sblockloc from %jd to %d\n",
		    fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS1);
		fs->fs_sblockloc = SBLOCK_UFS1;
	}
	if (fs->fs_magic == FS_UFS2_MAGIC && fs->fs_sblockloc != SBLOCK_UFS2 &&
	    (fs->fs_flags & FS_FLAGS_UPDATED) == 0) {
		printf("%s: correcting fs_sblockloc from %jd to %d\n",
		    fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS2);
		fs->fs_sblockloc = SBLOCK_UFS2;
	}
	bp = getblk(mp->um_devvp, btodb(fs->fs_sblockloc),
	    (int)fs->fs_sbsize, 0, 0, 0);
	fs->fs_fmod = 0;
	fs->fs_time = time_second;
	bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
	ffs_oldfscompat_write((struct fs *)bp->b_data, mp);
	if (waitfor != MNT_WAIT)
		bawrite(bp);
	else if ((error = bwrite(bp)) != 0)
		allerror = error;
	return (allerror);
}

static int
ffs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp,
	int attrnamespace, const char *attrname, struct thread *td)
{

#ifdef UFS_EXTATTR
	return (ufs_extattrctl(mp, cmd, filename_vp, attrnamespace,
	    attrname, td));
#else
	return (vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace,
	    attrname, td));
#endif
}

static void
ffs_ifree(struct ufsmount *ump, struct inode *ip)
{

	if (ump->um_fstype == UFS1 && ip->i_din1 != NULL)
		uma_zfree(uma_ufs1, ip->i_din1);
	else if (ip->i_din2 != NULL)
		uma_zfree(uma_ufs2, ip->i_din2);
	uma_zfree(uma_inode, ip);
}