Index: head/lib/libufs/Makefile =================================================================== --- head/lib/libufs/Makefile (revision 323922) +++ head/lib/libufs/Makefile (revision 323923) @@ -1,31 +1,31 @@ # $FreeBSD$ PACKAGE=lib${LIB} LIB= ufs SHLIBDIR?= /lib SHLIB_MAJOR= 6 -SRCS= block.c cgroup.c inode.c sblock.c type.c ffs_subr.c ffs_tables.c +SRCS= block.c cgroup.c crc32.c inode.c sblock.c type.c ffs_subr.c ffs_tables.c INCS= libufs.h MAN= bread.3 cgread.3 libufs.3 sbread.3 ufs_disk_close.3 MLINKS+= bread.3 bwrite.3 MLINKS+= bread.3 berase.3 MLINKS+= cgread.3 cgread1.3 MLINKS+= cgread.3 cgwrite1.3 MLINKS+= sbread.3 sbwrite.3 MLINKS+= ufs_disk_close.3 ufs_disk_fillout.3 MLINKS+= ufs_disk_close.3 ufs_disk_fillout_blank.3 MLINKS+= ufs_disk_close.3 ufs_disk_write.3 -.PATH: ${SRCTOP}/sys/ufs/ffs +.PATH: ${SRCTOP}/sys/libkern ${SRCTOP}/sys/ufs/ffs WARNS?= 2 CFLAGS+= -D_LIBUFS .if defined(LIBUFS_DEBUG) CFLAGS+= -D_LIBUFS_DEBUGGING .endif CFLAGS+= -I${.CURDIR} .include Index: head/lib/libufs/cgroup.c =================================================================== --- head/lib/libufs/cgroup.c (revision 323922) +++ head/lib/libufs/cgroup.c (revision 323923) @@ -1,223 +1,228 @@ /* * Copyright (c) 2003 Juli Mallett. All rights reserved. * * This software was written by Juli Mallett for the * FreeBSD project. Redistribution and use in source and binary forms, with * or without modification, are permitted provided that the following * conditions are met: * * 1. Redistribution of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * 2. Redistribution in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include ufs2_daddr_t cgballoc(struct uufsd *disk) { u_int8_t *blksfree; struct cg *cgp; struct fs *fs; long bno; fs = &disk->d_fs; cgp = &disk->d_cg; blksfree = cg_blksfree(cgp); for (bno = 0; bno < fs->fs_fpg / fs->fs_frag; bno++) if (ffs_isblock(fs, blksfree, bno)) goto gotit; return (0); gotit: fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--; ffs_clrblock(fs, blksfree, (long)bno); ffs_clusteracct(fs, cgp, bno, -1); cgp->cg_cs.cs_nbfree--; fs->fs_cstotal.cs_nbfree--; fs->fs_fmod = 1; return (cgbase(fs, cgp->cg_cgx) + blkstofrags(fs, bno)); } int cgbfree(struct uufsd *disk, ufs2_daddr_t bno, long size) { u_int8_t *blksfree; struct fs *fs; struct cg *cgp; ufs1_daddr_t fragno, cgbno; int i, cg, blk, frags, bbase; fs = &disk->d_fs; cg = dtog(fs, bno); if (cgread1(disk, cg) != 1) return (-1); cgp = &disk->d_cg; cgbno = dtogd(fs, bno); blksfree = cg_blksfree(cgp); if (size == fs->fs_bsize) { fragno = fragstoblks(fs, cgbno); ffs_setblock(fs, blksfree, fragno); ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; fs->fs_cstotal.cs_nbfree++; fs->fs_cs(fs, cg).cs_nbfree++; } else { bbase = cgbno - fragnum(fs, cgbno); /* * decrement the counts associated with the old frags */ blk = blkmap(fs, blksfree, bbase); ffs_fragacct(fs, blk, cgp->cg_frsum, -1); /* * deallocate the fragment */ frags = numfrags(fs, size); for (i = 0; i < frags; i++) setbit(blksfree, cgbno + i); cgp->cg_cs.cs_nffree += i; fs->fs_cstotal.cs_nffree += i; fs->fs_cs(fs, cg).cs_nffree += i; /* * add back in counts associated with the new frags */ blk = blkmap(fs, blksfree, bbase); ffs_fragacct(fs, blk, cgp->cg_frsum, 1); /* * if a complete block has been reassembled, account for it */ fragno = fragstoblks(fs, bbase); if (ffs_isblock(fs, blksfree, fragno)) { cgp->cg_cs.cs_nffree -= fs->fs_frag; fs->fs_cstotal.cs_nffree -= fs->fs_frag; fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag; ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; fs->fs_cstotal.cs_nbfree++; fs->fs_cs(fs, cg).cs_nbfree++; } } return cgwrite(disk); } ino_t cgialloc(struct uufsd *disk) { struct ufs2_dinode *dp2; u_int8_t *inosused; struct cg *cgp; struct fs *fs; ino_t ino; int i; fs = &disk->d_fs; cgp = &disk->d_cg; inosused = cg_inosused(cgp); for (ino = 0; ino < fs->fs_ipg; ino++) if (isclr(inosused, ino)) goto gotit; return (0); gotit: if (fs->fs_magic == FS_UFS2_MAGIC && ino + INOPB(fs) > cgp->cg_initediblk && cgp->cg_initediblk < cgp->cg_niblk) { char block[MAXBSIZE]; bzero(block, (int)fs->fs_bsize); dp2 = (struct ufs2_dinode *)█ for (i = 0; i < INOPB(fs); i++) { dp2->di_gen = arc4random(); dp2++; } if (bwrite(disk, ino_to_fsba(fs, cgp->cg_cgx * fs->fs_ipg + cgp->cg_initediblk), block, fs->fs_bsize)) return (0); cgp->cg_initediblk += INOPB(fs); } setbit(inosused, ino); cgp->cg_irotor = ino; cgp->cg_cs.cs_nifree--; fs->fs_cstotal.cs_nifree--; fs->fs_cs(fs, cgp->cg_cgx).cs_nifree--; fs->fs_fmod = 1; return (ino + (cgp->cg_cgx * fs->fs_ipg)); } int cgread(struct uufsd *disk) { return (cgread1(disk, disk->d_ccg++)); } int cgread1(struct uufsd *disk, int c) { struct fs *fs; fs = &disk->d_fs; if ((unsigned)c >= fs->fs_ncg) { return (0); } if (bread(disk, fsbtodb(fs, cgtod(fs, c)), disk->d_cgunion.d_buf, fs->fs_bsize) == -1) { ERROR(disk, "unable to read cylinder group"); return (-1); } disk->d_lcg = c; return (1); } int cgwrite(struct uufsd *disk) { return (cgwrite1(disk, disk->d_lcg)); } int cgwrite1(struct uufsd *disk, int c) { struct fs *fs; fs = &disk->d_fs; + if ((fs->fs_metackhash & CK_CYLGRP) != 0) { + disk->d_cg.cg_ckhash = 0; + disk->d_cg.cg_ckhash = + calculate_crc32c(~0L, (void *)&disk->d_cg, fs->fs_cgsize); + } if (bwrite(disk, fsbtodb(fs, cgtod(fs, c)), disk->d_cgunion.d_buf, fs->fs_bsize) == -1) { ERROR(disk, "unable to write cylinder group"); return (-1); } return (0); } Index: head/lib/libufs/libufs.h =================================================================== --- head/lib/libufs/libufs.h (revision 323922) +++ head/lib/libufs/libufs.h (revision 323923) @@ -1,149 +1,154 @@ /* * Copyright (c) 2002 Juli Mallett. All rights reserved. * * This software was written by Juli Mallett for the * FreeBSD project. Redistribution and use in source and binary forms, with * or without modification, are permitted provided that the following * conditions are met: * * 1. Redistribution of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * 2. Redistribution in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __LIBUFS_H__ #define __LIBUFS_H__ /* * libufs structures. */ /* * userland ufs disk. */ struct uufsd { const char *d_name; /* disk name */ int d_ufs; /* decimal UFS version */ int d_fd; /* raw device file descriptor */ long d_bsize; /* device bsize */ ufs2_daddr_t d_sblock; /* superblock location */ struct csum *d_sbcsum; /* Superblock summary info */ caddr_t d_inoblock; /* inode block */ uint32_t d_inomin; /* low inode (not ino_t for ABI compat) */ uint32_t d_inomax; /* high inode (not ino_t for ABI compat) */ union { struct fs d_fs; /* filesystem information */ char d_sb[MAXBSIZE]; /* superblock as buffer */ } d_sbunion; union { struct cg d_cg; /* cylinder group */ char d_buf[MAXBSIZE]; /* cylinder group storage */ } d_cgunion; int d_ccg; /* current cylinder group */ int d_lcg; /* last cylinder group (in d_cg) */ const char *d_error; /* human readable disk error */ int d_mine; /* internal flags */ #define d_fs d_sbunion.d_fs #define d_sb d_sbunion.d_sb #define d_cg d_cgunion.d_cg }; /* * libufs macros (internal, non-exported). */ #ifdef _LIBUFS /* * Trace steps through libufs, to be used at entry and erroneous return. */ static inline void ERROR(struct uufsd *u, const char *str) { #ifdef _LIBUFS_DEBUGGING if (str != NULL) { fprintf(stderr, "libufs: %s", str); if (errno != 0) fprintf(stderr, ": %s", strerror(errno)); fprintf(stderr, "\n"); } #endif if (u != NULL) u->d_error = str; } #endif /* _LIBUFS */ __BEGIN_DECLS /* * libufs prototypes. */ /* * block.c */ ssize_t bread(struct uufsd *, ufs2_daddr_t, void *, size_t); ssize_t bwrite(struct uufsd *, ufs2_daddr_t, const void *, size_t); int berase(struct uufsd *, ufs2_daddr_t, ufs2_daddr_t); /* * cgroup.c */ ufs2_daddr_t cgballoc(struct uufsd *); int cgbfree(struct uufsd *, ufs2_daddr_t, long); ino_t cgialloc(struct uufsd *); int cgread(struct uufsd *); int cgread1(struct uufsd *, int); int cgwrite(struct uufsd *); int cgwrite1(struct uufsd *, int); /* * inode.c */ int getino(struct uufsd *, void **, ino_t, int *); int putino(struct uufsd *); /* * sblock.c */ int sbread(struct uufsd *); int sbwrite(struct uufsd *, int); /* * type.c */ int ufs_disk_close(struct uufsd *); int ufs_disk_fillout(struct uufsd *, const char *); int ufs_disk_fillout_blank(struct uufsd *, const char *); int ufs_disk_write(struct uufsd *); /* * ffs_subr.c */ void ffs_clrblock(struct fs *, u_char *, ufs1_daddr_t); void ffs_clusteracct(struct fs *, struct cg *, ufs1_daddr_t, int); void ffs_fragacct(struct fs *, int, int32_t [], int); int ffs_isblock(struct fs *, u_char *, ufs1_daddr_t); int ffs_isfreeblock(struct fs *, u_char *, ufs1_daddr_t); void ffs_setblock(struct fs *, u_char *, ufs1_daddr_t); +/* + * crc32c.c + */ +uint32_t calculate_crc32c(uint32_t, const void *, size_t); + __END_DECLS #endif /* __LIBUFS_H__ */ Index: head/sbin/dumpfs/dumpfs.c =================================================================== --- head/sbin/dumpfs/dumpfs.c (revision 323922) +++ head/sbin/dumpfs/dumpfs.c (revision 323923) @@ -1,509 +1,527 @@ /* * Copyright (c) 2009 Robert N. M. Watson * All rights reserved. * * This software was developed at the University of Cambridge Computer * Laboratory with support from a grant from Google, Inc. * * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Marshall * Kirk McKusick and Network Associates Laboratories, the Security * Research Division of Network Associates, Inc. under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS * research program. * * Copyright (c) 1983, 1992, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef lint static const char copyright[] = "@(#) Copyright (c) 1983, 1992, 1993\n\ The Regents of the University of California. All rights reserved.\n"; #endif /* not lint */ #ifndef lint #if 0 static char sccsid[] = "@(#)dumpfs.c 8.5 (Berkeley) 4/29/95"; #endif static const char rcsid[] = "$FreeBSD$"; #endif /* not lint */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define afs disk.d_fs #define acg disk.d_cg static struct uufsd disk; static int dumpfs(const char *); static int dumpfsid(void); static int dumpcg(void); static int dumpfreespace(const char *, int); static void dumpfreespacecg(int); static int marshal(const char *); static void pbits(void *, int); static void pblklist(void *, int, off_t, int); static void ufserr(const char *); static void usage(void) __dead2; int main(int argc, char *argv[]) { const char *name; int ch, dofreespace, domarshal, dolabel, eval; dofreespace = domarshal = dolabel = eval = 0; while ((ch = getopt(argc, argv, "lfm")) != -1) { switch (ch) { case 'f': dofreespace++; break; case 'm': domarshal = 1; break; case 'l': dolabel = 1; break; case '?': default: usage(); } } argc -= optind; argv += optind; if (argc < 1) usage(); if (dofreespace && domarshal) usage(); if (dofreespace > 2) usage(); while ((name = *argv++) != NULL) { if (ufs_disk_fillout(&disk, name) == -1) { ufserr(name); eval |= 1; continue; } if (dofreespace) eval |= dumpfreespace(name, dofreespace); else if (domarshal) eval |= marshal(name); else if (dolabel) eval |= dumpfsid(); else eval |= dumpfs(name); ufs_disk_close(&disk); } exit(eval); } static int dumpfsid(void) { printf("%sufsid/%08x%08x\n", _PATH_DEV, afs.fs_id[0], afs.fs_id[1]); return 0; } static int dumpfs(const char *name) { time_t fstime; int64_t fssize; int32_t fsflags; int i; switch (disk.d_ufs) { case 2: fssize = afs.fs_size; fstime = afs.fs_time; printf("magic\t%x (UFS2)\ttime\t%s", afs.fs_magic, ctime(&fstime)); printf("superblock location\t%jd\tid\t[ %08x %08x ]\n", (intmax_t)afs.fs_sblockloc, afs.fs_id[0], afs.fs_id[1]); printf("ncg\t%d\tsize\t%jd\tblocks\t%jd\n", afs.fs_ncg, (intmax_t)fssize, (intmax_t)afs.fs_dsize); break; case 1: fssize = afs.fs_old_size; fstime = afs.fs_old_time; printf("magic\t%x (UFS1)\ttime\t%s", afs.fs_magic, ctime(&fstime)); printf("id\t[ %08x %08x ]\n", afs.fs_id[0], afs.fs_id[1]); printf("ncg\t%d\tsize\t%jd\tblocks\t%jd\n", afs.fs_ncg, (intmax_t)fssize, (intmax_t)afs.fs_dsize); break; default: goto err; } printf("bsize\t%d\tshift\t%d\tmask\t0x%08x\n", afs.fs_bsize, afs.fs_bshift, afs.fs_bmask); printf("fsize\t%d\tshift\t%d\tmask\t0x%08x\n", afs.fs_fsize, afs.fs_fshift, afs.fs_fmask); printf("frag\t%d\tshift\t%d\tfsbtodb\t%d\n", afs.fs_frag, afs.fs_fragshift, afs.fs_fsbtodb); printf("minfree\t%d%%\toptim\t%s\tsymlinklen %d\n", afs.fs_minfree, afs.fs_optim == FS_OPTSPACE ? "space" : "time", afs.fs_maxsymlinklen); switch (disk.d_ufs) { case 2: printf("%s %d\tmaxbpg\t%d\tmaxcontig %d\tcontigsumsize %d\n", "maxbsize", afs.fs_maxbsize, afs.fs_maxbpg, afs.fs_maxcontig, afs.fs_contigsumsize); printf("nbfree\t%jd\tndir\t%jd\tnifree\t%jd\tnffree\t%jd\n", (intmax_t)afs.fs_cstotal.cs_nbfree, (intmax_t)afs.fs_cstotal.cs_ndir, (intmax_t)afs.fs_cstotal.cs_nifree, (intmax_t)afs.fs_cstotal.cs_nffree); printf("bpg\t%d\tfpg\t%d\tipg\t%d\tunrefs\t%jd\n", afs.fs_fpg / afs.fs_frag, afs.fs_fpg, afs.fs_ipg, (intmax_t)afs.fs_unrefs); printf("nindir\t%d\tinopb\t%d\tmaxfilesize\t%ju\n", afs.fs_nindir, afs.fs_inopb, (uintmax_t)afs.fs_maxfilesize); printf("sbsize\t%d\tcgsize\t%d\tcsaddr\t%jd\tcssize\t%d\n", afs.fs_sbsize, afs.fs_cgsize, (intmax_t)afs.fs_csaddr, afs.fs_cssize); break; case 1: printf("maxbpg\t%d\tmaxcontig %d\tcontigsumsize %d\n", afs.fs_maxbpg, afs.fs_maxcontig, afs.fs_contigsumsize); printf("nbfree\t%d\tndir\t%d\tnifree\t%d\tnffree\t%d\n", afs.fs_old_cstotal.cs_nbfree, afs.fs_old_cstotal.cs_ndir, afs.fs_old_cstotal.cs_nifree, afs.fs_old_cstotal.cs_nffree); printf("cpg\t%d\tbpg\t%d\tfpg\t%d\tipg\t%d\n", afs.fs_old_cpg, afs.fs_fpg / afs.fs_frag, afs.fs_fpg, afs.fs_ipg); printf("nindir\t%d\tinopb\t%d\tnspf\t%d\tmaxfilesize\t%ju\n", afs.fs_nindir, afs.fs_inopb, afs.fs_old_nspf, (uintmax_t)afs.fs_maxfilesize); printf("sbsize\t%d\tcgsize\t%d\tcgoffset %d\tcgmask\t0x%08x\n", afs.fs_sbsize, afs.fs_cgsize, afs.fs_old_cgoffset, afs.fs_old_cgmask); printf("csaddr\t%d\tcssize\t%d\n", afs.fs_old_csaddr, afs.fs_cssize); printf("rotdelay %dms\trps\t%d\ttrackskew %d\tinterleave %d\n", afs.fs_old_rotdelay, afs.fs_old_rps, afs.fs_old_trackskew, afs.fs_old_interleave); printf("nsect\t%d\tnpsect\t%d\tspc\t%d\n", afs.fs_old_nsect, afs.fs_old_npsect, afs.fs_old_spc); break; default: goto err; } printf("sblkno\t%d\tcblkno\t%d\tiblkno\t%d\tdblkno\t%d\n", afs.fs_sblkno, afs.fs_cblkno, afs.fs_iblkno, afs.fs_dblkno); printf("cgrotor\t%d\tfmod\t%d\tronly\t%d\tclean\t%d\n", afs.fs_cgrotor, afs.fs_fmod, afs.fs_ronly, afs.fs_clean); printf("metaspace %jd\tavgfpdir %d\tavgfilesize %d\n", afs.fs_metaspace, afs.fs_avgfpdir, afs.fs_avgfilesize); printf("flags\t"); if (afs.fs_old_flags & FS_FLAGS_UPDATED) fsflags = afs.fs_flags; else fsflags = afs.fs_old_flags; if (fsflags == 0) printf("none"); if (fsflags & FS_UNCLEAN) printf("unclean "); if (fsflags & FS_DOSOFTDEP) printf("soft-updates%s ", (fsflags & FS_SUJ) ? "+journal" : ""); if (fsflags & FS_NEEDSFSCK) printf("needs fsck run "); if (fsflags & FS_INDEXDIRS) printf("indexed directories "); if (fsflags & FS_ACLS) printf("acls "); if (fsflags & FS_MULTILABEL) printf("multilabel "); if (fsflags & FS_GJOURNAL) printf("gjournal "); if (fsflags & FS_FLAGS_UPDATED) printf("fs_flags expanded "); if (fsflags & FS_NFS4ACLS) printf("nfsv4acls "); if (fsflags & FS_TRIM) printf("trim "); fsflags &= ~(FS_UNCLEAN | FS_DOSOFTDEP | FS_NEEDSFSCK | FS_INDEXDIRS | FS_ACLS | FS_MULTILABEL | FS_GJOURNAL | FS_FLAGS_UPDATED | FS_NFS4ACLS | FS_SUJ | FS_TRIM); if (fsflags != 0) printf("unknown flags (%#x)", fsflags); putchar('\n'); + printf("check hashes\t"); + fsflags = afs.fs_metackhash; + if (fsflags == 0) + printf("none"); + if (fsflags & CK_SUPERBLOCK) + printf("superblock "); + if (fsflags & CK_CYLGRP) + printf("cylinder-groups "); + if (fsflags & CK_INODE) + printf("inodes "); + if (fsflags & CK_INDIR) + printf("indirect-blocks "); + if (fsflags & CK_DIR) + printf("directories "); + fsflags &= ~(CK_SUPERBLOCK | CK_CYLGRP | CK_INODE | CK_INDIR | CK_DIR); + if (fsflags != 0) + printf("unknown flags (%#x)", fsflags); + putchar('\n'); printf("fsmnt\t%s\n", afs.fs_fsmnt); printf("volname\t%s\tswuid\t%ju\tprovidersize\t%ju\n", afs.fs_volname, (uintmax_t)afs.fs_swuid, (uintmax_t)afs.fs_providersize); printf("\ncs[].cs_(nbfree,ndir,nifree,nffree):\n\t"); afs.fs_csp = calloc(1, afs.fs_cssize); if (bread(&disk, fsbtodb(&afs, afs.fs_csaddr), afs.fs_csp, afs.fs_cssize) == -1) goto err; for (i = 0; i < afs.fs_ncg; i++) { struct csum *cs = &afs.fs_cs(&afs, i); if (i && i % 4 == 0) printf("\n\t"); printf("(%d,%d,%d,%d) ", cs->cs_nbfree, cs->cs_ndir, cs->cs_nifree, cs->cs_nffree); } printf("\n"); if (fssize % afs.fs_fpg) { if (disk.d_ufs == 1) printf("cylinders in last group %d\n", howmany(afs.fs_old_size % afs.fs_fpg, afs.fs_old_spc / afs.fs_old_nspf)); printf("blocks in last group %ld\n\n", (long)((fssize % afs.fs_fpg) / afs.fs_frag)); } while ((i = cgread(&disk)) != 0) { if (i == -1 || dumpcg()) goto err; } return (0); err: ufserr(name); return (1); } static int dumpcg(void) { time_t cgtime; off_t cur; int i, j; printf("\ncg %d:\n", disk.d_lcg); cur = fsbtodb(&afs, cgtod(&afs, disk.d_lcg)) * disk.d_bsize; switch (disk.d_ufs) { case 2: cgtime = acg.cg_time; printf("magic\t%x\ttell\t%jx\ttime\t%s", acg.cg_magic, (intmax_t)cur, ctime(&cgtime)); printf("cgx\t%d\tndblk\t%d\tniblk\t%d\tinitiblk %d\tunrefs %d\n", acg.cg_cgx, acg.cg_ndblk, acg.cg_niblk, acg.cg_initediblk, acg.cg_unrefs); break; case 1: cgtime = acg.cg_old_time; printf("magic\t%x\ttell\t%jx\ttime\t%s", acg.cg_magic, (intmax_t)cur, ctime(&cgtime)); printf("cgx\t%d\tncyl\t%d\tniblk\t%d\tndblk\t%d\n", acg.cg_cgx, acg.cg_old_ncyl, acg.cg_old_niblk, acg.cg_ndblk); break; default: break; } printf("nbfree\t%d\tndir\t%d\tnifree\t%d\tnffree\t%d\n", acg.cg_cs.cs_nbfree, acg.cg_cs.cs_ndir, acg.cg_cs.cs_nifree, acg.cg_cs.cs_nffree); printf("rotor\t%d\tirotor\t%d\tfrotor\t%d\nfrsum", acg.cg_rotor, acg.cg_irotor, acg.cg_frotor); for (i = 1, j = 0; i < afs.fs_frag; i++) { printf("\t%d", acg.cg_frsum[i]); j += i * acg.cg_frsum[i]; } printf("\nsum of frsum: %d", j); if (afs.fs_contigsumsize > 0) { for (i = 1; i < afs.fs_contigsumsize; i++) { if ((i - 1) % 8 == 0) printf("\nclusters %d-%d:", i, MIN(afs.fs_contigsumsize - 1, i + 7)); printf("\t%d", cg_clustersum(&acg)[i]); } printf("\nclusters size %d and over: %d\n", afs.fs_contigsumsize, cg_clustersum(&acg)[afs.fs_contigsumsize]); printf("clusters free:\t"); pbits(cg_clustersfree(&acg), acg.cg_nclusterblks); } else printf("\n"); printf("inodes used:\t"); pbits(cg_inosused(&acg), afs.fs_ipg); printf("blks free:\t"); pbits(cg_blksfree(&acg), afs.fs_fpg); return (0); } static int dumpfreespace(const char *name, int fflag) { int i; while ((i = cgread(&disk)) != 0) { if (i == -1) goto err; dumpfreespacecg(fflag); } return (0); err: ufserr(name); return (1); } static void dumpfreespacecg(int fflag) { pblklist(cg_blksfree(&acg), afs.fs_fpg, disk.d_lcg * afs.fs_fpg, fflag); } static int marshal(const char *name) { struct fs *fs; fs = &disk.d_fs; printf("# newfs command for %s (%s)\n", name, disk.d_name); printf("newfs "); if (fs->fs_volname[0] != '\0') printf("-L %s ", fs->fs_volname); printf("-O %d ", disk.d_ufs); if (fs->fs_flags & FS_DOSOFTDEP) printf("-U "); printf("-a %d ", fs->fs_maxcontig); printf("-b %d ", fs->fs_bsize); /* -c is dumb */ printf("-d %d ", fs->fs_maxbsize); printf("-e %d ", fs->fs_maxbpg); printf("-f %d ", fs->fs_fsize); printf("-g %d ", fs->fs_avgfilesize); printf("-h %d ", fs->fs_avgfpdir); printf("-i %jd ", fragroundup(fs, lblktosize(fs, fragstoblks(fs, fs->fs_fpg)) / fs->fs_ipg)); if (fs->fs_flags & FS_SUJ) printf("-j "); if (fs->fs_flags & FS_GJOURNAL) printf("-J "); printf("-k %jd ", fs->fs_metaspace); if (fs->fs_flags & FS_MULTILABEL) printf("-l "); printf("-m %d ", fs->fs_minfree); /* -n unimplemented */ printf("-o "); switch (fs->fs_optim) { case FS_OPTSPACE: printf("space "); break; case FS_OPTTIME: printf("time "); break; default: printf("unknown "); break; } /* -p..r unimplemented */ printf("-s %jd ", (intmax_t)fsbtodb(fs, fs->fs_size)); if (fs->fs_flags & FS_TRIM) printf("-t "); printf("%s ", disk.d_name); printf("\n"); return 0; } static void pbits(void *vp, int max) { int i; char *p; int count, j; for (count = i = 0, p = vp; i < max; i++) if (isset(p, i)) { if (count) printf(",%s", count % 6 ? " " : "\n\t"); count++; printf("%d", i); j = i; while ((i+1) __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include "fsck.h" static void slowio_start(void); static void slowio_end(void); static void printIOstats(void); static long diskreads, totaldiskreads, totalreads; /* Disk cache statistics */ static struct timespec startpass, finishpass; struct timeval slowio_starttime; int slowio_delay_usec = 10000; /* Initial IO delay for background fsck */ int slowio_pollcnt; static struct bufarea cgblk; /* backup buffer for cylinder group blocks */ static TAILQ_HEAD(buflist, bufarea) bufhead; /* head of buffer cache list */ static int numbufs; /* size of buffer cache */ static char *buftype[BT_NUMBUFTYPES] = BT_NAMES; static struct bufarea *cgbufs; /* header for cylinder group cache */ static int flushtries; /* number of tries to reclaim memory */ void fsutilinit(void) { diskreads = totaldiskreads = totalreads = 0; bzero(&startpass, sizeof(struct timespec)); bzero(&finishpass, sizeof(struct timespec)); bzero(&slowio_starttime, sizeof(struct timeval)); slowio_delay_usec = 10000; slowio_pollcnt = 0; bzero(&cgblk, sizeof(struct bufarea)); TAILQ_INIT(&bufhead); numbufs = 0; /* buftype ? */ cgbufs = NULL; flushtries = 0; } int ftypeok(union dinode *dp) { switch (DIP(dp, di_mode) & IFMT) { case IFDIR: case IFREG: case IFBLK: case IFCHR: case IFLNK: case IFSOCK: case IFIFO: return (1); default: if (debug) printf("bad file type 0%o\n", DIP(dp, di_mode)); return (0); } } int reply(const char *question) { int persevere; char c; if (preen) pfatal("INTERNAL ERROR: GOT TO reply()"); persevere = !strcmp(question, "CONTINUE"); printf("\n"); if (!persevere && (nflag || (fswritefd < 0 && bkgrdflag == 0))) { printf("%s? no\n\n", question); resolved = 0; return (0); } if (yflag || (persevere && nflag)) { printf("%s? yes\n\n", question); return (1); } do { printf("%s? [yn] ", question); (void) fflush(stdout); c = getc(stdin); while (c != '\n' && getc(stdin) != '\n') { if (feof(stdin)) { resolved = 0; return (0); } } } while (c != 'y' && c != 'Y' && c != 'n' && c != 'N'); printf("\n"); if (c == 'y' || c == 'Y') return (1); resolved = 0; return (0); } /* * Look up state information for an inode. */ struct inostat * inoinfo(ino_t inum) { static struct inostat unallocated = { USTATE, 0, 0 }; struct inostatlist *ilp; int iloff; if (inum > maxino) errx(EEXIT, "inoinfo: inumber %ju out of range", (uintmax_t)inum); ilp = &inostathead[inum / sblock.fs_ipg]; iloff = inum % sblock.fs_ipg; if (iloff >= ilp->il_numalloced) return (&unallocated); return (&ilp->il_stat[iloff]); } /* * Malloc buffers and set up cache. */ void bufinit(void) { struct bufarea *bp; long bufcnt, i; char *bufp; pbp = pdirbp = (struct bufarea *)0; bufp = Malloc((unsigned int)sblock.fs_bsize); if (bufp == NULL) errx(EEXIT, "cannot allocate buffer pool"); cgblk.b_un.b_buf = bufp; initbarea(&cgblk, BT_CYLGRP); TAILQ_INIT(&bufhead); bufcnt = MAXBUFS; if (bufcnt < MINBUFS) bufcnt = MINBUFS; for (i = 0; i < bufcnt; i++) { bp = (struct bufarea *)Malloc(sizeof(struct bufarea)); bufp = Malloc((unsigned int)sblock.fs_bsize); if (bp == NULL || bufp == NULL) { if (i >= MINBUFS) break; errx(EEXIT, "cannot allocate buffer pool"); } bp->b_un.b_buf = bufp; TAILQ_INSERT_HEAD(&bufhead, bp, b_list); initbarea(bp, BT_UNKNOWN); } numbufs = i; /* save number of buffers */ for (i = 0; i < BT_NUMBUFTYPES; i++) { readtime[i].tv_sec = totalreadtime[i].tv_sec = 0; readtime[i].tv_nsec = totalreadtime[i].tv_nsec = 0; readcnt[i] = totalreadcnt[i] = 0; } } /* * Manage cylinder group buffers. */ static struct bufarea *cgbufs; /* header for cylinder group cache */ static int flushtries; /* number of tries to reclaim memory */ struct bufarea * cgget(int cg) { struct bufarea *cgbp; struct cg *cgp; if (cgbufs == NULL) { cgbufs = calloc(sblock.fs_ncg, sizeof(struct bufarea)); if (cgbufs == NULL) errx(EEXIT, "cannot allocate cylinder group buffers"); } cgbp = &cgbufs[cg]; if (cgbp->b_un.b_cg != NULL) return (cgbp); cgp = NULL; if (flushtries == 0) cgp = malloc((unsigned int)sblock.fs_cgsize); if (cgp == NULL) { getblk(&cgblk, cgtod(&sblock, cg), sblock.fs_cgsize); return (&cgblk); } cgbp->b_un.b_cg = cgp; initbarea(cgbp, BT_CYLGRP); getblk(cgbp, cgtod(&sblock, cg), sblock.fs_cgsize); return (cgbp); } /* * Attempt to flush a cylinder group cache entry. * Return whether the flush was successful. */ int flushentry(void) { struct bufarea *cgbp; if (flushtries == sblock.fs_ncg || cgbufs == NULL) return (0); cgbp = &cgbufs[flushtries++]; if (cgbp->b_un.b_cg == NULL) return (0); flush(fswritefd, cgbp); free(cgbp->b_un.b_buf); cgbp->b_un.b_buf = NULL; return (1); } /* * Manage a cache of directory blocks. */ struct bufarea * getdatablk(ufs2_daddr_t blkno, long size, int type) { struct bufarea *bp; TAILQ_FOREACH(bp, &bufhead, b_list) if (bp->b_bno == fsbtodb(&sblock, blkno)) goto foundit; TAILQ_FOREACH_REVERSE(bp, &bufhead, buflist, b_list) if ((bp->b_flags & B_INUSE) == 0) break; if (bp == NULL) errx(EEXIT, "deadlocked buffer pool"); bp->b_type = type; getblk(bp, blkno, size); /* fall through */ foundit: if (debug && bp->b_type != type) printf("Buffer type changed from %s to %s\n", buftype[bp->b_type], buftype[type]); TAILQ_REMOVE(&bufhead, bp, b_list); TAILQ_INSERT_HEAD(&bufhead, bp, b_list); bp->b_flags |= B_INUSE; return (bp); } /* * Timespec operations (from ). */ #define timespecsub(vvp, uvp) \ do { \ (vvp)->tv_sec -= (uvp)->tv_sec; \ (vvp)->tv_nsec -= (uvp)->tv_nsec; \ if ((vvp)->tv_nsec < 0) { \ (vvp)->tv_sec--; \ (vvp)->tv_nsec += 1000000000; \ } \ } while (0) #define timespecadd(vvp, uvp) \ do { \ (vvp)->tv_sec += (uvp)->tv_sec; \ (vvp)->tv_nsec += (uvp)->tv_nsec; \ if ((vvp)->tv_nsec >= 1000000000) { \ (vvp)->tv_sec++; \ (vvp)->tv_nsec -= 1000000000; \ } \ } while (0) void getblk(struct bufarea *bp, ufs2_daddr_t blk, long size) { ufs2_daddr_t dblk; struct timespec start, finish; dblk = fsbtodb(&sblock, blk); if (bp->b_bno == dblk) { totalreads++; } else { flush(fswritefd, bp); if (debug) { readcnt[bp->b_type]++; clock_gettime(CLOCK_REALTIME_PRECISE, &start); } bp->b_errs = blread(fsreadfd, bp->b_un.b_buf, dblk, size); if (debug) { clock_gettime(CLOCK_REALTIME_PRECISE, &finish); timespecsub(&finish, &start); timespecadd(&readtime[bp->b_type], &finish); } bp->b_bno = dblk; bp->b_size = size; } } void flush(int fd, struct bufarea *bp) { int i, j; if (!bp->b_dirty) return; + /* + * Calculate any needed check hashes. + */ + switch (bp->b_type) { + case BT_CYLGRP: + if ((sblock.fs_metackhash & CK_CYLGRP) == 0) + break; + bp->b_un.b_cg->cg_ckhash = 0; + bp->b_un.b_cg->cg_ckhash = + calculate_crc32c(~0L, bp->b_un.b_buf, bp->b_size); + break; + default: + break; + } bp->b_dirty = 0; if (fswritefd < 0) { pfatal("WRITING IN READ_ONLY MODE.\n"); return; } if (bp->b_errs != 0) pfatal("WRITING %sZERO'ED BLOCK %lld TO DISK\n", (bp->b_errs == bp->b_size / dev_bsize) ? "" : "PARTIALLY ", (long long)bp->b_bno); bp->b_errs = 0; blwrite(fd, bp->b_un.b_buf, bp->b_bno, bp->b_size); if (bp != &sblk) return; for (i = 0, j = 0; i < sblock.fs_cssize; i += sblock.fs_bsize, j++) { blwrite(fswritefd, (char *)sblock.fs_csp + i, fsbtodb(&sblock, sblock.fs_csaddr + j * sblock.fs_frag), MIN(sblock.fs_cssize - i, sblock.fs_bsize)); } } void rwerror(const char *mesg, ufs2_daddr_t blk) { if (bkgrdcheck) exit(EEXIT); if (preen == 0) printf("\n"); pfatal("CANNOT %s: %ld", mesg, (long)blk); if (reply("CONTINUE") == 0) exit(EEXIT); } void ckfini(int markclean) { struct bufarea *bp, *nbp; int ofsmodified, cnt; if (bkgrdflag) { unlink(snapname); if ((!(sblock.fs_flags & FS_UNCLEAN)) != markclean) { cmd.value = FS_UNCLEAN; cmd.size = markclean ? -1 : 1; if (sysctlbyname("vfs.ffs.setflags", 0, 0, &cmd, sizeof cmd) == -1) rwerror("SET FILE SYSTEM FLAGS", FS_UNCLEAN); if (!preen) { printf("\n***** FILE SYSTEM MARKED %s *****\n", markclean ? "CLEAN" : "DIRTY"); if (!markclean) rerun = 1; } } else if (!preen && !markclean) { printf("\n***** FILE SYSTEM STILL DIRTY *****\n"); rerun = 1; } } if (debug && totalreads > 0) printf("cache with %d buffers missed %ld of %ld (%d%%)\n", numbufs, totaldiskreads, totalreads, (int)(totaldiskreads * 100 / totalreads)); if (fswritefd < 0) { (void)close(fsreadfd); return; } flush(fswritefd, &sblk); if (havesb && cursnapshot == 0 && sblock.fs_magic == FS_UFS2_MAGIC && sblk.b_bno != sblock.fs_sblockloc / dev_bsize && !preen && reply("UPDATE STANDARD SUPERBLOCK")) { sblk.b_bno = sblock.fs_sblockloc / dev_bsize; sbdirty(); flush(fswritefd, &sblk); } flush(fswritefd, &cgblk); free(cgblk.b_un.b_buf); cnt = 0; TAILQ_FOREACH_REVERSE_SAFE(bp, &bufhead, buflist, b_list, nbp) { TAILQ_REMOVE(&bufhead, bp, b_list); cnt++; flush(fswritefd, bp); free(bp->b_un.b_buf); free((char *)bp); } if (numbufs != cnt) errx(EEXIT, "panic: lost %d buffers", numbufs - cnt); if (cgbufs != NULL) { for (cnt = 0; cnt < sblock.fs_ncg; cnt++) { if (cgbufs[cnt].b_un.b_cg == NULL) continue; flush(fswritefd, &cgbufs[cnt]); free(cgbufs[cnt].b_un.b_cg); } free(cgbufs); } pbp = pdirbp = (struct bufarea *)0; if (cursnapshot == 0 && sblock.fs_clean != markclean) { if ((sblock.fs_clean = markclean) != 0) { sblock.fs_flags &= ~(FS_UNCLEAN | FS_NEEDSFSCK); sblock.fs_pendingblocks = 0; sblock.fs_pendinginodes = 0; } sbdirty(); ofsmodified = fsmodified; flush(fswritefd, &sblk); fsmodified = ofsmodified; if (!preen) { printf("\n***** FILE SYSTEM MARKED %s *****\n", markclean ? "CLEAN" : "DIRTY"); if (!markclean) rerun = 1; } } else if (!preen) { if (markclean) { printf("\n***** FILE SYSTEM IS CLEAN *****\n"); } else { printf("\n***** FILE SYSTEM STILL DIRTY *****\n"); rerun = 1; } } (void)close(fsreadfd); (void)close(fswritefd); } /* * Print out I/O statistics. */ void IOstats(char *what) { int i; if (debug == 0) return; if (diskreads == 0) { printf("%s: no I/O\n\n", what); return; } if (startpass.tv_sec == 0) startpass = startprog; printf("%s: I/O statistics\n", what); printIOstats(); totaldiskreads += diskreads; diskreads = 0; for (i = 0; i < BT_NUMBUFTYPES; i++) { timespecadd(&totalreadtime[i], &readtime[i]); totalreadcnt[i] += readcnt[i]; readtime[i].tv_sec = readtime[i].tv_nsec = 0; readcnt[i] = 0; } clock_gettime(CLOCK_REALTIME_PRECISE, &startpass); } void finalIOstats(void) { int i; if (debug == 0) return; printf("Final I/O statistics\n"); totaldiskreads += diskreads; diskreads = totaldiskreads; startpass = startprog; for (i = 0; i < BT_NUMBUFTYPES; i++) { timespecadd(&totalreadtime[i], &readtime[i]); totalreadcnt[i] += readcnt[i]; readtime[i] = totalreadtime[i]; readcnt[i] = totalreadcnt[i]; } printIOstats(); } static void printIOstats(void) { long long msec, totalmsec; int i; clock_gettime(CLOCK_REALTIME_PRECISE, &finishpass); timespecsub(&finishpass, &startpass); printf("Running time: %jd.%03ld sec\n", (intmax_t)finishpass.tv_sec, finishpass.tv_nsec / 1000000); printf("buffer reads by type:\n"); for (totalmsec = 0, i = 0; i < BT_NUMBUFTYPES; i++) totalmsec += readtime[i].tv_sec * 1000 + readtime[i].tv_nsec / 1000000; if (totalmsec == 0) totalmsec = 1; for (i = 0; i < BT_NUMBUFTYPES; i++) { if (readcnt[i] == 0) continue; msec = readtime[i].tv_sec * 1000 + readtime[i].tv_nsec / 1000000; printf("%21s:%8ld %2ld.%ld%% %4jd.%03ld sec %2lld.%lld%%\n", buftype[i], readcnt[i], readcnt[i] * 100 / diskreads, (readcnt[i] * 1000 / diskreads) % 10, (intmax_t)readtime[i].tv_sec, readtime[i].tv_nsec / 1000000, msec * 100 / totalmsec, (msec * 1000 / totalmsec) % 10); } printf("\n"); } int blread(int fd, char *buf, ufs2_daddr_t blk, long size) { char *cp; int i, errs; off_t offset; offset = blk; offset *= dev_bsize; if (bkgrdflag) slowio_start(); totalreads++; diskreads++; if (lseek(fd, offset, 0) < 0) rwerror("SEEK BLK", blk); else if (read(fd, buf, (int)size) == size) { if (bkgrdflag) slowio_end(); return (0); } /* * This is handled specially here instead of in rwerror because * rwerror is used for all sorts of errors, not just true read/write * errors. It should be refactored and fixed. */ if (surrender) { pfatal("CANNOT READ_BLK: %ld", (long)blk); errx(EEXIT, "ABORTING DUE TO READ ERRORS"); } else rwerror("READ BLK", blk); if (lseek(fd, offset, 0) < 0) rwerror("SEEK BLK", blk); errs = 0; memset(buf, 0, (size_t)size); printf("THE FOLLOWING DISK SECTORS COULD NOT BE READ:"); for (cp = buf, i = 0; i < size; i += secsize, cp += secsize) { if (read(fd, cp, (int)secsize) != secsize) { (void)lseek(fd, offset + i + secsize, 0); if (secsize != dev_bsize && dev_bsize != 1) printf(" %jd (%jd),", (intmax_t)(blk * dev_bsize + i) / secsize, (intmax_t)blk + i / dev_bsize); else printf(" %jd,", (intmax_t)blk + i / dev_bsize); errs++; } } printf("\n"); if (errs) resolved = 0; return (errs); } void blwrite(int fd, char *buf, ufs2_daddr_t blk, ssize_t size) { int i; char *cp; off_t offset; if (fd < 0) return; offset = blk; offset *= dev_bsize; if (lseek(fd, offset, 0) < 0) rwerror("SEEK BLK", blk); else if (write(fd, buf, size) == size) { fsmodified = 1; return; } resolved = 0; rwerror("WRITE BLK", blk); if (lseek(fd, offset, 0) < 0) rwerror("SEEK BLK", blk); printf("THE FOLLOWING SECTORS COULD NOT BE WRITTEN:"); for (cp = buf, i = 0; i < size; i += dev_bsize, cp += dev_bsize) if (write(fd, cp, dev_bsize) != dev_bsize) { (void)lseek(fd, offset + i + dev_bsize, 0); printf(" %jd,", (intmax_t)blk + i / dev_bsize); } printf("\n"); return; } void blerase(int fd, ufs2_daddr_t blk, long size) { off_t ioarg[2]; if (fd < 0) return; ioarg[0] = blk * dev_bsize; ioarg[1] = size; ioctl(fd, DIOCGDELETE, ioarg); /* we don't really care if we succeed or not */ return; } /* * Fill a contiguous region with all-zeroes. Note ZEROBUFSIZE is by * definition a multiple of dev_bsize. */ void blzero(int fd, ufs2_daddr_t blk, long size) { static char *zero; off_t offset, len; if (fd < 0) return; if (zero == NULL) { zero = calloc(ZEROBUFSIZE, 1); if (zero == NULL) errx(EEXIT, "cannot allocate buffer pool"); } offset = blk * dev_bsize; if (lseek(fd, offset, 0) < 0) rwerror("SEEK BLK", blk); while (size > 0) { len = MIN(ZEROBUFSIZE, size); if (write(fd, zero, len) != len) rwerror("WRITE BLK", blk); blk += len / dev_bsize; size -= len; } } /* * Verify cylinder group's magic number and other parameters. If the * test fails, offer an option to rebuild the whole cylinder group. */ int check_cgmagic(int cg, struct bufarea *cgbp) { struct cg *cgp = cgbp->b_un.b_cg; /* * Extended cylinder group checks. */ if (cg_chkmagic(cgp) && ((sblock.fs_magic == FS_UFS1_MAGIC && cgp->cg_old_niblk == sblock.fs_ipg && cgp->cg_ndblk <= sblock.fs_fpg && cgp->cg_old_ncyl <= sblock.fs_old_cpg) || (sblock.fs_magic == FS_UFS2_MAGIC && cgp->cg_niblk == sblock.fs_ipg && cgp->cg_ndblk <= sblock.fs_fpg && cgp->cg_initediblk <= sblock.fs_ipg))) { return (1); } pfatal("CYLINDER GROUP %d: BAD MAGIC NUMBER", cg); if (!reply("REBUILD CYLINDER GROUP")) { printf("YOU WILL NEED TO RERUN FSCK.\n"); rerun = 1; return (1); } /* * Zero out the cylinder group and then initialize critical fields. * Bit maps and summaries will be recalculated by later passes. */ memset(cgp, 0, (size_t)sblock.fs_cgsize); cgp->cg_magic = CG_MAGIC; cgp->cg_cgx = cg; cgp->cg_niblk = sblock.fs_ipg; cgp->cg_initediblk = MIN(sblock.fs_ipg, 2 * INOPB(&sblock)); if (cgbase(&sblock, cg) + sblock.fs_fpg < sblock.fs_size) cgp->cg_ndblk = sblock.fs_fpg; else cgp->cg_ndblk = sblock.fs_size - cgbase(&sblock, cg); cgp->cg_iusedoff = &cgp->cg_space[0] - (u_char *)(&cgp->cg_firstfield); if (sblock.fs_magic == FS_UFS1_MAGIC) { cgp->cg_niblk = 0; cgp->cg_initediblk = 0; cgp->cg_old_ncyl = sblock.fs_old_cpg; cgp->cg_old_niblk = sblock.fs_ipg; cgp->cg_old_btotoff = cgp->cg_iusedoff; cgp->cg_old_boff = cgp->cg_old_btotoff + sblock.fs_old_cpg * sizeof(int32_t); cgp->cg_iusedoff = cgp->cg_old_boff + sblock.fs_old_cpg * sizeof(u_int16_t); } cgp->cg_freeoff = cgp->cg_iusedoff + howmany(sblock.fs_ipg, CHAR_BIT); cgp->cg_nextfreeoff = cgp->cg_freeoff + howmany(sblock.fs_fpg,CHAR_BIT); if (sblock.fs_contigsumsize > 0) { cgp->cg_nclusterblks = cgp->cg_ndblk / sblock.fs_frag; cgp->cg_clustersumoff = roundup(cgp->cg_nextfreeoff, sizeof(u_int32_t)); cgp->cg_clustersumoff -= sizeof(u_int32_t); cgp->cg_clusteroff = cgp->cg_clustersumoff + (sblock.fs_contigsumsize + 1) * sizeof(u_int32_t); cgp->cg_nextfreeoff = cgp->cg_clusteroff + howmany(fragstoblks(&sblock, sblock.fs_fpg), CHAR_BIT); } dirty(cgbp); return (0); } /* * allocate a data block with the specified number of fragments */ ufs2_daddr_t allocblk(long frags) { int i, j, k, cg, baseblk; struct bufarea *cgbp; struct cg *cgp; if (frags <= 0 || frags > sblock.fs_frag) return (0); for (i = 0; i < maxfsblock - sblock.fs_frag; i += sblock.fs_frag) { for (j = 0; j <= sblock.fs_frag - frags; j++) { if (testbmap(i + j)) continue; for (k = 1; k < frags; k++) if (testbmap(i + j + k)) break; if (k < frags) { j += k; continue; } cg = dtog(&sblock, i + j); cgbp = cgget(cg); cgp = cgbp->b_un.b_cg; if (!check_cgmagic(cg, cgbp)) return (0); baseblk = dtogd(&sblock, i + j); for (k = 0; k < frags; k++) { setbmap(i + j + k); clrbit(cg_blksfree(cgp), baseblk + k); } n_blks += frags; if (frags == sblock.fs_frag) cgp->cg_cs.cs_nbfree--; else cgp->cg_cs.cs_nffree -= frags; dirty(cgbp); return (i + j); } } return (0); } /* * Free a previously allocated block */ void freeblk(ufs2_daddr_t blkno, long frags) { struct inodesc idesc; idesc.id_blkno = blkno; idesc.id_numfrags = frags; (void)pass4check(&idesc); } /* Slow down IO so as to leave some disk bandwidth for other processes */ void slowio_start() { /* Delay one in every 8 operations */ slowio_pollcnt = (slowio_pollcnt + 1) & 7; if (slowio_pollcnt == 0) { gettimeofday(&slowio_starttime, NULL); } } void slowio_end() { struct timeval tv; int delay_usec; if (slowio_pollcnt != 0) return; /* Update the slowdown interval. */ gettimeofday(&tv, NULL); delay_usec = (tv.tv_sec - slowio_starttime.tv_sec) * 1000000 + (tv.tv_usec - slowio_starttime.tv_usec); if (delay_usec < 64) delay_usec = 64; if (delay_usec > 2500000) delay_usec = 2500000; slowio_delay_usec = (slowio_delay_usec * 63 + delay_usec) >> 6; /* delay by 8 times the average IO delay */ if (slowio_delay_usec > 64) usleep(slowio_delay_usec * 8); } /* * Find a pathname */ void getpathname(char *namebuf, ino_t curdir, ino_t ino) { int len; char *cp; struct inodesc idesc; static int busy = 0; if (curdir == ino && ino == UFS_ROOTINO) { (void)strcpy(namebuf, "/"); return; } if (busy || !INO_IS_DVALID(curdir)) { (void)strcpy(namebuf, "?"); return; } busy = 1; memset(&idesc, 0, sizeof(struct inodesc)); idesc.id_type = DATA; idesc.id_fix = IGNORE; cp = &namebuf[MAXPATHLEN - 1]; *cp = '\0'; if (curdir != ino) { idesc.id_parent = curdir; goto namelookup; } while (ino != UFS_ROOTINO) { idesc.id_number = ino; idesc.id_func = findino; idesc.id_name = strdup(".."); if ((ckinode(ginode(ino), &idesc) & FOUND) == 0) break; namelookup: idesc.id_number = idesc.id_parent; idesc.id_parent = ino; idesc.id_func = findname; idesc.id_name = namebuf; if ((ckinode(ginode(idesc.id_number), &idesc)&FOUND) == 0) break; len = strlen(namebuf); cp -= len; memmove(cp, namebuf, (size_t)len); *--cp = '/'; if (cp < &namebuf[UFS_MAXNAMLEN]) break; ino = idesc.id_number; } busy = 0; if (ino != UFS_ROOTINO) *--cp = '?'; memmove(namebuf, cp, (size_t)(&namebuf[MAXPATHLEN] - cp)); } void catch(int sig __unused) { ckfini(0); exit(12); } /* * When preening, allow a single quit to signal * a special exit after file system checks complete * so that reboot sequence may be interrupted. */ void catchquit(int sig __unused) { printf("returning to single-user after file system check\n"); returntosingle = 1; (void)signal(SIGQUIT, SIG_DFL); } /* * determine whether an inode should be fixed. */ int dofix(struct inodesc *idesc, const char *msg) { switch (idesc->id_fix) { case DONTKNOW: if (idesc->id_type == DATA) direrror(idesc->id_number, msg); else pwarn("%s", msg); if (preen) { printf(" (SALVAGED)\n"); idesc->id_fix = FIX; return (ALTERED); } if (reply("SALVAGE") == 0) { idesc->id_fix = NOFIX; return (0); } idesc->id_fix = FIX; return (ALTERED); case FIX: return (ALTERED); case NOFIX: case IGNORE: return (0); default: errx(EEXIT, "UNKNOWN INODESC FIX MODE %d", idesc->id_fix); } /* NOTREACHED */ return (0); } #include /* * An unexpected inconsistency occurred. * Die if preening or file system is running with soft dependency protocol, * otherwise just print message and continue. */ void pfatal(const char *fmt, ...) { va_list ap; va_start(ap, fmt); if (!preen) { (void)vfprintf(stdout, fmt, ap); va_end(ap); if (usedsoftdep) (void)fprintf(stdout, "\nUNEXPECTED SOFT UPDATE INCONSISTENCY\n"); /* * Force foreground fsck to clean up inconsistency. */ if (bkgrdflag) { cmd.value = FS_NEEDSFSCK; cmd.size = 1; if (sysctlbyname("vfs.ffs.setflags", 0, 0, &cmd, sizeof cmd) == -1) pwarn("CANNOT SET FS_NEEDSFSCK FLAG\n"); fprintf(stdout, "CANNOT RUN IN BACKGROUND\n"); ckfini(0); exit(EEXIT); } return; } if (cdevname == NULL) cdevname = strdup("fsck"); (void)fprintf(stdout, "%s: ", cdevname); (void)vfprintf(stdout, fmt, ap); (void)fprintf(stdout, "\n%s: UNEXPECTED%sINCONSISTENCY; RUN fsck MANUALLY.\n", cdevname, usedsoftdep ? " SOFT UPDATE " : " "); /* * Force foreground fsck to clean up inconsistency. */ if (bkgrdflag) { cmd.value = FS_NEEDSFSCK; cmd.size = 1; if (sysctlbyname("vfs.ffs.setflags", 0, 0, &cmd, sizeof cmd) == -1) pwarn("CANNOT SET FS_NEEDSFSCK FLAG\n"); } ckfini(0); exit(EEXIT); } /* * Pwarn just prints a message when not preening or running soft dependency * protocol, or a warning (preceded by filename) when preening. */ void pwarn(const char *fmt, ...) { va_list ap; va_start(ap, fmt); if (preen) (void)fprintf(stdout, "%s: ", cdevname); (void)vfprintf(stdout, fmt, ap); va_end(ap); } /* * Stub for routines from kernel. */ void panic(const char *fmt, ...) { va_list ap; va_start(ap, fmt); pfatal("INTERNAL INCONSISTENCY:"); (void)vfprintf(stdout, fmt, ap); va_end(ap); exit(EEXIT); } Index: head/sbin/fsck_ffs/pass5.c =================================================================== --- head/sbin/fsck_ffs/pass5.c (revision 323922) +++ head/sbin/fsck_ffs/pass5.c (revision 323923) @@ -1,596 +1,609 @@ /* * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #if 0 #ifndef lint static const char sccsid[] = "@(#)pass5.c 8.9 (Berkeley) 4/28/95"; #endif /* not lint */ #endif #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include "fsck.h" static void check_maps(u_char *, u_char *, int, ufs2_daddr_t, const char *, int *, int, int, int); static void clear_blocks(ufs2_daddr_t start, ufs2_daddr_t end); void pass5(void) { int c, i, j, blk, frags, basesize, mapsize; int inomapsize, blkmapsize; struct fs *fs = &sblock; ufs2_daddr_t d, dbase, dmax, start; int rewritecg = 0; struct csum *cs; struct csum_total cstotal; struct inodesc idesc[3]; char buf[MAXBSIZE]; struct cg *cg, *newcg = (struct cg *)buf; struct bufarea *cgbp; inoinfo(UFS_WINO)->ino_state = USTATE; memset(newcg, 0, (size_t)fs->fs_cgsize); newcg->cg_niblk = fs->fs_ipg; + if (preen == 0 && yflag == 0 && fs->fs_magic == FS_UFS2_MAGIC && + fswritefd != -1 && (fs->fs_metackhash & CK_CYLGRP) == 0 && + reply("ADD CYLINDER GROUP CHECKSUM PROTECTION") != 0) { + fs->fs_metackhash |= CK_CYLGRP; + rewritecg = 1; + sbdirty(); + } if (cvtlevel >= 3) { if (fs->fs_maxcontig < 2 && fs->fs_contigsumsize > 0) { if (preen) pwarn("DELETING CLUSTERING MAPS\n"); if (preen || reply("DELETE CLUSTERING MAPS")) { fs->fs_contigsumsize = 0; rewritecg = 1; sbdirty(); } } if (fs->fs_maxcontig > 1) { const char *doit = NULL; if (fs->fs_contigsumsize < 1) { doit = "CREAT"; } else if (fs->fs_contigsumsize < fs->fs_maxcontig && fs->fs_contigsumsize < FS_MAXCONTIG) { doit = "EXPAND"; } if (doit) { i = fs->fs_contigsumsize; fs->fs_contigsumsize = MIN(fs->fs_maxcontig, FS_MAXCONTIG); if (CGSIZE(fs) > (u_int)fs->fs_bsize) { pwarn("CANNOT %s CLUSTER MAPS\n", doit); fs->fs_contigsumsize = i; } else if (preen || reply("CREATE CLUSTER MAPS")) { if (preen) pwarn("%sING CLUSTER MAPS\n", doit); fs->fs_cgsize = fragroundup(fs, CGSIZE(fs)); rewritecg = 1; sbdirty(); } } } } basesize = &newcg->cg_space[0] - (u_char *)(&newcg->cg_firstfield); if (sblock.fs_magic == FS_UFS2_MAGIC) { newcg->cg_iusedoff = basesize; } else { /* * We reserve the space for the old rotation summary * tables for the benefit of old kernels, but do not * maintain them in modern kernels. In time, they can * go away. */ newcg->cg_old_btotoff = basesize; newcg->cg_old_boff = newcg->cg_old_btotoff + fs->fs_old_cpg * sizeof(int32_t); newcg->cg_iusedoff = newcg->cg_old_boff + fs->fs_old_cpg * fs->fs_old_nrpos * sizeof(u_int16_t); memset(&newcg->cg_space[0], 0, newcg->cg_iusedoff - basesize); } inomapsize = howmany(fs->fs_ipg, CHAR_BIT); newcg->cg_freeoff = newcg->cg_iusedoff + inomapsize; blkmapsize = howmany(fs->fs_fpg, CHAR_BIT); newcg->cg_nextfreeoff = newcg->cg_freeoff + blkmapsize; if (fs->fs_contigsumsize > 0) { newcg->cg_clustersumoff = newcg->cg_nextfreeoff - sizeof(u_int32_t); newcg->cg_clustersumoff = roundup(newcg->cg_clustersumoff, sizeof(u_int32_t)); newcg->cg_clusteroff = newcg->cg_clustersumoff + (fs->fs_contigsumsize + 1) * sizeof(u_int32_t); newcg->cg_nextfreeoff = newcg->cg_clusteroff + howmany(fragstoblks(fs, fs->fs_fpg), CHAR_BIT); } newcg->cg_magic = CG_MAGIC; mapsize = newcg->cg_nextfreeoff - newcg->cg_iusedoff; memset(&idesc[0], 0, sizeof idesc); for (i = 0; i < 3; i++) idesc[i].id_type = ADDR; memset(&cstotal, 0, sizeof(struct csum_total)); dmax = blknum(fs, fs->fs_size + fs->fs_frag - 1); for (d = fs->fs_size; d < dmax; d++) setbmap(d); for (c = 0; c < fs->fs_ncg; c++) { if (got_siginfo) { printf("%s: phase 5: cyl group %d of %d (%d%%)\n", cdevname, c, sblock.fs_ncg, c * 100 / sblock.fs_ncg); got_siginfo = 0; } if (got_sigalarm) { setproctitle("%s p5 %d%%", cdevname, c * 100 / sblock.fs_ncg); got_sigalarm = 0; } cgbp = cgget(c); cg = cgbp->b_un.b_cg; if (!cg_chkmagic(cg)) pfatal("CG %d: BAD MAGIC NUMBER\n", c); newcg->cg_time = cg->cg_time; newcg->cg_old_time = cg->cg_old_time; newcg->cg_unrefs = cg->cg_unrefs; newcg->cg_cgx = c; dbase = cgbase(fs, c); dmax = dbase + fs->fs_fpg; if (dmax > fs->fs_size) dmax = fs->fs_size; newcg->cg_ndblk = dmax - dbase; if (fs->fs_magic == FS_UFS1_MAGIC) { if (c == fs->fs_ncg - 1) newcg->cg_old_ncyl = howmany(newcg->cg_ndblk, fs->fs_fpg / fs->fs_old_cpg); else newcg->cg_old_ncyl = fs->fs_old_cpg; newcg->cg_old_niblk = fs->fs_ipg; newcg->cg_niblk = 0; } if (fs->fs_contigsumsize > 0) newcg->cg_nclusterblks = newcg->cg_ndblk / fs->fs_frag; newcg->cg_cs.cs_ndir = 0; newcg->cg_cs.cs_nffree = 0; newcg->cg_cs.cs_nbfree = 0; newcg->cg_cs.cs_nifree = fs->fs_ipg; if (cg->cg_rotor >= 0 && cg->cg_rotor < newcg->cg_ndblk) newcg->cg_rotor = cg->cg_rotor; else newcg->cg_rotor = 0; if (cg->cg_frotor >= 0 && cg->cg_frotor < newcg->cg_ndblk) newcg->cg_frotor = cg->cg_frotor; else newcg->cg_frotor = 0; if (cg->cg_irotor >= 0 && cg->cg_irotor < fs->fs_ipg) newcg->cg_irotor = cg->cg_irotor; else newcg->cg_irotor = 0; if (fs->fs_magic == FS_UFS1_MAGIC) { newcg->cg_initediblk = 0; } else { if ((unsigned)cg->cg_initediblk > fs->fs_ipg) newcg->cg_initediblk = fs->fs_ipg; else newcg->cg_initediblk = cg->cg_initediblk; } memset(&newcg->cg_frsum[0], 0, sizeof newcg->cg_frsum); memset(cg_inosused(newcg), 0, (size_t)(mapsize)); j = fs->fs_ipg * c; for (i = 0; i < inostathead[c].il_numalloced; j++, i++) { switch (inoinfo(j)->ino_state) { case USTATE: break; case DSTATE: case DCLEAR: case DFOUND: case DZLINK: newcg->cg_cs.cs_ndir++; /* FALLTHROUGH */ case FSTATE: case FCLEAR: case FZLINK: newcg->cg_cs.cs_nifree--; setbit(cg_inosused(newcg), i); break; default: if (j < (int)UFS_ROOTINO) break; errx(EEXIT, "BAD STATE %d FOR INODE I=%d", inoinfo(j)->ino_state, j); } } if (c == 0) for (i = 0; i < (int)UFS_ROOTINO; i++) { setbit(cg_inosused(newcg), i); newcg->cg_cs.cs_nifree--; } start = -1; for (i = 0, d = dbase; d < dmax; d += fs->fs_frag, i += fs->fs_frag) { frags = 0; for (j = 0; j < fs->fs_frag; j++) { if (testbmap(d + j)) { if ((Eflag || Zflag) && start != -1) { clear_blocks(start, d + j - 1); start = -1; } continue; } if (start == -1) start = d + j; setbit(cg_blksfree(newcg), i + j); frags++; } if (frags == fs->fs_frag) { newcg->cg_cs.cs_nbfree++; if (fs->fs_contigsumsize > 0) setbit(cg_clustersfree(newcg), i / fs->fs_frag); } else if (frags > 0) { newcg->cg_cs.cs_nffree += frags; blk = blkmap(fs, cg_blksfree(newcg), i); ffs_fragacct(fs, blk, newcg->cg_frsum, 1); } } if ((Eflag || Zflag) && start != -1) clear_blocks(start, d - 1); if (fs->fs_contigsumsize > 0) { int32_t *sump = cg_clustersum(newcg); u_char *mapp = cg_clustersfree(newcg); int map = *mapp++; int bit = 1; int run = 0; for (i = 0; i < newcg->cg_nclusterblks; i++) { if ((map & bit) != 0) { run++; } else if (run != 0) { if (run > fs->fs_contigsumsize) run = fs->fs_contigsumsize; sump[run]++; run = 0; } if ((i & (CHAR_BIT - 1)) != (CHAR_BIT - 1)) { bit <<= 1; } else { map = *mapp++; bit = 1; } } if (run != 0) { if (run > fs->fs_contigsumsize) run = fs->fs_contigsumsize; sump[run]++; } } + if ((fs->fs_metackhash & CK_CYLGRP) != 0) { + newcg->cg_ckhash = 0; + newcg->cg_ckhash = + calculate_crc32c(~0L, (void *)newcg, fs->fs_cgsize); + } + if (bkgrdflag != 0) { cstotal.cs_nffree += cg->cg_cs.cs_nffree; cstotal.cs_nbfree += cg->cg_cs.cs_nbfree; cstotal.cs_nifree += cg->cg_cs.cs_nifree; cstotal.cs_ndir += cg->cg_cs.cs_ndir; } else { cstotal.cs_nffree += newcg->cg_cs.cs_nffree; cstotal.cs_nbfree += newcg->cg_cs.cs_nbfree; cstotal.cs_nifree += newcg->cg_cs.cs_nifree; cstotal.cs_ndir += newcg->cg_cs.cs_ndir; } cs = &fs->fs_cs(fs, c); if (cursnapshot == 0 && memcmp(&newcg->cg_cs, cs, sizeof *cs) != 0 && dofix(&idesc[0], "FREE BLK COUNT(S) WRONG IN SUPERBLK")) { memmove(cs, &newcg->cg_cs, sizeof *cs); sbdirty(); } if (rewritecg) { memmove(cg, newcg, (size_t)fs->fs_cgsize); dirty(cgbp); continue; } if (cursnapshot == 0 && memcmp(newcg, cg, basesize) != 0 && dofix(&idesc[2], "SUMMARY INFORMATION BAD")) { memmove(cg, newcg, (size_t)basesize); dirty(cgbp); } if (bkgrdflag != 0 || usedsoftdep || debug) update_maps(cg, newcg, bkgrdflag); if (cursnapshot == 0 && memcmp(cg_inosused(newcg), cg_inosused(cg), mapsize) != 0 && dofix(&idesc[1], "BLK(S) MISSING IN BIT MAPS")) { memmove(cg_inosused(cg), cg_inosused(newcg), (size_t)mapsize); dirty(cgbp); } } if (cursnapshot == 0 && memcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal) != 0 && dofix(&idesc[0], "SUMMARY BLK COUNT(S) WRONG IN SUPERBLK")) { memmove(&fs->fs_cstotal, &cstotal, sizeof cstotal); fs->fs_ronly = 0; fs->fs_fmod = 0; sbdirty(); } /* * When doing background fsck on a snapshot, figure out whether * the superblock summary is inaccurate and correct it when * necessary. */ if (cursnapshot != 0) { cmd.size = 1; cmd.value = cstotal.cs_ndir - fs->fs_cstotal.cs_ndir; if (cmd.value != 0) { if (debug) printf("adjndir by %+" PRIi64 "\n", cmd.value); if (bkgrdsumadj == 0 || sysctl(adjndir, MIBSIZE, 0, 0, &cmd, sizeof cmd) == -1) rwerror("ADJUST NUMBER OF DIRECTORIES", cmd.value); } cmd.value = cstotal.cs_nbfree - fs->fs_cstotal.cs_nbfree; if (cmd.value != 0) { if (debug) printf("adjnbfree by %+" PRIi64 "\n", cmd.value); if (bkgrdsumadj == 0 || sysctl(adjnbfree, MIBSIZE, 0, 0, &cmd, sizeof cmd) == -1) rwerror("ADJUST NUMBER OF FREE BLOCKS", cmd.value); } cmd.value = cstotal.cs_nifree - fs->fs_cstotal.cs_nifree; if (cmd.value != 0) { if (debug) printf("adjnifree by %+" PRIi64 "\n", cmd.value); if (bkgrdsumadj == 0 || sysctl(adjnifree, MIBSIZE, 0, 0, &cmd, sizeof cmd) == -1) rwerror("ADJUST NUMBER OF FREE INODES", cmd.value); } cmd.value = cstotal.cs_nffree - fs->fs_cstotal.cs_nffree; if (cmd.value != 0) { if (debug) printf("adjnffree by %+" PRIi64 "\n", cmd.value); if (bkgrdsumadj == 0 || sysctl(adjnffree, MIBSIZE, 0, 0, &cmd, sizeof cmd) == -1) rwerror("ADJUST NUMBER OF FREE FRAGS", cmd.value); } cmd.value = cstotal.cs_numclusters - fs->fs_cstotal.cs_numclusters; if (cmd.value != 0) { if (debug) printf("adjnumclusters by %+" PRIi64 "\n", cmd.value); if (bkgrdsumadj == 0 || sysctl(adjnumclusters, MIBSIZE, 0, 0, &cmd, sizeof cmd) == -1) rwerror("ADJUST NUMBER OF FREE CLUSTERS", cmd.value); } } } /* * Compare the original cylinder group inode and block bitmaps with the * updated cylinder group inode and block bitmaps. Free inodes and blocks * that have been added. Complain if any previously freed inodes blocks * are now allocated. */ void update_maps( struct cg *oldcg, /* cylinder group of claimed allocations */ struct cg *newcg, /* cylinder group of determined allocations */ int usesysctl) /* 1 => use sysctl interface to update maps */ { int inomapsize, excessdirs; struct fs *fs = &sblock; inomapsize = howmany(fs->fs_ipg, CHAR_BIT); excessdirs = oldcg->cg_cs.cs_ndir - newcg->cg_cs.cs_ndir; if (excessdirs < 0) { pfatal("LOST %d DIRECTORIES\n", -excessdirs); excessdirs = 0; } if (excessdirs > 0) check_maps(cg_inosused(newcg), cg_inosused(oldcg), inomapsize, oldcg->cg_cgx * (ufs2_daddr_t)fs->fs_ipg, "DIR", freedirs, 0, excessdirs, usesysctl); check_maps(cg_inosused(newcg), cg_inosused(oldcg), inomapsize, oldcg->cg_cgx * (ufs2_daddr_t)fs->fs_ipg, "FILE", freefiles, excessdirs, fs->fs_ipg, usesysctl); check_maps(cg_blksfree(oldcg), cg_blksfree(newcg), howmany(fs->fs_fpg, CHAR_BIT), oldcg->cg_cgx * (ufs2_daddr_t)fs->fs_fpg, "FRAG", freeblks, 0, fs->fs_fpg, usesysctl); } static void check_maps( u_char *map1, /* map of claimed allocations */ u_char *map2, /* map of determined allocations */ int mapsize, /* size of above two maps */ ufs2_daddr_t startvalue, /* resource value for first element in map */ const char *name, /* name of resource found in maps */ int *opcode, /* sysctl opcode to free resource */ int skip, /* number of entries to skip before starting to free */ int limit, /* limit on number of entries to free */ int usesysctl) /* 1 => use sysctl interface to update maps */ { # define BUFSIZE 16 char buf[BUFSIZE]; long i, j, k, l, m, size; ufs2_daddr_t n, astart, aend, ustart, uend; void (*msg)(const char *fmt, ...); if (usesysctl) msg = pfatal; else msg = pwarn; astart = ustart = aend = uend = -1; for (i = 0; i < mapsize; i++) { j = *map1++; k = *map2++; if (j == k) continue; for (m = 0, l = 1; m < CHAR_BIT; m++, l <<= 1) { if ((j & l) == (k & l)) continue; n = startvalue + i * CHAR_BIT + m; if ((j & l) != 0) { if (astart == -1) { astart = aend = n; continue; } if (aend + 1 == n) { aend = n; continue; } if (astart == aend) (*msg)("ALLOCATED %s %" PRId64 " MARKED FREE\n", name, astart); else (*msg)("%s %sS %" PRId64 "-%" PRId64 " MARKED FREE\n", "ALLOCATED", name, astart, aend); astart = aend = n; } else { if (ustart == -1) { ustart = uend = n; continue; } if (uend + 1 == n) { uend = n; continue; } size = uend - ustart + 1; if (size <= skip) { skip -= size; ustart = uend = n; continue; } if (skip > 0) { ustart += skip; size -= skip; skip = 0; } if (size > limit) size = limit; if (debug && size == 1) pwarn("%s %s %" PRId64 " MARKED USED\n", "UNALLOCATED", name, ustart); else if (debug) pwarn("%s %sS %" PRId64 "-%" PRId64 " MARKED USED\n", "UNALLOCATED", name, ustart, ustart + size - 1); if (usesysctl != 0) { cmd.value = ustart; cmd.size = size; if (sysctl(opcode, MIBSIZE, 0, 0, &cmd, sizeof cmd) == -1) { snprintf(buf, BUFSIZE, "FREE %s", name); rwerror(buf, cmd.value); } } limit -= size; if (limit <= 0) return; ustart = uend = n; } } } if (astart != -1) { if (astart == aend) (*msg)("ALLOCATED %s %" PRId64 " MARKED FREE\n", name, astart); else (*msg)("ALLOCATED %sS %" PRId64 "-%" PRId64 " MARKED FREE\n", name, astart, aend); } if (ustart != -1) { size = uend - ustart + 1; if (size <= skip) return; if (skip > 0) { ustart += skip; size -= skip; } if (size > limit) size = limit; if (debug) { if (size == 1) pwarn("UNALLOCATED %s %" PRId64 " MARKED USED\n", name, ustart); else pwarn("UNALLOCATED %sS %" PRId64 "-%" PRId64 " MARKED USED\n", name, ustart, ustart + size - 1); } if (usesysctl != 0) { cmd.value = ustart; cmd.size = size; if (sysctl(opcode, MIBSIZE, 0, 0, &cmd, sizeof cmd) == -1) { snprintf(buf, BUFSIZE, "FREE %s", name); rwerror(buf, cmd.value); } } } } static void clear_blocks(ufs2_daddr_t start, ufs2_daddr_t end) { if (debug) printf("Zero frags %jd to %jd\n", start, end); if (Zflag) blzero(fswritefd, fsbtodb(&sblock, start), lfragtosize(&sblock, end - start + 1)); if (Eflag) blerase(fswritefd, fsbtodb(&sblock, start), lfragtosize(&sblock, end - start + 1)); } Index: head/sbin/fsdb/Makefile =================================================================== --- head/sbin/fsdb/Makefile (revision 323922) +++ head/sbin/fsdb/Makefile (revision 323923) @@ -1,16 +1,16 @@ # $NetBSD: Makefile,v 1.1.1.1 1995/10/08 23:08:36 thorpej Exp $ # @(#)Makefile 8.1 (Berkeley) 6/5/93 # $FreeBSD$ PACKAGE=runtime PROG= fsdb MAN= fsdb.8 SRCS= fsdb.c fsdbutil.c \ dir.c ea.c fsutil.c inode.c pass1.c pass1b.c pass2.c pass3.c pass4.c \ pass5.c setup.c utilities.c ffs_subr.c ffs_tables.c globs.c CFLAGS+= -I${.CURDIR:H}/fsck_ffs WARNS?= 2 -LIBADD= edit +LIBADD= edit ufs .PATH: ${.CURDIR:H}/fsck_ffs ${SRCTOP}/sys/ufs/ffs .include Index: head/sbin/newfs/mkfs.c =================================================================== --- head/sbin/newfs/mkfs.c (revision 323922) +++ head/sbin/newfs/mkfs.c (revision 323923) @@ -1,1183 +1,1206 @@ /* * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Marshall * Kirk McKusick and Network Associates Laboratories, the Security * Research Division of Network Associates, Inc. under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS * research program. * * Copyright (c) 1980, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #if 0 #ifndef lint static char sccsid[] = "@(#)mkfs.c 8.11 (Berkeley) 5/3/95"; #endif /* not lint */ #endif #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "newfs.h" /* * make file system for cylinder-group style file systems */ #define UMASK 0755 #define POWEROF2(num) (((num) & ((num) - 1)) == 0) static struct csum *fscs; #define sblock disk.d_fs #define acg disk.d_cg union dinode { struct ufs1_dinode dp1; struct ufs2_dinode dp2; }; #define DIP(dp, field) \ ((sblock.fs_magic == FS_UFS1_MAGIC) ? \ (dp)->dp1.field : (dp)->dp2.field) static caddr_t iobuf; static long iobufsize; static ufs2_daddr_t alloc(int size, int mode); static int charsperline(void); static void clrblock(struct fs *, unsigned char *, int); static void fsinit(time_t); static int ilog2(int); static void initcg(int, time_t); static int isblock(struct fs *, unsigned char *, int); static void iput(union dinode *, ino_t); static int makedir(struct direct *, int); static void setblock(struct fs *, unsigned char *, int); static void wtfs(ufs2_daddr_t, int, char *); +static void cgckhash(struct cg *); static u_int32_t newfs_random(void); static int do_sbwrite(struct uufsd *disk) { if (!disk->d_sblock) disk->d_sblock = disk->d_fs.fs_sblockloc / disk->d_bsize; return (pwrite(disk->d_fd, &disk->d_fs, SBLOCKSIZE, (off_t)((part_ofs + disk->d_sblock) * disk->d_bsize))); } void mkfs(struct partition *pp, char *fsys) { int fragsperinode, optimalfpg, origdensity, minfpg, lastminfpg; long i, j, csfrags; uint cg; time_t utime; quad_t sizepb; int width; ino_t maxinum; int minfragsperinode; /* minimum ratio of frags to inodes */ char tmpbuf[100]; /* XXX this will break in about 2,500 years */ struct fsrecovery *fsr; char *fsrbuf; union { struct fs fdummy; char cdummy[SBLOCKSIZE]; } dummy; #define fsdummy dummy.fdummy #define chdummy dummy.cdummy /* * Our blocks == sector size, and the version of UFS we are using is * specified by Oflag. */ disk.d_bsize = sectorsize; disk.d_ufs = Oflag; if (Rflag) utime = 1000000000; else time(&utime); sblock.fs_old_flags = FS_FLAGS_UPDATED; sblock.fs_flags = 0; if (Uflag) sblock.fs_flags |= FS_DOSOFTDEP; if (Lflag) strlcpy(sblock.fs_volname, volumelabel, MAXVOLLEN); if (Jflag) sblock.fs_flags |= FS_GJOURNAL; if (lflag) sblock.fs_flags |= FS_MULTILABEL; if (tflag) sblock.fs_flags |= FS_TRIM; /* * Validate the given file system size. * Verify that its last block can actually be accessed. * Convert to file system fragment sized units. */ if (fssize <= 0) { printf("preposterous size %jd\n", (intmax_t)fssize); exit(13); } wtfs(fssize - (realsectorsize / DEV_BSIZE), realsectorsize, (char *)&sblock); /* * collect and verify the file system density info */ sblock.fs_avgfilesize = avgfilesize; sblock.fs_avgfpdir = avgfilesperdir; if (sblock.fs_avgfilesize <= 0) printf("illegal expected average file size %d\n", sblock.fs_avgfilesize), exit(14); if (sblock.fs_avgfpdir <= 0) printf("illegal expected number of files per directory %d\n", sblock.fs_avgfpdir), exit(15); restart: /* * collect and verify the block and fragment sizes */ sblock.fs_bsize = bsize; sblock.fs_fsize = fsize; if (!POWEROF2(sblock.fs_bsize)) { printf("block size must be a power of 2, not %d\n", sblock.fs_bsize); exit(16); } if (!POWEROF2(sblock.fs_fsize)) { printf("fragment size must be a power of 2, not %d\n", sblock.fs_fsize); exit(17); } if (sblock.fs_fsize < sectorsize) { printf("increasing fragment size from %d to sector size (%d)\n", sblock.fs_fsize, sectorsize); sblock.fs_fsize = sectorsize; } if (sblock.fs_bsize > MAXBSIZE) { printf("decreasing block size from %d to maximum (%d)\n", sblock.fs_bsize, MAXBSIZE); sblock.fs_bsize = MAXBSIZE; } if (sblock.fs_bsize < MINBSIZE) { printf("increasing block size from %d to minimum (%d)\n", sblock.fs_bsize, MINBSIZE); sblock.fs_bsize = MINBSIZE; } if (sblock.fs_fsize > MAXBSIZE) { printf("decreasing fragment size from %d to maximum (%d)\n", sblock.fs_fsize, MAXBSIZE); sblock.fs_fsize = MAXBSIZE; } if (sblock.fs_bsize < sblock.fs_fsize) { printf("increasing block size from %d to fragment size (%d)\n", sblock.fs_bsize, sblock.fs_fsize); sblock.fs_bsize = sblock.fs_fsize; } if (sblock.fs_fsize * MAXFRAG < sblock.fs_bsize) { printf( "increasing fragment size from %d to block size / %d (%d)\n", sblock.fs_fsize, MAXFRAG, sblock.fs_bsize / MAXFRAG); sblock.fs_fsize = sblock.fs_bsize / MAXFRAG; } if (maxbsize == 0) maxbsize = bsize; if (maxbsize < bsize || !POWEROF2(maxbsize)) { sblock.fs_maxbsize = sblock.fs_bsize; printf("Extent size set to %d\n", sblock.fs_maxbsize); } else if (sblock.fs_maxbsize > FS_MAXCONTIG * sblock.fs_bsize) { sblock.fs_maxbsize = FS_MAXCONTIG * sblock.fs_bsize; printf("Extent size reduced to %d\n", sblock.fs_maxbsize); } else { sblock.fs_maxbsize = maxbsize; } /* * Maxcontig sets the default for the maximum number of blocks * that may be allocated sequentially. With file system clustering * it is possible to allocate contiguous blocks up to the maximum * transfer size permitted by the controller or buffering. */ if (maxcontig == 0) maxcontig = MAX(1, MAXPHYS / bsize); sblock.fs_maxcontig = maxcontig; if (sblock.fs_maxcontig < sblock.fs_maxbsize / sblock.fs_bsize) { sblock.fs_maxcontig = sblock.fs_maxbsize / sblock.fs_bsize; printf("Maxcontig raised to %d\n", sblock.fs_maxbsize); } if (sblock.fs_maxcontig > 1) sblock.fs_contigsumsize = MIN(sblock.fs_maxcontig,FS_MAXCONTIG); sblock.fs_bmask = ~(sblock.fs_bsize - 1); sblock.fs_fmask = ~(sblock.fs_fsize - 1); sblock.fs_qbmask = ~sblock.fs_bmask; sblock.fs_qfmask = ~sblock.fs_fmask; sblock.fs_bshift = ilog2(sblock.fs_bsize); sblock.fs_fshift = ilog2(sblock.fs_fsize); sblock.fs_frag = numfrags(&sblock, sblock.fs_bsize); sblock.fs_fragshift = ilog2(sblock.fs_frag); if (sblock.fs_frag > MAXFRAG) { printf("fragment size %d is still too small (can't happen)\n", sblock.fs_bsize / MAXFRAG); exit(21); } sblock.fs_fsbtodb = ilog2(sblock.fs_fsize / sectorsize); sblock.fs_size = fssize = dbtofsb(&sblock, fssize); sblock.fs_providersize = dbtofsb(&sblock, mediasize / sectorsize); /* * Before the filesystem is finally initialized, mark it * as incompletely initialized. */ sblock.fs_magic = FS_BAD_MAGIC; if (Oflag == 1) { sblock.fs_sblockloc = SBLOCK_UFS1; sblock.fs_nindir = sblock.fs_bsize / sizeof(ufs1_daddr_t); sblock.fs_inopb = sblock.fs_bsize / sizeof(struct ufs1_dinode); sblock.fs_maxsymlinklen = ((UFS_NDADDR + UFS_NIADDR) * sizeof(ufs1_daddr_t)); sblock.fs_old_inodefmt = FS_44INODEFMT; sblock.fs_old_cgoffset = 0; sblock.fs_old_cgmask = 0xffffffff; sblock.fs_old_size = sblock.fs_size; sblock.fs_old_rotdelay = 0; sblock.fs_old_rps = 60; sblock.fs_old_nspf = sblock.fs_fsize / sectorsize; sblock.fs_old_cpg = 1; sblock.fs_old_interleave = 1; sblock.fs_old_trackskew = 0; sblock.fs_old_cpc = 0; sblock.fs_old_postblformat = 1; sblock.fs_old_nrpos = 1; } else { sblock.fs_sblockloc = SBLOCK_UFS2; sblock.fs_nindir = sblock.fs_bsize / sizeof(ufs2_daddr_t); sblock.fs_inopb = sblock.fs_bsize / sizeof(struct ufs2_dinode); sblock.fs_maxsymlinklen = ((UFS_NDADDR + UFS_NIADDR) * sizeof(ufs2_daddr_t)); } sblock.fs_sblkno = roundup(howmany(sblock.fs_sblockloc + SBLOCKSIZE, sblock.fs_fsize), sblock.fs_frag); sblock.fs_cblkno = sblock.fs_sblkno + roundup(howmany(SBLOCKSIZE, sblock.fs_fsize), sblock.fs_frag); sblock.fs_iblkno = sblock.fs_cblkno + sblock.fs_frag; sblock.fs_maxfilesize = sblock.fs_bsize * UFS_NDADDR - 1; for (sizepb = sblock.fs_bsize, i = 0; i < UFS_NIADDR; i++) { sizepb *= NINDIR(&sblock); sblock.fs_maxfilesize += sizepb; } /* * It's impossible to create a snapshot in case that fs_maxfilesize * is smaller than the fssize. */ if (sblock.fs_maxfilesize < (u_quad_t)fssize) { warnx("WARNING: You will be unable to create snapshots on this " "file system. Correct by using a larger blocksize."); } /* * Calculate the number of blocks to put into each cylinder group. * * This algorithm selects the number of blocks per cylinder * group. The first goal is to have at least enough data blocks * in each cylinder group to meet the density requirement. Once * this goal is achieved we try to expand to have at least * MINCYLGRPS cylinder groups. Once this goal is achieved, we * pack as many blocks into each cylinder group map as will fit. * * We start by calculating the smallest number of blocks that we * can put into each cylinder group. If this is too big, we reduce * the density until it fits. */ maxinum = (((int64_t)(1)) << 32) - INOPB(&sblock); minfragsperinode = 1 + fssize / maxinum; if (density == 0) { density = MAX(NFPI, minfragsperinode) * fsize; } else if (density < minfragsperinode * fsize) { origdensity = density; density = minfragsperinode * fsize; fprintf(stderr, "density increased from %d to %d\n", origdensity, density); } origdensity = density; for (;;) { fragsperinode = MAX(numfrags(&sblock, density), 1); if (fragsperinode < minfragsperinode) { bsize <<= 1; fsize <<= 1; printf("Block size too small for a file system %s %d\n", "of this size. Increasing blocksize to", bsize); goto restart; } minfpg = fragsperinode * INOPB(&sblock); if (minfpg > sblock.fs_size) minfpg = sblock.fs_size; sblock.fs_ipg = INOPB(&sblock); sblock.fs_fpg = roundup(sblock.fs_iblkno + sblock.fs_ipg / INOPF(&sblock), sblock.fs_frag); if (sblock.fs_fpg < minfpg) sblock.fs_fpg = minfpg; sblock.fs_ipg = roundup(howmany(sblock.fs_fpg, fragsperinode), INOPB(&sblock)); sblock.fs_fpg = roundup(sblock.fs_iblkno + sblock.fs_ipg / INOPF(&sblock), sblock.fs_frag); if (sblock.fs_fpg < minfpg) sblock.fs_fpg = minfpg; sblock.fs_ipg = roundup(howmany(sblock.fs_fpg, fragsperinode), INOPB(&sblock)); if (CGSIZE(&sblock) < (unsigned long)sblock.fs_bsize) break; density -= sblock.fs_fsize; } if (density != origdensity) printf("density reduced from %d to %d\n", origdensity, density); /* * Start packing more blocks into the cylinder group until * it cannot grow any larger, the number of cylinder groups * drops below MINCYLGRPS, or we reach the size requested. * For UFS1 inodes per cylinder group are stored in an int16_t * so fs_ipg is limited to 2^15 - 1. */ for ( ; sblock.fs_fpg < maxblkspercg; sblock.fs_fpg += sblock.fs_frag) { sblock.fs_ipg = roundup(howmany(sblock.fs_fpg, fragsperinode), INOPB(&sblock)); if (Oflag > 1 || (Oflag == 1 && sblock.fs_ipg <= 0x7fff)) { if (sblock.fs_size / sblock.fs_fpg < MINCYLGRPS) break; if (CGSIZE(&sblock) < (unsigned long)sblock.fs_bsize) continue; if (CGSIZE(&sblock) == (unsigned long)sblock.fs_bsize) break; } sblock.fs_fpg -= sblock.fs_frag; sblock.fs_ipg = roundup(howmany(sblock.fs_fpg, fragsperinode), INOPB(&sblock)); break; } /* * Check to be sure that the last cylinder group has enough blocks * to be viable. If it is too small, reduce the number of blocks * per cylinder group which will have the effect of moving more * blocks into the last cylinder group. */ optimalfpg = sblock.fs_fpg; for (;;) { sblock.fs_ncg = howmany(sblock.fs_size, sblock.fs_fpg); lastminfpg = roundup(sblock.fs_iblkno + sblock.fs_ipg / INOPF(&sblock), sblock.fs_frag); if (sblock.fs_size < lastminfpg) { printf("Filesystem size %jd < minimum size of %d\n", (intmax_t)sblock.fs_size, lastminfpg); exit(28); } if (sblock.fs_size % sblock.fs_fpg >= lastminfpg || sblock.fs_size % sblock.fs_fpg == 0) break; sblock.fs_fpg -= sblock.fs_frag; sblock.fs_ipg = roundup(howmany(sblock.fs_fpg, fragsperinode), INOPB(&sblock)); } if (optimalfpg != sblock.fs_fpg) printf("Reduced frags per cylinder group from %d to %d %s\n", optimalfpg, sblock.fs_fpg, "to enlarge last cyl group"); sblock.fs_cgsize = fragroundup(&sblock, CGSIZE(&sblock)); sblock.fs_dblkno = sblock.fs_iblkno + sblock.fs_ipg / INOPF(&sblock); if (Oflag == 1) { sblock.fs_old_spc = sblock.fs_fpg * sblock.fs_old_nspf; sblock.fs_old_nsect = sblock.fs_old_spc; sblock.fs_old_npsect = sblock.fs_old_spc; sblock.fs_old_ncyl = sblock.fs_ncg; } /* * fill in remaining fields of the super block */ sblock.fs_csaddr = cgdmin(&sblock, 0); sblock.fs_cssize = fragroundup(&sblock, sblock.fs_ncg * sizeof(struct csum)); fscs = (struct csum *)calloc(1, sblock.fs_cssize); if (fscs == NULL) errx(31, "calloc failed"); sblock.fs_sbsize = fragroundup(&sblock, sizeof(struct fs)); if (sblock.fs_sbsize > SBLOCKSIZE) sblock.fs_sbsize = SBLOCKSIZE; if (sblock.fs_sbsize < realsectorsize) sblock.fs_sbsize = realsectorsize; sblock.fs_minfree = minfree; if (metaspace > 0 && metaspace < sblock.fs_fpg / 2) sblock.fs_metaspace = blknum(&sblock, metaspace); else if (metaspace != -1) /* reserve half of minfree for metadata blocks */ sblock.fs_metaspace = blknum(&sblock, (sblock.fs_fpg * minfree) / 200); if (maxbpg == 0) sblock.fs_maxbpg = MAXBLKPG(sblock.fs_bsize); else sblock.fs_maxbpg = maxbpg; sblock.fs_optim = opt; sblock.fs_cgrotor = 0; sblock.fs_pendingblocks = 0; sblock.fs_pendinginodes = 0; sblock.fs_fmod = 0; sblock.fs_ronly = 0; sblock.fs_state = 0; sblock.fs_clean = 1; sblock.fs_id[0] = (long)utime; sblock.fs_id[1] = newfs_random(); sblock.fs_fsmnt[0] = '\0'; csfrags = howmany(sblock.fs_cssize, sblock.fs_fsize); sblock.fs_dsize = sblock.fs_size - sblock.fs_sblkno - sblock.fs_ncg * (sblock.fs_dblkno - sblock.fs_sblkno); sblock.fs_cstotal.cs_nbfree = fragstoblks(&sblock, sblock.fs_dsize) - howmany(csfrags, sblock.fs_frag); sblock.fs_cstotal.cs_nffree = fragnum(&sblock, sblock.fs_size) + (fragnum(&sblock, csfrags) > 0 ? sblock.fs_frag - fragnum(&sblock, csfrags) : 0); sblock.fs_cstotal.cs_nifree = sblock.fs_ncg * sblock.fs_ipg - UFS_ROOTINO; sblock.fs_cstotal.cs_ndir = 0; sblock.fs_dsize -= csfrags; sblock.fs_time = utime; if (Oflag == 1) { sblock.fs_old_time = utime; sblock.fs_old_dsize = sblock.fs_dsize; sblock.fs_old_csaddr = sblock.fs_csaddr; sblock.fs_old_cstotal.cs_ndir = sblock.fs_cstotal.cs_ndir; sblock.fs_old_cstotal.cs_nbfree = sblock.fs_cstotal.cs_nbfree; sblock.fs_old_cstotal.cs_nifree = sblock.fs_cstotal.cs_nifree; sblock.fs_old_cstotal.cs_nffree = sblock.fs_cstotal.cs_nffree; } + /* + * Set flags for metadata that is being check-hashed. + */ + if (Oflag > 1) + sblock.fs_metackhash = CK_CYLGRP; /* * Dump out summary information about file system. */ # define B2MBFACTOR (1 / (1024.0 * 1024.0)) printf("%s: %.1fMB (%jd sectors) block size %d, fragment size %d\n", fsys, (float)sblock.fs_size * sblock.fs_fsize * B2MBFACTOR, (intmax_t)fsbtodb(&sblock, sblock.fs_size), sblock.fs_bsize, sblock.fs_fsize); printf("\tusing %d cylinder groups of %.2fMB, %d blks, %d inodes.\n", sblock.fs_ncg, (float)sblock.fs_fpg * sblock.fs_fsize * B2MBFACTOR, sblock.fs_fpg / sblock.fs_frag, sblock.fs_ipg); if (sblock.fs_flags & FS_DOSOFTDEP) printf("\twith soft updates\n"); # undef B2MBFACTOR if (Eflag && !Nflag) { printf("Erasing sectors [%jd...%jd]\n", sblock.fs_sblockloc / disk.d_bsize, fsbtodb(&sblock, sblock.fs_size) - 1); berase(&disk, sblock.fs_sblockloc / disk.d_bsize, sblock.fs_size * sblock.fs_fsize - sblock.fs_sblockloc); } /* * Wipe out old UFS1 superblock(s) if necessary. */ if (!Nflag && Oflag != 1 && realsectorsize <= SBLOCK_UFS1) { i = bread(&disk, part_ofs + SBLOCK_UFS1 / disk.d_bsize, chdummy, SBLOCKSIZE); if (i == -1) err(1, "can't read old UFS1 superblock: %s", disk.d_error); if (fsdummy.fs_magic == FS_UFS1_MAGIC) { fsdummy.fs_magic = 0; bwrite(&disk, part_ofs + SBLOCK_UFS1 / disk.d_bsize, chdummy, SBLOCKSIZE); for (cg = 0; cg < fsdummy.fs_ncg; cg++) { if (fsbtodb(&fsdummy, cgsblock(&fsdummy, cg)) > fssize) break; bwrite(&disk, part_ofs + fsbtodb(&fsdummy, cgsblock(&fsdummy, cg)), chdummy, SBLOCKSIZE); } } } if (!Nflag) do_sbwrite(&disk); if (Xflag == 1) { printf("** Exiting on Xflag 1\n"); exit(0); } if (Xflag == 2) printf("** Leaving BAD MAGIC on Xflag 2\n"); else sblock.fs_magic = (Oflag != 1) ? FS_UFS2_MAGIC : FS_UFS1_MAGIC; /* * Now build the cylinders group blocks and * then print out indices of cylinder groups. */ printf("super-block backups (for fsck_ffs -b #) at:\n"); i = 0; width = charsperline(); /* * allocate space for superblock, cylinder group map, and * two sets of inode blocks. */ if (sblock.fs_bsize < SBLOCKSIZE) iobufsize = SBLOCKSIZE + 3 * sblock.fs_bsize; else iobufsize = 4 * sblock.fs_bsize; if ((iobuf = calloc(1, iobufsize)) == 0) { printf("Cannot allocate I/O buffer\n"); exit(38); } /* * Make a copy of the superblock into the buffer that we will be * writing out in each cylinder group. */ bcopy((char *)&sblock, iobuf, SBLOCKSIZE); for (cg = 0; cg < sblock.fs_ncg; cg++) { initcg(cg, utime); j = snprintf(tmpbuf, sizeof(tmpbuf), " %jd%s", (intmax_t)fsbtodb(&sblock, cgsblock(&sblock, cg)), cg < (sblock.fs_ncg-1) ? "," : ""); if (j < 0) tmpbuf[j = 0] = '\0'; if (i + j >= width) { printf("\n"); i = 0; } i += j; printf("%s", tmpbuf); fflush(stdout); } printf("\n"); if (Nflag) exit(0); /* * Now construct the initial file system, * then write out the super-block. */ fsinit(utime); if (Oflag == 1) { sblock.fs_old_cstotal.cs_ndir = sblock.fs_cstotal.cs_ndir; sblock.fs_old_cstotal.cs_nbfree = sblock.fs_cstotal.cs_nbfree; sblock.fs_old_cstotal.cs_nifree = sblock.fs_cstotal.cs_nifree; sblock.fs_old_cstotal.cs_nffree = sblock.fs_cstotal.cs_nffree; } if (Xflag == 3) { printf("** Exiting on Xflag 3\n"); exit(0); } if (!Nflag) { do_sbwrite(&disk); /* * For UFS1 filesystems with a blocksize of 64K, the first * alternate superblock resides at the location used for * the default UFS2 superblock. As there is a valid * superblock at this location, the boot code will use * it as its first choice. Thus we have to ensure that * all of its statistcs on usage are correct. */ if (Oflag == 1 && sblock.fs_bsize == 65536) wtfs(fsbtodb(&sblock, cgsblock(&sblock, 0)), sblock.fs_bsize, (char *)&sblock); } for (i = 0; i < sblock.fs_cssize; i += sblock.fs_bsize) wtfs(fsbtodb(&sblock, sblock.fs_csaddr + numfrags(&sblock, i)), MIN(sblock.fs_cssize - i, sblock.fs_bsize), ((char *)fscs) + i); /* * Read the last sector of the boot block, replace the last * 20 bytes with the recovery information, then write it back. * The recovery information only works for UFS2 filesystems. */ if (sblock.fs_magic == FS_UFS2_MAGIC) { if ((fsrbuf = malloc(realsectorsize)) == NULL || bread(&disk, part_ofs + (SBLOCK_UFS2 - realsectorsize) / disk.d_bsize, fsrbuf, realsectorsize) == -1) err(1, "can't read recovery area: %s", disk.d_error); fsr = (struct fsrecovery *)&fsrbuf[realsectorsize - sizeof *fsr]; fsr->fsr_magic = sblock.fs_magic; fsr->fsr_fpg = sblock.fs_fpg; fsr->fsr_fsbtodb = sblock.fs_fsbtodb; fsr->fsr_sblkno = sblock.fs_sblkno; fsr->fsr_ncg = sblock.fs_ncg; wtfs((SBLOCK_UFS2 - realsectorsize) / disk.d_bsize, realsectorsize, fsrbuf); free(fsrbuf); } /* * Update information about this partition in pack * label, to that it may be updated on disk. */ if (pp != NULL) { pp->p_fstype = FS_BSDFFS; pp->p_fsize = sblock.fs_fsize; pp->p_frag = sblock.fs_frag; pp->p_cpg = sblock.fs_fpg; } } /* * Initialize a cylinder group. */ void initcg(int cylno, time_t utime) { long blkno, start; uint i, j, d, dlower, dupper; ufs2_daddr_t cbase, dmax; struct ufs1_dinode *dp1; struct ufs2_dinode *dp2; struct csum *cs; /* * Determine block bounds for cylinder group. * Allow space for super block summary information in first * cylinder group. */ cbase = cgbase(&sblock, cylno); dmax = cbase + sblock.fs_fpg; if (dmax > sblock.fs_size) dmax = sblock.fs_size; dlower = cgsblock(&sblock, cylno) - cbase; dupper = cgdmin(&sblock, cylno) - cbase; if (cylno == 0) dupper += howmany(sblock.fs_cssize, sblock.fs_fsize); cs = &fscs[cylno]; memset(&acg, 0, sblock.fs_cgsize); acg.cg_time = utime; acg.cg_magic = CG_MAGIC; acg.cg_cgx = cylno; acg.cg_niblk = sblock.fs_ipg; acg.cg_initediblk = MIN(sblock.fs_ipg, 2 * INOPB(&sblock)); acg.cg_ndblk = dmax - cbase; if (sblock.fs_contigsumsize > 0) acg.cg_nclusterblks = acg.cg_ndblk / sblock.fs_frag; start = &acg.cg_space[0] - (u_char *)(&acg.cg_firstfield); if (Oflag == 2) { acg.cg_iusedoff = start; } else { acg.cg_old_ncyl = sblock.fs_old_cpg; acg.cg_old_time = acg.cg_time; acg.cg_time = 0; acg.cg_old_niblk = acg.cg_niblk; acg.cg_niblk = 0; acg.cg_initediblk = 0; acg.cg_old_btotoff = start; acg.cg_old_boff = acg.cg_old_btotoff + sblock.fs_old_cpg * sizeof(int32_t); acg.cg_iusedoff = acg.cg_old_boff + sblock.fs_old_cpg * sizeof(u_int16_t); } acg.cg_freeoff = acg.cg_iusedoff + howmany(sblock.fs_ipg, CHAR_BIT); acg.cg_nextfreeoff = acg.cg_freeoff + howmany(sblock.fs_fpg, CHAR_BIT); if (sblock.fs_contigsumsize > 0) { acg.cg_clustersumoff = roundup(acg.cg_nextfreeoff, sizeof(u_int32_t)); acg.cg_clustersumoff -= sizeof(u_int32_t); acg.cg_clusteroff = acg.cg_clustersumoff + (sblock.fs_contigsumsize + 1) * sizeof(u_int32_t); acg.cg_nextfreeoff = acg.cg_clusteroff + howmany(fragstoblks(&sblock, sblock.fs_fpg), CHAR_BIT); } if (acg.cg_nextfreeoff > (unsigned)sblock.fs_cgsize) { printf("Panic: cylinder group too big\n"); exit(37); } acg.cg_cs.cs_nifree += sblock.fs_ipg; if (cylno == 0) for (i = 0; i < (long)UFS_ROOTINO; i++) { setbit(cg_inosused(&acg), i); acg.cg_cs.cs_nifree--; } if (cylno > 0) { /* * In cylno 0, beginning space is reserved * for boot and super blocks. */ for (d = 0; d < dlower; d += sblock.fs_frag) { blkno = d / sblock.fs_frag; setblock(&sblock, cg_blksfree(&acg), blkno); if (sblock.fs_contigsumsize > 0) setbit(cg_clustersfree(&acg), blkno); acg.cg_cs.cs_nbfree++; } } if ((i = dupper % sblock.fs_frag)) { acg.cg_frsum[sblock.fs_frag - i]++; for (d = dupper + sblock.fs_frag - i; dupper < d; dupper++) { setbit(cg_blksfree(&acg), dupper); acg.cg_cs.cs_nffree++; } } for (d = dupper; d + sblock.fs_frag <= acg.cg_ndblk; d += sblock.fs_frag) { blkno = d / sblock.fs_frag; setblock(&sblock, cg_blksfree(&acg), blkno); if (sblock.fs_contigsumsize > 0) setbit(cg_clustersfree(&acg), blkno); acg.cg_cs.cs_nbfree++; } if (d < acg.cg_ndblk) { acg.cg_frsum[acg.cg_ndblk - d]++; for (; d < acg.cg_ndblk; d++) { setbit(cg_blksfree(&acg), d); acg.cg_cs.cs_nffree++; } } if (sblock.fs_contigsumsize > 0) { int32_t *sump = cg_clustersum(&acg); u_char *mapp = cg_clustersfree(&acg); int map = *mapp++; int bit = 1; int run = 0; for (i = 0; i < acg.cg_nclusterblks; i++) { if ((map & bit) != 0) run++; else if (run != 0) { if (run > sblock.fs_contigsumsize) run = sblock.fs_contigsumsize; sump[run]++; run = 0; } if ((i & (CHAR_BIT - 1)) != CHAR_BIT - 1) bit <<= 1; else { map = *mapp++; bit = 1; } } if (run != 0) { if (run > sblock.fs_contigsumsize) run = sblock.fs_contigsumsize; sump[run]++; } } *cs = acg.cg_cs; + cgckhash(&acg); /* * Write out the duplicate super block, the cylinder group map * and two blocks worth of inodes in a single write. */ start = MAX(sblock.fs_bsize, SBLOCKSIZE); bcopy((char *)&acg, &iobuf[start], sblock.fs_cgsize); start += sblock.fs_bsize; dp1 = (struct ufs1_dinode *)(&iobuf[start]); dp2 = (struct ufs2_dinode *)(&iobuf[start]); for (i = 0; i < acg.cg_initediblk; i++) { if (sblock.fs_magic == FS_UFS1_MAGIC) { dp1->di_gen = newfs_random(); dp1++; } else { dp2->di_gen = newfs_random(); dp2++; } } wtfs(fsbtodb(&sblock, cgsblock(&sblock, cylno)), iobufsize, iobuf); /* * For the old file system, we have to initialize all the inodes. */ if (Oflag == 1) { for (i = 2 * sblock.fs_frag; i < sblock.fs_ipg / INOPF(&sblock); i += sblock.fs_frag) { dp1 = (struct ufs1_dinode *)(&iobuf[start]); for (j = 0; j < INOPB(&sblock); j++) { dp1->di_gen = newfs_random(); dp1++; } wtfs(fsbtodb(&sblock, cgimin(&sblock, cylno) + i), sblock.fs_bsize, &iobuf[start]); } } } /* * initialize the file system */ #define ROOTLINKCNT 3 static struct direct root_dir[] = { { UFS_ROOTINO, sizeof(struct direct), DT_DIR, 1, "." }, { UFS_ROOTINO, sizeof(struct direct), DT_DIR, 2, ".." }, { UFS_ROOTINO + 1, sizeof(struct direct), DT_DIR, 5, ".snap" }, }; #define SNAPLINKCNT 2 static struct direct snap_dir[] = { { UFS_ROOTINO + 1, sizeof(struct direct), DT_DIR, 1, "." }, { UFS_ROOTINO, sizeof(struct direct), DT_DIR, 2, ".." }, }; void fsinit(time_t utime) { union dinode node; struct group *grp; gid_t gid; int entries; memset(&node, 0, sizeof node); if ((grp = getgrnam("operator")) != NULL) { gid = grp->gr_gid; } else { warnx("Cannot retrieve operator gid, using gid 0."); gid = 0; } entries = (nflag) ? ROOTLINKCNT - 1: ROOTLINKCNT; if (sblock.fs_magic == FS_UFS1_MAGIC) { /* * initialize the node */ node.dp1.di_atime = utime; node.dp1.di_mtime = utime; node.dp1.di_ctime = utime; /* * create the root directory */ node.dp1.di_mode = IFDIR | UMASK; node.dp1.di_nlink = entries; node.dp1.di_size = makedir(root_dir, entries); node.dp1.di_db[0] = alloc(sblock.fs_fsize, node.dp1.di_mode); node.dp1.di_blocks = btodb(fragroundup(&sblock, node.dp1.di_size)); wtfs(fsbtodb(&sblock, node.dp1.di_db[0]), sblock.fs_fsize, iobuf); iput(&node, UFS_ROOTINO); if (!nflag) { /* * create the .snap directory */ node.dp1.di_mode |= 020; node.dp1.di_gid = gid; node.dp1.di_nlink = SNAPLINKCNT; node.dp1.di_size = makedir(snap_dir, SNAPLINKCNT); node.dp1.di_db[0] = alloc(sblock.fs_fsize, node.dp1.di_mode); node.dp1.di_blocks = btodb(fragroundup(&sblock, node.dp1.di_size)); wtfs(fsbtodb(&sblock, node.dp1.di_db[0]), sblock.fs_fsize, iobuf); iput(&node, UFS_ROOTINO + 1); } } else { /* * initialize the node */ node.dp2.di_atime = utime; node.dp2.di_mtime = utime; node.dp2.di_ctime = utime; node.dp2.di_birthtime = utime; /* * create the root directory */ node.dp2.di_mode = IFDIR | UMASK; node.dp2.di_nlink = entries; node.dp2.di_size = makedir(root_dir, entries); node.dp2.di_db[0] = alloc(sblock.fs_fsize, node.dp2.di_mode); node.dp2.di_blocks = btodb(fragroundup(&sblock, node.dp2.di_size)); wtfs(fsbtodb(&sblock, node.dp2.di_db[0]), sblock.fs_fsize, iobuf); iput(&node, UFS_ROOTINO); if (!nflag) { /* * create the .snap directory */ node.dp2.di_mode |= 020; node.dp2.di_gid = gid; node.dp2.di_nlink = SNAPLINKCNT; node.dp2.di_size = makedir(snap_dir, SNAPLINKCNT); node.dp2.di_db[0] = alloc(sblock.fs_fsize, node.dp2.di_mode); node.dp2.di_blocks = btodb(fragroundup(&sblock, node.dp2.di_size)); wtfs(fsbtodb(&sblock, node.dp2.di_db[0]), sblock.fs_fsize, iobuf); iput(&node, UFS_ROOTINO + 1); } } } /* * construct a set of directory entries in "iobuf". * return size of directory. */ int makedir(struct direct *protodir, int entries) { char *cp; int i, spcleft; spcleft = DIRBLKSIZ; memset(iobuf, 0, DIRBLKSIZ); for (cp = iobuf, i = 0; i < entries - 1; i++) { protodir[i].d_reclen = DIRSIZ(0, &protodir[i]); memmove(cp, &protodir[i], protodir[i].d_reclen); cp += protodir[i].d_reclen; spcleft -= protodir[i].d_reclen; } protodir[i].d_reclen = spcleft; memmove(cp, &protodir[i], DIRSIZ(0, &protodir[i])); return (DIRBLKSIZ); } /* * allocate a block or frag */ ufs2_daddr_t alloc(int size, int mode) { int i, blkno, frag; uint d; bread(&disk, part_ofs + fsbtodb(&sblock, cgtod(&sblock, 0)), (char *)&acg, sblock.fs_cgsize); if (acg.cg_magic != CG_MAGIC) { printf("cg 0: bad magic number\n"); exit(38); } if (acg.cg_cs.cs_nbfree == 0) { printf("first cylinder group ran out of space\n"); exit(39); } for (d = 0; d < acg.cg_ndblk; d += sblock.fs_frag) if (isblock(&sblock, cg_blksfree(&acg), d / sblock.fs_frag)) goto goth; printf("internal error: can't find block in cyl 0\n"); exit(40); goth: blkno = fragstoblks(&sblock, d); clrblock(&sblock, cg_blksfree(&acg), blkno); if (sblock.fs_contigsumsize > 0) clrbit(cg_clustersfree(&acg), blkno); acg.cg_cs.cs_nbfree--; sblock.fs_cstotal.cs_nbfree--; fscs[0].cs_nbfree--; if (mode & IFDIR) { acg.cg_cs.cs_ndir++; sblock.fs_cstotal.cs_ndir++; fscs[0].cs_ndir++; } if (size != sblock.fs_bsize) { frag = howmany(size, sblock.fs_fsize); fscs[0].cs_nffree += sblock.fs_frag - frag; sblock.fs_cstotal.cs_nffree += sblock.fs_frag - frag; acg.cg_cs.cs_nffree += sblock.fs_frag - frag; acg.cg_frsum[sblock.fs_frag - frag]++; for (i = frag; i < sblock.fs_frag; i++) setbit(cg_blksfree(&acg), d + i); } /* XXX cgwrite(&disk, 0)??? */ + cgckhash(&acg); wtfs(fsbtodb(&sblock, cgtod(&sblock, 0)), sblock.fs_cgsize, (char *)&acg); return ((ufs2_daddr_t)d); } /* * Allocate an inode on the disk */ void iput(union dinode *ip, ino_t ino) { ufs2_daddr_t d; bread(&disk, part_ofs + fsbtodb(&sblock, cgtod(&sblock, 0)), (char *)&acg, sblock.fs_cgsize); if (acg.cg_magic != CG_MAGIC) { printf("cg 0: bad magic number\n"); exit(31); } acg.cg_cs.cs_nifree--; setbit(cg_inosused(&acg), ino); + cgckhash(&acg); wtfs(fsbtodb(&sblock, cgtod(&sblock, 0)), sblock.fs_cgsize, (char *)&acg); sblock.fs_cstotal.cs_nifree--; fscs[0].cs_nifree--; if (ino >= (unsigned long)sblock.fs_ipg * sblock.fs_ncg) { printf("fsinit: inode value out of range (%ju).\n", (uintmax_t)ino); exit(32); } d = fsbtodb(&sblock, ino_to_fsba(&sblock, ino)); bread(&disk, part_ofs + d, (char *)iobuf, sblock.fs_bsize); if (sblock.fs_magic == FS_UFS1_MAGIC) ((struct ufs1_dinode *)iobuf)[ino_to_fsbo(&sblock, ino)] = ip->dp1; else ((struct ufs2_dinode *)iobuf)[ino_to_fsbo(&sblock, ino)] = ip->dp2; wtfs(d, sblock.fs_bsize, (char *)iobuf); } /* * possibly write to disk */ static void wtfs(ufs2_daddr_t bno, int size, char *bf) { if (Nflag) return; if (bwrite(&disk, part_ofs + bno, bf, size) < 0) err(36, "wtfs: %d bytes at sector %jd", size, (intmax_t)bno); +} + +/* + * Calculate the check-hash of the cylinder group. + */ +static void +cgckhash(cgp) + struct cg *cgp; +{ + + if ((sblock.fs_metackhash & CK_CYLGRP) == 0) + return; + cgp->cg_ckhash = 0; + cgp->cg_ckhash = calculate_crc32c(~0L, (void *)cgp, sblock.fs_cgsize); } /* * check if a block is available */ static int isblock(struct fs *fs, unsigned char *cp, int h) { unsigned char mask; switch (fs->fs_frag) { case 8: return (cp[h] == 0xff); case 4: mask = 0x0f << ((h & 0x1) << 2); return ((cp[h >> 1] & mask) == mask); case 2: mask = 0x03 << ((h & 0x3) << 1); return ((cp[h >> 2] & mask) == mask); case 1: mask = 0x01 << (h & 0x7); return ((cp[h >> 3] & mask) == mask); default: fprintf(stderr, "isblock bad fs_frag %d\n", fs->fs_frag); return (0); } } /* * take a block out of the map */ static void clrblock(struct fs *fs, unsigned char *cp, int h) { switch ((fs)->fs_frag) { case 8: cp[h] = 0; return; case 4: cp[h >> 1] &= ~(0x0f << ((h & 0x1) << 2)); return; case 2: cp[h >> 2] &= ~(0x03 << ((h & 0x3) << 1)); return; case 1: cp[h >> 3] &= ~(0x01 << (h & 0x7)); return; default: fprintf(stderr, "clrblock bad fs_frag %d\n", fs->fs_frag); return; } } /* * put a block into the map */ static void setblock(struct fs *fs, unsigned char *cp, int h) { switch (fs->fs_frag) { case 8: cp[h] = 0xff; return; case 4: cp[h >> 1] |= (0x0f << ((h & 0x1) << 2)); return; case 2: cp[h >> 2] |= (0x03 << ((h & 0x3) << 1)); return; case 1: cp[h >> 3] |= (0x01 << (h & 0x7)); return; default: fprintf(stderr, "setblock bad fs_frag %d\n", fs->fs_frag); return; } } /* * Determine the number of characters in a * single line. */ static int charsperline(void) { int columns; char *cp; struct winsize ws; columns = 0; if (ioctl(0, TIOCGWINSZ, &ws) != -1) columns = ws.ws_col; if (columns == 0 && (cp = getenv("COLUMNS"))) columns = atoi(cp); if (columns == 0) columns = 80; /* last resort */ return (columns); } static int ilog2(int val) { u_int n; for (n = 0; n < sizeof(n) * CHAR_BIT; n++) if (1 << n == val) return (n); errx(1, "ilog2: %d is not a power of 2\n", val); } /* * For the regression test, return predictable random values. * Otherwise use a true random number generator. */ static u_int32_t newfs_random(void) { static int nextnum = 1; if (Rflag) return (nextnum++); return (arc4random()); } Index: head/sys/kern/vfs_bio.c =================================================================== --- head/sys/kern/vfs_bio.c (revision 323922) +++ head/sys/kern/vfs_bio.c (revision 323923) @@ -1,5028 +1,5048 @@ /*- * Copyright (c) 2004 Poul-Henning Kamp * Copyright (c) 1994,1997 John S. Dyson * Copyright (c) 2013 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * this file contains a new buffer I/O scheme implementing a coherent * VM object and buffer cache scheme. Pains have been taken to make * sure that the performance degradation associated with schemes such * as this is not realized. * * Author: John S. Dyson * Significant help during the development and debugging phases * had been provided by David Greenman, also of the FreeBSD core team. * * see man buf(9) for more info. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_compat.h" #include "opt_swap.h" static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer"); struct bio_ops bioops; /* I/O operation notification */ struct buf_ops buf_ops_bio = { .bop_name = "buf_ops_bio", .bop_write = bufwrite, .bop_strategy = bufstrategy, .bop_sync = bufsync, .bop_bdflush = bufbdflush, }; static struct buf *buf; /* buffer header pool */ extern struct buf *swbuf; /* Swap buffer header pool. */ caddr_t unmapped_buf; /* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */ struct proc *bufdaemonproc; struct proc *bufspacedaemonproc; static int inmem(struct vnode *vp, daddr_t blkno); static void vm_hold_free_pages(struct buf *bp, int newbsize); static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to); static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m); static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m); static void vfs_clean_pages_dirty_buf(struct buf *bp); static void vfs_setdirty_locked_object(struct buf *bp); static void vfs_vmio_invalidate(struct buf *bp); static void vfs_vmio_truncate(struct buf *bp, int npages); static void vfs_vmio_extend(struct buf *bp, int npages, int size); static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno); +static void breada(struct vnode *, daddr_t *, int *, int, struct ucred *, int, + void (*)(struct buf *)); static int buf_flush(struct vnode *vp, int); static int buf_recycle(bool); static int buf_scan(bool); static int flushbufqueues(struct vnode *, int, int); static void buf_daemon(void); static void bremfreel(struct buf *bp); static __inline void bd_wakeup(void); static int sysctl_runningspace(SYSCTL_HANDLER_ARGS); static void bufkva_reclaim(vmem_t *, int); static void bufkva_free(struct buf *); static int buf_import(void *, void **, int, int); static void buf_release(void *, void **, int); static void maxbcachebuf_adjust(void); #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) static int sysctl_bufspace(SYSCTL_HANDLER_ARGS); #endif int vmiodirenable = TRUE; SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0, "Use the VM system for directory writes"); long runningbufspace; SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0, "Amount of presently outstanding async buffer io"); static long bufspace; #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD, &bufspace, 0, sysctl_bufspace, "L", "Virtual memory used for buffers"); #else SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0, "Physical memory used for buffers"); #endif static long bufkvaspace; SYSCTL_LONG(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, 0, "Kernel virtual memory used for buffers"); static long maxbufspace; SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, &maxbufspace, 0, "Maximum allowed value of bufspace (including metadata)"); static long bufmallocspace; SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0, "Amount of malloced memory for buffers"); static long maxbufmallocspace; SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0, "Maximum amount of malloced memory for buffers"); static long lobufspace; SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RW, &lobufspace, 0, "Minimum amount of buffers we want to have"); long hibufspace; SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RW, &hibufspace, 0, "Maximum allowed value of bufspace (excluding metadata)"); long bufspacethresh; SYSCTL_LONG(_vfs, OID_AUTO, bufspacethresh, CTLFLAG_RW, &bufspacethresh, 0, "Bufspace consumed before waking the daemon to free some"); static int buffreekvacnt; SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0, "Number of times we have freed the KVA space from some buffer"); static int bufdefragcnt; SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0, "Number of times we have had to repeat buffer allocation to defragment"); static long lorunningspace; SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | CTLFLAG_RW, &lorunningspace, 0, sysctl_runningspace, "L", "Minimum preferred space used for in-progress I/O"); static long hirunningspace; SYSCTL_PROC(_vfs, OID_AUTO, hirunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | CTLFLAG_RW, &hirunningspace, 0, sysctl_runningspace, "L", "Maximum amount of space to use for in-progress I/O"); int dirtybufferflushes; SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes, 0, "Number of bdwrite to bawrite conversions to limit dirty buffers"); int bdwriteskip; SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip, 0, "Number of buffers supplied to bdwrite with snapshot deadlock risk"); int altbufferflushes; SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes, 0, "Number of fsync flushes to limit dirty buffers"); static int recursiveflushes; SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes, 0, "Number of flushes skipped due to being recursive"); static int numdirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0, "Number of buffers that are dirty (has unwritten changes) at the moment"); static int lodirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0, "How many buffers we want to have free before bufdaemon can sleep"); static int hidirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0, "When the number of dirty buffers is considered severe"); int dirtybufthresh; SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh, 0, "Number of bdwrite to bawrite conversions to clear dirty buffers"); static int numfreebuffers; SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0, "Number of free buffers"); static int lofreebuffers; SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0, "Target number of free buffers"); static int hifreebuffers; SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0, "Threshold for clean buffer recycling"); static int getnewbufcalls; SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0, "Number of calls to getnewbuf"); static int getnewbufrestarts; SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0, "Number of times getnewbuf has had to restart a buffer acquisition"); static int mappingrestarts; SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0, "Number of times getblk has had to restart a buffer mapping for " "unmapped buffer"); static int numbufallocfails; SYSCTL_INT(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, &numbufallocfails, 0, "Number of times buffer allocations failed"); static int flushbufqtarget = 100; SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0, "Amount of work to do in flushbufqueues when helping bufdaemon"); static long notbufdflushes; SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, ¬bufdflushes, 0, "Number of dirty buffer flushes done by the bufdaemon helpers"); static long barrierwrites; SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0, "Number of barrier writes"); SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD, &unmapped_buf_allowed, 0, "Permit the use of the unmapped i/o"); int maxbcachebuf = MAXBCACHEBUF; SYSCTL_INT(_vfs, OID_AUTO, maxbcachebuf, CTLFLAG_RDTUN, &maxbcachebuf, 0, "Maximum size of a buffer cache block"); /* * This lock synchronizes access to bd_request. */ static struct mtx_padalign __exclusive_cache_line bdlock; /* * This lock protects the runningbufreq and synchronizes runningbufwakeup and * waitrunningbufspace(). */ static struct mtx_padalign __exclusive_cache_line rbreqlock; /* * Lock that protects needsbuffer and the sleeps/wakeups surrounding it. */ static struct rwlock_padalign __exclusive_cache_line nblock; /* * Lock that protects bdirtywait. */ static struct mtx_padalign __exclusive_cache_line bdirtylock; /* * Wakeup point for bufdaemon, as well as indicator of whether it is already * active. Set to 1 when the bufdaemon is already "on" the queue, 0 when it * is idling. */ static int bd_request; /* * Request/wakeup point for the bufspace daemon. */ static int bufspace_request; /* * Request for the buf daemon to write more buffers than is indicated by * lodirtybuf. This may be necessary to push out excess dependencies or * defragment the address space where a simple count of the number of dirty * buffers is insufficient to characterize the demand for flushing them. */ static int bd_speedupreq; /* * Synchronization (sleep/wakeup) variable for active buffer space requests. * Set when wait starts, cleared prior to wakeup(). * Used in runningbufwakeup() and waitrunningbufspace(). */ static int runningbufreq; /* * Synchronization (sleep/wakeup) variable for buffer requests. * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done * by and/or. * Used in numdirtywakeup(), bufspace_wakeup(), bwillwrite(), * getnewbuf(), and getblk(). */ static volatile int needsbuffer; /* * Synchronization for bwillwrite() waiters. */ static int bdirtywait; /* * Definitions for the buffer free lists. */ #define QUEUE_NONE 0 /* on no queue */ #define QUEUE_EMPTY 1 /* empty buffer headers */ #define QUEUE_DIRTY 2 /* B_DELWRI buffers */ #define QUEUE_CLEAN 3 /* non-B_DELWRI buffers */ #define QUEUE_SENTINEL 1024 /* not an queue index, but mark for sentinel */ /* Maximum number of clean buffer queues. */ #define CLEAN_QUEUES 16 /* Configured number of clean queues. */ static int clean_queues; /* Maximum number of buffer queues. */ #define BUFFER_QUEUES (QUEUE_CLEAN + CLEAN_QUEUES) /* Queues for free buffers with various properties */ static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } }; #ifdef INVARIANTS static int bq_len[BUFFER_QUEUES]; #endif /* * Lock for each bufqueue */ static struct mtx_padalign __exclusive_cache_line bqlocks[BUFFER_QUEUES]; /* * per-cpu empty buffer cache. */ uma_zone_t buf_zone; /* * Single global constant for BUF_WMESG, to avoid getting multiple references. * buf_wmesg is referred from macros. */ const char *buf_wmesg = BUF_WMESG; static int sysctl_runningspace(SYSCTL_HANDLER_ARGS) { long value; int error; value = *(long *)arg1; error = sysctl_handle_long(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); mtx_lock(&rbreqlock); if (arg1 == &hirunningspace) { if (value < lorunningspace) error = EINVAL; else hirunningspace = value; } else { KASSERT(arg1 == &lorunningspace, ("%s: unknown arg1", __func__)); if (value > hirunningspace) error = EINVAL; else lorunningspace = value; } mtx_unlock(&rbreqlock); return (error); } #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) static int sysctl_bufspace(SYSCTL_HANDLER_ARGS) { long lvalue; int ivalue; if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long)) return (sysctl_handle_long(oidp, arg1, arg2, req)); lvalue = *(long *)arg1; if (lvalue > INT_MAX) /* On overflow, still write out a long to trigger ENOMEM. */ return (sysctl_handle_long(oidp, &lvalue, 0, req)); ivalue = lvalue; return (sysctl_handle_int(oidp, &ivalue, 0, req)); } #endif static int bqcleanq(void) { static int nextq; return ((atomic_fetchadd_int(&nextq, 1) % clean_queues) + QUEUE_CLEAN); } static int bqisclean(int qindex) { return (qindex >= QUEUE_CLEAN && qindex < QUEUE_CLEAN + CLEAN_QUEUES); } /* * bqlock: * * Return the appropriate queue lock based on the index. */ static inline struct mtx * bqlock(int qindex) { return (struct mtx *)&bqlocks[qindex]; } /* * bdirtywakeup: * * Wakeup any bwillwrite() waiters. */ static void bdirtywakeup(void) { mtx_lock(&bdirtylock); if (bdirtywait) { bdirtywait = 0; wakeup(&bdirtywait); } mtx_unlock(&bdirtylock); } /* * bdirtysub: * * Decrement the numdirtybuffers count by one and wakeup any * threads blocked in bwillwrite(). */ static void bdirtysub(void) { if (atomic_fetchadd_int(&numdirtybuffers, -1) == (lodirtybuffers + hidirtybuffers) / 2) bdirtywakeup(); } /* * bdirtyadd: * * Increment the numdirtybuffers count by one and wakeup the buf * daemon if needed. */ static void bdirtyadd(void) { /* * Only do the wakeup once as we cross the boundary. The * buf daemon will keep running until the condition clears. */ if (atomic_fetchadd_int(&numdirtybuffers, 1) == (lodirtybuffers + hidirtybuffers) / 2) bd_wakeup(); } /* * bufspace_wakeup: * * Called when buffer space is potentially available for recovery. * getnewbuf() will block on this flag when it is unable to free * sufficient buffer space. Buffer space becomes recoverable when * bp's get placed back in the queues. */ static void bufspace_wakeup(void) { /* * If someone is waiting for bufspace, wake them up. * * Since needsbuffer is set prior to doing an additional queue * scan it is safe to check for the flag prior to acquiring the * lock. The thread that is preparing to scan again before * blocking would discover the buf we released. */ if (needsbuffer) { rw_rlock(&nblock); if (atomic_cmpset_int(&needsbuffer, 1, 0) == 1) wakeup(__DEVOLATILE(void *, &needsbuffer)); rw_runlock(&nblock); } } /* * bufspace_daemonwakeup: * * Wakeup the daemon responsible for freeing clean bufs. */ static void bufspace_daemonwakeup(void) { rw_rlock(&nblock); if (bufspace_request == 0) { bufspace_request = 1; wakeup(&bufspace_request); } rw_runlock(&nblock); } /* * bufspace_adjust: * * Adjust the reported bufspace for a KVA managed buffer, possibly * waking any waiters. */ static void bufspace_adjust(struct buf *bp, int bufsize) { long space; int diff; KASSERT((bp->b_flags & B_MALLOC) == 0, ("bufspace_adjust: malloc buf %p", bp)); diff = bufsize - bp->b_bufsize; if (diff < 0) { atomic_subtract_long(&bufspace, -diff); bufspace_wakeup(); } else { space = atomic_fetchadd_long(&bufspace, diff); /* Wake up the daemon on the transition. */ if (space < bufspacethresh && space + diff >= bufspacethresh) bufspace_daemonwakeup(); } bp->b_bufsize = bufsize; } /* * bufspace_reserve: * * Reserve bufspace before calling allocbuf(). metadata has a * different space limit than data. */ static int bufspace_reserve(int size, bool metadata) { long limit; long space; if (metadata) limit = maxbufspace; else limit = hibufspace; do { space = bufspace; if (space + size > limit) return (ENOSPC); } while (atomic_cmpset_long(&bufspace, space, space + size) == 0); /* Wake up the daemon on the transition. */ if (space < bufspacethresh && space + size >= bufspacethresh) bufspace_daemonwakeup(); return (0); } /* * bufspace_release: * * Release reserved bufspace after bufspace_adjust() has consumed it. */ static void bufspace_release(int size) { atomic_subtract_long(&bufspace, size); bufspace_wakeup(); } /* * bufspace_wait: * * Wait for bufspace, acting as the buf daemon if a locked vnode is * supplied. needsbuffer must be set in a safe fashion prior to * polling for space. The operation must be re-tried on return. */ static void bufspace_wait(struct vnode *vp, int gbflags, int slpflag, int slptimeo) { struct thread *td; int error, fl, norunbuf; if ((gbflags & GB_NOWAIT_BD) != 0) return; td = curthread; rw_wlock(&nblock); while (needsbuffer != 0) { if (vp != NULL && vp->v_type != VCHR && (td->td_pflags & TDP_BUFNEED) == 0) { rw_wunlock(&nblock); /* * getblk() is called with a vnode locked, and * some majority of the dirty buffers may as * well belong to the vnode. Flushing the * buffers there would make a progress that * cannot be achieved by the buf_daemon, that * cannot lock the vnode. */ norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) | (td->td_pflags & TDP_NORUNNINGBUF); /* * Play bufdaemon. The getnewbuf() function * may be called while the thread owns lock * for another dirty buffer for the same * vnode, which makes it impossible to use * VOP_FSYNC() there, due to the buffer lock * recursion. */ td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF; fl = buf_flush(vp, flushbufqtarget); td->td_pflags &= norunbuf; rw_wlock(&nblock); if (fl != 0) continue; if (needsbuffer == 0) break; } error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock, (PRIBIO + 4) | slpflag, "newbuf", slptimeo); if (error != 0) break; } rw_wunlock(&nblock); } /* * bufspace_daemon: * * buffer space management daemon. Tries to maintain some marginal * amount of free buffer space so that requesting processes neither * block nor work to reclaim buffers. */ static void bufspace_daemon(void) { for (;;) { kproc_suspend_check(bufspacedaemonproc); /* * Free buffers from the clean queue until we meet our * targets. * * Theory of operation: The buffer cache is most efficient * when some free buffer headers and space are always * available to getnewbuf(). This daemon attempts to prevent * the excessive blocking and synchronization associated * with shortfall. It goes through three phases according * demand: * * 1) The daemon wakes up voluntarily once per-second * during idle periods when the counters are below * the wakeup thresholds (bufspacethresh, lofreebuffers). * * 2) The daemon wakes up as we cross the thresholds * ahead of any potential blocking. This may bounce * slightly according to the rate of consumption and * release. * * 3) The daemon and consumers are starved for working * clean buffers. This is the 'bufspace' sleep below * which will inefficiently trade bufs with bqrelse * until we return to condition 2. */ while (bufspace > lobufspace || numfreebuffers < hifreebuffers) { if (buf_recycle(false) != 0) { atomic_set_int(&needsbuffer, 1); if (buf_recycle(false) != 0) { rw_wlock(&nblock); if (needsbuffer) rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock, PRIBIO|PDROP, "bufspace", hz/10); else rw_wunlock(&nblock); } } maybe_yield(); } /* * Re-check our limits under the exclusive nblock. */ rw_wlock(&nblock); if (bufspace < bufspacethresh && numfreebuffers > lofreebuffers) { bufspace_request = 0; rw_sleep(&bufspace_request, &nblock, PRIBIO|PDROP, "-", hz); } else rw_wunlock(&nblock); } } static struct kproc_desc bufspace_kp = { "bufspacedaemon", bufspace_daemon, &bufspacedaemonproc }; SYSINIT(bufspacedaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &bufspace_kp); /* * bufmallocadjust: * * Adjust the reported bufspace for a malloc managed buffer, possibly * waking any waiters. */ static void bufmallocadjust(struct buf *bp, int bufsize) { int diff; KASSERT((bp->b_flags & B_MALLOC) != 0, ("bufmallocadjust: non-malloc buf %p", bp)); diff = bufsize - bp->b_bufsize; if (diff < 0) atomic_subtract_long(&bufmallocspace, -diff); else atomic_add_long(&bufmallocspace, diff); bp->b_bufsize = bufsize; } /* * runningwakeup: * * Wake up processes that are waiting on asynchronous writes to fall * below lorunningspace. */ static void runningwakeup(void) { mtx_lock(&rbreqlock); if (runningbufreq) { runningbufreq = 0; wakeup(&runningbufreq); } mtx_unlock(&rbreqlock); } /* * runningbufwakeup: * * Decrement the outstanding write count according. */ void runningbufwakeup(struct buf *bp) { long space, bspace; bspace = bp->b_runningbufspace; if (bspace == 0) return; space = atomic_fetchadd_long(&runningbufspace, -bspace); KASSERT(space >= bspace, ("runningbufspace underflow %ld %ld", space, bspace)); bp->b_runningbufspace = 0; /* * Only acquire the lock and wakeup on the transition from exceeding * the threshold to falling below it. */ if (space < lorunningspace) return; if (space - bspace > lorunningspace) return; runningwakeup(); } /* * waitrunningbufspace() * * runningbufspace is a measure of the amount of I/O currently * running. This routine is used in async-write situations to * prevent creating huge backups of pending writes to a device. * Only asynchronous writes are governed by this function. * * This does NOT turn an async write into a sync write. It waits * for earlier writes to complete and generally returns before the * caller's write has reached the device. */ void waitrunningbufspace(void) { mtx_lock(&rbreqlock); while (runningbufspace > hirunningspace) { runningbufreq = 1; msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0); } mtx_unlock(&rbreqlock); } /* * vfs_buf_test_cache: * * Called when a buffer is extended. This function clears the B_CACHE * bit if the newly extended portion of the buffer does not contain * valid data. */ static __inline void vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, vm_page_t m) { VM_OBJECT_ASSERT_LOCKED(m->object); if (bp->b_flags & B_CACHE) { int base = (foff + off) & PAGE_MASK; if (vm_page_is_valid(m, base, size) == 0) bp->b_flags &= ~B_CACHE; } } /* Wake up the buffer daemon if necessary */ static __inline void bd_wakeup(void) { mtx_lock(&bdlock); if (bd_request == 0) { bd_request = 1; wakeup(&bd_request); } mtx_unlock(&bdlock); } /* * Adjust the maxbcachbuf tunable. */ static void maxbcachebuf_adjust(void) { int i; /* * maxbcachebuf must be a power of 2 >= MAXBSIZE. */ i = 2; while (i * 2 <= maxbcachebuf) i *= 2; maxbcachebuf = i; if (maxbcachebuf < MAXBSIZE) maxbcachebuf = MAXBSIZE; if (maxbcachebuf > MAXPHYS) maxbcachebuf = MAXPHYS; if (bootverbose != 0 && maxbcachebuf != MAXBCACHEBUF) printf("maxbcachebuf=%d\n", maxbcachebuf); } /* * bd_speedup - speedup the buffer cache flushing code */ void bd_speedup(void) { int needwake; mtx_lock(&bdlock); needwake = 0; if (bd_speedupreq == 0 || bd_request == 0) needwake = 1; bd_speedupreq = 1; bd_request = 1; if (needwake) wakeup(&bd_request); mtx_unlock(&bdlock); } #ifndef NSWBUF_MIN #define NSWBUF_MIN 16 #endif #ifdef __i386__ #define TRANSIENT_DENOM 5 #else #define TRANSIENT_DENOM 10 #endif /* * Calculating buffer cache scaling values and reserve space for buffer * headers. This is called during low level kernel initialization and * may be called more then once. We CANNOT write to the memory area * being reserved at this time. */ caddr_t kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est) { int tuned_nbuf; long maxbuf, maxbuf_sz, buf_sz, biotmap_sz; /* * physmem_est is in pages. Convert it to kilobytes (assumes * PAGE_SIZE is >= 1K) */ physmem_est = physmem_est * (PAGE_SIZE / 1024); maxbcachebuf_adjust(); /* * The nominal buffer size (and minimum KVA allocation) is BKVASIZE. * For the first 64MB of ram nominally allocate sufficient buffers to * cover 1/4 of our ram. Beyond the first 64MB allocate additional * buffers to cover 1/10 of our ram over 64MB. When auto-sizing * the buffer cache we limit the eventual kva reservation to * maxbcache bytes. * * factor represents the 1/4 x ram conversion. */ if (nbuf == 0) { int factor = 4 * BKVASIZE / 1024; nbuf = 50; if (physmem_est > 4096) nbuf += min((physmem_est - 4096) / factor, 65536 / factor); if (physmem_est > 65536) nbuf += min((physmem_est - 65536) * 2 / (factor * 5), 32 * 1024 * 1024 / (factor * 5)); if (maxbcache && nbuf > maxbcache / BKVASIZE) nbuf = maxbcache / BKVASIZE; tuned_nbuf = 1; } else tuned_nbuf = 0; /* XXX Avoid unsigned long overflows later on with maxbufspace. */ maxbuf = (LONG_MAX / 3) / BKVASIZE; if (nbuf > maxbuf) { if (!tuned_nbuf) printf("Warning: nbufs lowered from %d to %ld\n", nbuf, maxbuf); nbuf = maxbuf; } /* * Ideal allocation size for the transient bio submap is 10% * of the maximal space buffer map. This roughly corresponds * to the amount of the buffer mapped for typical UFS load. * * Clip the buffer map to reserve space for the transient * BIOs, if its extent is bigger than 90% (80% on i386) of the * maximum buffer map extent on the platform. * * The fall-back to the maxbuf in case of maxbcache unset, * allows to not trim the buffer KVA for the architectures * with ample KVA space. */ if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) { maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE; buf_sz = (long)nbuf * BKVASIZE; if (buf_sz < maxbuf_sz / TRANSIENT_DENOM * (TRANSIENT_DENOM - 1)) { /* * There is more KVA than memory. Do not * adjust buffer map size, and assign the rest * of maxbuf to transient map. */ biotmap_sz = maxbuf_sz - buf_sz; } else { /* * Buffer map spans all KVA we could afford on * this platform. Give 10% (20% on i386) of * the buffer map to the transient bio map. */ biotmap_sz = buf_sz / TRANSIENT_DENOM; buf_sz -= biotmap_sz; } if (biotmap_sz / INT_MAX > MAXPHYS) bio_transient_maxcnt = INT_MAX; else bio_transient_maxcnt = biotmap_sz / MAXPHYS; /* * Artificially limit to 1024 simultaneous in-flight I/Os * using the transient mapping. */ if (bio_transient_maxcnt > 1024) bio_transient_maxcnt = 1024; if (tuned_nbuf) nbuf = buf_sz / BKVASIZE; } /* * swbufs are used as temporary holders for I/O, such as paging I/O. * We have no less then 16 and no more then 256. */ nswbuf = min(nbuf / 4, 256); TUNABLE_INT_FETCH("kern.nswbuf", &nswbuf); if (nswbuf < NSWBUF_MIN) nswbuf = NSWBUF_MIN; /* * Reserve space for the buffer cache buffers */ swbuf = (void *)v; v = (caddr_t)(swbuf + nswbuf); buf = (void *)v; v = (caddr_t)(buf + nbuf); return(v); } /* Initialize the buffer subsystem. Called before use of any buffers. */ void bufinit(void) { struct buf *bp; int i; KASSERT(maxbcachebuf >= MAXBSIZE, ("maxbcachebuf (%d) must be >= MAXBSIZE (%d)\n", maxbcachebuf, MAXBSIZE)); mtx_init(&bqlocks[QUEUE_DIRTY], "bufq dirty lock", NULL, MTX_DEF); mtx_init(&bqlocks[QUEUE_EMPTY], "bufq empty lock", NULL, MTX_DEF); for (i = QUEUE_CLEAN; i < QUEUE_CLEAN + CLEAN_QUEUES; i++) mtx_init(&bqlocks[i], "bufq clean lock", NULL, MTX_DEF); mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF); rw_init(&nblock, "needsbuffer lock"); mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF); mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF); /* next, make a null set of free lists */ for (i = 0; i < BUFFER_QUEUES; i++) TAILQ_INIT(&bufqueues[i]); unmapped_buf = (caddr_t)kva_alloc(MAXPHYS); /* finally, initialize each buffer header and stick on empty q */ for (i = 0; i < nbuf; i++) { bp = &buf[i]; bzero(bp, sizeof *bp); bp->b_flags = B_INVAL; bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = QUEUE_EMPTY; bp->b_xflags = 0; bp->b_data = bp->b_kvabase = unmapped_buf; LIST_INIT(&bp->b_dep); BUF_LOCKINIT(bp); TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); #ifdef INVARIANTS bq_len[QUEUE_EMPTY]++; #endif } /* * maxbufspace is the absolute maximum amount of buffer space we are * allowed to reserve in KVM and in real terms. The absolute maximum * is nominally used by metadata. hibufspace is the nominal maximum * used by most other requests. The differential is required to * ensure that metadata deadlocks don't occur. * * maxbufspace is based on BKVASIZE. Allocating buffers larger then * this may result in KVM fragmentation which is not handled optimally * by the system. XXX This is less true with vmem. We could use * PAGE_SIZE. */ maxbufspace = (long)nbuf * BKVASIZE; hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - maxbcachebuf * 10); lobufspace = (hibufspace / 20) * 19; /* 95% */ bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2; /* * Note: The 16 MiB upper limit for hirunningspace was chosen * arbitrarily and may need further tuning. It corresponds to * 128 outstanding write IO requests (if IO size is 128 KiB), * which fits with many RAID controllers' tagged queuing limits. * The lower 1 MiB limit is the historical upper limit for * hirunningspace. */ hirunningspace = lmax(lmin(roundup(hibufspace / 64, maxbcachebuf), 16 * 1024 * 1024), 1024 * 1024); lorunningspace = roundup((hirunningspace * 2) / 3, maxbcachebuf); /* * Limit the amount of malloc memory since it is wired permanently into * the kernel space. Even though this is accounted for in the buffer * allocation, we don't want the malloced region to grow uncontrolled. * The malloc scheme improves memory utilization significantly on * average (small) directories. */ maxbufmallocspace = hibufspace / 20; /* * Reduce the chance of a deadlock occurring by limiting the number * of delayed-write dirty buffers we allow to stack up. */ hidirtybuffers = nbuf / 4 + 20; dirtybufthresh = hidirtybuffers * 9 / 10; numdirtybuffers = 0; /* * To support extreme low-memory systems, make sure hidirtybuffers * cannot eat up all available buffer space. This occurs when our * minimum cannot be met. We try to size hidirtybuffers to 3/4 our * buffer space assuming BKVASIZE'd buffers. */ while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) { hidirtybuffers >>= 1; } lodirtybuffers = hidirtybuffers / 2; /* * lofreebuffers should be sufficient to avoid stalling waiting on * buf headers under heavy utilization. The bufs in per-cpu caches * are counted as free but will be unavailable to threads executing * on other cpus. * * hifreebuffers is the free target for the bufspace daemon. This * should be set appropriately to limit work per-iteration. */ lofreebuffers = MIN((nbuf / 25) + (20 * mp_ncpus), 128 * mp_ncpus); hifreebuffers = (3 * lofreebuffers) / 2; numfreebuffers = nbuf; /* Setup the kva and free list allocators. */ vmem_set_reclaim(buffer_arena, bufkva_reclaim); buf_zone = uma_zcache_create("buf free cache", sizeof(struct buf), NULL, NULL, NULL, NULL, buf_import, buf_release, NULL, 0); /* * Size the clean queue according to the amount of buffer space. * One queue per-256mb up to the max. More queues gives better * concurrency but less accurate LRU. */ clean_queues = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_QUEUES); } #ifdef INVARIANTS static inline void vfs_buf_check_mapped(struct buf *bp) { KASSERT(bp->b_kvabase != unmapped_buf, ("mapped buf: b_kvabase was not updated %p", bp)); KASSERT(bp->b_data != unmapped_buf, ("mapped buf: b_data was not updated %p", bp)); KASSERT(bp->b_data < unmapped_buf || bp->b_data >= unmapped_buf + MAXPHYS, ("b_data + b_offset unmapped %p", bp)); } static inline void vfs_buf_check_unmapped(struct buf *bp) { KASSERT(bp->b_data == unmapped_buf, ("unmapped buf: corrupted b_data %p", bp)); } #define BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp) #define BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp) #else #define BUF_CHECK_MAPPED(bp) do {} while (0) #define BUF_CHECK_UNMAPPED(bp) do {} while (0) #endif static int isbufbusy(struct buf *bp) { if (((bp->b_flags & B_INVAL) == 0 && BUF_ISLOCKED(bp)) || ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI)) return (1); return (0); } /* * Shutdown the system cleanly to prepare for reboot, halt, or power off. */ void bufshutdown(int show_busybufs) { static int first_buf_printf = 1; struct buf *bp; int iter, nbusy, pbusy; #ifndef PREEMPTION int subiter; #endif /* * Sync filesystems for shutdown */ wdog_kern_pat(WD_LASTVAL); sys_sync(curthread, NULL); /* * With soft updates, some buffers that are * written will be remarked as dirty until other * buffers are written. */ for (iter = pbusy = 0; iter < 20; iter++) { nbusy = 0; for (bp = &buf[nbuf]; --bp >= buf; ) if (isbufbusy(bp)) nbusy++; if (nbusy == 0) { if (first_buf_printf) printf("All buffers synced."); break; } if (first_buf_printf) { printf("Syncing disks, buffers remaining... "); first_buf_printf = 0; } printf("%d ", nbusy); if (nbusy < pbusy) iter = 0; pbusy = nbusy; wdog_kern_pat(WD_LASTVAL); sys_sync(curthread, NULL); #ifdef PREEMPTION /* * Drop Giant and spin for a while to allow * interrupt threads to run. */ DROP_GIANT(); DELAY(50000 * iter); PICKUP_GIANT(); #else /* * Drop Giant and context switch several times to * allow interrupt threads to run. */ DROP_GIANT(); for (subiter = 0; subiter < 50 * iter; subiter++) { thread_lock(curthread); mi_switch(SW_VOL, NULL); thread_unlock(curthread); DELAY(1000); } PICKUP_GIANT(); #endif } printf("\n"); /* * Count only busy local buffers to prevent forcing * a fsck if we're just a client of a wedged NFS server */ nbusy = 0; for (bp = &buf[nbuf]; --bp >= buf; ) { if (isbufbusy(bp)) { #if 0 /* XXX: This is bogus. We should probably have a BO_REMOTE flag instead */ if (bp->b_dev == NULL) { TAILQ_REMOVE(&mountlist, bp->b_vp->v_mount, mnt_list); continue; } #endif nbusy++; if (show_busybufs > 0) { printf( "%d: buf:%p, vnode:%p, flags:%0x, blkno:%jd, lblkno:%jd, buflock:", nbusy, bp, bp->b_vp, bp->b_flags, (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno); BUF_LOCKPRINTINFO(bp); if (show_busybufs > 1) vn_printf(bp->b_vp, "vnode content: "); } } } if (nbusy) { /* * Failed to sync all blocks. Indicate this and don't * unmount filesystems (thus forcing an fsck on reboot). */ printf("Giving up on %d buffers\n", nbusy); DELAY(5000000); /* 5 seconds */ } else { if (!first_buf_printf) printf("Final sync complete\n"); /* * Unmount filesystems */ if (panicstr == NULL) vfs_unmountall(); } swapoff_all(); DELAY(100000); /* wait for console output to finish */ } static void bpmap_qenter(struct buf *bp) { BUF_CHECK_MAPPED(bp); /* * bp->b_data is relative to bp->b_offset, but * bp->b_offset may be offset into the first page. */ bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data); pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | (vm_offset_t)(bp->b_offset & PAGE_MASK)); } /* * binsfree: * * Insert the buffer into the appropriate free list. */ static void binsfree(struct buf *bp, int qindex) { struct mtx *olock, *nlock; if (qindex != QUEUE_EMPTY) { BUF_ASSERT_XLOCKED(bp); } /* * Stick to the same clean queue for the lifetime of the buf to * limit locking below. Otherwise pick ont sequentially. */ if (qindex == QUEUE_CLEAN) { if (bqisclean(bp->b_qindex)) qindex = bp->b_qindex; else qindex = bqcleanq(); } /* * Handle delayed bremfree() processing. */ nlock = bqlock(qindex); if (bp->b_flags & B_REMFREE) { olock = bqlock(bp->b_qindex); mtx_lock(olock); bremfreel(bp); if (olock != nlock) { mtx_unlock(olock); mtx_lock(nlock); } } else mtx_lock(nlock); if (bp->b_qindex != QUEUE_NONE) panic("binsfree: free buffer onto another queue???"); bp->b_qindex = qindex; if (bp->b_flags & B_AGE) TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); else TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist); #ifdef INVARIANTS bq_len[bp->b_qindex]++; #endif mtx_unlock(nlock); } /* * buf_free: * * Free a buffer to the buf zone once it no longer has valid contents. */ static void buf_free(struct buf *bp) { if (bp->b_flags & B_REMFREE) bremfreef(bp); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 1"); if (bp->b_rcred != NOCRED) { crfree(bp->b_rcred); bp->b_rcred = NOCRED; } if (bp->b_wcred != NOCRED) { crfree(bp->b_wcred); bp->b_wcred = NOCRED; } if (!LIST_EMPTY(&bp->b_dep)) buf_deallocate(bp); bufkva_free(bp); BUF_UNLOCK(bp); uma_zfree(buf_zone, bp); atomic_add_int(&numfreebuffers, 1); bufspace_wakeup(); } /* * buf_import: * * Import bufs into the uma cache from the buf list. The system still * expects a static array of bufs and much of the synchronization * around bufs assumes type stable storage. As a result, UMA is used * only as a per-cpu cache of bufs still maintained on a global list. */ static int buf_import(void *arg, void **store, int cnt, int flags) { struct buf *bp; int i; mtx_lock(&bqlocks[QUEUE_EMPTY]); for (i = 0; i < cnt; i++) { bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); if (bp == NULL) break; bremfreel(bp); store[i] = bp; } mtx_unlock(&bqlocks[QUEUE_EMPTY]); return (i); } /* * buf_release: * * Release bufs from the uma cache back to the buffer queues. */ static void buf_release(void *arg, void **store, int cnt) { int i; for (i = 0; i < cnt; i++) binsfree(store[i], QUEUE_EMPTY); } /* * buf_alloc: * * Allocate an empty buffer header. */ static struct buf * buf_alloc(void) { struct buf *bp; bp = uma_zalloc(buf_zone, M_NOWAIT); if (bp == NULL) { bufspace_daemonwakeup(); atomic_add_int(&numbufallocfails, 1); return (NULL); } /* * Wake-up the bufspace daemon on transition. */ if (atomic_fetchadd_int(&numfreebuffers, -1) == lofreebuffers) bufspace_daemonwakeup(); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) panic("getnewbuf_empty: Locked buf %p on free queue.", bp); KASSERT(bp->b_vp == NULL, ("bp: %p still has vnode %p.", bp, bp->b_vp)); KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0, ("invalid buffer %p flags %#x", bp, bp->b_flags)); KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0, ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags)); KASSERT(bp->b_npages == 0, ("bp: %p still has %d vm pages\n", bp, bp->b_npages)); KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp)); KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp)); bp->b_flags = 0; bp->b_ioflags = 0; bp->b_xflags = 0; bp->b_vflags = 0; bp->b_vp = NULL; bp->b_blkno = bp->b_lblkno = 0; bp->b_offset = NOOFFSET; bp->b_iodone = 0; bp->b_error = 0; bp->b_resid = 0; bp->b_bcount = 0; bp->b_npages = 0; bp->b_dirtyoff = bp->b_dirtyend = 0; bp->b_bufobj = NULL; bp->b_data = bp->b_kvabase = unmapped_buf; bp->b_fsprivate1 = NULL; bp->b_fsprivate2 = NULL; bp->b_fsprivate3 = NULL; LIST_INIT(&bp->b_dep); return (bp); } /* * buf_qrecycle: * * Free a buffer from the given bufqueue. kva controls whether the * freed buf must own some kva resources. This is used for * defragmenting. */ static int buf_qrecycle(int qindex, bool kva) { struct buf *bp, *nbp; if (kva) atomic_add_int(&bufdefragcnt, 1); nbp = NULL; mtx_lock(&bqlocks[qindex]); nbp = TAILQ_FIRST(&bufqueues[qindex]); /* * Run scan, possibly freeing data and/or kva mappings on the fly * depending. */ while ((bp = nbp) != NULL) { /* * Calculate next bp (we can only use it if we do not * release the bqlock). */ nbp = TAILQ_NEXT(bp, b_freelist); /* * If we are defragging then we need a buffer with * some kva to reclaim. */ if (kva && bp->b_kvasize == 0) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) continue; /* * Skip buffers with background writes in progress. */ if ((bp->b_vflags & BV_BKGRDINPROG) != 0) { BUF_UNLOCK(bp); continue; } KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistent queue %d bp %p", qindex, bp)); /* * NOTE: nbp is now entirely invalid. We can only restart * the scan from this point on. */ bremfreel(bp); mtx_unlock(&bqlocks[qindex]); /* * Requeue the background write buffer with error and * restart the scan. */ if ((bp->b_vflags & BV_BKGRDERR) != 0) { bqrelse(bp); mtx_lock(&bqlocks[qindex]); nbp = TAILQ_FIRST(&bufqueues[qindex]); continue; } bp->b_flags |= B_INVAL; brelse(bp); return (0); } mtx_unlock(&bqlocks[qindex]); return (ENOBUFS); } /* * buf_recycle: * * Iterate through all clean queues until we find a buf to recycle or * exhaust the search. */ static int buf_recycle(bool kva) { int qindex, first_qindex; qindex = first_qindex = bqcleanq(); do { if (buf_qrecycle(qindex, kva) == 0) return (0); if (++qindex == QUEUE_CLEAN + clean_queues) qindex = QUEUE_CLEAN; } while (qindex != first_qindex); return (ENOBUFS); } /* * buf_scan: * * Scan the clean queues looking for a buffer to recycle. needsbuffer * is set on failure so that the caller may optionally bufspace_wait() * in a race-free fashion. */ static int buf_scan(bool defrag) { int error; /* * To avoid heavy synchronization and wakeup races we set * needsbuffer and re-poll before failing. This ensures that * no frees can be missed between an unsuccessful poll and * going to sleep in a synchronized fashion. */ if ((error = buf_recycle(defrag)) != 0) { atomic_set_int(&needsbuffer, 1); bufspace_daemonwakeup(); error = buf_recycle(defrag); } if (error == 0) atomic_add_int(&getnewbufrestarts, 1); return (error); } /* * bremfree: * * Mark the buffer for removal from the appropriate free list. * */ void bremfree(struct buf *bp) { CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT((bp->b_flags & B_REMFREE) == 0, ("bremfree: buffer %p already marked for delayed removal.", bp)); KASSERT(bp->b_qindex != QUEUE_NONE, ("bremfree: buffer %p not on a queue.", bp)); BUF_ASSERT_XLOCKED(bp); bp->b_flags |= B_REMFREE; } /* * bremfreef: * * Force an immediate removal from a free list. Used only in nfs when * it abuses the b_freelist pointer. */ void bremfreef(struct buf *bp) { struct mtx *qlock; qlock = bqlock(bp->b_qindex); mtx_lock(qlock); bremfreel(bp); mtx_unlock(qlock); } /* * bremfreel: * * Removes a buffer from the free list, must be called with the * correct qlock held. */ static void bremfreel(struct buf *bp) { CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_qindex != QUEUE_NONE, ("bremfreel: buffer %p not on a queue.", bp)); if (bp->b_qindex != QUEUE_EMPTY) { BUF_ASSERT_XLOCKED(bp); } mtx_assert(bqlock(bp->b_qindex), MA_OWNED); TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); #ifdef INVARIANTS KASSERT(bq_len[bp->b_qindex] >= 1, ("queue %d underflow", bp->b_qindex)); bq_len[bp->b_qindex]--; #endif bp->b_qindex = QUEUE_NONE; bp->b_flags &= ~B_REMFREE; } /* * bufkva_free: * * Free the kva allocation for a buffer. * */ static void bufkva_free(struct buf *bp) { #ifdef INVARIANTS if (bp->b_kvasize == 0) { KASSERT(bp->b_kvabase == unmapped_buf && bp->b_data == unmapped_buf, ("Leaked KVA space on %p", bp)); } else if (buf_mapped(bp)) BUF_CHECK_MAPPED(bp); else BUF_CHECK_UNMAPPED(bp); #endif if (bp->b_kvasize == 0) return; vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase, bp->b_kvasize); atomic_subtract_long(&bufkvaspace, bp->b_kvasize); atomic_add_int(&buffreekvacnt, 1); bp->b_data = bp->b_kvabase = unmapped_buf; bp->b_kvasize = 0; } /* * bufkva_alloc: * * Allocate the buffer KVA and set b_kvasize and b_kvabase. */ static int bufkva_alloc(struct buf *bp, int maxsize, int gbflags) { vm_offset_t addr; int error; KASSERT((gbflags & GB_UNMAPPED) == 0 || (gbflags & GB_KVAALLOC) != 0, ("Invalid gbflags 0x%x in %s", gbflags, __func__)); bufkva_free(bp); addr = 0; error = vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr); if (error != 0) { /* * Buffer map is too fragmented. Request the caller * to defragment the map. */ return (error); } bp->b_kvabase = (caddr_t)addr; bp->b_kvasize = maxsize; atomic_add_long(&bufkvaspace, bp->b_kvasize); if ((gbflags & GB_UNMAPPED) != 0) { bp->b_data = unmapped_buf; BUF_CHECK_UNMAPPED(bp); } else { bp->b_data = bp->b_kvabase; BUF_CHECK_MAPPED(bp); } return (0); } /* * bufkva_reclaim: * * Reclaim buffer kva by freeing buffers holding kva. This is a vmem * callback that fires to avoid returning failure. */ static void bufkva_reclaim(vmem_t *vmem, int flags) { int i; for (i = 0; i < 5; i++) if (buf_scan(true) != 0) break; return; } - /* * Attempt to initiate asynchronous I/O on read-ahead blocks. We must * clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set, * the buffer is valid and we do not have to do anything. */ -void -breada(struct vnode * vp, daddr_t * rablkno, int * rabsize, - int cnt, struct ucred * cred) +static void +breada(struct vnode * vp, daddr_t * rablkno, int * rabsize, int cnt, + struct ucred * cred, int flags, void (*ckhashfunc)(struct buf *)) { struct buf *rabp; int i; for (i = 0; i < cnt; i++, rablkno++, rabsize++) { if (inmem(vp, *rablkno)) continue; rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0); - - if ((rabp->b_flags & B_CACHE) == 0) { - if (!TD_IS_IDLETHREAD(curthread)) { + if ((rabp->b_flags & B_CACHE) != 0) { + brelse(rabp); + continue; + } + if (!TD_IS_IDLETHREAD(curthread)) { #ifdef RACCT - if (racct_enable) { - PROC_LOCK(curproc); - racct_add_buf(curproc, rabp, 0); - PROC_UNLOCK(curproc); - } -#endif /* RACCT */ - curthread->td_ru.ru_inblock++; + if (racct_enable) { + PROC_LOCK(curproc); + racct_add_buf(curproc, rabp, 0); + PROC_UNLOCK(curproc); } - rabp->b_flags |= B_ASYNC; - rabp->b_flags &= ~B_INVAL; - rabp->b_ioflags &= ~BIO_ERROR; - rabp->b_iocmd = BIO_READ; - if (rabp->b_rcred == NOCRED && cred != NOCRED) - rabp->b_rcred = crhold(cred); - vfs_busy_pages(rabp, 0); - BUF_KERNPROC(rabp); - rabp->b_iooffset = dbtob(rabp->b_blkno); - bstrategy(rabp); - } else { - brelse(rabp); +#endif /* RACCT */ + curthread->td_ru.ru_inblock++; } + rabp->b_flags |= B_ASYNC; + rabp->b_flags &= ~B_INVAL; + if ((flags & GB_CKHASH) != 0) { + rabp->b_flags |= B_CKHASH; + rabp->b_ckhashcalc = ckhashfunc; + } + rabp->b_ioflags &= ~BIO_ERROR; + rabp->b_iocmd = BIO_READ; + if (rabp->b_rcred == NOCRED && cred != NOCRED) + rabp->b_rcred = crhold(cred); + vfs_busy_pages(rabp, 0); + BUF_KERNPROC(rabp); + rabp->b_iooffset = dbtob(rabp->b_blkno); + bstrategy(rabp); } } /* * Entry point for bread() and breadn() via #defines in sys/buf.h. * * Get a buffer with the specified data. Look in the cache first. We * must clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE * is set, the buffer is valid and we do not have to do anything, see * getblk(). Also starts asynchronous I/O on read-ahead blocks. * * Always return a NULL buffer pointer (in bpp) when returning an error. */ int breadn_flags(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno, - int *rabsize, int cnt, struct ucred *cred, int flags, struct buf **bpp) + int *rabsize, int cnt, struct ucred *cred, int flags, + void (*ckhashfunc)(struct buf *), struct buf **bpp) { struct buf *bp; - int rv = 0, readwait = 0; + int readwait, rv; CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size); /* * Can only return NULL if GB_LOCK_NOWAIT flag is specified. */ *bpp = bp = getblk(vp, blkno, size, 0, 0, flags); if (bp == NULL) return (EBUSY); - /* if not found in cache, do some I/O */ + /* + * If not found in cache, do some I/O + */ + readwait = 0; if ((bp->b_flags & B_CACHE) == 0) { if (!TD_IS_IDLETHREAD(curthread)) { #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, bp, 0); PROC_UNLOCK(curproc); } #endif /* RACCT */ curthread->td_ru.ru_inblock++; } bp->b_iocmd = BIO_READ; bp->b_flags &= ~B_INVAL; + if ((flags & GB_CKHASH) != 0) { + bp->b_flags |= B_CKHASH; + bp->b_ckhashcalc = ckhashfunc; + } bp->b_ioflags &= ~BIO_ERROR; if (bp->b_rcred == NOCRED && cred != NOCRED) bp->b_rcred = crhold(cred); vfs_busy_pages(bp, 0); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); ++readwait; } - breada(vp, rablkno, rabsize, cnt, cred); + /* + * Attempt to initiate asynchronous I/O on read-ahead blocks. + */ + breada(vp, rablkno, rabsize, cnt, cred, flags, ckhashfunc); + rv = 0; if (readwait) { rv = bufwait(bp); if (rv != 0) { brelse(bp); *bpp = NULL; } } return (rv); } /* * Write, release buffer on completion. (Done by iodone * if async). Do not bother writing anything if the buffer * is invalid. * * Note that we set B_CACHE here, indicating that buffer is * fully valid and thus cacheable. This is true even of NFS * now so we set it generally. This could be set either here * or in biodone() since the I/O is synchronous. We put it * here. */ int bufwrite(struct buf *bp) { int oldflags; struct vnode *vp; long space; int vp_md; CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if ((bp->b_bufobj->bo_flag & BO_DEAD) != 0) { bp->b_flags |= B_INVAL | B_RELBUF; bp->b_flags &= ~B_CACHE; brelse(bp); return (ENXIO); } if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } if (bp->b_flags & B_BARRIER) barrierwrites++; oldflags = bp->b_flags; BUF_ASSERT_HELD(bp); KASSERT(!(bp->b_vflags & BV_BKGRDINPROG), ("FFS background buffer should not get here %p", bp)); vp = bp->b_vp; if (vp) vp_md = vp->v_vflag & VV_MD; else vp_md = 0; /* * Mark the buffer clean. Increment the bufobj write count * before bundirty() call, to prevent other thread from seeing * empty dirty list and zero counter for writes in progress, * falsely indicating that the bufobj is clean. */ bufobj_wref(bp->b_bufobj); bundirty(bp); bp->b_flags &= ~B_DONE; bp->b_ioflags &= ~BIO_ERROR; bp->b_flags |= B_CACHE; bp->b_iocmd = BIO_WRITE; vfs_busy_pages(bp, 1); /* * Normal bwrites pipeline writes */ bp->b_runningbufspace = bp->b_bufsize; space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace); if (!TD_IS_IDLETHREAD(curthread)) { #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, bp, 1); PROC_UNLOCK(curproc); } #endif /* RACCT */ curthread->td_ru.ru_oublock++; } if (oldflags & B_ASYNC) BUF_KERNPROC(bp); bp->b_iooffset = dbtob(bp->b_blkno); buf_track(bp, __func__); bstrategy(bp); if ((oldflags & B_ASYNC) == 0) { int rtval = bufwait(bp); brelse(bp); return (rtval); } else if (space > hirunningspace) { /* * don't allow the async write to saturate the I/O * system. We will not deadlock here because * we are blocking waiting for I/O that is already in-progress * to complete. We do not block here if it is the update * or syncer daemon trying to clean up as that can lead * to deadlock. */ if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md) waitrunningbufspace(); } return (0); } void bufbdflush(struct bufobj *bo, struct buf *bp) { struct buf *nbp; if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) { (void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread); altbufferflushes++; } else if (bo->bo_dirty.bv_cnt > dirtybufthresh) { BO_LOCK(bo); /* * Try to find a buffer to flush. */ TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) { if ((nbp->b_vflags & BV_BKGRDINPROG) || BUF_LOCK(nbp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) continue; if (bp == nbp) panic("bdwrite: found ourselves"); BO_UNLOCK(bo); /* Don't countdeps with the bo lock held. */ if (buf_countdeps(nbp, 0)) { BO_LOCK(bo); BUF_UNLOCK(nbp); continue; } if (nbp->b_flags & B_CLUSTEROK) { vfs_bio_awrite(nbp); } else { bremfree(nbp); bawrite(nbp); } dirtybufferflushes++; break; } if (nbp == NULL) BO_UNLOCK(bo); } } /* * Delayed write. (Buffer is marked dirty). Do not bother writing * anything if the buffer is marked invalid. * * Note that since the buffer must be completely valid, we can safely * set B_CACHE. In fact, we have to set B_CACHE here rather then in * biodone() in order to prevent getblk from writing the buffer * out synchronously. */ void bdwrite(struct buf *bp) { struct thread *td = curthread; struct vnode *vp; struct bufobj *bo; CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT((bp->b_flags & B_BARRIER) == 0, ("Barrier request in delayed write %p", bp)); BUF_ASSERT_HELD(bp); if (bp->b_flags & B_INVAL) { brelse(bp); return; } /* * If we have too many dirty buffers, don't create any more. * If we are wildly over our limit, then force a complete * cleanup. Otherwise, just keep the situation from getting * out of control. Note that we have to avoid a recursive * disaster and not try to clean up after our own cleanup! */ vp = bp->b_vp; bo = bp->b_bufobj; if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) { td->td_pflags |= TDP_INBDFLUSH; BO_BDFLUSH(bo, bp); td->td_pflags &= ~TDP_INBDFLUSH; } else recursiveflushes++; bdirty(bp); /* * Set B_CACHE, indicating that the buffer is fully valid. This is * true even of NFS now. */ bp->b_flags |= B_CACHE; /* * This bmap keeps the system from needing to do the bmap later, * perhaps when the system is attempting to do a sync. Since it * is likely that the indirect block -- or whatever other datastructure * that the filesystem needs is still in memory now, it is a good * thing to do this. Note also, that if the pageout daemon is * requesting a sync -- there might not be enough memory to do * the bmap then... So, this is important to do. */ if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) { VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); } buf_track(bp, __func__); /* * Set the *dirty* buffer range based upon the VM system dirty * pages. * * Mark the buffer pages as clean. We need to do this here to * satisfy the vnode_pager and the pageout daemon, so that it * thinks that the pages have been "cleaned". Note that since * the pages are in a delayed write buffer -- the VFS layer * "will" see that the pages get written out on the next sync, * or perhaps the cluster will be completed. */ vfs_clean_pages_dirty_buf(bp); bqrelse(bp); /* * note: we cannot initiate I/O from a bdwrite even if we wanted to, * due to the softdep code. */ } /* * bdirty: * * Turn buffer into delayed write request. We must clear BIO_READ and * B_RELBUF, and we must set B_DELWRI. We reassign the buffer to * itself to properly update it in the dirty/clean lists. We mark it * B_DONE to ensure that any asynchronization of the buffer properly * clears B_DONE ( else a panic will occur later ). * * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty() * should only be called if the buffer is known-good. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * The buffer must be on QUEUE_NONE. */ void bdirty(struct buf *bp) { CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); BUF_ASSERT_HELD(bp); bp->b_flags &= ~(B_RELBUF); bp->b_iocmd = BIO_WRITE; if ((bp->b_flags & B_DELWRI) == 0) { bp->b_flags |= /* XXX B_DONE | */ B_DELWRI; reassignbuf(bp); bdirtyadd(); } } /* * bundirty: * * Clear B_DELWRI for buffer. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * The buffer must be on QUEUE_NONE. */ void bundirty(struct buf *bp) { CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex)); BUF_ASSERT_HELD(bp); if (bp->b_flags & B_DELWRI) { bp->b_flags &= ~B_DELWRI; reassignbuf(bp); bdirtysub(); } /* * Since it is now being written, we can clear its deferred write flag. */ bp->b_flags &= ~B_DEFERRED; } /* * bawrite: * * Asynchronous write. Start output on a buffer, but do not wait for * it to complete. The buffer is released when the output completes. * * bwrite() ( or the VOP routine anyway ) is responsible for handling * B_INVAL buffers. Not us. */ void bawrite(struct buf *bp) { bp->b_flags |= B_ASYNC; (void) bwrite(bp); } /* * babarrierwrite: * * Asynchronous barrier write. Start output on a buffer, but do not * wait for it to complete. Place a write barrier after this write so * that this buffer and all buffers written before it are committed to * the disk before any buffers written after this write are committed * to the disk. The buffer is released when the output completes. */ void babarrierwrite(struct buf *bp) { bp->b_flags |= B_ASYNC | B_BARRIER; (void) bwrite(bp); } /* * bbarrierwrite: * * Synchronous barrier write. Start output on a buffer and wait for * it to complete. Place a write barrier after this write so that * this buffer and all buffers written before it are committed to * the disk before any buffers written after this write are committed * to the disk. The buffer is released when the output completes. */ int bbarrierwrite(struct buf *bp) { bp->b_flags |= B_BARRIER; return (bwrite(bp)); } /* * bwillwrite: * * Called prior to the locking of any vnodes when we are expecting to * write. We do not want to starve the buffer cache with too many * dirty buffers so we block here. By blocking prior to the locking * of any vnodes we attempt to avoid the situation where a locked vnode * prevents the various system daemons from flushing related buffers. */ void bwillwrite(void) { if (numdirtybuffers >= hidirtybuffers) { mtx_lock(&bdirtylock); while (numdirtybuffers >= hidirtybuffers) { bdirtywait = 1; msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4), "flswai", 0); } mtx_unlock(&bdirtylock); } } /* * Return true if we have too many dirty buffers. */ int buf_dirty_count_severe(void) { return(numdirtybuffers >= hidirtybuffers); } /* * brelse: * * Release a busy buffer and, if requested, free its resources. The * buffer will be stashed in the appropriate bufqueue[] allowing it * to be accessed later as a cache entity or reused for other purposes. */ void brelse(struct buf *bp) { int qindex; /* * Many functions erroneously call brelse with a NULL bp under rare * error conditions. Simply return when called with a NULL bp. */ if (bp == NULL) return; CTR3(KTR_BUF, "brelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); KASSERT((bp->b_flags & B_VMIO) != 0 || (bp->b_flags & B_NOREUSE) == 0, ("brelse: non-VMIO buffer marked NOREUSE")); if (BUF_LOCKRECURSED(bp)) { /* * Do not process, in particular, do not handle the * B_INVAL/B_RELBUF and do not release to free list. */ BUF_UNLOCK(bp); return; } if (bp->b_flags & B_MANAGED) { bqrelse(bp); return; } if ((bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) { BO_LOCK(bp->b_bufobj); bp->b_vflags &= ~BV_BKGRDERR; BO_UNLOCK(bp->b_bufobj); bdirty(bp); } if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) && (bp->b_error != ENXIO || !LIST_EMPTY(&bp->b_dep)) && !(bp->b_flags & B_INVAL)) { /* * Failed write, redirty. All errors except ENXIO (which * means the device is gone) are expected to be potentially * transient - underlying media might work if tried again * after EIO, and memory might be available after an ENOMEM. * * Do this also for buffers that failed with ENXIO, but have * non-empty dependencies - the soft updates code might need * to access the buffer to untangle them. * * Must clear BIO_ERROR to prevent pages from being scrapped. */ bp->b_ioflags &= ~BIO_ERROR; bdirty(bp); } else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) || (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) { /* * Either a failed read I/O, or we were asked to free or not * cache the buffer, or we failed to write to a device that's * no longer present. */ bp->b_flags |= B_INVAL; if (!LIST_EMPTY(&bp->b_dep)) buf_deallocate(bp); if (bp->b_flags & B_DELWRI) bdirtysub(); bp->b_flags &= ~(B_DELWRI | B_CACHE); if ((bp->b_flags & B_VMIO) == 0) { allocbuf(bp, 0); if (bp->b_vp) brelvp(bp); } } /* * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_truncate() * is called with B_DELWRI set, the underlying pages may wind up * getting freed causing a previous write (bdwrite()) to get 'lost' * because pages associated with a B_DELWRI bp are marked clean. * * We still allow the B_INVAL case to call vfs_vmio_truncate(), even * if B_DELWRI is set. */ if (bp->b_flags & B_DELWRI) bp->b_flags &= ~B_RELBUF; /* * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer * constituted, not even NFS buffers now. Two flags effect this. If * B_INVAL, the struct buf is invalidated but the VM object is kept * around ( i.e. so it is trivial to reconstitute the buffer later ). * * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be * invalidated. BIO_ERROR cannot be set for a failed write unless the * buffer is also B_INVAL because it hits the re-dirtying code above. * * Normally we can do this whether a buffer is B_DELWRI or not. If * the buffer is an NFS buffer, it is tracking piecemeal writes or * the commit state and we cannot afford to lose the buffer. If the * buffer has a background write in progress, we need to keep it * around to prevent it from being reconstituted and starting a second * background write. */ if ((bp->b_flags & B_VMIO) && (bp->b_flags & B_NOCACHE || (bp->b_ioflags & BIO_ERROR && bp->b_iocmd == BIO_READ)) && !(bp->b_vp->v_mount != NULL && (bp->b_vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 && !vn_isdisk(bp->b_vp, NULL) && (bp->b_flags & B_DELWRI))) { vfs_vmio_invalidate(bp); allocbuf(bp, 0); } if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0 || (bp->b_flags & (B_DELWRI | B_NOREUSE)) == B_NOREUSE) { allocbuf(bp, 0); bp->b_flags &= ~B_NOREUSE; if (bp->b_vp != NULL) brelvp(bp); } /* * If the buffer has junk contents signal it and eventually * clean up B_DELWRI and diassociate the vnode so that gbincore() * doesn't find it. */ if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 || (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0) bp->b_flags |= B_INVAL; if (bp->b_flags & B_INVAL) { if (bp->b_flags & B_DELWRI) bundirty(bp); if (bp->b_vp) brelvp(bp); } buf_track(bp, __func__); /* buffers with no memory */ if (bp->b_bufsize == 0) { buf_free(bp); return; } /* buffers with junk contents */ if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) || (bp->b_ioflags & BIO_ERROR)) { bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 2"); qindex = QUEUE_CLEAN; bp->b_flags |= B_AGE; /* remaining buffers */ } else if (bp->b_flags & B_DELWRI) qindex = QUEUE_DIRTY; else qindex = QUEUE_CLEAN; binsfree(bp, qindex); bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT); if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("brelse: not dirty"); /* unlock */ BUF_UNLOCK(bp); if (qindex == QUEUE_CLEAN) bufspace_wakeup(); } /* * Release a buffer back to the appropriate queue but do not try to free * it. The buffer is expected to be used again soon. * * bqrelse() is used by bdwrite() to requeue a delayed write, and used by * biodone() to requeue an async I/O on completion. It is also used when * known good buffers need to be requeued but we think we may need the data * again soon. * * XXX we should be able to leave the B_RELBUF hint set on completion. */ void bqrelse(struct buf *bp) { int qindex; CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); qindex = QUEUE_NONE; if (BUF_LOCKRECURSED(bp)) { /* do not release to free list */ BUF_UNLOCK(bp); return; } bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); if (bp->b_flags & B_MANAGED) { if (bp->b_flags & B_REMFREE) bremfreef(bp); goto out; } /* buffers with stale but valid contents */ if ((bp->b_flags & B_DELWRI) != 0 || (bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) { BO_LOCK(bp->b_bufobj); bp->b_vflags &= ~BV_BKGRDERR; BO_UNLOCK(bp->b_bufobj); qindex = QUEUE_DIRTY; } else { if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("bqrelse: not dirty"); if ((bp->b_flags & B_NOREUSE) != 0) { brelse(bp); return; } qindex = QUEUE_CLEAN; } binsfree(bp, qindex); out: buf_track(bp, __func__); /* unlock */ BUF_UNLOCK(bp); if (qindex == QUEUE_CLEAN) bufspace_wakeup(); } /* * Complete I/O to a VMIO backed page. Validate the pages as appropriate, * restore bogus pages. */ static void vfs_vmio_iodone(struct buf *bp) { vm_ooffset_t foff; vm_page_t m; vm_object_t obj; struct vnode *vp; int i, iosize, resid; bool bogus; obj = bp->b_bufobj->bo_object; KASSERT(obj->paging_in_progress >= bp->b_npages, ("vfs_vmio_iodone: paging in progress(%d) < b_npages(%d)", obj->paging_in_progress, bp->b_npages)); vp = bp->b_vp; KASSERT(vp->v_holdcnt > 0, ("vfs_vmio_iodone: vnode %p has zero hold count", vp)); KASSERT(vp->v_object != NULL, ("vfs_vmio_iodone: vnode %p has no vm_object", vp)); foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_vmio_iodone: bp %p has no buffer offset", bp)); bogus = false; iosize = bp->b_bcount - bp->b_resid; VM_OBJECT_WLOCK(obj); for (i = 0; i < bp->b_npages; i++) { resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff; if (resid > iosize) resid = iosize; /* * cleanup bogus pages, restoring the originals */ m = bp->b_pages[i]; if (m == bogus_page) { bogus = true; m = vm_page_lookup(obj, OFF_TO_IDX(foff)); if (m == NULL) panic("biodone: page disappeared!"); bp->b_pages[i] = m; } else if ((bp->b_iocmd == BIO_READ) && resid > 0) { /* * In the write case, the valid and clean bits are * already changed correctly ( see bdwrite() ), so we * only need to do this here in the read case. */ KASSERT((m->dirty & vm_page_bits(foff & PAGE_MASK, resid)) == 0, ("vfs_vmio_iodone: page %p " "has unexpected dirty bits", m)); vfs_page_set_valid(bp, foff, m); } KASSERT(OFF_TO_IDX(foff) == m->pindex, ("vfs_vmio_iodone: foff(%jd)/pindex(%ju) mismatch", (intmax_t)foff, (uintmax_t)m->pindex)); vm_page_sunbusy(m); foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; iosize -= resid; } vm_object_pip_wakeupn(obj, bp->b_npages); VM_OBJECT_WUNLOCK(obj); if (bogus && buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } } /* * Unwire a page held by a buf and place it on the appropriate vm queue. */ static void vfs_vmio_unwire(struct buf *bp, vm_page_t m) { bool freed; vm_page_lock(m); if (vm_page_unwire(m, PQ_NONE)) { /* * Determine if the page should be freed before adding * it to the inactive queue. */ if (m->valid == 0) { freed = !vm_page_busied(m); if (freed) vm_page_free(m); } else if ((bp->b_flags & B_DIRECT) != 0) freed = vm_page_try_to_free(m); else freed = false; if (!freed) { /* * If the page is unlikely to be reused, let the * VM know. Otherwise, maintain LRU page * ordering and put the page at the tail of the * inactive queue. */ if ((bp->b_flags & B_NOREUSE) != 0) vm_page_deactivate_noreuse(m); else vm_page_deactivate(m); } } vm_page_unlock(m); } /* * Perform page invalidation when a buffer is released. The fully invalid * pages will be reclaimed later in vfs_vmio_truncate(). */ static void vfs_vmio_invalidate(struct buf *bp) { vm_object_t obj; vm_page_t m; int i, resid, poffset, presid; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages); } else BUF_CHECK_UNMAPPED(bp); /* * Get the base offset and length of the buffer. Note that * in the VMIO case if the buffer block size is not * page-aligned then b_data pointer may not be page-aligned. * But our b_pages[] array *IS* page aligned. * * block sizes less then DEV_BSIZE (usually 512) are not * supported due to the page granularity bits (m->valid, * m->dirty, etc...). * * See man buf(9) for more information */ obj = bp->b_bufobj->bo_object; resid = bp->b_bufsize; poffset = bp->b_offset & PAGE_MASK; VM_OBJECT_WLOCK(obj); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (m == bogus_page) panic("vfs_vmio_invalidate: Unexpected bogus page."); bp->b_pages[i] = NULL; presid = resid > (PAGE_SIZE - poffset) ? (PAGE_SIZE - poffset) : resid; KASSERT(presid >= 0, ("brelse: extra page")); while (vm_page_xbusied(m)) { vm_page_lock(m); VM_OBJECT_WUNLOCK(obj); vm_page_busy_sleep(m, "mbncsh", true); VM_OBJECT_WLOCK(obj); } if (pmap_page_wired_mappings(m) == 0) vm_page_set_invalid(m, poffset, presid); vfs_vmio_unwire(bp, m); resid -= presid; poffset = 0; } VM_OBJECT_WUNLOCK(obj); bp->b_npages = 0; } /* * Page-granular truncation of an existing VMIO buffer. */ static void vfs_vmio_truncate(struct buf *bp, int desiredpages) { vm_object_t obj; vm_page_t m; int i; if (bp->b_npages == desiredpages) return; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qremove((vm_offset_t)trunc_page((vm_offset_t)bp->b_data) + (desiredpages << PAGE_SHIFT), bp->b_npages - desiredpages); } else BUF_CHECK_UNMAPPED(bp); obj = bp->b_bufobj->bo_object; if (obj != NULL) VM_OBJECT_WLOCK(obj); for (i = desiredpages; i < bp->b_npages; i++) { m = bp->b_pages[i]; KASSERT(m != bogus_page, ("allocbuf: bogus page found")); bp->b_pages[i] = NULL; vfs_vmio_unwire(bp, m); } if (obj != NULL) VM_OBJECT_WUNLOCK(obj); bp->b_npages = desiredpages; } /* * Byte granular extension of VMIO buffers. */ static void vfs_vmio_extend(struct buf *bp, int desiredpages, int size) { /* * We are growing the buffer, possibly in a * byte-granular fashion. */ vm_object_t obj; vm_offset_t toff; vm_offset_t tinc; vm_page_t m; /* * Step 1, bring in the VM pages from the object, allocating * them if necessary. We must clear B_CACHE if these pages * are not valid for the range covered by the buffer. */ obj = bp->b_bufobj->bo_object; VM_OBJECT_WLOCK(obj); if (bp->b_npages < desiredpages) { /* * We must allocate system pages since blocking * here could interfere with paging I/O, no * matter which process we are. * * Only exclusive busy can be tested here. * Blocking on shared busy might lead to * deadlocks once allocbuf() is called after * pages are vfs_busy_pages(). */ (void)vm_page_grab_pages(obj, OFF_TO_IDX(bp->b_offset) + bp->b_npages, VM_ALLOC_SYSTEM | VM_ALLOC_IGN_SBUSY | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED, &bp->b_pages[bp->b_npages], desiredpages - bp->b_npages); bp->b_npages = desiredpages; } /* * Step 2. We've loaded the pages into the buffer, * we have to figure out if we can still have B_CACHE * set. Note that B_CACHE is set according to the * byte-granular range ( bcount and size ), not the * aligned range ( newbsize ). * * The VM test is against m->valid, which is DEV_BSIZE * aligned. Needless to say, the validity of the data * needs to also be DEV_BSIZE aligned. Note that this * fails with NFS if the server or some other client * extends the file's EOF. If our buffer is resized, * B_CACHE may remain set! XXX */ toff = bp->b_bcount; tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK); while ((bp->b_flags & B_CACHE) && toff < size) { vm_pindex_t pi; if (tinc > (size - toff)) tinc = size - toff; pi = ((bp->b_offset & PAGE_MASK) + toff) >> PAGE_SHIFT; m = bp->b_pages[pi]; vfs_buf_test_cache(bp, bp->b_offset, toff, tinc, m); toff += tinc; tinc = PAGE_SIZE; } VM_OBJECT_WUNLOCK(obj); /* * Step 3, fixup the KVA pmap. */ if (buf_mapped(bp)) bpmap_qenter(bp); else BUF_CHECK_UNMAPPED(bp); } /* * Check to see if a block at a particular lbn is available for a clustered * write. */ static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno) { struct buf *bpa; int match; match = 0; /* If the buf isn't in core skip it */ if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL) return (0); /* If the buf is busy we don't want to wait for it */ if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) return (0); /* Only cluster with valid clusterable delayed write buffers */ if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) != (B_DELWRI | B_CLUSTEROK)) goto done; if (bpa->b_bufsize != size) goto done; /* * Check to see if it is in the expected place on disk and that the * block has been mapped. */ if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno)) match = 1; done: BUF_UNLOCK(bpa); return (match); } /* * vfs_bio_awrite: * * Implement clustered async writes for clearing out B_DELWRI buffers. * This is much better then the old way of writing only one buffer at * a time. Note that we may not be presented with the buffers in the * correct order, so we search for the cluster in both directions. */ int vfs_bio_awrite(struct buf *bp) { struct bufobj *bo; int i; int j; daddr_t lblkno = bp->b_lblkno; struct vnode *vp = bp->b_vp; int ncl; int nwritten; int size; int maxcl; int gbflags; bo = &vp->v_bufobj; gbflags = (bp->b_data == unmapped_buf) ? GB_UNMAPPED : 0; /* * right now we support clustered writing only to regular files. If * we find a clusterable block we could be in the middle of a cluster * rather then at the beginning. */ if ((vp->v_type == VREG) && (vp->v_mount != 0) && /* Only on nodes that have the size info */ (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { size = vp->v_mount->mnt_stat.f_iosize; maxcl = MAXPHYS / size; BO_RLOCK(bo); for (i = 1; i < maxcl; i++) if (vfs_bio_clcheck(vp, size, lblkno + i, bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0) break; for (j = 1; i + j <= maxcl && j <= lblkno; j++) if (vfs_bio_clcheck(vp, size, lblkno - j, bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0) break; BO_RUNLOCK(bo); --j; ncl = i + j; /* * this is a possible cluster write */ if (ncl != 1) { BUF_UNLOCK(bp); nwritten = cluster_wbuild(vp, size, lblkno - j, ncl, gbflags); return (nwritten); } } bremfree(bp); bp->b_flags |= B_ASYNC; /* * default (old) behavior, writing out only one block * * XXX returns b_bufsize instead of b_bcount for nwritten? */ nwritten = bp->b_bufsize; (void) bwrite(bp); return (nwritten); } /* * getnewbuf_kva: * * Allocate KVA for an empty buf header according to gbflags. */ static int getnewbuf_kva(struct buf *bp, int gbflags, int maxsize) { if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_UNMAPPED) { /* * In order to keep fragmentation sane we only allocate kva * in BKVASIZE chunks. XXX with vmem we can do page size. */ maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; if (maxsize != bp->b_kvasize && bufkva_alloc(bp, maxsize, gbflags)) return (ENOSPC); } return (0); } /* * getnewbuf: * * Find and initialize a new buffer header, freeing up existing buffers * in the bufqueues as necessary. The new buffer is returned locked. * * We block if: * We have insufficient buffer headers * We have insufficient buffer space * buffer_arena is too fragmented ( space reservation fails ) * If we have to flush dirty buffers ( but we try to avoid this ) * * The caller is responsible for releasing the reserved bufspace after * allocbuf() is called. */ static struct buf * getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int maxsize, int gbflags) { struct buf *bp; bool metadata, reserved; bp = NULL; KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); if (!unmapped_buf_allowed) gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC); if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 || vp->v_type == VCHR) metadata = true; else metadata = false; atomic_add_int(&getnewbufcalls, 1); reserved = false; do { if (reserved == false && bufspace_reserve(maxsize, metadata) != 0) continue; reserved = true; if ((bp = buf_alloc()) == NULL) continue; if (getnewbuf_kva(bp, gbflags, maxsize) == 0) return (bp); break; } while(buf_scan(false) == 0); if (reserved) atomic_subtract_long(&bufspace, maxsize); if (bp != NULL) { bp->b_flags |= B_INVAL; brelse(bp); } bufspace_wait(vp, gbflags, slpflag, slptimeo); return (NULL); } /* * buf_daemon: * * buffer flushing daemon. Buffers are normally flushed by the * update daemon but if it cannot keep up this process starts to * take the load in an attempt to prevent getnewbuf() from blocking. */ static struct kproc_desc buf_kp = { "bufdaemon", buf_daemon, &bufdaemonproc }; SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp); static int buf_flush(struct vnode *vp, int target) { int flushed; flushed = flushbufqueues(vp, target, 0); if (flushed == 0) { /* * Could not find any buffers without rollback * dependencies, so just write the first one * in the hopes of eventually making progress. */ if (vp != NULL && target > 2) target /= 2; flushbufqueues(vp, target, 1); } return (flushed); } static void buf_daemon() { int lodirty; /* * This process needs to be suspended prior to shutdown sync. */ EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc, SHUTDOWN_PRI_LAST); /* * This process is allowed to take the buffer cache to the limit */ curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED; mtx_lock(&bdlock); for (;;) { bd_request = 0; mtx_unlock(&bdlock); kproc_suspend_check(bufdaemonproc); lodirty = lodirtybuffers; if (bd_speedupreq) { lodirty = numdirtybuffers / 2; bd_speedupreq = 0; } /* * Do the flush. Limit the amount of in-transit I/O we * allow to build up, otherwise we would completely saturate * the I/O system. */ while (numdirtybuffers > lodirty) { if (buf_flush(NULL, numdirtybuffers - lodirty) == 0) break; kern_yield(PRI_USER); } /* * Only clear bd_request if we have reached our low water * mark. The buf_daemon normally waits 1 second and * then incrementally flushes any dirty buffers that have * built up, within reason. * * If we were unable to hit our low water mark and couldn't * find any flushable buffers, we sleep for a short period * to avoid endless loops on unlockable buffers. */ mtx_lock(&bdlock); if (numdirtybuffers <= lodirtybuffers) { /* * We reached our low water mark, reset the * request and sleep until we are needed again. * The sleep is just so the suspend code works. */ bd_request = 0; /* * Do an extra wakeup in case dirty threshold * changed via sysctl and the explicit transition * out of shortfall was missed. */ bdirtywakeup(); if (runningbufspace <= lorunningspace) runningwakeup(); msleep(&bd_request, &bdlock, PVM, "psleep", hz); } else { /* * We couldn't find any flushable dirty buffers but * still have too many dirty buffers, we * have to sleep and try again. (rare) */ msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10); } } } /* * flushbufqueues: * * Try to flush a buffer in the dirty queue. We must be careful to * free up B_INVAL buffers instead of write them, which NFS is * particularly sensitive to. */ static int flushwithdeps = 0; SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps, 0, "Number of buffers flushed with dependecies that require rollbacks"); static int flushbufqueues(struct vnode *lvp, int target, int flushdeps) { struct buf *sentinel; struct vnode *vp; struct mount *mp; struct buf *bp; int hasdeps; int flushed; int queue; int error; bool unlock; flushed = 0; queue = QUEUE_DIRTY; bp = NULL; sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO); sentinel->b_qindex = QUEUE_SENTINEL; mtx_lock(&bqlocks[queue]); TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist); mtx_unlock(&bqlocks[queue]); while (flushed != target) { maybe_yield(); mtx_lock(&bqlocks[queue]); bp = TAILQ_NEXT(sentinel, b_freelist); if (bp != NULL) { TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist); TAILQ_INSERT_AFTER(&bufqueues[queue], bp, sentinel, b_freelist); } else { mtx_unlock(&bqlocks[queue]); break; } /* * Skip sentinels inserted by other invocations of the * flushbufqueues(), taking care to not reorder them. * * Only flush the buffers that belong to the * vnode locked by the curthread. */ if (bp->b_qindex == QUEUE_SENTINEL || (lvp != NULL && bp->b_vp != lvp)) { mtx_unlock(&bqlocks[queue]); continue; } error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL); mtx_unlock(&bqlocks[queue]); if (error != 0) continue; /* * BKGRDINPROG can only be set with the buf and bufobj * locks both held. We tolerate a race to clear it here. */ if ((bp->b_vflags & BV_BKGRDINPROG) != 0 || (bp->b_flags & B_DELWRI) == 0) { BUF_UNLOCK(bp); continue; } if (bp->b_flags & B_INVAL) { bremfreef(bp); brelse(bp); flushed++; continue; } if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) { if (flushdeps == 0) { BUF_UNLOCK(bp); continue; } hasdeps = 1; } else hasdeps = 0; /* * We must hold the lock on a vnode before writing * one of its buffers. Otherwise we may confuse, or * in the case of a snapshot vnode, deadlock the * system. * * The lock order here is the reverse of the normal * of vnode followed by buf lock. This is ok because * the NOWAIT will prevent deadlock. */ vp = bp->b_vp; if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { BUF_UNLOCK(bp); continue; } if (lvp == NULL) { unlock = true; error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT); } else { ASSERT_VOP_LOCKED(vp, "getbuf"); unlock = false; error = VOP_ISLOCKED(vp) == LK_EXCLUSIVE ? 0 : vn_lock(vp, LK_TRYUPGRADE); } if (error == 0) { CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if (curproc == bufdaemonproc) { vfs_bio_awrite(bp); } else { bremfree(bp); bwrite(bp); notbufdflushes++; } vn_finished_write(mp); if (unlock) VOP_UNLOCK(vp, 0); flushwithdeps += hasdeps; flushed++; /* * Sleeping on runningbufspace while holding * vnode lock leads to deadlock. */ if (curproc == bufdaemonproc && runningbufspace > hirunningspace) waitrunningbufspace(); continue; } vn_finished_write(mp); BUF_UNLOCK(bp); } mtx_lock(&bqlocks[queue]); TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist); mtx_unlock(&bqlocks[queue]); free(sentinel, M_TEMP); return (flushed); } /* * Check to see if a block is currently memory resident. */ struct buf * incore(struct bufobj *bo, daddr_t blkno) { struct buf *bp; BO_RLOCK(bo); bp = gbincore(bo, blkno); BO_RUNLOCK(bo); return (bp); } /* * Returns true if no I/O is needed to access the * associated VM object. This is like incore except * it also hunts around in the VM system for the data. */ static int inmem(struct vnode * vp, daddr_t blkno) { vm_object_t obj; vm_offset_t toff, tinc, size; vm_page_t m; vm_ooffset_t off; ASSERT_VOP_LOCKED(vp, "inmem"); if (incore(&vp->v_bufobj, blkno)) return 1; if (vp->v_mount == NULL) return 0; obj = vp->v_object; if (obj == NULL) return (0); size = PAGE_SIZE; if (size > vp->v_mount->mnt_stat.f_iosize) size = vp->v_mount->mnt_stat.f_iosize; off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize; VM_OBJECT_RLOCK(obj); for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { m = vm_page_lookup(obj, OFF_TO_IDX(off + toff)); if (!m) goto notinmem; tinc = size; if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK)) tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK); if (vm_page_is_valid(m, (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0) goto notinmem; } VM_OBJECT_RUNLOCK(obj); return 1; notinmem: VM_OBJECT_RUNLOCK(obj); return (0); } /* * Set the dirty range for a buffer based on the status of the dirty * bits in the pages comprising the buffer. The range is limited * to the size of the buffer. * * Tell the VM system that the pages associated with this buffer * are clean. This is used for delayed writes where the data is * going to go to disk eventually without additional VM intevention. * * Note that while we only really need to clean through to b_bcount, we * just go ahead and clean through to b_bufsize. */ static void vfs_clean_pages_dirty_buf(struct buf *bp) { vm_ooffset_t foff, noff, eoff; vm_page_t m; int i; if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0) return; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_clean_pages_dirty_buf: no buffer offset")); VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); vfs_drain_busy_pages(bp); vfs_setdirty_locked_object(bp); for (i = 0; i < bp->b_npages; i++) { noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; eoff = noff; if (eoff > bp->b_offset + bp->b_bufsize) eoff = bp->b_offset + bp->b_bufsize; m = bp->b_pages[i]; vfs_page_set_validclean(bp, foff, m); /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */ foff = noff; } VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); } static void vfs_setdirty_locked_object(struct buf *bp) { vm_object_t object; int i; object = bp->b_bufobj->bo_object; VM_OBJECT_ASSERT_WLOCKED(object); /* * We qualify the scan for modified pages on whether the * object has been flushed yet. */ if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) { vm_offset_t boffset; vm_offset_t eoffset; /* * test the pages to see if they have been modified directly * by users through the VM system. */ for (i = 0; i < bp->b_npages; i++) vm_page_test_dirty(bp->b_pages[i]); /* * Calculate the encompassing dirty range, boffset and eoffset, * (eoffset - boffset) bytes. */ for (i = 0; i < bp->b_npages; i++) { if (bp->b_pages[i]->dirty) break; } boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); for (i = bp->b_npages - 1; i >= 0; --i) { if (bp->b_pages[i]->dirty) { break; } } eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); /* * Fit it to the buffer. */ if (eoffset > bp->b_bcount) eoffset = bp->b_bcount; /* * If we have a good dirty range, merge with the existing * dirty range. */ if (boffset < eoffset) { if (bp->b_dirtyoff > boffset) bp->b_dirtyoff = boffset; if (bp->b_dirtyend < eoffset) bp->b_dirtyend = eoffset; } } } /* * Allocate the KVA mapping for an existing buffer. * If an unmapped buffer is provided but a mapped buffer is requested, take * also care to properly setup mappings between pages and KVA. */ static void bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags) { int bsize, maxsize, need_mapping, need_kva; off_t offset; need_mapping = bp->b_data == unmapped_buf && (gbflags & GB_UNMAPPED) == 0; need_kva = bp->b_kvabase == unmapped_buf && bp->b_data == unmapped_buf && (gbflags & GB_KVAALLOC) != 0; if (!need_mapping && !need_kva) return; BUF_CHECK_UNMAPPED(bp); if (need_mapping && bp->b_kvabase != unmapped_buf) { /* * Buffer is not mapped, but the KVA was already * reserved at the time of the instantiation. Use the * allocated space. */ goto has_addr; } /* * Calculate the amount of the address space we would reserve * if the buffer was mapped. */ bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize; KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize")); offset = blkno * bsize; maxsize = size + (offset & PAGE_MASK); maxsize = imax(maxsize, bsize); while (bufkva_alloc(bp, maxsize, gbflags) != 0) { if ((gbflags & GB_NOWAIT_BD) != 0) { /* * XXXKIB: defragmentation cannot * succeed, not sure what else to do. */ panic("GB_NOWAIT_BD and GB_UNMAPPED %p", bp); } atomic_add_int(&mappingrestarts, 1); bufspace_wait(bp->b_vp, gbflags, 0, 0); } has_addr: if (need_mapping) { /* b_offset is handled by bpmap_qenter. */ bp->b_data = bp->b_kvabase; BUF_CHECK_MAPPED(bp); bpmap_qenter(bp); } } /* * getblk: * * Get a block given a specified block and offset into a file/device. * The buffers B_DONE bit will be cleared on return, making it almost * ready for an I/O initiation. B_INVAL may or may not be set on * return. The caller should clear B_INVAL prior to initiating a * READ. * * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for * an existing buffer. * * For a VMIO buffer, B_CACHE is modified according to the backing VM. * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set * and then cleared based on the backing VM. If the previous buffer is * non-0-sized but invalid, B_CACHE will be cleared. * * If getblk() must create a new buffer, the new buffer is returned with * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which * case it is returned with B_INVAL clear and B_CACHE set based on the * backing VM. * * getblk() also forces a bwrite() for any B_DELWRI buffer whos * B_CACHE bit is clear. * * What this means, basically, is that the caller should use B_CACHE to * determine whether the buffer is fully valid or not and should clear * B_INVAL prior to issuing a read. If the caller intends to validate * the buffer by loading its data area with something, the caller needs * to clear B_INVAL. If the caller does this without issuing an I/O, * the caller should set B_CACHE ( as an optimization ), else the caller * should issue the I/O and biodone() will set B_CACHE if the I/O was * a write attempt or if it was a successful read. If the caller * intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR * prior to issuing the READ. biodone() will *not* clear B_INVAL. */ struct buf * getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo, int flags) { struct buf *bp; struct bufobj *bo; int bsize, error, maxsize, vmio; off_t offset; CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size); KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); ASSERT_VOP_LOCKED(vp, "getblk"); if (size > maxbcachebuf) panic("getblk: size(%d) > maxbcachebuf(%d)\n", size, maxbcachebuf); if (!unmapped_buf_allowed) flags &= ~(GB_UNMAPPED | GB_KVAALLOC); bo = &vp->v_bufobj; loop: BO_RLOCK(bo); bp = gbincore(bo, blkno); if (bp != NULL) { int lockflags; /* * Buffer is in-core. If the buffer is not busy nor managed, * it must be on a queue. */ lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK; if (flags & GB_LOCK_NOWAIT) lockflags |= LK_NOWAIT; error = BUF_TIMELOCK(bp, lockflags, BO_LOCKPTR(bo), "getblk", slpflag, slptimeo); /* * If we slept and got the lock we have to restart in case * the buffer changed identities. */ if (error == ENOLCK) goto loop; /* We timed out or were interrupted. */ else if (error) return (NULL); /* If recursed, assume caller knows the rules. */ else if (BUF_LOCKRECURSED(bp)) goto end; /* * The buffer is locked. B_CACHE is cleared if the buffer is * invalid. Otherwise, for a non-VMIO buffer, B_CACHE is set * and for a VMIO buffer B_CACHE is adjusted according to the * backing VM cache. */ if (bp->b_flags & B_INVAL) bp->b_flags &= ~B_CACHE; else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0) bp->b_flags |= B_CACHE; if (bp->b_flags & B_MANAGED) MPASS(bp->b_qindex == QUEUE_NONE); else bremfree(bp); /* * check for size inconsistencies for non-VMIO case. */ if (bp->b_bcount != size) { if ((bp->b_flags & B_VMIO) == 0 || (size > bp->b_kvasize)) { if (bp->b_flags & B_DELWRI) { bp->b_flags |= B_NOCACHE; bwrite(bp); } else { if (LIST_EMPTY(&bp->b_dep)) { bp->b_flags |= B_RELBUF; brelse(bp); } else { bp->b_flags |= B_NOCACHE; bwrite(bp); } } goto loop; } } /* * Handle the case of unmapped buffer which should * become mapped, or the buffer for which KVA * reservation is requested. */ bp_unmapped_get_kva(bp, blkno, size, flags); /* * If the size is inconsistent in the VMIO case, we can resize * the buffer. This might lead to B_CACHE getting set or * cleared. If the size has not changed, B_CACHE remains * unchanged from its previous state. */ allocbuf(bp, size); KASSERT(bp->b_offset != NOOFFSET, ("getblk: no buffer offset")); /* * A buffer with B_DELWRI set and B_CACHE clear must * be committed before we can return the buffer in * order to prevent the caller from issuing a read * ( due to B_CACHE not being set ) and overwriting * it. * * Most callers, including NFS and FFS, need this to * operate properly either because they assume they * can issue a read if B_CACHE is not set, or because * ( for example ) an uncached B_DELWRI might loop due * to softupdates re-dirtying the buffer. In the latter * case, B_CACHE is set after the first write completes, * preventing further loops. * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE * above while extending the buffer, we cannot allow the * buffer to remain with B_CACHE set after the write * completes or it will represent a corrupt state. To * deal with this we set B_NOCACHE to scrap the buffer * after the write. * * We might be able to do something fancy, like setting * B_CACHE in bwrite() except if B_DELWRI is already set, * so the below call doesn't set B_CACHE, but that gets real * confusing. This is much easier. */ if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { bp->b_flags |= B_NOCACHE; bwrite(bp); goto loop; } bp->b_flags &= ~B_DONE; } else { /* * Buffer is not in-core, create new buffer. The buffer * returned by getnewbuf() is locked. Note that the returned * buffer is also considered valid (not marked B_INVAL). */ BO_RUNLOCK(bo); /* * If the user does not want us to create the buffer, bail out * here. */ if (flags & GB_NOCREAT) return NULL; if (numfreebuffers == 0 && TD_IS_IDLETHREAD(curthread)) return NULL; bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize; KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize")); offset = blkno * bsize; vmio = vp->v_object != NULL; if (vmio) { maxsize = size + (offset & PAGE_MASK); } else { maxsize = size; /* Do not allow non-VMIO notmapped buffers. */ flags &= ~(GB_UNMAPPED | GB_KVAALLOC); } maxsize = imax(maxsize, bsize); bp = getnewbuf(vp, slpflag, slptimeo, maxsize, flags); if (bp == NULL) { if (slpflag || slptimeo) return NULL; /* * XXX This is here until the sleep path is diagnosed * enough to work under very low memory conditions. * * There's an issue on low memory, 4BSD+non-preempt * systems (eg MIPS routers with 32MB RAM) where buffer * exhaustion occurs without sleeping for buffer * reclaimation. This just sticks in a loop and * constantly attempts to allocate a buffer, which * hits exhaustion and tries to wakeup bufdaemon. * This never happens because we never yield. * * The real solution is to identify and fix these cases * so we aren't effectively busy-waiting in a loop * until the reclaimation path has cycles to run. */ kern_yield(PRI_USER); goto loop; } /* * This code is used to make sure that a buffer is not * created while the getnewbuf routine is blocked. * This can be a problem whether the vnode is locked or not. * If the buffer is created out from under us, we have to * throw away the one we just created. * * Note: this must occur before we associate the buffer * with the vp especially considering limitations in * the splay tree implementation when dealing with duplicate * lblkno's. */ BO_LOCK(bo); if (gbincore(bo, blkno)) { BO_UNLOCK(bo); bp->b_flags |= B_INVAL; brelse(bp); bufspace_release(maxsize); goto loop; } /* * Insert the buffer into the hash, so that it can * be found by incore. */ bp->b_blkno = bp->b_lblkno = blkno; bp->b_offset = offset; bgetvp(vp, bp); BO_UNLOCK(bo); /* * set B_VMIO bit. allocbuf() the buffer bigger. Since the * buffer size starts out as 0, B_CACHE will be set by * allocbuf() for the VMIO case prior to it testing the * backing store for validity. */ if (vmio) { bp->b_flags |= B_VMIO; KASSERT(vp->v_object == bp->b_bufobj->bo_object, ("ARGH! different b_bufobj->bo_object %p %p %p\n", bp, vp->v_object, bp->b_bufobj->bo_object)); } else { bp->b_flags &= ~B_VMIO; KASSERT(bp->b_bufobj->bo_object == NULL, ("ARGH! has b_bufobj->bo_object %p %p\n", bp, bp->b_bufobj->bo_object)); BUF_CHECK_MAPPED(bp); } allocbuf(bp, size); bufspace_release(maxsize); bp->b_flags &= ~B_DONE; } CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp); BUF_ASSERT_HELD(bp); end: buf_track(bp, __func__); KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); return (bp); } /* * Get an empty, disassociated buffer of given size. The buffer is initially * set to B_INVAL. */ struct buf * geteblk(int size, int flags) { struct buf *bp; int maxsize; maxsize = (size + BKVAMASK) & ~BKVAMASK; while ((bp = getnewbuf(NULL, 0, 0, maxsize, flags)) == NULL) { if ((flags & GB_NOWAIT_BD) && (curthread->td_pflags & TDP_BUFNEED) != 0) return (NULL); } allocbuf(bp, size); bufspace_release(maxsize); bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ BUF_ASSERT_HELD(bp); return (bp); } /* * Truncate the backing store for a non-vmio buffer. */ static void vfs_nonvmio_truncate(struct buf *bp, int newbsize) { if (bp->b_flags & B_MALLOC) { /* * malloced buffers are not shrunk */ if (newbsize == 0) { bufmallocadjust(bp, 0); free(bp->b_data, M_BIOBUF); bp->b_data = bp->b_kvabase; bp->b_flags &= ~B_MALLOC; } return; } vm_hold_free_pages(bp, newbsize); bufspace_adjust(bp, newbsize); } /* * Extend the backing for a non-VMIO buffer. */ static void vfs_nonvmio_extend(struct buf *bp, int newbsize) { caddr_t origbuf; int origbufsize; /* * We only use malloced memory on the first allocation. * and revert to page-allocated memory when the buffer * grows. * * There is a potential smp race here that could lead * to bufmallocspace slightly passing the max. It * is probably extremely rare and not worth worrying * over. */ if (bp->b_bufsize == 0 && newbsize <= PAGE_SIZE/2 && bufmallocspace < maxbufmallocspace) { bp->b_data = malloc(newbsize, M_BIOBUF, M_WAITOK); bp->b_flags |= B_MALLOC; bufmallocadjust(bp, newbsize); return; } /* * If the buffer is growing on its other-than-first * allocation then we revert to the page-allocation * scheme. */ origbuf = NULL; origbufsize = 0; if (bp->b_flags & B_MALLOC) { origbuf = bp->b_data; origbufsize = bp->b_bufsize; bp->b_data = bp->b_kvabase; bufmallocadjust(bp, 0); bp->b_flags &= ~B_MALLOC; newbsize = round_page(newbsize); } vm_hold_load_pages(bp, (vm_offset_t) bp->b_data + bp->b_bufsize, (vm_offset_t) bp->b_data + newbsize); if (origbuf != NULL) { bcopy(origbuf, bp->b_data, origbufsize); free(origbuf, M_BIOBUF); } bufspace_adjust(bp, newbsize); } /* * This code constitutes the buffer memory from either anonymous system * memory (in the case of non-VMIO operations) or from an associated * VM object (in the case of VMIO operations). This code is able to * resize a buffer up or down. * * Note that this code is tricky, and has many complications to resolve * deadlock or inconsistent data situations. Tread lightly!!! * There are B_CACHE and B_DELWRI interactions that must be dealt with by * the caller. Calling this code willy nilly can result in the loss of data. * * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with * B_CACHE for the non-VMIO case. */ int allocbuf(struct buf *bp, int size) { int newbsize; BUF_ASSERT_HELD(bp); if (bp->b_bcount == size) return (1); if (bp->b_kvasize != 0 && bp->b_kvasize < size) panic("allocbuf: buffer too small"); newbsize = roundup2(size, DEV_BSIZE); if ((bp->b_flags & B_VMIO) == 0) { if ((bp->b_flags & B_MALLOC) == 0) newbsize = round_page(newbsize); /* * Just get anonymous memory from the kernel. Don't * mess with B_CACHE. */ if (newbsize < bp->b_bufsize) vfs_nonvmio_truncate(bp, newbsize); else if (newbsize > bp->b_bufsize) vfs_nonvmio_extend(bp, newbsize); } else { int desiredpages; desiredpages = (size == 0) ? 0 : num_pages((bp->b_offset & PAGE_MASK) + newbsize); if (bp->b_flags & B_MALLOC) panic("allocbuf: VMIO buffer can't be malloced"); /* * Set B_CACHE initially if buffer is 0 length or will become * 0-length. */ if (size == 0 || bp->b_bufsize == 0) bp->b_flags |= B_CACHE; if (newbsize < bp->b_bufsize) vfs_vmio_truncate(bp, desiredpages); /* XXX This looks as if it should be newbsize > b_bufsize */ else if (size > bp->b_bcount) vfs_vmio_extend(bp, desiredpages, size); bufspace_adjust(bp, newbsize); } bp->b_bcount = size; /* requested buffer size. */ return (1); } extern int inflight_transient_maps; void biodone(struct bio *bp) { struct mtx *mtxp; void (*done)(struct bio *); vm_offset_t start, end; biotrack(bp, __func__); if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) { bp->bio_flags &= ~BIO_TRANSIENT_MAPPING; bp->bio_flags |= BIO_UNMAPPED; start = trunc_page((vm_offset_t)bp->bio_data); end = round_page((vm_offset_t)bp->bio_data + bp->bio_length); bp->bio_data = unmapped_buf; pmap_qremove(start, atop(end - start)); vmem_free(transient_arena, start, end - start); atomic_add_int(&inflight_transient_maps, -1); } done = bp->bio_done; if (done == NULL) { mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); bp->bio_flags |= BIO_DONE; wakeup(bp); mtx_unlock(mtxp); } else done(bp); } /* * Wait for a BIO to finish. */ int biowait(struct bio *bp, const char *wchan) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); while ((bp->bio_flags & BIO_DONE) == 0) msleep(bp, mtxp, PRIBIO, wchan, 0); mtx_unlock(mtxp); if (bp->bio_error != 0) return (bp->bio_error); if (!(bp->bio_flags & BIO_ERROR)) return (0); return (EIO); } void biofinish(struct bio *bp, struct devstat *stat, int error) { if (error) { bp->bio_error = error; bp->bio_flags |= BIO_ERROR; } if (stat != NULL) devstat_end_transaction_bio(stat, bp); biodone(bp); } #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) void biotrack_buf(struct bio *bp, const char *location) { buf_track(bp->bio_track_bp, location); } #endif /* * bufwait: * * Wait for buffer I/O completion, returning error status. The buffer * is left locked and B_DONE on return. B_EINTR is converted into an EINTR * error and cleared. */ int bufwait(struct buf *bp) { if (bp->b_iocmd == BIO_READ) bwait(bp, PRIBIO, "biord"); else bwait(bp, PRIBIO, "biowr"); if (bp->b_flags & B_EINTR) { bp->b_flags &= ~B_EINTR; return (EINTR); } if (bp->b_ioflags & BIO_ERROR) { return (bp->b_error ? bp->b_error : EIO); } else { return (0); } } /* * bufdone: * * Finish I/O on a buffer, optionally calling a completion function. * This is usually called from an interrupt so process blocking is * not allowed. * * biodone is also responsible for setting B_CACHE in a B_VMIO bp. * In a non-VMIO bp, B_CACHE will be set on the next getblk() * assuming B_INVAL is clear. * * For the VMIO case, we set B_CACHE if the op was a read and no * read error occurred, or if the op was a write. B_CACHE is never * set if the buffer is invalid or otherwise uncacheable. * * biodone does not mess with B_INVAL, allowing the I/O routine or the * initiator to leave B_INVAL set to brelse the buffer out of existence * in the biodone routine. */ void bufdone(struct buf *bp) { struct bufobj *dropobj; void (*biodone)(struct buf *); buf_track(bp, __func__); CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); dropobj = NULL; KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp)); BUF_ASSERT_HELD(bp); runningbufwakeup(bp); if (bp->b_iocmd == BIO_WRITE) dropobj = bp->b_bufobj; + else if ((bp->b_flags & B_CKHASH) != 0) { + KASSERT(buf_mapped(bp), ("biodone: bp %p not mapped", bp)); + (*bp->b_ckhashcalc)(bp); + } /* call optional completion function if requested */ if (bp->b_iodone != NULL) { biodone = bp->b_iodone; bp->b_iodone = NULL; (*biodone) (bp); if (dropobj) bufobj_wdrop(dropobj); return; } bufdone_finish(bp); if (dropobj) bufobj_wdrop(dropobj); } void bufdone_finish(struct buf *bp) { BUF_ASSERT_HELD(bp); if (!LIST_EMPTY(&bp->b_dep)) buf_complete(bp); if (bp->b_flags & B_VMIO) { /* * Set B_CACHE if the op was a normal read and no error * occurred. B_CACHE is set for writes in the b*write() * routines. */ if (bp->b_iocmd == BIO_READ && !(bp->b_flags & (B_INVAL|B_NOCACHE)) && !(bp->b_ioflags & BIO_ERROR)) bp->b_flags |= B_CACHE; vfs_vmio_iodone(bp); } /* * For asynchronous completions, release the buffer now. The brelse * will do a wakeup there if necessary - so no need to do a wakeup * here in the async case. The sync case always needs to do a wakeup. */ if (bp->b_flags & B_ASYNC) { if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR)) brelse(bp); else bqrelse(bp); } else bdone(bp); } /* * This routine is called in lieu of iodone in the case of * incomplete I/O. This keeps the busy status for pages * consistent. */ void vfs_unbusy_pages(struct buf *bp) { int i; vm_object_t obj; vm_page_t m; runningbufwakeup(bp); if (!(bp->b_flags & B_VMIO)) return; obj = bp->b_bufobj->bo_object; VM_OBJECT_WLOCK(obj); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (m == bogus_page) { m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i); if (!m) panic("vfs_unbusy_pages: page missing\n"); bp->b_pages[i] = m; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } else BUF_CHECK_UNMAPPED(bp); } vm_page_sunbusy(m); } vm_object_pip_wakeupn(obj, bp->b_npages); VM_OBJECT_WUNLOCK(obj); } /* * vfs_page_set_valid: * * Set the valid bits in a page based on the supplied offset. The * range is restricted to the buffer's size. * * This routine is typically called after a read completes. */ static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m) { vm_ooffset_t eoff; /* * Compute the end offset, eoff, such that [off, eoff) does not span a * page boundary and eoff is not greater than the end of the buffer. * The end of the buffer, in this case, is our file EOF, not the * allocation size of the buffer. */ eoff = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; /* * Set valid range. This is typically the entire buffer and thus the * entire page. */ if (eoff > off) vm_page_set_valid_range(m, off & PAGE_MASK, eoff - off); } /* * vfs_page_set_validclean: * * Set the valid bits and clear the dirty bits in a page based on the * supplied offset. The range is restricted to the buffer's size. */ static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m) { vm_ooffset_t soff, eoff; /* * Start and end offsets in buffer. eoff - soff may not cross a * page boundary or cross the end of the buffer. The end of the * buffer, in this case, is our file EOF, not the allocation size * of the buffer. */ soff = off; eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; /* * Set valid range. This is typically the entire buffer and thus the * entire page. */ if (eoff > soff) { vm_page_set_validclean( m, (vm_offset_t) (soff & PAGE_MASK), (vm_offset_t) (eoff - soff) ); } } /* * Ensure that all buffer pages are not exclusive busied. If any page is * exclusive busy, drain it. */ void vfs_drain_busy_pages(struct buf *bp) { vm_page_t m; int i, last_busied; VM_OBJECT_ASSERT_WLOCKED(bp->b_bufobj->bo_object); last_busied = 0; for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (vm_page_xbusied(m)) { for (; last_busied < i; last_busied++) vm_page_sbusy(bp->b_pages[last_busied]); while (vm_page_xbusied(m)) { vm_page_lock(m); VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); vm_page_busy_sleep(m, "vbpage", true); VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); } } } for (i = 0; i < last_busied; i++) vm_page_sunbusy(bp->b_pages[i]); } /* * This routine is called before a device strategy routine. * It is used to tell the VM system that paging I/O is in * progress, and treat the pages associated with the buffer * almost as being exclusive busy. Also the object paging_in_progress * flag is handled to make sure that the object doesn't become * inconsistent. * * Since I/O has not been initiated yet, certain buffer flags * such as BIO_ERROR or B_INVAL may be in an inconsistent state * and should be ignored. */ void vfs_busy_pages(struct buf *bp, int clear_modify) { vm_object_t obj; vm_ooffset_t foff; vm_page_t m; int i; bool bogus; if (!(bp->b_flags & B_VMIO)) return; obj = bp->b_bufobj->bo_object; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_busy_pages: no buffer offset")); VM_OBJECT_WLOCK(obj); vfs_drain_busy_pages(bp); if (bp->b_bufsize != 0) vfs_setdirty_locked_object(bp); bogus = false; for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if ((bp->b_flags & B_CLUSTER) == 0) { vm_object_pip_add(obj, 1); vm_page_sbusy(m); } /* * When readying a buffer for a read ( i.e * clear_modify == 0 ), it is important to do * bogus_page replacement for valid pages in * partially instantiated buffers. Partially * instantiated buffers can, in turn, occur when * reconstituting a buffer from its VM backing store * base. We only have to do this if B_CACHE is * clear ( which causes the I/O to occur in the * first place ). The replacement prevents the read * I/O from overwriting potentially dirty VM-backed * pages. XXX bogus page replacement is, uh, bogus. * It may not work properly with small-block devices. * We need to find a better way. */ if (clear_modify) { pmap_remove_write(m); vfs_page_set_validclean(bp, foff, m); } else if (m->valid == VM_PAGE_BITS_ALL && (bp->b_flags & B_CACHE) == 0) { bp->b_pages[i] = bogus_page; bogus = true; } foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; } VM_OBJECT_WUNLOCK(obj); if (bogus && buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } } /* * vfs_bio_set_valid: * * Set the range within the buffer to valid. The range is * relative to the beginning of the buffer, b_offset. Note that * b_offset itself may be offset from the beginning of the first * page. */ void vfs_bio_set_valid(struct buf *bp, int base, int size) { int i, n; vm_page_t m; if (!(bp->b_flags & B_VMIO)) return; /* * Fixup base to be relative to beginning of first page. * Set initial n to be the maximum number of bytes in the * first page that can be validated. */ base += (bp->b_offset & PAGE_MASK); n = PAGE_SIZE - (base & PAGE_MASK); VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { m = bp->b_pages[i]; if (n > size) n = size; vm_page_set_valid_range(m, base & PAGE_MASK, n); base += n; size -= n; n = PAGE_SIZE; } VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); } /* * vfs_bio_clrbuf: * * If the specified buffer is a non-VMIO buffer, clear the entire * buffer. If the specified buffer is a VMIO buffer, clear and * validate only the previously invalid portions of the buffer. * This routine essentially fakes an I/O, so we need to clear * BIO_ERROR and B_INVAL. * * Note that while we only theoretically need to clear through b_bcount, * we go ahead and clear through b_bufsize. */ void vfs_bio_clrbuf(struct buf *bp) { int i, j, mask, sa, ea, slide; if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) { clrbuf(bp); return; } bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) && (bp->b_offset & PAGE_MASK) == 0) { if (bp->b_pages[0] == bogus_page) goto unlock; mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1; VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[0]->object); if ((bp->b_pages[0]->valid & mask) == mask) goto unlock; if ((bp->b_pages[0]->valid & mask) == 0) { pmap_zero_page_area(bp->b_pages[0], 0, bp->b_bufsize); bp->b_pages[0]->valid |= mask; goto unlock; } } sa = bp->b_offset & PAGE_MASK; slide = 0; for (i = 0; i < bp->b_npages; i++, sa = 0) { slide = imin(slide + PAGE_SIZE, bp->b_offset + bp->b_bufsize); ea = slide & PAGE_MASK; if (ea == 0) ea = PAGE_SIZE; if (bp->b_pages[i] == bogus_page) continue; j = sa / DEV_BSIZE; mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[i]->object); if ((bp->b_pages[i]->valid & mask) == mask) continue; if ((bp->b_pages[i]->valid & mask) == 0) pmap_zero_page_area(bp->b_pages[i], sa, ea - sa); else { for (; sa < ea; sa += DEV_BSIZE, j++) { if ((bp->b_pages[i]->valid & (1 << j)) == 0) { pmap_zero_page_area(bp->b_pages[i], sa, DEV_BSIZE); } } } bp->b_pages[i]->valid |= mask; } unlock: VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); bp->b_resid = 0; } void vfs_bio_bzero_buf(struct buf *bp, int base, int size) { vm_page_t m; int i, n; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); bzero(bp->b_data + base, size); } else { BUF_CHECK_UNMAPPED(bp); n = PAGE_SIZE - (base & PAGE_MASK); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { m = bp->b_pages[i]; if (n > size) n = size; pmap_zero_page_area(m, base & PAGE_MASK, n); base += n; size -= n; n = PAGE_SIZE; } } } /* * Update buffer flags based on I/O request parameters, optionally releasing the * buffer. If it's VMIO or direct I/O, the buffer pages are released to the VM, * where they may be placed on a page queue (VMIO) or freed immediately (direct * I/O). Otherwise the buffer is released to the cache. */ static void b_io_dismiss(struct buf *bp, int ioflag, bool release) { KASSERT((ioflag & IO_NOREUSE) == 0 || (ioflag & IO_VMIO) != 0, ("buf %p non-VMIO noreuse", bp)); if ((ioflag & IO_DIRECT) != 0) bp->b_flags |= B_DIRECT; if ((ioflag & (IO_VMIO | IO_DIRECT)) != 0 && LIST_EMPTY(&bp->b_dep)) { bp->b_flags |= B_RELBUF; if ((ioflag & IO_NOREUSE) != 0) bp->b_flags |= B_NOREUSE; if (release) brelse(bp); } else if (release) bqrelse(bp); } void vfs_bio_brelse(struct buf *bp, int ioflag) { b_io_dismiss(bp, ioflag, true); } void vfs_bio_set_flags(struct buf *bp, int ioflag) { b_io_dismiss(bp, ioflag, false); } /* * vm_hold_load_pages and vm_hold_free_pages get pages into * a buffers address space. The pages are anonymous and are * not associated with a file object. */ static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; int index; BUF_CHECK_MAPPED(bp); to = round_page(to); from = round_page(from); index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; for (pg = from; pg < to; pg += PAGE_SIZE, index++) { tryagain: /* * note: must allocate system pages since blocking here * could interfere with paging I/O, no matter which * process we are. */ p = vm_page_alloc(NULL, 0, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_COUNT((to - pg) >> PAGE_SHIFT)); if (p == NULL) { VM_WAIT; goto tryagain; } pmap_qenter(pg, &p, 1); bp->b_pages[index] = p; } bp->b_npages = index; } /* Return pages associated with this buf to the vm system */ static void vm_hold_free_pages(struct buf *bp, int newbsize) { vm_offset_t from; vm_page_t p; int index, newnpages; BUF_CHECK_MAPPED(bp); from = round_page((vm_offset_t)bp->b_data + newbsize); newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; if (bp->b_npages > newnpages) pmap_qremove(from, bp->b_npages - newnpages); for (index = newnpages; index < bp->b_npages; index++) { p = bp->b_pages[index]; bp->b_pages[index] = NULL; p->wire_count--; vm_page_free(p); } atomic_subtract_int(&vm_cnt.v_wire_count, bp->b_npages - newnpages); bp->b_npages = newnpages; } /* * Map an IO request into kernel virtual address space. * * All requests are (re)mapped into kernel VA space. * Notice that we use b_bufsize for the size of the buffer * to be mapped. b_bcount might be modified by the driver. * * Note that even if the caller determines that the address space should * be valid, a race or a smaller-file mapped into a larger space may * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST * check the return value. * * This function only works with pager buffers. */ int vmapbuf(struct buf *bp, int mapbuf) { vm_prot_t prot; int pidx; if (bp->b_bufsize < 0) return (-1); prot = VM_PROT_READ; if (bp->b_iocmd == BIO_READ) prot |= VM_PROT_WRITE; /* Less backwards than it looks */ if ((pidx = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, (vm_offset_t)bp->b_data, bp->b_bufsize, prot, bp->b_pages, btoc(MAXPHYS))) < 0) return (-1); bp->b_npages = pidx; bp->b_offset = ((vm_offset_t)bp->b_data) & PAGE_MASK; if (mapbuf || !unmapped_buf_allowed) { pmap_qenter((vm_offset_t)bp->b_kvabase, bp->b_pages, pidx); bp->b_data = bp->b_kvabase + bp->b_offset; } else bp->b_data = unmapped_buf; return(0); } /* * Free the io map PTEs associated with this IO operation. * We also invalidate the TLB entries and restore the original b_addr. * * This function only works with pager buffers. */ void vunmapbuf(struct buf *bp) { int npages; npages = bp->b_npages; if (buf_mapped(bp)) pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages); vm_page_unhold_pages(bp->b_pages, npages); bp->b_data = unmapped_buf; } void bdone(struct buf *bp) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); bp->b_flags |= B_DONE; wakeup(bp); mtx_unlock(mtxp); } void bwait(struct buf *bp, u_char pri, const char *wchan) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); while ((bp->b_flags & B_DONE) == 0) msleep(bp, mtxp, pri, wchan, 0); mtx_unlock(mtxp); } int bufsync(struct bufobj *bo, int waitfor) { return (VOP_FSYNC(bo2vnode(bo), waitfor, curthread)); } void bufstrategy(struct bufobj *bo, struct buf *bp) { int i = 0; struct vnode *vp; vp = bp->b_vp; KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy")); KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp)); i = VOP_STRATEGY(vp, bp); KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp)); } void bufobj_wrefl(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); ASSERT_BO_WLOCKED(bo); bo->bo_numoutput++; } void bufobj_wref(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); BO_LOCK(bo); bo->bo_numoutput++; BO_UNLOCK(bo); } void bufobj_wdrop(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop")); BO_LOCK(bo); KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count")); if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) { bo->bo_flag &= ~BO_WWAIT; wakeup(&bo->bo_numoutput); } BO_UNLOCK(bo); } int bufobj_wwait(struct bufobj *bo, int slpflag, int timeo) { int error; KASSERT(bo != NULL, ("NULL bo in bufobj_wwait")); ASSERT_BO_WLOCKED(bo); error = 0; while (bo->bo_numoutput) { bo->bo_flag |= BO_WWAIT; error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo), slpflag | (PRIBIO + 1), "bo_wwait", timeo); if (error) break; } return (error); } /* * Set bio_data or bio_ma for struct bio from the struct buf. */ void bdata2bio(struct buf *bp, struct bio *bip) { if (!buf_mapped(bp)) { KASSERT(unmapped_buf_allowed, ("unmapped")); bip->bio_ma = bp->b_pages; bip->bio_ma_n = bp->b_npages; bip->bio_data = unmapped_buf; bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK; bip->bio_flags |= BIO_UNMAPPED; KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) / PAGE_SIZE == bp->b_npages, ("Buffer %p too short: %d %lld %d", bp, bip->bio_ma_offset, (long long)bip->bio_length, bip->bio_ma_n)); } else { bip->bio_data = bp->b_data; bip->bio_ma = NULL; } } /* * The MIPS pmap code currently doesn't handle aliased pages. * The VIPT caches may not handle page aliasing themselves, leading * to data corruption. * * As such, this code makes a system extremely unhappy if said * system doesn't support unaliasing the above situation in hardware. * Some "recent" systems (eg some mips24k/mips74k cores) don't enable * this feature at build time, so it has to be handled in software. * * Once the MIPS pmap/cache code grows to support this function on * earlier chips, it should be flipped back off. */ #ifdef __mips__ static int buf_pager_relbuf = 1; #else static int buf_pager_relbuf = 0; #endif SYSCTL_INT(_vfs, OID_AUTO, buf_pager_relbuf, CTLFLAG_RWTUN, &buf_pager_relbuf, 0, "Make buffer pager release buffers after reading"); /* * The buffer pager. It uses buffer reads to validate pages. * * In contrast to the generic local pager from vm/vnode_pager.c, this * pager correctly and easily handles volumes where the underlying * device block size is greater than the machine page size. The * buffer cache transparently extends the requested page run to be * aligned at the block boundary, and does the necessary bogus page * replacements in the addends to avoid obliterating already valid * pages. * * The only non-trivial issue is that the exclusive busy state for * pages, which is assumed by the vm_pager_getpages() interface, is * incompatible with the VMIO buffer cache's desire to share-busy the * pages. This function performs a trivial downgrade of the pages' * state before reading buffers, and a less trivial upgrade from the * shared-busy to excl-busy state after the read. */ int vfs_bio_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, int *rahead, vbg_get_lblkno_t get_lblkno, vbg_get_blksize_t get_blksize) { vm_page_t m; vm_object_t object; struct buf *bp; struct mount *mp; daddr_t lbn, lbnp; vm_ooffset_t la, lb, poff, poffe; long bsize; int bo_bs, br_flags, error, i, pgsin, pgsin_a, pgsin_b; bool redo, lpart; object = vp->v_object; mp = vp->v_mount; la = IDX_TO_OFF(ma[count - 1]->pindex); if (la >= object->un_pager.vnp.vnp_size) return (VM_PAGER_BAD); lpart = la + PAGE_SIZE > object->un_pager.vnp.vnp_size; bo_bs = get_blksize(vp, get_lblkno(vp, IDX_TO_OFF(ma[0]->pindex))); /* * Calculate read-ahead, behind and total pages. */ pgsin = count; lb = IDX_TO_OFF(ma[0]->pindex); pgsin_b = OFF_TO_IDX(lb - rounddown2(lb, bo_bs)); pgsin += pgsin_b; if (rbehind != NULL) *rbehind = pgsin_b; pgsin_a = OFF_TO_IDX(roundup2(la, bo_bs) - la); if (la + IDX_TO_OFF(pgsin_a) >= object->un_pager.vnp.vnp_size) pgsin_a = OFF_TO_IDX(roundup2(object->un_pager.vnp.vnp_size, PAGE_SIZE) - la); pgsin += pgsin_a; if (rahead != NULL) *rahead = pgsin_a; VM_CNT_INC(v_vnodein); VM_CNT_ADD(v_vnodepgsin, pgsin); br_flags = (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0) ? GB_UNMAPPED : 0; VM_OBJECT_WLOCK(object); again: for (i = 0; i < count; i++) vm_page_busy_downgrade(ma[i]); VM_OBJECT_WUNLOCK(object); lbnp = -1; for (i = 0; i < count; i++) { m = ma[i]; /* * Pages are shared busy and the object lock is not * owned, which together allow for the pages' * invalidation. The racy test for validity avoids * useless creation of the buffer for the most typical * case when invalidation is not used in redo or for * parallel read. The shared->excl upgrade loop at * the end of the function catches the race in a * reliable way (protected by the object lock). */ if (m->valid == VM_PAGE_BITS_ALL) continue; poff = IDX_TO_OFF(m->pindex); poffe = MIN(poff + PAGE_SIZE, object->un_pager.vnp.vnp_size); for (; poff < poffe; poff += bsize) { lbn = get_lblkno(vp, poff); if (lbn == lbnp) goto next_page; lbnp = lbn; bsize = get_blksize(vp, lbn); error = bread_gb(vp, lbn, bsize, curthread->td_ucred, br_flags, &bp); if (error != 0) goto end_pages; if (LIST_EMPTY(&bp->b_dep)) { /* * Invalidation clears m->valid, but * may leave B_CACHE flag if the * buffer existed at the invalidation * time. In this case, recycle the * buffer to do real read on next * bread() after redo. * * Otherwise B_RELBUF is not strictly * necessary, enable to reduce buf * cache pressure. */ if (buf_pager_relbuf || m->valid != VM_PAGE_BITS_ALL) bp->b_flags |= B_RELBUF; bp->b_flags &= ~B_NOCACHE; brelse(bp); } else { bqrelse(bp); } } KASSERT(1 /* racy, enable for debugging */ || m->valid == VM_PAGE_BITS_ALL || i == count - 1, ("buf %d %p invalid", i, m)); if (i == count - 1 && lpart) { VM_OBJECT_WLOCK(object); if (m->valid != 0 && m->valid != VM_PAGE_BITS_ALL) vm_page_zero_invalid(m, TRUE); VM_OBJECT_WUNLOCK(object); } next_page:; } end_pages: VM_OBJECT_WLOCK(object); redo = false; for (i = 0; i < count; i++) { vm_page_sunbusy(ma[i]); ma[i] = vm_page_grab(object, ma[i]->pindex, VM_ALLOC_NORMAL); /* * Since the pages were only sbusy while neither the * buffer nor the object lock was held by us, or * reallocated while vm_page_grab() slept for busy * relinguish, they could have been invalidated. * Recheck the valid bits and re-read as needed. * * Note that the last page is made fully valid in the * read loop, and partial validity for the page at * index count - 1 could mean that the page was * invalidated or removed, so we must restart for * safety as well. */ if (ma[i]->valid != VM_PAGE_BITS_ALL) redo = true; } if (redo && error == 0) goto again; VM_OBJECT_WUNLOCK(object); return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK); } #include "opt_ddb.h" #ifdef DDB #include /* DDB command to show buffer data */ DB_SHOW_COMMAND(buffer, db_show_buffer) { /* get args */ struct buf *bp = (struct buf *)addr; #ifdef FULL_BUF_TRACKING uint32_t i, j; #endif if (!have_addr) { db_printf("usage: show buffer \n"); return; } db_printf("buf at %p\n", bp); db_printf("b_flags = 0x%b, b_xflags=0x%b, b_vflags=0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS, (u_int)bp->b_xflags, PRINT_BUF_XFLAGS, (u_int)bp->b_vflags, PRINT_BUF_VFLAGS); db_printf( "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n" "b_bufobj = (%p), b_data = %p, b_blkno = %jd, b_lblkno = %jd, " "b_dep = %p\n", bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno, bp->b_dep.lh_first); db_printf("b_kvabase = %p, b_kvasize = %d\n", bp->b_kvabase, bp->b_kvasize); if (bp->b_npages) { int i; db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages); for (i = 0; i < bp->b_npages; i++) { vm_page_t m; m = bp->b_pages[i]; if (m != NULL) db_printf("(%p, 0x%lx, 0x%lx)", m->object, (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m)); else db_printf("( ??? )"); if ((i + 1) < bp->b_npages) db_printf(","); } db_printf("\n"); } #if defined(FULL_BUF_TRACKING) db_printf("b_io_tracking: b_io_tcnt = %u\n", bp->b_io_tcnt); i = bp->b_io_tcnt % BUF_TRACKING_SIZE; for (j = 1; j <= BUF_TRACKING_SIZE; j++) { if (bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)] == NULL) continue; db_printf(" %2u: %s\n", j, bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)]); } #elif defined(BUF_TRACKING) db_printf("b_io_tracking: %s\n", bp->b_io_tracking); #endif db_printf(" "); BUF_LOCKPRINTINFO(bp); } DB_SHOW_COMMAND(lockedbufs, lockedbufs) { struct buf *bp; int i; for (i = 0; i < nbuf; i++) { bp = &buf[i]; if (BUF_ISLOCKED(bp)) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); if (db_pager_quit) break; } } } DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs) { struct vnode *vp; struct buf *bp; if (!have_addr) { db_printf("usage: show vnodebufs \n"); return; } vp = (struct vnode *)addr; db_printf("Clean buffers:\n"); TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } db_printf("Dirty buffers:\n"); TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } } DB_COMMAND(countfreebufs, db_coundfreebufs) { struct buf *bp; int i, used = 0, nfree = 0; if (have_addr) { db_printf("usage: countfreebufs\n"); return; } for (i = 0; i < nbuf; i++) { bp = &buf[i]; if (bp->b_qindex == QUEUE_EMPTY) nfree++; else used++; } db_printf("Counted %d free, %d used (%d tot)\n", nfree, used, nfree + used); db_printf("numfreebuffers is %d\n", numfreebuffers); } #endif /* DDB */ Index: head/sys/libkern/crc32.c =================================================================== --- head/sys/libkern/crc32.c (revision 323922) +++ head/sys/libkern/crc32.c (revision 323923) @@ -1,784 +1,789 @@ /*- * COPYRIGHT (C) 1986 Gary S. Brown. You may use this program, or * code or tables extracted from it, as desired without restriction. */ /* * First, the polynomial itself and its table of feedback terms. The * polynomial is * X^32+X^26+X^23+X^22+X^16+X^12+X^11+X^10+X^8+X^7+X^5+X^4+X^2+X^1+X^0 * * Note that we take it "backwards" and put the highest-order term in * the lowest-order bit. The X^32 term is "implied"; the LSB is the * X^31 term, etc. The X^0 term (usually shown as "+1") results in * the MSB being 1 * * Note that the usual hardware shift register implementation, which * is what we're using (we're merely optimizing it by doing eight-bit * chunks at a time) shifts bits into the lowest-order term. In our * implementation, that means shifting towards the right. Why do we * do it this way? Because the calculated CRC must be transmitted in * order from highest-order term to lowest-order term. UARTs transmit * characters in order from LSB to MSB. By storing the CRC this way * we hand it to the UART in the order low-byte to high-byte; the UART * sends each low-bit to hight-bit; and the result is transmission bit * by bit from highest- to lowest-order term without requiring any bit * shuffling on our part. Reception works similarly * * The feedback terms table consists of 256, 32-bit entries. Notes * * The table can be generated at runtime if desired; code to do so * is shown later. It might not be obvious, but the feedback * terms simply represent the results of eight shift/xor opera * tions for all combinations of data and CRC register values * * The values must be right-shifted by eight bits by the "updcrc * logic; the shift must be unsigned (bring in zeroes). On some * hardware you could probably optimize the shift in assembler by * using byte-swap instructions * polynomial $edb88320 * * * CRC32 code derived from work by Gary S. Brown. */ #include __FBSDID("$FreeBSD$"); #include + +#ifdef _KERNEL #include #include #if defined(__amd64__) || defined(__i386__) #include #include #endif #if defined(__aarch64__) #include #endif +#endif /* _KERNEL */ const uint32_t crc32_tab[] = { 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d }; /* * A function that calculates the CRC-32 based on the table above is * given below for documentation purposes. An equivalent implementation * of this function that's actually used in the kernel can be found * in sys/libkern.h, where it can be inlined. * * uint32_t * crc32(const void *buf, size_t size) * { * const uint8_t *p = buf; * uint32_t crc; * * crc = ~0U; * while (size--) * crc = crc32_tab[(crc ^ *p++) & 0xFF] ^ (crc >> 8); * return crc ^ ~0U; * } */ /* CRC32C routines, these use a different polynomial */ /*****************************************************************/ /* */ /* CRC LOOKUP TABLE */ /* ================ */ /* The following CRC lookup table was generated automagically */ /* by the Rocksoft^tm Model CRC Algorithm Table Generation */ /* Program V1.0 using the following model parameters: */ /* */ /* Width : 4 bytes. */ /* Poly : 0x1EDC6F41L */ /* Reverse : TRUE. */ /* */ /* For more information on the Rocksoft^tm Model CRC Algorithm, */ /* see the document titled "A Painless Guide to CRC Error */ /* Detection Algorithms" by Ross Williams */ /* (ross@guest.adelaide.edu.au.). This document is likely to be */ /* in the FTP archive "ftp.adelaide.edu.au/pub/rocksoft". */ /* */ /*****************************************************************/ static const uint32_t crc32Table[256] = { 0x00000000L, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L, 0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL, 0x8AD958CFL, 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL, 0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L, 0x5E133C24L, 0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL, 0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L, 0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L, 0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL, 0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L, 0x33ED7D2AL, 0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L, 0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L, 0x6DFE410EL, 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL, 0x30E349B1L, 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L, 0xF779DEAEL, 0x05125DADL, 0x1642AE59L, 0xE4292D5AL, 0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL, 0x7DA08661L, 0x8FCB0562L, 0x9C9BF696L, 0x6EF07595L, 0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L, 0x86E18AA3L, 0x748A09A0L, 0x67DAFA54L, 0x95B17957L, 0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L, 0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L, 0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L, 0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L, 0xDBFC821CL, 0x2997011FL, 0x3AC7F2EBL, 0xC8AC71E8L, 0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L, 0x61C69362L, 0x93AD1061L, 0x80FDE395L, 0x72966096L, 0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L, 0xEB1FCBADL, 0x197448AEL, 0x0A24BB5AL, 0xF84F3859L, 0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L, 0x7198540DL, 0x83F3D70EL, 0x90A324FAL, 0x62C8A7F9L, 0xB602C312L, 0x44694011L, 0x5739B3E5L, 0xA55230E6L, 0xFB410CC2L, 0x092A8FC1L, 0x1A7A7C35L, 0xE811FF36L, 0x3CDB9BDDL, 0xCEB018DEL, 0xDDE0EB2AL, 0x2F8B6829L, 0x82F63B78L, 0x709DB87BL, 0x63CD4B8FL, 0x91A6C88CL, 0x456CAC67L, 0xB7072F64L, 0xA457DC90L, 0x563C5F93L, 0x082F63B7L, 0xFA44E0B4L, 0xE9141340L, 0x1B7F9043L, 0xCFB5F4A8L, 0x3DDE77ABL, 0x2E8E845FL, 0xDCE5075CL, 0x92A8FC17L, 0x60C37F14L, 0x73938CE0L, 0x81F80FE3L, 0x55326B08L, 0xA759E80BL, 0xB4091BFFL, 0x466298FCL, 0x1871A4D8L, 0xEA1A27DBL, 0xF94AD42FL, 0x0B21572CL, 0xDFEB33C7L, 0x2D80B0C4L, 0x3ED04330L, 0xCCBBC033L, 0xA24BB5A6L, 0x502036A5L, 0x4370C551L, 0xB11B4652L, 0x65D122B9L, 0x97BAA1BAL, 0x84EA524EL, 0x7681D14DL, 0x2892ED69L, 0xDAF96E6AL, 0xC9A99D9EL, 0x3BC21E9DL, 0xEF087A76L, 0x1D63F975L, 0x0E330A81L, 0xFC588982L, 0xB21572C9L, 0x407EF1CAL, 0x532E023EL, 0xA145813DL, 0x758FE5D6L, 0x87E466D5L, 0x94B49521L, 0x66DF1622L, 0x38CC2A06L, 0xCAA7A905L, 0xD9F75AF1L, 0x2B9CD9F2L, 0xFF56BD19L, 0x0D3D3E1AL, 0x1E6DCDEEL, 0xEC064EEDL, 0xC38D26C4L, 0x31E6A5C7L, 0x22B65633L, 0xD0DDD530L, 0x0417B1DBL, 0xF67C32D8L, 0xE52CC12CL, 0x1747422FL, 0x49547E0BL, 0xBB3FFD08L, 0xA86F0EFCL, 0x5A048DFFL, 0x8ECEE914L, 0x7CA56A17L, 0x6FF599E3L, 0x9D9E1AE0L, 0xD3D3E1ABL, 0x21B862A8L, 0x32E8915CL, 0xC083125FL, 0x144976B4L, 0xE622F5B7L, 0xF5720643L, 0x07198540L, 0x590AB964L, 0xAB613A67L, 0xB831C993L, 0x4A5A4A90L, 0x9E902E7BL, 0x6CFBAD78L, 0x7FAB5E8CL, 0x8DC0DD8FL, 0xE330A81AL, 0x115B2B19L, 0x020BD8EDL, 0xF0605BEEL, 0x24AA3F05L, 0xD6C1BC06L, 0xC5914FF2L, 0x37FACCF1L, 0x69E9F0D5L, 0x9B8273D6L, 0x88D28022L, 0x7AB90321L, 0xAE7367CAL, 0x5C18E4C9L, 0x4F48173DL, 0xBD23943EL, 0xF36E6F75L, 0x0105EC76L, 0x12551F82L, 0xE03E9C81L, 0x34F4F86AL, 0xC69F7B69L, 0xD5CF889DL, 0x27A40B9EL, 0x79B737BAL, 0x8BDCB4B9L, 0x988C474DL, 0x6AE7C44EL, 0xBE2DA0A5L, 0x4C4623A6L, 0x5F16D052L, 0xAD7D5351L }; static uint32_t singletable_crc32c(uint32_t crc, const void *buf, size_t size) { const uint8_t *p = buf; while (size--) crc = crc32Table[(crc ^ *p++) & 0xff] ^ (crc >> 8); return crc; } /* * Copyright (c) 2004-2006 Intel Corporation - All Rights Reserved * * * This software program is licensed subject to the BSD License, available at * http://www.opensource.org/licenses/bsd-license.html. * * Abstract: * * Tables for software CRC generation */ /* * The following CRC lookup table was generated automagically using the * following model parameters: * * Generator Polynomial = ................. 0x1EDC6F41 * Generator Polynomial Length = .......... 32 bits * Reflected Bits = ....................... TRUE * Table Generation Offset = .............. 32 bits * Number of Slices = ..................... 8 slices * Slice Lengths = ........................ 8 8 8 8 8 8 8 8 * Directory Name = ....................... .\ * File Name = ............................ 8x256_tables.c */ static const uint32_t sctp_crc_tableil8_o32[256] = { 0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4, 0xC79A971F, 0x35F1141C, 0x26A1E7E8, 0xD4CA64EB, 0x8AD958CF, 0x78B2DBCC, 0x6BE22838, 0x9989AB3B, 0x4D43CFD0, 0xBF284CD3, 0xAC78BF27, 0x5E133C24, 0x105EC76F, 0xE235446C, 0xF165B798, 0x030E349B, 0xD7C45070, 0x25AFD373, 0x36FF2087, 0xC494A384, 0x9A879FA0, 0x68EC1CA3, 0x7BBCEF57, 0x89D76C54, 0x5D1D08BF, 0xAF768BBC, 0xBC267848, 0x4E4DFB4B, 0x20BD8EDE, 0xD2D60DDD, 0xC186FE29, 0x33ED7D2A, 0xE72719C1, 0x154C9AC2, 0x061C6936, 0xF477EA35, 0xAA64D611, 0x580F5512, 0x4B5FA6E6, 0xB93425E5, 0x6DFE410E, 0x9F95C20D, 0x8CC531F9, 0x7EAEB2FA, 0x30E349B1, 0xC288CAB2, 0xD1D83946, 0x23B3BA45, 0xF779DEAE, 0x05125DAD, 0x1642AE59, 0xE4292D5A, 0xBA3A117E, 0x4851927D, 0x5B016189, 0xA96AE28A, 0x7DA08661, 0x8FCB0562, 0x9C9BF696, 0x6EF07595, 0x417B1DBC, 0xB3109EBF, 0xA0406D4B, 0x522BEE48, 0x86E18AA3, 0x748A09A0, 0x67DAFA54, 0x95B17957, 0xCBA24573, 0x39C9C670, 0x2A993584, 0xD8F2B687, 0x0C38D26C, 0xFE53516F, 0xED03A29B, 0x1F682198, 0x5125DAD3, 0xA34E59D0, 0xB01EAA24, 0x42752927, 0x96BF4DCC, 0x64D4CECF, 0x77843D3B, 0x85EFBE38, 0xDBFC821C, 0x2997011F, 0x3AC7F2EB, 0xC8AC71E8, 0x1C661503, 0xEE0D9600, 0xFD5D65F4, 0x0F36E6F7, 0x61C69362, 0x93AD1061, 0x80FDE395, 0x72966096, 0xA65C047D, 0x5437877E, 0x4767748A, 0xB50CF789, 0xEB1FCBAD, 0x197448AE, 0x0A24BB5A, 0xF84F3859, 0x2C855CB2, 0xDEEEDFB1, 0xCDBE2C45, 0x3FD5AF46, 0x7198540D, 0x83F3D70E, 0x90A324FA, 0x62C8A7F9, 0xB602C312, 0x44694011, 0x5739B3E5, 0xA55230E6, 0xFB410CC2, 0x092A8FC1, 0x1A7A7C35, 0xE811FF36, 0x3CDB9BDD, 0xCEB018DE, 0xDDE0EB2A, 0x2F8B6829, 0x82F63B78, 0x709DB87B, 0x63CD4B8F, 0x91A6C88C, 0x456CAC67, 0xB7072F64, 0xA457DC90, 0x563C5F93, 0x082F63B7, 0xFA44E0B4, 0xE9141340, 0x1B7F9043, 0xCFB5F4A8, 0x3DDE77AB, 0x2E8E845F, 0xDCE5075C, 0x92A8FC17, 0x60C37F14, 0x73938CE0, 0x81F80FE3, 0x55326B08, 0xA759E80B, 0xB4091BFF, 0x466298FC, 0x1871A4D8, 0xEA1A27DB, 0xF94AD42F, 0x0B21572C, 0xDFEB33C7, 0x2D80B0C4, 0x3ED04330, 0xCCBBC033, 0xA24BB5A6, 0x502036A5, 0x4370C551, 0xB11B4652, 0x65D122B9, 0x97BAA1BA, 0x84EA524E, 0x7681D14D, 0x2892ED69, 0xDAF96E6A, 0xC9A99D9E, 0x3BC21E9D, 0xEF087A76, 0x1D63F975, 0x0E330A81, 0xFC588982, 0xB21572C9, 0x407EF1CA, 0x532E023E, 0xA145813D, 0x758FE5D6, 0x87E466D5, 0x94B49521, 0x66DF1622, 0x38CC2A06, 0xCAA7A905, 0xD9F75AF1, 0x2B9CD9F2, 0xFF56BD19, 0x0D3D3E1A, 0x1E6DCDEE, 0xEC064EED, 0xC38D26C4, 0x31E6A5C7, 0x22B65633, 0xD0DDD530, 0x0417B1DB, 0xF67C32D8, 0xE52CC12C, 0x1747422F, 0x49547E0B, 0xBB3FFD08, 0xA86F0EFC, 0x5A048DFF, 0x8ECEE914, 0x7CA56A17, 0x6FF599E3, 0x9D9E1AE0, 0xD3D3E1AB, 0x21B862A8, 0x32E8915C, 0xC083125F, 0x144976B4, 0xE622F5B7, 0xF5720643, 0x07198540, 0x590AB964, 0xAB613A67, 0xB831C993, 0x4A5A4A90, 0x9E902E7B, 0x6CFBAD78, 0x7FAB5E8C, 0x8DC0DD8F, 0xE330A81A, 0x115B2B19, 0x020BD8ED, 0xF0605BEE, 0x24AA3F05, 0xD6C1BC06, 0xC5914FF2, 0x37FACCF1, 0x69E9F0D5, 0x9B8273D6, 0x88D28022, 0x7AB90321, 0xAE7367CA, 0x5C18E4C9, 0x4F48173D, 0xBD23943E, 0xF36E6F75, 0x0105EC76, 0x12551F82, 0xE03E9C81, 0x34F4F86A, 0xC69F7B69, 0xD5CF889D, 0x27A40B9E, 0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E, 0xBE2DA0A5, 0x4C4623A6, 0x5F16D052, 0xAD7D5351 }; /* * end of the CRC lookup table crc_tableil8_o32 */ /* * The following CRC lookup table was generated automagically using the * following model parameters: * * Generator Polynomial = ................. 0x1EDC6F41 * Generator Polynomial Length = .......... 32 bits * Reflected Bits = ....................... TRUE * Table Generation Offset = .............. 32 bits * Number of Slices = ..................... 8 slices * Slice Lengths = ........................ 8 8 8 8 8 8 8 8 * Directory Name = ....................... .\ * File Name = ............................ 8x256_tables.c */ static const uint32_t sctp_crc_tableil8_o40[256] = { 0x00000000, 0x13A29877, 0x274530EE, 0x34E7A899, 0x4E8A61DC, 0x5D28F9AB, 0x69CF5132, 0x7A6DC945, 0x9D14C3B8, 0x8EB65BCF, 0xBA51F356, 0xA9F36B21, 0xD39EA264, 0xC03C3A13, 0xF4DB928A, 0xE7790AFD, 0x3FC5F181, 0x2C6769F6, 0x1880C16F, 0x0B225918, 0x714F905D, 0x62ED082A, 0x560AA0B3, 0x45A838C4, 0xA2D13239, 0xB173AA4E, 0x859402D7, 0x96369AA0, 0xEC5B53E5, 0xFFF9CB92, 0xCB1E630B, 0xD8BCFB7C, 0x7F8BE302, 0x6C297B75, 0x58CED3EC, 0x4B6C4B9B, 0x310182DE, 0x22A31AA9, 0x1644B230, 0x05E62A47, 0xE29F20BA, 0xF13DB8CD, 0xC5DA1054, 0xD6788823, 0xAC154166, 0xBFB7D911, 0x8B507188, 0x98F2E9FF, 0x404E1283, 0x53EC8AF4, 0x670B226D, 0x74A9BA1A, 0x0EC4735F, 0x1D66EB28, 0x298143B1, 0x3A23DBC6, 0xDD5AD13B, 0xCEF8494C, 0xFA1FE1D5, 0xE9BD79A2, 0x93D0B0E7, 0x80722890, 0xB4958009, 0xA737187E, 0xFF17C604, 0xECB55E73, 0xD852F6EA, 0xCBF06E9D, 0xB19DA7D8, 0xA23F3FAF, 0x96D89736, 0x857A0F41, 0x620305BC, 0x71A19DCB, 0x45463552, 0x56E4AD25, 0x2C896460, 0x3F2BFC17, 0x0BCC548E, 0x186ECCF9, 0xC0D23785, 0xD370AFF2, 0xE797076B, 0xF4359F1C, 0x8E585659, 0x9DFACE2E, 0xA91D66B7, 0xBABFFEC0, 0x5DC6F43D, 0x4E646C4A, 0x7A83C4D3, 0x69215CA4, 0x134C95E1, 0x00EE0D96, 0x3409A50F, 0x27AB3D78, 0x809C2506, 0x933EBD71, 0xA7D915E8, 0xB47B8D9F, 0xCE1644DA, 0xDDB4DCAD, 0xE9537434, 0xFAF1EC43, 0x1D88E6BE, 0x0E2A7EC9, 0x3ACDD650, 0x296F4E27, 0x53028762, 0x40A01F15, 0x7447B78C, 0x67E52FFB, 0xBF59D487, 0xACFB4CF0, 0x981CE469, 0x8BBE7C1E, 0xF1D3B55B, 0xE2712D2C, 0xD69685B5, 0xC5341DC2, 0x224D173F, 0x31EF8F48, 0x050827D1, 0x16AABFA6, 0x6CC776E3, 0x7F65EE94, 0x4B82460D, 0x5820DE7A, 0xFBC3FAF9, 0xE861628E, 0xDC86CA17, 0xCF245260, 0xB5499B25, 0xA6EB0352, 0x920CABCB, 0x81AE33BC, 0x66D73941, 0x7575A136, 0x419209AF, 0x523091D8, 0x285D589D, 0x3BFFC0EA, 0x0F186873, 0x1CBAF004, 0xC4060B78, 0xD7A4930F, 0xE3433B96, 0xF0E1A3E1, 0x8A8C6AA4, 0x992EF2D3, 0xADC95A4A, 0xBE6BC23D, 0x5912C8C0, 0x4AB050B7, 0x7E57F82E, 0x6DF56059, 0x1798A91C, 0x043A316B, 0x30DD99F2, 0x237F0185, 0x844819FB, 0x97EA818C, 0xA30D2915, 0xB0AFB162, 0xCAC27827, 0xD960E050, 0xED8748C9, 0xFE25D0BE, 0x195CDA43, 0x0AFE4234, 0x3E19EAAD, 0x2DBB72DA, 0x57D6BB9F, 0x447423E8, 0x70938B71, 0x63311306, 0xBB8DE87A, 0xA82F700D, 0x9CC8D894, 0x8F6A40E3, 0xF50789A6, 0xE6A511D1, 0xD242B948, 0xC1E0213F, 0x26992BC2, 0x353BB3B5, 0x01DC1B2C, 0x127E835B, 0x68134A1E, 0x7BB1D269, 0x4F567AF0, 0x5CF4E287, 0x04D43CFD, 0x1776A48A, 0x23910C13, 0x30339464, 0x4A5E5D21, 0x59FCC556, 0x6D1B6DCF, 0x7EB9F5B8, 0x99C0FF45, 0x8A626732, 0xBE85CFAB, 0xAD2757DC, 0xD74A9E99, 0xC4E806EE, 0xF00FAE77, 0xE3AD3600, 0x3B11CD7C, 0x28B3550B, 0x1C54FD92, 0x0FF665E5, 0x759BACA0, 0x663934D7, 0x52DE9C4E, 0x417C0439, 0xA6050EC4, 0xB5A796B3, 0x81403E2A, 0x92E2A65D, 0xE88F6F18, 0xFB2DF76F, 0xCFCA5FF6, 0xDC68C781, 0x7B5FDFFF, 0x68FD4788, 0x5C1AEF11, 0x4FB87766, 0x35D5BE23, 0x26772654, 0x12908ECD, 0x013216BA, 0xE64B1C47, 0xF5E98430, 0xC10E2CA9, 0xD2ACB4DE, 0xA8C17D9B, 0xBB63E5EC, 0x8F844D75, 0x9C26D502, 0x449A2E7E, 0x5738B609, 0x63DF1E90, 0x707D86E7, 0x0A104FA2, 0x19B2D7D5, 0x2D557F4C, 0x3EF7E73B, 0xD98EEDC6, 0xCA2C75B1, 0xFECBDD28, 0xED69455F, 0x97048C1A, 0x84A6146D, 0xB041BCF4, 0xA3E32483 }; /* * end of the CRC lookup table crc_tableil8_o40 */ /* * The following CRC lookup table was generated automagically using the * following model parameters: * * Generator Polynomial = ................. 0x1EDC6F41 * Generator Polynomial Length = .......... 32 bits * Reflected Bits = ....................... TRUE * Table Generation Offset = .............. 32 bits * Number of Slices = ..................... 8 slices * Slice Lengths = ........................ 8 8 8 8 8 8 8 8 * Directory Name = ....................... .\ * File Name = ............................ 8x256_tables.c */ static const uint32_t sctp_crc_tableil8_o48[256] = { 0x00000000, 0xA541927E, 0x4F6F520D, 0xEA2EC073, 0x9EDEA41A, 0x3B9F3664, 0xD1B1F617, 0x74F06469, 0x38513EC5, 0x9D10ACBB, 0x773E6CC8, 0xD27FFEB6, 0xA68F9ADF, 0x03CE08A1, 0xE9E0C8D2, 0x4CA15AAC, 0x70A27D8A, 0xD5E3EFF4, 0x3FCD2F87, 0x9A8CBDF9, 0xEE7CD990, 0x4B3D4BEE, 0xA1138B9D, 0x045219E3, 0x48F3434F, 0xEDB2D131, 0x079C1142, 0xA2DD833C, 0xD62DE755, 0x736C752B, 0x9942B558, 0x3C032726, 0xE144FB14, 0x4405696A, 0xAE2BA919, 0x0B6A3B67, 0x7F9A5F0E, 0xDADBCD70, 0x30F50D03, 0x95B49F7D, 0xD915C5D1, 0x7C5457AF, 0x967A97DC, 0x333B05A2, 0x47CB61CB, 0xE28AF3B5, 0x08A433C6, 0xADE5A1B8, 0x91E6869E, 0x34A714E0, 0xDE89D493, 0x7BC846ED, 0x0F382284, 0xAA79B0FA, 0x40577089, 0xE516E2F7, 0xA9B7B85B, 0x0CF62A25, 0xE6D8EA56, 0x43997828, 0x37691C41, 0x92288E3F, 0x78064E4C, 0xDD47DC32, 0xC76580D9, 0x622412A7, 0x880AD2D4, 0x2D4B40AA, 0x59BB24C3, 0xFCFAB6BD, 0x16D476CE, 0xB395E4B0, 0xFF34BE1C, 0x5A752C62, 0xB05BEC11, 0x151A7E6F, 0x61EA1A06, 0xC4AB8878, 0x2E85480B, 0x8BC4DA75, 0xB7C7FD53, 0x12866F2D, 0xF8A8AF5E, 0x5DE93D20, 0x29195949, 0x8C58CB37, 0x66760B44, 0xC337993A, 0x8F96C396, 0x2AD751E8, 0xC0F9919B, 0x65B803E5, 0x1148678C, 0xB409F5F2, 0x5E273581, 0xFB66A7FF, 0x26217BCD, 0x8360E9B3, 0x694E29C0, 0xCC0FBBBE, 0xB8FFDFD7, 0x1DBE4DA9, 0xF7908DDA, 0x52D11FA4, 0x1E704508, 0xBB31D776, 0x511F1705, 0xF45E857B, 0x80AEE112, 0x25EF736C, 0xCFC1B31F, 0x6A802161, 0x56830647, 0xF3C29439, 0x19EC544A, 0xBCADC634, 0xC85DA25D, 0x6D1C3023, 0x8732F050, 0x2273622E, 0x6ED23882, 0xCB93AAFC, 0x21BD6A8F, 0x84FCF8F1, 0xF00C9C98, 0x554D0EE6, 0xBF63CE95, 0x1A225CEB, 0x8B277743, 0x2E66E53D, 0xC448254E, 0x6109B730, 0x15F9D359, 0xB0B84127, 0x5A968154, 0xFFD7132A, 0xB3764986, 0x1637DBF8, 0xFC191B8B, 0x595889F5, 0x2DA8ED9C, 0x88E97FE2, 0x62C7BF91, 0xC7862DEF, 0xFB850AC9, 0x5EC498B7, 0xB4EA58C4, 0x11ABCABA, 0x655BAED3, 0xC01A3CAD, 0x2A34FCDE, 0x8F756EA0, 0xC3D4340C, 0x6695A672, 0x8CBB6601, 0x29FAF47F, 0x5D0A9016, 0xF84B0268, 0x1265C21B, 0xB7245065, 0x6A638C57, 0xCF221E29, 0x250CDE5A, 0x804D4C24, 0xF4BD284D, 0x51FCBA33, 0xBBD27A40, 0x1E93E83E, 0x5232B292, 0xF77320EC, 0x1D5DE09F, 0xB81C72E1, 0xCCEC1688, 0x69AD84F6, 0x83834485, 0x26C2D6FB, 0x1AC1F1DD, 0xBF8063A3, 0x55AEA3D0, 0xF0EF31AE, 0x841F55C7, 0x215EC7B9, 0xCB7007CA, 0x6E3195B4, 0x2290CF18, 0x87D15D66, 0x6DFF9D15, 0xC8BE0F6B, 0xBC4E6B02, 0x190FF97C, 0xF321390F, 0x5660AB71, 0x4C42F79A, 0xE90365E4, 0x032DA597, 0xA66C37E9, 0xD29C5380, 0x77DDC1FE, 0x9DF3018D, 0x38B293F3, 0x7413C95F, 0xD1525B21, 0x3B7C9B52, 0x9E3D092C, 0xEACD6D45, 0x4F8CFF3B, 0xA5A23F48, 0x00E3AD36, 0x3CE08A10, 0x99A1186E, 0x738FD81D, 0xD6CE4A63, 0xA23E2E0A, 0x077FBC74, 0xED517C07, 0x4810EE79, 0x04B1B4D5, 0xA1F026AB, 0x4BDEE6D8, 0xEE9F74A6, 0x9A6F10CF, 0x3F2E82B1, 0xD50042C2, 0x7041D0BC, 0xAD060C8E, 0x08479EF0, 0xE2695E83, 0x4728CCFD, 0x33D8A894, 0x96993AEA, 0x7CB7FA99, 0xD9F668E7, 0x9557324B, 0x3016A035, 0xDA386046, 0x7F79F238, 0x0B899651, 0xAEC8042F, 0x44E6C45C, 0xE1A75622, 0xDDA47104, 0x78E5E37A, 0x92CB2309, 0x378AB177, 0x437AD51E, 0xE63B4760, 0x0C158713, 0xA954156D, 0xE5F54FC1, 0x40B4DDBF, 0xAA9A1DCC, 0x0FDB8FB2, 0x7B2BEBDB, 0xDE6A79A5, 0x3444B9D6, 0x91052BA8 }; /* * end of the CRC lookup table crc_tableil8_o48 */ /* * The following CRC lookup table was generated automagically using the * following model parameters: * * Generator Polynomial = ................. 0x1EDC6F41 * Generator Polynomial Length = .......... 32 bits * Reflected Bits = ....................... TRUE * Table Generation Offset = .............. 32 bits * Number of Slices = ..................... 8 slices * Slice Lengths = ........................ 8 8 8 8 8 8 8 8 * Directory Name = ....................... .\ * File Name = ............................ 8x256_tables.c */ static const uint32_t sctp_crc_tableil8_o56[256] = { 0x00000000, 0xDD45AAB8, 0xBF672381, 0x62228939, 0x7B2231F3, 0xA6679B4B, 0xC4451272, 0x1900B8CA, 0xF64463E6, 0x2B01C95E, 0x49234067, 0x9466EADF, 0x8D665215, 0x5023F8AD, 0x32017194, 0xEF44DB2C, 0xE964B13D, 0x34211B85, 0x560392BC, 0x8B463804, 0x924680CE, 0x4F032A76, 0x2D21A34F, 0xF06409F7, 0x1F20D2DB, 0xC2657863, 0xA047F15A, 0x7D025BE2, 0x6402E328, 0xB9474990, 0xDB65C0A9, 0x06206A11, 0xD725148B, 0x0A60BE33, 0x6842370A, 0xB5079DB2, 0xAC072578, 0x71428FC0, 0x136006F9, 0xCE25AC41, 0x2161776D, 0xFC24DDD5, 0x9E0654EC, 0x4343FE54, 0x5A43469E, 0x8706EC26, 0xE524651F, 0x3861CFA7, 0x3E41A5B6, 0xE3040F0E, 0x81268637, 0x5C632C8F, 0x45639445, 0x98263EFD, 0xFA04B7C4, 0x27411D7C, 0xC805C650, 0x15406CE8, 0x7762E5D1, 0xAA274F69, 0xB327F7A3, 0x6E625D1B, 0x0C40D422, 0xD1057E9A, 0xABA65FE7, 0x76E3F55F, 0x14C17C66, 0xC984D6DE, 0xD0846E14, 0x0DC1C4AC, 0x6FE34D95, 0xB2A6E72D, 0x5DE23C01, 0x80A796B9, 0xE2851F80, 0x3FC0B538, 0x26C00DF2, 0xFB85A74A, 0x99A72E73, 0x44E284CB, 0x42C2EEDA, 0x9F874462, 0xFDA5CD5B, 0x20E067E3, 0x39E0DF29, 0xE4A57591, 0x8687FCA8, 0x5BC25610, 0xB4868D3C, 0x69C32784, 0x0BE1AEBD, 0xD6A40405, 0xCFA4BCCF, 0x12E11677, 0x70C39F4E, 0xAD8635F6, 0x7C834B6C, 0xA1C6E1D4, 0xC3E468ED, 0x1EA1C255, 0x07A17A9F, 0xDAE4D027, 0xB8C6591E, 0x6583F3A6, 0x8AC7288A, 0x57828232, 0x35A00B0B, 0xE8E5A1B3, 0xF1E51979, 0x2CA0B3C1, 0x4E823AF8, 0x93C79040, 0x95E7FA51, 0x48A250E9, 0x2A80D9D0, 0xF7C57368, 0xEEC5CBA2, 0x3380611A, 0x51A2E823, 0x8CE7429B, 0x63A399B7, 0xBEE6330F, 0xDCC4BA36, 0x0181108E, 0x1881A844, 0xC5C402FC, 0xA7E68BC5, 0x7AA3217D, 0x52A0C93F, 0x8FE56387, 0xEDC7EABE, 0x30824006, 0x2982F8CC, 0xF4C75274, 0x96E5DB4D, 0x4BA071F5, 0xA4E4AAD9, 0x79A10061, 0x1B838958, 0xC6C623E0, 0xDFC69B2A, 0x02833192, 0x60A1B8AB, 0xBDE41213, 0xBBC47802, 0x6681D2BA, 0x04A35B83, 0xD9E6F13B, 0xC0E649F1, 0x1DA3E349, 0x7F816A70, 0xA2C4C0C8, 0x4D801BE4, 0x90C5B15C, 0xF2E73865, 0x2FA292DD, 0x36A22A17, 0xEBE780AF, 0x89C50996, 0x5480A32E, 0x8585DDB4, 0x58C0770C, 0x3AE2FE35, 0xE7A7548D, 0xFEA7EC47, 0x23E246FF, 0x41C0CFC6, 0x9C85657E, 0x73C1BE52, 0xAE8414EA, 0xCCA69DD3, 0x11E3376B, 0x08E38FA1, 0xD5A62519, 0xB784AC20, 0x6AC10698, 0x6CE16C89, 0xB1A4C631, 0xD3864F08, 0x0EC3E5B0, 0x17C35D7A, 0xCA86F7C2, 0xA8A47EFB, 0x75E1D443, 0x9AA50F6F, 0x47E0A5D7, 0x25C22CEE, 0xF8878656, 0xE1873E9C, 0x3CC29424, 0x5EE01D1D, 0x83A5B7A5, 0xF90696D8, 0x24433C60, 0x4661B559, 0x9B241FE1, 0x8224A72B, 0x5F610D93, 0x3D4384AA, 0xE0062E12, 0x0F42F53E, 0xD2075F86, 0xB025D6BF, 0x6D607C07, 0x7460C4CD, 0xA9256E75, 0xCB07E74C, 0x16424DF4, 0x106227E5, 0xCD278D5D, 0xAF050464, 0x7240AEDC, 0x6B401616, 0xB605BCAE, 0xD4273597, 0x09629F2F, 0xE6264403, 0x3B63EEBB, 0x59416782, 0x8404CD3A, 0x9D0475F0, 0x4041DF48, 0x22635671, 0xFF26FCC9, 0x2E238253, 0xF36628EB, 0x9144A1D2, 0x4C010B6A, 0x5501B3A0, 0x88441918, 0xEA669021, 0x37233A99, 0xD867E1B5, 0x05224B0D, 0x6700C234, 0xBA45688C, 0xA345D046, 0x7E007AFE, 0x1C22F3C7, 0xC167597F, 0xC747336E, 0x1A0299D6, 0x782010EF, 0xA565BA57, 0xBC65029D, 0x6120A825, 0x0302211C, 0xDE478BA4, 0x31035088, 0xEC46FA30, 0x8E647309, 0x5321D9B1, 0x4A21617B, 0x9764CBC3, 0xF54642FA, 0x2803E842 }; /* * end of the CRC lookup table crc_tableil8_o56 */ /* * The following CRC lookup table was generated automagically using the * following model parameters: * * Generator Polynomial = ................. 0x1EDC6F41 * Generator Polynomial Length = .......... 32 bits * Reflected Bits = ....................... TRUE * Table Generation Offset = .............. 32 bits * Number of Slices = ..................... 8 slices * Slice Lengths = ........................ 8 8 8 8 8 8 8 8 * Directory Name = ....................... .\ * File Name = ............................ 8x256_tables.c */ static const uint32_t sctp_crc_tableil8_o64[256] = { 0x00000000, 0x38116FAC, 0x7022DF58, 0x4833B0F4, 0xE045BEB0, 0xD854D11C, 0x906761E8, 0xA8760E44, 0xC5670B91, 0xFD76643D, 0xB545D4C9, 0x8D54BB65, 0x2522B521, 0x1D33DA8D, 0x55006A79, 0x6D1105D5, 0x8F2261D3, 0xB7330E7F, 0xFF00BE8B, 0xC711D127, 0x6F67DF63, 0x5776B0CF, 0x1F45003B, 0x27546F97, 0x4A456A42, 0x725405EE, 0x3A67B51A, 0x0276DAB6, 0xAA00D4F2, 0x9211BB5E, 0xDA220BAA, 0xE2336406, 0x1BA8B557, 0x23B9DAFB, 0x6B8A6A0F, 0x539B05A3, 0xFBED0BE7, 0xC3FC644B, 0x8BCFD4BF, 0xB3DEBB13, 0xDECFBEC6, 0xE6DED16A, 0xAEED619E, 0x96FC0E32, 0x3E8A0076, 0x069B6FDA, 0x4EA8DF2E, 0x76B9B082, 0x948AD484, 0xAC9BBB28, 0xE4A80BDC, 0xDCB96470, 0x74CF6A34, 0x4CDE0598, 0x04EDB56C, 0x3CFCDAC0, 0x51EDDF15, 0x69FCB0B9, 0x21CF004D, 0x19DE6FE1, 0xB1A861A5, 0x89B90E09, 0xC18ABEFD, 0xF99BD151, 0x37516AAE, 0x0F400502, 0x4773B5F6, 0x7F62DA5A, 0xD714D41E, 0xEF05BBB2, 0xA7360B46, 0x9F2764EA, 0xF236613F, 0xCA270E93, 0x8214BE67, 0xBA05D1CB, 0x1273DF8F, 0x2A62B023, 0x625100D7, 0x5A406F7B, 0xB8730B7D, 0x806264D1, 0xC851D425, 0xF040BB89, 0x5836B5CD, 0x6027DA61, 0x28146A95, 0x10050539, 0x7D1400EC, 0x45056F40, 0x0D36DFB4, 0x3527B018, 0x9D51BE5C, 0xA540D1F0, 0xED736104, 0xD5620EA8, 0x2CF9DFF9, 0x14E8B055, 0x5CDB00A1, 0x64CA6F0D, 0xCCBC6149, 0xF4AD0EE5, 0xBC9EBE11, 0x848FD1BD, 0xE99ED468, 0xD18FBBC4, 0x99BC0B30, 0xA1AD649C, 0x09DB6AD8, 0x31CA0574, 0x79F9B580, 0x41E8DA2C, 0xA3DBBE2A, 0x9BCAD186, 0xD3F96172, 0xEBE80EDE, 0x439E009A, 0x7B8F6F36, 0x33BCDFC2, 0x0BADB06E, 0x66BCB5BB, 0x5EADDA17, 0x169E6AE3, 0x2E8F054F, 0x86F90B0B, 0xBEE864A7, 0xF6DBD453, 0xCECABBFF, 0x6EA2D55C, 0x56B3BAF0, 0x1E800A04, 0x269165A8, 0x8EE76BEC, 0xB6F60440, 0xFEC5B4B4, 0xC6D4DB18, 0xABC5DECD, 0x93D4B161, 0xDBE70195, 0xE3F66E39, 0x4B80607D, 0x73910FD1, 0x3BA2BF25, 0x03B3D089, 0xE180B48F, 0xD991DB23, 0x91A26BD7, 0xA9B3047B, 0x01C50A3F, 0x39D46593, 0x71E7D567, 0x49F6BACB, 0x24E7BF1E, 0x1CF6D0B2, 0x54C56046, 0x6CD40FEA, 0xC4A201AE, 0xFCB36E02, 0xB480DEF6, 0x8C91B15A, 0x750A600B, 0x4D1B0FA7, 0x0528BF53, 0x3D39D0FF, 0x954FDEBB, 0xAD5EB117, 0xE56D01E3, 0xDD7C6E4F, 0xB06D6B9A, 0x887C0436, 0xC04FB4C2, 0xF85EDB6E, 0x5028D52A, 0x6839BA86, 0x200A0A72, 0x181B65DE, 0xFA2801D8, 0xC2396E74, 0x8A0ADE80, 0xB21BB12C, 0x1A6DBF68, 0x227CD0C4, 0x6A4F6030, 0x525E0F9C, 0x3F4F0A49, 0x075E65E5, 0x4F6DD511, 0x777CBABD, 0xDF0AB4F9, 0xE71BDB55, 0xAF286BA1, 0x9739040D, 0x59F3BFF2, 0x61E2D05E, 0x29D160AA, 0x11C00F06, 0xB9B60142, 0x81A76EEE, 0xC994DE1A, 0xF185B1B6, 0x9C94B463, 0xA485DBCF, 0xECB66B3B, 0xD4A70497, 0x7CD10AD3, 0x44C0657F, 0x0CF3D58B, 0x34E2BA27, 0xD6D1DE21, 0xEEC0B18D, 0xA6F30179, 0x9EE26ED5, 0x36946091, 0x0E850F3D, 0x46B6BFC9, 0x7EA7D065, 0x13B6D5B0, 0x2BA7BA1C, 0x63940AE8, 0x5B856544, 0xF3F36B00, 0xCBE204AC, 0x83D1B458, 0xBBC0DBF4, 0x425B0AA5, 0x7A4A6509, 0x3279D5FD, 0x0A68BA51, 0xA21EB415, 0x9A0FDBB9, 0xD23C6B4D, 0xEA2D04E1, 0x873C0134, 0xBF2D6E98, 0xF71EDE6C, 0xCF0FB1C0, 0x6779BF84, 0x5F68D028, 0x175B60DC, 0x2F4A0F70, 0xCD796B76, 0xF56804DA, 0xBD5BB42E, 0x854ADB82, 0x2D3CD5C6, 0x152DBA6A, 0x5D1E0A9E, 0x650F6532, 0x081E60E7, 0x300F0F4B, 0x783CBFBF, 0x402DD013, 0xE85BDE57, 0xD04AB1FB, 0x9879010F, 0xA0686EA3 }; /* * end of the CRC lookup table crc_tableil8_o64 */ /* * The following CRC lookup table was generated automagically using the * following model parameters: * * Generator Polynomial = ................. 0x1EDC6F41 * Generator Polynomial Length = .......... 32 bits * Reflected Bits = ....................... TRUE * Table Generation Offset = .............. 32 bits * Number of Slices = ..................... 8 slices * Slice Lengths = ........................ 8 8 8 8 8 8 8 8 * Directory Name = ....................... .\ * File Name = ............................ 8x256_tables.c */ static const uint32_t sctp_crc_tableil8_o72[256] = { 0x00000000, 0xEF306B19, 0xDB8CA0C3, 0x34BCCBDA, 0xB2F53777, 0x5DC55C6E, 0x697997B4, 0x8649FCAD, 0x6006181F, 0x8F367306, 0xBB8AB8DC, 0x54BAD3C5, 0xD2F32F68, 0x3DC34471, 0x097F8FAB, 0xE64FE4B2, 0xC00C303E, 0x2F3C5B27, 0x1B8090FD, 0xF4B0FBE4, 0x72F90749, 0x9DC96C50, 0xA975A78A, 0x4645CC93, 0xA00A2821, 0x4F3A4338, 0x7B8688E2, 0x94B6E3FB, 0x12FF1F56, 0xFDCF744F, 0xC973BF95, 0x2643D48C, 0x85F4168D, 0x6AC47D94, 0x5E78B64E, 0xB148DD57, 0x370121FA, 0xD8314AE3, 0xEC8D8139, 0x03BDEA20, 0xE5F20E92, 0x0AC2658B, 0x3E7EAE51, 0xD14EC548, 0x570739E5, 0xB83752FC, 0x8C8B9926, 0x63BBF23F, 0x45F826B3, 0xAAC84DAA, 0x9E748670, 0x7144ED69, 0xF70D11C4, 0x183D7ADD, 0x2C81B107, 0xC3B1DA1E, 0x25FE3EAC, 0xCACE55B5, 0xFE729E6F, 0x1142F576, 0x970B09DB, 0x783B62C2, 0x4C87A918, 0xA3B7C201, 0x0E045BEB, 0xE13430F2, 0xD588FB28, 0x3AB89031, 0xBCF16C9C, 0x53C10785, 0x677DCC5F, 0x884DA746, 0x6E0243F4, 0x813228ED, 0xB58EE337, 0x5ABE882E, 0xDCF77483, 0x33C71F9A, 0x077BD440, 0xE84BBF59, 0xCE086BD5, 0x213800CC, 0x1584CB16, 0xFAB4A00F, 0x7CFD5CA2, 0x93CD37BB, 0xA771FC61, 0x48419778, 0xAE0E73CA, 0x413E18D3, 0x7582D309, 0x9AB2B810, 0x1CFB44BD, 0xF3CB2FA4, 0xC777E47E, 0x28478F67, 0x8BF04D66, 0x64C0267F, 0x507CEDA5, 0xBF4C86BC, 0x39057A11, 0xD6351108, 0xE289DAD2, 0x0DB9B1CB, 0xEBF65579, 0x04C63E60, 0x307AF5BA, 0xDF4A9EA3, 0x5903620E, 0xB6330917, 0x828FC2CD, 0x6DBFA9D4, 0x4BFC7D58, 0xA4CC1641, 0x9070DD9B, 0x7F40B682, 0xF9094A2F, 0x16392136, 0x2285EAEC, 0xCDB581F5, 0x2BFA6547, 0xC4CA0E5E, 0xF076C584, 0x1F46AE9D, 0x990F5230, 0x763F3929, 0x4283F2F3, 0xADB399EA, 0x1C08B7D6, 0xF338DCCF, 0xC7841715, 0x28B47C0C, 0xAEFD80A1, 0x41CDEBB8, 0x75712062, 0x9A414B7B, 0x7C0EAFC9, 0x933EC4D0, 0xA7820F0A, 0x48B26413, 0xCEFB98BE, 0x21CBF3A7, 0x1577387D, 0xFA475364, 0xDC0487E8, 0x3334ECF1, 0x0788272B, 0xE8B84C32, 0x6EF1B09F, 0x81C1DB86, 0xB57D105C, 0x5A4D7B45, 0xBC029FF7, 0x5332F4EE, 0x678E3F34, 0x88BE542D, 0x0EF7A880, 0xE1C7C399, 0xD57B0843, 0x3A4B635A, 0x99FCA15B, 0x76CCCA42, 0x42700198, 0xAD406A81, 0x2B09962C, 0xC439FD35, 0xF08536EF, 0x1FB55DF6, 0xF9FAB944, 0x16CAD25D, 0x22761987, 0xCD46729E, 0x4B0F8E33, 0xA43FE52A, 0x90832EF0, 0x7FB345E9, 0x59F09165, 0xB6C0FA7C, 0x827C31A6, 0x6D4C5ABF, 0xEB05A612, 0x0435CD0B, 0x308906D1, 0xDFB96DC8, 0x39F6897A, 0xD6C6E263, 0xE27A29B9, 0x0D4A42A0, 0x8B03BE0D, 0x6433D514, 0x508F1ECE, 0xBFBF75D7, 0x120CEC3D, 0xFD3C8724, 0xC9804CFE, 0x26B027E7, 0xA0F9DB4A, 0x4FC9B053, 0x7B757B89, 0x94451090, 0x720AF422, 0x9D3A9F3B, 0xA98654E1, 0x46B63FF8, 0xC0FFC355, 0x2FCFA84C, 0x1B736396, 0xF443088F, 0xD200DC03, 0x3D30B71A, 0x098C7CC0, 0xE6BC17D9, 0x60F5EB74, 0x8FC5806D, 0xBB794BB7, 0x544920AE, 0xB206C41C, 0x5D36AF05, 0x698A64DF, 0x86BA0FC6, 0x00F3F36B, 0xEFC39872, 0xDB7F53A8, 0x344F38B1, 0x97F8FAB0, 0x78C891A9, 0x4C745A73, 0xA344316A, 0x250DCDC7, 0xCA3DA6DE, 0xFE816D04, 0x11B1061D, 0xF7FEE2AF, 0x18CE89B6, 0x2C72426C, 0xC3422975, 0x450BD5D8, 0xAA3BBEC1, 0x9E87751B, 0x71B71E02, 0x57F4CA8E, 0xB8C4A197, 0x8C786A4D, 0x63480154, 0xE501FDF9, 0x0A3196E0, 0x3E8D5D3A, 0xD1BD3623, 0x37F2D291, 0xD8C2B988, 0xEC7E7252, 0x034E194B, 0x8507E5E6, 0x6A378EFF, 0x5E8B4525, 0xB1BB2E3C }; /* * end of the CRC lookup table crc_tableil8_o72 */ /* * The following CRC lookup table was generated automagically using the * following model parameters: * * Generator Polynomial = ................. 0x1EDC6F41 * Generator Polynomial Length = .......... 32 bits * Reflected Bits = ....................... TRUE * Table Generation Offset = .............. 32 bits * Number of Slices = ..................... 8 slices * Slice Lengths = ........................ 8 8 8 8 8 8 8 8 * Directory Name = ....................... .\ * File Name = ............................ 8x256_tables.c */ static const uint32_t sctp_crc_tableil8_o80[256] = { 0x00000000, 0x68032CC8, 0xD0065990, 0xB8057558, 0xA5E0C5D1, 0xCDE3E919, 0x75E69C41, 0x1DE5B089, 0x4E2DFD53, 0x262ED19B, 0x9E2BA4C3, 0xF628880B, 0xEBCD3882, 0x83CE144A, 0x3BCB6112, 0x53C84DDA, 0x9C5BFAA6, 0xF458D66E, 0x4C5DA336, 0x245E8FFE, 0x39BB3F77, 0x51B813BF, 0xE9BD66E7, 0x81BE4A2F, 0xD27607F5, 0xBA752B3D, 0x02705E65, 0x6A7372AD, 0x7796C224, 0x1F95EEEC, 0xA7909BB4, 0xCF93B77C, 0x3D5B83BD, 0x5558AF75, 0xED5DDA2D, 0x855EF6E5, 0x98BB466C, 0xF0B86AA4, 0x48BD1FFC, 0x20BE3334, 0x73767EEE, 0x1B755226, 0xA370277E, 0xCB730BB6, 0xD696BB3F, 0xBE9597F7, 0x0690E2AF, 0x6E93CE67, 0xA100791B, 0xC90355D3, 0x7106208B, 0x19050C43, 0x04E0BCCA, 0x6CE39002, 0xD4E6E55A, 0xBCE5C992, 0xEF2D8448, 0x872EA880, 0x3F2BDDD8, 0x5728F110, 0x4ACD4199, 0x22CE6D51, 0x9ACB1809, 0xF2C834C1, 0x7AB7077A, 0x12B42BB2, 0xAAB15EEA, 0xC2B27222, 0xDF57C2AB, 0xB754EE63, 0x0F519B3B, 0x6752B7F3, 0x349AFA29, 0x5C99D6E1, 0xE49CA3B9, 0x8C9F8F71, 0x917A3FF8, 0xF9791330, 0x417C6668, 0x297F4AA0, 0xE6ECFDDC, 0x8EEFD114, 0x36EAA44C, 0x5EE98884, 0x430C380D, 0x2B0F14C5, 0x930A619D, 0xFB094D55, 0xA8C1008F, 0xC0C22C47, 0x78C7591F, 0x10C475D7, 0x0D21C55E, 0x6522E996, 0xDD279CCE, 0xB524B006, 0x47EC84C7, 0x2FEFA80F, 0x97EADD57, 0xFFE9F19F, 0xE20C4116, 0x8A0F6DDE, 0x320A1886, 0x5A09344E, 0x09C17994, 0x61C2555C, 0xD9C72004, 0xB1C40CCC, 0xAC21BC45, 0xC422908D, 0x7C27E5D5, 0x1424C91D, 0xDBB77E61, 0xB3B452A9, 0x0BB127F1, 0x63B20B39, 0x7E57BBB0, 0x16549778, 0xAE51E220, 0xC652CEE8, 0x959A8332, 0xFD99AFFA, 0x459CDAA2, 0x2D9FF66A, 0x307A46E3, 0x58796A2B, 0xE07C1F73, 0x887F33BB, 0xF56E0EF4, 0x9D6D223C, 0x25685764, 0x4D6B7BAC, 0x508ECB25, 0x388DE7ED, 0x808892B5, 0xE88BBE7D, 0xBB43F3A7, 0xD340DF6F, 0x6B45AA37, 0x034686FF, 0x1EA33676, 0x76A01ABE, 0xCEA56FE6, 0xA6A6432E, 0x6935F452, 0x0136D89A, 0xB933ADC2, 0xD130810A, 0xCCD53183, 0xA4D61D4B, 0x1CD36813, 0x74D044DB, 0x27180901, 0x4F1B25C9, 0xF71E5091, 0x9F1D7C59, 0x82F8CCD0, 0xEAFBE018, 0x52FE9540, 0x3AFDB988, 0xC8358D49, 0xA036A181, 0x1833D4D9, 0x7030F811, 0x6DD54898, 0x05D66450, 0xBDD31108, 0xD5D03DC0, 0x8618701A, 0xEE1B5CD2, 0x561E298A, 0x3E1D0542, 0x23F8B5CB, 0x4BFB9903, 0xF3FEEC5B, 0x9BFDC093, 0x546E77EF, 0x3C6D5B27, 0x84682E7F, 0xEC6B02B7, 0xF18EB23E, 0x998D9EF6, 0x2188EBAE, 0x498BC766, 0x1A438ABC, 0x7240A674, 0xCA45D32C, 0xA246FFE4, 0xBFA34F6D, 0xD7A063A5, 0x6FA516FD, 0x07A63A35, 0x8FD9098E, 0xE7DA2546, 0x5FDF501E, 0x37DC7CD6, 0x2A39CC5F, 0x423AE097, 0xFA3F95CF, 0x923CB907, 0xC1F4F4DD, 0xA9F7D815, 0x11F2AD4D, 0x79F18185, 0x6414310C, 0x0C171DC4, 0xB412689C, 0xDC114454, 0x1382F328, 0x7B81DFE0, 0xC384AAB8, 0xAB878670, 0xB66236F9, 0xDE611A31, 0x66646F69, 0x0E6743A1, 0x5DAF0E7B, 0x35AC22B3, 0x8DA957EB, 0xE5AA7B23, 0xF84FCBAA, 0x904CE762, 0x2849923A, 0x404ABEF2, 0xB2828A33, 0xDA81A6FB, 0x6284D3A3, 0x0A87FF6B, 0x17624FE2, 0x7F61632A, 0xC7641672, 0xAF673ABA, 0xFCAF7760, 0x94AC5BA8, 0x2CA92EF0, 0x44AA0238, 0x594FB2B1, 0x314C9E79, 0x8949EB21, 0xE14AC7E9, 0x2ED97095, 0x46DA5C5D, 0xFEDF2905, 0x96DC05CD, 0x8B39B544, 0xE33A998C, 0x5B3FECD4, 0x333CC01C, 0x60F48DC6, 0x08F7A10E, 0xB0F2D456, 0xD8F1F89E, 0xC5144817, 0xAD1764DF, 0x15121187, 0x7D113D4F }; /* * end of the CRC lookup table crc_tableil8_o80 */ /* * The following CRC lookup table was generated automagically using the * following model parameters: * * Generator Polynomial = ................. 0x1EDC6F41 * Generator Polynomial Length = .......... 32 bits * Reflected Bits = ....................... TRUE * Table Generation Offset = .............. 32 bits * Number of Slices = ..................... 8 slices * Slice Lengths = ........................ 8 8 8 8 8 8 8 8 * Directory Name = ....................... .\ * File Name = ............................ 8x256_tables.c */ static const uint32_t sctp_crc_tableil8_o88[256] = { 0x00000000, 0x493C7D27, 0x9278FA4E, 0xDB448769, 0x211D826D, 0x6821FF4A, 0xB3657823, 0xFA590504, 0x423B04DA, 0x0B0779FD, 0xD043FE94, 0x997F83B3, 0x632686B7, 0x2A1AFB90, 0xF15E7CF9, 0xB86201DE, 0x847609B4, 0xCD4A7493, 0x160EF3FA, 0x5F328EDD, 0xA56B8BD9, 0xEC57F6FE, 0x37137197, 0x7E2F0CB0, 0xC64D0D6E, 0x8F717049, 0x5435F720, 0x1D098A07, 0xE7508F03, 0xAE6CF224, 0x7528754D, 0x3C14086A, 0x0D006599, 0x443C18BE, 0x9F789FD7, 0xD644E2F0, 0x2C1DE7F4, 0x65219AD3, 0xBE651DBA, 0xF759609D, 0x4F3B6143, 0x06071C64, 0xDD439B0D, 0x947FE62A, 0x6E26E32E, 0x271A9E09, 0xFC5E1960, 0xB5626447, 0x89766C2D, 0xC04A110A, 0x1B0E9663, 0x5232EB44, 0xA86BEE40, 0xE1579367, 0x3A13140E, 0x732F6929, 0xCB4D68F7, 0x827115D0, 0x593592B9, 0x1009EF9E, 0xEA50EA9A, 0xA36C97BD, 0x782810D4, 0x31146DF3, 0x1A00CB32, 0x533CB615, 0x8878317C, 0xC1444C5B, 0x3B1D495F, 0x72213478, 0xA965B311, 0xE059CE36, 0x583BCFE8, 0x1107B2CF, 0xCA4335A6, 0x837F4881, 0x79264D85, 0x301A30A2, 0xEB5EB7CB, 0xA262CAEC, 0x9E76C286, 0xD74ABFA1, 0x0C0E38C8, 0x453245EF, 0xBF6B40EB, 0xF6573DCC, 0x2D13BAA5, 0x642FC782, 0xDC4DC65C, 0x9571BB7B, 0x4E353C12, 0x07094135, 0xFD504431, 0xB46C3916, 0x6F28BE7F, 0x2614C358, 0x1700AEAB, 0x5E3CD38C, 0x857854E5, 0xCC4429C2, 0x361D2CC6, 0x7F2151E1, 0xA465D688, 0xED59ABAF, 0x553BAA71, 0x1C07D756, 0xC743503F, 0x8E7F2D18, 0x7426281C, 0x3D1A553B, 0xE65ED252, 0xAF62AF75, 0x9376A71F, 0xDA4ADA38, 0x010E5D51, 0x48322076, 0xB26B2572, 0xFB575855, 0x2013DF3C, 0x692FA21B, 0xD14DA3C5, 0x9871DEE2, 0x4335598B, 0x0A0924AC, 0xF05021A8, 0xB96C5C8F, 0x6228DBE6, 0x2B14A6C1, 0x34019664, 0x7D3DEB43, 0xA6796C2A, 0xEF45110D, 0x151C1409, 0x5C20692E, 0x8764EE47, 0xCE589360, 0x763A92BE, 0x3F06EF99, 0xE44268F0, 0xAD7E15D7, 0x572710D3, 0x1E1B6DF4, 0xC55FEA9D, 0x8C6397BA, 0xB0779FD0, 0xF94BE2F7, 0x220F659E, 0x6B3318B9, 0x916A1DBD, 0xD856609A, 0x0312E7F3, 0x4A2E9AD4, 0xF24C9B0A, 0xBB70E62D, 0x60346144, 0x29081C63, 0xD3511967, 0x9A6D6440, 0x4129E329, 0x08159E0E, 0x3901F3FD, 0x703D8EDA, 0xAB7909B3, 0xE2457494, 0x181C7190, 0x51200CB7, 0x8A648BDE, 0xC358F6F9, 0x7B3AF727, 0x32068A00, 0xE9420D69, 0xA07E704E, 0x5A27754A, 0x131B086D, 0xC85F8F04, 0x8163F223, 0xBD77FA49, 0xF44B876E, 0x2F0F0007, 0x66337D20, 0x9C6A7824, 0xD5560503, 0x0E12826A, 0x472EFF4D, 0xFF4CFE93, 0xB67083B4, 0x6D3404DD, 0x240879FA, 0xDE517CFE, 0x976D01D9, 0x4C2986B0, 0x0515FB97, 0x2E015D56, 0x673D2071, 0xBC79A718, 0xF545DA3F, 0x0F1CDF3B, 0x4620A21C, 0x9D642575, 0xD4585852, 0x6C3A598C, 0x250624AB, 0xFE42A3C2, 0xB77EDEE5, 0x4D27DBE1, 0x041BA6C6, 0xDF5F21AF, 0x96635C88, 0xAA7754E2, 0xE34B29C5, 0x380FAEAC, 0x7133D38B, 0x8B6AD68F, 0xC256ABA8, 0x19122CC1, 0x502E51E6, 0xE84C5038, 0xA1702D1F, 0x7A34AA76, 0x3308D751, 0xC951D255, 0x806DAF72, 0x5B29281B, 0x1215553C, 0x230138CF, 0x6A3D45E8, 0xB179C281, 0xF845BFA6, 0x021CBAA2, 0x4B20C785, 0x906440EC, 0xD9583DCB, 0x613A3C15, 0x28064132, 0xF342C65B, 0xBA7EBB7C, 0x4027BE78, 0x091BC35F, 0xD25F4436, 0x9B633911, 0xA777317B, 0xEE4B4C5C, 0x350FCB35, 0x7C33B612, 0x866AB316, 0xCF56CE31, 0x14124958, 0x5D2E347F, 0xE54C35A1, 0xAC704886, 0x7734CFEF, 0x3E08B2C8, 0xC451B7CC, 0x8D6DCAEB, 0x56294D82, 0x1F1530A5 }; /* * end of the CRC lookup table crc_tableil8_o88 */ static uint32_t crc32c_sb8_64_bit(uint32_t crc, const unsigned char *p_buf, uint32_t length, uint32_t init_bytes) { uint32_t li; uint32_t term1, term2; uint32_t running_length; uint32_t end_bytes; running_length = rounddown(length - init_bytes, 8); end_bytes = length - init_bytes - running_length; for (li = 0; li < init_bytes; li++) crc = sctp_crc_tableil8_o32[(crc ^ *p_buf++) & 0x000000FF] ^ (crc >> 8); for (li = 0; li < running_length / 8; li++) { #if BYTE_ORDER == BIG_ENDIAN crc ^= *p_buf++; crc ^= (*p_buf++) << 8; crc ^= (*p_buf++) << 16; crc ^= (*p_buf++) << 24; #else crc ^= *(const uint32_t *) p_buf; p_buf += 4; #endif term1 = sctp_crc_tableil8_o88[crc & 0x000000FF] ^ sctp_crc_tableil8_o80[(crc >> 8) & 0x000000FF]; term2 = crc >> 16; crc = term1 ^ sctp_crc_tableil8_o72[term2 & 0x000000FF] ^ sctp_crc_tableil8_o64[(term2 >> 8) & 0x000000FF]; #if BYTE_ORDER == BIG_ENDIAN crc ^= sctp_crc_tableil8_o56[*p_buf++]; crc ^= sctp_crc_tableil8_o48[*p_buf++]; crc ^= sctp_crc_tableil8_o40[*p_buf++]; crc ^= sctp_crc_tableil8_o32[*p_buf++]; #else term1 = sctp_crc_tableil8_o56[(*(const uint32_t *) p_buf) & 0x000000FF] ^ sctp_crc_tableil8_o48[((*(const uint32_t *) p_buf) >> 8) & 0x000000FF]; term2 = (*(const uint32_t *) p_buf) >> 16; crc = crc ^ term1 ^ sctp_crc_tableil8_o40[term2 & 0x000000FF] ^ sctp_crc_tableil8_o32[(term2 >> 8) & 0x000000FF]; p_buf += 4; #endif } for (li = 0; li < end_bytes; li++) crc = sctp_crc_tableil8_o32[(crc ^ *p_buf++) & 0x000000FF] ^ (crc >> 8); return crc; } static uint32_t multitable_crc32c(uint32_t crc32c, const unsigned char *buffer, unsigned int length) { uint32_t to_even_word; if (length == 0) { return (crc32c); } to_even_word = (4 - (((uintptr_t) buffer) & 0x3)); return (crc32c_sb8_64_bit(crc32c, buffer, length, to_even_word)); } uint32_t calculate_crc32c(uint32_t crc32c, const unsigned char *buffer, unsigned int length) { +#ifdef _KERNEL #if defined(__amd64__) || defined(__i386__) if ((cpu_feature2 & CPUID2_SSE42) != 0) { return (sse42_crc32c(crc32c, buffer, length)); } else #endif #if defined(__aarch64__) uint64_t reg; /* * We only test for CRC32 support on the CPU with index 0 assuming that * this applies to all CPUs. */ reg = READ_SPECIALREG(id_aa64isar0_el1); if (ID_AA64ISAR0_CRC32(reg) != ID_AA64ISAR0_CRC32_NONE) { return (armv8_crc32c(crc32c, buffer, length)); } else #endif +#endif /* _KERNEL */ if (length < 4) { return (singletable_crc32c(crc32c, buffer, length)); } else { return (multitable_crc32c(crc32c, buffer, length)); } } Index: head/sys/sys/buf.h =================================================================== --- head/sys/sys/buf.h (revision 323922) +++ head/sys/sys/buf.h (revision 323923) @@ -1,572 +1,580 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)buf.h 8.9 (Berkeley) 3/30/95 * $FreeBSD$ */ #ifndef _SYS_BUF_H_ #define _SYS_BUF_H_ #include #include #include #include struct bio; struct buf; struct bufobj; struct mount; struct vnode; struct uio; /* * To avoid including */ LIST_HEAD(workhead, worklist); /* * These are currently used only by the soft dependency code, hence * are stored once in a global variable. If other subsystems wanted * to use these hooks, a pointer to a set of bio_ops could be added * to each buffer. */ extern struct bio_ops { void (*io_start)(struct buf *); void (*io_complete)(struct buf *); void (*io_deallocate)(struct buf *); int (*io_countdeps)(struct buf *, int); } bioops; struct vm_object; struct vm_page; -typedef unsigned char b_xflags_t; +typedef uint32_t b_xflags_t; /* * The buffer header describes an I/O operation in the kernel. * * NOTES: * b_bufsize, b_bcount. b_bufsize is the allocation size of the * buffer, either DEV_BSIZE or PAGE_SIZE aligned. b_bcount is the * originally requested buffer size and can serve as a bounds check * against EOF. For most, but not all uses, b_bcount == b_bufsize. * * b_dirtyoff, b_dirtyend. Buffers support piecemeal, unaligned * ranges of dirty data that need to be written to backing store. * The range is typically clipped at b_bcount ( not b_bufsize ). * * b_resid. Number of bytes remaining in I/O. After an I/O operation * completes, b_resid is usually 0 indicating 100% success. * * All fields are protected by the buffer lock except those marked: * V - Protected by owning bufobj lock * Q - Protected by the buf queue lock * D - Protected by an dependency implementation specific lock */ struct buf { struct bufobj *b_bufobj; long b_bcount; void *b_caller1; caddr_t b_data; int b_error; uint16_t b_iocmd; /* BIO_* bio_cmd from bio.h */ uint16_t b_ioflags; /* BIO_* bio_flags from bio.h */ off_t b_iooffset; long b_resid; void (*b_iodone)(struct buf *); + void (*b_ckhashcalc)(struct buf *); + uint64_t b_ckhash; /* B_CKHASH requested check-hash */ daddr_t b_blkno; /* Underlying physical block number. */ off_t b_offset; /* Offset into file. */ TAILQ_ENTRY(buf) b_bobufs; /* (V) Buffer's associated vnode. */ uint32_t b_vflags; /* (V) BV_* flags */ unsigned short b_qindex; /* (Q) buffer queue index */ uint32_t b_flags; /* B_* flags. */ b_xflags_t b_xflags; /* extra flags */ struct lock b_lock; /* Buffer lock */ long b_bufsize; /* Allocated buffer size. */ int b_runningbufspace; /* when I/O is running, pipelining */ int b_kvasize; /* size of kva for buffer */ int b_dirtyoff; /* Offset in buffer of dirty region. */ int b_dirtyend; /* Offset of end of dirty region. */ caddr_t b_kvabase; /* base kva for buffer */ daddr_t b_lblkno; /* Logical block number. */ struct vnode *b_vp; /* Device vnode. */ struct ucred *b_rcred; /* Read credentials reference. */ struct ucred *b_wcred; /* Write credentials reference. */ union { TAILQ_ENTRY(buf) b_freelist; /* (Q) */ struct { void (*b_pgiodone)(void *, vm_page_t *, int, int); int b_pgbefore; int b_pgafter; }; }; union cluster_info { TAILQ_HEAD(cluster_list_head, buf) cluster_head; TAILQ_ENTRY(buf) cluster_entry; } b_cluster; struct vm_page *b_pages[btoc(MAXPHYS)]; int b_npages; struct workhead b_dep; /* (D) List of filesystem dependencies. */ void *b_fsprivate1; void *b_fsprivate2; void *b_fsprivate3; #if defined(FULL_BUF_TRACKING) #define BUF_TRACKING_SIZE 32 #define BUF_TRACKING_ENTRY(x) ((x) & (BUF_TRACKING_SIZE - 1)) const char *b_io_tracking[BUF_TRACKING_SIZE]; uint32_t b_io_tcnt; #elif defined(BUF_TRACKING) const char *b_io_tracking; #endif }; #define b_object b_bufobj->bo_object /* * These flags are kept in b_flags. * * Notes: * * B_ASYNC VOP calls on bp's are usually async whether or not * B_ASYNC is set, but some subsystems, such as NFS, like * to know what is best for the caller so they can * optimize the I/O. * * B_PAGING Indicates that bp is being used by the paging system or * some paging system and that the bp is not linked into * the b_vp's clean/dirty linked lists or ref counts. * Buffer vp reassignments are illegal in this case. * * B_CACHE This may only be set if the buffer is entirely valid. * The situation where B_DELWRI is set and B_CACHE is * clear MUST be committed to disk by getblk() so * B_DELWRI can also be cleared. See the comments for * getblk() in kern/vfs_bio.c. If B_CACHE is clear, * the caller is expected to clear BIO_ERROR and B_INVAL, * set BIO_READ, and initiate an I/O. * * The 'entire buffer' is defined to be the range from * 0 through b_bcount. * * B_MALLOC Request that the buffer be allocated from the malloc * pool, DEV_BSIZE aligned instead of PAGE_SIZE aligned. * * B_CLUSTEROK This flag is typically set for B_DELWRI buffers * by filesystems that allow clustering when the buffer * is fully dirty and indicates that it may be clustered * with other adjacent dirty buffers. Note the clustering * may not be used with the stage 1 data write under NFS * but may be used for the commit rpc portion. * * B_VMIO Indicates that the buffer is tied into an VM object. * The buffer's data is always PAGE_SIZE aligned even * if b_bufsize and b_bcount are not. ( b_bufsize is * always at least DEV_BSIZE aligned, though ). * * B_DIRECT Hint that we should attempt to completely free * the pages underlying the buffer. B_DIRECT is * sticky until the buffer is released and typically * only has an effect when B_RELBUF is also set. * */ #define B_AGE 0x00000001 /* Move to age queue when I/O done. */ #define B_NEEDCOMMIT 0x00000002 /* Append-write in progress. */ #define B_ASYNC 0x00000004 /* Start I/O, do not wait. */ #define B_DIRECT 0x00000008 /* direct I/O flag (pls free vmio) */ #define B_DEFERRED 0x00000010 /* Skipped over for cleaning */ #define B_CACHE 0x00000020 /* Bread found us in the cache. */ #define B_VALIDSUSPWRT 0x00000040 /* Valid write during suspension. */ #define B_DELWRI 0x00000080 /* Delay I/O until buffer reused. */ -#define B_00000100 0x00000100 /* Available flag. */ +#define B_CKHASH 0x00000100 /* checksum hash calculated on read */ #define B_DONE 0x00000200 /* I/O completed. */ #define B_EINTR 0x00000400 /* I/O was interrupted */ #define B_NOREUSE 0x00000800 /* Contents not reused once released. */ #define B_00001000 0x00001000 /* Available flag. */ #define B_INVAL 0x00002000 /* Does not contain valid info. */ #define B_BARRIER 0x00004000 /* Write this and all preceding first. */ #define B_NOCACHE 0x00008000 /* Do not cache block after use. */ #define B_MALLOC 0x00010000 /* malloced b_data */ #define B_CLUSTEROK 0x00020000 /* Pagein op, so swap() can count it. */ #define B_00040000 0x00040000 /* Available flag. */ #define B_00080000 0x00080000 /* Available flag. */ #define B_00100000 0x00100000 /* Available flag. */ #define B_00200000 0x00200000 /* Available flag. */ #define B_RELBUF 0x00400000 /* Release VMIO buffer. */ #define B_FS_FLAG1 0x00800000 /* Available flag for FS use. */ #define B_NOCOPY 0x01000000 /* Don't copy-on-write this buf. */ #define B_INFREECNT 0x02000000 /* buf is counted in numfreebufs */ #define B_PAGING 0x04000000 /* volatile paging I/O -- bypass VMIO */ #define B_MANAGED 0x08000000 /* Managed by FS. */ #define B_RAM 0x10000000 /* Read ahead mark (flag) */ #define B_VMIO 0x20000000 /* VMIO flag */ #define B_CLUSTER 0x40000000 /* pagein op, so swap() can count it */ #define B_REMFREE 0x80000000 /* Delayed bremfree */ #define PRINT_BUF_FLAGS "\20\40remfree\37cluster\36vmio\35ram\34managed" \ "\33paging\32infreecnt\31nocopy\30b23\27relbuf\26b21\25b20" \ "\24b19\23b18\22clusterok\21malloc\20nocache\17b14\16inval" \ "\15b12\14noreuse\13eintr\12done\11b8\10delwri" \ "\7validsuspwrt\6cache\5deferred\4direct\3async\2needcommit\1age" /* * These flags are kept in b_xflags. + * + * BX_FSPRIV reserves a set of eight flags that may be used by individual + * filesystems for their own purpose. Their specific definitions are + * found in the header files for each filesystem that uses them. */ #define BX_VNDIRTY 0x00000001 /* On vnode dirty list */ #define BX_VNCLEAN 0x00000002 /* On vnode clean list */ #define BX_BKGRDWRITE 0x00000010 /* Do writes in background */ #define BX_BKGRDMARKER 0x00000020 /* Mark buffer for splay tree */ #define BX_ALTDATA 0x00000040 /* Holds extended data */ +#define BX_FSPRIV 0x00FF0000 /* filesystem-specific flags mask */ #define PRINT_BUF_XFLAGS "\20\7altdata\6bkgrdmarker\5bkgrdwrite\2clean\1dirty" #define NOOFFSET (-1LL) /* No buffer offset calculated yet */ /* * These flags are kept in b_vflags. */ #define BV_SCANNED 0x00000001 /* VOP_FSYNC funcs mark written bufs */ #define BV_BKGRDINPROG 0x00000002 /* Background write in progress */ #define BV_BKGRDWAIT 0x00000004 /* Background write waiting */ #define BV_BKGRDERR 0x00000008 /* Error from background write */ #define PRINT_BUF_VFLAGS "\20\4bkgrderr\3bkgrdwait\2bkgrdinprog\1scanned" #ifdef _KERNEL /* * Buffer locking */ extern const char *buf_wmesg; /* Default buffer lock message */ #define BUF_WMESG "bufwait" #include /* XXX for curthread */ #include /* * Initialize a lock. */ #define BUF_LOCKINIT(bp) \ lockinit(&(bp)->b_lock, PRIBIO + 4, buf_wmesg, 0, 0) /* * * Get a lock sleeping non-interruptably until it becomes available. */ #define BUF_LOCK(bp, locktype, interlock) \ _lockmgr_args_rw(&(bp)->b_lock, (locktype), (interlock), \ LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, \ LOCK_FILE, LOCK_LINE) /* * Get a lock sleeping with specified interruptably and timeout. */ #define BUF_TIMELOCK(bp, locktype, interlock, wmesg, catch, timo) \ _lockmgr_args_rw(&(bp)->b_lock, (locktype) | LK_TIMELOCK, \ (interlock), (wmesg), (PRIBIO + 4) | (catch), (timo), \ LOCK_FILE, LOCK_LINE) /* * Release a lock. Only the acquiring process may free the lock unless * it has been handed off to biodone. */ #define BUF_UNLOCK(bp) do { \ KASSERT(((bp)->b_flags & B_REMFREE) == 0, \ ("BUF_UNLOCK %p while B_REMFREE is still set.", (bp))); \ \ (void)_lockmgr_args(&(bp)->b_lock, LK_RELEASE, NULL, \ LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, \ LOCK_FILE, LOCK_LINE); \ } while (0) /* * Check if a buffer lock is recursed. */ #define BUF_LOCKRECURSED(bp) \ lockmgr_recursed(&(bp)->b_lock) /* * Check if a buffer lock is currently held. */ #define BUF_ISLOCKED(bp) \ lockstatus(&(bp)->b_lock) /* * Free a buffer lock. */ #define BUF_LOCKFREE(bp) \ lockdestroy(&(bp)->b_lock) /* * Print informations on a buffer lock. */ #define BUF_LOCKPRINTINFO(bp) \ lockmgr_printinfo(&(bp)->b_lock) /* * Buffer lock assertions. */ #if defined(INVARIANTS) && defined(INVARIANT_SUPPORT) #define BUF_ASSERT_LOCKED(bp) \ _lockmgr_assert(&(bp)->b_lock, KA_LOCKED, LOCK_FILE, LOCK_LINE) #define BUF_ASSERT_SLOCKED(bp) \ _lockmgr_assert(&(bp)->b_lock, KA_SLOCKED, LOCK_FILE, LOCK_LINE) #define BUF_ASSERT_XLOCKED(bp) \ _lockmgr_assert(&(bp)->b_lock, KA_XLOCKED, LOCK_FILE, LOCK_LINE) #define BUF_ASSERT_UNLOCKED(bp) \ _lockmgr_assert(&(bp)->b_lock, KA_UNLOCKED, LOCK_FILE, LOCK_LINE) #define BUF_ASSERT_HELD(bp) #define BUF_ASSERT_UNHELD(bp) #else #define BUF_ASSERT_LOCKED(bp) #define BUF_ASSERT_SLOCKED(bp) #define BUF_ASSERT_XLOCKED(bp) #define BUF_ASSERT_UNLOCKED(bp) #define BUF_ASSERT_HELD(bp) #define BUF_ASSERT_UNHELD(bp) #endif #ifdef _SYS_PROC_H_ /* Avoid #include pollution */ /* * When initiating asynchronous I/O, change ownership of the lock to the * kernel. Once done, the lock may legally released by biodone. The * original owning process can no longer acquire it recursively, but must * wait until the I/O is completed and the lock has been freed by biodone. */ #define BUF_KERNPROC(bp) \ _lockmgr_disown(&(bp)->b_lock, LOCK_FILE, LOCK_LINE) #endif #endif /* _KERNEL */ struct buf_queue_head { TAILQ_HEAD(buf_queue, buf) queue; daddr_t last_pblkno; struct buf *insert_point; struct buf *switch_point; }; /* * This structure describes a clustered I/O. */ struct cluster_save { long bs_bcount; /* Saved b_bcount. */ long bs_bufsize; /* Saved b_bufsize. */ int bs_nchildren; /* Number of associated buffers. */ struct buf **bs_children; /* List of associated buffers. */ }; #ifdef _KERNEL static __inline int bwrite(struct buf *bp) { KASSERT(bp->b_bufobj != NULL, ("bwrite: no bufobj bp=%p", bp)); KASSERT(bp->b_bufobj->bo_ops != NULL, ("bwrite: no bo_ops bp=%p", bp)); KASSERT(bp->b_bufobj->bo_ops->bop_write != NULL, ("bwrite: no bop_write bp=%p", bp)); return (BO_WRITE(bp->b_bufobj, bp)); } static __inline void bstrategy(struct buf *bp) { KASSERT(bp->b_bufobj != NULL, ("bstrategy: no bufobj bp=%p", bp)); KASSERT(bp->b_bufobj->bo_ops != NULL, ("bstrategy: no bo_ops bp=%p", bp)); KASSERT(bp->b_bufobj->bo_ops->bop_strategy != NULL, ("bstrategy: no bop_strategy bp=%p", bp)); BO_STRATEGY(bp->b_bufobj, bp); } static __inline void buf_start(struct buf *bp) { if (bioops.io_start) (*bioops.io_start)(bp); } static __inline void buf_complete(struct buf *bp) { if (bioops.io_complete) (*bioops.io_complete)(bp); } static __inline void buf_deallocate(struct buf *bp) { if (bioops.io_deallocate) (*bioops.io_deallocate)(bp); } static __inline int buf_countdeps(struct buf *bp, int i) { if (bioops.io_countdeps) return ((*bioops.io_countdeps)(bp, i)); else return (0); } static __inline void buf_track(struct buf *bp, const char *location) { #if defined(FULL_BUF_TRACKING) bp->b_io_tracking[BUF_TRACKING_ENTRY(bp->b_io_tcnt++)] = location; #elif defined(BUF_TRACKING) bp->b_io_tracking = location; #endif } #endif /* _KERNEL */ /* * Zero out the buffer's data area. */ #define clrbuf(bp) { \ bzero((bp)->b_data, (u_int)(bp)->b_bcount); \ (bp)->b_resid = 0; \ } /* * Flags for getblk's last parameter. */ #define GB_LOCK_NOWAIT 0x0001 /* Fail if we block on a buf lock. */ #define GB_NOCREAT 0x0002 /* Don't create a buf if not found. */ #define GB_NOWAIT_BD 0x0004 /* Do not wait for bufdaemon. */ #define GB_UNMAPPED 0x0008 /* Do not mmap buffer pages. */ #define GB_KVAALLOC 0x0010 /* But allocate KVA. */ +#define GB_CKHASH 0x0020 /* If reading, calc checksum hash */ #ifdef _KERNEL extern int nbuf; /* The number of buffer headers */ extern long maxswzone; /* Max KVA for swap structures */ extern long maxbcache; /* Max KVA for buffer cache */ extern int maxbcachebuf; /* Max buffer cache block size */ extern long runningbufspace; extern long hibufspace; extern int dirtybufthresh; extern int bdwriteskip; extern int dirtybufferflushes; extern int altbufferflushes; extern int nswbuf; /* Number of swap I/O buffer headers. */ extern int cluster_pbuf_freecnt; /* Number of pbufs for clusters */ extern int vnode_pbuf_freecnt; /* Number of pbufs for vnode pager */ extern int vnode_async_pbuf_freecnt; /* Number of pbufs for vnode pager, asynchronous reads */ extern caddr_t unmapped_buf; /* Data address for unmapped buffers. */ static inline int buf_mapped(struct buf *bp) { return (bp->b_data != unmapped_buf); } void runningbufwakeup(struct buf *); void waitrunningbufspace(void); caddr_t kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est); void bufinit(void); void bufshutdown(int); void bdata2bio(struct buf *bp, struct bio *bip); void bwillwrite(void); int buf_dirty_count_severe(void); void bremfree(struct buf *); void bremfreef(struct buf *); /* XXX Force bremfree, only for nfs. */ #define bread(vp, blkno, size, cred, bpp) \ - breadn_flags(vp, blkno, size, NULL, NULL, 0, cred, 0, bpp) + breadn_flags(vp, blkno, size, NULL, NULL, 0, cred, 0, NULL, bpp) #define bread_gb(vp, blkno, size, cred, gbflags, bpp) \ breadn_flags(vp, blkno, size, NULL, NULL, 0, cred, \ - gbflags, bpp) + gbflags, NULL, bpp) #define breadn(vp, blkno, size, rablkno, rabsize, cnt, cred, bpp) \ - breadn_flags(vp, blkno, size, rablkno, rabsize, cnt, cred, 0, bpp) + breadn_flags(vp, blkno, size, rablkno, rabsize, cnt, cred, \ + 0, NULL, bpp) int breadn_flags(struct vnode *, daddr_t, int, daddr_t *, int *, int, - struct ucred *, int, struct buf **); -void breada(struct vnode *, daddr_t *, int *, int, struct ucred *); + struct ucred *, int, void (*)(struct buf *), struct buf **); void bdwrite(struct buf *); void bawrite(struct buf *); void babarrierwrite(struct buf *); int bbarrierwrite(struct buf *); void bdirty(struct buf *); void bundirty(struct buf *); void bufstrategy(struct bufobj *, struct buf *); void brelse(struct buf *); void bqrelse(struct buf *); int vfs_bio_awrite(struct buf *); void vfs_drain_busy_pages(struct buf *bp); struct buf * getpbuf(int *); struct buf *incore(struct bufobj *, daddr_t); struct buf *gbincore(struct bufobj *, daddr_t); struct buf *getblk(struct vnode *, daddr_t, int, int, int, int); struct buf *geteblk(int, int); int bufwait(struct buf *); int bufwrite(struct buf *); void bufdone(struct buf *); void bufdone_finish(struct buf *); void bd_speedup(void); int cluster_read(struct vnode *, u_quad_t, daddr_t, long, struct ucred *, long, int, int, struct buf **); int cluster_wbuild(struct vnode *, long, daddr_t, int, int); void cluster_write(struct vnode *, struct buf *, u_quad_t, int, int); void vfs_bio_brelse(struct buf *bp, int ioflags); void vfs_bio_bzero_buf(struct buf *bp, int base, int size); void vfs_bio_clrbuf(struct buf *); void vfs_bio_set_flags(struct buf *bp, int ioflags); void vfs_bio_set_valid(struct buf *, int base, int size); void vfs_busy_pages(struct buf *, int clear_modify); void vfs_unbusy_pages(struct buf *); int vmapbuf(struct buf *, int); void vunmapbuf(struct buf *); void relpbuf(struct buf *, int *); void brelvp(struct buf *); void bgetvp(struct vnode *, struct buf *); void pbgetbo(struct bufobj *bo, struct buf *bp); void pbgetvp(struct vnode *, struct buf *); void pbrelbo(struct buf *); void pbrelvp(struct buf *); int allocbuf(struct buf *bp, int size); void reassignbuf(struct buf *); struct buf *trypbuf(int *); void bwait(struct buf *, u_char, const char *); void bdone(struct buf *); typedef daddr_t (vbg_get_lblkno_t)(struct vnode *, vm_ooffset_t); typedef int (vbg_get_blksize_t)(struct vnode *, daddr_t); int vfs_bio_getpages(struct vnode *vp, struct vm_page **ma, int count, int *rbehind, int *rahead, vbg_get_lblkno_t get_lblkno, vbg_get_blksize_t get_blksize); #endif /* _KERNEL */ #endif /* !_SYS_BUF_H_ */ Index: head/sys/ufs/ffs/ffs_alloc.c =================================================================== --- head/sys/ufs/ffs/ffs_alloc.c (revision 323922) +++ head/sys/ufs/ffs/ffs_alloc.c (revision 323923) @@ -1,3196 +1,3225 @@ /*- * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Marshall * Kirk McKusick and Network Associates Laboratories, the Security * Research Division of Network Associates, Inc. under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS * research program * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_alloc.c 8.18 (Berkeley) 5/26/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_quota.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include typedef ufs2_daddr_t allocfcn_t(struct inode *ip, u_int cg, ufs2_daddr_t bpref, int size, int rsize); static ufs2_daddr_t ffs_alloccg(struct inode *, u_int, ufs2_daddr_t, int, int); static ufs2_daddr_t ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t, int); static void ffs_blkfree_cg(struct ufsmount *, struct fs *, struct vnode *, ufs2_daddr_t, long, ino_t, struct workhead *); static void ffs_blkfree_trim_completed(struct bio *); static void ffs_blkfree_trim_task(void *ctx, int pending __unused); #ifdef INVARIANTS static int ffs_checkblk(struct inode *, ufs2_daddr_t, long); #endif static ufs2_daddr_t ffs_clusteralloc(struct inode *, u_int, ufs2_daddr_t, int); static ino_t ffs_dirpref(struct inode *); static ufs2_daddr_t ffs_fragextend(struct inode *, u_int, ufs2_daddr_t, int, int); static ufs2_daddr_t ffs_hashalloc (struct inode *, u_int, ufs2_daddr_t, int, int, allocfcn_t *); static ufs2_daddr_t ffs_nodealloccg(struct inode *, u_int, ufs2_daddr_t, int, int); static ufs1_daddr_t ffs_mapsearch(struct fs *, struct cg *, ufs2_daddr_t, int); static int ffs_reallocblks_ufs1(struct vop_reallocblks_args *); static int ffs_reallocblks_ufs2(struct vop_reallocblks_args *); +static void ffs_ckhash_cg(struct buf *); /* * Allocate a block in the filesystem. * * The size of the requested block is given, which must be some * multiple of fs_fsize and <= fs_bsize. * A preference may be optionally specified. If a preference is given * the following hierarchy is used to allocate a block: * 1) allocate the requested block. * 2) allocate a rotationally optimal block in the same cylinder. * 3) allocate a block in the same cylinder group. * 4) quadradically rehash into other cylinder groups, until an * available block is located. * If no block preference is given the following hierarchy is used * to allocate a block: * 1) allocate a block in the cylinder group that contains the * inode for the file. * 2) quadradically rehash into other cylinder groups, until an * available block is located. */ int ffs_alloc(ip, lbn, bpref, size, flags, cred, bnp) struct inode *ip; ufs2_daddr_t lbn, bpref; int size, flags; struct ucred *cred; ufs2_daddr_t *bnp; { struct fs *fs; struct ufsmount *ump; ufs2_daddr_t bno; u_int cg, reclaimed; static struct timeval lastfail; static int curfail; int64_t delta; #ifdef QUOTA int error; #endif *bnp = 0; ump = ITOUMP(ip); fs = ump->um_fs; mtx_assert(UFS_MTX(ump), MA_OWNED); #ifdef INVARIANTS if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) { printf("dev = %s, bsize = %ld, size = %d, fs = %s\n", devtoname(ump->um_dev), (long)fs->fs_bsize, size, fs->fs_fsmnt); panic("ffs_alloc: bad size"); } if (cred == NOCRED) panic("ffs_alloc: missing credential"); #endif /* INVARIANTS */ reclaimed = 0; retry: #ifdef QUOTA UFS_UNLOCK(ump); error = chkdq(ip, btodb(size), cred, 0); if (error) return (error); UFS_LOCK(ump); #endif if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0) goto nospace; if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0) && freespace(fs, fs->fs_minfree) - numfrags(fs, size) < 0) goto nospace; if (bpref >= fs->fs_size) bpref = 0; if (bpref == 0) cg = ino_to_cg(fs, ip->i_number); else cg = dtog(fs, bpref); bno = ffs_hashalloc(ip, cg, bpref, size, size, ffs_alloccg); if (bno > 0) { delta = btodb(size); DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); if (flags & IO_EXT) ip->i_flag |= IN_CHANGE; else ip->i_flag |= IN_CHANGE | IN_UPDATE; *bnp = bno; return (0); } nospace: #ifdef QUOTA UFS_UNLOCK(ump); /* * Restore user's disk quota because allocation failed. */ (void) chkdq(ip, -btodb(size), cred, FORCE); UFS_LOCK(ump); #endif if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) { reclaimed = 1; softdep_request_cleanup(fs, ITOV(ip), cred, FLUSH_BLOCKS_WAIT); goto retry; } UFS_UNLOCK(ump); if (reclaimed > 0 && ppsratecheck(&lastfail, &curfail, 1)) { ffs_fserr(fs, ip->i_number, "filesystem full"); uprintf("\n%s: write failed, filesystem is full\n", fs->fs_fsmnt); } return (ENOSPC); } /* * Reallocate a fragment to a bigger size * * The number and size of the old block is given, and a preference * and new size is also specified. The allocator attempts to extend * the original block. Failing that, the regular block allocator is * invoked to get an appropriate block. */ int ffs_realloccg(ip, lbprev, bprev, bpref, osize, nsize, flags, cred, bpp) struct inode *ip; ufs2_daddr_t lbprev; ufs2_daddr_t bprev; ufs2_daddr_t bpref; int osize, nsize, flags; struct ucred *cred; struct buf **bpp; { struct vnode *vp; struct fs *fs; struct buf *bp; struct ufsmount *ump; u_int cg, request, reclaimed; int error, gbflags; ufs2_daddr_t bno; static struct timeval lastfail; static int curfail; int64_t delta; vp = ITOV(ip); ump = ITOUMP(ip); fs = ump->um_fs; bp = NULL; gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0; mtx_assert(UFS_MTX(ump), MA_OWNED); #ifdef INVARIANTS if (vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) panic("ffs_realloccg: allocation on suspended filesystem"); if ((u_int)osize > fs->fs_bsize || fragoff(fs, osize) != 0 || (u_int)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) { printf( "dev = %s, bsize = %ld, osize = %d, nsize = %d, fs = %s\n", devtoname(ump->um_dev), (long)fs->fs_bsize, osize, nsize, fs->fs_fsmnt); panic("ffs_realloccg: bad size"); } if (cred == NOCRED) panic("ffs_realloccg: missing credential"); #endif /* INVARIANTS */ reclaimed = 0; retry: if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0) && freespace(fs, fs->fs_minfree) - numfrags(fs, nsize - osize) < 0) { goto nospace; } if (bprev == 0) { printf("dev = %s, bsize = %ld, bprev = %jd, fs = %s\n", devtoname(ump->um_dev), (long)fs->fs_bsize, (intmax_t)bprev, fs->fs_fsmnt); panic("ffs_realloccg: bad bprev"); } UFS_UNLOCK(ump); /* * Allocate the extra space in the buffer. */ error = bread_gb(vp, lbprev, osize, NOCRED, gbflags, &bp); if (error) { brelse(bp); return (error); } if (bp->b_blkno == bp->b_lblkno) { if (lbprev >= UFS_NDADDR) panic("ffs_realloccg: lbprev out of range"); bp->b_blkno = fsbtodb(fs, bprev); } #ifdef QUOTA error = chkdq(ip, btodb(nsize - osize), cred, 0); if (error) { brelse(bp); return (error); } #endif /* * Check for extension in the existing location. */ *bpp = NULL; cg = dtog(fs, bprev); UFS_LOCK(ump); bno = ffs_fragextend(ip, cg, bprev, osize, nsize); if (bno) { if (bp->b_blkno != fsbtodb(fs, bno)) panic("ffs_realloccg: bad blockno"); delta = btodb(nsize - osize); DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); if (flags & IO_EXT) ip->i_flag |= IN_CHANGE; else ip->i_flag |= IN_CHANGE | IN_UPDATE; allocbuf(bp, nsize); bp->b_flags |= B_DONE; vfs_bio_bzero_buf(bp, osize, nsize - osize); if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO) vfs_bio_set_valid(bp, osize, nsize - osize); *bpp = bp; return (0); } /* * Allocate a new disk location. */ if (bpref >= fs->fs_size) bpref = 0; switch ((int)fs->fs_optim) { case FS_OPTSPACE: /* * Allocate an exact sized fragment. Although this makes * best use of space, we will waste time relocating it if * the file continues to grow. If the fragmentation is * less than half of the minimum free reserve, we choose * to begin optimizing for time. */ request = nsize; if (fs->fs_minfree <= 5 || fs->fs_cstotal.cs_nffree > (off_t)fs->fs_dsize * fs->fs_minfree / (2 * 100)) break; log(LOG_NOTICE, "%s: optimization changed from SPACE to TIME\n", fs->fs_fsmnt); fs->fs_optim = FS_OPTTIME; break; case FS_OPTTIME: /* * At this point we have discovered a file that is trying to * grow a small fragment to a larger fragment. To save time, * we allocate a full sized block, then free the unused portion. * If the file continues to grow, the `ffs_fragextend' call * above will be able to grow it in place without further * copying. If aberrant programs cause disk fragmentation to * grow within 2% of the free reserve, we choose to begin * optimizing for space. */ request = fs->fs_bsize; if (fs->fs_cstotal.cs_nffree < (off_t)fs->fs_dsize * (fs->fs_minfree - 2) / 100) break; log(LOG_NOTICE, "%s: optimization changed from TIME to SPACE\n", fs->fs_fsmnt); fs->fs_optim = FS_OPTSPACE; break; default: printf("dev = %s, optim = %ld, fs = %s\n", devtoname(ump->um_dev), (long)fs->fs_optim, fs->fs_fsmnt); panic("ffs_realloccg: bad optim"); /* NOTREACHED */ } bno = ffs_hashalloc(ip, cg, bpref, request, nsize, ffs_alloccg); if (bno > 0) { bp->b_blkno = fsbtodb(fs, bno); if (!DOINGSOFTDEP(vp)) ffs_blkfree(ump, fs, ump->um_devvp, bprev, (long)osize, ip->i_number, vp->v_type, NULL); delta = btodb(nsize - osize); DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); if (flags & IO_EXT) ip->i_flag |= IN_CHANGE; else ip->i_flag |= IN_CHANGE | IN_UPDATE; allocbuf(bp, nsize); bp->b_flags |= B_DONE; vfs_bio_bzero_buf(bp, osize, nsize - osize); if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO) vfs_bio_set_valid(bp, osize, nsize - osize); *bpp = bp; return (0); } #ifdef QUOTA UFS_UNLOCK(ump); /* * Restore user's disk quota because allocation failed. */ (void) chkdq(ip, -btodb(nsize - osize), cred, FORCE); UFS_LOCK(ump); #endif nospace: /* * no space available */ if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) { reclaimed = 1; UFS_UNLOCK(ump); if (bp) { brelse(bp); bp = NULL; } UFS_LOCK(ump); softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT); goto retry; } UFS_UNLOCK(ump); if (bp) brelse(bp); if (reclaimed > 0 && ppsratecheck(&lastfail, &curfail, 1)) { ffs_fserr(fs, ip->i_number, "filesystem full"); uprintf("\n%s: write failed, filesystem is full\n", fs->fs_fsmnt); } return (ENOSPC); } /* * Reallocate a sequence of blocks into a contiguous sequence of blocks. * * The vnode and an array of buffer pointers for a range of sequential * logical blocks to be made contiguous is given. The allocator attempts * to find a range of sequential blocks starting as close as possible * from the end of the allocation for the logical block immediately * preceding the current range. If successful, the physical block numbers * in the buffer pointers and in the inode are changed to reflect the new * allocation. If unsuccessful, the allocation is left unchanged. The * success in doing the reallocation is returned. Note that the error * return is not reflected back to the user. Rather the previous block * allocation will be used. */ SYSCTL_NODE(_vfs, OID_AUTO, ffs, CTLFLAG_RW, 0, "FFS filesystem"); static int doasyncfree = 1; SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncfree, CTLFLAG_RW, &doasyncfree, 0, "do not force synchronous writes when blocks are reallocated"); static int doreallocblks = 1; SYSCTL_INT(_vfs_ffs, OID_AUTO, doreallocblks, CTLFLAG_RW, &doreallocblks, 0, "enable block reallocation"); static int maxclustersearch = 10; SYSCTL_INT(_vfs_ffs, OID_AUTO, maxclustersearch, CTLFLAG_RW, &maxclustersearch, 0, "max number of cylinder group to search for contigous blocks"); #ifdef DEBUG static volatile int prtrealloc = 0; #endif int ffs_reallocblks(ap) struct vop_reallocblks_args /* { struct vnode *a_vp; struct cluster_save *a_buflist; } */ *ap; { struct ufsmount *ump; /* * If the underlying device can do deletes, then skip reallocating * the blocks of this file into contiguous sequences. Devices that * benefit from BIO_DELETE also benefit from not moving the data. * These devices are flash and therefore work less well with this * optimization. Also skip if reallocblks has been disabled globally. */ ump = ap->a_vp->v_mount->mnt_data; if (ump->um_candelete || doreallocblks == 0) return (ENOSPC); /* * We can't wait in softdep prealloc as it may fsync and recurse * here. Instead we simply fail to reallocate blocks if this * rare condition arises. */ if (DOINGSOFTDEP(ap->a_vp)) if (softdep_prealloc(ap->a_vp, MNT_NOWAIT) != 0) return (ENOSPC); if (ump->um_fstype == UFS1) return (ffs_reallocblks_ufs1(ap)); return (ffs_reallocblks_ufs2(ap)); } static int ffs_reallocblks_ufs1(ap) struct vop_reallocblks_args /* { struct vnode *a_vp; struct cluster_save *a_buflist; } */ *ap; { struct fs *fs; struct inode *ip; struct vnode *vp; struct buf *sbp, *ebp; ufs1_daddr_t *bap, *sbap, *ebap; struct cluster_save *buflist; struct ufsmount *ump; ufs_lbn_t start_lbn, end_lbn; ufs1_daddr_t soff, newblk, blkno; ufs2_daddr_t pref; struct indir start_ap[UFS_NIADDR + 1], end_ap[UFS_NIADDR + 1], *idp; int i, cg, len, start_lvl, end_lvl, ssize; vp = ap->a_vp; ip = VTOI(vp); ump = ITOUMP(ip); fs = ump->um_fs; /* * If we are not tracking block clusters or if we have less than 4% * free blocks left, then do not attempt to cluster. Running with * less than 5% free block reserve is not recommended and those that * choose to do so do not expect to have good file layout. */ if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0) return (ENOSPC); buflist = ap->a_buflist; len = buflist->bs_nchildren; start_lbn = buflist->bs_children[0]->b_lblkno; end_lbn = start_lbn + len - 1; #ifdef INVARIANTS for (i = 0; i < len; i++) if (!ffs_checkblk(ip, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 1"); for (i = 1; i < len; i++) if (buflist->bs_children[i]->b_lblkno != start_lbn + i) panic("ffs_reallocblks: non-logical cluster"); blkno = buflist->bs_children[0]->b_blkno; ssize = fsbtodb(fs, fs->fs_frag); for (i = 1; i < len - 1; i++) if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize)) panic("ffs_reallocblks: non-physical cluster %d", i); #endif /* * If the cluster crosses the boundary for the first indirect * block, leave space for the indirect block. Indirect blocks * are initially laid out in a position after the last direct * block. Block reallocation would usually destroy locality by * moving the indirect block out of the way to make room for * data blocks if we didn't compensate here. We should also do * this for other indirect block boundaries, but it is only * important for the first one. */ if (start_lbn < UFS_NDADDR && end_lbn >= UFS_NDADDR) return (ENOSPC); /* * If the latest allocation is in a new cylinder group, assume that * the filesystem has decided to move and do not force it back to * the previous cylinder group. */ if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) != dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno))) return (ENOSPC); if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) || ufs_getlbns(vp, end_lbn, end_ap, &end_lvl)) return (ENOSPC); /* * Get the starting offset and block map for the first block. */ if (start_lvl == 0) { sbap = &ip->i_din1->di_db[0]; soff = start_lbn; } else { idp = &start_ap[start_lvl - 1]; if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) { brelse(sbp); return (ENOSPC); } sbap = (ufs1_daddr_t *)sbp->b_data; soff = idp->in_off; } /* * If the block range spans two block maps, get the second map. */ ebap = NULL; if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) { ssize = len; } else { #ifdef INVARIANTS if (start_lvl > 0 && start_ap[start_lvl - 1].in_lbn == idp->in_lbn) panic("ffs_reallocblk: start == end"); #endif ssize = len - (idp->in_off + 1); if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp)) goto fail; ebap = (ufs1_daddr_t *)ebp->b_data; } /* * Find the preferred location for the cluster. If we have not * previously failed at this endeavor, then follow our standard * preference calculation. If we have failed at it, then pick up * where we last ended our search. */ UFS_LOCK(ump); if (ip->i_nextclustercg == -1) pref = ffs_blkpref_ufs1(ip, start_lbn, soff, sbap); else pref = cgdata(fs, ip->i_nextclustercg); /* * Search the block map looking for an allocation of the desired size. * To avoid wasting too much time, we limit the number of cylinder * groups that we will search. */ cg = dtog(fs, pref); for (i = min(maxclustersearch, fs->fs_ncg); i > 0; i--) { if ((newblk = ffs_clusteralloc(ip, cg, pref, len)) != 0) break; cg += 1; if (cg >= fs->fs_ncg) cg = 0; } /* * If we have failed in our search, record where we gave up for * next time. Otherwise, fall back to our usual search citerion. */ if (newblk == 0) { ip->i_nextclustercg = cg; UFS_UNLOCK(ump); goto fail; } ip->i_nextclustercg = -1; /* * We have found a new contiguous block. * * First we have to replace the old block pointers with the new * block pointers in the inode and indirect blocks associated * with the file. */ #ifdef DEBUG if (prtrealloc) printf("realloc: ino %ju, lbns %jd-%jd\n\told:", (uintmax_t)ip->i_number, (intmax_t)start_lbn, (intmax_t)end_lbn); #endif blkno = newblk; for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) { if (i == ssize) { bap = ebap; soff = -i; } #ifdef INVARIANTS if (!ffs_checkblk(ip, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 2"); if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap) panic("ffs_reallocblks: alloc mismatch"); #endif #ifdef DEBUG if (prtrealloc) printf(" %d,", *bap); #endif if (DOINGSOFTDEP(vp)) { if (sbap == &ip->i_din1->di_db[0] && i < ssize) softdep_setup_allocdirect(ip, start_lbn + i, blkno, *bap, fs->fs_bsize, fs->fs_bsize, buflist->bs_children[i]); else softdep_setup_allocindir_page(ip, start_lbn + i, i < ssize ? sbp : ebp, soff + i, blkno, *bap, buflist->bs_children[i]); } *bap++ = blkno; } /* * Next we must write out the modified inode and indirect blocks. * For strict correctness, the writes should be synchronous since * the old block values may have been written to disk. In practise * they are almost never written, but if we are concerned about * strict correctness, the `doasyncfree' flag should be set to zero. * * The test on `doasyncfree' should be changed to test a flag * that shows whether the associated buffers and inodes have * been written. The flag should be set when the cluster is * started and cleared whenever the buffer or inode is flushed. * We can then check below to see if it is set, and do the * synchronous write only when it has been cleared. */ if (sbap != &ip->i_din1->di_db[0]) { if (doasyncfree) bdwrite(sbp); else bwrite(sbp); } else { ip->i_flag |= IN_CHANGE | IN_UPDATE; if (!doasyncfree) ffs_update(vp, 1); } if (ssize < len) { if (doasyncfree) bdwrite(ebp); else bwrite(ebp); } /* * Last, free the old blocks and assign the new blocks to the buffers. */ #ifdef DEBUG if (prtrealloc) printf("\n\tnew:"); #endif for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) { if (!DOINGSOFTDEP(vp)) ffs_blkfree(ump, fs, ump->um_devvp, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize, ip->i_number, vp->v_type, NULL); buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno); #ifdef INVARIANTS if (!ffs_checkblk(ip, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 3"); #endif #ifdef DEBUG if (prtrealloc) printf(" %d,", blkno); #endif } #ifdef DEBUG if (prtrealloc) { prtrealloc--; printf("\n"); } #endif return (0); fail: if (ssize < len) brelse(ebp); if (sbap != &ip->i_din1->di_db[0]) brelse(sbp); return (ENOSPC); } static int ffs_reallocblks_ufs2(ap) struct vop_reallocblks_args /* { struct vnode *a_vp; struct cluster_save *a_buflist; } */ *ap; { struct fs *fs; struct inode *ip; struct vnode *vp; struct buf *sbp, *ebp; ufs2_daddr_t *bap, *sbap, *ebap; struct cluster_save *buflist; struct ufsmount *ump; ufs_lbn_t start_lbn, end_lbn; ufs2_daddr_t soff, newblk, blkno, pref; struct indir start_ap[UFS_NIADDR + 1], end_ap[UFS_NIADDR + 1], *idp; int i, cg, len, start_lvl, end_lvl, ssize; vp = ap->a_vp; ip = VTOI(vp); ump = ITOUMP(ip); fs = ump->um_fs; /* * If we are not tracking block clusters or if we have less than 4% * free blocks left, then do not attempt to cluster. Running with * less than 5% free block reserve is not recommended and those that * choose to do so do not expect to have good file layout. */ if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0) return (ENOSPC); buflist = ap->a_buflist; len = buflist->bs_nchildren; start_lbn = buflist->bs_children[0]->b_lblkno; end_lbn = start_lbn + len - 1; #ifdef INVARIANTS for (i = 0; i < len; i++) if (!ffs_checkblk(ip, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 1"); for (i = 1; i < len; i++) if (buflist->bs_children[i]->b_lblkno != start_lbn + i) panic("ffs_reallocblks: non-logical cluster"); blkno = buflist->bs_children[0]->b_blkno; ssize = fsbtodb(fs, fs->fs_frag); for (i = 1; i < len - 1; i++) if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize)) panic("ffs_reallocblks: non-physical cluster %d", i); #endif /* * If the cluster crosses the boundary for the first indirect * block, do not move anything in it. Indirect blocks are * usually initially laid out in a position between the data * blocks. Block reallocation would usually destroy locality by * moving the indirect block out of the way to make room for * data blocks if we didn't compensate here. We should also do * this for other indirect block boundaries, but it is only * important for the first one. */ if (start_lbn < UFS_NDADDR && end_lbn >= UFS_NDADDR) return (ENOSPC); /* * If the latest allocation is in a new cylinder group, assume that * the filesystem has decided to move and do not force it back to * the previous cylinder group. */ if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) != dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno))) return (ENOSPC); if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) || ufs_getlbns(vp, end_lbn, end_ap, &end_lvl)) return (ENOSPC); /* * Get the starting offset and block map for the first block. */ if (start_lvl == 0) { sbap = &ip->i_din2->di_db[0]; soff = start_lbn; } else { idp = &start_ap[start_lvl - 1]; if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) { brelse(sbp); return (ENOSPC); } sbap = (ufs2_daddr_t *)sbp->b_data; soff = idp->in_off; } /* * If the block range spans two block maps, get the second map. */ ebap = NULL; if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) { ssize = len; } else { #ifdef INVARIANTS if (start_lvl > 0 && start_ap[start_lvl - 1].in_lbn == idp->in_lbn) panic("ffs_reallocblk: start == end"); #endif ssize = len - (idp->in_off + 1); if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp)) goto fail; ebap = (ufs2_daddr_t *)ebp->b_data; } /* * Find the preferred location for the cluster. If we have not * previously failed at this endeavor, then follow our standard * preference calculation. If we have failed at it, then pick up * where we last ended our search. */ UFS_LOCK(ump); if (ip->i_nextclustercg == -1) pref = ffs_blkpref_ufs2(ip, start_lbn, soff, sbap); else pref = cgdata(fs, ip->i_nextclustercg); /* * Search the block map looking for an allocation of the desired size. * To avoid wasting too much time, we limit the number of cylinder * groups that we will search. */ cg = dtog(fs, pref); for (i = min(maxclustersearch, fs->fs_ncg); i > 0; i--) { if ((newblk = ffs_clusteralloc(ip, cg, pref, len)) != 0) break; cg += 1; if (cg >= fs->fs_ncg) cg = 0; } /* * If we have failed in our search, record where we gave up for * next time. Otherwise, fall back to our usual search citerion. */ if (newblk == 0) { ip->i_nextclustercg = cg; UFS_UNLOCK(ump); goto fail; } ip->i_nextclustercg = -1; /* * We have found a new contiguous block. * * First we have to replace the old block pointers with the new * block pointers in the inode and indirect blocks associated * with the file. */ #ifdef DEBUG if (prtrealloc) printf("realloc: ino %ju, lbns %jd-%jd\n\told:", (uintmax_t)ip->i_number, (intmax_t)start_lbn, (intmax_t)end_lbn); #endif blkno = newblk; for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) { if (i == ssize) { bap = ebap; soff = -i; } #ifdef INVARIANTS if (!ffs_checkblk(ip, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 2"); if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap) panic("ffs_reallocblks: alloc mismatch"); #endif #ifdef DEBUG if (prtrealloc) printf(" %jd,", (intmax_t)*bap); #endif if (DOINGSOFTDEP(vp)) { if (sbap == &ip->i_din2->di_db[0] && i < ssize) softdep_setup_allocdirect(ip, start_lbn + i, blkno, *bap, fs->fs_bsize, fs->fs_bsize, buflist->bs_children[i]); else softdep_setup_allocindir_page(ip, start_lbn + i, i < ssize ? sbp : ebp, soff + i, blkno, *bap, buflist->bs_children[i]); } *bap++ = blkno; } /* * Next we must write out the modified inode and indirect blocks. * For strict correctness, the writes should be synchronous since * the old block values may have been written to disk. In practise * they are almost never written, but if we are concerned about * strict correctness, the `doasyncfree' flag should be set to zero. * * The test on `doasyncfree' should be changed to test a flag * that shows whether the associated buffers and inodes have * been written. The flag should be set when the cluster is * started and cleared whenever the buffer or inode is flushed. * We can then check below to see if it is set, and do the * synchronous write only when it has been cleared. */ if (sbap != &ip->i_din2->di_db[0]) { if (doasyncfree) bdwrite(sbp); else bwrite(sbp); } else { ip->i_flag |= IN_CHANGE | IN_UPDATE; if (!doasyncfree) ffs_update(vp, 1); } if (ssize < len) { if (doasyncfree) bdwrite(ebp); else bwrite(ebp); } /* * Last, free the old blocks and assign the new blocks to the buffers. */ #ifdef DEBUG if (prtrealloc) printf("\n\tnew:"); #endif for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) { if (!DOINGSOFTDEP(vp)) ffs_blkfree(ump, fs, ump->um_devvp, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize, ip->i_number, vp->v_type, NULL); buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno); #ifdef INVARIANTS if (!ffs_checkblk(ip, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 3"); #endif #ifdef DEBUG if (prtrealloc) printf(" %jd,", (intmax_t)blkno); #endif } #ifdef DEBUG if (prtrealloc) { prtrealloc--; printf("\n"); } #endif return (0); fail: if (ssize < len) brelse(ebp); if (sbap != &ip->i_din2->di_db[0]) brelse(sbp); return (ENOSPC); } /* * Allocate an inode in the filesystem. * * If allocating a directory, use ffs_dirpref to select the inode. * If allocating in a directory, the following hierarchy is followed: * 1) allocate the preferred inode. * 2) allocate an inode in the same cylinder group. * 3) quadradically rehash into other cylinder groups, until an * available inode is located. * If no inode preference is given the following hierarchy is used * to allocate an inode: * 1) allocate an inode in cylinder group 0. * 2) quadradically rehash into other cylinder groups, until an * available inode is located. */ int ffs_valloc(pvp, mode, cred, vpp) struct vnode *pvp; int mode; struct ucred *cred; struct vnode **vpp; { struct inode *pip; struct fs *fs; struct inode *ip; struct timespec ts; struct ufsmount *ump; ino_t ino, ipref; u_int cg; int error, error1, reclaimed; static struct timeval lastfail; static int curfail; *vpp = NULL; pip = VTOI(pvp); ump = ITOUMP(pip); fs = ump->um_fs; UFS_LOCK(ump); reclaimed = 0; retry: if (fs->fs_cstotal.cs_nifree == 0) goto noinodes; if ((mode & IFMT) == IFDIR) ipref = ffs_dirpref(pip); else ipref = pip->i_number; if (ipref >= fs->fs_ncg * fs->fs_ipg) ipref = 0; cg = ino_to_cg(fs, ipref); /* * Track number of dirs created one after another * in a same cg without intervening by files. */ if ((mode & IFMT) == IFDIR) { if (fs->fs_contigdirs[cg] < 255) fs->fs_contigdirs[cg]++; } else { if (fs->fs_contigdirs[cg] > 0) fs->fs_contigdirs[cg]--; } ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0, (allocfcn_t *)ffs_nodealloccg); if (ino == 0) goto noinodes; error = ffs_vget(pvp->v_mount, ino, LK_EXCLUSIVE, vpp); if (error) { error1 = ffs_vgetf(pvp->v_mount, ino, LK_EXCLUSIVE, vpp, FFSV_FORCEINSMQ); ffs_vfree(pvp, ino, mode); if (error1 == 0) { ip = VTOI(*vpp); if (ip->i_mode) goto dup_alloc; ip->i_flag |= IN_MODIFIED; vput(*vpp); } return (error); } ip = VTOI(*vpp); if (ip->i_mode) { dup_alloc: printf("mode = 0%o, inum = %ju, fs = %s\n", ip->i_mode, (uintmax_t)ip->i_number, fs->fs_fsmnt); panic("ffs_valloc: dup alloc"); } if (DIP(ip, i_blocks) && (fs->fs_flags & FS_UNCLEAN) == 0) { /* XXX */ printf("free inode %s/%lu had %ld blocks\n", fs->fs_fsmnt, (u_long)ino, (long)DIP(ip, i_blocks)); DIP_SET(ip, i_blocks, 0); } ip->i_flags = 0; DIP_SET(ip, i_flags, 0); /* * Set up a new generation number for this inode. */ while (ip->i_gen == 0 || ++ip->i_gen == 0) ip->i_gen = arc4random(); DIP_SET(ip, i_gen, ip->i_gen); if (fs->fs_magic == FS_UFS2_MAGIC) { vfs_timestamp(&ts); ip->i_din2->di_birthtime = ts.tv_sec; ip->i_din2->di_birthnsec = ts.tv_nsec; } ufs_prepare_reclaim(*vpp); ip->i_flag = 0; (*vpp)->v_vflag = 0; (*vpp)->v_type = VNON; if (fs->fs_magic == FS_UFS2_MAGIC) { (*vpp)->v_op = &ffs_vnodeops2; ip->i_flag |= IN_UFS2; } else { (*vpp)->v_op = &ffs_vnodeops1; } return (0); noinodes: if (reclaimed == 0) { reclaimed = 1; softdep_request_cleanup(fs, pvp, cred, FLUSH_INODES_WAIT); goto retry; } UFS_UNLOCK(ump); if (ppsratecheck(&lastfail, &curfail, 1)) { ffs_fserr(fs, pip->i_number, "out of inodes"); uprintf("\n%s: create/symlink failed, no inodes free\n", fs->fs_fsmnt); } return (ENOSPC); } /* * Find a cylinder group to place a directory. * * The policy implemented by this algorithm is to allocate a * directory inode in the same cylinder group as its parent * directory, but also to reserve space for its files inodes * and data. Restrict the number of directories which may be * allocated one after another in the same cylinder group * without intervening allocation of files. * * If we allocate a first level directory then force allocation * in another cylinder group. */ static ino_t ffs_dirpref(pip) struct inode *pip; { struct fs *fs; int cg, prefcg, dirsize, cgsize; u_int avgifree, avgbfree, avgndir, curdirsize; u_int minifree, minbfree, maxndir; u_int mincg, minndir; u_int maxcontigdirs; mtx_assert(UFS_MTX(ITOUMP(pip)), MA_OWNED); fs = ITOFS(pip); avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg; avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg; /* * Force allocation in another cg if creating a first level dir. */ ASSERT_VOP_LOCKED(ITOV(pip), "ffs_dirpref"); if (ITOV(pip)->v_vflag & VV_ROOT) { prefcg = arc4random() % fs->fs_ncg; mincg = prefcg; minndir = fs->fs_ipg; for (cg = prefcg; cg < fs->fs_ncg; cg++) if (fs->fs_cs(fs, cg).cs_ndir < minndir && fs->fs_cs(fs, cg).cs_nifree >= avgifree && fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { mincg = cg; minndir = fs->fs_cs(fs, cg).cs_ndir; } for (cg = 0; cg < prefcg; cg++) if (fs->fs_cs(fs, cg).cs_ndir < minndir && fs->fs_cs(fs, cg).cs_nifree >= avgifree && fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { mincg = cg; minndir = fs->fs_cs(fs, cg).cs_ndir; } return ((ino_t)(fs->fs_ipg * mincg)); } /* * Count various limits which used for * optimal allocation of a directory inode. */ maxndir = min(avgndir + fs->fs_ipg / 16, fs->fs_ipg); minifree = avgifree - avgifree / 4; if (minifree < 1) minifree = 1; minbfree = avgbfree - avgbfree / 4; if (minbfree < 1) minbfree = 1; cgsize = fs->fs_fsize * fs->fs_fpg; dirsize = fs->fs_avgfilesize * fs->fs_avgfpdir; curdirsize = avgndir ? (cgsize - avgbfree * fs->fs_bsize) / avgndir : 0; if (dirsize < curdirsize) dirsize = curdirsize; if (dirsize <= 0) maxcontigdirs = 0; /* dirsize overflowed */ else maxcontigdirs = min((avgbfree * fs->fs_bsize) / dirsize, 255); if (fs->fs_avgfpdir > 0) maxcontigdirs = min(maxcontigdirs, fs->fs_ipg / fs->fs_avgfpdir); if (maxcontigdirs == 0) maxcontigdirs = 1; /* * Limit number of dirs in one cg and reserve space for * regular files, but only if we have no deficit in * inodes or space. * * We are trying to find a suitable cylinder group nearby * our preferred cylinder group to place a new directory. * We scan from our preferred cylinder group forward looking * for a cylinder group that meets our criterion. If we get * to the final cylinder group and do not find anything, * we start scanning forwards from the beginning of the * filesystem. While it might seem sensible to start scanning * backwards or even to alternate looking forward and backward, * this approach fails badly when the filesystem is nearly full. * Specifically, we first search all the areas that have no space * and finally try the one preceding that. We repeat this on * every request and in the case of the final block end up * searching the entire filesystem. By jumping to the front * of the filesystem, our future forward searches always look * in new cylinder groups so finds every possible block after * one pass over the filesystem. */ prefcg = ino_to_cg(fs, pip->i_number); for (cg = prefcg; cg < fs->fs_ncg; cg++) if (fs->fs_cs(fs, cg).cs_ndir < maxndir && fs->fs_cs(fs, cg).cs_nifree >= minifree && fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { if (fs->fs_contigdirs[cg] < maxcontigdirs) return ((ino_t)(fs->fs_ipg * cg)); } for (cg = 0; cg < prefcg; cg++) if (fs->fs_cs(fs, cg).cs_ndir < maxndir && fs->fs_cs(fs, cg).cs_nifree >= minifree && fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { if (fs->fs_contigdirs[cg] < maxcontigdirs) return ((ino_t)(fs->fs_ipg * cg)); } /* * This is a backstop when we have deficit in space. */ for (cg = prefcg; cg < fs->fs_ncg; cg++) if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) return ((ino_t)(fs->fs_ipg * cg)); for (cg = 0; cg < prefcg; cg++) if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) break; return ((ino_t)(fs->fs_ipg * cg)); } /* * Select the desired position for the next block in a file. The file is * logically divided into sections. The first section is composed of the * direct blocks and the next fs_maxbpg blocks. Each additional section * contains fs_maxbpg blocks. * * If no blocks have been allocated in the first section, the policy is to * request a block in the same cylinder group as the inode that describes * the file. The first indirect is allocated immediately following the last * direct block and the data blocks for the first indirect immediately * follow it. * * If no blocks have been allocated in any other section, the indirect * block(s) are allocated in the same cylinder group as its inode in an * area reserved immediately following the inode blocks. The policy for * the data blocks is to place them in a cylinder group with a greater than * average number of free blocks. An appropriate cylinder group is found * by using a rotor that sweeps the cylinder groups. When a new group of * blocks is needed, the sweep begins in the cylinder group following the * cylinder group from which the previous allocation was made. The sweep * continues until a cylinder group with greater than the average number * of free blocks is found. If the allocation is for the first block in an * indirect block or the previous block is a hole, then the information on * the previous allocation is unavailable; here a best guess is made based * on the logical block number being allocated. * * If a section is already partially allocated, the policy is to * allocate blocks contiguously within the section if possible. */ ufs2_daddr_t ffs_blkpref_ufs1(ip, lbn, indx, bap) struct inode *ip; ufs_lbn_t lbn; int indx; ufs1_daddr_t *bap; { struct fs *fs; u_int cg, inocg; u_int avgbfree, startcg; ufs2_daddr_t pref; KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap")); mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); fs = ITOFS(ip); /* * Allocation of indirect blocks is indicated by passing negative * values in indx: -1 for single indirect, -2 for double indirect, * -3 for triple indirect. As noted below, we attempt to allocate * the first indirect inline with the file data. For all later * indirect blocks, the data is often allocated in other cylinder * groups. However to speed random file access and to speed up * fsck, the filesystem reserves the first fs_metaspace blocks * (typically half of fs_minfree) of the data area of each cylinder * group to hold these later indirect blocks. */ inocg = ino_to_cg(fs, ip->i_number); if (indx < 0) { /* * Our preference for indirect blocks is the zone at the * beginning of the inode's cylinder group data area that * we try to reserve for indirect blocks. */ pref = cgmeta(fs, inocg); /* * If we are allocating the first indirect block, try to * place it immediately following the last direct block. */ if (indx == -1 && lbn < UFS_NDADDR + NINDIR(fs) && ip->i_din1->di_db[UFS_NDADDR - 1] != 0) pref = ip->i_din1->di_db[UFS_NDADDR - 1] + fs->fs_frag; return (pref); } /* * If we are allocating the first data block in the first indirect * block and the indirect has been allocated in the data block area, * try to place it immediately following the indirect block. */ if (lbn == UFS_NDADDR) { pref = ip->i_din1->di_ib[0]; if (pref != 0 && pref >= cgdata(fs, inocg) && pref < cgbase(fs, inocg + 1)) return (pref + fs->fs_frag); } /* * If we are at the beginning of a file, or we have already allocated * the maximum number of blocks per cylinder group, or we do not * have a block allocated immediately preceding us, then we need * to decide where to start allocating new blocks. */ if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { /* * If we are allocating a directory data block, we want * to place it in the metadata area. */ if ((ip->i_mode & IFMT) == IFDIR) return (cgmeta(fs, inocg)); /* * Until we fill all the direct and all the first indirect's * blocks, we try to allocate in the data area of the inode's * cylinder group. */ if (lbn < UFS_NDADDR + NINDIR(fs)) return (cgdata(fs, inocg)); /* * Find a cylinder with greater than average number of * unused data blocks. */ if (indx == 0 || bap[indx - 1] == 0) startcg = inocg + lbn / fs->fs_maxbpg; else startcg = dtog(fs, bap[indx - 1]) + 1; startcg %= fs->fs_ncg; avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; for (cg = startcg; cg < fs->fs_ncg; cg++) if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { fs->fs_cgrotor = cg; return (cgdata(fs, cg)); } for (cg = 0; cg <= startcg; cg++) if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { fs->fs_cgrotor = cg; return (cgdata(fs, cg)); } return (0); } /* * Otherwise, we just always try to lay things out contiguously. */ return (bap[indx - 1] + fs->fs_frag); } /* * Same as above, but for UFS2 */ ufs2_daddr_t ffs_blkpref_ufs2(ip, lbn, indx, bap) struct inode *ip; ufs_lbn_t lbn; int indx; ufs2_daddr_t *bap; { struct fs *fs; u_int cg, inocg; u_int avgbfree, startcg; ufs2_daddr_t pref; KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap")); mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); fs = ITOFS(ip); /* * Allocation of indirect blocks is indicated by passing negative * values in indx: -1 for single indirect, -2 for double indirect, * -3 for triple indirect. As noted below, we attempt to allocate * the first indirect inline with the file data. For all later * indirect blocks, the data is often allocated in other cylinder * groups. However to speed random file access and to speed up * fsck, the filesystem reserves the first fs_metaspace blocks * (typically half of fs_minfree) of the data area of each cylinder * group to hold these later indirect blocks. */ inocg = ino_to_cg(fs, ip->i_number); if (indx < 0) { /* * Our preference for indirect blocks is the zone at the * beginning of the inode's cylinder group data area that * we try to reserve for indirect blocks. */ pref = cgmeta(fs, inocg); /* * If we are allocating the first indirect block, try to * place it immediately following the last direct block. */ if (indx == -1 && lbn < UFS_NDADDR + NINDIR(fs) && ip->i_din2->di_db[UFS_NDADDR - 1] != 0) pref = ip->i_din2->di_db[UFS_NDADDR - 1] + fs->fs_frag; return (pref); } /* * If we are allocating the first data block in the first indirect * block and the indirect has been allocated in the data block area, * try to place it immediately following the indirect block. */ if (lbn == UFS_NDADDR) { pref = ip->i_din2->di_ib[0]; if (pref != 0 && pref >= cgdata(fs, inocg) && pref < cgbase(fs, inocg + 1)) return (pref + fs->fs_frag); } /* * If we are at the beginning of a file, or we have already allocated * the maximum number of blocks per cylinder group, or we do not * have a block allocated immediately preceding us, then we need * to decide where to start allocating new blocks. */ if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { /* * If we are allocating a directory data block, we want * to place it in the metadata area. */ if ((ip->i_mode & IFMT) == IFDIR) return (cgmeta(fs, inocg)); /* * Until we fill all the direct and all the first indirect's * blocks, we try to allocate in the data area of the inode's * cylinder group. */ if (lbn < UFS_NDADDR + NINDIR(fs)) return (cgdata(fs, inocg)); /* * Find a cylinder with greater than average number of * unused data blocks. */ if (indx == 0 || bap[indx - 1] == 0) startcg = inocg + lbn / fs->fs_maxbpg; else startcg = dtog(fs, bap[indx - 1]) + 1; startcg %= fs->fs_ncg; avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; for (cg = startcg; cg < fs->fs_ncg; cg++) if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { fs->fs_cgrotor = cg; return (cgdata(fs, cg)); } for (cg = 0; cg <= startcg; cg++) if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { fs->fs_cgrotor = cg; return (cgdata(fs, cg)); } return (0); } /* * Otherwise, we just always try to lay things out contiguously. */ return (bap[indx - 1] + fs->fs_frag); } /* * Implement the cylinder overflow algorithm. * * The policy implemented by this algorithm is: * 1) allocate the block in its requested cylinder group. * 2) quadradically rehash on the cylinder group number. * 3) brute force search for a free block. * * Must be called with the UFS lock held. Will release the lock on success * and return with it held on failure. */ /*VARARGS5*/ static ufs2_daddr_t ffs_hashalloc(ip, cg, pref, size, rsize, allocator) struct inode *ip; u_int cg; ufs2_daddr_t pref; int size; /* Search size for data blocks, mode for inodes */ int rsize; /* Real allocated size. */ allocfcn_t *allocator; { struct fs *fs; ufs2_daddr_t result; u_int i, icg = cg; mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); #ifdef INVARIANTS if (ITOV(ip)->v_mount->mnt_kern_flag & MNTK_SUSPENDED) panic("ffs_hashalloc: allocation on suspended filesystem"); #endif fs = ITOFS(ip); /* * 1: preferred cylinder group */ result = (*allocator)(ip, cg, pref, size, rsize); if (result) return (result); /* * 2: quadratic rehash */ for (i = 1; i < fs->fs_ncg; i *= 2) { cg += i; if (cg >= fs->fs_ncg) cg -= fs->fs_ncg; result = (*allocator)(ip, cg, 0, size, rsize); if (result) return (result); } /* * 3: brute force search * Note that we start at i == 2, since 0 was checked initially, * and 1 is always checked in the quadratic rehash. */ cg = (icg + 2) % fs->fs_ncg; for (i = 2; i < fs->fs_ncg; i++) { result = (*allocator)(ip, cg, 0, size, rsize); if (result) return (result); cg++; if (cg == fs->fs_ncg) cg = 0; } return (0); } /* * Determine whether a fragment can be extended. * * Check to see if the necessary fragments are available, and * if they are, allocate them. */ static ufs2_daddr_t ffs_fragextend(ip, cg, bprev, osize, nsize) struct inode *ip; u_int cg; ufs2_daddr_t bprev; int osize, nsize; { struct fs *fs; struct cg *cgp; struct buf *bp; struct ufsmount *ump; int nffree; long bno; int frags, bbase; int i, error; u_int8_t *blksfree; ump = ITOUMP(ip); fs = ump->um_fs; if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize)) return (0); frags = numfrags(fs, nsize); bbase = fragnum(fs, bprev); if (bbase > fragnum(fs, (bprev + frags - 1))) { /* cannot extend across a block boundary */ return (0); } UFS_UNLOCK(ump); if ((error = ffs_getcg(fs, ump->um_devvp, cg, &bp, &cgp)) != 0) goto fail; bno = dtogd(fs, bprev); blksfree = cg_blksfree(cgp); for (i = numfrags(fs, osize); i < frags; i++) if (isclr(blksfree, bno + i)) goto fail; /* * the current fragment can be extended * deduct the count on fragment being extended into * increase the count on the remaining fragment (if any) * allocate the extended piece */ for (i = frags; i < fs->fs_frag - bbase; i++) if (isclr(blksfree, bno + i)) break; cgp->cg_frsum[i - numfrags(fs, osize)]--; if (i != frags) cgp->cg_frsum[i - frags]++; for (i = numfrags(fs, osize), nffree = 0; i < frags; i++) { clrbit(blksfree, bno + i); cgp->cg_cs.cs_nffree--; nffree++; } UFS_LOCK(ump); fs->fs_cstotal.cs_nffree -= nffree; fs->fs_cs(fs, cg).cs_nffree -= nffree; fs->fs_fmod = 1; ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); if (DOINGSOFTDEP(ITOV(ip))) softdep_setup_blkmapdep(bp, UFSTOVFS(ump), bprev, frags, numfrags(fs, osize)); bdwrite(bp); return (bprev); fail: brelse(bp); UFS_LOCK(ump); return (0); } /* * Determine whether a block can be allocated. * * Check to see if a block of the appropriate size is available, * and if it is, allocate it. */ static ufs2_daddr_t ffs_alloccg(ip, cg, bpref, size, rsize) struct inode *ip; u_int cg; ufs2_daddr_t bpref; int size; int rsize; { struct fs *fs; struct cg *cgp; struct buf *bp; struct ufsmount *ump; ufs1_daddr_t bno; ufs2_daddr_t blkno; int i, allocsiz, error, frags; u_int8_t *blksfree; ump = ITOUMP(ip); fs = ump->um_fs; if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize) return (0); UFS_UNLOCK(ump); if ((error = ffs_getcg(fs, ump->um_devvp, cg, &bp, &cgp)) != 0 || (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) goto fail; if (size == fs->fs_bsize) { UFS_LOCK(ump); blkno = ffs_alloccgblk(ip, bp, bpref, rsize); ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); bdwrite(bp); return (blkno); } /* * check to see if any fragments are already available * allocsiz is the size which will be allocated, hacking * it down to a smaller size if necessary */ blksfree = cg_blksfree(cgp); frags = numfrags(fs, size); for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++) if (cgp->cg_frsum[allocsiz] != 0) break; if (allocsiz == fs->fs_frag) { /* * no fragments were available, so a block will be * allocated, and hacked up */ if (cgp->cg_cs.cs_nbfree == 0) goto fail; UFS_LOCK(ump); blkno = ffs_alloccgblk(ip, bp, bpref, rsize); ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); bdwrite(bp); return (blkno); } KASSERT(size == rsize, ("ffs_alloccg: size(%d) != rsize(%d)", size, rsize)); bno = ffs_mapsearch(fs, cgp, bpref, allocsiz); if (bno < 0) goto fail; for (i = 0; i < frags; i++) clrbit(blksfree, bno + i); cgp->cg_cs.cs_nffree -= frags; cgp->cg_frsum[allocsiz]--; if (frags != allocsiz) cgp->cg_frsum[allocsiz - frags]++; UFS_LOCK(ump); fs->fs_cstotal.cs_nffree -= frags; fs->fs_cs(fs, cg).cs_nffree -= frags; fs->fs_fmod = 1; blkno = cgbase(fs, cg) + bno; ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); if (DOINGSOFTDEP(ITOV(ip))) softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, frags, 0); bdwrite(bp); return (blkno); fail: brelse(bp); UFS_LOCK(ump); return (0); } /* * Allocate a block in a cylinder group. * * This algorithm implements the following policy: * 1) allocate the requested block. * 2) allocate a rotationally optimal block in the same cylinder. * 3) allocate the next available block on the block rotor for the * specified cylinder group. * Note that this routine only allocates fs_bsize blocks; these * blocks may be fragmented by the routine that allocates them. */ static ufs2_daddr_t ffs_alloccgblk(ip, bp, bpref, size) struct inode *ip; struct buf *bp; ufs2_daddr_t bpref; int size; { struct fs *fs; struct cg *cgp; struct ufsmount *ump; ufs1_daddr_t bno; ufs2_daddr_t blkno; u_int8_t *blksfree; int i, cgbpref; ump = ITOUMP(ip); fs = ump->um_fs; mtx_assert(UFS_MTX(ump), MA_OWNED); cgp = (struct cg *)bp->b_data; blksfree = cg_blksfree(cgp); if (bpref == 0) { bpref = cgbase(fs, cgp->cg_cgx) + cgp->cg_rotor + fs->fs_frag; } else if ((cgbpref = dtog(fs, bpref)) != cgp->cg_cgx) { /* map bpref to correct zone in this cg */ if (bpref < cgdata(fs, cgbpref)) bpref = cgmeta(fs, cgp->cg_cgx); else bpref = cgdata(fs, cgp->cg_cgx); } /* * if the requested block is available, use it */ bno = dtogd(fs, blknum(fs, bpref)); if (ffs_isblock(fs, blksfree, fragstoblks(fs, bno))) goto gotit; /* * Take the next available block in this cylinder group. */ bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag); if (bno < 0) return (0); /* Update cg_rotor only if allocated from the data zone */ if (bno >= dtogd(fs, cgdata(fs, cgp->cg_cgx))) cgp->cg_rotor = bno; gotit: blkno = fragstoblks(fs, bno); ffs_clrblock(fs, blksfree, (long)blkno); ffs_clusteracct(fs, cgp, blkno, -1); cgp->cg_cs.cs_nbfree--; fs->fs_cstotal.cs_nbfree--; fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--; fs->fs_fmod = 1; blkno = cgbase(fs, cgp->cg_cgx) + bno; /* * If the caller didn't want the whole block free the frags here. */ size = numfrags(fs, size); if (size != fs->fs_frag) { bno = dtogd(fs, blkno); for (i = size; i < fs->fs_frag; i++) setbit(blksfree, bno + i); i = fs->fs_frag - size; cgp->cg_cs.cs_nffree += i; fs->fs_cstotal.cs_nffree += i; fs->fs_cs(fs, cgp->cg_cgx).cs_nffree += i; fs->fs_fmod = 1; cgp->cg_frsum[i]++; } /* XXX Fixme. */ UFS_UNLOCK(ump); if (DOINGSOFTDEP(ITOV(ip))) softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, size, 0); UFS_LOCK(ump); return (blkno); } /* * Determine whether a cluster can be allocated. * * We do not currently check for optimal rotational layout if there * are multiple choices in the same cylinder group. Instead we just * take the first one that we find following bpref. */ static ufs2_daddr_t ffs_clusteralloc(ip, cg, bpref, len) struct inode *ip; u_int cg; ufs2_daddr_t bpref; int len; { struct fs *fs; struct cg *cgp; struct buf *bp; struct ufsmount *ump; int i, run, bit, map, got, error; ufs2_daddr_t bno; u_char *mapp; int32_t *lp; u_int8_t *blksfree; ump = ITOUMP(ip); fs = ump->um_fs; if (fs->fs_maxcluster[cg] < len) return (0); UFS_UNLOCK(ump); if ((error = ffs_getcg(fs, ump->um_devvp, cg, &bp, &cgp)) != 0) { UFS_LOCK(ump); return (0); } /* * Check to see if a cluster of the needed size (or bigger) is * available in this cylinder group. */ lp = &cg_clustersum(cgp)[len]; for (i = len; i <= fs->fs_contigsumsize; i++) if (*lp++ > 0) break; if (i > fs->fs_contigsumsize) { /* * This is the first time looking for a cluster in this * cylinder group. Update the cluster summary information * to reflect the true maximum sized cluster so that * future cluster allocation requests can avoid reading * the cylinder group map only to find no clusters. */ lp = &cg_clustersum(cgp)[len - 1]; for (i = len - 1; i > 0; i--) if (*lp-- > 0) break; UFS_LOCK(ump); fs->fs_maxcluster[cg] = i; brelse(bp); return (0); } /* * Search the cluster map to find a big enough cluster. * We take the first one that we find, even if it is larger * than we need as we prefer to get one close to the previous * block allocation. We do not search before the current * preference point as we do not want to allocate a block * that is allocated before the previous one (as we will * then have to wait for another pass of the elevator * algorithm before it will be read). We prefer to fail and * be recalled to try an allocation in the next cylinder group. */ if (dtog(fs, bpref) != cg) bpref = cgdata(fs, cg); else bpref = blknum(fs, bpref); bpref = fragstoblks(fs, dtogd(fs, bpref)); mapp = &cg_clustersfree(cgp)[bpref / NBBY]; map = *mapp++; bit = 1 << (bpref % NBBY); for (run = 0, got = bpref; got < cgp->cg_nclusterblks; got++) { if ((map & bit) == 0) { run = 0; } else { run++; if (run == len) break; } if ((got & (NBBY - 1)) != (NBBY - 1)) { bit <<= 1; } else { map = *mapp++; bit = 1; } } if (got >= cgp->cg_nclusterblks) { UFS_LOCK(ump); brelse(bp); return (0); } /* * Allocate the cluster that we have found. */ blksfree = cg_blksfree(cgp); for (i = 1; i <= len; i++) if (!ffs_isblock(fs, blksfree, got - run + i)) panic("ffs_clusteralloc: map mismatch"); bno = cgbase(fs, cg) + blkstofrags(fs, got - run + 1); if (dtog(fs, bno) != cg) panic("ffs_clusteralloc: allocated out of group"); len = blkstofrags(fs, len); UFS_LOCK(ump); for (i = 0; i < len; i += fs->fs_frag) if (ffs_alloccgblk(ip, bp, bno + i, fs->fs_bsize) != bno + i) panic("ffs_clusteralloc: lost block"); ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); bdwrite(bp); return (bno); } static inline struct buf * getinobuf(struct inode *ip, u_int cg, u_int32_t cginoblk, int gbflags) { struct fs *fs; fs = ITOFS(ip); return (getblk(ITODEVVP(ip), fsbtodb(fs, ino_to_fsba(fs, cg * fs->fs_ipg + cginoblk)), (int)fs->fs_bsize, 0, 0, gbflags)); } /* * Determine whether an inode can be allocated. * * Check to see if an inode is available, and if it is, * allocate it using the following policy: * 1) allocate the requested inode. * 2) allocate the next available inode after the requested * inode in the specified cylinder group. */ static ufs2_daddr_t ffs_nodealloccg(ip, cg, ipref, mode, unused) struct inode *ip; u_int cg; ufs2_daddr_t ipref; int mode; int unused; { struct fs *fs; struct cg *cgp; struct buf *bp, *ibp; struct ufsmount *ump; u_int8_t *inosused, *loc; struct ufs2_dinode *dp2; int error, start, len, i; u_int32_t old_initediblk; ump = ITOUMP(ip); fs = ump->um_fs; check_nifree: if (fs->fs_cs(fs, cg).cs_nifree == 0) return (0); UFS_UNLOCK(ump); if ((error = ffs_getcg(fs, ump->um_devvp, cg, &bp, &cgp)) != 0) { UFS_LOCK(ump); return (0); } restart: if (cgp->cg_cs.cs_nifree == 0) { brelse(bp); UFS_LOCK(ump); return (0); } inosused = cg_inosused(cgp); if (ipref) { ipref %= fs->fs_ipg; if (isclr(inosused, ipref)) goto gotit; } start = cgp->cg_irotor / NBBY; len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY); loc = memcchr(&inosused[start], 0xff, len); if (loc == NULL) { len = start + 1; start = 0; loc = memcchr(&inosused[start], 0xff, len); if (loc == NULL) { printf("cg = %d, irotor = %ld, fs = %s\n", cg, (long)cgp->cg_irotor, fs->fs_fsmnt); panic("ffs_nodealloccg: map corrupted"); /* NOTREACHED */ } } ipref = (loc - inosused) * NBBY + ffs(~*loc) - 1; gotit: /* * Check to see if we need to initialize more inodes. */ if (fs->fs_magic == FS_UFS2_MAGIC && ipref + INOPB(fs) > cgp->cg_initediblk && cgp->cg_initediblk < cgp->cg_niblk) { old_initediblk = cgp->cg_initediblk; /* * Free the cylinder group lock before writing the * initialized inode block. Entering the * babarrierwrite() with the cylinder group lock * causes lock order violation between the lock and * snaplk. * * Another thread can decide to initialize the same * inode block, but whichever thread first gets the * cylinder group lock after writing the newly * allocated inode block will update it and the other * will realize that it has lost and leave the * cylinder group unchanged. */ ibp = getinobuf(ip, cg, old_initediblk, GB_LOCK_NOWAIT); brelse(bp); if (ibp == NULL) { /* * The inode block buffer is already owned by * another thread, which must initialize it. * Wait on the buffer to allow another thread * to finish the updates, with dropped cg * buffer lock, then retry. */ ibp = getinobuf(ip, cg, old_initediblk, 0); brelse(ibp); UFS_LOCK(ump); goto check_nifree; } bzero(ibp->b_data, (int)fs->fs_bsize); dp2 = (struct ufs2_dinode *)(ibp->b_data); for (i = 0; i < INOPB(fs); i++) { while (dp2->di_gen == 0) dp2->di_gen = arc4random(); dp2++; } /* * Rather than adding a soft updates dependency to ensure * that the new inode block is written before it is claimed * by the cylinder group map, we just do a barrier write * here. The barrier write will ensure that the inode block * gets written before the updated cylinder group map can be * written. The barrier write should only slow down bulk * loading of newly created filesystems. */ babarrierwrite(ibp); /* * After the inode block is written, try to update the * cg initediblk pointer. If another thread beat us * to it, then leave it unchanged as the other thread * has already set it correctly. */ error = ffs_getcg(fs, ump->um_devvp, cg, &bp, &cgp); UFS_LOCK(ump); ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); if (error != 0) return (error); if (cgp->cg_initediblk == old_initediblk) cgp->cg_initediblk += INOPB(fs); goto restart; } cgp->cg_irotor = ipref; UFS_LOCK(ump); ACTIVECLEAR(fs, cg); setbit(inosused, ipref); cgp->cg_cs.cs_nifree--; fs->fs_cstotal.cs_nifree--; fs->fs_cs(fs, cg).cs_nifree--; fs->fs_fmod = 1; if ((mode & IFMT) == IFDIR) { cgp->cg_cs.cs_ndir++; fs->fs_cstotal.cs_ndir++; fs->fs_cs(fs, cg).cs_ndir++; } UFS_UNLOCK(ump); if (DOINGSOFTDEP(ITOV(ip))) softdep_setup_inomapdep(bp, ip, cg * fs->fs_ipg + ipref, mode); bdwrite(bp); return ((ino_t)(cg * fs->fs_ipg + ipref)); } /* * Free a block or fragment. * * The specified block or fragment is placed back in the * free map. If a fragment is deallocated, a possible * block reassembly is checked. */ static void ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd) struct ufsmount *ump; struct fs *fs; struct vnode *devvp; ufs2_daddr_t bno; long size; ino_t inum; struct workhead *dephd; { struct mount *mp; struct cg *cgp; struct buf *bp; ufs1_daddr_t fragno, cgbno; int i, blk, frags, bbase, error; u_int cg; u_int8_t *blksfree; struct cdev *dev; cg = dtog(fs, bno); if (devvp->v_type == VREG) { /* devvp is a snapshot */ MPASS(devvp->v_mount->mnt_data == ump); dev = ump->um_devvp->v_rdev; } else if (devvp->v_type == VCHR) { /* devvp is a normal disk device */ dev = devvp->v_rdev; ASSERT_VOP_LOCKED(devvp, "ffs_blkfree_cg"); } else return; #ifdef INVARIANTS if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0 || fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) { printf("dev=%s, bno = %jd, bsize = %ld, size = %ld, fs = %s\n", devtoname(dev), (intmax_t)bno, (long)fs->fs_bsize, size, fs->fs_fsmnt); panic("ffs_blkfree_cg: bad size"); } #endif if ((u_int)bno >= fs->fs_size) { printf("bad block %jd, ino %lu\n", (intmax_t)bno, (u_long)inum); ffs_fserr(fs, inum, "bad block"); return; } if ((error = ffs_getcg(fs, devvp, cg, &bp, &cgp)) != 0) return; cgbno = dtogd(fs, bno); blksfree = cg_blksfree(cgp); UFS_LOCK(ump); if (size == fs->fs_bsize) { fragno = fragstoblks(fs, cgbno); if (!ffs_isfreeblock(fs, blksfree, fragno)) { if (devvp->v_type == VREG) { UFS_UNLOCK(ump); /* devvp is a snapshot */ brelse(bp); return; } printf("dev = %s, block = %jd, fs = %s\n", devtoname(dev), (intmax_t)bno, fs->fs_fsmnt); panic("ffs_blkfree_cg: freeing free block"); } ffs_setblock(fs, blksfree, fragno); ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; fs->fs_cstotal.cs_nbfree++; fs->fs_cs(fs, cg).cs_nbfree++; } else { bbase = cgbno - fragnum(fs, cgbno); /* * decrement the counts associated with the old frags */ blk = blkmap(fs, blksfree, bbase); ffs_fragacct(fs, blk, cgp->cg_frsum, -1); /* * deallocate the fragment */ frags = numfrags(fs, size); for (i = 0; i < frags; i++) { if (isset(blksfree, cgbno + i)) { printf("dev = %s, block = %jd, fs = %s\n", devtoname(dev), (intmax_t)(bno + i), fs->fs_fsmnt); panic("ffs_blkfree_cg: freeing free frag"); } setbit(blksfree, cgbno + i); } cgp->cg_cs.cs_nffree += i; fs->fs_cstotal.cs_nffree += i; fs->fs_cs(fs, cg).cs_nffree += i; /* * add back in counts associated with the new frags */ blk = blkmap(fs, blksfree, bbase); ffs_fragacct(fs, blk, cgp->cg_frsum, 1); /* * if a complete block has been reassembled, account for it */ fragno = fragstoblks(fs, bbase); if (ffs_isblock(fs, blksfree, fragno)) { cgp->cg_cs.cs_nffree -= fs->fs_frag; fs->fs_cstotal.cs_nffree -= fs->fs_frag; fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag; ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; fs->fs_cstotal.cs_nbfree++; fs->fs_cs(fs, cg).cs_nbfree++; } } fs->fs_fmod = 1; ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); mp = UFSTOVFS(ump); if (MOUNTEDSOFTDEP(mp) && devvp->v_type == VCHR) softdep_setup_blkfree(UFSTOVFS(ump), bp, bno, numfrags(fs, size), dephd); bdwrite(bp); } struct ffs_blkfree_trim_params { struct task task; struct ufsmount *ump; struct vnode *devvp; ufs2_daddr_t bno; long size; ino_t inum; struct workhead *pdephd; struct workhead dephd; }; static void ffs_blkfree_trim_task(ctx, pending) void *ctx; int pending; { struct ffs_blkfree_trim_params *tp; tp = ctx; ffs_blkfree_cg(tp->ump, tp->ump->um_fs, tp->devvp, tp->bno, tp->size, tp->inum, tp->pdephd); vn_finished_secondary_write(UFSTOVFS(tp->ump)); atomic_add_int(&tp->ump->um_trim_inflight, -1); free(tp, M_TEMP); } static void ffs_blkfree_trim_completed(bip) struct bio *bip; { struct ffs_blkfree_trim_params *tp; tp = bip->bio_caller2; g_destroy_bio(bip); TASK_INIT(&tp->task, 0, ffs_blkfree_trim_task, tp); taskqueue_enqueue(tp->ump->um_trim_tq, &tp->task); } void ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd) struct ufsmount *ump; struct fs *fs; struct vnode *devvp; ufs2_daddr_t bno; long size; ino_t inum; enum vtype vtype; struct workhead *dephd; { struct mount *mp; struct bio *bip; struct ffs_blkfree_trim_params *tp; /* * Check to see if a snapshot wants to claim the block. * Check that devvp is a normal disk device, not a snapshot, * it has a snapshot(s) associated with it, and one of the * snapshots wants to claim the block. */ if (devvp->v_type == VCHR && (devvp->v_vflag & VV_COPYONWRITE) && ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, dephd)) { return; } /* * Nothing to delay if TRIM is disabled, or the operation is * performed on the snapshot. */ if (!ump->um_candelete || devvp->v_type == VREG) { ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd); return; } /* * Postpone the set of the free bit in the cg bitmap until the * BIO_DELETE is completed. Otherwise, due to disk queue * reordering, TRIM might be issued after we reuse the block * and write some new data into it. */ atomic_add_int(&ump->um_trim_inflight, 1); tp = malloc(sizeof(struct ffs_blkfree_trim_params), M_TEMP, M_WAITOK); tp->ump = ump; tp->devvp = devvp; tp->bno = bno; tp->size = size; tp->inum = inum; if (dephd != NULL) { LIST_INIT(&tp->dephd); LIST_SWAP(dephd, &tp->dephd, worklist, wk_list); tp->pdephd = &tp->dephd; } else tp->pdephd = NULL; bip = g_alloc_bio(); bip->bio_cmd = BIO_DELETE; bip->bio_offset = dbtob(fsbtodb(fs, bno)); bip->bio_done = ffs_blkfree_trim_completed; bip->bio_length = size; bip->bio_caller2 = tp; mp = UFSTOVFS(ump); vn_start_secondary_write(NULL, &mp, 0); g_io_request(bip, (struct g_consumer *)devvp->v_bufobj.bo_private); } #ifdef INVARIANTS /* * Verify allocation of a block or fragment. Returns true if block or * fragment is allocated, false if it is free. */ static int ffs_checkblk(ip, bno, size) struct inode *ip; ufs2_daddr_t bno; long size; { struct fs *fs; struct cg *cgp; struct buf *bp; ufs1_daddr_t cgbno; int i, error, frags, free; u_int8_t *blksfree; fs = ITOFS(ip); if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) { printf("bsize = %ld, size = %ld, fs = %s\n", (long)fs->fs_bsize, size, fs->fs_fsmnt); panic("ffs_checkblk: bad size"); } if ((u_int)bno >= fs->fs_size) panic("ffs_checkblk: bad block %jd", (intmax_t)bno); error = ffs_getcg(fs, ITODEVVP(ip), dtog(fs, bno), &bp, &cgp); if (error) panic("ffs_checkblk: cylinder group read failed"); blksfree = cg_blksfree(cgp); cgbno = dtogd(fs, bno); if (size == fs->fs_bsize) { free = ffs_isblock(fs, blksfree, fragstoblks(fs, cgbno)); } else { frags = numfrags(fs, size); for (free = 0, i = 0; i < frags; i++) if (isset(blksfree, cgbno + i)) free++; if (free != 0 && free != frags) panic("ffs_checkblk: partially free fragment"); } brelse(bp); return (!free); } #endif /* INVARIANTS */ /* * Free an inode. */ int ffs_vfree(pvp, ino, mode) struct vnode *pvp; ino_t ino; int mode; { struct ufsmount *ump; struct inode *ip; if (DOINGSOFTDEP(pvp)) { softdep_freefile(pvp, ino, mode); return (0); } ip = VTOI(pvp); ump = VFSTOUFS(pvp->v_mount); return (ffs_freefile(ump, ump->um_fs, ump->um_devvp, ino, mode, NULL)); } /* * Do the actual free operation. * The specified inode is placed back in the free map. */ int ffs_freefile(ump, fs, devvp, ino, mode, wkhd) struct ufsmount *ump; struct fs *fs; struct vnode *devvp; ino_t ino; int mode; struct workhead *wkhd; { struct cg *cgp; struct buf *bp; ufs2_daddr_t cgbno; int error; u_int cg; u_int8_t *inosused; struct cdev *dev; cg = ino_to_cg(fs, ino); if (devvp->v_type == VREG) { /* devvp is a snapshot */ MPASS(devvp->v_mount->mnt_data == ump); dev = ump->um_devvp->v_rdev; cgbno = fragstoblks(fs, cgtod(fs, cg)); } else if (devvp->v_type == VCHR) { /* devvp is a normal disk device */ dev = devvp->v_rdev; cgbno = fsbtodb(fs, cgtod(fs, cg)); } else { bp = NULL; return (0); } if (ino >= fs->fs_ipg * fs->fs_ncg) panic("ffs_freefile: range: dev = %s, ino = %ju, fs = %s", devtoname(dev), (uintmax_t)ino, fs->fs_fsmnt); if ((error = ffs_getcg(fs, devvp, cg, &bp, &cgp)) != 0) return (error); inosused = cg_inosused(cgp); ino %= fs->fs_ipg; if (isclr(inosused, ino)) { printf("dev = %s, ino = %ju, fs = %s\n", devtoname(dev), (uintmax_t)(ino + cg * fs->fs_ipg), fs->fs_fsmnt); if (fs->fs_ronly == 0) panic("ffs_freefile: freeing free inode"); } clrbit(inosused, ino); if (ino < cgp->cg_irotor) cgp->cg_irotor = ino; cgp->cg_cs.cs_nifree++; UFS_LOCK(ump); fs->fs_cstotal.cs_nifree++; fs->fs_cs(fs, cg).cs_nifree++; if ((mode & IFMT) == IFDIR) { cgp->cg_cs.cs_ndir--; fs->fs_cstotal.cs_ndir--; fs->fs_cs(fs, cg).cs_ndir--; } fs->fs_fmod = 1; ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); if (MOUNTEDSOFTDEP(UFSTOVFS(ump)) && devvp->v_type == VCHR) softdep_setup_inofree(UFSTOVFS(ump), bp, ino + cg * fs->fs_ipg, wkhd); bdwrite(bp); return (0); } /* * Check to see if a file is free. * Used to check for allocated files in snapshots. */ int ffs_checkfreefile(fs, devvp, ino) struct fs *fs; struct vnode *devvp; ino_t ino; { struct cg *cgp; struct buf *bp; ufs2_daddr_t cgbno; int ret, error; u_int cg; u_int8_t *inosused; cg = ino_to_cg(fs, ino); if (devvp->v_type == VREG) { /* devvp is a snapshot */ cgbno = fragstoblks(fs, cgtod(fs, cg)); } else if (devvp->v_type == VCHR) { /* devvp is a normal disk device */ cgbno = fsbtodb(fs, cgtod(fs, cg)); } else { return (1); } if (ino >= fs->fs_ipg * fs->fs_ncg) return (1); if ((error = ffs_getcg(fs, devvp, cg, &bp, &cgp)) != 0) return (1); inosused = cg_inosused(cgp); ino %= fs->fs_ipg; ret = isclr(inosused, ino); brelse(bp); return (ret); } /* * Find a block of the specified size in the specified cylinder group. * * It is a panic if a request is made to find a block if none are * available. */ static ufs1_daddr_t ffs_mapsearch(fs, cgp, bpref, allocsiz) struct fs *fs; struct cg *cgp; ufs2_daddr_t bpref; int allocsiz; { ufs1_daddr_t bno; int start, len, loc, i; int blk, field, subfield, pos; u_int8_t *blksfree; /* * find the fragment by searching through the free block * map for an appropriate bit pattern */ if (bpref) start = dtogd(fs, bpref) / NBBY; else start = cgp->cg_frotor / NBBY; blksfree = cg_blksfree(cgp); len = howmany(fs->fs_fpg, NBBY) - start; loc = scanc((u_int)len, (u_char *)&blksfree[start], fragtbl[fs->fs_frag], (u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); if (loc == 0) { len = start + 1; start = 0; loc = scanc((u_int)len, (u_char *)&blksfree[0], fragtbl[fs->fs_frag], (u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); if (loc == 0) { printf("start = %d, len = %d, fs = %s\n", start, len, fs->fs_fsmnt); panic("ffs_alloccg: map corrupted"); /* NOTREACHED */ } } bno = (start + len - loc) * NBBY; cgp->cg_frotor = bno; /* * found the byte in the map * sift through the bits to find the selected frag */ for (i = bno + NBBY; bno < i; bno += fs->fs_frag) { blk = blkmap(fs, blksfree, bno); blk <<= 1; field = around[allocsiz]; subfield = inside[allocsiz]; for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) { if ((blk & field) == subfield) return (bno + pos); field <<= 1; subfield <<= 1; } } printf("bno = %lu, fs = %s\n", (u_long)bno, fs->fs_fsmnt); panic("ffs_alloccg: block not in map"); return (-1); } /* * Fetch and verify a cylinder group. */ int ffs_getcg(fs, devvp, cg, bpp, cgpp) struct fs *fs; struct vnode *devvp; u_int cg; struct buf **bpp; struct cg **cgpp; { struct buf *bp; struct cg *cgp; - int error; + int flags, error; *bpp = NULL; *cgpp = NULL; - error = bread(devvp, devvp->v_type == VREG ? + flags = 0; + if ((fs->fs_metackhash & CK_CYLGRP) != 0) + flags |= GB_CKHASH; + error = breadn_flags(devvp, devvp->v_type == VREG ? fragstoblks(fs, cgtod(fs, cg)) : fsbtodb(fs, cgtod(fs, cg)), - (int)fs->fs_cgsize, NOCRED, &bp); + (int)fs->fs_cgsize, NULL, NULL, 0, NOCRED, flags, + ffs_ckhash_cg, &bp); if (error != 0) return (error); cgp = (struct cg *)bp->b_data; - if (!cg_chkmagic(cgp) || cgp->cg_cgx != cg) { + if (((fs->fs_metackhash & CK_CYLGRP) != 0 && + (bp->b_flags & B_CKHASH) != 0 && + cgp->cg_ckhash != bp->b_ckhash) || + !cg_chkmagic(cgp) || cgp->cg_cgx != cg) { + printf("checksum failed: cg %u, cgp: 0x%x != bp: 0x%lx\n", + cg, cgp->cg_ckhash, bp->b_ckhash); + bp->b_flags &= ~B_CKHASH; + bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); return (EIO); } + bp->b_flags &= ~B_CKHASH; bp->b_xflags |= BX_BKGRDWRITE; + if ((fs->fs_metackhash & CK_CYLGRP) != 0) + bp->b_xflags |= BX_CYLGRP; cgp->cg_old_time = cgp->cg_time = time_second; *bpp = bp; *cgpp = cgp; return (0); +} + +static void +ffs_ckhash_cg(bp) + struct buf *bp; +{ + uint32_t ckhash; + struct cg *cgp; + + cgp = (struct cg *)bp->b_data; + ckhash = cgp->cg_ckhash; + cgp->cg_ckhash = 0; + bp->b_ckhash = calculate_crc32c(~0L, bp->b_data, bp->b_bcount); + cgp->cg_ckhash = ckhash; } /* * Fserr prints the name of a filesystem with an error diagnostic. * * The form of the error message is: * fs: error message */ void ffs_fserr(fs, inum, cp) struct fs *fs; ino_t inum; char *cp; { struct thread *td = curthread; /* XXX */ struct proc *p = td->td_proc; log(LOG_ERR, "pid %d (%s), uid %d inumber %ju on %s: %s\n", p->p_pid, p->p_comm, td->td_ucred->cr_uid, (uintmax_t)inum, fs->fs_fsmnt, cp); } /* * This function provides the capability for the fsck program to * update an active filesystem. Fourteen operations are provided: * * adjrefcnt(inode, amt) - adjusts the reference count on the * specified inode by the specified amount. Under normal * operation the count should always go down. Decrementing * the count to zero will cause the inode to be freed. * adjblkcnt(inode, amt) - adjust the number of blocks used by the * inode by the specified amount. * adjndir, adjbfree, adjifree, adjffree, adjnumclusters(amt) - * adjust the superblock summary. * freedirs(inode, count) - directory inodes [inode..inode + count - 1] * are marked as free. Inodes should never have to be marked * as in use. * freefiles(inode, count) - file inodes [inode..inode + count - 1] * are marked as free. Inodes should never have to be marked * as in use. * freeblks(blockno, size) - blocks [blockno..blockno + size - 1] * are marked as free. Blocks should never have to be marked * as in use. * setflags(flags, set/clear) - the fs_flags field has the specified * flags set (second parameter +1) or cleared (second parameter -1). * setcwd(dirinode) - set the current directory to dirinode in the * filesystem associated with the snapshot. * setdotdot(oldvalue, newvalue) - Verify that the inode number for ".." * in the current directory is oldvalue then change it to newvalue. * unlink(nameptr, oldvalue) - Verify that the inode number associated * with nameptr in the current directory is oldvalue then unlink it. * * The following functions may only be used on a quiescent filesystem * by the soft updates journal. They are not safe to be run on an active * filesystem. * * setinode(inode, dip) - the specified disk inode is replaced with the * contents pointed to by dip. * setbufoutput(fd, flags) - output associated with the specified file * descriptor (which must reference the character device supporting * the filesystem) switches from using physio to running through the * buffer cache when flags is set to 1. The descriptor reverts to * physio for output when flags is set to zero. */ static int sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vfs_ffs, FFS_ADJ_REFCNT, adjrefcnt, CTLFLAG_WR|CTLTYPE_STRUCT, 0, 0, sysctl_ffs_fsck, "S,fsck", "Adjust Inode Reference Count"); static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_BLKCNT, adjblkcnt, CTLFLAG_WR, sysctl_ffs_fsck, "Adjust Inode Used Blocks Count"); static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NDIR, adjndir, CTLFLAG_WR, sysctl_ffs_fsck, "Adjust number of directories"); static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NBFREE, adjnbfree, CTLFLAG_WR, sysctl_ffs_fsck, "Adjust number of free blocks"); static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NIFREE, adjnifree, CTLFLAG_WR, sysctl_ffs_fsck, "Adjust number of free inodes"); static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NFFREE, adjnffree, CTLFLAG_WR, sysctl_ffs_fsck, "Adjust number of free frags"); static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NUMCLUSTERS, adjnumclusters, CTLFLAG_WR, sysctl_ffs_fsck, "Adjust number of free clusters"); static SYSCTL_NODE(_vfs_ffs, FFS_DIR_FREE, freedirs, CTLFLAG_WR, sysctl_ffs_fsck, "Free Range of Directory Inodes"); static SYSCTL_NODE(_vfs_ffs, FFS_FILE_FREE, freefiles, CTLFLAG_WR, sysctl_ffs_fsck, "Free Range of File Inodes"); static SYSCTL_NODE(_vfs_ffs, FFS_BLK_FREE, freeblks, CTLFLAG_WR, sysctl_ffs_fsck, "Free Range of Blocks"); static SYSCTL_NODE(_vfs_ffs, FFS_SET_FLAGS, setflags, CTLFLAG_WR, sysctl_ffs_fsck, "Change Filesystem Flags"); static SYSCTL_NODE(_vfs_ffs, FFS_SET_CWD, setcwd, CTLFLAG_WR, sysctl_ffs_fsck, "Set Current Working Directory"); static SYSCTL_NODE(_vfs_ffs, FFS_SET_DOTDOT, setdotdot, CTLFLAG_WR, sysctl_ffs_fsck, "Change Value of .. Entry"); static SYSCTL_NODE(_vfs_ffs, FFS_UNLINK, unlink, CTLFLAG_WR, sysctl_ffs_fsck, "Unlink a Duplicate Name"); static SYSCTL_NODE(_vfs_ffs, FFS_SET_INODE, setinode, CTLFLAG_WR, sysctl_ffs_fsck, "Update an On-Disk Inode"); static SYSCTL_NODE(_vfs_ffs, FFS_SET_BUFOUTPUT, setbufoutput, CTLFLAG_WR, sysctl_ffs_fsck, "Set Buffered Writing for Descriptor"); #define DEBUG 1 #ifdef DEBUG static int fsckcmds = 0; SYSCTL_INT(_debug, OID_AUTO, fsckcmds, CTLFLAG_RW, &fsckcmds, 0, ""); #endif /* DEBUG */ static int buffered_write(struct file *, struct uio *, struct ucred *, int, struct thread *); static int sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS) { struct thread *td = curthread; struct fsck_cmd cmd; struct ufsmount *ump; struct vnode *vp, *dvp, *fdvp; struct inode *ip, *dp; struct mount *mp; struct fs *fs; ufs2_daddr_t blkno; long blkcnt, blksize; struct file *fp, *vfp; cap_rights_t rights; int filetype, error; static struct fileops *origops, bufferedops; if (req->newlen > sizeof cmd) return (EBADRPC); if ((error = SYSCTL_IN(req, &cmd, sizeof cmd)) != 0) return (error); if (cmd.version != FFS_CMD_VERSION) return (ERPCMISMATCH); if ((error = getvnode(td, cmd.handle, cap_rights_init(&rights, CAP_FSCK), &fp)) != 0) return (error); vp = fp->f_data; if (vp->v_type != VREG && vp->v_type != VDIR) { fdrop(fp, td); return (EINVAL); } vn_start_write(vp, &mp, V_WAIT); if (mp == NULL || strncmp(mp->mnt_stat.f_fstypename, "ufs", MFSNAMELEN)) { vn_finished_write(mp); fdrop(fp, td); return (EINVAL); } ump = VFSTOUFS(mp); if ((mp->mnt_flag & MNT_RDONLY) && ump->um_fsckpid != td->td_proc->p_pid) { vn_finished_write(mp); fdrop(fp, td); return (EROFS); } fs = ump->um_fs; filetype = IFREG; switch (oidp->oid_number) { case FFS_SET_FLAGS: #ifdef DEBUG if (fsckcmds) printf("%s: %s flags\n", mp->mnt_stat.f_mntonname, cmd.size > 0 ? "set" : "clear"); #endif /* DEBUG */ if (cmd.size > 0) fs->fs_flags |= (long)cmd.value; else fs->fs_flags &= ~(long)cmd.value; break; case FFS_ADJ_REFCNT: #ifdef DEBUG if (fsckcmds) { printf("%s: adjust inode %jd link count by %jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, (intmax_t)cmd.size); } #endif /* DEBUG */ if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) break; ip = VTOI(vp); ip->i_nlink += cmd.size; DIP_SET(ip, i_nlink, ip->i_nlink); ip->i_effnlink += cmd.size; ip->i_flag |= IN_CHANGE | IN_MODIFIED; error = ffs_update(vp, 1); if (DOINGSOFTDEP(vp)) softdep_change_linkcnt(ip); vput(vp); break; case FFS_ADJ_BLKCNT: #ifdef DEBUG if (fsckcmds) { printf("%s: adjust inode %jd block count by %jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, (intmax_t)cmd.size); } #endif /* DEBUG */ if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) break; ip = VTOI(vp); DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + cmd.size); ip->i_flag |= IN_CHANGE | IN_MODIFIED; error = ffs_update(vp, 1); vput(vp); break; case FFS_DIR_FREE: filetype = IFDIR; /* fall through */ case FFS_FILE_FREE: #ifdef DEBUG if (fsckcmds) { if (cmd.size == 1) printf("%s: free %s inode %ju\n", mp->mnt_stat.f_mntonname, filetype == IFDIR ? "directory" : "file", (uintmax_t)cmd.value); else printf("%s: free %s inodes %ju-%ju\n", mp->mnt_stat.f_mntonname, filetype == IFDIR ? "directory" : "file", (uintmax_t)cmd.value, (uintmax_t)(cmd.value + cmd.size - 1)); } #endif /* DEBUG */ while (cmd.size > 0) { if ((error = ffs_freefile(ump, fs, ump->um_devvp, cmd.value, filetype, NULL))) break; cmd.size -= 1; cmd.value += 1; } break; case FFS_BLK_FREE: #ifdef DEBUG if (fsckcmds) { if (cmd.size == 1) printf("%s: free block %jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); else printf("%s: free blocks %jd-%jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, (intmax_t)cmd.value + cmd.size - 1); } #endif /* DEBUG */ blkno = cmd.value; blkcnt = cmd.size; blksize = fs->fs_frag - (blkno % fs->fs_frag); while (blkcnt > 0) { if (blksize > blkcnt) blksize = blkcnt; ffs_blkfree(ump, fs, ump->um_devvp, blkno, blksize * fs->fs_fsize, UFS_ROOTINO, VDIR, NULL); blkno += blksize; blkcnt -= blksize; blksize = fs->fs_frag; } break; /* * Adjust superblock summaries. fsck(8) is expected to * submit deltas when necessary. */ case FFS_ADJ_NDIR: #ifdef DEBUG if (fsckcmds) { printf("%s: adjust number of directories by %jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); } #endif /* DEBUG */ fs->fs_cstotal.cs_ndir += cmd.value; break; case FFS_ADJ_NBFREE: #ifdef DEBUG if (fsckcmds) { printf("%s: adjust number of free blocks by %+jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); } #endif /* DEBUG */ fs->fs_cstotal.cs_nbfree += cmd.value; break; case FFS_ADJ_NIFREE: #ifdef DEBUG if (fsckcmds) { printf("%s: adjust number of free inodes by %+jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); } #endif /* DEBUG */ fs->fs_cstotal.cs_nifree += cmd.value; break; case FFS_ADJ_NFFREE: #ifdef DEBUG if (fsckcmds) { printf("%s: adjust number of free frags by %+jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); } #endif /* DEBUG */ fs->fs_cstotal.cs_nffree += cmd.value; break; case FFS_ADJ_NUMCLUSTERS: #ifdef DEBUG if (fsckcmds) { printf("%s: adjust number of free clusters by %+jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); } #endif /* DEBUG */ fs->fs_cstotal.cs_numclusters += cmd.value; break; case FFS_SET_CWD: #ifdef DEBUG if (fsckcmds) { printf("%s: set current directory to inode %jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); } #endif /* DEBUG */ if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_SHARED, &vp))) break; AUDIT_ARG_VNODE1(vp); if ((error = change_dir(vp, td)) != 0) { vput(vp); break; } VOP_UNLOCK(vp, 0); pwd_chdir(td, vp); break; case FFS_SET_DOTDOT: #ifdef DEBUG if (fsckcmds) { printf("%s: change .. in cwd from %jd to %jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, (intmax_t)cmd.size); } #endif /* DEBUG */ /* * First we have to get and lock the parent directory * to which ".." points. */ error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &fdvp); if (error) break; /* * Now we get and lock the child directory containing "..". */ FILEDESC_SLOCK(td->td_proc->p_fd); dvp = td->td_proc->p_fd->fd_cdir; FILEDESC_SUNLOCK(td->td_proc->p_fd); if ((error = vget(dvp, LK_EXCLUSIVE, td)) != 0) { vput(fdvp); break; } dp = VTOI(dvp); dp->i_offset = 12; /* XXX mastertemplate.dot_reclen */ error = ufs_dirrewrite(dp, VTOI(fdvp), (ino_t)cmd.size, DT_DIR, 0); cache_purge(fdvp); cache_purge(dvp); vput(dvp); vput(fdvp); break; case FFS_UNLINK: #ifdef DEBUG if (fsckcmds) { char buf[32]; if (copyinstr((char *)(intptr_t)cmd.value, buf,32,NULL)) strncpy(buf, "Name_too_long", 32); printf("%s: unlink %s (inode %jd)\n", mp->mnt_stat.f_mntonname, buf, (intmax_t)cmd.size); } #endif /* DEBUG */ /* * kern_unlinkat will do its own start/finish writes and * they do not nest, so drop ours here. Setting mp == NULL * indicates that vn_finished_write is not needed down below. */ vn_finished_write(mp); mp = NULL; error = kern_unlinkat(td, AT_FDCWD, (char *)(intptr_t)cmd.value, UIO_USERSPACE, (ino_t)cmd.size); break; case FFS_SET_INODE: if (ump->um_fsckpid != td->td_proc->p_pid) { error = EPERM; break; } #ifdef DEBUG if (fsckcmds) { printf("%s: update inode %jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); } #endif /* DEBUG */ if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) break; AUDIT_ARG_VNODE1(vp); ip = VTOI(vp); if (I_IS_UFS1(ip)) error = copyin((void *)(intptr_t)cmd.size, ip->i_din1, sizeof(struct ufs1_dinode)); else error = copyin((void *)(intptr_t)cmd.size, ip->i_din2, sizeof(struct ufs2_dinode)); if (error) { vput(vp); break; } ip->i_flag |= IN_CHANGE | IN_MODIFIED; error = ffs_update(vp, 1); vput(vp); break; case FFS_SET_BUFOUTPUT: if (ump->um_fsckpid != td->td_proc->p_pid) { error = EPERM; break; } if (ITOUMP(VTOI(vp)) != ump) { error = EINVAL; break; } #ifdef DEBUG if (fsckcmds) { printf("%s: %s buffered output for descriptor %jd\n", mp->mnt_stat.f_mntonname, cmd.size == 1 ? "enable" : "disable", (intmax_t)cmd.value); } #endif /* DEBUG */ if ((error = getvnode(td, cmd.value, cap_rights_init(&rights, CAP_FSCK), &vfp)) != 0) break; if (vfp->f_vnode->v_type != VCHR) { fdrop(vfp, td); error = EINVAL; break; } if (origops == NULL) { origops = vfp->f_ops; bcopy((void *)origops, (void *)&bufferedops, sizeof(bufferedops)); bufferedops.fo_write = buffered_write; } if (cmd.size == 1) atomic_store_rel_ptr((volatile uintptr_t *)&vfp->f_ops, (uintptr_t)&bufferedops); else atomic_store_rel_ptr((volatile uintptr_t *)&vfp->f_ops, (uintptr_t)origops); fdrop(vfp, td); break; default: #ifdef DEBUG if (fsckcmds) { printf("Invalid request %d from fsck\n", oidp->oid_number); } #endif /* DEBUG */ error = EINVAL; break; } fdrop(fp, td); vn_finished_write(mp); return (error); } /* * Function to switch a descriptor to use the buffer cache to stage * its I/O. This is needed so that writes to the filesystem device * will give snapshots a chance to copy modified blocks for which it * needs to retain copies. */ static int buffered_write(fp, uio, active_cred, flags, td) struct file *fp; struct uio *uio; struct ucred *active_cred; int flags; struct thread *td; { struct vnode *devvp, *vp; struct inode *ip; struct buf *bp; struct fs *fs; struct filedesc *fdp; int error; daddr_t lbn; /* * The devvp is associated with the /dev filesystem. To discover * the filesystem with which the device is associated, we depend * on the application setting the current directory to a location * within the filesystem being written. Yes, this is an ugly hack. */ devvp = fp->f_vnode; if (!vn_isdisk(devvp, NULL)) return (EINVAL); fdp = td->td_proc->p_fd; FILEDESC_SLOCK(fdp); vp = fdp->fd_cdir; vref(vp); FILEDESC_SUNLOCK(fdp); vn_lock(vp, LK_SHARED | LK_RETRY); /* * Check that the current directory vnode indeed belongs to * UFS before trying to dereference UFS-specific v_data fields. */ if (vp->v_op != &ffs_vnodeops1 && vp->v_op != &ffs_vnodeops2) { vput(vp); return (EINVAL); } ip = VTOI(vp); if (ITODEVVP(ip) != devvp) { vput(vp); return (EINVAL); } fs = ITOFS(ip); vput(vp); foffset_lock_uio(fp, uio, flags); vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); #ifdef DEBUG if (fsckcmds) { printf("%s: buffered write for block %jd\n", fs->fs_fsmnt, (intmax_t)btodb(uio->uio_offset)); } #endif /* DEBUG */ /* * All I/O must be contained within a filesystem block, start on * a fragment boundary, and be a multiple of fragments in length. */ if (uio->uio_resid > fs->fs_bsize - (uio->uio_offset % fs->fs_bsize) || fragoff(fs, uio->uio_offset) != 0 || fragoff(fs, uio->uio_resid) != 0) { error = EINVAL; goto out; } lbn = numfrags(fs, uio->uio_offset); bp = getblk(devvp, lbn, uio->uio_resid, 0, 0, 0); bp->b_flags |= B_RELBUF; if ((error = uiomove((char *)bp->b_data, uio->uio_resid, uio)) != 0) { brelse(bp); goto out; } error = bwrite(bp); out: VOP_UNLOCK(devvp, 0); foffset_unlock_uio(fp, uio, flags | FOF_NEXTOFF); return (error); } Index: head/sys/ufs/ffs/ffs_inode.c =================================================================== --- head/sys/ufs/ffs/ffs_inode.c (revision 323922) +++ head/sys/ufs/ffs/ffs_inode.c (revision 323923) @@ -1,767 +1,767 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_inode.c 8.13 (Berkeley) 4/21/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_quota.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int ffs_indirtrunc(struct inode *, ufs2_daddr_t, ufs2_daddr_t, ufs2_daddr_t, int, ufs2_daddr_t *); /* * Update the access, modified, and inode change times as specified by the * IN_ACCESS, IN_UPDATE, and IN_CHANGE flags respectively. Write the inode * to disk if the IN_MODIFIED flag is set (it may be set initially, or by * the timestamp update). The IN_LAZYMOD flag is set to force a write * later if not now. The IN_LAZYACCESS is set instead of IN_MODIFIED if the fs * is currently being suspended (or is suspended) and vnode has been accessed. * If we write now, then clear IN_MODIFIED, IN_LAZYACCESS and IN_LAZYMOD to * reflect the presumably successful write, and if waitfor is set, then wait * for the write to complete. */ int ffs_update(vp, waitfor) struct vnode *vp; int waitfor; { struct fs *fs; struct buf *bp; struct inode *ip; int flags, error; ASSERT_VOP_ELOCKED(vp, "ffs_update"); ufs_itimes(vp); ip = VTOI(vp); if ((ip->i_flag & IN_MODIFIED) == 0 && waitfor == 0) return (0); ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED); fs = ITOFS(ip); if (fs->fs_ronly && ITOUMP(ip)->um_fsckpid == 0) return (0); /* * If we are updating a snapshot and another process is currently * writing the buffer containing the inode for this snapshot then * a deadlock can occur when it tries to check the snapshot to see * if that block needs to be copied. Thus when updating a snapshot * we check to see if the buffer is already locked, and if it is * we drop the snapshot lock until the buffer has been written * and is available to us. We have to grab a reference to the * snapshot vnode to prevent it from being removed while we are * waiting for the buffer. */ flags = 0; if (IS_SNAPSHOT(ip)) flags = GB_LOCK_NOWAIT; loop: - error = breadn_flags(ITODEVVP(ip), + error = bread_gb(ITODEVVP(ip), fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), - (int) fs->fs_bsize, 0, 0, 0, NOCRED, flags, &bp); + (int) fs->fs_bsize, NOCRED, flags, &bp); if (error != 0) { if (error != EBUSY) return (error); KASSERT((IS_SNAPSHOT(ip)), ("EBUSY from non-snapshot")); /* * Wait for our inode block to become available. * * Hold a reference to the vnode to protect against * ffs_snapgone(). Since we hold a reference, it can only * get reclaimed (VI_DOOMED flag) in a forcible downgrade * or unmount. For an unmount, the entire filesystem will be * gone, so we cannot attempt to touch anything associated * with it while the vnode is unlocked; all we can do is * pause briefly and try again. If when we relock the vnode * we discover that it has been reclaimed, updating it is no * longer necessary and we can just return an error. */ vref(vp); VOP_UNLOCK(vp, 0); pause("ffsupd", 1); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vrele(vp); if ((vp->v_iflag & VI_DOOMED) != 0) return (ENOENT); goto loop; } if (DOINGSOFTDEP(vp)) softdep_update_inodeblock(ip, bp, waitfor); else if (ip->i_effnlink != ip->i_nlink) panic("ffs_update: bad link cnt"); if (I_IS_UFS1(ip)) { *((struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1; /* XXX: FIX? The entropy here is desirable, but the harvesting may be expensive */ random_harvest_queue(&(ip->i_din1), sizeof(ip->i_din1), 1, RANDOM_FS_ATIME); } else { *((struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2; /* XXX: FIX? The entropy here is desirable, but the harvesting may be expensive */ random_harvest_queue(&(ip->i_din2), sizeof(ip->i_din2), 1, RANDOM_FS_ATIME); } if (waitfor) error = bwrite(bp); else if (vm_page_count_severe() || buf_dirty_count_severe()) { bawrite(bp); error = 0; } else { if (bp->b_bufsize == fs->fs_bsize) bp->b_flags |= B_CLUSTEROK; bdwrite(bp); error = 0; } return (error); } #define SINGLE 0 /* index of single indirect block */ #define DOUBLE 1 /* index of double indirect block */ #define TRIPLE 2 /* index of triple indirect block */ /* * Truncate the inode ip to at most length size, freeing the * disk blocks. */ int ffs_truncate(vp, length, flags, cred) struct vnode *vp; off_t length; int flags; struct ucred *cred; { struct inode *ip; ufs2_daddr_t bn, lbn, lastblock, lastiblock[UFS_NIADDR]; ufs2_daddr_t indir_lbn[UFS_NIADDR], oldblks[UFS_NDADDR + UFS_NIADDR]; ufs2_daddr_t newblks[UFS_NDADDR + UFS_NIADDR]; ufs2_daddr_t count, blocksreleased = 0, datablocks, blkno; struct bufobj *bo; struct fs *fs; struct buf *bp; struct ufsmount *ump; int softdeptrunc, journaltrunc; int needextclean, extblocks; int offset, size, level, nblocks; int i, error, allerror, indiroff, waitforupdate; off_t osize; ip = VTOI(vp); ump = VFSTOUFS(vp->v_mount); fs = ump->um_fs; bo = &vp->v_bufobj; ASSERT_VOP_LOCKED(vp, "ffs_truncate"); if (length < 0) return (EINVAL); if (length > fs->fs_maxfilesize) return (EFBIG); #ifdef QUOTA error = getinoquota(ip); if (error) return (error); #endif /* * Historically clients did not have to specify which data * they were truncating. So, if not specified, we assume * traditional behavior, e.g., just the normal data. */ if ((flags & (IO_EXT | IO_NORMAL)) == 0) flags |= IO_NORMAL; if (!DOINGSOFTDEP(vp) && !DOINGASYNC(vp)) flags |= IO_SYNC; waitforupdate = (flags & IO_SYNC) != 0 || !DOINGASYNC(vp); /* * If we are truncating the extended-attributes, and cannot * do it with soft updates, then do it slowly here. If we are * truncating both the extended attributes and the file contents * (e.g., the file is being unlinked), then pick it off with * soft updates below. */ allerror = 0; needextclean = 0; softdeptrunc = 0; journaltrunc = DOINGSUJ(vp); if (journaltrunc == 0 && DOINGSOFTDEP(vp) && length == 0) softdeptrunc = !softdep_slowdown(vp); extblocks = 0; datablocks = DIP(ip, i_blocks); if (fs->fs_magic == FS_UFS2_MAGIC && ip->i_din2->di_extsize > 0) { extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize)); datablocks -= extblocks; } if ((flags & IO_EXT) && extblocks > 0) { if (length != 0) panic("ffs_truncate: partial trunc of extdata"); if (softdeptrunc || journaltrunc) { if ((flags & IO_NORMAL) == 0) goto extclean; needextclean = 1; } else { if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) return (error); #ifdef QUOTA (void) chkdq(ip, -extblocks, NOCRED, 0); #endif vinvalbuf(vp, V_ALT, 0, 0); vn_pages_remove(vp, OFF_TO_IDX(lblktosize(fs, -extblocks)), 0); osize = ip->i_din2->di_extsize; ip->i_din2->di_blocks -= extblocks; ip->i_din2->di_extsize = 0; for (i = 0; i < UFS_NXADDR; i++) { oldblks[i] = ip->i_din2->di_extb[i]; ip->i_din2->di_extb[i] = 0; } ip->i_flag |= IN_CHANGE; if ((error = ffs_update(vp, waitforupdate))) return (error); for (i = 0; i < UFS_NXADDR; i++) { if (oldblks[i] == 0) continue; ffs_blkfree(ump, fs, ITODEVVP(ip), oldblks[i], sblksize(fs, osize, i), ip->i_number, vp->v_type, NULL); } } } if ((flags & IO_NORMAL) == 0) return (0); if (vp->v_type == VLNK && (ip->i_size < vp->v_mount->mnt_maxsymlinklen || datablocks == 0)) { #ifdef INVARIANTS if (length != 0) panic("ffs_truncate: partial truncate of symlink"); #endif bzero(SHORTLINK(ip), (u_int)ip->i_size); ip->i_size = 0; DIP_SET(ip, i_size, 0); ip->i_flag |= IN_CHANGE | IN_UPDATE; if (needextclean) goto extclean; return (ffs_update(vp, waitforupdate)); } if (ip->i_size == length) { ip->i_flag |= IN_CHANGE | IN_UPDATE; if (needextclean) goto extclean; return (ffs_update(vp, 0)); } if (fs->fs_ronly) panic("ffs_truncate: read-only filesystem"); if (IS_SNAPSHOT(ip)) ffs_snapremove(vp); vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; osize = ip->i_size; /* * Lengthen the size of the file. We must ensure that the * last byte of the file is allocated. Since the smallest * value of osize is 0, length will be at least 1. */ if (osize < length) { vnode_pager_setsize(vp, length); flags |= BA_CLRBUF; error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp); if (error) { vnode_pager_setsize(vp, osize); return (error); } ip->i_size = length; DIP_SET(ip, i_size, length); if (bp->b_bufsize == fs->fs_bsize) bp->b_flags |= B_CLUSTEROK; if (flags & IO_SYNC) bwrite(bp); else if (DOINGASYNC(vp)) bdwrite(bp); else bawrite(bp); ip->i_flag |= IN_CHANGE | IN_UPDATE; return (ffs_update(vp, waitforupdate)); } /* * Lookup block number for a given offset. Zero length files * have no blocks, so return a blkno of -1. */ lbn = lblkno(fs, length - 1); if (length == 0) { blkno = -1; } else if (lbn < UFS_NDADDR) { blkno = DIP(ip, i_db[lbn]); } else { error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, cred, BA_METAONLY, &bp); if (error) return (error); indiroff = (lbn - UFS_NDADDR) % NINDIR(fs); if (I_IS_UFS1(ip)) blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff]; else blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff]; /* * If the block number is non-zero, then the indirect block * must have been previously allocated and need not be written. * If the block number is zero, then we may have allocated * the indirect block and hence need to write it out. */ if (blkno != 0) brelse(bp); else if (flags & IO_SYNC) bwrite(bp); else bdwrite(bp); } /* * If the block number at the new end of the file is zero, * then we must allocate it to ensure that the last block of * the file is allocated. Soft updates does not handle this * case, so here we have to clean up the soft updates data * structures describing the allocation past the truncation * point. Finding and deallocating those structures is a lot of * work. Since partial truncation with a hole at the end occurs * rarely, we solve the problem by syncing the file so that it * will have no soft updates data structures left. */ if (blkno == 0 && (error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) return (error); if (blkno != 0 && DOINGSOFTDEP(vp)) { if (softdeptrunc == 0 && journaltrunc == 0) { /* * If soft updates cannot handle this truncation, * clean up soft dependency data structures and * fall through to the synchronous truncation. */ if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) return (error); } else { flags = IO_NORMAL | (needextclean ? IO_EXT: 0); if (journaltrunc) softdep_journal_freeblocks(ip, cred, length, flags); else softdep_setup_freeblocks(ip, length, flags); ASSERT_VOP_LOCKED(vp, "ffs_truncate1"); if (journaltrunc == 0) { ip->i_flag |= IN_CHANGE | IN_UPDATE; error = ffs_update(vp, 0); } return (error); } } /* * Shorten the size of the file. If the last block of the * shortened file is unallocated, we must allocate it. * Additionally, if the file is not being truncated to a * block boundary, the contents of the partial block * following the end of the file must be zero'ed in * case it ever becomes accessible again because of * subsequent file growth. Directories however are not * zero'ed as they should grow back initialized to empty. */ offset = blkoff(fs, length); if (blkno != 0 && offset == 0) { ip->i_size = length; DIP_SET(ip, i_size, length); } else { lbn = lblkno(fs, length); flags |= BA_CLRBUF; error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp); if (error) return (error); /* * When we are doing soft updates and the UFS_BALLOC * above fills in a direct block hole with a full sized * block that will be truncated down to a fragment below, * we must flush out the block dependency with an FSYNC * so that we do not get a soft updates inconsistency * when we create the fragment below. */ if (DOINGSOFTDEP(vp) && lbn < UFS_NDADDR && fragroundup(fs, blkoff(fs, length)) < fs->fs_bsize && (error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) return (error); ip->i_size = length; DIP_SET(ip, i_size, length); size = blksize(fs, ip, lbn); if (vp->v_type != VDIR && offset != 0) bzero((char *)bp->b_data + offset, (u_int)(size - offset)); /* Kirk's code has reallocbuf(bp, size, 1) here */ allocbuf(bp, size); if (bp->b_bufsize == fs->fs_bsize) bp->b_flags |= B_CLUSTEROK; if (flags & IO_SYNC) bwrite(bp); else if (DOINGASYNC(vp)) bdwrite(bp); else bawrite(bp); } /* * Calculate index into inode's block list of * last direct and indirect blocks (if any) * which we want to keep. Lastblock is -1 when * the file is truncated to 0. */ lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1; lastiblock[SINGLE] = lastblock - UFS_NDADDR; lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs); lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs); nblocks = btodb(fs->fs_bsize); /* * Update file and block pointers on disk before we start freeing * blocks. If we crash before free'ing blocks below, the blocks * will be returned to the free list. lastiblock values are also * normalized to -1 for calls to ffs_indirtrunc below. */ for (level = TRIPLE; level >= SINGLE; level--) { oldblks[UFS_NDADDR + level] = DIP(ip, i_ib[level]); if (lastiblock[level] < 0) { DIP_SET(ip, i_ib[level], 0); lastiblock[level] = -1; } } for (i = 0; i < UFS_NDADDR; i++) { oldblks[i] = DIP(ip, i_db[i]); if (i > lastblock) DIP_SET(ip, i_db[i], 0); } ip->i_flag |= IN_CHANGE | IN_UPDATE; allerror = ffs_update(vp, waitforupdate); /* * Having written the new inode to disk, save its new configuration * and put back the old block pointers long enough to process them. * Note that we save the new block configuration so we can check it * when we are done. */ for (i = 0; i < UFS_NDADDR; i++) { newblks[i] = DIP(ip, i_db[i]); DIP_SET(ip, i_db[i], oldblks[i]); } for (i = 0; i < UFS_NIADDR; i++) { newblks[UFS_NDADDR + i] = DIP(ip, i_ib[i]); DIP_SET(ip, i_ib[i], oldblks[UFS_NDADDR + i]); } ip->i_size = osize; DIP_SET(ip, i_size, osize); error = vtruncbuf(vp, cred, length, fs->fs_bsize); if (error && (allerror == 0)) allerror = error; /* * Indirect blocks first. */ indir_lbn[SINGLE] = -UFS_NDADDR; indir_lbn[DOUBLE] = indir_lbn[SINGLE] - NINDIR(fs) - 1; indir_lbn[TRIPLE] = indir_lbn[DOUBLE] - NINDIR(fs) * NINDIR(fs) - 1; for (level = TRIPLE; level >= SINGLE; level--) { bn = DIP(ip, i_ib[level]); if (bn != 0) { error = ffs_indirtrunc(ip, indir_lbn[level], fsbtodb(fs, bn), lastiblock[level], level, &count); if (error) allerror = error; blocksreleased += count; if (lastiblock[level] < 0) { DIP_SET(ip, i_ib[level], 0); ffs_blkfree(ump, fs, ump->um_devvp, bn, fs->fs_bsize, ip->i_number, vp->v_type, NULL); blocksreleased += nblocks; } } if (lastiblock[level] >= 0) goto done; } /* * All whole direct blocks or frags. */ for (i = UFS_NDADDR - 1; i > lastblock; i--) { long bsize; bn = DIP(ip, i_db[i]); if (bn == 0) continue; DIP_SET(ip, i_db[i], 0); bsize = blksize(fs, ip, i); ffs_blkfree(ump, fs, ump->um_devvp, bn, bsize, ip->i_number, vp->v_type, NULL); blocksreleased += btodb(bsize); } if (lastblock < 0) goto done; /* * Finally, look for a change in size of the * last direct block; release any frags. */ bn = DIP(ip, i_db[lastblock]); if (bn != 0) { long oldspace, newspace; /* * Calculate amount of space we're giving * back as old block size minus new block size. */ oldspace = blksize(fs, ip, lastblock); ip->i_size = length; DIP_SET(ip, i_size, length); newspace = blksize(fs, ip, lastblock); if (newspace == 0) panic("ffs_truncate: newspace"); if (oldspace - newspace > 0) { /* * Block number of space to be free'd is * the old block # plus the number of frags * required for the storage we're keeping. */ bn += numfrags(fs, newspace); ffs_blkfree(ump, fs, ump->um_devvp, bn, oldspace - newspace, ip->i_number, vp->v_type, NULL); blocksreleased += btodb(oldspace - newspace); } } done: #ifdef INVARIANTS for (level = SINGLE; level <= TRIPLE; level++) if (newblks[UFS_NDADDR + level] != DIP(ip, i_ib[level])) panic("ffs_truncate1"); for (i = 0; i < UFS_NDADDR; i++) if (newblks[i] != DIP(ip, i_db[i])) panic("ffs_truncate2"); BO_LOCK(bo); if (length == 0 && (fs->fs_magic != FS_UFS2_MAGIC || ip->i_din2->di_extsize == 0) && (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0)) panic("ffs_truncate3"); BO_UNLOCK(bo); #endif /* INVARIANTS */ /* * Put back the real size. */ ip->i_size = length; DIP_SET(ip, i_size, length); if (DIP(ip, i_blocks) >= blocksreleased) DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - blocksreleased); else /* sanity */ DIP_SET(ip, i_blocks, 0); ip->i_flag |= IN_CHANGE; #ifdef QUOTA (void) chkdq(ip, -blocksreleased, NOCRED, 0); #endif return (allerror); extclean: if (journaltrunc) softdep_journal_freeblocks(ip, cred, length, IO_EXT); else softdep_setup_freeblocks(ip, length, IO_EXT); return (ffs_update(vp, waitforupdate)); } /* * Release blocks associated with the inode ip and stored in the indirect * block bn. Blocks are free'd in LIFO order up to (but not including) * lastbn. If level is greater than SINGLE, the block is an indirect block * and recursive calls to indirtrunc must be used to cleanse other indirect * blocks. */ static int ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp) struct inode *ip; ufs2_daddr_t lbn, lastbn; ufs2_daddr_t dbn; int level; ufs2_daddr_t *countp; { struct buf *bp; struct fs *fs; struct vnode *vp; caddr_t copy = NULL; int i, nblocks, error = 0, allerror = 0; ufs2_daddr_t nb, nlbn, last; ufs2_daddr_t blkcount, factor, blocksreleased = 0; ufs1_daddr_t *bap1 = NULL; ufs2_daddr_t *bap2 = NULL; #define BAP(ip, i) (I_IS_UFS1(ip) ? bap1[i] : bap2[i]) fs = ITOFS(ip); /* * Calculate index in current block of last * block to be kept. -1 indicates the entire * block so we need not calculate the index. */ factor = lbn_offset(fs, level); last = lastbn; if (lastbn > 0) last /= factor; nblocks = btodb(fs->fs_bsize); /* * Get buffer of block pointers, zero those entries corresponding * to blocks to be free'd, and update on disk copy first. Since * double(triple) indirect before single(double) indirect, calls * to bmap on these blocks will fail. However, we already have * the on disk address, so we have to set the b_blkno field * explicitly instead of letting bread do everything for us. */ vp = ITOV(ip); bp = getblk(vp, lbn, (int)fs->fs_bsize, 0, 0, 0); if ((bp->b_flags & B_CACHE) == 0) { #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, bp, 0); PROC_UNLOCK(curproc); } #endif /* RACCT */ curthread->td_ru.ru_inblock++; /* pay for read */ bp->b_iocmd = BIO_READ; bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; if (bp->b_bcount > bp->b_bufsize) panic("ffs_indirtrunc: bad buffer size"); bp->b_blkno = dbn; vfs_busy_pages(bp, 0); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); error = bufwait(bp); } if (error) { brelse(bp); *countp = 0; return (error); } if (I_IS_UFS1(ip)) bap1 = (ufs1_daddr_t *)bp->b_data; else bap2 = (ufs2_daddr_t *)bp->b_data; if (lastbn != -1) { copy = malloc(fs->fs_bsize, M_TEMP, M_WAITOK); bcopy((caddr_t)bp->b_data, copy, (u_int)fs->fs_bsize); for (i = last + 1; i < NINDIR(fs); i++) if (I_IS_UFS1(ip)) bap1[i] = 0; else bap2[i] = 0; if (DOINGASYNC(vp)) { bdwrite(bp); } else { error = bwrite(bp); if (error) allerror = error; } if (I_IS_UFS1(ip)) bap1 = (ufs1_daddr_t *)copy; else bap2 = (ufs2_daddr_t *)copy; } /* * Recursively free totally unused blocks. */ for (i = NINDIR(fs) - 1, nlbn = lbn + 1 - i * factor; i > last; i--, nlbn += factor) { nb = BAP(ip, i); if (nb == 0) continue; if (level > SINGLE) { if ((error = ffs_indirtrunc(ip, nlbn, fsbtodb(fs, nb), (ufs2_daddr_t)-1, level - 1, &blkcount)) != 0) allerror = error; blocksreleased += blkcount; } ffs_blkfree(ITOUMP(ip), fs, ITODEVVP(ip), nb, fs->fs_bsize, ip->i_number, vp->v_type, NULL); blocksreleased += nblocks; } /* * Recursively free last partial block. */ if (level > SINGLE && lastbn >= 0) { last = lastbn % factor; nb = BAP(ip, i); if (nb != 0) { error = ffs_indirtrunc(ip, nlbn, fsbtodb(fs, nb), last, level - 1, &blkcount); if (error) allerror = error; blocksreleased += blkcount; } } if (copy != NULL) { free(copy, M_TEMP); } else { bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); } *countp = blocksreleased; return (allerror); } int ffs_rdonly(struct inode *ip) { return (ITOFS(ip)->fs_ronly != 0); } Index: head/sys/ufs/ffs/ffs_snapshot.c =================================================================== --- head/sys/ufs/ffs/ffs_snapshot.c (revision 323922) +++ head/sys/ufs/ffs/ffs_snapshot.c (revision 323923) @@ -1,2690 +1,2704 @@ /*- * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. * * Further information about snapshots can be obtained from: * * Marshall Kirk McKusick http://www.mckusick.com/softdep/ * 1614 Oxford Street mckusick@mckusick.com * Berkeley, CA 94709-1608 +1-510-843-9542 * USA * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 */ #include __FBSDID("$FreeBSD$"); #include "opt_quota.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define KERNCRED thread0.td_ucred #define DEBUG 1 #include "opt_ffs.h" #ifdef NO_FFS_SNAPSHOT int ffs_snapshot(mp, snapfile) struct mount *mp; char *snapfile; { return (EINVAL); } int ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, wkhd) struct fs *fs; struct vnode *devvp; ufs2_daddr_t bno; long size; ino_t inum; enum vtype vtype; struct workhead *wkhd; { return (EINVAL); } void ffs_snapremove(vp) struct vnode *vp; { } void ffs_snapshot_mount(mp) struct mount *mp; { } void ffs_snapshot_unmount(mp) struct mount *mp; { } void ffs_snapgone(ip) struct inode *ip; { } int ffs_copyonwrite(devvp, bp) struct vnode *devvp; struct buf *bp; { return (EINVAL); } void ffs_sync_snap(mp, waitfor) struct mount *mp; int waitfor; { } #else FEATURE(ffs_snapshot, "FFS snapshot support"); LIST_HEAD(, snapdata) snapfree; static struct mtx snapfree_lock; MTX_SYSINIT(ffs_snapfree, &snapfree_lock, "snapdata free list", MTX_DEF); static int cgaccount(int, struct vnode *, struct buf *, int); static int expunge_ufs1(struct vnode *, struct inode *, struct fs *, int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int), int, int); static int indiracct_ufs1(struct vnode *, struct vnode *, int, ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int), int); static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); static int expunge_ufs2(struct vnode *, struct inode *, struct fs *, int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int), int, int); static int indiracct_ufs2(struct vnode *, struct vnode *, int, ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int), int); static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); static int readblock(struct vnode *vp, struct buf *, ufs2_daddr_t); static void try_free_snapdata(struct vnode *devvp); static struct snapdata *ffs_snapdata_acquire(struct vnode *devvp); static int ffs_bp_snapblk(struct vnode *, struct buf *); /* * To ensure the consistency of snapshots across crashes, we must * synchronously write out copied blocks before allowing the * originals to be modified. Because of the rather severe speed * penalty that this imposes, the code normally only ensures * persistence for the filesystem metadata contained within a * snapshot. Setting the following flag allows this crash * persistence to be enabled for file contents. */ int dopersistence = 0; #ifdef DEBUG #include SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, ""); static int snapdebug = 0; SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, ""); int collectsnapstats = 0; SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats, 0, ""); #endif /* DEBUG */ /* * Create a snapshot file and initialize it for the filesystem. */ int ffs_snapshot(mp, snapfile) struct mount *mp; char *snapfile; { ufs2_daddr_t numblks, blkno, *blkp, *snapblklist; int error, cg, snaploc; int i, size, len, loc; ufs2_daddr_t blockno; uint64_t flag; struct timespec starttime = {0, 0}, endtime; char saved_nice = 0; long redo = 0, snaplistsize = 0; int32_t *lp; void *space; struct fs *copy_fs = NULL, *fs; struct thread *td = curthread; struct inode *ip, *xp; struct buf *bp, *nbp, *ibp; struct nameidata nd; struct mount *wrtmp; struct vattr vat; struct vnode *vp, *xvp, *mvp, *devvp; struct uio auio; struct iovec aiov; struct snapdata *sn; struct ufsmount *ump; ump = VFSTOUFS(mp); fs = ump->um_fs; sn = NULL; /* * At the moment, journaled soft updates cannot support * taking snapshots. */ if (MOUNTEDSUJ(mp)) { vfs_mount_error(mp, "%s: Snapshots are not yet supported when " "running with journaled soft updates", fs->fs_fsmnt); return (EOPNOTSUPP); } MNT_ILOCK(mp); flag = mp->mnt_flag; MNT_IUNLOCK(mp); /* * Need to serialize access to snapshot code per filesystem. */ /* * Assign a snapshot slot in the superblock. */ UFS_LOCK(ump); for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) if (fs->fs_snapinum[snaploc] == 0) break; UFS_UNLOCK(ump); if (snaploc == FSMAXSNAP) return (ENOSPC); /* * Create the snapshot file. */ restart: NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF | NOCACHE, UIO_SYSSPACE, snapfile, td); if ((error = namei(&nd)) != 0) return (error); if (nd.ni_vp != NULL) { vput(nd.ni_vp); error = EEXIST; } if (nd.ni_dvp->v_mount != mp) error = EXDEV; if (error) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); return (error); } VATTR_NULL(&vat); vat.va_type = VREG; vat.va_mode = S_IRUSR; vat.va_vaflags |= VA_EXCLUSIVE; if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp)) wrtmp = NULL; if (wrtmp != mp) panic("ffs_snapshot: mount mismatch"); vfs_rel(wrtmp); if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &wrtmp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat); VOP_UNLOCK(nd.ni_dvp, 0); if (error) { NDFREE(&nd, NDF_ONLY_PNBUF); vn_finished_write(wrtmp); vrele(nd.ni_dvp); return (error); } vp = nd.ni_vp; vp->v_vflag |= VV_SYSTEM; ip = VTOI(vp); devvp = ITODEVVP(ip); /* * Allocate and copy the last block contents so as to be able * to set size to that of the filesystem. */ numblks = howmany(fs->fs_size, fs->fs_frag); error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)), fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp); if (error) goto out; ip->i_size = lblktosize(fs, (off_t)numblks); DIP_SET(ip, i_size, ip->i_size); ip->i_flag |= IN_CHANGE | IN_UPDATE; error = readblock(vp, bp, numblks - 1); bawrite(bp); if (error != 0) goto out; /* * Preallocate critical data structures so that we can copy * them in without further allocation after we suspend all * operations on the filesystem. We would like to just release * the allocated buffers without writing them since they will * be filled in below once we are ready to go, but this upsets * the soft update code, so we go ahead and write the new buffers. * * Allocate all indirect blocks and mark all of them as not * needing to be copied. */ for (blkno = UFS_NDADDR; blkno < numblks; blkno += NINDIR(fs)) { error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno), fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp); if (error) goto out; bawrite(ibp); } /* * Allocate copies for the superblock and its summary information. */ error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED, 0, &nbp); if (error) goto out; bawrite(nbp); blkno = fragstoblks(fs, fs->fs_csaddr); len = howmany(fs->fs_cssize, fs->fs_bsize); for (loc = 0; loc < len; loc++) { error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out; bawrite(nbp); } /* * Allocate all cylinder group blocks. */ for (cg = 0; cg < fs->fs_ncg; cg++) { error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out; bawrite(nbp); if (cg % 10 == 0) ffs_syncvnode(vp, MNT_WAIT, 0); } /* * Copy all the cylinder group maps. Although the * filesystem is still active, we hope that only a few * cylinder groups will change between now and when we * suspend operations. Thus, we will be able to quickly * touch up the few cylinder groups that changed during * the suspension period. */ len = howmany(fs->fs_ncg, NBBY); space = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO); UFS_LOCK(ump); fs->fs_active = space; UFS_UNLOCK(ump); for (cg = 0; cg < fs->fs_ncg; cg++) { error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out; error = cgaccount(cg, vp, nbp, 1); bawrite(nbp); if (cg % 10 == 0) ffs_syncvnode(vp, MNT_WAIT, 0); if (error) goto out; } /* * Change inode to snapshot type file. */ ip->i_flags |= SF_SNAPSHOT; DIP_SET(ip, i_flags, ip->i_flags); ip->i_flag |= IN_CHANGE | IN_UPDATE; /* * Ensure that the snapshot is completely on disk. * Since we have marked it as a snapshot it is safe to * unlock it as no process will be allowed to write to it. */ if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) goto out; VOP_UNLOCK(vp, 0); /* * All allocations are done, so we can now snapshot the system. * * Recind nice scheduling while running with the filesystem suspended. */ if (td->td_proc->p_nice > 0) { struct proc *p; p = td->td_proc; PROC_LOCK(p); saved_nice = p->p_nice; sched_nice(p, 0); PROC_UNLOCK(p); } /* * Suspend operation on filesystem. */ for (;;) { vn_finished_write(wrtmp); if ((error = vfs_write_suspend(vp->v_mount, 0)) != 0) { vn_start_write(NULL, &wrtmp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); goto out; } if (mp->mnt_kern_flag & MNTK_SUSPENDED) break; vn_start_write(NULL, &wrtmp, V_WAIT); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (ip->i_effnlink == 0) { error = ENOENT; /* Snapshot file unlinked */ goto out1; } if (collectsnapstats) nanotime(&starttime); /* The last block might have changed. Copy it again to be sure. */ error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)), fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp); if (error != 0) goto out1; error = readblock(vp, bp, numblks - 1); bp->b_flags |= B_VALIDSUSPWRT; bawrite(bp); if (error != 0) goto out1; /* * First, copy all the cylinder group maps that have changed. */ for (cg = 0; cg < fs->fs_ncg; cg++) { if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0) continue; redo++; error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out1; error = cgaccount(cg, vp, nbp, 2); bawrite(nbp); if (error) goto out1; } /* * Grab a copy of the superblock and its summary information. * We delay writing it until the suspension is released below. */ copy_fs = malloc((u_long)fs->fs_bsize, M_UFSMNT, M_WAITOK); bcopy(fs, copy_fs, fs->fs_sbsize); if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) copy_fs->fs_clean = 1; size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE; if (fs->fs_sbsize < size) bzero(&((char *)copy_fs)[fs->fs_sbsize], size - fs->fs_sbsize); size = blkroundup(fs, fs->fs_cssize); if (fs->fs_contigsumsize > 0) size += fs->fs_ncg * sizeof(int32_t); space = malloc((u_long)size, M_UFSMNT, M_WAITOK); copy_fs->fs_csp = space; bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize); space = (char *)space + fs->fs_cssize; loc = howmany(fs->fs_cssize, fs->fs_fsize); i = fs->fs_frag - loc % fs->fs_frag; len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize; if (len > 0) { if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc), len, KERNCRED, &bp)) != 0) { brelse(bp); free(copy_fs->fs_csp, M_UFSMNT); free(copy_fs, M_UFSMNT); copy_fs = NULL; goto out1; } bcopy(bp->b_data, space, (u_int)len); space = (char *)space + len; bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); } if (fs->fs_contigsumsize > 0) { copy_fs->fs_maxcluster = lp = space; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; } /* * We must check for active files that have been unlinked * (e.g., with a zero link count). We have to expunge all * trace of these files from the snapshot so that they are * not reclaimed prematurely by fsck or unnecessarily dumped. * We turn off the MNTK_SUSPENDED flag to avoid a panic from * spec_strategy about writing on a suspended filesystem. * Note that we skip unlinked snapshot files as they will * be handled separately below. * * We also calculate the needed size for the snapshot list. */ snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) + FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */; MNT_ILOCK(mp); mp->mnt_kern_flag &= ~MNTK_SUSPENDED; MNT_IUNLOCK(mp); loop: MNT_VNODE_FOREACH_ALL(xvp, mp, mvp) { if ((xvp->v_usecount == 0 && (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) || xvp->v_type == VNON || IS_SNAPSHOT(VTOI(xvp))) { VI_UNLOCK(xvp); continue; } /* * We can skip parent directory vnode because it must have * this snapshot file in it. */ if (xvp == nd.ni_dvp) { VI_UNLOCK(xvp); continue; } vholdl(xvp); if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK) != 0) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); vdrop(xvp); goto loop; } VI_LOCK(xvp); if (xvp->v_usecount == 0 && (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) { VI_UNLOCK(xvp); VOP_UNLOCK(xvp, 0); vdrop(xvp); continue; } VI_UNLOCK(xvp); if (snapdebug) vn_printf(xvp, "ffs_snapshot: busy vnode "); if (VOP_GETATTR(xvp, &vat, td->td_ucred) == 0 && vat.va_nlink > 0) { VOP_UNLOCK(xvp, 0); vdrop(xvp); continue; } xp = VTOI(xvp); if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) { VOP_UNLOCK(xvp, 0); vdrop(xvp); continue; } /* * If there is a fragment, clear it here. */ blkno = 0; loc = howmany(xp->i_size, fs->fs_bsize) - 1; if (loc < UFS_NDADDR) { len = fragroundup(fs, blkoff(fs, xp->i_size)); if (len != 0 && len < fs->fs_bsize) { ffs_blkfree(ump, copy_fs, vp, DIP(xp, i_db[loc]), len, xp->i_number, xvp->v_type, NULL); blkno = DIP(xp, i_db[loc]); DIP_SET(xp, i_db[loc], 0); } } snaplistsize += 1; if (I_IS_UFS1(xp)) error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, BLK_NOCOPY, 1); else error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, BLK_NOCOPY, 1); if (blkno) DIP_SET(xp, i_db[loc], blkno); if (!error) error = ffs_freefile(ump, copy_fs, vp, xp->i_number, xp->i_mode, NULL); VOP_UNLOCK(xvp, 0); vdrop(xvp); if (error) { free(copy_fs->fs_csp, M_UFSMNT); free(copy_fs, M_UFSMNT); copy_fs = NULL; MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto out1; } } /* * Erase the journal file from the snapshot. */ if (fs->fs_flags & FS_SUJ) { error = softdep_journal_lookup(mp, &xvp); if (error) { free(copy_fs->fs_csp, M_UFSMNT); free(copy_fs, M_UFSMNT); copy_fs = NULL; goto out1; } xp = VTOI(xvp); if (I_IS_UFS1(xp)) error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, BLK_NOCOPY, 0); else error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, BLK_NOCOPY, 0); vput(xvp); } /* * Acquire a lock on the snapdata structure, creating it if necessary. */ sn = ffs_snapdata_acquire(devvp); /* * Change vnode to use shared snapshot lock instead of the original * private lock. */ vp->v_vnlock = &sn->sn_lock; lockmgr(&vp->v_lock, LK_RELEASE, NULL); xp = TAILQ_FIRST(&sn->sn_head); /* * If this is the first snapshot on this filesystem, then we need * to allocate the space for the list of preallocated snapshot blocks. * This list will be refined below, but this preliminary one will * keep us out of deadlock until the full one is ready. */ if (xp == NULL) { snapblklist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); blkp = &snapblklist[1]; *blkp++ = lblkno(fs, fs->fs_sblockloc); blkno = fragstoblks(fs, fs->fs_csaddr); for (cg = 0; cg < fs->fs_ncg; cg++) { if (fragstoblks(fs, cgtod(fs, cg) > blkno)) break; *blkp++ = fragstoblks(fs, cgtod(fs, cg)); } len = howmany(fs->fs_cssize, fs->fs_bsize); for (loc = 0; loc < len; loc++) *blkp++ = blkno + loc; for (; cg < fs->fs_ncg; cg++) *blkp++ = fragstoblks(fs, cgtod(fs, cg)); snapblklist[0] = blkp - snapblklist; VI_LOCK(devvp); if (sn->sn_blklist != NULL) panic("ffs_snapshot: non-empty list"); sn->sn_blklist = snapblklist; sn->sn_listsize = blkp - snapblklist; VI_UNLOCK(devvp); } /* * Record snapshot inode. Since this is the newest snapshot, * it must be placed at the end of the list. */ VI_LOCK(devvp); fs->fs_snapinum[snaploc] = ip->i_number; if (ip->i_nextsnap.tqe_prev != 0) panic("ffs_snapshot: %ju already on list", (uintmax_t)ip->i_number); TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap); devvp->v_vflag |= VV_COPYONWRITE; VI_UNLOCK(devvp); ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp"); out1: KASSERT((sn != NULL && copy_fs != NULL && error == 0) || (sn == NULL && copy_fs == NULL && error != 0), ("email phk@ and mckusick@")); /* * Resume operation on filesystem. */ vfs_write_resume(vp->v_mount, VR_START_WRITE | VR_NO_SUSPCLR); if (collectsnapstats && starttime.tv_sec > 0) { nanotime(&endtime); timespecsub(&endtime, &starttime); printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n", vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec, endtime.tv_nsec / 1000000, redo, fs->fs_ncg); } if (copy_fs == NULL) goto out; /* * Copy allocation information from all the snapshots in * this snapshot and then expunge them from its view. */ TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) { if (xp == ip) break; if (I_IS_UFS1(xp)) error = expunge_ufs1(vp, xp, fs, snapacct_ufs1, BLK_SNAP, 0); else error = expunge_ufs2(vp, xp, fs, snapacct_ufs2, BLK_SNAP, 0); if (error == 0 && xp->i_effnlink == 0) { error = ffs_freefile(ump, copy_fs, vp, xp->i_number, xp->i_mode, NULL); } if (error) { fs->fs_snapinum[snaploc] = 0; goto done; } } /* * Allocate space for the full list of preallocated snapshot blocks. */ snapblklist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); ip->i_snapblklist = &snapblklist[1]; /* * Expunge the blocks used by the snapshots from the set of * blocks marked as used in the snapshot bitmaps. Also, collect * the list of allocated blocks in i_snapblklist. */ if (I_IS_UFS1(ip)) error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP, 0); else error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP, 0); if (error) { fs->fs_snapinum[snaploc] = 0; free(snapblklist, M_UFSMNT); goto done; } if (snaplistsize < ip->i_snapblklist - snapblklist) panic("ffs_snapshot: list too small"); snaplistsize = ip->i_snapblklist - snapblklist; snapblklist[0] = snaplistsize; ip->i_snapblklist = 0; /* * Write out the list of allocated blocks to the end of the snapshot. */ auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = (void *)snapblklist; aiov.iov_len = snaplistsize * sizeof(daddr_t); auio.uio_resid = aiov.iov_len; auio.uio_offset = ip->i_size; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = td; if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { fs->fs_snapinum[snaploc] = 0; free(snapblklist, M_UFSMNT); goto done; } /* * Write the superblock and its summary information * to the snapshot. */ blkno = fragstoblks(fs, fs->fs_csaddr); len = howmany(fs->fs_cssize, fs->fs_bsize); space = copy_fs->fs_csp; for (loc = 0; loc < len; loc++) { error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp); if (error) { brelse(nbp); fs->fs_snapinum[snaploc] = 0; free(snapblklist, M_UFSMNT); goto done; } bcopy(space, nbp->b_data, fs->fs_bsize); space = (char *)space + fs->fs_bsize; bawrite(nbp); } error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize, KERNCRED, &nbp); if (error) { brelse(nbp); } else { loc = blkoff(fs, fs->fs_sblockloc); bcopy((char *)copy_fs, &nbp->b_data[loc], (u_int)fs->fs_sbsize); bawrite(nbp); } /* * As this is the newest list, it is the most inclusive, so * should replace the previous list. */ VI_LOCK(devvp); space = sn->sn_blklist; sn->sn_blklist = snapblklist; sn->sn_listsize = snaplistsize; VI_UNLOCK(devvp); if (space != NULL) free(space, M_UFSMNT); /* * Preallocate all the direct blocks in the snapshot inode so * that we never have to write the inode itself to commit an * update to the contents of the snapshot. Note that once * created, the size of the snapshot will never change, so * there will never be a need to write the inode except to * update the non-integrity-critical time fields and * allocated-block count. */ for (blockno = 0; blockno < UFS_NDADDR; blockno++) { if (DIP(ip, i_db[blockno]) != 0) continue; error = UFS_BALLOC(vp, lblktosize(fs, blockno), fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp); if (error) break; error = readblock(vp, bp, blockno); bawrite(bp); if (error != 0) break; } done: free(copy_fs->fs_csp, M_UFSMNT); free(copy_fs, M_UFSMNT); copy_fs = NULL; out: NDFREE(&nd, NDF_ONLY_PNBUF); if (saved_nice > 0) { struct proc *p; p = td->td_proc; PROC_LOCK(p); sched_nice(td->td_proc, saved_nice); PROC_UNLOCK(td->td_proc); } UFS_LOCK(ump); if (fs->fs_active != 0) { free(fs->fs_active, M_DEVBUF); fs->fs_active = 0; } UFS_UNLOCK(ump); MNT_ILOCK(mp); mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA); MNT_IUNLOCK(mp); if (error) (void) ffs_truncate(vp, (off_t)0, 0, NOCRED); (void) ffs_syncvnode(vp, MNT_WAIT, 0); if (error) vput(vp); else VOP_UNLOCK(vp, 0); vrele(nd.ni_dvp); vn_finished_write(wrtmp); process_deferred_inactive(mp); return (error); } /* * Copy a cylinder group map. All the unallocated blocks are marked * BLK_NOCOPY so that the snapshot knows that it need not copy them * if they are later written. If passno is one, then this is a first * pass, so only setting needs to be done. If passno is 2, then this * is a revision to a previous pass which must be undone as the * replacement pass is done. */ static int cgaccount(cg, vp, nbp, passno) int cg; struct vnode *vp; struct buf *nbp; int passno; { struct buf *bp, *ibp; struct inode *ip; struct cg *cgp; struct fs *fs; ufs2_daddr_t base, numblks; int error, len, loc, indiroff; ip = VTOI(vp); fs = ITOFS(ip); if ((error = ffs_getcg(fs, ITODEVVP(ip), cg, &bp, &cgp)) != 0) return (error); UFS_LOCK(ITOUMP(ip)); ACTIVESET(fs, cg); /* * Recomputation of summary information might not have been performed * at mount time. Sync up summary information for current cylinder * group while data is in memory to ensure that result of background * fsck is slightly more consistent. */ fs->fs_cs(fs, cg) = cgp->cg_cs; UFS_UNLOCK(ITOUMP(ip)); bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize); if (fs->fs_cgsize < fs->fs_bsize) bzero(&nbp->b_data[fs->fs_cgsize], fs->fs_bsize - fs->fs_cgsize); cgp = (struct cg *)nbp->b_data; bqrelse(bp); if (passno == 2) nbp->b_flags |= B_VALIDSUSPWRT; numblks = howmany(fs->fs_size, fs->fs_frag); len = howmany(fs->fs_fpg, fs->fs_frag); base = cgbase(fs, cg) / fs->fs_frag; if (base + len >= numblks) len = numblks - base - 1; loc = 0; if (base < UFS_NDADDR) { for ( ; loc < UFS_NDADDR; loc++) { if (ffs_isblock(fs, cg_blksfree(cgp), loc)) DIP_SET(ip, i_db[loc], BLK_NOCOPY); else if (passno == 2 && DIP(ip, i_db[loc])== BLK_NOCOPY) DIP_SET(ip, i_db[loc], 0); else if (passno == 1 && DIP(ip, i_db[loc])== BLK_NOCOPY) panic("ffs_snapshot: lost direct block"); } } error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) { - return (error); + goto out; } indiroff = (base + loc - UFS_NDADDR) % NINDIR(fs); for ( ; loc < len; loc++, indiroff++) { if (indiroff >= NINDIR(fs)) { if (passno == 2) ibp->b_flags |= B_VALIDSUSPWRT; bawrite(ibp); error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) { - return (error); + goto out; } indiroff = 0; } if (I_IS_UFS1(ip)) { if (ffs_isblock(fs, cg_blksfree(cgp), loc)) ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0; else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) panic("ffs_snapshot: lost indirect block"); continue; } if (ffs_isblock(fs, cg_blksfree(cgp), loc)) ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; else if (passno == 2 && ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0; else if (passno == 1 && ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) panic("ffs_snapshot: lost indirect block"); } if (passno == 2) ibp->b_flags |= B_VALIDSUSPWRT; bdwrite(ibp); - return (0); +out: + /* + * We have to calculate the crc32c here rather than just setting the + * BX_CYLGRP b_xflags because the allocation of the block for the + * the cylinder group map will always be a full size block (fs_bsize) + * even though the cylinder group may be smaller (fs_cgsize). The + * crc32c must be computed only over fs_cgsize whereas the BX_CYLGRP + * flag causes it to be computed over the size of the buffer. + */ + if ((fs->fs_metackhash & CK_CYLGRP) != 0) { + ((struct cg *)nbp->b_data)->cg_ckhash = 0; + ((struct cg *)nbp->b_data)->cg_ckhash = + calculate_crc32c(~0L, nbp->b_data, fs->fs_cgsize); + } + return (error); } /* * Before expunging a snapshot inode, note all the * blocks that it claims with BLK_SNAP so that fsck will * be able to account for those blocks properly and so * that this snapshot knows that it need not copy them * if the other snapshot holding them is freed. This code * is reproduced once each for UFS1 and UFS2. */ static int expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype, clearmode) struct vnode *snapvp; struct inode *cancelip; struct fs *fs; int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); int expungetype; int clearmode; { int i, error, indiroff; ufs_lbn_t lbn, rlbn; ufs2_daddr_t len, blkno, numblks, blksperindir; struct ufs1_dinode *dip; struct thread *td = curthread; struct buf *bp; /* * Prepare to expunge the inode. If its inode block has not * yet been copied, then allocate and fill the copy. */ lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); blkno = 0; if (lbn < UFS_NDADDR) { blkno = VTOI(snapvp)->i_din1->di_db[lbn]; } else { if (DOINGSOFTDEP(snapvp)) softdep_prealloc(snapvp, MNT_WAIT); td->td_pflags |= TDP_COWINPROGRESS; error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &bp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) return (error); indiroff = (lbn - UFS_NDADDR) % NINDIR(fs); blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff]; bqrelse(bp); } if (blkno != 0) { if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp))) return (error); } else { error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &bp); if (error) return (error); if ((error = readblock(snapvp, bp, lbn)) != 0) return (error); } /* * Set a snapshot inode to be a zero length file, regular files * or unlinked snapshots to be completely unallocated. */ dip = (struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, cancelip->i_number); if (clearmode || cancelip->i_effnlink == 0) dip->di_mode = 0; dip->di_size = 0; dip->di_blocks = 0; dip->di_flags &= ~SF_SNAPSHOT; bzero(&dip->di_db[0], (UFS_NDADDR + UFS_NIADDR) * sizeof(ufs1_daddr_t)); bdwrite(bp); /* * Now go through and expunge all the blocks in the file * using the function requested. */ numblks = howmany(cancelip->i_size, fs->fs_bsize); if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0], &cancelip->i_din1->di_db[UFS_NDADDR], fs, 0, expungetype))) return (error); if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0], &cancelip->i_din1->di_ib[UFS_NIADDR], fs, -1, expungetype))) return (error); blksperindir = 1; lbn = -UFS_NDADDR; len = numblks - UFS_NDADDR; rlbn = UFS_NDADDR; for (i = 0; len > 0 && i < UFS_NIADDR; i++) { error = indiracct_ufs1(snapvp, ITOV(cancelip), i, cancelip->i_din1->di_ib[i], lbn, rlbn, len, blksperindir, fs, acctfunc, expungetype); if (error) return (error); blksperindir *= NINDIR(fs); lbn -= blksperindir + 1; len -= blksperindir; rlbn += blksperindir; } return (0); } /* * Descend an indirect block chain for vnode cancelvp accounting for all * its indirect blocks in snapvp. */ static int indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, blksperindir, fs, acctfunc, expungetype) struct vnode *snapvp; struct vnode *cancelvp; int level; ufs1_daddr_t blkno; ufs_lbn_t lbn; ufs_lbn_t rlbn; ufs_lbn_t remblks; ufs_lbn_t blksperindir; struct fs *fs; int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); int expungetype; { int error, num, i; ufs_lbn_t subblksperindir; struct indir indirs[UFS_NIADDR + 2]; ufs1_daddr_t last, *bap; struct buf *bp; if (blkno == 0) { if (expungetype == BLK_NOCOPY) return (0); panic("indiracct_ufs1: missing indir"); } if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) return (error); if (lbn != indirs[num - 1 - level].in_lbn || num < 2) panic("indiracct_ufs1: botched params"); /* * We have to expand bread here since it will deadlock looking * up the block number for any blocks that are not in the cache. */ bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0); bp->b_blkno = fsbtodb(fs, blkno); if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) { brelse(bp); return (error); } /* * Account for the block pointers in this indirect block. */ last = howmany(remblks, blksperindir); if (last > NINDIR(fs)) last = NINDIR(fs); bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK); bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); bqrelse(bp); error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, level == 0 ? rlbn : -1, expungetype); if (error || level == 0) goto out; /* * Account for the block pointers in each of the indirect blocks * in the levels below us. */ subblksperindir = blksperindir / NINDIR(fs); for (lbn++, level--, i = 0; i < last; i++) { error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn, rlbn, remblks, subblksperindir, fs, acctfunc, expungetype); if (error) goto out; rlbn += blksperindir; lbn -= blksperindir; remblks -= blksperindir; } out: free(bap, M_DEVBUF); return (error); } /* * Do both snap accounting and map accounting. */ static int fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype) struct vnode *vp; ufs1_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int exptype; /* BLK_SNAP or BLK_NOCOPY */ { int error; if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype))) return (error); return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)); } /* * Identify a set of blocks allocated in a snapshot inode. */ static int snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) struct vnode *vp; ufs1_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int expungetype; /* BLK_SNAP or BLK_NOCOPY */ { struct inode *ip = VTOI(vp); ufs1_daddr_t blkno, *blkp; ufs_lbn_t lbn; struct buf *ibp; int error; for ( ; oldblkp < lastblkp; oldblkp++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) continue; lbn = fragstoblks(fs, blkno); if (lbn < UFS_NDADDR) { blkp = &ip->i_din1->di_db[lbn]; ip->i_flag |= IN_CHANGE | IN_UPDATE; } else { error = ffs_balloc_ufs1(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) return (error); blkp = &((ufs1_daddr_t *)(ibp->b_data)) [(lbn - UFS_NDADDR) % NINDIR(fs)]; } /* * If we are expunging a snapshot vnode and we * find a block marked BLK_NOCOPY, then it is * one that has been allocated to this snapshot after * we took our current snapshot and can be ignored. */ if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) { if (lbn >= UFS_NDADDR) brelse(ibp); } else { if (*blkp != 0) panic("snapacct_ufs1: bad block"); *blkp = expungetype; if (lbn >= UFS_NDADDR) bdwrite(ibp); } } return (0); } /* * Account for a set of blocks allocated in a snapshot inode. */ static int mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) struct vnode *vp; ufs1_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int expungetype; { ufs1_daddr_t blkno; struct inode *ip; ino_t inum; int acctit; ip = VTOI(vp); inum = ip->i_number; if (lblkno == -1) acctit = 0; else acctit = 1; for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY) continue; if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) *ip->i_snapblklist++ = lblkno; if (blkno == BLK_SNAP) blkno = blkstofrags(fs, lblkno); ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum, vp->v_type, NULL); } return (0); } /* * Before expunging a snapshot inode, note all the * blocks that it claims with BLK_SNAP so that fsck will * be able to account for those blocks properly and so * that this snapshot knows that it need not copy them * if the other snapshot holding them is freed. This code * is reproduced once each for UFS1 and UFS2. */ static int expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype, clearmode) struct vnode *snapvp; struct inode *cancelip; struct fs *fs; int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); int expungetype; int clearmode; { int i, error, indiroff; ufs_lbn_t lbn, rlbn; ufs2_daddr_t len, blkno, numblks, blksperindir; struct ufs2_dinode *dip; struct thread *td = curthread; struct buf *bp; /* * Prepare to expunge the inode. If its inode block has not * yet been copied, then allocate and fill the copy. */ lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); blkno = 0; if (lbn < UFS_NDADDR) { blkno = VTOI(snapvp)->i_din2->di_db[lbn]; } else { if (DOINGSOFTDEP(snapvp)) softdep_prealloc(snapvp, MNT_WAIT); td->td_pflags |= TDP_COWINPROGRESS; error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &bp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) return (error); indiroff = (lbn - UFS_NDADDR) % NINDIR(fs); blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff]; bqrelse(bp); } if (blkno != 0) { if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp))) return (error); } else { error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &bp); if (error) return (error); if ((error = readblock(snapvp, bp, lbn)) != 0) return (error); } /* * Set a snapshot inode to be a zero length file, regular files * to be completely unallocated. */ dip = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, cancelip->i_number); if (clearmode || cancelip->i_effnlink == 0) dip->di_mode = 0; dip->di_size = 0; dip->di_blocks = 0; dip->di_flags &= ~SF_SNAPSHOT; bzero(&dip->di_db[0], (UFS_NDADDR + UFS_NIADDR) * sizeof(ufs2_daddr_t)); bdwrite(bp); /* * Now go through and expunge all the blocks in the file * using the function requested. */ numblks = howmany(cancelip->i_size, fs->fs_bsize); if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0], &cancelip->i_din2->di_db[UFS_NDADDR], fs, 0, expungetype))) return (error); if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0], &cancelip->i_din2->di_ib[UFS_NIADDR], fs, -1, expungetype))) return (error); blksperindir = 1; lbn = -UFS_NDADDR; len = numblks - UFS_NDADDR; rlbn = UFS_NDADDR; for (i = 0; len > 0 && i < UFS_NIADDR; i++) { error = indiracct_ufs2(snapvp, ITOV(cancelip), i, cancelip->i_din2->di_ib[i], lbn, rlbn, len, blksperindir, fs, acctfunc, expungetype); if (error) return (error); blksperindir *= NINDIR(fs); lbn -= blksperindir + 1; len -= blksperindir; rlbn += blksperindir; } return (0); } /* * Descend an indirect block chain for vnode cancelvp accounting for all * its indirect blocks in snapvp. */ static int indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, blksperindir, fs, acctfunc, expungetype) struct vnode *snapvp; struct vnode *cancelvp; int level; ufs2_daddr_t blkno; ufs_lbn_t lbn; ufs_lbn_t rlbn; ufs_lbn_t remblks; ufs_lbn_t blksperindir; struct fs *fs; int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); int expungetype; { int error, num, i; ufs_lbn_t subblksperindir; struct indir indirs[UFS_NIADDR + 2]; ufs2_daddr_t last, *bap; struct buf *bp; if (blkno == 0) { if (expungetype == BLK_NOCOPY) return (0); panic("indiracct_ufs2: missing indir"); } if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) return (error); if (lbn != indirs[num - 1 - level].in_lbn || num < 2) panic("indiracct_ufs2: botched params"); /* * We have to expand bread here since it will deadlock looking * up the block number for any blocks that are not in the cache. */ bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0); bp->b_blkno = fsbtodb(fs, blkno); if ((bp->b_flags & B_CACHE) == 0 && (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) { brelse(bp); return (error); } /* * Account for the block pointers in this indirect block. */ last = howmany(remblks, blksperindir); if (last > NINDIR(fs)) last = NINDIR(fs); bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK); bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); bqrelse(bp); error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, level == 0 ? rlbn : -1, expungetype); if (error || level == 0) goto out; /* * Account for the block pointers in each of the indirect blocks * in the levels below us. */ subblksperindir = blksperindir / NINDIR(fs); for (lbn++, level--, i = 0; i < last; i++) { error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn, rlbn, remblks, subblksperindir, fs, acctfunc, expungetype); if (error) goto out; rlbn += blksperindir; lbn -= blksperindir; remblks -= blksperindir; } out: free(bap, M_DEVBUF); return (error); } /* * Do both snap accounting and map accounting. */ static int fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype) struct vnode *vp; ufs2_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int exptype; /* BLK_SNAP or BLK_NOCOPY */ { int error; if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype))) return (error); return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)); } /* * Identify a set of blocks allocated in a snapshot inode. */ static int snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype) struct vnode *vp; ufs2_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int expungetype; /* BLK_SNAP or BLK_NOCOPY */ { struct inode *ip = VTOI(vp); ufs2_daddr_t blkno, *blkp; ufs_lbn_t lbn; struct buf *ibp; int error; for ( ; oldblkp < lastblkp; oldblkp++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) continue; lbn = fragstoblks(fs, blkno); if (lbn < UFS_NDADDR) { blkp = &ip->i_din2->di_db[lbn]; ip->i_flag |= IN_CHANGE | IN_UPDATE; } else { error = ffs_balloc_ufs2(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) return (error); blkp = &((ufs2_daddr_t *)(ibp->b_data)) [(lbn - UFS_NDADDR) % NINDIR(fs)]; } /* * If we are expunging a snapshot vnode and we * find a block marked BLK_NOCOPY, then it is * one that has been allocated to this snapshot after * we took our current snapshot and can be ignored. */ if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) { if (lbn >= UFS_NDADDR) brelse(ibp); } else { if (*blkp != 0) panic("snapacct_ufs2: bad block"); *blkp = expungetype; if (lbn >= UFS_NDADDR) bdwrite(ibp); } } return (0); } /* * Account for a set of blocks allocated in a snapshot inode. */ static int mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype) struct vnode *vp; ufs2_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int expungetype; { ufs2_daddr_t blkno; struct inode *ip; ino_t inum; int acctit; ip = VTOI(vp); inum = ip->i_number; if (lblkno == -1) acctit = 0; else acctit = 1; for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY) continue; if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) *ip->i_snapblklist++ = lblkno; if (blkno == BLK_SNAP) blkno = blkstofrags(fs, lblkno); ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum, vp->v_type, NULL); } return (0); } /* * Decrement extra reference on snapshot when last name is removed. * It will not be freed until the last open reference goes away. */ void ffs_snapgone(ip) struct inode *ip; { struct inode *xp; struct fs *fs; int snaploc; struct snapdata *sn; struct ufsmount *ump; /* * Find snapshot in incore list. */ xp = NULL; sn = ITODEVVP(ip)->v_rdev->si_snapdata; if (sn != NULL) TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) if (xp == ip) break; if (xp != NULL) vrele(ITOV(ip)); else if (snapdebug) printf("ffs_snapgone: lost snapshot vnode %ju\n", (uintmax_t)ip->i_number); /* * Delete snapshot inode from superblock. Keep list dense. */ ump = ITOUMP(ip); fs = ump->um_fs; UFS_LOCK(ump); for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) if (fs->fs_snapinum[snaploc] == ip->i_number) break; if (snaploc < FSMAXSNAP) { for (snaploc++; snaploc < FSMAXSNAP; snaploc++) { if (fs->fs_snapinum[snaploc] == 0) break; fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc]; } fs->fs_snapinum[snaploc - 1] = 0; } UFS_UNLOCK(ump); } /* * Prepare a snapshot file for being removed. */ void ffs_snapremove(vp) struct vnode *vp; { struct inode *ip; struct vnode *devvp; struct buf *ibp; struct fs *fs; ufs2_daddr_t numblks, blkno, dblk; int error, i, last, loc; struct snapdata *sn; ip = VTOI(vp); fs = ITOFS(ip); devvp = ITODEVVP(ip); /* * If active, delete from incore list (this snapshot may * already have been in the process of being deleted, so * would not have been active). * * Clear copy-on-write flag if last snapshot. */ VI_LOCK(devvp); if (ip->i_nextsnap.tqe_prev != 0) { sn = devvp->v_rdev->si_snapdata; TAILQ_REMOVE(&sn->sn_head, ip, i_nextsnap); ip->i_nextsnap.tqe_prev = 0; VI_UNLOCK(devvp); lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL); for (i = 0; i < sn->sn_lock.lk_recurse; i++) lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL); KASSERT(vp->v_vnlock == &sn->sn_lock, ("ffs_snapremove: lost lock mutation")); vp->v_vnlock = &vp->v_lock; VI_LOCK(devvp); while (sn->sn_lock.lk_recurse > 0) lockmgr(&sn->sn_lock, LK_RELEASE, NULL); lockmgr(&sn->sn_lock, LK_RELEASE, NULL); try_free_snapdata(devvp); } else VI_UNLOCK(devvp); /* * Clear all BLK_NOCOPY fields. Pass any block claims to other * snapshots that want them (see ffs_snapblkfree below). */ for (blkno = 1; blkno < UFS_NDADDR; blkno++) { dblk = DIP(ip, i_db[blkno]); if (dblk == 0) continue; if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) DIP_SET(ip, i_db[blkno], 0); else if ((dblk == blkstofrags(fs, blkno) && ffs_snapblkfree(fs, ITODEVVP(ip), dblk, fs->fs_bsize, ip->i_number, vp->v_type, NULL))) { DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - btodb(fs->fs_bsize)); DIP_SET(ip, i_db[blkno], 0); } } numblks = howmany(ip->i_size, fs->fs_bsize); for (blkno = UFS_NDADDR; blkno < numblks; blkno += NINDIR(fs)) { error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) continue; if (fs->fs_size - blkno > NINDIR(fs)) last = NINDIR(fs); else last = fs->fs_size - blkno; for (loc = 0; loc < last; loc++) { if (I_IS_UFS1(ip)) { dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc]; if (dblk == 0) continue; if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) ((ufs1_daddr_t *)(ibp->b_data))[loc]= 0; else if ((dblk == blkstofrags(fs, blkno) && ffs_snapblkfree(fs, ITODEVVP(ip), dblk, fs->fs_bsize, ip->i_number, vp->v_type, NULL))) { ip->i_din1->di_blocks -= btodb(fs->fs_bsize); ((ufs1_daddr_t *)(ibp->b_data))[loc]= 0; } continue; } dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc]; if (dblk == 0) continue; if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) ((ufs2_daddr_t *)(ibp->b_data))[loc] = 0; else if ((dblk == blkstofrags(fs, blkno) && ffs_snapblkfree(fs, ITODEVVP(ip), dblk, fs->fs_bsize, ip->i_number, vp->v_type, NULL))) { ip->i_din2->di_blocks -= btodb(fs->fs_bsize); ((ufs2_daddr_t *)(ibp->b_data))[loc] = 0; } } bawrite(ibp); } /* * Clear snapshot flag and drop reference. */ ip->i_flags &= ~SF_SNAPSHOT; DIP_SET(ip, i_flags, ip->i_flags); ip->i_flag |= IN_CHANGE | IN_UPDATE; /* * The dirtied indirects must be written out before * softdep_setup_freeblocks() is called. Otherwise indir_trunc() * may find indirect pointers using the magic BLK_* values. */ if (DOINGSOFTDEP(vp)) ffs_syncvnode(vp, MNT_WAIT, 0); #ifdef QUOTA /* * Reenable disk quotas for ex-snapshot file. */ if (!getinoquota(ip)) (void) chkdq(ip, DIP(ip, i_blocks), KERNCRED, FORCE); #endif } /* * Notification that a block is being freed. Return zero if the free * should be allowed to proceed. Return non-zero if the snapshot file * wants to claim the block. The block will be claimed if it is an * uncopied part of one of the snapshots. It will be freed if it is * either a BLK_NOCOPY or has already been copied in all of the snapshots. * If a fragment is being freed, then all snapshots that care about * it must make a copy since a snapshot file can only claim full sized * blocks. Note that if more than one snapshot file maps the block, * we can pick one at random to claim it. Since none of the snapshots * can change, we are assurred that they will all see the same unmodified * image. When deleting a snapshot file (see ffs_snapremove above), we * must push any of these claimed blocks to one of the other snapshots * that maps it. These claimed blocks are easily identified as they will * have a block number equal to their logical block number within the * snapshot. A copied block can never have this property because they * must always have been allocated from a BLK_NOCOPY location. */ int ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, wkhd) struct fs *fs; struct vnode *devvp; ufs2_daddr_t bno; long size; ino_t inum; enum vtype vtype; struct workhead *wkhd; { struct buf *ibp, *cbp, *savedcbp = NULL; struct thread *td = curthread; struct inode *ip; struct vnode *vp = NULL; ufs_lbn_t lbn; ufs2_daddr_t blkno; int indiroff = 0, error = 0, claimedblk = 0; struct snapdata *sn; lbn = fragstoblks(fs, bno); retry: VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn == NULL) { VI_UNLOCK(devvp); return (0); } if (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, VI_MTX(devvp)) != 0) goto retry; TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) { vp = ITOV(ip); if (DOINGSOFTDEP(vp)) softdep_prealloc(vp, MNT_WAIT); /* * Lookup block being written. */ if (lbn < UFS_NDADDR) { blkno = DIP(ip, i_db[lbn]); } else { td->td_pflags |= TDP_COWINPROGRESS; error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) break; indiroff = (lbn - UFS_NDADDR) % NINDIR(fs); if (I_IS_UFS1(ip)) blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff]; else blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff]; } /* * Check to see if block needs to be copied. */ if (blkno == 0) { /* * A block that we map is being freed. If it has not * been claimed yet, we will claim or copy it (below). */ claimedblk = 1; } else if (blkno == BLK_SNAP) { /* * No previous snapshot claimed the block, * so it will be freed and become a BLK_NOCOPY * (don't care) for us. */ if (claimedblk) panic("snapblkfree: inconsistent block type"); if (lbn < UFS_NDADDR) { DIP_SET(ip, i_db[lbn], BLK_NOCOPY); ip->i_flag |= IN_CHANGE | IN_UPDATE; } else if (I_IS_UFS1(ip)) { ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; bdwrite(ibp); } else { ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; bdwrite(ibp); } continue; } else /* BLK_NOCOPY or default */ { /* * If the snapshot has already copied the block * (default), or does not care about the block, * it is not needed. */ if (lbn >= UFS_NDADDR) bqrelse(ibp); continue; } /* * If this is a full size block, we will just grab it * and assign it to the snapshot inode. Otherwise we * will proceed to copy it. See explanation for this * routine as to why only a single snapshot needs to * claim this block. */ if (size == fs->fs_bsize) { #ifdef DEBUG if (snapdebug) printf("%s %ju lbn %jd from inum %ju\n", "Grabonremove: snapino", (uintmax_t)ip->i_number, (intmax_t)lbn, (uintmax_t)inum); #endif /* * If journaling is tracking this write we must add * the work to the inode or indirect being written. */ if (wkhd != NULL) { if (lbn < UFS_NDADDR) softdep_inode_append(ip, curthread->td_ucred, wkhd); else softdep_buf_append(ibp, wkhd); } if (lbn < UFS_NDADDR) { DIP_SET(ip, i_db[lbn], bno); } else if (I_IS_UFS1(ip)) { ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno; bdwrite(ibp); } else { ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno; bdwrite(ibp); } DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(size)); ip->i_flag |= IN_CHANGE | IN_UPDATE; lockmgr(vp->v_vnlock, LK_RELEASE, NULL); return (1); } if (lbn >= UFS_NDADDR) bqrelse(ibp); /* * Allocate the block into which to do the copy. Note that this * allocation will never require any additional allocations for * the snapshot inode. */ td->td_pflags |= TDP_COWINPROGRESS; error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &cbp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) break; #ifdef DEBUG if (snapdebug) printf("%s%ju lbn %jd %s %ju size %ld to blkno %jd\n", "Copyonremove: snapino ", (uintmax_t)ip->i_number, (intmax_t)lbn, "for inum", (uintmax_t)inum, size, (intmax_t)cbp->b_blkno); #endif /* * If we have already read the old block contents, then * simply copy them to the new block. Note that we need * to synchronously write snapshots that have not been * unlinked, and hence will be visible after a crash, * to ensure their integrity. At a minimum we ensure the * integrity of the filesystem metadata, but use the * dopersistence sysctl-setable flag to decide on the * persistence needed for file content data. */ if (savedcbp != NULL) { bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); bawrite(cbp); if ((vtype == VDIR || dopersistence) && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); continue; } /* * Otherwise, read the old block contents into the buffer. */ if ((error = readblock(vp, cbp, lbn)) != 0) { bzero(cbp->b_data, fs->fs_bsize); bawrite(cbp); if ((vtype == VDIR || dopersistence) && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); break; } savedcbp = cbp; } /* * Note that we need to synchronously write snapshots that * have not been unlinked, and hence will be visible after * a crash, to ensure their integrity. At a minimum we * ensure the integrity of the filesystem metadata, but * use the dopersistence sysctl-setable flag to decide on * the persistence needed for file content data. */ if (savedcbp) { vp = savedcbp->b_vp; bawrite(savedcbp); if ((vtype == VDIR || dopersistence) && VTOI(vp)->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); } /* * If we have been unable to allocate a block in which to do * the copy, then return non-zero so that the fragment will * not be freed. Although space will be lost, the snapshot * will stay consistent. */ if (error != 0 && wkhd != NULL) softdep_freework(wkhd); lockmgr(&sn->sn_lock, LK_RELEASE, NULL); return (error); } /* * Associate snapshot files when mounting. */ void ffs_snapshot_mount(mp) struct mount *mp; { struct ufsmount *ump = VFSTOUFS(mp); struct vnode *devvp = ump->um_devvp; struct fs *fs = ump->um_fs; struct thread *td = curthread; struct snapdata *sn; struct vnode *vp; struct vnode *lastvp; struct inode *ip; struct uio auio; struct iovec aiov; void *snapblklist; char *reason; daddr_t snaplistsize; int error, snaploc, loc; /* * XXX The following needs to be set before ffs_truncate or * VOP_READ can be called. */ mp->mnt_stat.f_iosize = fs->fs_bsize; /* * Process each snapshot listed in the superblock. */ vp = NULL; lastvp = NULL; sn = NULL; for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) { if (fs->fs_snapinum[snaploc] == 0) break; if ((error = ffs_vget(mp, fs->fs_snapinum[snaploc], LK_EXCLUSIVE, &vp)) != 0){ printf("ffs_snapshot_mount: vget failed %d\n", error); continue; } ip = VTOI(vp); if (!IS_SNAPSHOT(ip) || ip->i_size == lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) { if (!IS_SNAPSHOT(ip)) { reason = "non-snapshot"; } else { reason = "old format snapshot"; (void)ffs_truncate(vp, (off_t)0, 0, NOCRED); (void)ffs_syncvnode(vp, MNT_WAIT, 0); } printf("ffs_snapshot_mount: %s inode %d\n", reason, fs->fs_snapinum[snaploc]); vput(vp); vp = NULL; for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) { if (fs->fs_snapinum[loc] == 0) break; fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc]; } fs->fs_snapinum[loc - 1] = 0; snaploc--; continue; } /* * Acquire a lock on the snapdata structure, creating it if * necessary. */ sn = ffs_snapdata_acquire(devvp); /* * Change vnode to use shared snapshot lock instead of the * original private lock. */ vp->v_vnlock = &sn->sn_lock; lockmgr(&vp->v_lock, LK_RELEASE, NULL); /* * Link it onto the active snapshot list. */ VI_LOCK(devvp); if (ip->i_nextsnap.tqe_prev != 0) panic("ffs_snapshot_mount: %ju already on list", (uintmax_t)ip->i_number); else TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap); vp->v_vflag |= VV_SYSTEM; VI_UNLOCK(devvp); VOP_UNLOCK(vp, 0); lastvp = vp; } vp = lastvp; /* * No usable snapshots found. */ if (sn == NULL || vp == NULL) return; /* * Allocate the space for the block hints list. We always want to * use the list from the newest snapshot. */ auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = (void *)&snaplistsize; aiov.iov_len = sizeof(snaplistsize); auio.uio_resid = aiov.iov_len; auio.uio_offset = lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)); auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { printf("ffs_snapshot_mount: read_1 failed %d\n", error); VOP_UNLOCK(vp, 0); return; } snapblklist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); auio.uio_iovcnt = 1; aiov.iov_base = snapblklist; aiov.iov_len = snaplistsize * sizeof (daddr_t); auio.uio_resid = aiov.iov_len; auio.uio_offset -= sizeof(snaplistsize); if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { printf("ffs_snapshot_mount: read_2 failed %d\n", error); VOP_UNLOCK(vp, 0); free(snapblklist, M_UFSMNT); return; } VOP_UNLOCK(vp, 0); VI_LOCK(devvp); ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_mount"); sn->sn_listsize = snaplistsize; sn->sn_blklist = (daddr_t *)snapblklist; devvp->v_vflag |= VV_COPYONWRITE; VI_UNLOCK(devvp); } /* * Disassociate snapshot files when unmounting. */ void ffs_snapshot_unmount(mp) struct mount *mp; { struct vnode *devvp = VFSTOUFS(mp)->um_devvp; struct snapdata *sn; struct inode *xp; struct vnode *vp; VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; while (sn != NULL && (xp = TAILQ_FIRST(&sn->sn_head)) != NULL) { vp = ITOV(xp); TAILQ_REMOVE(&sn->sn_head, xp, i_nextsnap); xp->i_nextsnap.tqe_prev = 0; lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE, VI_MTX(devvp)); lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL); KASSERT(vp->v_vnlock == &sn->sn_lock, ("ffs_snapshot_unmount: lost lock mutation")); vp->v_vnlock = &vp->v_lock; lockmgr(&vp->v_lock, LK_RELEASE, NULL); lockmgr(&sn->sn_lock, LK_RELEASE, NULL); if (xp->i_effnlink > 0) vrele(vp); VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; } try_free_snapdata(devvp); ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount"); } /* * Check the buffer block to be belong to device buffer that shall be * locked after snaplk. devvp shall be locked on entry, and will be * leaved locked upon exit. */ static int ffs_bp_snapblk(devvp, bp) struct vnode *devvp; struct buf *bp; { struct snapdata *sn; struct fs *fs; ufs2_daddr_t lbn, *snapblklist; int lower, upper, mid; ASSERT_VI_LOCKED(devvp, "ffs_bp_snapblk"); KASSERT(devvp->v_type == VCHR, ("Not a device %p", devvp)); sn = devvp->v_rdev->si_snapdata; if (sn == NULL || TAILQ_FIRST(&sn->sn_head) == NULL) return (0); fs = ITOFS(TAILQ_FIRST(&sn->sn_head)); lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); snapblklist = sn->sn_blklist; upper = sn->sn_listsize - 1; lower = 1; while (lower <= upper) { mid = (lower + upper) / 2; if (snapblklist[mid] == lbn) break; if (snapblklist[mid] < lbn) lower = mid + 1; else upper = mid - 1; } if (lower <= upper) return (1); return (0); } void ffs_bdflush(bo, bp) struct bufobj *bo; struct buf *bp; { struct thread *td; struct vnode *vp, *devvp; struct buf *nbp; int bp_bdskip; if (bo->bo_dirty.bv_cnt <= dirtybufthresh) return; td = curthread; vp = bp->b_vp; devvp = bo2vnode(bo); KASSERT(vp == devvp, ("devvp != vp %p %p", bo, bp)); VI_LOCK(devvp); bp_bdskip = ffs_bp_snapblk(devvp, bp); if (bp_bdskip) bdwriteskip++; VI_UNLOCK(devvp); if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10 && !bp_bdskip) { (void) VOP_FSYNC(vp, MNT_NOWAIT, td); altbufferflushes++; } else { BO_LOCK(bo); /* * Try to find a buffer to flush. */ TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) { if ((nbp->b_vflags & BV_BKGRDINPROG) || BUF_LOCK(nbp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) continue; if (bp == nbp) panic("bdwrite: found ourselves"); BO_UNLOCK(bo); /* * Don't countdeps with the bo lock * held. */ if (buf_countdeps(nbp, 0)) { BO_LOCK(bo); BUF_UNLOCK(nbp); continue; } if (bp_bdskip) { VI_LOCK(devvp); if (!ffs_bp_snapblk(vp, nbp)) { VI_UNLOCK(devvp); BO_LOCK(bo); BUF_UNLOCK(nbp); continue; } VI_UNLOCK(devvp); } if (nbp->b_flags & B_CLUSTEROK) { vfs_bio_awrite(nbp); } else { bremfree(nbp); bawrite(nbp); } dirtybufferflushes++; break; } if (nbp == NULL) BO_UNLOCK(bo); } } /* * Check for need to copy block that is about to be written, * copying the block if necessary. */ int ffs_copyonwrite(devvp, bp) struct vnode *devvp; struct buf *bp; { struct snapdata *sn; struct buf *ibp, *cbp, *savedcbp = NULL; struct thread *td = curthread; struct fs *fs; struct inode *ip; struct vnode *vp = NULL; ufs2_daddr_t lbn, blkno, *snapblklist; int lower, upper, mid, indiroff, error = 0; int launched_async_io, prev_norunningbuf; long saved_runningbufspace; if (devvp != bp->b_vp && IS_SNAPSHOT(VTOI(bp->b_vp))) return (0); /* Update on a snapshot file */ if (td->td_pflags & TDP_COWINPROGRESS) panic("ffs_copyonwrite: recursive call"); /* * First check to see if it is in the preallocated list. * By doing this check we avoid several potential deadlocks. */ VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn == NULL || TAILQ_EMPTY(&sn->sn_head)) { VI_UNLOCK(devvp); return (0); /* No snapshot */ } ip = TAILQ_FIRST(&sn->sn_head); fs = ITOFS(ip); lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); snapblklist = sn->sn_blklist; upper = sn->sn_listsize - 1; lower = 1; while (lower <= upper) { mid = (lower + upper) / 2; if (snapblklist[mid] == lbn) break; if (snapblklist[mid] < lbn) lower = mid + 1; else upper = mid - 1; } if (lower <= upper) { VI_UNLOCK(devvp); return (0); } launched_async_io = 0; prev_norunningbuf = td->td_pflags & TDP_NORUNNINGBUF; /* * Since I/O on bp isn't yet in progress and it may be blocked * for a long time waiting on snaplk, back it out of * runningbufspace, possibly waking other threads waiting for space. */ saved_runningbufspace = bp->b_runningbufspace; if (saved_runningbufspace != 0) runningbufwakeup(bp); /* * Not in the precomputed list, so check the snapshots. */ while (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, VI_MTX(devvp)) != 0) { VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn == NULL || TAILQ_EMPTY(&sn->sn_head)) { VI_UNLOCK(devvp); if (saved_runningbufspace != 0) { bp->b_runningbufspace = saved_runningbufspace; atomic_add_long(&runningbufspace, bp->b_runningbufspace); } return (0); /* Snapshot gone */ } } TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) { vp = ITOV(ip); if (DOINGSOFTDEP(vp)) softdep_prealloc(vp, MNT_WAIT); /* * We ensure that everything of our own that needs to be * copied will be done at the time that ffs_snapshot is * called. Thus we can skip the check here which can * deadlock in doing the lookup in UFS_BALLOC. */ if (bp->b_vp == vp) continue; /* * Check to see if block needs to be copied. We do not have * to hold the snapshot lock while doing this lookup as it * will never require any additional allocations for the * snapshot inode. */ if (lbn < UFS_NDADDR) { blkno = DIP(ip, i_db[lbn]); } else { td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF; error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) break; indiroff = (lbn - UFS_NDADDR) % NINDIR(fs); if (I_IS_UFS1(ip)) blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff]; else blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff]; bqrelse(ibp); } #ifdef INVARIANTS if (blkno == BLK_SNAP && bp->b_lblkno >= 0) panic("ffs_copyonwrite: bad copy block"); #endif if (blkno != 0) continue; /* * Allocate the block into which to do the copy. Since * multiple processes may all try to copy the same block, * we have to recheck our need to do a copy if we sleep * waiting for the lock. * * Because all snapshots on a filesystem share a single * lock, we ensure that we will never be in competition * with another process to allocate a block. */ td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF; error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &cbp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) break; #ifdef DEBUG if (snapdebug) { printf("Copyonwrite: snapino %ju lbn %jd for ", (uintmax_t)ip->i_number, (intmax_t)lbn); if (bp->b_vp == devvp) printf("fs metadata"); else printf("inum %ju", (uintmax_t)VTOI(bp->b_vp)->i_number); printf(" lblkno %jd to blkno %jd\n", (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno); } #endif /* * If we have already read the old block contents, then * simply copy them to the new block. Note that we need * to synchronously write snapshots that have not been * unlinked, and hence will be visible after a crash, * to ensure their integrity. At a minimum we ensure the * integrity of the filesystem metadata, but use the * dopersistence sysctl-setable flag to decide on the * persistence needed for file content data. */ if (savedcbp != NULL) { bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); bawrite(cbp); if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR || dopersistence) && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); else launched_async_io = 1; continue; } /* * Otherwise, read the old block contents into the buffer. */ if ((error = readblock(vp, cbp, lbn)) != 0) { bzero(cbp->b_data, fs->fs_bsize); bawrite(cbp); if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR || dopersistence) && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); else launched_async_io = 1; break; } savedcbp = cbp; } /* * Note that we need to synchronously write snapshots that * have not been unlinked, and hence will be visible after * a crash, to ensure their integrity. At a minimum we * ensure the integrity of the filesystem metadata, but * use the dopersistence sysctl-setable flag to decide on * the persistence needed for file content data. */ if (savedcbp) { vp = savedcbp->b_vp; bawrite(savedcbp); if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR || dopersistence) && VTOI(vp)->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); else launched_async_io = 1; } lockmgr(vp->v_vnlock, LK_RELEASE, NULL); td->td_pflags = (td->td_pflags & ~TDP_NORUNNINGBUF) | prev_norunningbuf; if (launched_async_io && (td->td_pflags & TDP_NORUNNINGBUF) == 0) waitrunningbufspace(); /* * I/O on bp will now be started, so count it in runningbufspace. */ if (saved_runningbufspace != 0) { bp->b_runningbufspace = saved_runningbufspace; atomic_add_long(&runningbufspace, bp->b_runningbufspace); } return (error); } /* * sync snapshots to force freework records waiting on snapshots to claim * blocks to free. */ void ffs_sync_snap(mp, waitfor) struct mount *mp; int waitfor; { struct snapdata *sn; struct vnode *devvp; struct vnode *vp; struct inode *ip; devvp = VFSTOUFS(mp)->um_devvp; if ((devvp->v_vflag & VV_COPYONWRITE) == 0) return; for (;;) { VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn == NULL) { VI_UNLOCK(devvp); return; } if (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, VI_MTX(devvp)) == 0) break; } TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) { vp = ITOV(ip); ffs_syncvnode(vp, waitfor, NO_INO_UPDT); } lockmgr(&sn->sn_lock, LK_RELEASE, NULL); } /* * Read the specified block into the given buffer. * Much of this boiler-plate comes from bwrite(). */ static int readblock(vp, bp, lbn) struct vnode *vp; struct buf *bp; ufs2_daddr_t lbn; { struct inode *ip = VTOI(vp); struct bio *bip; struct fs *fs; ip = VTOI(vp); fs = ITOFS(ip); bip = g_alloc_bio(); bip->bio_cmd = BIO_READ; bip->bio_offset = dbtob(fsbtodb(fs, blkstofrags(fs, lbn))); bip->bio_data = bp->b_data; bip->bio_length = bp->b_bcount; bip->bio_done = NULL; g_io_request(bip, ITODEVVP(ip)->v_bufobj.bo_private); bp->b_error = biowait(bip, "snaprdb"); g_destroy_bio(bip); return (bp->b_error); } #endif /* * Process file deletes that were deferred by ufs_inactive() due to * the file system being suspended. Transfer IN_LAZYACCESS into * IN_MODIFIED for vnodes that were accessed during suspension. */ void process_deferred_inactive(struct mount *mp) { struct vnode *vp, *mvp; struct inode *ip; struct thread *td; int error; td = curthread; (void) vn_start_secondary_write(NULL, &mp, V_WAIT); loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { /* * IN_LAZYACCESS is checked here without holding any * vnode lock, but this flag is set only while holding * vnode interlock. */ if (vp->v_type == VNON || ((VTOI(vp)->i_flag & IN_LAZYACCESS) == 0 && ((vp->v_iflag & VI_OWEINACT) == 0 || vp->v_usecount > 0))) { VI_UNLOCK(vp); continue; } vholdl(vp); error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); if (error != 0) { vdrop(vp); if (error == ENOENT) continue; /* vnode recycled */ MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } ip = VTOI(vp); if ((ip->i_flag & IN_LAZYACCESS) != 0) { ip->i_flag &= ~IN_LAZYACCESS; ip->i_flag |= IN_MODIFIED; } VI_LOCK(vp); if ((vp->v_iflag & VI_OWEINACT) == 0 || vp->v_usecount > 0) { VI_UNLOCK(vp); VOP_UNLOCK(vp, 0); vdrop(vp); continue; } vinactive(vp, td); VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp, ("process_deferred_inactive: got VI_OWEINACT")); VI_UNLOCK(vp); VOP_UNLOCK(vp, 0); vdrop(vp); } vn_finished_secondary_write(mp); } #ifndef NO_FFS_SNAPSHOT static struct snapdata * ffs_snapdata_alloc(void) { struct snapdata *sn; /* * Fetch a snapdata from the free list if there is one available. */ mtx_lock(&snapfree_lock); sn = LIST_FIRST(&snapfree); if (sn != NULL) LIST_REMOVE(sn, sn_link); mtx_unlock(&snapfree_lock); if (sn != NULL) return (sn); /* * If there were no free snapdatas allocate one. */ sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO); TAILQ_INIT(&sn->sn_head); lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT, LK_CANRECURSE | LK_NOSHARE); return (sn); } /* * The snapdata is never freed because we can not be certain that * there are no threads sleeping on the snap lock. Persisting * them permanently avoids costly synchronization in ffs_lock(). */ static void ffs_snapdata_free(struct snapdata *sn) { mtx_lock(&snapfree_lock); LIST_INSERT_HEAD(&snapfree, sn, sn_link); mtx_unlock(&snapfree_lock); } /* Try to free snapdata associated with devvp */ static void try_free_snapdata(struct vnode *devvp) { struct snapdata *sn; ufs2_daddr_t *snapblklist; ASSERT_VI_LOCKED(devvp, "try_free_snapdata"); sn = devvp->v_rdev->si_snapdata; if (sn == NULL || TAILQ_FIRST(&sn->sn_head) != NULL || (devvp->v_vflag & VV_COPYONWRITE) == 0) { VI_UNLOCK(devvp); return; } devvp->v_rdev->si_snapdata = NULL; devvp->v_vflag &= ~VV_COPYONWRITE; lockmgr(&sn->sn_lock, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp)); snapblklist = sn->sn_blklist; sn->sn_blklist = NULL; sn->sn_listsize = 0; lockmgr(&sn->sn_lock, LK_RELEASE, NULL); if (snapblklist != NULL) free(snapblklist, M_UFSMNT); ffs_snapdata_free(sn); } static struct snapdata * ffs_snapdata_acquire(struct vnode *devvp) { struct snapdata *nsn, *sn; int error; /* * Allocate a free snapdata. This is done before acquiring the * devvp lock to avoid allocation while the devvp interlock is * held. */ nsn = ffs_snapdata_alloc(); for (;;) { VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn == NULL) { /* * This is the first snapshot on this * filesystem and we use our pre-allocated * snapdata. Publish sn with the sn_lock * owned by us, to avoid the race. */ error = lockmgr(&nsn->sn_lock, LK_EXCLUSIVE | LK_NOWAIT, NULL); if (error != 0) panic("leaked sn, lockmgr error %d", error); sn = devvp->v_rdev->si_snapdata = nsn; VI_UNLOCK(devvp); nsn = NULL; break; } /* * There is a snapshots which already exists on this * filesystem, grab a reference to the common lock. */ error = lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, VI_MTX(devvp)); if (error == 0) break; } /* * Free any unused snapdata. */ if (nsn != NULL) ffs_snapdata_free(nsn); return (sn); } #endif Index: head/sys/ufs/ffs/ffs_vfsops.c =================================================================== --- head/sys/ufs/ffs/ffs_vfsops.c (revision 323922) +++ head/sys/ufs/ffs/ffs_vfsops.c (revision 323923) @@ -1,2287 +1,2317 @@ /*- * Copyright (c) 1989, 1991, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_vfsops.c 8.31 (Berkeley) 5/20/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_quota.h" #include "opt_ufs.h" #include "opt_ffs.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static uma_zone_t uma_inode, uma_ufs1, uma_ufs2; static int ffs_mountfs(struct vnode *, struct mount *, struct thread *); static void ffs_oldfscompat_read(struct fs *, struct ufsmount *, ufs2_daddr_t); static void ffs_ifree(struct ufsmount *ump, struct inode *ip); static int ffs_sync_lazy(struct mount *mp); static vfs_init_t ffs_init; static vfs_uninit_t ffs_uninit; static vfs_extattrctl_t ffs_extattrctl; static vfs_cmount_t ffs_cmount; static vfs_unmount_t ffs_unmount; static vfs_mount_t ffs_mount; static vfs_statfs_t ffs_statfs; static vfs_fhtovp_t ffs_fhtovp; static vfs_sync_t ffs_sync; static struct vfsops ufs_vfsops = { .vfs_extattrctl = ffs_extattrctl, .vfs_fhtovp = ffs_fhtovp, .vfs_init = ffs_init, .vfs_mount = ffs_mount, .vfs_cmount = ffs_cmount, .vfs_quotactl = ufs_quotactl, .vfs_root = ufs_root, .vfs_statfs = ffs_statfs, .vfs_sync = ffs_sync, .vfs_uninit = ffs_uninit, .vfs_unmount = ffs_unmount, .vfs_vget = ffs_vget, .vfs_susp_clean = process_deferred_inactive, }; VFS_SET(ufs_vfsops, ufs, 0); MODULE_VERSION(ufs, 1); static b_strategy_t ffs_geom_strategy; static b_write_t ffs_bufwrite; static struct buf_ops ffs_ops = { .bop_name = "FFS", .bop_write = ffs_bufwrite, .bop_strategy = ffs_geom_strategy, .bop_sync = bufsync, #ifdef NO_FFS_SNAPSHOT .bop_bdflush = bufbdflush, #else .bop_bdflush = ffs_bdflush, #endif }; /* * Note that userquota and groupquota options are not currently used * by UFS/FFS code and generally mount(8) does not pass those options * from userland, but they can be passed by loader(8) via * vfs.root.mountfrom.options. */ static const char *ffs_opts[] = { "acls", "async", "noatime", "noclusterr", "noclusterw", "noexec", "export", "force", "from", "groupquota", "multilabel", "nfsv4acls", "fsckpid", "snapshot", "nosuid", "suiddir", "nosymfollow", "sync", "union", "userquota", NULL }; static int ffs_mount(struct mount *mp) { struct vnode *devvp; struct thread *td; struct ufsmount *ump = NULL; struct fs *fs; pid_t fsckpid = 0; int error, error1, flags; uint64_t mntorflags; accmode_t accmode; struct nameidata ndp; char *fspec; td = curthread; if (vfs_filteropt(mp->mnt_optnew, ffs_opts)) return (EINVAL); if (uma_inode == NULL) { uma_inode = uma_zcreate("FFS inode", sizeof(struct inode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_ufs1 = uma_zcreate("FFS1 dinode", sizeof(struct ufs1_dinode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_ufs2 = uma_zcreate("FFS2 dinode", sizeof(struct ufs2_dinode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } vfs_deleteopt(mp->mnt_optnew, "groupquota"); vfs_deleteopt(mp->mnt_optnew, "userquota"); fspec = vfs_getopts(mp->mnt_optnew, "from", &error); if (error) return (error); mntorflags = 0; if (vfs_getopt(mp->mnt_optnew, "acls", NULL, NULL) == 0) mntorflags |= MNT_ACLS; if (vfs_getopt(mp->mnt_optnew, "snapshot", NULL, NULL) == 0) { mntorflags |= MNT_SNAPSHOT; /* * Once we have set the MNT_SNAPSHOT flag, do not * persist "snapshot" in the options list. */ vfs_deleteopt(mp->mnt_optnew, "snapshot"); vfs_deleteopt(mp->mnt_opt, "snapshot"); } if (vfs_getopt(mp->mnt_optnew, "fsckpid", NULL, NULL) == 0 && vfs_scanopt(mp->mnt_optnew, "fsckpid", "%d", &fsckpid) == 1) { /* * Once we have set the restricted PID, do not * persist "fsckpid" in the options list. */ vfs_deleteopt(mp->mnt_optnew, "fsckpid"); vfs_deleteopt(mp->mnt_opt, "fsckpid"); if (mp->mnt_flag & MNT_UPDATE) { if (VFSTOUFS(mp)->um_fs->fs_ronly == 0 && vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0) == 0) { vfs_mount_error(mp, "Checker enable: Must be read-only"); return (EINVAL); } } else if (vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0) == 0) { vfs_mount_error(mp, "Checker enable: Must be read-only"); return (EINVAL); } /* Set to -1 if we are done */ if (fsckpid == 0) fsckpid = -1; } if (vfs_getopt(mp->mnt_optnew, "nfsv4acls", NULL, NULL) == 0) { if (mntorflags & MNT_ACLS) { vfs_mount_error(mp, "\"acls\" and \"nfsv4acls\" options " "are mutually exclusive"); return (EINVAL); } mntorflags |= MNT_NFS4ACLS; } MNT_ILOCK(mp); mp->mnt_flag |= mntorflags; MNT_IUNLOCK(mp); /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. */ if (mp->mnt_flag & MNT_UPDATE) { ump = VFSTOUFS(mp); fs = ump->um_fs; devvp = ump->um_devvp; if (fsckpid == -1 && ump->um_fsckpid > 0) { if ((error = ffs_flushfiles(mp, WRITECLOSE, td)) != 0 || (error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) return (error); g_topology_lock(); /* * Return to normal read-only mode. */ error = g_access(ump->um_cp, 0, -1, 0); g_topology_unlock(); ump->um_fsckpid = 0; } if (fs->fs_ronly == 0 && vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { /* * Flush any dirty data and suspend filesystem. */ if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0) return (error); error = vfs_write_suspend_umnt(mp); if (error != 0) return (error); /* * Check for and optionally get rid of files open * for writing. */ flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; if (MOUNTEDSOFTDEP(mp)) { error = softdep_flushfiles(mp, flags, td); } else { error = ffs_flushfiles(mp, flags, td); } if (error) { vfs_write_resume(mp, 0); return (error); } if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("WARNING: %s Update error: blocks %jd " "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) fs->fs_clean = 1; if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) { fs->fs_ronly = 0; fs->fs_clean = 0; vfs_write_resume(mp, 0); return (error); } if (MOUNTEDSOFTDEP(mp)) softdep_unmount(mp); g_topology_lock(); /* * Drop our write and exclusive access. */ g_access(ump->um_cp, 0, -1, -1); g_topology_unlock(); fs->fs_ronly = 1; MNT_ILOCK(mp); mp->mnt_flag |= MNT_RDONLY; MNT_IUNLOCK(mp); /* * Allow the writers to note that filesystem * is ro now. */ vfs_write_resume(mp, 0); } if ((mp->mnt_flag & MNT_RELOAD) && (error = ffs_reload(mp, td, 0)) != 0) return (error); if (fs->fs_ronly && !vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { /* * If we are running a checker, do not allow upgrade. */ if (ump->um_fsckpid > 0) { vfs_mount_error(mp, "Active checker, cannot upgrade to write"); return (EINVAL); } /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. */ vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_ACCESS(devvp, VREAD | VWRITE, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { VOP_UNLOCK(devvp, 0); return (error); } VOP_UNLOCK(devvp, 0); fs->fs_flags &= ~FS_UNCLEAN; if (fs->fs_clean == 0) { fs->fs_flags |= FS_UNCLEAN; if ((mp->mnt_flag & MNT_FORCE) || ((fs->fs_flags & (FS_SUJ | FS_NEEDSFSCK)) == 0 && (fs->fs_flags & FS_DOSOFTDEP))) { printf("WARNING: %s was not properly " "dismounted\n", fs->fs_fsmnt); } else { vfs_mount_error(mp, "R/W mount of %s denied. %s.%s", fs->fs_fsmnt, "Filesystem is not clean - run fsck", (fs->fs_flags & FS_SUJ) == 0 ? "" : " Forced mount will invalidate" " journal contents"); return (EPERM); } } g_topology_lock(); /* * Request exclusive write access. */ error = g_access(ump->um_cp, 0, 1, 1); g_topology_unlock(); if (error) return (error); if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0) return (error); fs->fs_ronly = 0; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_RDONLY; MNT_IUNLOCK(mp); fs->fs_mtime = time_second; /* check to see if we need to start softdep */ if ((fs->fs_flags & FS_DOSOFTDEP) && (error = softdep_mount(devvp, mp, fs, td->td_ucred))){ vn_finished_write(mp); return (error); } fs->fs_clean = 0; if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) { vn_finished_write(mp); return (error); } if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); vn_finished_write(mp); } /* * Soft updates is incompatible with "async", * so if we are doing softupdates stop the user * from setting the async flag in an update. * Softdep_mount() clears it in an initial mount * or ro->rw remount. */ if (MOUNTEDSOFTDEP(mp)) { /* XXX: Reset too late ? */ MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_ASYNC; MNT_IUNLOCK(mp); } /* * Keep MNT_ACLS flag if it is stored in superblock. */ if ((fs->fs_flags & FS_ACLS) != 0) { /* XXX: Set too late ? */ MNT_ILOCK(mp); mp->mnt_flag |= MNT_ACLS; MNT_IUNLOCK(mp); } if ((fs->fs_flags & FS_NFS4ACLS) != 0) { /* XXX: Set too late ? */ MNT_ILOCK(mp); mp->mnt_flag |= MNT_NFS4ACLS; MNT_IUNLOCK(mp); } /* * If this is a request from fsck to clean up the filesystem, * then allow the specified pid to proceed. */ if (fsckpid > 0) { if (ump->um_fsckpid != 0) { vfs_mount_error(mp, "Active checker already running on %s", fs->fs_fsmnt); return (EINVAL); } KASSERT(MOUNTEDSOFTDEP(mp) == 0, ("soft updates enabled on read-only file system")); g_topology_lock(); /* * Request write access. */ error = g_access(ump->um_cp, 0, 1, 0); g_topology_unlock(); if (error) { vfs_mount_error(mp, "Checker activation failed on %s", fs->fs_fsmnt); return (error); } ump->um_fsckpid = fsckpid; if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); fs->fs_mtime = time_second; fs->fs_fmod = 1; fs->fs_clean = 0; (void) ffs_sbupdate(ump, MNT_WAIT, 0); } /* * If this is a snapshot request, take the snapshot. */ if (mp->mnt_flag & MNT_SNAPSHOT) return (ffs_snapshot(mp, fspec)); /* * Must not call namei() while owning busy ref. */ vfs_unbusy(mp); } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible disk device. */ NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td); error = namei(&ndp); if ((mp->mnt_flag & MNT_UPDATE) != 0) { /* * Unmount does not start if MNT_UPDATE is set. Mount * update busies mp before setting MNT_UPDATE. We * must be able to retain our busy ref succesfully, * without sleep. */ error1 = vfs_busy(mp, MBF_NOWAIT); MPASS(error1 == 0); } if (error != 0) return (error); NDFREE(&ndp, NDF_ONLY_PNBUF); devvp = ndp.ni_vp; if (!vn_isdisk(devvp, &error)) { vput(devvp); return (error); } /* * If mount by non-root, then verify that user has necessary * permissions on the device. */ accmode = VREAD; if ((mp->mnt_flag & MNT_RDONLY) == 0) accmode |= VWRITE; error = VOP_ACCESS(devvp, accmode, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { vput(devvp); return (error); } if (mp->mnt_flag & MNT_UPDATE) { /* * Update only * * If it's not the same vnode, or at least the same device * then it's not correct. */ if (devvp->v_rdev != ump->um_devvp->v_rdev) error = EINVAL; /* needs translation */ vput(devvp); if (error) return (error); } else { /* * New mount * * We need the name for the mount point (also used for * "last mounted on") copied in. If an error occurs, * the mount point is discarded by the upper level code. * Note that vfs_mount_alloc() populates f_mntonname for us. */ if ((error = ffs_mountfs(devvp, mp, td)) != 0) { vrele(devvp); return (error); } if (fsckpid > 0) { KASSERT(MOUNTEDSOFTDEP(mp) == 0, ("soft updates enabled on read-only file system")); ump = VFSTOUFS(mp); fs = ump->um_fs; g_topology_lock(); /* * Request write access. */ error = g_access(ump->um_cp, 0, 1, 0); g_topology_unlock(); if (error) { printf("WARNING: %s: Checker activation " "failed\n", fs->fs_fsmnt); } else { ump->um_fsckpid = fsckpid; if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); fs->fs_mtime = time_second; fs->fs_clean = 0; (void) ffs_sbupdate(ump, MNT_WAIT, 0); } } } vfs_mountedfrom(mp, fspec); return (0); } /* * Compatibility with old mount system call. */ static int ffs_cmount(struct mntarg *ma, void *data, uint64_t flags) { struct ufs_args args; struct export_args exp; int error; if (data == NULL) return (EINVAL); error = copyin(data, &args, sizeof args); if (error) return (error); vfs_oexport_conv(&args.export, &exp); ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN); ma = mount_arg(ma, "export", &exp, sizeof(exp)); error = kernel_mount(ma, flags); return (error); } /* * Reload all incore data for a filesystem (used after running fsck on * the root filesystem and finding things to fix). If the 'force' flag * is 0, the filesystem must be mounted read-only. * * Things to do to update the mount: * 1) invalidate all cached meta-data. * 2) re-read superblock from disk. * 3) re-read summary information from disk. * 4) invalidate all inactive vnodes. * 5) clear MNTK_SUSPEND2 and MNTK_SUSPENDED flags, allowing secondary * writers, if requested. * 6) invalidate all cached file data. * 7) re-read inode data for all active vnodes. */ int ffs_reload(struct mount *mp, struct thread *td, int flags) { struct vnode *vp, *mvp, *devvp; struct inode *ip; void *space; struct buf *bp; struct fs *fs, *newfs; struct ufsmount *ump; ufs2_daddr_t sblockloc; int i, blks, error; u_long size; int32_t *lp; ump = VFSTOUFS(mp); MNT_ILOCK(mp); if ((mp->mnt_flag & MNT_RDONLY) == 0 && (flags & FFSR_FORCE) == 0) { MNT_IUNLOCK(mp); return (EINVAL); } MNT_IUNLOCK(mp); /* * Step 1: invalidate all cached meta-data. */ devvp = VFSTOUFS(mp)->um_devvp; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); if (vinvalbuf(devvp, 0, 0, 0) != 0) panic("ffs_reload: dirty1"); VOP_UNLOCK(devvp, 0); /* * Step 2: re-read superblock from disk. */ fs = VFSTOUFS(mp)->um_fs; if ((error = bread(devvp, btodb(fs->fs_sblockloc), fs->fs_sbsize, NOCRED, &bp)) != 0) return (error); newfs = (struct fs *)bp->b_data; if ((newfs->fs_magic != FS_UFS1_MAGIC && newfs->fs_magic != FS_UFS2_MAGIC) || newfs->fs_bsize > MAXBSIZE || newfs->fs_bsize < sizeof(struct fs)) { brelse(bp); return (EIO); /* XXX needs translation */ } /* * Copy pointer fields back into superblock before copying in XXX * new superblock. These should really be in the ufsmount. XXX * Note that important parameters (eg fs_ncg) are unchanged. */ newfs->fs_csp = fs->fs_csp; newfs->fs_maxcluster = fs->fs_maxcluster; newfs->fs_contigdirs = fs->fs_contigdirs; newfs->fs_active = fs->fs_active; newfs->fs_ronly = fs->fs_ronly; sblockloc = fs->fs_sblockloc; bcopy(newfs, fs, (u_int)fs->fs_sbsize); brelse(bp); mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen; ffs_oldfscompat_read(fs, VFSTOUFS(mp), sblockloc); UFS_LOCK(ump); if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("WARNING: %s: reload pending error: blocks %jd " "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } UFS_UNLOCK(ump); /* * Step 3: re-read summary information from disk. */ size = fs->fs_cssize; blks = howmany(size, fs->fs_fsize); if (fs->fs_contigsumsize > 0) size += fs->fs_ncg * sizeof(int32_t); size += fs->fs_ncg * sizeof(u_int8_t); free(fs->fs_csp, M_UFSMNT); space = malloc(size, M_UFSMNT, M_WAITOK); fs->fs_csp = space; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, NOCRED, &bp); if (error) return (error); bcopy(bp->b_data, space, (u_int)size); space = (char *)space + size; brelse(bp); } /* * We no longer know anything about clusters per cylinder group. */ if (fs->fs_contigsumsize > 0) { fs->fs_maxcluster = lp = space; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; space = lp; } size = fs->fs_ncg * sizeof(u_int8_t); fs->fs_contigdirs = (u_int8_t *)space; bzero(fs->fs_contigdirs, size); if ((flags & FFSR_UNSUSPEND) != 0) { MNT_ILOCK(mp); mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2); wakeup(&mp->mnt_flag); MNT_IUNLOCK(mp); } loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { /* * Skip syncer vnode. */ if (vp->v_type == VNON) { VI_UNLOCK(vp); continue; } /* * Step 4: invalidate all cached file data. */ if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } if (vinvalbuf(vp, 0, 0, 0)) panic("ffs_reload: dirty2"); /* * Step 5: re-read inode data for all active vnodes. */ ip = VTOI(vp); error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, NOCRED, &bp); if (error) { VOP_UNLOCK(vp, 0); vrele(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (error); } ffs_load_inode(bp, ip, fs, ip->i_number); ip->i_effnlink = ip->i_nlink; brelse(bp); VOP_UNLOCK(vp, 0); vrele(vp); } return (0); } /* * Possible superblock locations ordered from most to least likely. */ static int sblock_try[] = SBLOCKSEARCH; /* * Common code for mount and mountroot */ static int ffs_mountfs(devvp, mp, td) struct vnode *devvp; struct mount *mp; struct thread *td; { struct ufsmount *ump; struct buf *bp; struct fs *fs; struct cdev *dev; void *space; ufs2_daddr_t sblockloc; int error, i, blks, len, ronly; u_long size; int32_t *lp; struct ucred *cred; struct g_consumer *cp; struct mount *nmp; bp = NULL; ump = NULL; cred = td ? td->td_ucred : NOCRED; ronly = (mp->mnt_flag & MNT_RDONLY) != 0; KASSERT(devvp->v_type == VCHR, ("reclaimed devvp")); dev = devvp->v_rdev; if (atomic_cmpset_acq_ptr((uintptr_t *)&dev->si_mountpt, 0, (uintptr_t)mp) == 0) { VOP_UNLOCK(devvp, 0); return (EBUSY); } g_topology_lock(); error = g_vfs_open(devvp, &cp, "ffs", ronly ? 0 : 1); g_topology_unlock(); if (error != 0) { atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0); VOP_UNLOCK(devvp, 0); return (error); } dev_ref(dev); devvp->v_bufobj.bo_ops = &ffs_ops; VOP_UNLOCK(devvp, 0); if (dev->si_iosize_max != 0) mp->mnt_iosize_max = dev->si_iosize_max; if (mp->mnt_iosize_max > MAXPHYS) mp->mnt_iosize_max = MAXPHYS; fs = NULL; sblockloc = 0; /* * Try reading the superblock in each of its possible locations. */ for (i = 0; sblock_try[i] != -1; i++) { if ((SBLOCKSIZE % cp->provider->sectorsize) != 0) { error = EINVAL; vfs_mount_error(mp, "Invalid sectorsize %d for superblock size %d", cp->provider->sectorsize, SBLOCKSIZE); goto out; } if ((error = bread(devvp, btodb(sblock_try[i]), SBLOCKSIZE, cred, &bp)) != 0) goto out; fs = (struct fs *)bp->b_data; sblockloc = sblock_try[i]; if ((fs->fs_magic == FS_UFS1_MAGIC || (fs->fs_magic == FS_UFS2_MAGIC && (fs->fs_sblockloc == sblockloc || (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0))) && fs->fs_bsize <= MAXBSIZE && fs->fs_bsize >= sizeof(struct fs)) break; brelse(bp); bp = NULL; } if (sblock_try[i] == -1) { error = EINVAL; /* XXX needs translation */ goto out; } fs->fs_fmod = 0; - fs->fs_flags &= ~FS_INDEXDIRS; /* no support for directory indices */ + /* none of these types of check-hashes are maintained */ + fs->fs_metackhash &= ~(CK_SUPERBLOCK | CK_INODE | CK_INDIR | CK_DIR); + /* no support for directory indices or any other undefined flags */ + fs->fs_flags &= ~FS_INDEXDIRS; + fs->fs_flags &= FS_SUPPORTED; fs->fs_flags &= ~FS_UNCLEAN; if (fs->fs_clean == 0) { fs->fs_flags |= FS_UNCLEAN; if (ronly || (mp->mnt_flag & MNT_FORCE) || ((fs->fs_flags & (FS_SUJ | FS_NEEDSFSCK)) == 0 && (fs->fs_flags & FS_DOSOFTDEP))) { printf("WARNING: %s was not properly dismounted\n", fs->fs_fsmnt); } else { vfs_mount_error(mp, "R/W mount of %s denied. %s%s", fs->fs_fsmnt, "Filesystem is not clean - run fsck.", (fs->fs_flags & FS_SUJ) == 0 ? "" : " Forced mount will invalidate journal contents"); error = EPERM; goto out; } if ((fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) && (mp->mnt_flag & MNT_FORCE)) { printf("WARNING: %s: lost blocks %jd files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } } if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("WARNING: %s: mount pending error: blocks %jd " "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } if ((fs->fs_flags & FS_GJOURNAL) != 0) { #ifdef UFS_GJOURNAL /* * Get journal provider name. */ len = 1024; mp->mnt_gjprovider = malloc((u_long)len, M_UFSMNT, M_WAITOK); if (g_io_getattr("GJOURNAL::provider", cp, &len, mp->mnt_gjprovider) == 0) { mp->mnt_gjprovider = realloc(mp->mnt_gjprovider, len, M_UFSMNT, M_WAITOK); MNT_ILOCK(mp); mp->mnt_flag |= MNT_GJOURNAL; MNT_IUNLOCK(mp); } else { printf("WARNING: %s: GJOURNAL flag on fs " "but no gjournal provider below\n", mp->mnt_stat.f_mntonname); free(mp->mnt_gjprovider, M_UFSMNT); mp->mnt_gjprovider = NULL; } #else printf("WARNING: %s: GJOURNAL flag on fs but no " "UFS_GJOURNAL support\n", mp->mnt_stat.f_mntonname); #endif } else { mp->mnt_gjprovider = NULL; } ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK | M_ZERO); ump->um_cp = cp; ump->um_bo = &devvp->v_bufobj; ump->um_fs = malloc((u_long)fs->fs_sbsize, M_UFSMNT, M_WAITOK); if (fs->fs_magic == FS_UFS1_MAGIC) { ump->um_fstype = UFS1; ump->um_balloc = ffs_balloc_ufs1; } else { ump->um_fstype = UFS2; ump->um_balloc = ffs_balloc_ufs2; } ump->um_blkatoff = ffs_blkatoff; ump->um_truncate = ffs_truncate; ump->um_update = ffs_update; ump->um_valloc = ffs_valloc; ump->um_vfree = ffs_vfree; ump->um_ifree = ffs_ifree; ump->um_rdonly = ffs_rdonly; ump->um_snapgone = ffs_snapgone; mtx_init(UFS_MTX(ump), "FFS", "FFS Lock", MTX_DEF); bcopy(bp->b_data, ump->um_fs, (u_int)fs->fs_sbsize); if (fs->fs_sbsize < SBLOCKSIZE) bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); bp = NULL; fs = ump->um_fs; ffs_oldfscompat_read(fs, ump, sblockloc); fs->fs_ronly = ronly; size = fs->fs_cssize; blks = howmany(size, fs->fs_fsize); if (fs->fs_contigsumsize > 0) size += fs->fs_ncg * sizeof(int32_t); size += fs->fs_ncg * sizeof(u_int8_t); space = malloc(size, M_UFSMNT, M_WAITOK); fs->fs_csp = space; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, cred, &bp)) != 0) { free(fs->fs_csp, M_UFSMNT); goto out; } bcopy(bp->b_data, space, (u_int)size); space = (char *)space + size; brelse(bp); bp = NULL; } if (fs->fs_contigsumsize > 0) { fs->fs_maxcluster = lp = space; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; space = lp; } size = fs->fs_ncg * sizeof(u_int8_t); fs->fs_contigdirs = (u_int8_t *)space; bzero(fs->fs_contigdirs, size); fs->fs_active = NULL; mp->mnt_data = ump; mp->mnt_stat.f_fsid.val[0] = fs->fs_id[0]; mp->mnt_stat.f_fsid.val[1] = fs->fs_id[1]; nmp = NULL; if (fs->fs_id[0] == 0 || fs->fs_id[1] == 0 || (nmp = vfs_getvfs(&mp->mnt_stat.f_fsid))) { if (nmp) vfs_rel(nmp); vfs_getnewfsid(mp); } mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; MNT_IUNLOCK(mp); if ((fs->fs_flags & FS_MULTILABEL) != 0) { #ifdef MAC MNT_ILOCK(mp); mp->mnt_flag |= MNT_MULTILABEL; MNT_IUNLOCK(mp); #else printf("WARNING: %s: multilabel flag on fs but " "no MAC support\n", mp->mnt_stat.f_mntonname); #endif } if ((fs->fs_flags & FS_ACLS) != 0) { #ifdef UFS_ACL MNT_ILOCK(mp); if (mp->mnt_flag & MNT_NFS4ACLS) printf("WARNING: %s: ACLs flag on fs conflicts with " "\"nfsv4acls\" mount option; option ignored\n", mp->mnt_stat.f_mntonname); mp->mnt_flag &= ~MNT_NFS4ACLS; mp->mnt_flag |= MNT_ACLS; MNT_IUNLOCK(mp); #else printf("WARNING: %s: ACLs flag on fs but no ACLs support\n", mp->mnt_stat.f_mntonname); #endif } if ((fs->fs_flags & FS_NFS4ACLS) != 0) { #ifdef UFS_ACL MNT_ILOCK(mp); if (mp->mnt_flag & MNT_ACLS) printf("WARNING: %s: NFSv4 ACLs flag on fs conflicts " "with \"acls\" mount option; option ignored\n", mp->mnt_stat.f_mntonname); mp->mnt_flag &= ~MNT_ACLS; mp->mnt_flag |= MNT_NFS4ACLS; MNT_IUNLOCK(mp); #else printf("WARNING: %s: NFSv4 ACLs flag on fs but no " "ACLs support\n", mp->mnt_stat.f_mntonname); #endif } if ((fs->fs_flags & FS_TRIM) != 0) { len = sizeof(int); if (g_io_getattr("GEOM::candelete", cp, &len, &ump->um_candelete) == 0) { if (!ump->um_candelete) printf("WARNING: %s: TRIM flag on fs but disk " "does not support TRIM\n", mp->mnt_stat.f_mntonname); } else { printf("WARNING: %s: TRIM flag on fs but disk does " "not confirm that it supports TRIM\n", mp->mnt_stat.f_mntonname); ump->um_candelete = 0; } if (ump->um_candelete) { ump->um_trim_tq = taskqueue_create("trim", M_WAITOK, taskqueue_thread_enqueue, &ump->um_trim_tq); taskqueue_start_threads(&ump->um_trim_tq, 1, PVFS, "%s trim", mp->mnt_stat.f_mntonname); } } ump->um_mountp = mp; ump->um_dev = dev; ump->um_devvp = devvp; ump->um_nindir = fs->fs_nindir; ump->um_bptrtodb = fs->fs_fsbtodb; ump->um_seqinc = fs->fs_frag; for (i = 0; i < MAXQUOTAS; i++) ump->um_quotas[i] = NULLVP; #ifdef UFS_EXTATTR ufs_extattr_uepm_init(&ump->um_extattr); #endif /* * Set FS local "last mounted on" information (NULL pad) */ bzero(fs->fs_fsmnt, MAXMNTLEN); strlcpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname, MAXMNTLEN); mp->mnt_stat.f_iosize = fs->fs_bsize; if (mp->mnt_flag & MNT_ROOTFS) { /* * Root mount; update timestamp in mount structure. * this will be used by the common root mount code * to update the system clock. */ mp->mnt_time = fs->fs_time; } if (ronly == 0) { fs->fs_mtime = time_second; if ((fs->fs_flags & FS_DOSOFTDEP) && (error = softdep_mount(devvp, mp, fs, cred)) != 0) { free(fs->fs_csp, M_UFSMNT); ffs_flushfiles(mp, FORCECLOSE, td); goto out; } if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); fs->fs_fmod = 1; fs->fs_clean = 0; (void) ffs_sbupdate(ump, MNT_WAIT, 0); } /* * Initialize filesystem state information in mount struct. */ MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED | MNTK_NO_IOPF | MNTK_UNMAPPED_BUFS | MNTK_USES_BCACHE; MNT_IUNLOCK(mp); #ifdef UFS_EXTATTR #ifdef UFS_EXTATTR_AUTOSTART /* * * Auto-starting does the following: * - check for /.attribute in the fs, and extattr_start if so * - for each file in .attribute, enable that file with * an attribute of the same name. * Not clear how to report errors -- probably eat them. * This would all happen while the filesystem was busy/not * available, so would effectively be "atomic". */ (void) ufs_extattr_autostart(mp, td); #endif /* !UFS_EXTATTR_AUTOSTART */ #endif /* !UFS_EXTATTR */ return (0); out: if (bp) brelse(bp); if (cp != NULL) { g_topology_lock(); g_vfs_close(cp); g_topology_unlock(); } if (ump) { mtx_destroy(UFS_MTX(ump)); if (mp->mnt_gjprovider != NULL) { free(mp->mnt_gjprovider, M_UFSMNT); mp->mnt_gjprovider = NULL; } free(ump->um_fs, M_UFSMNT); free(ump, M_UFSMNT); mp->mnt_data = NULL; } atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0); dev_rel(dev); return (error); } #include static int bigcgs = 0; SYSCTL_INT(_debug, OID_AUTO, bigcgs, CTLFLAG_RW, &bigcgs, 0, ""); /* * Sanity checks for loading old filesystem superblocks. * See ffs_oldfscompat_write below for unwound actions. * * XXX - Parts get retired eventually. * Unfortunately new bits get added. */ static void ffs_oldfscompat_read(fs, ump, sblockloc) struct fs *fs; struct ufsmount *ump; ufs2_daddr_t sblockloc; { off_t maxfilesize; /* * If not yet done, update fs_flags location and value of fs_sblockloc. */ if ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) { fs->fs_flags = fs->fs_old_flags; fs->fs_old_flags |= FS_FLAGS_UPDATED; fs->fs_sblockloc = sblockloc; } /* * If not yet done, update UFS1 superblock with new wider fields. */ if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_maxbsize != fs->fs_bsize) { fs->fs_maxbsize = fs->fs_bsize; fs->fs_time = fs->fs_old_time; fs->fs_size = fs->fs_old_size; fs->fs_dsize = fs->fs_old_dsize; fs->fs_csaddr = fs->fs_old_csaddr; fs->fs_cstotal.cs_ndir = fs->fs_old_cstotal.cs_ndir; fs->fs_cstotal.cs_nbfree = fs->fs_old_cstotal.cs_nbfree; fs->fs_cstotal.cs_nifree = fs->fs_old_cstotal.cs_nifree; fs->fs_cstotal.cs_nffree = fs->fs_old_cstotal.cs_nffree; } if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_old_inodefmt < FS_44INODEFMT) { fs->fs_maxfilesize = ((uint64_t)1 << 31) - 1; fs->fs_qbmask = ~fs->fs_bmask; fs->fs_qfmask = ~fs->fs_fmask; } if (fs->fs_magic == FS_UFS1_MAGIC) { ump->um_savedmaxfilesize = fs->fs_maxfilesize; maxfilesize = (uint64_t)0x80000000 * fs->fs_bsize - 1; if (fs->fs_maxfilesize > maxfilesize) fs->fs_maxfilesize = maxfilesize; } /* Compatibility for old filesystems */ if (fs->fs_avgfilesize <= 0) fs->fs_avgfilesize = AVFILESIZ; if (fs->fs_avgfpdir <= 0) fs->fs_avgfpdir = AFPDIR; if (bigcgs) { fs->fs_save_cgsize = fs->fs_cgsize; fs->fs_cgsize = fs->fs_bsize; } } /* * Unwinding superblock updates for old filesystems. * See ffs_oldfscompat_read above for details. * * XXX - Parts get retired eventually. * Unfortunately new bits get added. */ void ffs_oldfscompat_write(fs, ump) struct fs *fs; struct ufsmount *ump; { /* * Copy back UFS2 updated fields that UFS1 inspects. */ if (fs->fs_magic == FS_UFS1_MAGIC) { fs->fs_old_time = fs->fs_time; fs->fs_old_cstotal.cs_ndir = fs->fs_cstotal.cs_ndir; fs->fs_old_cstotal.cs_nbfree = fs->fs_cstotal.cs_nbfree; fs->fs_old_cstotal.cs_nifree = fs->fs_cstotal.cs_nifree; fs->fs_old_cstotal.cs_nffree = fs->fs_cstotal.cs_nffree; fs->fs_maxfilesize = ump->um_savedmaxfilesize; } if (bigcgs) { fs->fs_cgsize = fs->fs_save_cgsize; fs->fs_save_cgsize = 0; } } /* * unmount system call */ static int ffs_unmount(mp, mntflags) struct mount *mp; int mntflags; { struct thread *td; struct ufsmount *ump = VFSTOUFS(mp); struct fs *fs; int error, flags, susp; #ifdef UFS_EXTATTR int e_restart; #endif flags = 0; td = curthread; fs = ump->um_fs; susp = 0; if (mntflags & MNT_FORCE) { flags |= FORCECLOSE; susp = fs->fs_ronly == 0; } #ifdef UFS_EXTATTR if ((error = ufs_extattr_stop(mp, td))) { if (error != EOPNOTSUPP) printf("WARNING: unmount %s: ufs_extattr_stop " "returned errno %d\n", mp->mnt_stat.f_mntonname, error); e_restart = 0; } else { ufs_extattr_uepm_destroy(&ump->um_extattr); e_restart = 1; } #endif if (susp) { error = vfs_write_suspend_umnt(mp); if (error != 0) goto fail1; } if (MOUNTEDSOFTDEP(mp)) error = softdep_flushfiles(mp, flags, td); else error = ffs_flushfiles(mp, flags, td); if (error != 0 && error != ENXIO) goto fail; UFS_LOCK(ump); if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("WARNING: unmount %s: pending error: blocks %jd " "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } UFS_UNLOCK(ump); if (MOUNTEDSOFTDEP(mp)) softdep_unmount(mp); if (fs->fs_ronly == 0 || ump->um_fsckpid > 0) { fs->fs_clean = fs->fs_flags & (FS_UNCLEAN|FS_NEEDSFSCK) ? 0 : 1; error = ffs_sbupdate(ump, MNT_WAIT, 0); if (error && error != ENXIO) { fs->fs_clean = 0; goto fail; } } if (susp) vfs_write_resume(mp, VR_START_WRITE); if (ump->um_trim_tq != NULL) { while (ump->um_trim_inflight != 0) pause("ufsutr", hz); taskqueue_drain_all(ump->um_trim_tq); taskqueue_free(ump->um_trim_tq); } g_topology_lock(); if (ump->um_fsckpid > 0) { /* * Return to normal read-only mode. */ error = g_access(ump->um_cp, 0, -1, 0); ump->um_fsckpid = 0; } g_vfs_close(ump->um_cp); g_topology_unlock(); atomic_store_rel_ptr((uintptr_t *)&ump->um_dev->si_mountpt, 0); vrele(ump->um_devvp); dev_rel(ump->um_dev); mtx_destroy(UFS_MTX(ump)); if (mp->mnt_gjprovider != NULL) { free(mp->mnt_gjprovider, M_UFSMNT); mp->mnt_gjprovider = NULL; } free(fs->fs_csp, M_UFSMNT); free(fs, M_UFSMNT); free(ump, M_UFSMNT); mp->mnt_data = NULL; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_LOCAL; MNT_IUNLOCK(mp); if (td->td_su == mp) { td->td_su = NULL; vfs_rel(mp); } return (error); fail: if (susp) vfs_write_resume(mp, VR_START_WRITE); fail1: #ifdef UFS_EXTATTR if (e_restart) { ufs_extattr_uepm_init(&ump->um_extattr); #ifdef UFS_EXTATTR_AUTOSTART (void) ufs_extattr_autostart(mp, td); #endif } #endif return (error); } /* * Flush out all the files in a filesystem. */ int ffs_flushfiles(mp, flags, td) struct mount *mp; int flags; struct thread *td; { struct ufsmount *ump; int qerror, error; ump = VFSTOUFS(mp); qerror = 0; #ifdef QUOTA if (mp->mnt_flag & MNT_QUOTA) { int i; error = vflush(mp, 0, SKIPSYSTEM|flags, td); if (error) return (error); for (i = 0; i < MAXQUOTAS; i++) { error = quotaoff(td, mp, i); if (error != 0) { if ((flags & EARLYFLUSH) == 0) return (error); else qerror = error; } } /* * Here we fall through to vflush again to ensure that * we have gotten rid of all the system vnodes, unless * quotas must not be closed. */ } #endif ASSERT_VOP_LOCKED(ump->um_devvp, "ffs_flushfiles"); if (ump->um_devvp->v_vflag & VV_COPYONWRITE) { if ((error = vflush(mp, 0, SKIPSYSTEM | flags, td)) != 0) return (error); ffs_snapshot_unmount(mp); flags |= FORCECLOSE; /* * Here we fall through to vflush again to ensure * that we have gotten rid of all the system vnodes. */ } /* * Do not close system files if quotas were not closed, to be * able to sync the remaining dquots. The freeblks softupdate * workitems might hold a reference on a dquot, preventing * quotaoff() from completing. Next round of * softdep_flushworklist() iteration should process the * blockers, allowing the next run of quotaoff() to finally * flush held dquots. * * Otherwise, flush all the files. */ if (qerror == 0 && (error = vflush(mp, 0, flags, td)) != 0) return (error); /* * Flush filesystem metadata. */ vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(ump->um_devvp, MNT_WAIT, td); VOP_UNLOCK(ump->um_devvp, 0); return (error); } /* * Get filesystem statistics. */ static int ffs_statfs(mp, sbp) struct mount *mp; struct statfs *sbp; { struct ufsmount *ump; struct fs *fs; ump = VFSTOUFS(mp); fs = ump->um_fs; if (fs->fs_magic != FS_UFS1_MAGIC && fs->fs_magic != FS_UFS2_MAGIC) panic("ffs_statfs"); sbp->f_version = STATFS_VERSION; sbp->f_bsize = fs->fs_fsize; sbp->f_iosize = fs->fs_bsize; sbp->f_blocks = fs->fs_dsize; UFS_LOCK(ump); sbp->f_bfree = fs->fs_cstotal.cs_nbfree * fs->fs_frag + fs->fs_cstotal.cs_nffree + dbtofsb(fs, fs->fs_pendingblocks); sbp->f_bavail = freespace(fs, fs->fs_minfree) + dbtofsb(fs, fs->fs_pendingblocks); sbp->f_files = fs->fs_ncg * fs->fs_ipg - UFS_ROOTINO; sbp->f_ffree = fs->fs_cstotal.cs_nifree + fs->fs_pendinginodes; UFS_UNLOCK(ump); sbp->f_namemax = UFS_MAXNAMLEN; return (0); } static bool sync_doupdate(struct inode *ip) { return ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) != 0); } /* * For a lazy sync, we only care about access times, quotas and the * superblock. Other filesystem changes are already converted to * cylinder group blocks or inode blocks updates and are written to * disk by syncer. */ static int ffs_sync_lazy(mp) struct mount *mp; { struct vnode *mvp, *vp; struct inode *ip; struct thread *td; int allerror, error; allerror = 0; td = curthread; if ((mp->mnt_flag & MNT_NOATIME) != 0) goto qupdate; MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) { if (vp->v_type == VNON) { VI_UNLOCK(vp); continue; } ip = VTOI(vp); /* * The IN_ACCESS flag is converted to IN_MODIFIED by * ufs_close() and ufs_getattr() by the calls to * ufs_itimes_locked(), without subsequent UFS_UPDATE(). * Test also all the other timestamp flags too, to pick up * any other cases that could be missed. */ if (!sync_doupdate(ip) && (vp->v_iflag & VI_OWEINACT) == 0) { VI_UNLOCK(vp); continue; } if ((error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, td)) != 0) continue; if (sync_doupdate(ip)) error = ffs_update(vp, 0); if (error != 0) allerror = error; vput(vp); } qupdate: #ifdef QUOTA qsync(mp); #endif if (VFSTOUFS(mp)->um_fs->fs_fmod != 0 && (error = ffs_sbupdate(VFSTOUFS(mp), MNT_LAZY, 0)) != 0) allerror = error; return (allerror); } /* * Go through the disk queues to initiate sandbagged IO; * go through the inodes to write those that have been modified; * initiate the writing of the super block if it has been modified. * * Note: we are always called with the filesystem marked busy using * vfs_busy(). */ static int ffs_sync(mp, waitfor) struct mount *mp; int waitfor; { struct vnode *mvp, *vp, *devvp; struct thread *td; struct inode *ip; struct ufsmount *ump = VFSTOUFS(mp); struct fs *fs; int error, count, lockreq, allerror = 0; int suspend; int suspended; int secondary_writes; int secondary_accwrites; int softdep_deps; int softdep_accdeps; struct bufobj *bo; suspend = 0; suspended = 0; td = curthread; fs = ump->um_fs; if (fs->fs_fmod != 0 && fs->fs_ronly != 0 && ump->um_fsckpid == 0) panic("%s: ffs_sync: modification on read-only filesystem", fs->fs_fsmnt); if (waitfor == MNT_LAZY) { if (!rebooting) return (ffs_sync_lazy(mp)); waitfor = MNT_NOWAIT; } /* * Write back each (modified) inode. */ lockreq = LK_EXCLUSIVE | LK_NOWAIT; if (waitfor == MNT_SUSPEND) { suspend = 1; waitfor = MNT_WAIT; } if (waitfor == MNT_WAIT) lockreq = LK_EXCLUSIVE; lockreq |= LK_INTERLOCK | LK_SLEEPFAIL; loop: /* Grab snapshot of secondary write counts */ MNT_ILOCK(mp); secondary_writes = mp->mnt_secondary_writes; secondary_accwrites = mp->mnt_secondary_accwrites; MNT_IUNLOCK(mp); /* Grab snapshot of softdep dependency counts */ softdep_get_depcounts(mp, &softdep_deps, &softdep_accdeps); MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { /* * Depend on the vnode interlock to keep things stable enough * for a quick test. Since there might be hundreds of * thousands of vnodes, we cannot afford even a subroutine * call unless there's a good chance that we have work to do. */ if (vp->v_type == VNON) { VI_UNLOCK(vp); continue; } ip = VTOI(vp); if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 && vp->v_bufobj.bo_dirty.bv_cnt == 0) { VI_UNLOCK(vp); continue; } if ((error = vget(vp, lockreq, td)) != 0) { if (error == ENOENT || error == ENOLCK) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } continue; } if ((error = ffs_syncvnode(vp, waitfor, 0)) != 0) allerror = error; vput(vp); } /* * Force stale filesystem control information to be flushed. */ if (waitfor == MNT_WAIT || rebooting) { if ((error = softdep_flushworklist(ump->um_mountp, &count, td))) allerror = error; /* Flushed work items may create new vnodes to clean */ if (allerror == 0 && count) goto loop; } #ifdef QUOTA qsync(mp); #endif devvp = ump->um_devvp; bo = &devvp->v_bufobj; BO_LOCK(bo); if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) { BO_UNLOCK(bo); vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(devvp, waitfor, td); VOP_UNLOCK(devvp, 0); if (MOUNTEDSOFTDEP(mp) && (error == 0 || error == EAGAIN)) error = ffs_sbupdate(ump, waitfor, 0); if (error != 0) allerror = error; if (allerror == 0 && waitfor == MNT_WAIT) goto loop; } else if (suspend != 0) { if (softdep_check_suspend(mp, devvp, softdep_deps, softdep_accdeps, secondary_writes, secondary_accwrites) != 0) { MNT_IUNLOCK(mp); goto loop; /* More work needed */ } mtx_assert(MNT_MTX(mp), MA_OWNED); mp->mnt_kern_flag |= MNTK_SUSPEND2 | MNTK_SUSPENDED; MNT_IUNLOCK(mp); suspended = 1; } else BO_UNLOCK(bo); /* * Write back modified superblock. */ if (fs->fs_fmod != 0 && (error = ffs_sbupdate(ump, waitfor, suspended)) != 0) allerror = error; return (allerror); } int ffs_vget(mp, ino, flags, vpp) struct mount *mp; ino_t ino; int flags; struct vnode **vpp; { return (ffs_vgetf(mp, ino, flags, vpp, 0)); } int ffs_vgetf(mp, ino, flags, vpp, ffs_flags) struct mount *mp; ino_t ino; int flags; struct vnode **vpp; int ffs_flags; { struct fs *fs; struct inode *ip; struct ufsmount *ump; struct buf *bp; struct vnode *vp; int error; error = vfs_hash_get(mp, ino, flags, curthread, vpp, NULL, NULL); if (error || *vpp != NULL) return (error); /* * We must promote to an exclusive lock for vnode creation. This * can happen if lookup is passed LOCKSHARED. */ if ((flags & LK_TYPE_MASK) == LK_SHARED) { flags &= ~LK_TYPE_MASK; flags |= LK_EXCLUSIVE; } /* * We do not lock vnode creation as it is believed to be too * expensive for such rare case as simultaneous creation of vnode * for same ino by different processes. We just allow them to race * and check later to decide who wins. Let the race begin! */ ump = VFSTOUFS(mp); fs = ump->um_fs; ip = uma_zalloc(uma_inode, M_WAITOK | M_ZERO); /* Allocate a new vnode/inode. */ error = getnewvnode("ufs", mp, fs->fs_magic == FS_UFS1_MAGIC ? &ffs_vnodeops1 : &ffs_vnodeops2, &vp); if (error) { *vpp = NULL; uma_zfree(uma_inode, ip); return (error); } /* * FFS supports recursive locking. */ lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL); VN_LOCK_AREC(vp); vp->v_data = ip; vp->v_bufobj.bo_bsize = fs->fs_bsize; ip->i_vnode = vp; ip->i_ump = ump; ip->i_number = ino; ip->i_ea_refs = 0; ip->i_nextclustercg = -1; ip->i_flag = fs->fs_magic == FS_UFS1_MAGIC ? 0 : IN_UFS2; #ifdef QUOTA { int i; for (i = 0; i < MAXQUOTAS; i++) ip->i_dquot[i] = NODQUOT; } #endif if (ffs_flags & FFSV_FORCEINSMQ) vp->v_vflag |= VV_FORCEINSMQ; error = insmntque(vp, mp); if (error != 0) { uma_zfree(uma_inode, ip); *vpp = NULL; return (error); } vp->v_vflag &= ~VV_FORCEINSMQ; error = vfs_hash_insert(vp, ino, flags, curthread, vpp, NULL, NULL); if (error || *vpp != NULL) return (error); /* Read in the disk contents for the inode, copy into the inode. */ error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)), (int)fs->fs_bsize, NOCRED, &bp); if (error) { /* * The inode does not contain anything useful, so it would * be misleading to leave it on its hash chain. With mode * still zero, it will be unlinked and returned to the free * list by vput(). */ brelse(bp); vput(vp); *vpp = NULL; return (error); } if (I_IS_UFS1(ip)) ip->i_din1 = uma_zalloc(uma_ufs1, M_WAITOK); else ip->i_din2 = uma_zalloc(uma_ufs2, M_WAITOK); ffs_load_inode(bp, ip, fs, ino); if (DOINGSOFTDEP(vp)) softdep_load_inodeblock(ip); else ip->i_effnlink = ip->i_nlink; bqrelse(bp); /* * Initialize the vnode from the inode, check for aliases. * Note that the underlying vnode may have changed. */ error = ufs_vinit(mp, I_IS_UFS1(ip) ? &ffs_fifoops1 : &ffs_fifoops2, &vp); if (error) { vput(vp); *vpp = NULL; return (error); } /* * Finish inode initialization. */ if (vp->v_type != VFIFO) { /* FFS supports shared locking for all files except fifos. */ VN_LOCK_ASHARE(vp); } /* * Set up a generation number for this inode if it does not * already have one. This should only happen on old filesystems. */ if (ip->i_gen == 0) { while (ip->i_gen == 0) ip->i_gen = arc4random(); if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { ip->i_flag |= IN_MODIFIED; DIP_SET(ip, i_gen, ip->i_gen); } } #ifdef MAC if ((mp->mnt_flag & MNT_MULTILABEL) && ip->i_mode) { /* * If this vnode is already allocated, and we're running * multi-label, attempt to perform a label association * from the extended attributes on the inode. */ error = mac_vnode_associate_extattr(mp, vp); if (error) { /* ufs_inactive will release ip->i_devvp ref. */ vput(vp); *vpp = NULL; return (error); } } #endif *vpp = vp; return (0); } /* * File handle to vnode * * Have to be really careful about stale file handles: * - check that the inode number is valid * - for UFS2 check that the inode number is initialized * - call ffs_vget() to get the locked inode * - check for an unallocated inode (i_mode == 0) * - check that the given client host has export rights and return * those rights via. exflagsp and credanonp */ static int ffs_fhtovp(mp, fhp, flags, vpp) struct mount *mp; struct fid *fhp; int flags; struct vnode **vpp; { struct ufid *ufhp; struct ufsmount *ump; struct fs *fs; struct cg *cgp; struct buf *bp; ino_t ino; u_int cg; int error; ufhp = (struct ufid *)fhp; ino = ufhp->ufid_ino; ump = VFSTOUFS(mp); fs = ump->um_fs; if (ino < UFS_ROOTINO || ino >= fs->fs_ncg * fs->fs_ipg) return (ESTALE); /* * Need to check if inode is initialized because UFS2 does lazy * initialization and nfs_fhtovp can offer arbitrary inode numbers. */ if (fs->fs_magic != FS_UFS2_MAGIC) return (ufs_fhtovp(mp, ufhp, flags, vpp)); cg = ino_to_cg(fs, ino); if ((error = ffs_getcg(fs, ump->um_devvp, cg, &bp, &cgp)) != 0) return (error); if (ino >= cg * fs->fs_ipg + cgp->cg_initediblk) { brelse(bp); return (ESTALE); } brelse(bp); return (ufs_fhtovp(mp, ufhp, flags, vpp)); } /* * Initialize the filesystem. */ static int ffs_init(vfsp) struct vfsconf *vfsp; { ffs_susp_initialize(); softdep_initialize(); return (ufs_init(vfsp)); } /* * Undo the work of ffs_init(). */ static int ffs_uninit(vfsp) struct vfsconf *vfsp; { int ret; ret = ufs_uninit(vfsp); softdep_uninitialize(); ffs_susp_uninitialize(); return (ret); } /* * Write a superblock and associated information back to disk. */ int ffs_sbupdate(ump, waitfor, suspended) struct ufsmount *ump; int waitfor; int suspended; { struct fs *fs = ump->um_fs; struct buf *sbbp; struct buf *bp; int blks; void *space; int i, size, error, allerror = 0; if (fs->fs_ronly == 1 && (ump->um_mountp->mnt_flag & (MNT_RDONLY | MNT_UPDATE)) != (MNT_RDONLY | MNT_UPDATE) && ump->um_fsckpid == 0) panic("ffs_sbupdate: write read-only filesystem"); /* * We use the superblock's buf to serialize calls to ffs_sbupdate(). */ sbbp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), (int)fs->fs_sbsize, 0, 0, 0); /* * First write back the summary information. */ blks = howmany(fs->fs_cssize, fs->fs_fsize); space = fs->fs_csp; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; bp = getblk(ump->um_devvp, fsbtodb(fs, fs->fs_csaddr + i), size, 0, 0, 0); bcopy(space, bp->b_data, (u_int)size); space = (char *)space + size; if (suspended) bp->b_flags |= B_VALIDSUSPWRT; if (waitfor != MNT_WAIT) bawrite(bp); else if ((error = bwrite(bp)) != 0) allerror = error; } /* * Now write back the superblock itself. If any errors occurred * up to this point, then fail so that the superblock avoids * being written out as clean. */ if (allerror) { brelse(sbbp); return (allerror); } bp = sbbp; if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_sblockloc != SBLOCK_UFS1 && (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) { printf("WARNING: %s: correcting fs_sblockloc from %jd to %d\n", fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS1); fs->fs_sblockloc = SBLOCK_UFS1; } if (fs->fs_magic == FS_UFS2_MAGIC && fs->fs_sblockloc != SBLOCK_UFS2 && (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) { printf("WARNING: %s: correcting fs_sblockloc from %jd to %d\n", fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS2); fs->fs_sblockloc = SBLOCK_UFS2; } fs->fs_fmod = 0; fs->fs_time = time_second; if (MOUNTEDSOFTDEP(ump->um_mountp)) softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, bp); bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); ffs_oldfscompat_write((struct fs *)bp->b_data, ump); if (suspended) bp->b_flags |= B_VALIDSUSPWRT; if (waitfor != MNT_WAIT) bawrite(bp); else if ((error = bwrite(bp)) != 0) allerror = error; return (allerror); } static int ffs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp, int attrnamespace, const char *attrname) { #ifdef UFS_EXTATTR return (ufs_extattrctl(mp, cmd, filename_vp, attrnamespace, attrname)); #else return (vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace, attrname)); #endif } static void ffs_ifree(struct ufsmount *ump, struct inode *ip) { if (ump->um_fstype == UFS1 && ip->i_din1 != NULL) uma_zfree(uma_ufs1, ip->i_din1); else if (ip->i_din2 != NULL) uma_zfree(uma_ufs2, ip->i_din2); uma_zfree(uma_inode, ip); } static int dobkgrdwrite = 1; SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0, "Do background writes (honoring the BV_BKGRDWRITE flag)?"); /* * Complete a background write started from bwrite. */ static void ffs_backgroundwritedone(struct buf *bp) { struct bufobj *bufobj; struct buf *origbp; /* * Find the original buffer that we are writing. */ bufobj = bp->b_bufobj; BO_LOCK(bufobj); if ((origbp = gbincore(bp->b_bufobj, bp->b_lblkno)) == NULL) panic("backgroundwritedone: lost buffer"); /* * We should mark the cylinder group buffer origbp as * dirty, to not loose the failed write. */ if ((bp->b_ioflags & BIO_ERROR) != 0) origbp->b_vflags |= BV_BKGRDERR; BO_UNLOCK(bufobj); /* * Process dependencies then return any unfinished ones. */ pbrelvp(bp); if (!LIST_EMPTY(&bp->b_dep) && (bp->b_ioflags & BIO_ERROR) == 0) buf_complete(bp); #ifdef SOFTUPDATES if (!LIST_EMPTY(&bp->b_dep)) softdep_move_dependencies(bp, origbp); #endif /* * This buffer is marked B_NOCACHE so when it is released * by biodone it will be tossed. */ bp->b_flags |= B_NOCACHE; bp->b_flags &= ~B_CACHE; /* * Prevent brelse() from trying to keep and re-dirtying bp on * errors. It causes b_bufobj dereference in * bdirty()/reassignbuf(), and b_bufobj was cleared in * pbrelvp() above. */ if ((bp->b_ioflags & BIO_ERROR) != 0) bp->b_flags |= B_INVAL; bufdone(bp); BO_LOCK(bufobj); /* * Clear the BV_BKGRDINPROG flag in the original buffer * and awaken it if it is waiting for the write to complete. * If BV_BKGRDINPROG is not set in the original buffer it must * have been released and re-instantiated - which is not legal. */ KASSERT((origbp->b_vflags & BV_BKGRDINPROG), ("backgroundwritedone: lost buffer2")); origbp->b_vflags &= ~BV_BKGRDINPROG; if (origbp->b_vflags & BV_BKGRDWAIT) { origbp->b_vflags &= ~BV_BKGRDWAIT; wakeup(&origbp->b_xflags); } BO_UNLOCK(bufobj); } /* * Write, release buffer on completion. (Done by iodone * if async). Do not bother writing anything if the buffer * is invalid. * * Note that we set B_CACHE here, indicating that buffer is * fully valid and thus cacheable. This is true even of NFS * now so we set it generally. This could be set either here * or in biodone() since the I/O is synchronous. We put it * here. */ static int ffs_bufwrite(struct buf *bp) { struct buf *newbp; CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } if (!BUF_ISLOCKED(bp)) panic("bufwrite: buffer is not busy???"); /* * If a background write is already in progress, delay * writing this block if it is asynchronous. Otherwise * wait for the background write to complete. */ BO_LOCK(bp->b_bufobj); if (bp->b_vflags & BV_BKGRDINPROG) { if (bp->b_flags & B_ASYNC) { BO_UNLOCK(bp->b_bufobj); bdwrite(bp); return (0); } bp->b_vflags |= BV_BKGRDWAIT; msleep(&bp->b_xflags, BO_LOCKPTR(bp->b_bufobj), PRIBIO, "bwrbg", 0); if (bp->b_vflags & BV_BKGRDINPROG) panic("bufwrite: still writing"); } bp->b_vflags &= ~BV_BKGRDERR; BO_UNLOCK(bp->b_bufobj); /* * If this buffer is marked for background writing and we * do not have to wait for it, make a copy and write the * copy so as to leave this buffer ready for further use. * * This optimization eats a lot of memory. If we have a page * or buffer shortfall we can't do it. */ if (dobkgrdwrite && (bp->b_xflags & BX_BKGRDWRITE) && (bp->b_flags & B_ASYNC) && !vm_page_count_severe() && !buf_dirty_count_severe()) { KASSERT(bp->b_iodone == NULL, ("bufwrite: needs chained iodone (%p)", bp->b_iodone)); /* get a new block */ newbp = geteblk(bp->b_bufsize, GB_NOWAIT_BD); if (newbp == NULL) goto normal_write; KASSERT(buf_mapped(bp), ("Unmapped cg")); memcpy(newbp->b_data, bp->b_data, bp->b_bufsize); BO_LOCK(bp->b_bufobj); bp->b_vflags |= BV_BKGRDINPROG; BO_UNLOCK(bp->b_bufobj); - newbp->b_xflags |= BX_BKGRDMARKER; + newbp->b_xflags |= + (bp->b_xflags & BX_FSPRIV) | BX_BKGRDMARKER; newbp->b_lblkno = bp->b_lblkno; newbp->b_blkno = bp->b_blkno; newbp->b_offset = bp->b_offset; newbp->b_iodone = ffs_backgroundwritedone; newbp->b_flags |= B_ASYNC; newbp->b_flags &= ~B_INVAL; pbgetvp(bp->b_vp, newbp); #ifdef SOFTUPDATES /* * Move over the dependencies. If there are rollbacks, * leave the parent buffer dirtied as it will need to * be written again. */ if (LIST_EMPTY(&bp->b_dep) || softdep_move_dependencies(bp, newbp) == 0) bundirty(bp); #else bundirty(bp); #endif /* * Initiate write on the copy, release the original. The * BKGRDINPROG flag prevents it from going away until * the background write completes. */ bqrelse(bp); bp = newbp; } else /* Mark the buffer clean */ bundirty(bp); /* Let the normal bufwrite do the rest for us */ normal_write: return (bufwrite(bp)); } static void ffs_geom_strategy(struct bufobj *bo, struct buf *bp) { struct vnode *vp; - int error; struct buf *tbp; - int nocopy; + int error, nocopy; vp = bo2vnode(bo); if (bp->b_iocmd == BIO_WRITE) { if ((bp->b_flags & B_VALIDSUSPWRT) == 0 && bp->b_vp != NULL && bp->b_vp->v_mount != NULL && (bp->b_vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0) panic("ffs_geom_strategy: bad I/O"); nocopy = bp->b_flags & B_NOCOPY; bp->b_flags &= ~(B_VALIDSUSPWRT | B_NOCOPY); if ((vp->v_vflag & VV_COPYONWRITE) && nocopy == 0 && vp->v_rdev->si_snapdata != NULL) { if ((bp->b_flags & B_CLUSTER) != 0) { runningbufwakeup(bp); TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head, b_cluster.cluster_entry) { error = ffs_copyonwrite(vp, tbp); if (error != 0 && error != EOPNOTSUPP) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } } bp->b_runningbufspace = bp->b_bufsize; atomic_add_long(&runningbufspace, bp->b_runningbufspace); } else { error = ffs_copyonwrite(vp, bp); if (error != 0 && error != EOPNOTSUPP) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } } } #ifdef SOFTUPDATES if ((bp->b_flags & B_CLUSTER) != 0) { TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head, b_cluster.cluster_entry) { if (!LIST_EMPTY(&tbp->b_dep)) buf_start(tbp); } } else { if (!LIST_EMPTY(&bp->b_dep)) buf_start(bp); } #endif + /* + * Check for metadata that needs check-hashes and update them. + */ + switch (bp->b_xflags & BX_FSPRIV) { + case BX_CYLGRP: + ((struct cg *)bp->b_data)->cg_ckhash = 0; + ((struct cg *)bp->b_data)->cg_ckhash = + calculate_crc32c(~0L, bp->b_data, bp->b_bcount); + break; + + case BX_SUPERBLOCK: + case BX_INODE: + case BX_INDIR: + case BX_DIR: + printf("Check-hash write is unimplemented!!!\n"); + break; + + case 0: + break; + + default: + printf("multiple buffer types 0x%b\n", + (u_int)(bp->b_xflags & BX_FSPRIV), + PRINT_UFS_BUF_XFLAGS); + break; + } } g_vfs_strategy(bo, bp); } int ffs_own_mount(const struct mount *mp) { if (mp->mnt_op == &ufs_vfsops) return (1); return (0); } #ifdef DDB #ifdef SOFTUPDATES /* defined in ffs_softdep.c */ extern void db_print_ffs(struct ufsmount *ump); DB_SHOW_COMMAND(ffs, db_show_ffs) { struct mount *mp; struct ufsmount *ump; if (have_addr) { ump = VFSTOUFS((struct mount *)addr); db_print_ffs(ump); return; } TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (!strcmp(mp->mnt_stat.f_fstypename, ufs_vfsconf.vfc_name)) db_print_ffs(VFSTOUFS(mp)); } } #endif /* SOFTUPDATES */ #endif /* DDB */ Index: head/sys/ufs/ffs/ffs_vnops.c =================================================================== --- head/sys/ufs/ffs/ffs_vnops.c (revision 323922) +++ head/sys/ufs/ffs/ffs_vnops.c (revision 323923) @@ -1,1726 +1,1726 @@ /*- * Copyright (c) 2002, 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Marshall * Kirk McKusick and Network Associates Laboratories, the Security * Research Division of Network Associates, Inc. under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS * research program * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95 * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ... * @(#)ffs_vnops.c 8.15 (Berkeley) 5/14/95 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_directio.h" #include "opt_ffs.h" #define ALIGNED_TO(ptr, s) \ (((uintptr_t)(ptr) & (_Alignof(s) - 1)) == 0) #ifdef DIRECTIO extern int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone); #endif static vop_fdatasync_t ffs_fdatasync; static vop_fsync_t ffs_fsync; static vop_getpages_t ffs_getpages; static vop_lock1_t ffs_lock; static vop_read_t ffs_read; static vop_write_t ffs_write; static int ffs_extread(struct vnode *vp, struct uio *uio, int ioflag); static int ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred); static vop_strategy_t ffsext_strategy; static vop_closeextattr_t ffs_closeextattr; static vop_deleteextattr_t ffs_deleteextattr; static vop_getextattr_t ffs_getextattr; static vop_listextattr_t ffs_listextattr; static vop_openextattr_t ffs_openextattr; static vop_setextattr_t ffs_setextattr; static vop_vptofh_t ffs_vptofh; /* Global vfs data structures for ufs. */ struct vop_vector ffs_vnodeops1 = { .vop_default = &ufs_vnodeops, .vop_fsync = ffs_fsync, .vop_fdatasync = ffs_fdatasync, .vop_getpages = ffs_getpages, .vop_getpages_async = vnode_pager_local_getpages_async, .vop_lock1 = ffs_lock, .vop_read = ffs_read, .vop_reallocblks = ffs_reallocblks, .vop_write = ffs_write, .vop_vptofh = ffs_vptofh, }; struct vop_vector ffs_fifoops1 = { .vop_default = &ufs_fifoops, .vop_fsync = ffs_fsync, .vop_fdatasync = ffs_fdatasync, .vop_reallocblks = ffs_reallocblks, /* XXX: really ??? */ .vop_vptofh = ffs_vptofh, }; /* Global vfs data structures for ufs. */ struct vop_vector ffs_vnodeops2 = { .vop_default = &ufs_vnodeops, .vop_fsync = ffs_fsync, .vop_fdatasync = ffs_fdatasync, .vop_getpages = ffs_getpages, .vop_getpages_async = vnode_pager_local_getpages_async, .vop_lock1 = ffs_lock, .vop_read = ffs_read, .vop_reallocblks = ffs_reallocblks, .vop_write = ffs_write, .vop_closeextattr = ffs_closeextattr, .vop_deleteextattr = ffs_deleteextattr, .vop_getextattr = ffs_getextattr, .vop_listextattr = ffs_listextattr, .vop_openextattr = ffs_openextattr, .vop_setextattr = ffs_setextattr, .vop_vptofh = ffs_vptofh, }; struct vop_vector ffs_fifoops2 = { .vop_default = &ufs_fifoops, .vop_fsync = ffs_fsync, .vop_fdatasync = ffs_fdatasync, .vop_lock1 = ffs_lock, .vop_reallocblks = ffs_reallocblks, .vop_strategy = ffsext_strategy, .vop_closeextattr = ffs_closeextattr, .vop_deleteextattr = ffs_deleteextattr, .vop_getextattr = ffs_getextattr, .vop_listextattr = ffs_listextattr, .vop_openextattr = ffs_openextattr, .vop_setextattr = ffs_setextattr, .vop_vptofh = ffs_vptofh, }; /* * Synch an open file. */ /* ARGSUSED */ static int ffs_fsync(struct vop_fsync_args *ap) { struct vnode *vp; struct bufobj *bo; int error; vp = ap->a_vp; bo = &vp->v_bufobj; retry: error = ffs_syncvnode(vp, ap->a_waitfor, 0); if (error) return (error); if (ap->a_waitfor == MNT_WAIT && DOINGSOFTDEP(vp)) { error = softdep_fsync(vp); if (error) return (error); /* * The softdep_fsync() function may drop vp lock, * allowing for dirty buffers to reappear on the * bo_dirty list. Recheck and resync as needed. */ BO_LOCK(bo); if ((vp->v_type == VREG || vp->v_type == VDIR) && (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)) { BO_UNLOCK(bo); goto retry; } BO_UNLOCK(bo); } return (0); } int ffs_syncvnode(struct vnode *vp, int waitfor, int flags) { struct inode *ip; struct bufobj *bo; struct buf *bp, *nbp; ufs_lbn_t lbn; int error, passes; bool still_dirty, wait; ip = VTOI(vp); ip->i_flag &= ~IN_NEEDSYNC; bo = &vp->v_bufobj; /* * When doing MNT_WAIT we must first flush all dependencies * on the inode. */ if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT && (error = softdep_sync_metadata(vp)) != 0) return (error); /* * Flush all dirty buffers associated with a vnode. */ error = 0; passes = 0; wait = false; /* Always do an async pass first. */ lbn = lblkno(ITOFS(ip), (ip->i_size + ITOFS(ip)->fs_bsize - 1)); BO_LOCK(bo); loop: TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) bp->b_vflags &= ~BV_SCANNED; TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { /* * Reasons to skip this buffer: it has already been considered * on this pass, the buffer has dependencies that will cause * it to be redirtied and it has not already been deferred, * or it is already being written. */ if ((bp->b_vflags & BV_SCANNED) != 0) continue; bp->b_vflags |= BV_SCANNED; /* * Flush indirects in order, if requested. * * Note that if only datasync is requested, we can * skip indirect blocks when softupdates are not * active. Otherwise we must flush them with data, * since dependencies prevent data block writes. */ if (waitfor == MNT_WAIT && bp->b_lblkno <= -UFS_NDADDR && (lbn_level(bp->b_lblkno) >= passes || ((flags & DATA_ONLY) != 0 && !DOINGSOFTDEP(vp)))) continue; if (bp->b_lblkno > lbn) panic("ffs_syncvnode: syncing truncated data."); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) { BO_UNLOCK(bo); } else if (wait) { if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) != 0) { bp->b_vflags &= ~BV_SCANNED; goto next; } } else continue; if ((bp->b_flags & B_DELWRI) == 0) panic("ffs_fsync: not dirty"); /* * Check for dependencies and potentially complete them. */ if (!LIST_EMPTY(&bp->b_dep) && (error = softdep_sync_buf(vp, bp, wait ? MNT_WAIT : MNT_NOWAIT)) != 0) { /* I/O error. */ if (error != EBUSY) { BUF_UNLOCK(bp); return (error); } /* If we deferred once, don't defer again. */ if ((bp->b_flags & B_DEFERRED) == 0) { bp->b_flags |= B_DEFERRED; BUF_UNLOCK(bp); goto next; } } if (wait) { bremfree(bp); if ((error = bwrite(bp)) != 0) return (error); } else if ((bp->b_flags & B_CLUSTEROK)) { (void) vfs_bio_awrite(bp); } else { bremfree(bp); (void) bawrite(bp); } next: /* * Since we may have slept during the I/O, we need * to start from a known point. */ BO_LOCK(bo); nbp = TAILQ_FIRST(&bo->bo_dirty.bv_hd); } if (waitfor != MNT_WAIT) { BO_UNLOCK(bo); if ((flags & NO_INO_UPDT) != 0) return (0); else return (ffs_update(vp, 0)); } /* Drain IO to see if we're done. */ bufobj_wwait(bo, 0, 0); /* * Block devices associated with filesystems may have new I/O * requests posted for them even if the vnode is locked, so no * amount of trying will get them clean. We make several passes * as a best effort. * * Regular files may need multiple passes to flush all dependency * work as it is possible that we must write once per indirect * level, once for the leaf, and once for the inode and each of * these will be done with one sync and one async pass. */ if (bo->bo_dirty.bv_cnt > 0) { if ((flags & DATA_ONLY) == 0) { still_dirty = true; } else { /* * For data-only sync, dirty indirect buffers * are ignored. */ still_dirty = false; TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) { if (bp->b_lblkno > -UFS_NDADDR) { still_dirty = true; break; } } } if (still_dirty) { /* Write the inode after sync passes to flush deps. */ if (wait && DOINGSOFTDEP(vp) && (flags & NO_INO_UPDT) == 0) { BO_UNLOCK(bo); ffs_update(vp, 1); BO_LOCK(bo); } /* switch between sync/async. */ wait = !wait; if (wait || ++passes < UFS_NIADDR + 2) goto loop; #ifdef INVARIANTS if (!vn_isdisk(vp, NULL)) vn_printf(vp, "ffs_fsync: dirty "); #endif } } BO_UNLOCK(bo); error = 0; if ((flags & DATA_ONLY) == 0) { if ((flags & NO_INO_UPDT) == 0) error = ffs_update(vp, 1); if (DOINGSUJ(vp)) softdep_journal_fsync(VTOI(vp)); } return (error); } static int ffs_fdatasync(struct vop_fdatasync_args *ap) { return (ffs_syncvnode(ap->a_vp, MNT_WAIT, DATA_ONLY)); } static int ffs_lock(ap) struct vop_lock1_args /* { struct vnode *a_vp; int a_flags; struct thread *a_td; char *file; int line; } */ *ap; { #ifndef NO_FFS_SNAPSHOT struct vnode *vp; int flags; struct lock *lkp; int result; switch (ap->a_flags & LK_TYPE_MASK) { case LK_SHARED: case LK_UPGRADE: case LK_EXCLUSIVE: vp = ap->a_vp; flags = ap->a_flags; for (;;) { #ifdef DEBUG_VFS_LOCKS KASSERT(vp->v_holdcnt != 0, ("ffs_lock %p: zero hold count", vp)); #endif lkp = vp->v_vnlock; result = _lockmgr_args(lkp, flags, VI_MTX(vp), LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, ap->a_file, ap->a_line); if (lkp == vp->v_vnlock || result != 0) break; /* * Apparent success, except that the vnode * mutated between snapshot file vnode and * regular file vnode while this process * slept. The lock currently held is not the * right lock. Release it, and try to get the * new lock. */ (void) _lockmgr_args(lkp, LK_RELEASE, NULL, LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, ap->a_file, ap->a_line); if ((flags & (LK_INTERLOCK | LK_NOWAIT)) == (LK_INTERLOCK | LK_NOWAIT)) return (EBUSY); if ((flags & LK_TYPE_MASK) == LK_UPGRADE) flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE; flags &= ~LK_INTERLOCK; } break; default: result = VOP_LOCK1_APV(&ufs_vnodeops, ap); } return (result); #else return (VOP_LOCK1_APV(&ufs_vnodeops, ap)); #endif } /* * Vnode op for reading. */ static int ffs_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { struct vnode *vp; struct inode *ip; struct uio *uio; struct fs *fs; struct buf *bp; ufs_lbn_t lbn, nextlbn; off_t bytesinfile; long size, xfersize, blkoffset; ssize_t orig_resid; int error; int seqcount; int ioflag; vp = ap->a_vp; uio = ap->a_uio; ioflag = ap->a_ioflag; if (ap->a_ioflag & IO_EXT) #ifdef notyet return (ffs_extread(vp, uio, ioflag)); #else panic("ffs_read+IO_EXT"); #endif #ifdef DIRECTIO if ((ioflag & IO_DIRECT) != 0) { int workdone; error = ffs_rawread(vp, uio, &workdone); if (error != 0 || workdone != 0) return error; } #endif seqcount = ap->a_ioflag >> IO_SEQSHIFT; ip = VTOI(vp); #ifdef INVARIANTS if (uio->uio_rw != UIO_READ) panic("ffs_read: mode"); if (vp->v_type == VLNK) { if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen) panic("ffs_read: short symlink"); } else if (vp->v_type != VREG && vp->v_type != VDIR) panic("ffs_read: type %d", vp->v_type); #endif orig_resid = uio->uio_resid; KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0")); if (orig_resid == 0) return (0); KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0")); fs = ITOFS(ip); if (uio->uio_offset < ip->i_size && uio->uio_offset >= fs->fs_maxfilesize) return (EOVERFLOW); for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0) break; lbn = lblkno(fs, uio->uio_offset); nextlbn = lbn + 1; /* * size of buffer. The buffer representing the * end of the file is rounded up to the size of * the block type ( fragment or full block, * depending ). */ size = blksize(fs, ip, lbn); blkoffset = blkoff(fs, uio->uio_offset); /* * The amount we want to transfer in this iteration is * one FS block less the amount of the data before * our startpoint (duh!) */ xfersize = fs->fs_bsize - blkoffset; /* * But if we actually want less than the block, * or the file doesn't have a whole block more of data, * then use the lesser number. */ if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; if (bytesinfile < xfersize) xfersize = bytesinfile; if (lblktosize(fs, nextlbn) >= ip->i_size) { /* * Don't do readahead if this is the end of the file. */ error = bread_gb(vp, lbn, size, NOCRED, GB_UNMAPPED, &bp); } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { /* * Otherwise if we are allowed to cluster, * grab as much as we can. * * XXX This may not be a win if we are not * doing sequential access. */ error = cluster_read(vp, ip->i_size, lbn, size, NOCRED, blkoffset + uio->uio_resid, seqcount, GB_UNMAPPED, &bp); } else if (seqcount > 1) { /* * If we are NOT allowed to cluster, then * if we appear to be acting sequentially, * fire off a request for a readahead * as well as a read. Note that the 4th and 5th * arguments point to arrays of the size specified in * the 6th argument. */ u_int nextsize = blksize(fs, ip, nextlbn); error = breadn_flags(vp, lbn, size, &nextlbn, - &nextsize, 1, NOCRED, GB_UNMAPPED, &bp); + &nextsize, 1, NOCRED, GB_UNMAPPED, NULL, &bp); } else { /* * Failing all of the above, just read what the * user asked for. Interestingly, the same as * the first option above. */ error = bread_gb(vp, lbn, size, NOCRED, GB_UNMAPPED, &bp); } if (error) { brelse(bp); bp = NULL; break; } /* * We should only get non-zero b_resid when an I/O error * has occurred, which should cause us to break above. * However, if the short read did not cause an error, * then we want to ensure that we do not uiomove bad * or uninitialized data. */ size -= bp->b_resid; if (size < xfersize) { if (size == 0) break; xfersize = size; } if (buf_mapped(bp)) { error = vn_io_fault_uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); } else { error = vn_io_fault_pgmove(bp->b_pages, blkoffset, (int)xfersize, uio); } if (error) break; vfs_bio_brelse(bp, ioflag); } /* * This can only happen in the case of an error * because the loop above resets bp to NULL on each iteration * and on normal completion has not set a new value into it. * so it must have come from a 'break' statement */ if (bp != NULL) vfs_bio_brelse(bp, ioflag); if ((error == 0 || uio->uio_resid != orig_resid) && (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0 && (ip->i_flag & IN_ACCESS) == 0) { VI_LOCK(vp); ip->i_flag |= IN_ACCESS; VI_UNLOCK(vp); } return (error); } /* * Vnode op for writing. */ static int ffs_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { struct vnode *vp; struct uio *uio; struct inode *ip; struct fs *fs; struct buf *bp; ufs_lbn_t lbn; off_t osize; ssize_t resid; int seqcount; int blkoffset, error, flags, ioflag, size, xfersize; vp = ap->a_vp; uio = ap->a_uio; ioflag = ap->a_ioflag; if (ap->a_ioflag & IO_EXT) #ifdef notyet return (ffs_extwrite(vp, uio, ioflag, ap->a_cred)); #else panic("ffs_write+IO_EXT"); #endif seqcount = ap->a_ioflag >> IO_SEQSHIFT; ip = VTOI(vp); #ifdef INVARIANTS if (uio->uio_rw != UIO_WRITE) panic("ffs_write: mode"); #endif switch (vp->v_type) { case VREG: if (ioflag & IO_APPEND) uio->uio_offset = ip->i_size; if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) return (EPERM); /* FALLTHROUGH */ case VLNK: break; case VDIR: panic("ffs_write: dir write"); break; default: panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type, (int)uio->uio_offset, (int)uio->uio_resid ); } KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0")); KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0")); fs = ITOFS(ip); if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) return (EFBIG); /* * Maybe this should be above the vnode op call, but so long as * file servers have no limits, I don't think it matters. */ if (vn_rlimit_fsize(vp, uio, uio->uio_td)) return (EFBIG); resid = uio->uio_resid; osize = ip->i_size; if (seqcount > BA_SEQMAX) flags = BA_SEQMAX << BA_SEQSHIFT; else flags = seqcount << BA_SEQSHIFT; if (ioflag & IO_SYNC) flags |= IO_SYNC; flags |= BA_UNMAPPED; for (error = 0; uio->uio_resid > 0;) { lbn = lblkno(fs, uio->uio_offset); blkoffset = blkoff(fs, uio->uio_offset); xfersize = fs->fs_bsize - blkoffset; if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; if (uio->uio_offset + xfersize > ip->i_size) vnode_pager_setsize(vp, uio->uio_offset + xfersize); /* * We must perform a read-before-write if the transfer size * does not cover the entire buffer. */ if (fs->fs_bsize > xfersize) flags |= BA_CLRBUF; else flags &= ~BA_CLRBUF; /* XXX is uio->uio_offset the right thing here? */ error = UFS_BALLOC(vp, uio->uio_offset, xfersize, ap->a_cred, flags, &bp); if (error != 0) { vnode_pager_setsize(vp, ip->i_size); break; } if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL)) bp->b_flags |= B_NOCACHE; if (uio->uio_offset + xfersize > ip->i_size) { ip->i_size = uio->uio_offset + xfersize; DIP_SET(ip, i_size, ip->i_size); } size = blksize(fs, ip, lbn) - bp->b_resid; if (size < xfersize) xfersize = size; if (buf_mapped(bp)) { error = vn_io_fault_uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); } else { error = vn_io_fault_pgmove(bp->b_pages, blkoffset, (int)xfersize, uio); } /* * If the buffer is not already filled and we encounter an * error while trying to fill it, we have to clear out any * garbage data from the pages instantiated for the buffer. * If we do not, a failed uiomove() during a write can leave * the prior contents of the pages exposed to a userland mmap. * * Note that we need only clear buffers with a transfer size * equal to the block size because buffers with a shorter * transfer size were cleared above by the call to UFS_BALLOC() * with the BA_CLRBUF flag set. * * If the source region for uiomove identically mmaps the * buffer, uiomove() performed the NOP copy, and the buffer * content remains valid because the page fault handler * validated the pages. */ if (error != 0 && (bp->b_flags & B_CACHE) == 0 && fs->fs_bsize == xfersize) vfs_bio_clrbuf(bp); vfs_bio_set_flags(bp, ioflag); /* * If IO_SYNC each buffer is written synchronously. Otherwise * if we have a severe page deficiency write the buffer * asynchronously. Otherwise try to cluster, and if that * doesn't do it then either do an async write (if O_DIRECT), * or a delayed write (if not). */ if (ioflag & IO_SYNC) { (void)bwrite(bp); } else if (vm_page_count_severe() || buf_dirty_count_severe() || (ioflag & IO_ASYNC)) { bp->b_flags |= B_CLUSTEROK; bawrite(bp); } else if (xfersize + blkoffset == fs->fs_bsize) { if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) { bp->b_flags |= B_CLUSTEROK; cluster_write(vp, bp, ip->i_size, seqcount, GB_UNMAPPED); } else { bawrite(bp); } } else if (ioflag & IO_DIRECT) { bp->b_flags |= B_CLUSTEROK; bawrite(bp); } else { bp->b_flags |= B_CLUSTEROK; bdwrite(bp); } if (error || xfersize == 0) break; ip->i_flag |= IN_CHANGE | IN_UPDATE; } /* * If we successfully wrote any data, and we are not the superuser * we clear the setuid and setgid bits as a precaution against * tampering. */ if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ap->a_cred) { if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0)) { ip->i_mode &= ~(ISUID | ISGID); DIP_SET(ip, i_mode, ip->i_mode); } } if (error) { if (ioflag & IO_UNIT) { (void)ffs_truncate(vp, osize, IO_NORMAL | (ioflag & IO_SYNC), ap->a_cred); uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; } } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) error = ffs_update(vp, 1); return (error); } /* * Extended attribute area reading. */ static int ffs_extread(struct vnode *vp, struct uio *uio, int ioflag) { struct inode *ip; struct ufs2_dinode *dp; struct fs *fs; struct buf *bp; ufs_lbn_t lbn, nextlbn; off_t bytesinfile; long size, xfersize, blkoffset; ssize_t orig_resid; int error; ip = VTOI(vp); fs = ITOFS(ip); dp = ip->i_din2; #ifdef INVARIANTS if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC) panic("ffs_extread: mode"); #endif orig_resid = uio->uio_resid; KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0")); if (orig_resid == 0) return (0); KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0")); for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0) break; lbn = lblkno(fs, uio->uio_offset); nextlbn = lbn + 1; /* * size of buffer. The buffer representing the * end of the file is rounded up to the size of * the block type ( fragment or full block, * depending ). */ size = sblksize(fs, dp->di_extsize, lbn); blkoffset = blkoff(fs, uio->uio_offset); /* * The amount we want to transfer in this iteration is * one FS block less the amount of the data before * our startpoint (duh!) */ xfersize = fs->fs_bsize - blkoffset; /* * But if we actually want less than the block, * or the file doesn't have a whole block more of data, * then use the lesser number. */ if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; if (bytesinfile < xfersize) xfersize = bytesinfile; if (lblktosize(fs, nextlbn) >= dp->di_extsize) { /* * Don't do readahead if this is the end of the info. */ error = bread(vp, -1 - lbn, size, NOCRED, &bp); } else { /* * If we have a second block, then * fire off a request for a readahead * as well as a read. Note that the 4th and 5th * arguments point to arrays of the size specified in * the 6th argument. */ u_int nextsize = sblksize(fs, dp->di_extsize, nextlbn); nextlbn = -1 - nextlbn; error = breadn(vp, -1 - lbn, size, &nextlbn, &nextsize, 1, NOCRED, &bp); } if (error) { brelse(bp); bp = NULL; break; } /* * We should only get non-zero b_resid when an I/O error * has occurred, which should cause us to break above. * However, if the short read did not cause an error, * then we want to ensure that we do not uiomove bad * or uninitialized data. */ size -= bp->b_resid; if (size < xfersize) { if (size == 0) break; xfersize = size; } error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); if (error) break; vfs_bio_brelse(bp, ioflag); } /* * This can only happen in the case of an error * because the loop above resets bp to NULL on each iteration * and on normal completion has not set a new value into it. * so it must have come from a 'break' statement */ if (bp != NULL) vfs_bio_brelse(bp, ioflag); return (error); } /* * Extended attribute area writing. */ static int ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred) { struct inode *ip; struct ufs2_dinode *dp; struct fs *fs; struct buf *bp; ufs_lbn_t lbn; off_t osize; ssize_t resid; int blkoffset, error, flags, size, xfersize; ip = VTOI(vp); fs = ITOFS(ip); dp = ip->i_din2; #ifdef INVARIANTS if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC) panic("ffs_extwrite: mode"); #endif if (ioflag & IO_APPEND) uio->uio_offset = dp->di_extsize; KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0")); KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0")); if ((uoff_t)uio->uio_offset + uio->uio_resid > UFS_NXADDR * fs->fs_bsize) return (EFBIG); resid = uio->uio_resid; osize = dp->di_extsize; flags = IO_EXT; if (ioflag & IO_SYNC) flags |= IO_SYNC; for (error = 0; uio->uio_resid > 0;) { lbn = lblkno(fs, uio->uio_offset); blkoffset = blkoff(fs, uio->uio_offset); xfersize = fs->fs_bsize - blkoffset; if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; /* * We must perform a read-before-write if the transfer size * does not cover the entire buffer. */ if (fs->fs_bsize > xfersize) flags |= BA_CLRBUF; else flags &= ~BA_CLRBUF; error = UFS_BALLOC(vp, uio->uio_offset, xfersize, ucred, flags, &bp); if (error != 0) break; /* * If the buffer is not valid we have to clear out any * garbage data from the pages instantiated for the buffer. * If we do not, a failed uiomove() during a write can leave * the prior contents of the pages exposed to a userland * mmap(). XXX deal with uiomove() errors a better way. */ if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize) vfs_bio_clrbuf(bp); if (uio->uio_offset + xfersize > dp->di_extsize) dp->di_extsize = uio->uio_offset + xfersize; size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid; if (size < xfersize) xfersize = size; error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); vfs_bio_set_flags(bp, ioflag); /* * If IO_SYNC each buffer is written synchronously. Otherwise * if we have a severe page deficiency write the buffer * asynchronously. Otherwise try to cluster, and if that * doesn't do it then either do an async write (if O_DIRECT), * or a delayed write (if not). */ if (ioflag & IO_SYNC) { (void)bwrite(bp); } else if (vm_page_count_severe() || buf_dirty_count_severe() || xfersize + blkoffset == fs->fs_bsize || (ioflag & (IO_ASYNC | IO_DIRECT))) bawrite(bp); else bdwrite(bp); if (error || xfersize == 0) break; ip->i_flag |= IN_CHANGE; } /* * If we successfully wrote any data, and we are not the superuser * we clear the setuid and setgid bits as a precaution against * tampering. */ if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) { if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID, 0)) { ip->i_mode &= ~(ISUID | ISGID); dp->di_mode = ip->i_mode; } } if (error) { if (ioflag & IO_UNIT) { (void)ffs_truncate(vp, osize, IO_EXT | (ioflag&IO_SYNC), ucred); uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; } } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) error = ffs_update(vp, 1); return (error); } /* * Vnode operating to retrieve a named extended attribute. * * Locate a particular EA (nspace:name) in the area (ptr:length), and return * the length of the EA, and possibly the pointer to the entry and to the data. */ static int ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name, struct extattr **eapp, u_char **eac) { struct extattr *eap, *eaend; size_t nlen; nlen = strlen(name); KASSERT(ALIGNED_TO(ptr, struct extattr), ("unaligned")); eap = (struct extattr *)ptr; eaend = (struct extattr *)(ptr + length); for (; eap < eaend; eap = EXTATTR_NEXT(eap)) { /* make sure this entry is complete */ if (EXTATTR_NEXT(eap) > eaend) break; if (eap->ea_namespace != nspace || eap->ea_namelength != nlen || memcmp(eap->ea_name, name, nlen) != 0) continue; if (eapp != NULL) *eapp = eap; if (eac != NULL) *eac = EXTATTR_CONTENT(eap); return (EXTATTR_CONTENT_SIZE(eap)); } return (-1); } static int ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra) { struct inode *ip; struct ufs2_dinode *dp; struct fs *fs; struct uio luio; struct iovec liovec; u_int easize; int error; u_char *eae; ip = VTOI(vp); fs = ITOFS(ip); dp = ip->i_din2; easize = dp->di_extsize; if ((uoff_t)easize + extra > UFS_NXADDR * fs->fs_bsize) return (EFBIG); eae = malloc(easize + extra, M_TEMP, M_WAITOK); liovec.iov_base = eae; liovec.iov_len = easize; luio.uio_iov = &liovec; luio.uio_iovcnt = 1; luio.uio_offset = 0; luio.uio_resid = easize; luio.uio_segflg = UIO_SYSSPACE; luio.uio_rw = UIO_READ; luio.uio_td = td; error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC); if (error) { free(eae, M_TEMP); return(error); } *p = eae; return (0); } static void ffs_lock_ea(struct vnode *vp) { struct inode *ip; ip = VTOI(vp); VI_LOCK(vp); while (ip->i_flag & IN_EA_LOCKED) { ip->i_flag |= IN_EA_LOCKWAIT; msleep(&ip->i_ea_refs, &vp->v_interlock, PINOD + 2, "ufs_ea", 0); } ip->i_flag |= IN_EA_LOCKED; VI_UNLOCK(vp); } static void ffs_unlock_ea(struct vnode *vp) { struct inode *ip; ip = VTOI(vp); VI_LOCK(vp); if (ip->i_flag & IN_EA_LOCKWAIT) wakeup(&ip->i_ea_refs); ip->i_flag &= ~(IN_EA_LOCKED | IN_EA_LOCKWAIT); VI_UNLOCK(vp); } static int ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td) { struct inode *ip; struct ufs2_dinode *dp; int error; ip = VTOI(vp); ffs_lock_ea(vp); if (ip->i_ea_area != NULL) { ip->i_ea_refs++; ffs_unlock_ea(vp); return (0); } dp = ip->i_din2; error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0); if (error) { ffs_unlock_ea(vp); return (error); } ip->i_ea_len = dp->di_extsize; ip->i_ea_error = 0; ip->i_ea_refs++; ffs_unlock_ea(vp); return (0); } /* * Vnode extattr transaction commit/abort */ static int ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td) { struct inode *ip; struct uio luio; struct iovec liovec; int error; struct ufs2_dinode *dp; ip = VTOI(vp); ffs_lock_ea(vp); if (ip->i_ea_area == NULL) { ffs_unlock_ea(vp); return (EINVAL); } dp = ip->i_din2; error = ip->i_ea_error; if (commit && error == 0) { ASSERT_VOP_ELOCKED(vp, "ffs_close_ea commit"); if (cred == NOCRED) cred = vp->v_mount->mnt_cred; liovec.iov_base = ip->i_ea_area; liovec.iov_len = ip->i_ea_len; luio.uio_iov = &liovec; luio.uio_iovcnt = 1; luio.uio_offset = 0; luio.uio_resid = ip->i_ea_len; luio.uio_segflg = UIO_SYSSPACE; luio.uio_rw = UIO_WRITE; luio.uio_td = td; /* XXX: I'm not happy about truncating to zero size */ if (ip->i_ea_len < dp->di_extsize) error = ffs_truncate(vp, 0, IO_EXT, cred); error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred); } if (--ip->i_ea_refs == 0) { free(ip->i_ea_area, M_TEMP); ip->i_ea_area = NULL; ip->i_ea_len = 0; ip->i_ea_error = 0; } ffs_unlock_ea(vp); return (error); } /* * Vnode extattr strategy routine for fifos. * * We need to check for a read or write of the external attributes. * Otherwise we just fall through and do the usual thing. */ static int ffsext_strategy(struct vop_strategy_args *ap) /* struct vop_strategy_args { struct vnodeop_desc *a_desc; struct vnode *a_vp; struct buf *a_bp; }; */ { struct vnode *vp; daddr_t lbn; vp = ap->a_vp; lbn = ap->a_bp->b_lblkno; if (I_IS_UFS2(VTOI(vp)) && lbn < 0 && lbn >= -UFS_NXADDR) return (VOP_STRATEGY_APV(&ufs_vnodeops, ap)); if (vp->v_type == VFIFO) return (VOP_STRATEGY_APV(&ufs_fifoops, ap)); panic("spec nodes went here"); } /* * Vnode extattr transaction commit/abort */ static int ffs_openextattr(struct vop_openextattr_args *ap) /* struct vop_openextattr_args { struct vnodeop_desc *a_desc; struct vnode *a_vp; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td)); } /* * Vnode extattr transaction commit/abort */ static int ffs_closeextattr(struct vop_closeextattr_args *ap) /* struct vop_closeextattr_args { struct vnodeop_desc *a_desc; struct vnode *a_vp; int a_commit; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); if (ap->a_commit && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)) return (EROFS); return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td)); } /* * Vnode operation to remove a named attribute. */ static int ffs_deleteextattr(struct vop_deleteextattr_args *ap) /* vop_deleteextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct inode *ip; struct fs *fs; struct extattr *eap; uint32_t ul; int olen, error, i, easize; u_char *eae; void *tmp; ip = VTOI(ap->a_vp); fs = ITOFS(ip); if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); if (strlen(ap->a_name) == 0) return (EINVAL); if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error) { /* * ffs_lock_ea is not needed there, because the vnode * must be exclusively locked. */ if (ip->i_ea_area != NULL && ip->i_ea_error == 0) ip->i_ea_error = error; return (error); } error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); if (error) return (error); /* CEM: delete could be done in-place instead */ eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK); bcopy(ip->i_ea_area, eae, ip->i_ea_len); easize = ip->i_ea_len; olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name, &eap, NULL); if (olen == -1) { /* delete but nonexistent */ free(eae, M_TEMP); ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); return (ENOATTR); } ul = eap->ea_length; i = (u_char *)EXTATTR_NEXT(eap) - eae; bcopy(EXTATTR_NEXT(eap), eap, easize - i); easize -= ul; tmp = ip->i_ea_area; ip->i_ea_area = eae; ip->i_ea_len = easize; free(tmp, M_TEMP); error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td); return (error); } /* * Vnode operation to retrieve a named extended attribute. */ static int ffs_getextattr(struct vop_getextattr_args *ap) /* vop_getextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; OUT size_t *a_size; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct inode *ip; u_char *eae, *p; unsigned easize; int error, ealen; ip = VTOI(ap->a_vp); if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error) return (error); error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); if (error) return (error); eae = ip->i_ea_area; easize = ip->i_ea_len; ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name, NULL, &p); if (ealen >= 0) { error = 0; if (ap->a_size != NULL) *ap->a_size = ealen; else if (ap->a_uio != NULL) error = uiomove(p, ealen, ap->a_uio); } else error = ENOATTR; ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); return (error); } /* * Vnode operation to retrieve extended attributes on a vnode. */ static int ffs_listextattr(struct vop_listextattr_args *ap) /* vop_listextattr { IN struct vnode *a_vp; IN int a_attrnamespace; INOUT struct uio *a_uio; OUT size_t *a_size; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct inode *ip; struct extattr *eap, *eaend; int error, ealen; ip = VTOI(ap->a_vp); if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error) return (error); error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); if (error) return (error); error = 0; if (ap->a_size != NULL) *ap->a_size = 0; KASSERT(ALIGNED_TO(ip->i_ea_area, struct extattr), ("unaligned")); eap = (struct extattr *)ip->i_ea_area; eaend = (struct extattr *)(ip->i_ea_area + ip->i_ea_len); for (; error == 0 && eap < eaend; eap = EXTATTR_NEXT(eap)) { /* make sure this entry is complete */ if (EXTATTR_NEXT(eap) > eaend) break; if (eap->ea_namespace != ap->a_attrnamespace) continue; ealen = eap->ea_namelength; if (ap->a_size != NULL) *ap->a_size += ealen + 1; else if (ap->a_uio != NULL) error = uiomove(&eap->ea_namelength, ealen + 1, ap->a_uio); } ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); return (error); } /* * Vnode operation to set a named attribute. */ static int ffs_setextattr(struct vop_setextattr_args *ap) /* vop_setextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct inode *ip; struct fs *fs; struct extattr *eap; uint32_t ealength, ul; ssize_t ealen; int olen, eapad1, eapad2, error, i, easize; u_char *eae; void *tmp; ip = VTOI(ap->a_vp); fs = ITOFS(ip); if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); if (strlen(ap->a_name) == 0) return (EINVAL); /* XXX Now unsupported API to delete EAs using NULL uio. */ if (ap->a_uio == NULL) return (EOPNOTSUPP); if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); ealen = ap->a_uio->uio_resid; if (ealen < 0 || ealen > lblktosize(fs, UFS_NXADDR)) return (EINVAL); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error) { /* * ffs_lock_ea is not needed there, because the vnode * must be exclusively locked. */ if (ip->i_ea_area != NULL && ip->i_ea_error == 0) ip->i_ea_error = error; return (error); } error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); if (error) return (error); ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name); eapad1 = roundup2(ealength, 8) - ealength; eapad2 = roundup2(ealen, 8) - ealen; ealength += eapad1 + ealen + eapad2; /* * CEM: rewrites of the same size or smaller could be done in-place * instead. (We don't acquire any fine-grained locks in here either, * so we could also do bigger writes in-place.) */ eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK); bcopy(ip->i_ea_area, eae, ip->i_ea_len); easize = ip->i_ea_len; olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name, &eap, NULL); if (olen == -1) { /* new, append at end */ KASSERT(ALIGNED_TO(eae + easize, struct extattr), ("unaligned")); eap = (struct extattr *)(eae + easize); easize += ealength; } else { ul = eap->ea_length; i = (u_char *)EXTATTR_NEXT(eap) - eae; if (ul != ealength) { bcopy(EXTATTR_NEXT(eap), (u_char *)eap + ealength, easize - i); easize += (ealength - ul); } } if (easize > lblktosize(fs, UFS_NXADDR)) { free(eae, M_TEMP); ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); if (ip->i_ea_area != NULL && ip->i_ea_error == 0) ip->i_ea_error = ENOSPC; return (ENOSPC); } eap->ea_length = ealength; eap->ea_namespace = ap->a_attrnamespace; eap->ea_contentpadlen = eapad2; eap->ea_namelength = strlen(ap->a_name); memcpy(eap->ea_name, ap->a_name, strlen(ap->a_name)); bzero(&eap->ea_name[strlen(ap->a_name)], eapad1); error = uiomove(EXTATTR_CONTENT(eap), ealen, ap->a_uio); if (error) { free(eae, M_TEMP); ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); if (ip->i_ea_area != NULL && ip->i_ea_error == 0) ip->i_ea_error = error; return (error); } bzero((u_char *)EXTATTR_CONTENT(eap) + ealen, eapad2); tmp = ip->i_ea_area; ip->i_ea_area = eae; ip->i_ea_len = easize; free(tmp, M_TEMP); error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td); return (error); } /* * Vnode pointer to File handle */ static int ffs_vptofh(struct vop_vptofh_args *ap) /* vop_vptofh { IN struct vnode *a_vp; IN struct fid *a_fhp; }; */ { struct inode *ip; struct ufid *ufhp; ip = VTOI(ap->a_vp); ufhp = (struct ufid *)ap->a_fhp; ufhp->ufid_len = sizeof(struct ufid); ufhp->ufid_ino = ip->i_number; ufhp->ufid_gen = ip->i_gen; return (0); } SYSCTL_DECL(_vfs_ffs); static int use_buf_pager = 1; SYSCTL_INT(_vfs_ffs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, &use_buf_pager, 0, "Always use buffer pager instead of bmap"); static daddr_t ffs_gbp_getblkno(struct vnode *vp, vm_ooffset_t off) { return (lblkno(VFSTOUFS(vp->v_mount)->um_fs, off)); } static int ffs_gbp_getblksz(struct vnode *vp, daddr_t lbn) { return (blksize(VFSTOUFS(vp->v_mount)->um_fs, VTOI(vp), lbn)); } static int ffs_getpages(struct vop_getpages_args *ap) { struct vnode *vp; struct ufsmount *um; vp = ap->a_vp; um = VFSTOUFS(vp->v_mount); if (!use_buf_pager && um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE) return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead, NULL, NULL)); return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead, ffs_gbp_getblkno, ffs_gbp_getblksz)); } Index: head/sys/ufs/ffs/fs.h =================================================================== --- head/sys/ufs/ffs/fs.h (revision 323922) +++ head/sys/ufs/ffs/fs.h (revision 323923) @@ -1,791 +1,818 @@ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)fs.h 8.13 (Berkeley) 3/21/95 * $FreeBSD$ */ #ifndef _UFS_FFS_FS_H_ #define _UFS_FFS_FS_H_ #include #include /* * Each disk drive contains some number of filesystems. * A filesystem consists of a number of cylinder groups. * Each cylinder group has inodes and data. * * A filesystem is described by its super-block, which in turn * describes the cylinder groups. The super-block is critical * data and is replicated in each cylinder group to protect against * catastrophic loss. This is done at `newfs' time and the critical * super-block data does not change, so the copies need not be * referenced further unless disaster strikes. * * For filesystem fs, the offsets of the various blocks of interest * are given in the super block as: * [fs->fs_sblkno] Super-block * [fs->fs_cblkno] Cylinder group block * [fs->fs_iblkno] Inode blocks * [fs->fs_dblkno] Data blocks * The beginning of cylinder group cg in fs, is given by * the ``cgbase(fs, cg)'' macro. * * Depending on the architecture and the media, the superblock may * reside in any one of four places. For tiny media where every block * counts, it is placed at the very front of the partition. Historically, * UFS1 placed it 8K from the front to leave room for the disk label and * a small bootstrap. For UFS2 it got moved to 64K from the front to leave * room for the disk label and a bigger bootstrap, and for really piggy * systems we check at 256K from the front if the first three fail. In * all cases the size of the superblock will be SBLOCKSIZE. All values are * given in byte-offset form, so they do not imply a sector size. The * SBLOCKSEARCH specifies the order in which the locations should be searched. */ #define SBLOCK_FLOPPY 0 #define SBLOCK_UFS1 8192 #define SBLOCK_UFS2 65536 #define SBLOCK_PIGGY 262144 #define SBLOCKSIZE 8192 #define SBLOCKSEARCH \ { SBLOCK_UFS2, SBLOCK_UFS1, SBLOCK_FLOPPY, SBLOCK_PIGGY, -1 } /* * Max number of fragments per block. This value is NOT tweakable. */ #define MAXFRAG 8 /* * Addresses stored in inodes are capable of addressing fragments * of `blocks'. File system blocks of at most size MAXBSIZE can * be optionally broken into 2, 4, or 8 pieces, each of which is * addressable; these pieces may be DEV_BSIZE, or some multiple of * a DEV_BSIZE unit. * * Large files consist of exclusively large data blocks. To avoid * undue wasted disk space, the last data block of a small file may be * allocated as only as many fragments of a large block as are * necessary. The filesystem format retains only a single pointer * to such a fragment, which is a piece of a single large block that * has been divided. The size of such a fragment is determinable from * information in the inode, using the ``blksize(fs, ip, lbn)'' macro. * * The filesystem records space availability at the fragment level; * to determine block availability, aligned fragments are examined. */ /* * MINBSIZE is the smallest allowable block size. * In order to insure that it is possible to create files of size * 2^32 with only two levels of indirection, MINBSIZE is set to 4096. * MINBSIZE must be big enough to hold a cylinder group block, * thus changes to (struct cg) must keep its size within MINBSIZE. * Note that super blocks are always of size SBLOCKSIZE, * and that both SBLOCKSIZE and MAXBSIZE must be >= MINBSIZE. */ #define MINBSIZE 4096 /* * The path name on which the filesystem is mounted is maintained * in fs_fsmnt. MAXMNTLEN defines the amount of space allocated in * the super block for this name. */ #define MAXMNTLEN 468 /* * The volume name for this filesystem is maintained in fs_volname. * MAXVOLLEN defines the length of the buffer allocated. */ #define MAXVOLLEN 32 /* * There is a 128-byte region in the superblock reserved for in-core * pointers to summary information. Originally this included an array * of pointers to blocks of struct csum; now there are just a few * pointers and the remaining space is padded with fs_ocsp[]. * * NOCSPTRS determines the size of this padding. One pointer (fs_csp) * is taken away to point to a contiguous array of struct csum for * all cylinder groups; a second (fs_maxcluster) points to an array * of cluster sizes that is computed as cylinder groups are inspected, * and the third points to an array that tracks the creation of new * directories. A fourth pointer, fs_active, is used when creating * snapshots; it points to a bitmap of cylinder groups for which the * free-block bitmap has changed since the snapshot operation began. */ #define NOCSPTRS ((128 / sizeof(void *)) - 4) /* * A summary of contiguous blocks of various sizes is maintained * in each cylinder group. Normally this is set by the initial * value of fs_maxcontig. To conserve space, a maximum summary size * is set by FS_MAXCONTIG. */ #define FS_MAXCONTIG 16 /* * MINFREE gives the minimum acceptable percentage of filesystem * blocks which may be free. If the freelist drops below this level * only the superuser may continue to allocate blocks. This may * be set to 0 if no reserve of free blocks is deemed necessary, * however throughput drops by fifty percent if the filesystem * is run at between 95% and 100% full; thus the minimum default * value of fs_minfree is 5%. However, to get good clustering * performance, 10% is a better choice. hence we use 10% as our * default value. With 10% free space, fragmentation is not a * problem, so we choose to optimize for time. */ #define MINFREE 8 #define DEFAULTOPT FS_OPTTIME /* * Grigoriy Orlov has done some extensive work to fine * tune the layout preferences for directories within a filesystem. * His algorithm can be tuned by adjusting the following parameters * which tell the system the average file size and the average number * of files per directory. These defaults are well selected for typical * filesystems, but may need to be tuned for odd cases like filesystems * being used for squid caches or news spools. */ #define AVFILESIZ 16384 /* expected average file size */ #define AFPDIR 64 /* expected number of files per directory */ /* * The maximum number of snapshot nodes that can be associated * with each filesystem. This limit affects only the number of * snapshot files that can be recorded within the superblock so * that they can be found when the filesystem is mounted. However, * maintaining too many will slow the filesystem performance, so * having this limit is a good idea. */ #define FSMAXSNAP 20 /* * Used to identify special blocks in snapshots: * * BLK_NOCOPY - A block that was unallocated at the time the snapshot * was taken, hence does not need to be copied when written. * BLK_SNAP - A block held by another snapshot that is not needed by this * snapshot. When the other snapshot is freed, the BLK_SNAP entries * are converted to BLK_NOCOPY. These are needed to allow fsck to * identify blocks that are in use by other snapshots (which are * expunged from this snapshot). */ #define BLK_NOCOPY ((ufs2_daddr_t)(1)) #define BLK_SNAP ((ufs2_daddr_t)(2)) /* * Sysctl values for the fast filesystem. */ #define FFS_ADJ_REFCNT 1 /* adjust inode reference count */ #define FFS_ADJ_BLKCNT 2 /* adjust inode used block count */ #define FFS_BLK_FREE 3 /* free range of blocks in map */ #define FFS_DIR_FREE 4 /* free specified dir inodes in map */ #define FFS_FILE_FREE 5 /* free specified file inodes in map */ #define FFS_SET_FLAGS 6 /* set filesystem flags */ #define FFS_ADJ_NDIR 7 /* adjust number of directories */ #define FFS_ADJ_NBFREE 8 /* adjust number of free blocks */ #define FFS_ADJ_NIFREE 9 /* adjust number of free inodes */ #define FFS_ADJ_NFFREE 10 /* adjust number of free frags */ #define FFS_ADJ_NUMCLUSTERS 11 /* adjust number of free clusters */ #define FFS_SET_CWD 12 /* set current directory */ #define FFS_SET_DOTDOT 13 /* set inode number for ".." */ #define FFS_UNLINK 14 /* remove a name in the filesystem */ #define FFS_SET_INODE 15 /* update an on-disk inode */ #define FFS_SET_BUFOUTPUT 16 /* set buffered writing on descriptor */ #define FFS_MAXID 16 /* number of valid ffs ids */ /* * Command structure passed in to the filesystem to adjust filesystem values. */ #define FFS_CMD_VERSION 0x19790518 /* version ID */ struct fsck_cmd { int32_t version; /* version of command structure */ int32_t handle; /* reference to filesystem to be changed */ int64_t value; /* inode or block number to be affected */ int64_t size; /* amount or range to be adjusted */ int64_t spare; /* reserved for future use */ }; /* * A recovery structure placed at the end of the boot block area by newfs * that can be used by fsck to search for alternate superblocks. */ struct fsrecovery { int32_t fsr_magic; /* magic number */ int32_t fsr_fsbtodb; /* fsbtodb and dbtofsb shift constant */ int32_t fsr_sblkno; /* offset of super-block in filesys */ int32_t fsr_fpg; /* blocks per group * fs_frag */ u_int32_t fsr_ncg; /* number of cylinder groups */ }; /* * Per cylinder group information; summarized in blocks allocated * from first cylinder group data blocks. These blocks have to be * read in from fs_csaddr (size fs_cssize) in addition to the * super block. */ struct csum { int32_t cs_ndir; /* number of directories */ int32_t cs_nbfree; /* number of free blocks */ int32_t cs_nifree; /* number of free inodes */ int32_t cs_nffree; /* number of free frags */ }; struct csum_total { int64_t cs_ndir; /* number of directories */ int64_t cs_nbfree; /* number of free blocks */ int64_t cs_nifree; /* number of free inodes */ int64_t cs_nffree; /* number of free frags */ int64_t cs_numclusters; /* number of free clusters */ int64_t cs_spare[3]; /* future expansion */ }; /* * Super block for an FFS filesystem. */ struct fs { int32_t fs_firstfield; /* historic filesystem linked list, */ int32_t fs_unused_1; /* used for incore super blocks */ int32_t fs_sblkno; /* offset of super-block in filesys */ int32_t fs_cblkno; /* offset of cyl-block in filesys */ int32_t fs_iblkno; /* offset of inode-blocks in filesys */ int32_t fs_dblkno; /* offset of first data after cg */ int32_t fs_old_cgoffset; /* cylinder group offset in cylinder */ int32_t fs_old_cgmask; /* used to calc mod fs_ntrak */ int32_t fs_old_time; /* last time written */ int32_t fs_old_size; /* number of blocks in fs */ int32_t fs_old_dsize; /* number of data blocks in fs */ u_int32_t fs_ncg; /* number of cylinder groups */ int32_t fs_bsize; /* size of basic blocks in fs */ int32_t fs_fsize; /* size of frag blocks in fs */ int32_t fs_frag; /* number of frags in a block in fs */ /* these are configuration parameters */ int32_t fs_minfree; /* minimum percentage of free blocks */ int32_t fs_old_rotdelay; /* num of ms for optimal next block */ int32_t fs_old_rps; /* disk revolutions per second */ /* these fields can be computed from the others */ int32_t fs_bmask; /* ``blkoff'' calc of blk offsets */ int32_t fs_fmask; /* ``fragoff'' calc of frag offsets */ int32_t fs_bshift; /* ``lblkno'' calc of logical blkno */ int32_t fs_fshift; /* ``numfrags'' calc number of frags */ /* these are configuration parameters */ int32_t fs_maxcontig; /* max number of contiguous blks */ int32_t fs_maxbpg; /* max number of blks per cyl group */ /* these fields can be computed from the others */ int32_t fs_fragshift; /* block to frag shift */ int32_t fs_fsbtodb; /* fsbtodb and dbtofsb shift constant */ int32_t fs_sbsize; /* actual size of super block */ int32_t fs_spare1[2]; /* old fs_csmask */ /* old fs_csshift */ int32_t fs_nindir; /* value of NINDIR */ u_int32_t fs_inopb; /* value of INOPB */ int32_t fs_old_nspf; /* value of NSPF */ /* yet another configuration parameter */ int32_t fs_optim; /* optimization preference, see below */ int32_t fs_old_npsect; /* # sectors/track including spares */ int32_t fs_old_interleave; /* hardware sector interleave */ int32_t fs_old_trackskew; /* sector 0 skew, per track */ int32_t fs_id[2]; /* unique filesystem id */ /* sizes determined by number of cylinder groups and their sizes */ int32_t fs_old_csaddr; /* blk addr of cyl grp summary area */ int32_t fs_cssize; /* size of cyl grp summary area */ int32_t fs_cgsize; /* cylinder group size */ int32_t fs_spare2; /* old fs_ntrak */ int32_t fs_old_nsect; /* sectors per track */ int32_t fs_old_spc; /* sectors per cylinder */ int32_t fs_old_ncyl; /* cylinders in filesystem */ int32_t fs_old_cpg; /* cylinders per group */ u_int32_t fs_ipg; /* inodes per group */ int32_t fs_fpg; /* blocks per group * fs_frag */ /* this data must be re-computed after crashes */ struct csum fs_old_cstotal; /* cylinder summary information */ /* these fields are cleared at mount time */ int8_t fs_fmod; /* super block modified flag */ int8_t fs_clean; /* filesystem is clean flag */ int8_t fs_ronly; /* mounted read-only flag */ int8_t fs_old_flags; /* old FS_ flags */ u_char fs_fsmnt[MAXMNTLEN]; /* name mounted on */ u_char fs_volname[MAXVOLLEN]; /* volume name */ u_int64_t fs_swuid; /* system-wide uid */ int32_t fs_pad; /* due to alignment of fs_swuid */ /* these fields retain the current block allocation info */ int32_t fs_cgrotor; /* last cg searched */ void *fs_ocsp[NOCSPTRS]; /* padding; was list of fs_cs buffers */ u_int8_t *fs_contigdirs; /* (u) # of contig. allocated dirs */ struct csum *fs_csp; /* (u) cg summary info buffer */ int32_t *fs_maxcluster; /* (u) max cluster in each cyl group */ u_int *fs_active; /* (u) used by snapshots to track fs */ int32_t fs_old_cpc; /* cyl per cycle in postbl */ int32_t fs_maxbsize; /* maximum blocking factor permitted */ int64_t fs_unrefs; /* number of unreferenced inodes */ int64_t fs_providersize; /* size of underlying GEOM provider */ int64_t fs_metaspace; /* size of area reserved for metadata */ int64_t fs_sparecon64[14]; /* old rotation block list head */ int64_t fs_sblockloc; /* byte offset of standard superblock */ struct csum_total fs_cstotal; /* (u) cylinder summary information */ ufs_time_t fs_time; /* last time written */ int64_t fs_size; /* number of blocks in fs */ int64_t fs_dsize; /* number of data blocks in fs */ ufs2_daddr_t fs_csaddr; /* blk addr of cyl grp summary area */ int64_t fs_pendingblocks; /* (u) blocks being freed */ u_int32_t fs_pendinginodes; /* (u) inodes being freed */ uint32_t fs_snapinum[FSMAXSNAP];/* list of snapshot inode numbers */ u_int32_t fs_avgfilesize; /* expected average file size */ u_int32_t fs_avgfpdir; /* expected # of files per directory */ int32_t fs_save_cgsize; /* save real cg size to use fs_bsize */ ufs_time_t fs_mtime; /* Last mount or fsck time. */ int32_t fs_sujfree; /* SUJ free list */ - int32_t fs_sparecon32[23]; /* reserved for future constants */ + int32_t fs_sparecon32[22]; /* reserved for future constants */ + u_int32_t fs_metackhash; /* metadata check-hash, see CK_ below */ int32_t fs_flags; /* see FS_ flags below */ int32_t fs_contigsumsize; /* size of cluster summary array */ int32_t fs_maxsymlinklen; /* max length of an internal symlink */ int32_t fs_old_inodefmt; /* format of on-disk inodes */ u_int64_t fs_maxfilesize; /* maximum representable file size */ int64_t fs_qbmask; /* ~fs_bmask for use with 64-bit size */ int64_t fs_qfmask; /* ~fs_fmask for use with 64-bit size */ int32_t fs_state; /* validate fs_clean field */ int32_t fs_old_postblformat; /* format of positional layout tables */ int32_t fs_old_nrpos; /* number of rotational positions */ int32_t fs_spare5[2]; /* old fs_postbloff */ /* old fs_rotbloff */ int32_t fs_magic; /* magic number */ }; /* Sanity checking. */ #ifdef CTASSERT CTASSERT(sizeof(struct fs) == 1376); #endif /* * Filesystem identification */ #define FS_UFS1_MAGIC 0x011954 /* UFS1 fast filesystem magic number */ #define FS_UFS2_MAGIC 0x19540119 /* UFS2 fast filesystem magic number */ #define FS_BAD_MAGIC 0x19960408 /* UFS incomplete newfs magic number */ -#define FS_OKAY 0x7c269d38 /* superblock checksum */ #define FS_42INODEFMT -1 /* 4.2BSD inode format */ #define FS_44INODEFMT 2 /* 4.4BSD inode format */ /* * Preference for optimization. */ #define FS_OPTTIME 0 /* minimize allocation time */ #define FS_OPTSPACE 1 /* minimize disk fragmentation */ /* * Filesystem flags. * * The FS_UNCLEAN flag is set by the kernel when the filesystem was * mounted with fs_clean set to zero. The FS_DOSOFTDEP flag indicates * that the filesystem should be managed by the soft updates code. * Note that the FS_NEEDSFSCK flag is set and cleared only by the * fsck utility. It is set when background fsck finds an unexpected * inconsistency which requires a traditional foreground fsck to be * run. Such inconsistencies should only be found after an uncorrectable * disk error. A foreground fsck will clear the FS_NEEDSFSCK flag when * it has successfully cleaned up the filesystem. The kernel uses this * flag to enforce that inconsistent filesystems be mounted read-only. * The FS_INDEXDIRS flag when set indicates that the kernel maintains * on-disk auxiliary indexes (such as B-trees) for speeding directory * accesses. Kernels that do not support auxiliary indices clear the * flag to indicate that the indices need to be rebuilt (by fsck) before - * they can be used. + * they can be used. When a filesystem is mounted, any flags not + * included in FS_SUPPORTED are cleared. This lets newer features + * know that the filesystem has been run on an older version of the + * filesystem and thus that data structures associated with those + * features are out-of-date and need to be rebuilt. * * FS_ACLS indicates that POSIX.1e ACLs are administratively enabled * for the file system, so they should be loaded from extended attributes, * observed for access control purposes, and be administered by object * owners. FS_NFS4ACLS indicates that NFSv4 ACLs are administratively * enabled. This flag is mutually exclusive with FS_ACLS. FS_MULTILABEL * indicates that the TrustedBSD MAC Framework should attempt to back MAC * labels into extended attributes on the file system rather than maintain * a single mount label for all objects. */ #define FS_UNCLEAN 0x0001 /* filesystem not clean at mount */ #define FS_DOSOFTDEP 0x0002 /* filesystem using soft dependencies */ #define FS_NEEDSFSCK 0x0004 /* filesystem needs sync fsck before mount */ #define FS_SUJ 0x0008 /* Filesystem using softupdate journal */ #define FS_ACLS 0x0010 /* file system has POSIX.1e ACLs enabled */ #define FS_MULTILABEL 0x0020 /* file system is MAC multi-label */ #define FS_GJOURNAL 0x0040 /* gjournaled file system */ #define FS_FLAGS_UPDATED 0x0080 /* flags have been moved to new location */ #define FS_NFS4ACLS 0x0100 /* file system has NFSv4 ACLs enabled */ #define FS_INDEXDIRS 0x0200 /* kernel supports indexed directories */ #define FS_TRIM 0x0400 /* issue BIO_DELETE for deleted blocks */ +#define FS_SUPPORTED 0xFFFF /* supported flags, others cleared at mount */ /* + * The fs_metackhash field indicates the types of metadata check-hash + * that are maintained for a filesystem. Not all filesystems check-hash + * all metadata. + */ +#define CK_SUPERBLOCK 0x0001 /* the superblock */ +#define CK_CYLGRP 0x0002 /* the cylinder groups */ +#define CK_INODE 0x0004 /* inodes */ +#define CK_INDIR 0x0008 /* indirect blocks */ +#define CK_DIR 0x0010 /* directory contents */ +/* + * The BX_FSPRIV buffer b_xflags are used to track types of data in buffers. + */ +#define BX_SUPERBLOCK 0x00010000 /* superblock */ +#define BX_CYLGRP 0x00020000 /* cylinder groups */ +#define BX_INODE 0x00040000 /* inodes */ +#define BX_INDIR 0x00080000 /* indirect blocks */ +#define BX_DIR 0x00100000 /* directory contents */ + +#define PRINT_UFS_BUF_XFLAGS "\20\25dir\24indir\23inode\22cylgrp\21superblock" + +/* * Macros to access bits in the fs_active array. */ #define ACTIVECGNUM(fs, cg) ((fs)->fs_active[(cg) / (NBBY * sizeof(int))]) #define ACTIVECGOFF(cg) (1 << ((cg) % (NBBY * sizeof(int)))) #define ACTIVESET(fs, cg) do { \ if ((fs)->fs_active) \ ACTIVECGNUM((fs), (cg)) |= ACTIVECGOFF((cg)); \ } while (0) #define ACTIVECLEAR(fs, cg) do { \ if ((fs)->fs_active) \ ACTIVECGNUM((fs), (cg)) &= ~ACTIVECGOFF((cg)); \ } while (0) /* * The size of a cylinder group is calculated by CGSIZE. The maximum size * is limited by the fact that cylinder groups are at most one block. * Its size is derived from the size of the maps maintained in the * cylinder group and the (struct cg) size. */ #define CGSIZE(fs) \ /* base cg */ (sizeof(struct cg) + sizeof(int32_t) + \ /* old btotoff */ (fs)->fs_old_cpg * sizeof(int32_t) + \ /* old boff */ (fs)->fs_old_cpg * sizeof(u_int16_t) + \ /* inode map */ howmany((fs)->fs_ipg, NBBY) + \ /* block map */ howmany((fs)->fs_fpg, NBBY) +\ /* if present */ ((fs)->fs_contigsumsize <= 0 ? 0 : \ /* cluster sum */ (fs)->fs_contigsumsize * sizeof(int32_t) + \ /* cluster map */ howmany(fragstoblks(fs, (fs)->fs_fpg), NBBY))) /* * The minimal number of cylinder groups that should be created. */ #define MINCYLGRPS 4 /* * Convert cylinder group to base address of its global summary info. */ #define fs_cs(fs, indx) fs_csp[indx] /* * Cylinder group block for a filesystem. */ #define CG_MAGIC 0x090255 struct cg { int32_t cg_firstfield; /* historic cyl groups linked list */ int32_t cg_magic; /* magic number */ int32_t cg_old_time; /* time last written */ u_int32_t cg_cgx; /* we are the cgx'th cylinder group */ int16_t cg_old_ncyl; /* number of cyl's this cg */ int16_t cg_old_niblk; /* number of inode blocks this cg */ u_int32_t cg_ndblk; /* number of data blocks this cg */ struct csum cg_cs; /* cylinder summary information */ u_int32_t cg_rotor; /* position of last used block */ u_int32_t cg_frotor; /* position of last used frag */ u_int32_t cg_irotor; /* position of last used inode */ u_int32_t cg_frsum[MAXFRAG]; /* counts of available frags */ int32_t cg_old_btotoff; /* (int32) block totals per cylinder */ int32_t cg_old_boff; /* (u_int16) free block positions */ u_int32_t cg_iusedoff; /* (u_int8) used inode map */ u_int32_t cg_freeoff; /* (u_int8) free block map */ u_int32_t cg_nextfreeoff; /* (u_int8) next available space */ u_int32_t cg_clustersumoff; /* (u_int32) counts of avail clusters */ u_int32_t cg_clusteroff; /* (u_int8) free cluster map */ u_int32_t cg_nclusterblks; /* number of clusters this cg */ u_int32_t cg_niblk; /* number of inode blocks this cg */ u_int32_t cg_initediblk; /* last initialized inode */ u_int32_t cg_unrefs; /* number of unreferenced inodes */ - int32_t cg_sparecon32[2]; /* reserved for future use */ + int32_t cg_sparecon32[1]; /* reserved for future use */ + u_int32_t cg_ckhash; /* check-hash of this cg */ ufs_time_t cg_time; /* time last written */ int64_t cg_sparecon64[3]; /* reserved for future use */ u_int8_t cg_space[1]; /* space for cylinder group maps */ /* actually longer */ }; /* * Macros for access to cylinder group array structures */ #define cg_chkmagic(cgp) ((cgp)->cg_magic == CG_MAGIC) #define cg_inosused(cgp) \ ((u_int8_t *)((u_int8_t *)(cgp) + (cgp)->cg_iusedoff)) #define cg_blksfree(cgp) \ ((u_int8_t *)((u_int8_t *)(cgp) + (cgp)->cg_freeoff)) #define cg_clustersfree(cgp) \ ((u_int8_t *)((u_int8_t *)(cgp) + (cgp)->cg_clusteroff)) #define cg_clustersum(cgp) \ ((int32_t *)((uintptr_t)(cgp) + (cgp)->cg_clustersumoff)) /* * Turn filesystem block numbers into disk block addresses. * This maps filesystem blocks to device size blocks. */ #define fsbtodb(fs, b) ((daddr_t)(b) << (fs)->fs_fsbtodb) #define dbtofsb(fs, b) ((b) >> (fs)->fs_fsbtodb) /* * Cylinder group macros to locate things in cylinder groups. * They calc filesystem addresses of cylinder group data structures. */ #define cgbase(fs, c) (((ufs2_daddr_t)(fs)->fs_fpg) * (c)) #define cgdata(fs, c) (cgdmin(fs, c) + (fs)->fs_metaspace) /* data zone */ #define cgmeta(fs, c) (cgdmin(fs, c)) /* meta data */ #define cgdmin(fs, c) (cgstart(fs, c) + (fs)->fs_dblkno) /* 1st data */ #define cgimin(fs, c) (cgstart(fs, c) + (fs)->fs_iblkno) /* inode blk */ #define cgsblock(fs, c) (cgstart(fs, c) + (fs)->fs_sblkno) /* super blk */ #define cgtod(fs, c) (cgstart(fs, c) + (fs)->fs_cblkno) /* cg block */ #define cgstart(fs, c) \ ((fs)->fs_magic == FS_UFS2_MAGIC ? cgbase(fs, c) : \ (cgbase(fs, c) + (fs)->fs_old_cgoffset * ((c) & ~((fs)->fs_old_cgmask)))) /* * Macros for handling inode numbers: * inode number to filesystem block offset. * inode number to cylinder group number. * inode number to filesystem block address. */ #define ino_to_cg(fs, x) (((ino_t)(x)) / (fs)->fs_ipg) #define ino_to_fsba(fs, x) \ ((ufs2_daddr_t)(cgimin(fs, ino_to_cg(fs, (ino_t)(x))) + \ (blkstofrags((fs), ((((ino_t)(x)) % (fs)->fs_ipg) / INOPB(fs)))))) #define ino_to_fsbo(fs, x) (((ino_t)(x)) % INOPB(fs)) /* * Give cylinder group number for a filesystem block. * Give cylinder group block number for a filesystem block. */ #define dtog(fs, d) ((d) / (fs)->fs_fpg) #define dtogd(fs, d) ((d) % (fs)->fs_fpg) /* * Extract the bits for a block from a map. * Compute the cylinder and rotational position of a cyl block addr. */ #define blkmap(fs, map, loc) \ (((map)[(loc) / NBBY] >> ((loc) % NBBY)) & (0xff >> (NBBY - (fs)->fs_frag))) /* * The following macros optimize certain frequently calculated * quantities by using shifts and masks in place of divisions * modulos and multiplications. */ #define blkoff(fs, loc) /* calculates (loc % fs->fs_bsize) */ \ ((loc) & (fs)->fs_qbmask) #define fragoff(fs, loc) /* calculates (loc % fs->fs_fsize) */ \ ((loc) & (fs)->fs_qfmask) #define lfragtosize(fs, frag) /* calculates ((off_t)frag * fs->fs_fsize) */ \ (((off_t)(frag)) << (fs)->fs_fshift) #define lblktosize(fs, blk) /* calculates ((off_t)blk * fs->fs_bsize) */ \ (((off_t)(blk)) << (fs)->fs_bshift) /* Use this only when `blk' is known to be small, e.g., < UFS_NDADDR. */ #define smalllblktosize(fs, blk) /* calculates (blk * fs->fs_bsize) */ \ ((blk) << (fs)->fs_bshift) #define lblkno(fs, loc) /* calculates (loc / fs->fs_bsize) */ \ ((loc) >> (fs)->fs_bshift) #define numfrags(fs, loc) /* calculates (loc / fs->fs_fsize) */ \ ((loc) >> (fs)->fs_fshift) #define blkroundup(fs, size) /* calculates roundup(size, fs->fs_bsize) */ \ (((size) + (fs)->fs_qbmask) & (fs)->fs_bmask) #define fragroundup(fs, size) /* calculates roundup(size, fs->fs_fsize) */ \ (((size) + (fs)->fs_qfmask) & (fs)->fs_fmask) #define fragstoblks(fs, frags) /* calculates (frags / fs->fs_frag) */ \ ((frags) >> (fs)->fs_fragshift) #define blkstofrags(fs, blks) /* calculates (blks * fs->fs_frag) */ \ ((blks) << (fs)->fs_fragshift) #define fragnum(fs, fsb) /* calculates (fsb % fs->fs_frag) */ \ ((fsb) & ((fs)->fs_frag - 1)) #define blknum(fs, fsb) /* calculates rounddown(fsb, fs->fs_frag) */ \ ((fsb) &~ ((fs)->fs_frag - 1)) /* * Determine the number of available frags given a * percentage to hold in reserve. */ #define freespace(fs, percentreserved) \ (blkstofrags((fs), (fs)->fs_cstotal.cs_nbfree) + \ (fs)->fs_cstotal.cs_nffree - \ (((off_t)((fs)->fs_dsize)) * (percentreserved) / 100)) /* * Determining the size of a file block in the filesystem. */ #define blksize(fs, ip, lbn) \ (((lbn) >= UFS_NDADDR || (ip)->i_size >= \ (uint64_t)smalllblktosize(fs, (lbn) + 1)) \ ? (fs)->fs_bsize \ : (fragroundup(fs, blkoff(fs, (ip)->i_size)))) #define sblksize(fs, size, lbn) \ (((lbn) >= UFS_NDADDR || (size) >= ((lbn) + 1) << (fs)->fs_bshift) \ ? (fs)->fs_bsize \ : (fragroundup(fs, blkoff(fs, (size))))) /* * Number of indirects in a filesystem block. */ #define NINDIR(fs) ((fs)->fs_nindir) /* * Indirect lbns are aligned on UFS_NDADDR addresses where single indirects * are the negated address of the lowest lbn reachable, double indirects * are this lbn - 1 and triple indirects are this lbn - 2. This yields * an unusual bit order to determine level. */ static inline int lbn_level(ufs_lbn_t lbn) { if (lbn >= 0) return 0; switch (lbn & 0x3) { case 0: return (0); case 1: break; case 2: return (2); case 3: return (1); default: break; } return (-1); } static inline ufs_lbn_t lbn_offset(struct fs *fs, int level) { ufs_lbn_t res; for (res = 1; level > 0; level--) res *= NINDIR(fs); return (res); } /* * Number of inodes in a secondary storage block/fragment. */ #define INOPB(fs) ((fs)->fs_inopb) #define INOPF(fs) ((fs)->fs_inopb >> (fs)->fs_fragshift) /* * Softdep journal record format. */ #define JOP_ADDREF 1 /* Add a reference to an inode. */ #define JOP_REMREF 2 /* Remove a reference from an inode. */ #define JOP_NEWBLK 3 /* Allocate a block. */ #define JOP_FREEBLK 4 /* Free a block or a tree of blocks. */ #define JOP_MVREF 5 /* Move a reference from one off to another. */ #define JOP_TRUNC 6 /* Partial truncation record. */ #define JOP_SYNC 7 /* fsync() complete record. */ #define JREC_SIZE 32 /* Record and segment header size. */ #define SUJ_MIN (4 * 1024 * 1024) /* Minimum journal size */ #define SUJ_MAX (32 * 1024 * 1024) /* Maximum journal size */ #define SUJ_FILE ".sujournal" /* Journal file name */ /* * Size of the segment record header. There is at most one for each disk * block in the journal. The segment header is followed by an array of * records. fsck depends on the first element in each record being 'op' * and the second being 'ino'. Segments may span multiple disk blocks but * the header is present on each. */ struct jsegrec { uint64_t jsr_seq; /* Our sequence number */ uint64_t jsr_oldest; /* Oldest valid sequence number */ uint16_t jsr_cnt; /* Count of valid records */ uint16_t jsr_blocks; /* Count of device bsize blocks. */ uint32_t jsr_crc; /* 32bit crc of the valid space */ ufs_time_t jsr_time; /* timestamp for mount instance */ }; /* * Reference record. Records a single link count modification. */ struct jrefrec { uint32_t jr_op; uint32_t jr_ino; uint32_t jr_parent; uint16_t jr_nlink; uint16_t jr_mode; int64_t jr_diroff; uint64_t jr_unused; }; /* * Move record. Records a reference moving within a directory block. The * nlink is unchanged but we must search both locations. */ struct jmvrec { uint32_t jm_op; uint32_t jm_ino; uint32_t jm_parent; uint16_t jm_unused; int64_t jm_oldoff; int64_t jm_newoff; }; /* * Block record. A set of frags or tree of blocks starting at an indirect are * freed or a set of frags are allocated. */ struct jblkrec { uint32_t jb_op; uint32_t jb_ino; ufs2_daddr_t jb_blkno; ufs_lbn_t jb_lbn; uint16_t jb_frags; uint16_t jb_oldfrags; uint32_t jb_unused; }; /* * Truncation record. Records a partial truncation so that it may be * completed at check time. Also used for sync records. */ struct jtrncrec { uint32_t jt_op; uint32_t jt_ino; int64_t jt_size; uint32_t jt_extsize; uint32_t jt_pad[3]; }; union jrec { struct jsegrec rec_jsegrec; struct jrefrec rec_jrefrec; struct jmvrec rec_jmvrec; struct jblkrec rec_jblkrec; struct jtrncrec rec_jtrncrec; }; #ifdef CTASSERT CTASSERT(sizeof(struct jsegrec) == JREC_SIZE); CTASSERT(sizeof(struct jrefrec) == JREC_SIZE); CTASSERT(sizeof(struct jmvrec) == JREC_SIZE); CTASSERT(sizeof(struct jblkrec) == JREC_SIZE); CTASSERT(sizeof(struct jtrncrec) == JREC_SIZE); CTASSERT(sizeof(union jrec) == JREC_SIZE); #endif extern int inside[], around[]; extern u_char *fragtbl[]; /* * IOCTLs used for filesystem write suspension. */ #define UFSSUSPEND _IOW('U', 1, fsid_t) #define UFSRESUME _IO('U', 2) #endif